@@ -447,22 +447,25 @@ (- (current-seconds) start-seconds))))) (kill-tries 0)) (let loop ((minutes (calc-minutes))) - (let ((db (open-db))) + (let* ((db (open-db)) + (cpuload (get-cpu-load)) + (diskfree (get-df (current-directory))) + (tmpfree (get-df "/tmp"))) + (if (not cpuload) (begin (debug:print 0 "WARNING: CPULOAD not found.") (set! cpuload "n/a"))) + (if (not diskfree) (begin (debug:print 0 "WARNING: DISKFREE not found.") (set! diskfree "n/a"))) (set! kill-job? (test-get-kill-request db run-id test-name itemdat)) - (test-update-meta-info db run-id test-name itemdat minutes) + (test-update-meta-info db run-id test-name itemdat minutes cpuload diskfree tmpfree) (if kill-job? (begin (mutex-lock! m) (let* ((pid (vector-ref exit-info 0))) (if (number? pid) (begin (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")") - ;;(cond - ;;((> kill-tries 0) ; 2) (let ((processes (cmd-run->list (conc "pgrep -l -P " pid)))) (for-each (lambda (p) (let* ((parts (string-split p)) (p-id (if (> (length parts) 0) @@ -472,47 +475,18 @@ (begin (debug:print 0 "Killing " (cadr parts) "; kill -9 " p-id) (system (conc "kill -9 " p-id)))))) (car processes)) (system (conc "kill -9 " pid)))) - ;;(let* ((ppid (process-group-id pid)) - ;; (kcmd (conc "pkill -9 -g " ppid))) - ;; ;; (process-signal pid signal/term) - ;; ;; (process-signal pid signal/kill) - ;; (debug:print 0 "Attempting to kill pid " pid " and children in process group " ppid " with command:\n " kcmd) - ;; (debug:print 0 "Children:") - ;; (system (conc "pgrep -g -l " ppid)) - ;; (system kcmd) - ;; (sleep 1) ;; give it a rest - ;; (test-set-status! db run-id test-name "KILLED" "FAIL" - ;; itemdat (args:get-arg "-m")) - ;; (sqlite3:finalize! db) - ;; (exit 1))))) (begin (debug:print 0 "WARNING: Request received to kill job but problem with process, attempting to kill manager process") (test-set-status! db run-id test-name "KILLED" "FAIL" itemdat (args:get-arg "-m") #f) (sqlite3:finalize! db) (exit 1)))) - ;; (thread-terminate! job-thread))) (set! kill-tries (+ 1 kill-tries)) (mutex-unlock! m))) - ;; (handle-exceptions - ;; exn - ;; (begin - ;; (debug:print 0 "ERROR: Problem killing process " (vector-ref exit-info 0)) - ;; (abort exn)) - ;; (let* ((pid (vector-ref exit-info 0)) - ;; ;; (pgid (process-group-id pid)) - ;; ;; (cmd (conc "pkill -9 -P " pgid)) - ;; ) - ;; ;; (debug:print 0 "Running \"" cmd "\"") - ;; ;; (system cmd) - ;; (debug:print 0 "Running \"kill -9 " pid "\"") - ;; (system (conc "kill -9 " pid)) - ;; ;; (process-signal (vector-ref exit-info 0) signal/kill) - ;; )))) (sqlite3:finalize! db) (thread-sleep! (+ 8 (random 4))) ;; add some jitter to the call home time to spread out the db accesses (loop (calc-minutes))))))) (th1 (make-thread monitorjob)) (th2 (make-thread runit)))