Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -1593,22 +1593,25 @@ (with-output-to-file fullpath (lambda ()(pp dat)))))) ;; get cpu load by reading from /proc/loadavg, return all three values ;; (define (common:get-cpu-load remote-host) - (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) - (or (common:get-cached-info actual-hostname "cpu-load") - (let ((result (if remote-host - (map (lambda (res) - (if (eof-object? res) 9e99 res)) - (with-input-from-pipe - (conc "ssh " remote-host " cat /proc/loadavg") + (handle-exceptions + exn + '(99 99 99) + (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) + (or (common:get-cached-info actual-hostname "cpu-load") + (let ((result (if remote-host + (map (lambda (res) + (if (eof-object? res) 9e99 res)) + (with-input-from-pipe + (conc "ssh " remote-host " cat /proc/loadavg") (lambda ()(list (read)(read)(read))))) - (with-input-from-file "/proc/loadavg" - (lambda ()(list (read)(read)(read))))))) - (common:write-cached-info actual-hostname "cpu-load" result) - result)))) + (with-input-from-file "/proc/loadavg" + (lambda ()(list (read)(read)(read))))))) + (common:write-cached-info actual-hostname "cpu-load" result) + result))))) ;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads ;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc. ;; keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load ;; Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -413,12 +413,14 @@ (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") - (set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") - (set! kill-job? #t))) + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) + (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") + ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING + (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (when do-sync ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)