@@ -395,24 +395,37 @@ (delta (abs (- df disk-free)))) (if (and (> df 0) (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) - (do-sync (or new-cpu-load new-disk-free over-time))) + (do-sync (or new-cpu-load new-disk-free over-time)) + + (test-info (rmt:get-test-info-by-id run-id test-id)) + (state (db:test-get-state test-info)) + (status (db:test-get-status test-info)) + (kill-reason "no kill reason specified") + (kill-job? #f)) + + (cond + ((test-get-kill-request run-id test-id) + (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") + (set! kill-job? #t)) + ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) + (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)) + (set! kill-job? #t)) + ((equal? status "DEAD") + (set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests") + (set! kill-job? #t))) + (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) - (set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)) - (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) - (time-exceeded (> run-seconds runtlim))) - (if time-exceeded - (begin - (debug:print-info 0 *default-log-port* "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim) - #t) - #f))))) + (launch:handle-zombie-tests run-id) (if do-sync (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) + (if kill-job? (begin + (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries? (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0)) @@ -442,14 +455,14 @@ #f (process-signal pid-num signal/kill))) (process:get-sub-pids pid)))) ;; (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive")))) pids) - (tests:test-set-status! run-id test-id "KILLED" "KILLED" (args:get-arg "-m") #f)) + (tests:test-set-status! run-id test-id "KILLED" "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) (begin (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2) - (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (args:get-arg "-m") #f) + (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. (exit))) (if (hash-table-ref/default misc-flags 'keep-going #f) @@ -620,11 +633,11 @@ ) (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") (exit)))) - (debug:print 2 *default-log-port* "Exectuing " test-name " (id: " test-id ") on " (get-host-name)) + (debug:print 2 *default-log-port* "Executing " test-name " (id: " test-id ") on " (get-host-name)) (set! keys (rmt:get-keys)) ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process ;; one of these is defunct/redundant ... (if (not (launch:setup force-reread: #t)) (begin @@ -1332,10 +1345,32 @@ (begin (debug:print-error 0 *default-log-port* "Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) ;; (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1))) (list #f #f))))) + + +(define (launch:handle-zombie-tests run-id) + (let* ((key (conc "zombiescan-runid-"run-id)) + (now (current-seconds)) + (threshold (- (current-seconds) (* 2 (or (configf:lookup-number *configdat* "setup" "deadtime") 120)))) + (val (rmt:get-var key)) + (do-scan? + (cond + ((not val) + #t) + ((< val threshold) + #t) + (else #f)))) + (when do-scan? + (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") + (rmt:set-var key (current-seconds)) + (rmt:find-and-mark-incomplete run-id #f)))) + + + + ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host