Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1698,11 +1698,11 @@ (begin (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration) (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) db "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');" - run-id remotehoststart-deadtime) ;; default time 230 seconds. + run-id remotehoststart-deadtime) ;; default time 230 seconds ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config ;; ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row @@ -1732,37 +1732,37 @@ ;; incompleted)) (min-incompleted-ids (map car incompleted)) ;; do 'em all (all-ids (append min-incompleted-ids (map car oldlaunched)))) (if (> (length all-ids) 0) (begin + ;; (launch:is-test-alive "localhost" 435) (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD") (for-each (lambda (test-id) - (let* ( - (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id)) - (result (db:get-status-from-final-status-file run-dir))) - (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) - (begin - (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD") - (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "PASS" "Test stopped responding but it has PASSED; marking it PASS in the DB.") - ) - (begin - (debug:print 0 *default-log-port* "INFO: test " test-id " final state/status is not COMPLETED/PASS. It is " result) - (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.") - ) - ) ;;call end of eud of run detection for posthook - (launch:end-of-run-check run-id) - ) - ) - all-ids) - ) - ) - ) - ) - ) - ) -) + (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id)) + (tinfo (db:get-test-info-by-id dbstruct run-id test-id)) + (run-dir (db:test-get-rundir tinfo)) + (host (db:test-get-host tinfo)) + (pid (db:test-get-process_id tinfo)) + (result (db:get-status-from-final-status-file run-dir))) + (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD") + (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "PASS" + "Test stopped responding but it has PASSED; marking it PASS in the DB.")) + (let ((is-alive (launch:is-test-alive host pid))) + (if is-alive + (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host " has a process on pid " pid ", NOT setting to DEAD.") + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id " final state/status is not COMPLETED/PASS. It is " result) + (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" + "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))))))) + all-ids) + ;;call end of eud of run detection for posthook + (launch:end-of-run-check run-id) + ))))))) + ;; ALL REPLACED BY THE BLOCK ABOVE ;; ;; (sqlite3:execute ;; db