@@ -1614,10 +1614,30 @@ ;; given a launch delay (minimum time from last launch) return amount of time to wait ;; ;; (define (db:launch-delay-left dbstruct run-id launch-delay) + + +(define (db:get-status-from-final-status-file run-dir) + (let ( + (infile (conc run-dir "/.final-status"))) + + ;; first verify we are able to write the output file + (if (not (file-read-access? infile)) + (begin + (debug:print 0 *default-log-port* "ERROR: cannot read " infile) + (debug:print 0 *default-log-port* "ERROR: run-dir is " run-dir) + #f + ) + (with-input-from-file infile read-lines) + ) + ) +) + + + ;; select end_time-now from ;; (select testname,item_path,event_time+run_duration as ;; end_time,strftime('%s','now') as now from tests where state in ;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED')); @@ -1624,10 +1644,12 @@ (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) + ;; The default running-deadtime is 720 seconds = 12 minutes. + ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30)) (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime"))) (server-start-allowance 200) (server-overloaded-budget 200) (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30)) (launch-monitor-on-time-budget 30) @@ -1635,10 +1657,13 @@ (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30)) (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default)) (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period))) (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period) ) + (debug:print-info 4 *default-log-port* "running-deadtime = " running-deadtime) + (debug:print-info 4 *default-log-port* "deadtime-trim = " deadtime-trim) + (db:with-db dbstruct #f #f (lambda (db) ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes ;; @@ -1657,12 +1682,13 @@ (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)))) db + "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');" - run-id running-deadtime) + run-id running-deadtime) ;; default time 720 seconds (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) (if (and (equal? uname "n/a") @@ -1674,11 +1700,11 @@ (begin (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration) (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) db "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');" - run-id remotehoststart-deadtime) + run-id remotehoststart-deadtime) ;; default time 230 seconds. ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config ;; ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row @@ -1710,14 +1736,34 @@ (all-ids (append min-incompleted-ids (map car oldlaunched)))) (if (> (length all-ids) 0) (begin (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD") (for-each - (lambda (test-id) - (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) - ;;(db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS. ref ticket 220546828 - all-ids)))))))) + (lambda (test-id) + (let* ( + (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id)) + (result (db:get-status-from-final-status-file run-dir))) + (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD") + (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "PASS" "Test stopped responding but it has PASSED; marking it PASS in the DB.") + ) + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id " final state/status is not COMPLETED/PASS. It is " result) + (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.") + ) + ) + ) + ) + all-ids) + ) + ) + ) + ) + ) + ) +) ;; ALL REPLACED BY THE BLOCK ABOVE ;; ;; (sqlite3:execute ;; db