Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1614,10 +1614,30 @@ ;; given a launch delay (minimum time from last launch) return amount of time to wait ;; ;; (define (db:launch-delay-left dbstruct run-id launch-delay) + + +(define (db:get-status-from-final-status-file run-dir) + (let ( + (infile (conc run-dir "/.final-status"))) + + ;; first verify we are able to write the output file + (if (not (file-read-access? infile)) + (begin + (debug:print 0 *default-log-port* "ERROR: cannot read " infile) + (debug:print 0 *default-log-port* "ERROR: run-dir is " run-dir) + #f + ) + (with-input-from-file infile read-lines) + ) + ) +) + + + ;; select end_time-now from ;; (select testname,item_path,event_time+run_duration as ;; end_time,strftime('%s','now') as now from tests where state in ;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED')); @@ -1624,10 +1644,12 @@ (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) + ;; The default running-deadtime is 720 seconds = 12 minutes. + ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30)) (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime"))) (server-start-allowance 200) (server-overloaded-budget 200) (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30)) (launch-monitor-on-time-budget 30) @@ -1635,10 +1657,13 @@ (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30)) (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default)) (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period))) (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period) ) + (debug:print-info 4 *default-log-port* "running-deadtime = " running-deadtime) + (debug:print-info 4 *default-log-port* "deadtime-trim = " deadtime-trim) + (db:with-db dbstruct #f #f (lambda (db) ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes ;; @@ -1657,12 +1682,13 @@ (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)))) db + "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');" - run-id running-deadtime) + run-id running-deadtime) ;; default time 720 seconds (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) (if (and (equal? uname "n/a") @@ -1674,11 +1700,11 @@ (begin (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration) (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) db "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');" - run-id remotehoststart-deadtime) + run-id remotehoststart-deadtime) ;; default time 230 seconds. ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config ;; ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row @@ -1710,14 +1736,34 @@ (all-ids (append min-incompleted-ids (map car oldlaunched)))) (if (> (length all-ids) 0) (begin (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD") (for-each - (lambda (test-id) - (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) - ;;(db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS. ref ticket 220546828 - all-ids)))))))) + (lambda (test-id) + (let* ( + (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id)) + (result (db:get-status-from-final-status-file run-dir))) + (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD") + (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "PASS" "Test stopped responding but it has PASSED; marking it PASS in the DB.") + ) + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id " final state/status is not COMPLETED/PASS. It is " result) + (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.") + ) + ) + ) + ) + all-ids) + ) + ) + ) + ) + ) + ) +) ;; ALL REPLACED BY THE BLOCK ABOVE ;; ;; (sqlite3:execute ;; db Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -784,11 +784,12 @@ ;; now is also a good time to write the .testconfig file (let* ((tconfig-fname (conc work-area "/.testconfig")) (tconfig-tmpfile (conc tconfig-fname ".tmp")) (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t))) ;; 'return-procs))) (configf:write-alist tconfig tconfig-tmpfile) - (file-move tconfig-tmpfile tconfig-fname #t)) + (file-move tconfig-tmpfile tconfig-fname #t) + (delete-file* ".final-status")) ;; (let* ((m (make-mutex)) (kill-job? #f) (exit-info (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status (job-thread #f) @@ -832,10 +833,14 @@ ((eq? (launch:einf-rollup-status exit-info) 4) "WAIVED") ((eq? (launch:einf-rollup-status exit-info) 5) "ABORT") ((eq? (launch:einf-rollup-status exit-info) 6) "SKIP") (else "FAIL")))) ;; (db:test-get-status testinfo))) (debug:print-info 1 *default-log-port* "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info)) + + ;; Leave a .final-status file for each sub-test + (tests:save-final-status run-id test-id) + (tests:test-set-status! run-id test-id new-state new-status (args:get-arg "-m") #f) @@ -842,12 +847,14 @@ ;; need to update the top test record if PASS or FAIL and this is a subtest ;; NO NEED TO CALL set-state-status-and-roll-up-items HERE, THIS IS DONE IN set-state-status-and-roll-up-items called by tests:test-set-status! )) ;; for automated creation of the rollup html file this is a good place... (if (not (equal? item-path "")) - (tests:summarize-items run-id test-id test-name #f)) + (tests:summarize-items run-id test-id test-name #f)) (tests:summarize-test run-id test-id) ;; don't force - just update if no + ;; Leave a .final-status file for the top level test + (tests:save-final-status run-id test-id) (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) (mutex-unlock! m) (launch:end-of-run-check run-id ) (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n") Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1996,11 +1996,11 @@ (lambda (f x) (let ((fullname (conc real-dir "/" f))) (if (not (string-search (regexp "testdat.db") f)) (runs:recursive-delete-with-error-msg fullname))) (+ 1 x)) - 0 real-dir) + 0 real-dir #t) ;; then the entire directory (runs:recursive-delete-with-error-msg real-dir)) ;; cleanup often needs to remove all but the last N runs per target ;; Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1401,10 +1401,29 @@ ;; (string