@@ -519,11 +519,11 @@ 10) ;; obviously haven't had any work to do for a while (else ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero? (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)))) - (let* ((num-running (rmt:get-count-tests-running run-id)) + (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) @@ -1744,11 +1744,11 @@ extras) '()))) (waitons (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?)) (newtal (append tal (list hed))) (regfull (>= (length reg) reglen)) - (num-running (rmt:get-count-tests-running-for-run-id run-id)) + (num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes (testdat (make-runs:testdat hed: hed tal: tal reg: reg reruns: reruns @@ -1964,29 +1964,31 @@ (rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle - (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) + (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes (prev-num-running 0)) ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup *configdat* "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) - (begin - (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) - (set! last-time-incomplete (current-seconds)) - (rmt:find-and-mark-incomplete run-id #f))) - (if (not (eq? num-running prev-num-running)) - (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds))))) + (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no + (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id + ". Running as pid " (current-process-id) " on " (get-host-name)) + (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! + (rmt:find-and-mark-incomplete run-id #f) + (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running + " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " + (time->string (seconds->local-time (current-seconds)))))) (thread-sleep! 5) - ;; (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) - (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) + (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes + num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 *default-log-port* "Calling Post Hook") ;; (runs:run-post-hook run-id)