@@ -61,11 +61,38 @@ (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup waitons testmode newtal itemmaps prereqs-not-met) + +(module runsmod + ( + runs:wait-if-seen-recently + ) +(import scheme chicken data-structures extras files) +(import posix typed-records srfi-18 srfi-69 + md5 message-digest + regex srfi-1) + +(define *last-seen-ht* (make-hash-table)) + +(define (runs:wait-if-seen-recently wait-until . keys) + (let* ((full-key (string-intersperse keys "-")) + (last-seen (hash-table-ref/default *last-seen-ht* full-key 0)) + (now (current-seconds)) + (delta (- now last-seen)) + (needed (if (< delta wait-until) + 0 + (- wait-until delta)))) + (if (> needed 0)(thread-sleep! needed)) + (hash-table-set! *last-seen-ht* full-key (current-seconds)) + needed)) +) + +(import runsmod) + ;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files ;; - remove any that are over 3600 seconds old ;; - if there are any that are younger than 10 seconds ;; * sleep 10 seconds ;; * touch my key-host-pid.softlock file @@ -321,11 +348,11 @@ (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) - (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no + (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) @@ -1565,11 +1592,11 @@ extras) '()))) (waitons (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?)) (newtal (append tal (list hed))) (regfull (>= (length reg) reglen)) - (num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes + (num-running (rmt:get-count-tests-running-for-run-id run-id)) (testdat (make-runs:testdat hed: hed tal: tal reg: reg reruns: reruns @@ -1715,10 +1742,13 @@ (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) (- remtries 1))))))) ))))) ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed + (let ((waited (runs:wait-if-seen-recently 5 "prereqs-not-met" hed item-path))) ;; if we've been down this path in the past 5 seconds - wait out the difference + (if (> waited 0)(debug:print 0 *default-log-port* "Waited for prereqs-not-met-"hed"-"item-path" for " waited "seconds."))) + (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) @@ -1831,31 +1861,31 @@ ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle - (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes + (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) (prev-num-running 0)) ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup *configdat* "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) - (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no + (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id))) (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) - (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes + (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 *default-log-port* "Calling Post Hook")