@@ -51,10 +51,11 @@ can-run-more-tests ((can-run-more-tests-count 0) : fixnum) (last-fuel-check 0) ;; time when we last checked fuel (beginning-of-time (current-seconds)) (load-mgmt-function #f) + (wait-for-jobs-function #f) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup @@ -1676,40 +1677,51 @@ (debug:print-info 4 *default-log-port* "cond branch - " "rtq-2") (debug:print-info 4 *default-log-port* "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) - (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) - - ;; This would be a good place to block on homehost load ;; gonna try a strategy change here. ;; ;; check if can run more tests. if yes, continue, if no, rest for 10 seconds, check again ;; repeat until can run more tests ;; ;; look at the test jobgroup and tot jobs running - (let loop-can-run-more - ((res (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) - (remtries 60)) - (match res - ((run-more num-running . rem) - (if (or run-more - (< remtries 1)) - (begin - (if (runs:lownoise "num-running" 30) - (debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs)) - (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through - (begin - (if (runs:lownoise "num-running" 10) - (debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of " - max-concurrent-jobs " allowed.")) - (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable - (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) - (- remtries 1))))))) - - (let ((loop-list (runs:process-expanded-tests runsdat testdat))) + (if (not (runs:dat-wait-for-jobs-funcion runsdat)) + (runs:dat-wait-for-jobs-function-set! + runsdat + (lambda () + (let loop-can-run-more + ((res (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) + (remtries 1440)) ;; we can wait for up to two hours for jobs to get done + (match res + ((run-more num-running . rem) + (if (or run-more + (< remtries 1)) + (begin + (if (runs:lownoise "num-running" 30) + (debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs)) + (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through + (begin + (if (runs:lownoise "num-running" 10) + (debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of " + max-concurrent-jobs " allowed.")) + (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable + + ;; wait for load here + (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) + (- remtries 1))))))) + ))) + + ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed + (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) + + ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed + (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) + + (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated @@ -2085,15 +2097,23 @@ (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test) (debug:print-info 1 *default-log-port* "SKIPPING Test " full-test-name " due to " skip-test)) ;; ;; Here the test is handed off to launch.scm for launch-test to complete the launch process ;; - (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) - (begin - (print "ERROR: Failed to launch the test. Exiting as soon as possible") - (set! *globalexitstatus* 1) ;; - (process-signal (current-process-id) signal/kill)))))))) + (begin + ;; wait for less than max jobs here + (if (runs:dat-wait-for-jobs-function runsdat) + ((runs:dat-wait-for-jobs-function runsdat))) + + (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) + (begin + (print "ERROR: Failed to launch the test. Exiting as soon as possible") + (set! *globalexitstatus* 1) ;; + (process-signal (current-process-id) signal/kill)) + ) + ;; wait again here? + )))))) ((KILLED) (debug:print 1 *default-log-port* "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.") (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED)) ((LAUNCHED REMOTEHOSTSTART RUNNING) (debug:print 2 *default-log-port* "NOTE: " test-name " is already running"))