@@ -46,16 +46,19 @@ (defstruct runs:dat reglen regfull runname max-concurrent-jobs run-id test-patts required-tests test-registry registry-mutex flags keyvals run-info all-tests-registry - can-run-more-tests + ;; stores results from last runs:can-run-more-tests + (can-run-more-tests #f) ;; (list can-run-more-flag num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit) ((can-run-more-tests-count 0) : fixnum) (last-fuel-check 0) ;; time when we last checked fuel (beginning-of-time (current-seconds)) (load-mgmt-function #f) (wait-for-jobs-function #f) + (last-load-check-time 0) + (last-jobs-check-time 0) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup @@ -318,25 +321,10 @@ (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) -;; (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move -;; ;; to cond clauses below where we determine we -;; ;; have too many jobs running rather than each -;; ;; time the and condition above is true (which -;; ;; seems like always)? -;; ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way -;; 0) -;; ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time -;; (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ...")) -;; 10) ;; obviously haven't had any work to do for a while -;; (else 0))) -;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero? -;; (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01) -;; ))) - (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) @@ -1239,11 +1227,11 @@ ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! *max-tries-hash* (make-hash-table)) - (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat) + (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) (if (or (not (null? tal))(not (null? reg))) @@ -1680,40 +1668,53 @@ (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) ;; gonna try a strategy change here. ;; - ;; check if can run more tests. if yes, continue, if no, rest for 10 seconds, check again - ;; repeat until can run more tests + ;; check if can run more tests. if yes, continue, if no, rest until can run more ;; ;; look at the test jobgroup and tot jobs running (if (not (runs:dat-wait-for-jobs-function runsdat)) (runs:dat-wait-for-jobs-function-set! runsdat - (lambda () - (let loop-can-run-more - ((res (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) - (remtries 1440)) ;; we can wait for up to two hours for jobs to get done - (match res - ((run-more num-running . rem) - (if (or run-more - (< remtries 1)) - (begin - (if (runs:lownoise "num-running" 30) - (debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs)) - (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through - (begin - (if (runs:lownoise "num-running" 10) - (debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of " - max-concurrent-jobs " allowed.")) - (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable - - ;; wait for load here - (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) - (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) - (- remtries 1))))))) - ))) + (lambda (testdat-in) + (let* ((jobgroup (runs:testdat-jobgroup testdat-in)) + (can-run-more-tests (runs:dat-can-run-more-tests runsdat)) + (last-jobs-check-time (runs:dat-last-jobs-check-time runsdat)) + (should-check-jobs (match can-run-more-tests + ((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params) + (if (< (- max-concurrent-jobs num-running) 25) + (begin + (debug:print-info 0 *default-log-port* + "less than 20 jobs headroom, ("max-concurrent + "-"num-running")>20. Forcing prelaunch check.") + #t) + #f)) + (else #f)))) ;; no record yet + (if should-check-jobs + (let loop-can-run-more + ((res (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) + (remtries 1440)) ;; we can wait for up to two hours for jobs to get done + (match res + ((run-more num-running . rem) + (if (or run-more + (< remtries 1)) + (begin + (if (runs:lownoise "num-running" 30) + (debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs)) + (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through + (begin + (if (runs:lownoise "num-running" 10) + (debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of " + max-concurrent-jobs " allowed.")) + (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable + + ;; wait for load here + (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) + (- remtries 1))))))) + ))))) ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed @@ -1906,11 +1907,11 @@ (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)) ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step ;; -(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry runsdat) +(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry runsdat testdat-rec) ;; All these vars might be referenced by the testconfig file reader ;; ;; NEED to reprocess testconfig here, ensuring that item variables are available. ;; This is for Tal's issue with item-specific env vars not being set for use in skip. ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273 @@ -2100,11 +2101,11 @@ ;; Here the test is handed off to launch.scm for launch-test to complete the launch process ;; (begin ;; wait for less than max jobs here (if (runs:dat-wait-for-jobs-function runsdat) - ((runs:dat-wait-for-jobs-function runsdat))) + ((runs:dat-wait-for-jobs-function runsdat) testdat-rec)) (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) (begin (print "ERROR: Failed to launch the test. Exiting as soon as possible") (set! *globalexitstatus* 1) ;;