@@ -16,11 +16,12 @@ ;; along with Megatest. If not, see . ;; strftime('%m/%d/%Y %H:%M:%S','now','localtime') (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) - posix-extras directory-utils pathname-expand typed-records format sxml-serializer sxml-modifications) + posix-extras directory-utils pathname-expand typed-records format sxml-serializer + sxml-modifications matchable) (declare (unit runs)) (declare (uses db)) (declare (uses common)) (declare (uses items)) @@ -45,20 +46,29 @@ (defstruct runs:dat reglen regfull runname max-concurrent-jobs run-id test-patts required-tests test-registry registry-mutex flags keyvals run-info all-tests-registry - can-run-more-tests + ;; stores results from last runs:can-run-more-tests + (can-run-more-tests #f) ;; (list can-run-more-flag num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit) ((can-run-more-tests-count 0) : fixnum) (last-fuel-check 0) ;; time when we last checked fuel (beginning-of-time (current-seconds)) + (load-mgmt-function #f) + (wait-for-jobs-function #f) + (last-load-check-time 0) + (last-jobs-check-time 0) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup - waitons testmode newtal itemmaps prereqs-not-met) + waitons testmode newtal + itemmaps + (prereqs-not-met #f) + (last-update 0) ;; + ) ;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files ;; - remove any that are over 3600 seconds old ;; - if there are any that are younger than 10 seconds ;; * sleep 10 seconds @@ -89,11 +99,11 @@ lock-files))) (if fresh-locks (begin (if (runs:lownoise "runners-softlock-wait" 360) (debug:print-info 0 *default-log-port* "Other runners in flight, giving up some time...")) - (thread-sleep! 10)) + (thread-sleep! 2)) (begin (if (runs:lownoise "runners-softlock-nowait" 360) (debug:print-info 0 *default-log-port* "No runners in flight, updating softlock")) (let* ((ouf (open-output-file my-lock-file))) (with-output-to-port ouf @@ -102,12 +112,12 @@ (runs:dat-last-fuel-check-set! rdat (current-seconds)))))) ;; Fourth try, do accounting through time ;; (define (runs:parallel-runners-mgmt rdat) - (let ((time-to-check 10) ;; 28 - (time-to-wait 30) + (let ((time-to-check (configf:lookup-number *configdat* "runners" "time-to-check" default: 10)) ;; 28 + (time-to-wait (configf:lookup-number *configdat* "runners" "time-to-wait" default: 30)) (now-time (current-seconds))) (if (> (- now-time (runs:dat-last-fuel-check rdat)) time-to-check) ;; time to check (runs:wait-on-softlock rdat "runners")))) ;; To test parallel-runners management start a repl: @@ -213,15 +223,17 @@ exn (let ((call-chain (get-call-chain)) (msg ((condition-property-accessor 'exn 'message) exn))) (if (< count 5) (begin ;; this call is colliding, do some crude stuff to fix it. - (debug:print 0 *default-log-port* "ERROR: *configdat* was inaccessible! This should never happen. Retry #" count) + (debug:print 0 *default-log-port* "ERROR: *configdat* was inaccessible! This should never happen. Retry #" count + ", exn=" exn) (launch:setup force-reread: #t) (fatal-loop (+ count 1))) (begin - (debug:print 0 *default-log-port* "FATAL: *configdat* was inaccessible! This should never happen. Retried " count " times. Message: " msg) + (debug:print 0 *default-log-port* "FATAL: *configdat* was inaccessible! This should never happen. Retried " count + " times. Message: " msg) (debug:print 0 *default-log-port* "Call chain:") (with-output-to-port *default-log-port* (lambda () (print "*configdat* is >>"*configdat*"<<") @@ -311,23 +323,12 @@ ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) - (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move - ;; to cond clauses below where we determine we - ;; have too many jobs running rather than each - ;; time the and condition above is true (which - ;; seems like always)? - ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time - (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ...")) - 10) ;; obviously haven't had any work to do for a while - (else 0))) -;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero? -;; (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01) -;; ))) - + (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) @@ -378,11 +379,11 @@ (if (null? existing-tests) (let* ((use-log-dir (if (not (directory-exists? log-dir)) (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir) + (debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir ", exn=" exn) #f) (create-directory log-dir #t) #t) #t)) (start-time (current-seconds)) @@ -389,11 +390,11 @@ (actual-logf (if use-log-dir full-log-fname log-file))) (handle-exceptions exn (begin (print-call-chain *default-log-port*) - (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (debug:print 0 *default-log-port* "ERROR: failed to run pre-hook " run-pre-hook ", check the log " log-file)) (debug:print-info 0 *default-log-port* "running run-pre-hook: \"" run-pre-hook "\", log is " actual-logf) (system (conc run-pre-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 *default-log-port* "pre-hook \"" run-pre-hook "\" took " (- (current-seconds) start-time) " seconds to run."))) (debug:print 0 *default-log-port* "Skipping pre-hook call \"" run-pre-hook "\" as there are existing tests for this run."))))) @@ -418,11 +419,11 @@ ;; (debug:print 0 *default-log-port* "Skipping post-hook call \"" run-post-hook "\" as there are existing tests for this run."))))) (let* ((use-log-dir (if (not (directory-exists? log-dir)) (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir) + (debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir ", exn=" exn) #f) (create-directory log-dir #t) #t) #t)) (start-time (current-seconds)) @@ -429,11 +430,11 @@ (actual-logf (if use-log-dir full-log-fname log-file))) (handle-exceptions exn (begin (print-call-chain *default-log-port*) - (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (debug:print 0 *default-log-port* "ERROR: failed to run post-hook " run-post-hook ", check the log " log-file)) (debug:print-info 0 *default-log-port* "running run-post-hook: \"" run-post-hook "\", log is " actual-logf) (system (conc run-post-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 *default-log-port* "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run.")))))) @@ -535,11 +536,11 @@ ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") - (common:telemetry-log "run-tests" + #;(common:telemetry-log "run-tests" payload: `( (target . ,target) (run-name . ,runname) (test-patts . ,test-patts) ) ) @@ -749,11 +750,11 @@ (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions exn - (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id) + (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn) (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) ;; (thread-start! th1) (thread-start! th2) @@ -831,10 +832,24 @@ ;; reg - list of previously visited tests ;; tal - list of never visited tests ;; prefer next hed to be from reg than tal. (define runs:nothing-left-in-queue-count 0) + +;; cache the result of get-prereqs-not-met and don't call it if called in past 10 seconds +;; NOTE: This is assuming that testdat is highly specific to this test +;; +(define (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path #!key (mode '(normal))(itemmaps #f)) + ;; mode: testmode itemmaps: itemmaps) + (if (and (runs:testdat-prereqs-not-met testdat) + (< (- (current-seconds) (runs:testdat-last-update testdat)) 10)) ;;; only refresh for this test if + ;;; it has been at least 10 seconds + (runs:testdat-prereqs-not-met testdat) ;; return the cached result + (let* ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: mode itemmaps: itemmaps))) + (runs:testdat-prereqs-not-met-set! testdat res) + (runs:testdat-last-update-set! testdat (current-seconds)) + res))) ;;====================================================================== ;; runs:expand-items is called by runs:run-tests-queue ;;====================================================================== ;; @@ -841,22 +856,18 @@ ;; return value of runs:expand-items is passed back to runs-tests-queue and is fed to named loop with this signature: ;; (let loop ((hed (car sorted-test-names)) ;; (tal (cdr sorted-test-names)) ;; (reg '()) ;; registered, put these at the head of tal ;; (reruns '())) -(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps) +(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs + run-id waitons item-path testmode test-record + can-run-more items runname tconfig reglen test-registry + test-records itemmaps testdat) (let* ((loop-list (list hed tal reg reruns)) - (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))) - (if (list? res) - res - (begin - (debug:print 0 *default-log-port* - "ERROR: rmt:get-prereqs-not-met returned non-list!\n" - " res=" res " run-id=" run-id " waitons=" waitons " hed=" hed " item-path=" item-path " testmode=" testmode " itemmaps=" itemmaps) - '())))) - (have-itemized (not (null? (lset-intersection eq? testmode '(itemmatch itemwait))))) - ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap)) + (prereqs-not-met (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path + mode: testmode itemmaps: itemmaps)) + (have-itemized (not (null? (lset-intersection eq? testmode '(itemmatch itemwait))))) (fails (runs:calc-fails prereqs-not-met)) (prereq-fails (runs:calc-prereq-fail prereqs-not-met)) (non-completed (runs:calc-not-completed prereqs-not-met)) (runnables (runs:calc-runnable prereqs-not-met)) (unexpanded-prereqs @@ -987,13 +998,17 @@ (null? non-completed)) (debug:print-info 4 *default-log-port* "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) - (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; + (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;; ;; getting here likely means the system is way overloaded, kill a full minute before continuing - (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) + ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!! + ;; No runsdat, can't do this yet + ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + ;; + (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") @@ -1119,11 +1134,29 @@ (conc " WARNING: t is not a vector=" t ))) prereqs-not-met) ", ") ") fails: " fails "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) - + ;; well, first lets see if cpu load throttling is enabled. If so wait around until the + ;; average cpu load is under the threshold before continuing + ;; + (if (runs:dat-load-mgmt-function runsdat) + ((runs:dat-load-mgmt-function runsdat)) + (runs:dat-load-mgmt-function-set! + runsdat + (lambda () + ;; jobtools maxload is useful for where the full Megatest run is done on one machine + (if (and (not (common:on-homehost?)) + maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized + (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) + + ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues + (if maxhomehostload + (common:wait-for-homehost-load maxhomehostload + (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))))) + + (if (and (not (null? prereqs-not-met)) (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) @@ -1192,11 +1225,11 @@ ((not have-resources) ;; simply try again after waiting a second (if (runs:lownoise "no resources" 60) (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ...")) ;; Have gone back and forth on this but db starvation is an issue. ;; wait one second before looking again to run jobs. - (thread-sleep! 1) + (thread-sleep! 0.25) ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (list (car newtal)(cdr newtal) reg reruns)) ;; This is the final stage, everything is in place so launch the test ;; @@ -1208,28 +1241,17 @@ ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! *max-tries-hash* (make-hash-table)) - ;; well, first lets see if cpu load throttling is enabled. If so wait around until the - ;; average cpu load is under the threshold before continuing - - ;; jobtools maxload is useful for where the full Megatest run is done on one machine - (if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized - (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) - - ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues - (if maxhomehostload - (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) - - (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) + (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) (if (or (not (null? tal))(not (null? reg))) - (runs:loop-values tal reg reglen regfull reruns) + (runs:loop-values tal reg reglen regfull reruns) ;; hed should be dropped at this time #f)) ;; must be we have unmet prerequisites ;; (else @@ -1246,19 +1268,18 @@ (member 'toplevel testmode)) (begin ;; couldn't run, take a breather (if (runs:lownoise "Waiting for more work to do..." 60) (debug:print-info 0 *default-log-port* "Waiting for more work to do...")) - (thread-sleep! 1) + + ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + (thread-sleep! 5) (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (begin (let ((my-test-id (rmt:get-test-id run-id test-name item-path))) - (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2")) - - - + (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2")) (if (or (not (null? reg))(not (null? tal))) (if (vector? hed) (begin (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path " from the launch list as it has prerequistes that are FAIL") @@ -1434,10 +1455,13 @@ ;; every time though the loop increment the test/itempatt val. ;; when the min is > max-allowed and none running then force exit ;; (define *max-tries-hash* (make-hash-table)) +(define (runs:pretty-long-list lst) + (if (> (length lst) 8)(append (take lst 3)(list "...")) lst)) + ;;====================================================================== ;; runs:run-tests-queue is called by runs:run-tests ;;====================================================================== ;; ;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... > @@ -1573,19 +1597,10 @@ itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) - ;; -- removed BB 17ww28 - no longer needed. - ;; every 15 minutes verify the server is there for this run - ;; (if (and (common:low-noise-print 240 "try start server" run-id) - ;; (not (or (and *runremote* - ;; (remote-server-url *runremote*) - ;; (server:ping (remote-server-url *runremote*))) - ;; (server:check-if-running *toppath*)))) - ;; (server:kind-run *toppath*)) - (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1))) @@ -1614,11 +1629,11 @@ (runs:incremental-print-results run-id) (debug:print 4 *default-log-port* "TOP OF LOOP => " "test-name: " test-name "\n hed: " hed - "\n tal: " tal + "\n tal: " (runs:pretty-long-list tal) "\n reg: " reg "\n test-record " test-record "\n itemdat: " itemdat "\n items: " items "\n item-path: " item-path @@ -1628,11 +1643,11 @@ "\n regfull: " regfull "\n reglen: " reglen "\n length reg: " (length reg) ) - (runs:parallel-runners-mgmt runsdat) + ;; (runs:parallel-runners-mgmt runsdat) ;; check for hed in waitons => this would be circular, remove it and issue an ;; error (if (member test-name waitons) (begin @@ -1664,13 +1679,65 @@ (debug:print-info 4 *default-log-port* "cond branch - " "rtq-2") (debug:print-info 4 *default-log-port* "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) - (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) - (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running - (let ((loop-list (runs:process-expanded-tests runsdat testdat))) + + ;; gonna try a strategy change here. + ;; + ;; check if can run more tests. if yes, continue, if no, rest until can run more + ;; look at the test jobgroup and tot jobs running + ;; + ;; NOTE: This does NOT actually gate here, only captures the proc to be called later + ;; + (if (not (runs:dat-wait-for-jobs-function runsdat)) + (runs:dat-wait-for-jobs-function-set! + runsdat + (lambda (testdat-in) + (let* ((jobgroup (runs:testdat-jobgroup testdat-in)) + (can-run-more-tests (runs:dat-can-run-more-tests runsdat)) + (last-jobs-check-time (runs:dat-last-jobs-check-time runsdat)) + (should-check-jobs (match can-run-more-tests + ((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params) + (if (< (- max-concurrent-jobs num-running) 25) + (begin + (debug:print-info 0 *default-log-port* + "less than 20 jobs headroom, ("max-concurrent-jobs + "-"num-running")>20. Forcing prelaunch check.") + #t) + #f)) + (else #f)))) ;; no record yet + (if should-check-jobs + (let loop-can-run-more + ((res (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) + (remtries 1440)) ;; we can wait for up to two hours for jobs to get done + (match res + ((run-more num-running . rem) + (if (or run-more + (< remtries 1)) + (begin + (if (runs:lownoise "num-running" 30) + (debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs)) + (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through + (begin + (if (runs:lownoise "num-running" 10) + (debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of " + max-concurrent-jobs " allowed.")) + (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable + + ;; wait for load here + (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) + (- remtries 1))))))) + ))))) + + ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed + + ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed + (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) + + (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated @@ -1729,19 +1796,20 @@ ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS ((or (procedure? items)(eq? items 'have-procedure)) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-4") - (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) + (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) - (car can-run-more)) - (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here + (car can-run-more)) ;; itemized test expanded here + (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup + max-concurrent-jobs run-id waitons item-path + testmode test-record can-run-more items runname + tconfig reglen test-registry test-records itemmaps))) (if loop-list (apply loop loop-list) - (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed) - ) - ) + (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed))) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) @@ -1774,11 +1842,13 @@ ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched (rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) + ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle + (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes (prev-num-running 0)) ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup *configdat* "setup" "run-wait") "yes")) @@ -1793,11 +1863,12 @@ (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) - (thread-sleep! 5) + ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. @@ -1852,11 +1923,11 @@ (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)) ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step ;; -(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry) +(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry runsdat testdat-rec) ;; All these vars might be referenced by the testconfig file reader ;; ;; NEED to reprocess testconfig here, ensuring that item variables are available. ;; This is for Tal's issue with item-specific env vars not being set for use in skip. ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273 @@ -1930,12 +2001,13 @@ (set! test-id (rmt:get-test-id run-id test-name item-path)))) (debug:print-info 4 *default-log-port* "test-id=" test-id ", run-id=" run-id ", test-name=" test-name ", item-path=\"" item-path "\"") (set! testdat (rmt:get-test-info-by-id run-id test-id)) (if (not testdat) (begin - (debug:print-info 0 *default-log-port* "WARNING: server is overloaded, trying again in one second") - (thread-sleep! 1) + (debug:print-info 0 *default-log-port* "WARNING: server is overloaded, trying again in two seconds") + ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + (thread-sleep! 2) (loop))))) (if (not testdat) ;; should NOT happen (debug:print-error 0 *default-log-port* "failed to get test record for test-id " test-id)) (set! test-id (db:test-get-id testdat)) (if (common:file-exists? test-path) @@ -2042,15 +2114,23 @@ (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test) (debug:print-info 1 *default-log-port* "SKIPPING Test " full-test-name " due to " skip-test)) ;; ;; Here the test is handed off to launch.scm for launch-test to complete the launch process ;; - (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) - (begin - (print "ERROR: Failed to launch the test. Exiting as soon as possible") - (set! *globalexitstatus* 1) ;; - (process-signal (current-process-id) signal/kill)))))))) + (begin + ;; wait for less than max jobs here + (if (runs:dat-wait-for-jobs-function runsdat) + ((runs:dat-wait-for-jobs-function runsdat) testdat-rec)) + + (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) + (begin + (print "ERROR: Failed to launch the test. Exiting as soon as possible") + (set! *globalexitstatus* 1) ;; + (process-signal (current-process-id) signal/kill)) + ) + ;; wait again here? + )))))) ((KILLED) (debug:print 1 *default-log-port* "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.") (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED)) ((LAUNCHED REMOTEHOSTSTART RUNNING) (debug:print 2 *default-log-port* "NOTE: " test-name " is already running")) @@ -2119,11 +2199,11 @@ (let* ((targets (string-split target-patts ",")) (keys (rmt:get-keys)) (res-ht (make-hash-table))) ;; target -> ( runrecord1 runrecord2 ... ) (for-each (lambda (target-patt) - (let ((runs (rmt:simple-get-runs runpatt #f #f target-patt))) + (let ((runs (rmt:simple-get-runs runpatt #f #f target-patt #f))) (for-each (lambda (run) (let ((target (simple-run-target run))) (hash-table-set! res-ht target (cons run (hash-table-ref/default res-ht target '()))))) runs))) @@ -2521,11 +2601,11 @@ (if (not (null? tal)) (loop (car tal)(cdr tal)))) ((run-wait) ;; BB TODO - manage has-subrun case (debug:print-info 2 *default-log-port* "still waiting, " (length tests) " tests still running") - (thread-sleep! 10) + (thread-sleep! 5) (let ((new-tests (proc-get-tests run-id))) (if (null? new-tests) (debug:print-info 1 *default-log-port* "Run completed according to zero tests matching provided criteria.") (loop (car new-tests)(cdr new-tests))))) ((archive) @@ -2628,18 +2708,18 @@ (if (symbolic-link? run-dir) (begin (debug:print-info 1 *default-log-port* "Removing symlink " run-dir) (handle-exceptions exn - (debug:print-error 0 *default-log-port* " Failed to remove symlink " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue") + (debug:print-error 0 *default-log-port* " Failed to remove symlink " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue, exn=" exn) (delete-file run-dir))) (if (directory? run-dir) (if (> (directory-fold (lambda (f x)(+ 1 x)) 0 run-dir) 0) (debug:print 0 *default-log-port* "WARNING: refusing to remove " run-dir " as it is not empty") (handle-exceptions exn - (debug:print-error 0 *default-log-port* " Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue") + (debug:print-error 0 *default-log-port* " Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue, exn=" exn) (delete-directory run-dir))) (if (and run-dir (not (member run-dir (list "n/a" "/tmp/badname")))) (debug:print 0 *default-log-port* "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink") (debug:print 0 *default-log-port* "NOTE: the run dir for this test is undefined. Test may have already been deleted.")) @@ -2912,11 +2992,11 @@ (let* ((final-doc ((sxml-modify `("testsuite" insert-into (@ (name ,xml-ts-name) (tests ,tests-count) (errors ,error-cnt) (failures ,fail-cnt)))) new-doc))) (debug:print 0 *default-log-port* "modify attrib error=" error-cnt " fail= " fail-cnt) (handle-exceptions exn (let* ((msg ((condition-property-accessor 'exn 'message) exn))) - (debug:print 0 *default-log-port* (conc "WARNING: Failed to update file" xml-path". Message:" msg))) + (debug:print 0 *default-log-port* (conc "WARNING: Failed to update file" xml-path". Message:" msg ", exn=" exn))) (if (not (file-exists? xml-dir)) (create-directory xml-dir #t)) (if (not (rmt:no-sync-get/default keyname #f)) (begin @@ -2946,10 +3026,10 @@ (debug:print-info 0 *default-log-port* "Removing cached files:\n " (string-intersperse files "\n ")) (for-each (lambda (f) (handle-exceptions exn - (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f) + (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f ", exn=" exn) (delete-file f))) files)))) (debug:print-error 0 *default-log-port* "-clean-cache requires -runname.")) (debug:print-error 0 *default-log-port* "-clean-cache requires -target or -reqtarg")))