@@ -408,10 +408,154 @@ (debug:print 4 "ERROR: No handler for this condition.") ;; TRY (thread-sleep! (+ 1 *global-delta*)) (set! loop-list (list (car newtal)(cdr newtal) reg reruns)))) ;; END OF IF CAN RUN MORE loop-list)) +(define (runs:process-expanded-tests hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode) + (let* ((run-limits-info (runs:can-run-more-tests jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running + ;; (open-run-close runs:can-run-more-tests #f jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running + (have-resources (car run-limits-info)) + (num-running (list-ref run-limits-info 1)) + (num-running-in-jobgroup (list-ref run-limits-info 2)) + (max-concurrent-jobs (list-ref run-limits-info 3)) + (job-group-limit (list-ref run-limits-info 4)) + (prereqs-not-met (mt:get-prereqs-not-met run-id waitons item-path mode: testmode)) + (fails (runs:calc-fails prereqs-not-met)) + (non-completed (runs:calc-not-completed prereqs-not-met)) + (loop-list (list hed tal reg reruns))) + (debug:print-info 4 "have-resources: " have-resources " prereqs-not-met: (" + (string-intersperse + (map (lambda (t) + (if (vector? t) + (conc (db:test-get-state t) "/" (db:test-get-status t)) + (conc " WARNING: t is not a vector=" t ))) + prereqs-not-met) ", ") ") fails: " fails) + (debug:print-info 4 "hed=" hed "\n test-record=" test-record "\n test-name: " test-name "\n item-path: " item-path "\n test-patts: " test-patts) + + ;; Don't know at this time if the test have been launched at some time in the past + ;; i.e. is this a re-launch? + (debug:print-info 4 "run-limits-info = " run-limits-info) + + (cond ;; INNER COND #1 for a launchable test + + ;; Check item path against item-patts, + ;; + ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) ;; This test/itempath is not to be run + ;; else the run is stuck, temporarily or permanently + ;; but should check if it is due to lack of resources vs. prerequisites + (debug:print-info 1 "Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " test-patts) + ;; (thread-sleep! *global-delta*) + (if (or (not (null? tal))(not (null? reg))) + (set! loop-list + (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns)))) + + ;; Register tests + ;; + ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) + (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) + (let ((th (make-thread (lambda () + (mutex-lock! registry-mutex) + (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'start) + (mutex-unlock! registry-mutex) + ;; If haven't done it before register a top level test if this is an itemized test + (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done)) + (cdb:tests-register-test *runremote* run-id test-name "")) + (cdb:tests-register-test *runremote* run-id test-name item-path) + (mutex-lock! registry-mutex) + (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done) + (mutex-unlock! registry-mutex)) + (conc test-name "/" item-path)))) + (thread-start! th)) + (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + (if (and (null? tal)(null? reg)) + (set! loop-list (list hed tal (append reg (list hed)) reruns)) + (set! loop-list (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + ;; NB// Here we are building reg as we register tests + ;; if regfull we must pop the front item off reg + (if regfull + (append (cdr reg) (list hed)) + (append reg (list hed))) + reruns)))) + + ;; At this point hed test registration must be completed. + ;; + ((eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f) + 'start) + (debug:print-info 0 "Waiting on test registration(s): " + (string-intersperse + (filter (lambda (x) + (eq? (hash-table-ref/default test-registry x #f) 'start)) + (hash-table-keys test-registry)) + ", ")) + (thread-sleep! 0.1) + (set! loop-list (list hed tal reg reruns))) + + ;; If no resources are available just kill time and loop again + ;; + ((not have-resources) ;; simply try again after waiting a second + (debug:print-info 1 "no resources to run new tests, waiting ...") + ;; Have gone back and forth on this but db starvation is an issue. + ;; wait one second before looking again to run jobs. + (thread-sleep! 1) ;; (+ 2 *global-delta*)) + ;; could have done hed tal here but doing car/cdr of newtal to rotate tests + (set! loop-list (list (car newtal)(cdr newtal) reg reruns))) + ;; (loop hed tal reg reruns)) + + ;; This is the final stage, everything is in place so launch the test + ;; + ((and have-resources + (or (null? prereqs-not-met) + (and (eq? testmode 'toplevel) + (null? non-completed)))) + (run:test run-id run-info keyvals runname test-record flags #f test-registry) + (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'running) + (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + ;; (thread-sleep! *global-delta*) + (if (or (not (null? tal))(not (null? reg))) + (set! loop-list (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns)))) + + ;; must be we have unmet prerequisites + ;; + (else + (debug:print 4 "FAILS: " fails) + ;; If one or more of the prereqs-not-met are FAIL then we can issue + ;; a message and drop hed from the items to be processed. + (if (null? fails) + (begin + ;; couldn't run, take a breather + (debug:print-info 4 "Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...") + ;; (thread-sleep! (+ 0.01 *global-delta*)) ;; long sleep here - no resources, may as well be patient + ;; we made new tal by sticking hed at the back of the list + (set! loop-list (list (car newtal)(cdr newtal) reg reruns))) + ;; the waiton is FAIL so no point in trying to run hed ever again + (if (or (not (null? reg))(not (null? tal))) + (if (vector? hed) + (begin + (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed) + " from the launch list as it has prerequistes that are FAIL") + (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + ;; (thread-sleep! *global-delta*) + (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'removed) + (set! loop (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + (cons hed reruns)))) + (begin + (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") + (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + (set! loop-list (list hed tal reg reruns)))))))) + loop-list)) ;; END OF INNER COND + +;; End of INNER COND for launchable test. + ;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... > (define (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests reglen-in) ;; At this point the list of parent tests is expanded ;; NB// Should expand items here and then insert into the run queue. @@ -503,149 +647,12 @@ ((not items) (debug:print-info 4 "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) - (let* ((run-limits-info (runs:can-run-more-tests jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running - ;; (open-run-close runs:can-run-more-tests #f jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running - (have-resources (car run-limits-info)) - (num-running (list-ref run-limits-info 1)) - (num-running-in-jobgroup (list-ref run-limits-info 2)) - (max-concurrent-jobs (list-ref run-limits-info 3)) - (job-group-limit (list-ref run-limits-info 4)) - (prereqs-not-met (mt:get-prereqs-not-met run-id waitons item-path mode: testmode)) - (fails (runs:calc-fails prereqs-not-met)) - (non-completed (runs:calc-not-completed prereqs-not-met))) - (debug:print-info 4 "have-resources: " have-resources " prereqs-not-met: (" - (string-intersperse - (map (lambda (t) - (if (vector? t) - (conc (db:test-get-state t) "/" (db:test-get-status t)) - (conc " WARNING: t is not a vector=" t ))) - prereqs-not-met) ", ") ") fails: " fails) - (debug:print-info 4 "hed=" hed "\n test-record=" test-record "\n test-name: " test-name "\n item-path: " item-path "\n test-patts: " test-patts) - - ;; Don't know at this time if the test have been launched at some time in the past - ;; i.e. is this a re-launch? - (debug:print-info 4 "run-limits-info = " run-limits-info) - - (cond ;; INNER COND #1 for a launchable test - - ;; Check item path against item-patts, - ;; - ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) ;; This test/itempath is not to be run - ;; else the run is stuck, temporarily or permanently - ;; but should check if it is due to lack of resources vs. prerequisites - (debug:print-info 1 "Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " test-patts) - ;; (thread-sleep! *global-delta*) - (if (or (not (null? tal))(not (null? reg))) - (loop (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns))) - - ;; Register tests - ;; - ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) - (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) - (let ((th (make-thread (lambda () - (mutex-lock! registry-mutex) - (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'start) - (mutex-unlock! registry-mutex) - ;; If haven't done it before register a top level test if this is an itemized test - (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done)) - (cdb:tests-register-test *runremote* run-id test-name "")) - (cdb:tests-register-test *runremote* run-id test-name item-path) - (mutex-lock! registry-mutex) - (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done) - (mutex-unlock! registry-mutex)) - (conc test-name "/" item-path)))) - (thread-start! th)) - (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) - (if (and (null? tal)(null? reg)) - (loop hed tal (append reg (list hed)) reruns) - (loop (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - ;; NB// Here we are building reg as we register tests - ;; if regfull we must pop the front item off reg - (if regfull - (append (cdr reg) (list hed)) - (append reg (list hed))) - reruns))) - - ;; At this point hed test registration must be completed. - ;; - ((eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f) - 'start) - (debug:print-info 0 "Waiting on test registration(s): " - (string-intersperse - (filter (lambda (x) - (eq? (hash-table-ref/default test-registry x #f) 'start)) - (hash-table-keys test-registry)) - ", ")) - (thread-sleep! 0.1) - (loop hed tal reg reruns)) - - ;; If no resources are available just kill time and loop again - ;; - ((not have-resources) ;; simply try again after waiting a second - (debug:print-info 1 "no resources to run new tests, waiting ...") - ;; Have gone back and forth on this but db starvation is an issue. - ;; wait one second before looking again to run jobs. - (thread-sleep! 1) ;; (+ 2 *global-delta*)) - ;; could have done hed tal here but doing car/cdr of newtal to rotate tests - (loop (car newtal)(cdr newtal) reg reruns)) - ;; (loop hed tal reg reruns)) - - ;; This is the final stage, everything is in place so launch the test - ;; - ((and have-resources - (or (null? prereqs-not-met) - (and (eq? testmode 'toplevel) - (null? non-completed)))) - (run:test run-id run-info keyvals runname test-record flags #f test-registry) - (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'running) - (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) - ;; (thread-sleep! *global-delta*) - (if (or (not (null? tal))(not (null? reg))) - (loop (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns))) - - ;; must be we have unmet prerequisites - ;; - (else - (debug:print 4 "FAILS: " fails) - ;; If one or more of the prereqs-not-met are FAIL then we can issue - ;; a message and drop hed from the items to be processed. - (if (null? fails) - (begin - ;; couldn't run, take a breather - (debug:print-info 4 "Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...") - ;; (thread-sleep! (+ 0.01 *global-delta*)) ;; long sleep here - no resources, may as well be patient - ;; we made new tal by sticking hed at the back of the list - (loop (car newtal)(cdr newtal) reg reruns)) - ;; the waiton is FAIL so no point in trying to run hed ever again - (if (or (not (null? reg))(not (null? tal))) - (if (vector? hed) - (begin - (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed) - " from the launch list as it has prerequistes that are FAIL") - (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) - ;; (thread-sleep! *global-delta*) - (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'removed) - (loop (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - (cons hed reruns))) - (begin - (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") - (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) - (loop hed tal reg reruns))))))))) ;; END OF INNER COND - - ;; End of INNER COND for launchable test. + (let ((loop-list (runs:process-expanded-tests hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode))) + (apply loop loop-list))) ;; case where an items came in as a list been processed ((and (list? items) ;; thus we know our items are already calculated (not itemdat)) ;; and not yet expanded into the list of things to be done (debug:print-info 4 "INNER COND: (and (list? items)(not itemdat))")