Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -184,20 +184,23 @@ (hash-table-set! *runs:denoise* key currtime) #t) #f))) (define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) + ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) - (thread-sleep! (cond + + (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move to cond clauses below where we determine we have too many jobs running rather than each time the and condition above is true (which seems like always)? ((> (runs:dat-can-run-more-tests-count runsdat) 20) (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ...")) (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.1) ;; was 2 );; obviously haven't had any work to do for a while (else 0))) + (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (config-lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) @@ -208,11 +211,11 @@ (begin (debug:print 2 *default-log-port* "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running) (set! *last-num-running-tests* num-running))) (if (not (eq? 0 *globalexitstatus*)) (list #f num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit) - (let ((can-not-run-more (cond + (let* ((can-not-run-more (cond ;; if max-concurrent-jobs is set and the number running is greater ;; than it then cannot run more jobs ((and max-concurrent-jobs (>= num-running max-concurrent-jobs)) (if (runs:lownoise "mcj msg" 60) (debug:print 0 *default-log-port* "WARNING: Max running jobs exceeded, current number running: " num-running @@ -718,13 +721,14 @@ (cond ;; all prereqs met, fire off the test ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch - ((and (not (member 'toplevel testmode)) + ((and (not (member 'toplevel testmode)) (member (hash-table-ref/default test-registry (db:test-make-full-name hed item-path) 'n/a) '(DONOTRUN removed CANNOTRUN))) ;; *common:cant-run-states-sym*) ;; '(COMPLETED KILLED WAIVED UNKNOWN INCOMPLETE)) ;; try to catch repeat processing of COMPLETED tests here + (debug:print-info 4 *default-log-port* "cond branch - " "ei-1") (debug:print-info 1 *default-log-port* "Test " hed " set to \"" (hash-table-ref test-registry (db:test-make-full-name hed item-path)) "\". Removing it from the queue") (if (or (not (null? tal)) (not (null? reg))) (runs:loop-values tal reg reglen regfull reruns) (begin @@ -747,13 +751,14 @@ ;; case 2 - mode is toplevel ;; - prereqs are completed. ;; - or no prereqs can complete ;; case 3 - mode not specified ;; - prereqs are completed and passed (we could consider removing "and passed" -- it would change behavior from current) - ((or (null? prereqs-not-met) + ((or (null? prereqs-not-met) (and (member 'toplevel testmode) (null? non-completed))) + (debug:print-info 4 *default-log-port* "cond branch - " "ei-2") (debug:print-info 4 *default-log-port* "runs:expand-items: (or (null? prereqs-not-met) (and (member 'toplevel testmode)(null? non-completed)))") (let ((test-name (tests:testqueue-get-testname test-record))) (setenv "MT_TEST_NAME" test-name) ;; (setenv "MT_RUNNAME" runname) (runs:set-megatest-env-vars run-id inrunname: runname) ;; these may be needed by the launching process @@ -770,13 +775,14 @@ (list hed tal reg reruns)) (begin (debug:print-error 0 *default-log-port* "The proc from reading the items table did not yield a list - please report this") (exit 1)))))) - ((and (null? fails) + ((and (null? fails) (null? prereq-fails) (not (null? non-completed))) + (debug:print-info 4 *default-log-port* "cond branch - " "ei-3") (let* ((allinqueue (map (lambda (x)(if (string? x) x (db:test-get-testname x))) (append newtal reruns))) ;; prereqstrs is a list of test names as strings that are prereqs for hed (prereqstrs (delete-duplicates (map (lambda (x)(if (string? x) x (db:test-get-testname x))) prereqs-not-met))) @@ -810,13 +816,14 @@ #f (runs:loop-values trimmed-tal trimmed-reg reglen regfull reruns) )) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) - ((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider + ((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider (null? prereq-fails) (null? non-completed)) + (debug:print-info 4 *default-log-port* "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; ;; getting here likely means the system is way overloaded, kill a full minute before continuing @@ -829,14 +836,15 @@ (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while."))) (runs:loop-values tal reg reglen regfull reruns) ))) - ((and + ((and (or (not (null? fails)) (not (null? prereq-fails))) (member 'normal testmode)) + (debug:print-info 4 *default-log-port* "cond branch - " "ei-5") (debug:print-info 1 *default-log-port* "test " hed " (mode=" testmode ") has failed prerequisite(s); " (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ") ", removing it from to-do list") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id @@ -851,15 +859,19 @@ (runs:loop-values tal reg reglen regfull (cons hed reruns)) ) #f)) ;; #f flags do not loop ((and (not (null? fails))(member 'toplevel testmode)) + (debug:print-info 4 *default-log-port* "cond branch - " "ei-6") (if (or (not (null? reg))(not (null? tal))) (list (car newtal)(append (cdr newtal) reg) '() reruns) #f)) - ((null? runnables) #f) ;; if we get here and non-completed is null then it is all over. + ((null? runnables) + (debug:print-info 4 *default-log-port* "cond branch - " "ei-7") + #f) ;; if we get here and non-completed is null then it is all over. (else + (debug:print-info 4 *default-log-port* "cond branch - " "ei-8") (debug:print 0 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now") (list (car newtal)(cdr newtal) reg reruns))))) (define (runs:mixed-list-testname-and-testrec->list-of-strings inlst) (if (null? inlst) @@ -1250,11 +1262,11 @@ ;; Do mark-and-find clean up of db before starting runing of quue ;; ;; (rmt:find-and-mark-incomplete) - (let* ((run-info (rmt:get-run-info run-id)) + (let* ((run-info (rmt:get-run-info run-id)) (tests-info (mt:get-tests-for-run run-id #f '() '())) ;; qryvals: "id,testname,item_path")) (sorted-test-names (tests:sort-by-priority-and-waiton test-records)) (test-registry (make-hash-table)) (registry-mutex (make-mutex)) (num-retries 0) @@ -1305,14 +1317,16 @@ (hash-table-set! test-registry (db:test-make-full-name tn ip) (string->symbol st))))) tests-info) (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100)) (let loop ((hed (car sorted-test-names)) - (tal (cdr sorted-test-names)) + (tal (cdr sorted-test-names)) (reg '()) ;; registered, put these at the head of tal (reruns '())) + + (runs:incremental-print-results run-id) (if (not (null? reruns))(debug:print-info 4 *default-log-port* "reruns=" reruns)) ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes @@ -1396,23 +1410,24 @@ ;; (loop (car tal)(cdr tal) reg reruns)))) (runs:incremental-print-results run-id) (debug:print 4 *default-log-port* "TOP OF LOOP => " "test-name: " test-name - "\n test-record " test-record "\n hed: " hed - "\n itemdat: " itemdat + "\n tal: " tal + "\n reg: " reg + "\n test-record " test-record + "\n itemdat: " itemdat "\n items: " items "\n item-path: " item-path "\n waitons: " waitons "\n num-retries: " num-retries - "\n tal: " tal - "\n reruns: " reruns + "\n reruns: " reruns "\n regfull: " regfull "\n reglen: " reglen "\n length reg: " (length reg) - "\n reg: " reg) + ) ;; check for hed in waitons => this would be circular, remove it and issue an ;; error (if (member test-name waitons) (begin @@ -1432,16 +1447,18 @@ (if (and (not (member waiton tal)) ;; this waiton is not in the list to be tried to run (not (member waiton reruns))) 1 #f)) waitons))))) ;; could do this more elegantly with a marker.... + (debug:print-info 4 *default-log-port* "cond branch - " "rtq-1") (debug:print 0 *default-log-port* "WARNING: Marking test " tfullname " as not runnable. It is waiting on tests that cannot be run. Giving up now.") (hash-table-set! test-registry tfullname 'removed)) ;; items is #f then the test is ok to be handed off to launch (but not before) ;; ((not items) + (debug:print-info 4 *default-log-port* "cond branch - " "rtq-2") (debug:print-info 4 *default-log-port* "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) @@ -1451,10 +1468,11 @@ ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated (not itemdat)) ;; and not yet expanded into the list of things to be done + (debug:print-info 4 *default-log-port* "cond branch - " "rtq-3") (debug:print-info 4 *default-log-port* "OUTER COND: (and (list? items)(not itemdat))") ;; Must determine if the items list is valid. Discard the test if it is not. (if (and (list? items) (> (length items) 0) (and (list? (car items)) @@ -1466,26 +1484,37 @@ (string-intersperse varval "=")) row) " ") "\n")) items))) - (for-each - (lambda (my-itemdat) - (let* ((new-test-record (let ((newrec (make-tests:testqueue))) - (vector-copy! test-record newrec) - newrec)) - (my-item-path (item-list->path my-itemdat))) - (if (tests:match test-patts hed my-item-path required: required-tests) ;; (patt-list-match my-item-path item-patts) ;; yes, we want to process this item, NOTE: Should not need this check here! - (let ((newtestname (db:test-make-full-name hed my-item-path))) ;; test names are unique on testname/item-path - (tests:testqueue-set-items! new-test-record #f) - (tests:testqueue-set-itemdat! new-test-record my-itemdat) - (tests:testqueue-set-item_path! new-test-record my-item-path) - (hash-table-set! test-records newtestname new-test-record) - (set! tal (append tal (list newtestname))))))) ;; since these are itemized create new test names testname/itempath - items) - - ;; (debug:print-info 0 *default-log-port* "Test " (tests:testqueue-get-testname test-record) " is itemized but has no items") + + (let* ((items-in-testpatt + (filter + (lambda (my-itemdat) + (tests:match test-patts hed (item-list->path my-itemdat) required: required-tests)) + items) )) + (if (null? items-in-testpatt) + (let ((test-id (rmt:get-test-id run-id test-name ""))) + (debug:print-info 0 *default-log-port* "Test " (tests:testqueue-get-testname test-record) " is itemized but has no items matching test pattern -- marking status ZERO_ITEMS") + (if test-id + (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "ZERO_ITEMS" "This test has no items which match test pattern."))) + + (for-each (lambda (my-itemdat) + (let* ((new-test-record (let ((newrec (make-tests:testqueue))) + (vector-copy! test-record newrec) + newrec)) + (my-item-path (item-list->path my-itemdat)) + (newtestname (db:test-make-full-name hed my-item-path))) ;; test names are unique on testname/item-path + (tests:testqueue-set-items! new-test-record #f) + (tests:testqueue-set-itemdat! new-test-record my-itemdat) + (tests:testqueue-set-item_path! new-test-record my-item-path) + (hash-table-set! test-records newtestname new-test-record) + (set! tal (append tal (list newtestname))))) ;; since these are itemized create new test names testname/itempath + items-in-testpatt))) + + + ;; At this point we have possibly added items to tal but all must be handed off to ;; INNER COND logic. I think loop without rotating the queue ;; (loop hed tal reg reruns)) ;; (let ((newtal (append tal (list hed)))) ;; We should discard hed as it has been expanded into it's items? Yes, but only if this *is* an itemized test @@ -1495,25 +1524,31 @@ (loop (car tal)(cdr tal) reg reruns))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS - ((or (procedure? items)(eq? items 'have-procedure)) ;; BB - target vars are env vars here? to allow expansion of [items]\nsomething [system echo $SOMETARGVAR], which is wonky + ((or (procedure? items)(eq? items 'have-procedure)) + (debug:print-info 4 *default-log-port* "cond branch - " "rtq-4") (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here (if loop-list - (apply loop loop-list))) + (apply loop loop-list) + (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed) + ) + ) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) + (debug:print-info 4 *default-log-port* "cond branch - " "rtq-5") (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this") (exit 1)) ((not (null? reruns)) + (debug:print-info 4 *default-log-port* "cond branch - " "rtq-6") (let* ((newlst (tests:filter-non-runnable run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED, (junked (lset-difference equal? tal newlst))) (debug:print-info 4 *default-log-port* "full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal) (if (< num-retries max-retries) (set! newlst (append reruns newlst))) @@ -1521,15 +1556,18 @@ ;; (thread-sleep! (+ 1 *global-delta*)) (if (not (null? newlst)) ;; since reruns have been tacked on to newlst create new reruns from junked (loop (car newlst)(cdr newlst) reg (delete-duplicates junked))))) ((not (null? tal)) + (debug:print-info 4 *default-log-port* "cond branch - " "rtq-7") (debug:print-info 4 *default-log-port* "I'm pretty sure I shouldn't get here.")) ((not (null? reg)) ;; could we get here with leftovers? + (debug:print-info 4 *default-log-port* "cond branch - " "rtq-8") (debug:print-info 0 *default-log-port* "Have leftovers!") (loop (car reg)(cdr reg) '() reruns)) (else + (debug:print-info 4 *default-log-port* "cond branch - " "rtq-9") (debug:print-info 4 *default-log-port* "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) ))) ;; end loop on sorted test names ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode)