Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -278,10 +278,11 @@ (if (not (null? required-tests)) (debug:print 1 "INFO: Adding " required-tests " to the run queue")) ;; NOTE: these are all parent tests, items are not expanded yet. (debug:print 4 "INFO: test-records=" (hash-table->alist test-records)) (runs:run-tests-queue run-id runname test-records keyvallst flags) + (debug:print 1 "INFO: running queue one more time to catch any changed test states") (runs:run-tests-queue run-id runname test-records keyvallst flags) (debug:print 4 "INFO: All done by here"))) ;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... > (define (runs:run-tests-queue run-id runname test-records keyvallst flags) @@ -288,14 +289,17 @@ ;; At this point the list of parent tests is expanded ;; NB// Should expand items here and then insert into the run queue. (debug:print 5 "test-records: " test-records ", keyvallst: " keyvallst " flags: " (hash-table->alist flags)) (let ((sorted-test-names (tests:sort-by-priority-and-waiton test-records)) (item-patts (hash-table-ref/default flags "-itempatt" #f)) - (test-registery (make-hash-table))) + (test-registery (make-hash-table)) + (num-retries 0)) (if (not (null? sorted-test-names)) (let loop ((hed (car sorted-test-names)) - (tal (cdr sorted-test-names))) + (tal (cdr sorted-test-names)) + (reruns '())) + (if (not (null? reruns))(debug:print 4 "INFO: reruns=" reruns)) (let* ((test-record (hash-table-ref test-records hed)) (test-name (tests:testqueue-get-testname test-record)) (tconfig (tests:testqueue-get-testconfig test-record)) (testmode (let ((m (config-lookup tconfig "requirements" "mode"))) (if m (string->symbol m) 'normal))) @@ -305,11 +309,10 @@ (items (tests:testqueue-get-items test-record)) (item-path (item-list->path itemdat)) (newtal (append tal (list hed))) (calc-fails (lambda (prereqs-not-met) (filter (lambda (test) - (debug:print 9 "test: " test) (and (vector? test) ;; not (string? test)) (equal? (db:test-get-state test) "COMPLETED") (not (member (db:test-get-status test) '("PASS" "WARN" "CHECK" "WAIVED"))))) prereqs-not-met))) @@ -326,15 +329,16 @@ (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)))) (debug:print 6 "test-name: " test-name - "\n hed: " hed + "\n hed: " hed "\n itemdat: " itemdat - "\n items: " items - "\n item-path: " item-path - "\n waitons: " waitons) + "\n items: " items + "\n item-path: " item-path + "\n waitons: " waitons + "\n num-retries: " num-retries) ;; check for hed in waitons => this would be circular, remove it and issue an ;; error (if (member test-name waitons) (begin @@ -361,20 +365,20 @@ ((not (patt-list-match item-path item-patts)) ;; else the run is stuck, temporarily or permanently ;; but should check if it is due to lack of resources vs. prerequisites (debug:print 1 "INFO: Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " item-patts) (if (not (null? tal)) - (loop (car tal)(cdr tal)))) + (loop (car tal)(cdr tal) reruns))) ((not (hash-table-ref/default test-registery (conc test-name "/" item-path) #f)) (open-run-close db:tests-register-test #f run-id test-name item-path) (hash-table-set! test-registery (conc test-name "/" item-path) #t) - (loop (car newtal)(cdr newtal))) + (loop (car newtal)(cdr newtal) reruns)) ((not have-resources) ;; simply try again after waiting a second (thread-sleep! (+ 1 *global-delta*)) (debug:print 1 "INFO: no resources to run new tests, waiting ...") ;; could have done hed tal here but doing car/cdr of newtal to rotate tests - (loop (car newtal)(cdr newtal))) + (loop (car newtal)(cdr newtal) reruns)) ((and have-resources (or (null? prereqs-not-met) (and (eq? testmode 'toplevel) (null? non-completed)))) ;; no loop here, just drop though and use the loop at the bottom @@ -387,20 +391,20 @@ (begin ;; couldn't run, take a breather (debug:print 4 "INFO: Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...") (thread-sleep! (+ 1 *global-delta*)) ;; long sleep here - no resources, may as well be patient ;; we made new tal by sticking hed at the back of the list - (loop (car newtal)(cdr newtal))) + (loop (car newtal)(cdr newtal) reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (if (not (null? tal)) (if (vector? hed) (begin (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed) " from the launch list as it has prerequistes that are FAIL") - (loop (car tal)(cdr tal))) + (loop (car tal)(cdr tal) (cons hed reruns))) (begin (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") - (loop hed tal))))))))) + (loop hed tal reruns))))))))) ;; case where an items came in as a list been processed ((and (list? items) ;; thus we know our items are already calculated (not itemdat)) ;; and not yet expanded into the list of things to be done (if (and (>= *verbosity* 1) @@ -424,11 +428,11 @@ (tests:testqueue-set-item_path! new-test-record my-item-path) (hash-table-set! test-records newtestname new-test-record) (set! tal (cons newtestname tal)))))) ;; since these are itemized create new test names testname/itempath items) (if (not (null? tal)) - (loop (car tal)(cdr tal)))) + (loop (car tal)(cdr tal) reruns))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ((or (procedure? items)(eq? items 'have-procedure)) (let ((can-run-more (open-run-close runs:can-run-more-tests #f test-record))) @@ -435,16 +439,19 @@ (if can-run-more (let* ((prereqs-not-met (open-run-close db:get-prereqs-not-met #f run-id waitons item-path mode: testmode)) (fails (calc-fails prereqs-not-met)) (non-completed (calc-not-completed prereqs-not-met))) (debug:print 8 "INFO: can-run-more: " can-run-more + "\n testname: " hed "\n prereqs-not-met: " (pretty-string prereqs-not-met) "\n non-completed: " (pretty-string non-completed) "\n fails: " (pretty-string fails) "\n testmode: " testmode - "\n (eq? testmode 'toplevel) " (eq? testmode 'toplevel) - "\n (null? non-completed) " (null? non-completed)) + "\n num-retries: " num-retries + "\n (eq? testmode 'toplevel): " (eq? testmode 'toplevel) + "\n (null? non-completed): " (null? non-completed) + "\n reruns: " reruns) (cond ((or (null? prereqs-not-met) ;; all prereqs met, fire off the test ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch (and (eq? testmode 'toplevel) (null? non-completed))) @@ -454,34 +461,36 @@ (open-run-close-measure set-megatest-env-vars #f run-id) ;; these may be needed by the launching process (let ((items-list (items:get-items-from-config tconfig))) (if (list? items-list) (begin (tests:testqueue-set-items! test-record items-list) - (loop hed tal)) + (loop hed tal reruns)) (begin (debug:print 0 "ERROR: The proc from reading the setup did not yield a list - please report this") (exit 1)))))) ((null? fails) - (loop (car newtal)(cdr newtal))) ;; an issue with prereqs not yet met? + (debug:print 4 "INFO: fails is null, moving on in the queue but keeping " hed " for now") + (loop (car newtal)(cdr newtal) reruns)) ;; an issue with prereqs not yet met? ((and (not (null? fails))(eq? testmode 'normal)) (debug:print 1 "INFO: test " hed " (mode=" testmode ") has failed prerequisite(s); " (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ") ", removing it from to-do list") (if (not (null? tal)) - (loop (car tal)(cdr tal)))) + (loop (car tal)(cdr tal)(cons hed reruns)))) (else (debug:print 8 "ERROR: No handler for this condition.") ;; "\n hed: " hed ;; "\n fails: " (string-intersperse (map db:test-get-testname fails) ",") ;; "\n testmode: " testmode ;; "\n prereqs-not-met: " (pretty-string prereqs-not-met) ;; "\n items: " items) - (loop (car newtal)(cdr newtal))))) + (loop (car newtal)(cdr newtal) reruns)))) ;; if can't run more just loop with next possible test (begin + (debug:print 4 "INFO: processing the case with a lambda for items or 'have-procedure. Moving through the queue without dropping " hed) (thread-sleep! (+ 1 *global-delta*)) - (loop (car newtal)(cdr newtal)))))) + (loop (car newtal)(cdr newtal) reruns))))) ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) (debug:print 0 "ERROR: Should not have a list of items in a test and the itemspath set - please report this") (exit 1)))) @@ -495,14 +504,20 @@ ;; FIXME! This harsh exit should not be necessary.... (if (not *runremote*)(exit)) ;; #f) ;; return a #f as a hint that we are done ;; Here we need to check that all the tests remaining to be run are eligible to run ;; and are not blocked by failed - (let ((newlst (open-run-close tests:filter-non-runnable #f run-id tal test-records))) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED, + (let* ((newlst (open-run-close tests:filter-non-runnable #f run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED, + (junked (lset-difference equal? tal newlst))) + (debug:print 4 "INFO: full drop through, if reruns is less than 100 we will force retry them: " reruns) + (if (< num-retries 100) + (set! newlst (append reruns newlst))) + (set! num-retries (+ num-retries 1)) (thread-sleep! *global-delta*) (if (not (null? newlst)) - (loop (car newlst)(cdr newlst))))))))) + ;; since reruns have been tacked on to newlst create new reruns from junked + (loop (car newlst)(cdr newlst)(delete-duplicates junked))))))))) ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step (define (run:test run-id runname keyvallst test-record flags parent-test) ;; All these vars might be referenced by the testconfig file reader (let* ((test-name (tests:testqueue-get-testname test-record)) Index: tests/Makefile ================================================================== --- tests/Makefile +++ tests/Makefile @@ -27,10 +27,12 @@ test5 : fullprep cd fullrun;$(MEGATEST) -runall -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_aa -debug $(DEBUG) > aa.log 2> aa.log & cd fullrun;$(MEGATEST) -runall -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_ab -debug $(DEBUG) > ab.log 2> ab.log & cd fullrun;$(MEGATEST) -runall -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_ac -debug $(DEBUG) > ac.log 2> ac.log & cd fullrun;$(MEGATEST) -runall -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_ad -debug $(DEBUG) > ad.log 2> ad.log & + cd fullrun;$(MEGATEST) -runall -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_ae -debug $(DEBUG) > ae.log 2> ae.log & + cd fullrun;$(MEGATEST) -runall -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_af -debug $(DEBUG) > af.log 2> af.log & test6: fullprep cd fullrun;$(MEGATEST) -runtests runfirst -itempatt %/1 -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_itempatt -v cd fullrun;$(MEGATEST) -runtests runfirst -itempatt %blahha% -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_itempatt -debug 10