Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -144,10 +144,21 @@ (define (runs:inc-cant-run-tests testname) (hash-table-set! *seen-cant-run-tests* testname (+ (hash-table-ref/default *seen-cant-run-tests* testname 0) 1))) (define (runs:can-keep-running? testname n) (< (hash-table-ref/default *seen-cant-run-tests* testname 0) n)) + +(define *runs:denoise* (make-hash-table)) ;; key => last-time-ran + +(define (runs:lownoise key waitval) + (let ((lasttime (hash-table-ref/default *runs:denoise* key 0)) + (currtime (current-seconds))) + (if (> (- currtime lasttime) waitval) + (begin + (hash-table-set! *runs:denoise* key currtime) + #t) + #f))) (define (runs:can-run-more-tests jobgroup max-concurrent-jobs) (thread-sleep! (cond ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while (else 0))) @@ -164,12 +175,13 @@ (list #f num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit) (let ((can-not-run-more (cond ;; if max-concurrent-jobs is set and the number running is greater ;; than it than cannot run more jobs ((and max-concurrent-jobs (>= num-running max-concurrent-jobs)) - (debug:print 0 "WARNING: Max running jobs exceeded, current number running: " num-running - ", max_concurrent_jobs: " max-concurrent-jobs) + (if (runs:lownoise "mcj msg" 60) + (debug:print 0 "WARNING: Max running jobs exceeded, current number running: " num-running + ", max_concurrent_jobs: " max-concurrent-jobs)) #t) ;; if job-group-limit is set and number of jobs in the group is greater ;; than the limit then cannot run more jobs of this kind ((and job-group-limit (>= num-running-in-jobgroup job-group-limit)) @@ -353,11 +365,11 @@ (cdr reg) (if (null? tal) ;; if tal is null and reg not full then '() as reg contents moved to tal '() reg))) -(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen) +(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry) (let* ((loop-list (list hed tal reg reruns)) (prereqs-not-met (mt:get-prereqs-not-met run-id waitons item-path mode: testmode)) (fails (runs:calc-fails prereqs-not-met)) (non-completed (runs:calc-not-completed prereqs-not-met))) (debug:print-info 4 "START OF INNER COND #2 " @@ -374,10 +386,20 @@ "\n can-run-more: " can-run-more) (cond ;; all prereqs met, fire off the test ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch + + ((member (hash-table-ref/default test-registry (runs:make-full-test-name hed item-path) 'n/a) + '(COMPLETED INCOMPLETE)) ;; try to catch repeat processing of COMPLETED tests here + (debug:print-info 1 "Test " hed " is " (hash-table-ref test-registry (runs:make-full-test-name hed item-path)) ". Removing it from the queue") + (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns)) + + ;; ((or (null? prereqs-not-met) (and (eq? testmode 'toplevel) (null? non-completed))) (debug:print-info 4 "runs:expand-items: (or (null? prereqs-not-met) (and (eq? testmode 'toplevel)(null? non-completed)))") (let ((test-name (tests:testqueue-get-testname test-record))) @@ -393,43 +415,46 @@ (debug:print 0 "ERROR: The proc from reading the setup did not yield a list - please report this") (exit 1)))))) ((and (null? fails) (not (null? non-completed))) - (debug:print-info 1 "no fails in prerequisites for " hed ", waiting on tests; " - (string-intersperse (map (lambda (x) - (if (string? x) - x - (runs:make-full-test-name (db:test-get-testname x) - (db:test-get-item-path x)))) - non-completed) ", ") - ", delay launching " hed) + ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal ;; but also lets check that the prerequisites are all in the newtal or reruns lists - - - - ;; (let* ((allinqueue (map (lambda (x)(if (string? x) x (db:test-get-testname x))) - ;; (append newtal reruns))) - ;; (prereqstrs (map (lambda (x)(if (string? x) x (db:test-get-testname x))) - ;; prereqs-not-met)) - ;; (notinqueue (filter (lambda (x) - ;; (not (member x allinqueue))) - ;; prereqstrs))) - ;; (if (null? notinqueue) - ;; (if (runs:can-keep-running? hed 5) - ;; (begin - ;; (runs:inc-cant-run-tests hed) - ;; (list (car newtal)(append (cdr newtal) reg) '() reruns)) - ;; (begin - ;; (debug:print 0 "WARNING: dropping " hed " from queue as it has prerequisites missing from the queue: " (string-intersperse notinqueue ", ")) - ;; (list (runs:queue-next-hed tal reg reglen regfull) - ;; (runs:queue-next-tal tal reg reglen regfull) - ;; (runs:queue-next-reg tal reg reglen regfull) - ;; reruns))) - (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; )) ;; an issue with prereqs not yet met? + + (let* ((allinqueue (map (lambda (x)(if (string? x) x (db:test-get-testname x))) + (append newtal reruns))) + (prereqstrs (map (lambda (x)(if (string? x) x (db:test-get-testname x))) + prereqs-not-met)) + (notinqueue (filter (lambda (x) + (not (member x allinqueue))) + prereqstrs))) + (if (null? notinqueue) + (if (runs:can-keep-running? hed 5) ;; try five times + (begin + (runs:inc-cant-run-tests hed) + (list (car newtal)(append (cdr newtal) reg) '() reruns)) + (begin + (if (runs:lownoise (conc "no fails prereq, null notinqueue " hed) 30) + (debug:print 1 "WARNING: test " hed " has no failed prerequisites but does have prerequistes that are NOT in the queue: " (string-intersperse notinqueue ", "))) + (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns))) + ;; have prereqs in queue, keep going. + (begin + (if (runs:lownoise (conc "no fails prereq " hed) 30) + (debug:print-info 1 "no fails in prerequisites for " hed ", waiting on tests; " + (string-intersperse (map (lambda (x) + (if (string? x) + x + (runs:make-full-test-name (db:test-get-testname x) + (db:test-get-item-path x)))) + non-completed) ", ") + ". Delaying launch of " hed ".")) + (list (car newtal)(append (cdr newtal) reg) '() reruns))))) ;; an issue with prereqs not yet met? ((and (null? fails) (null? non-completed)) (if (runs:can-keep-running? hed 5) (begin @@ -453,14 +478,31 @@ (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) (cons hed reruns)) #f)) ;; #f flags do not loop - + ((and (not (null? fails))(eq? testmode 'toplevel)) + (if (or (not (null? reg))(not (null? tal))) + (list (car newtal)(append (cdr newtal) reg) '() reruns) + #f)) (else - (debug:print 1 "ERROR: No handler for this condition.") - (list (car newtal)(cdr newtal) reg reruns))))) + (debug:print 1 "WARNING: FAILS or incomplete tests are preventing completion of this run. Dropping test " hed " from the run queue") + (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns))))) ;; (list (car newtal)(cdr newtal) reg reruns))))) + +(define (runs:mixed-list-testname-and-testrec->list-of-strings inlst) + (map (lambda (t) + (cond + ((vector? t) + (conc (db:test-get-state t) "/" (db:test-get-status t))) + ((string? t) + t) + (else + (conc t)))) + inlst)) (define (runs:process-expanded-tests hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode test-patts required-tests test-registry registry-mutex flags keyvals run-info newtal all-tests-registry) (let* ((run-limits-info (runs:can-run-more-tests jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (have-resources (car run-limits-info)) (num-running (list-ref run-limits-info 1)) @@ -478,11 +520,11 @@ (conc (db:test-get-state t) "/" (db:test-get-status t)) (conc " WARNING: t is not a vector=" t ))) prereqs-not-met) ", ") ") fails: " fails) (if (not (null? prereqs-not-met)) - (debug:print-info 1 "waiting on tests; " (string-intersperse prereqs-not-met ", "))) + (debug:print-info 1 "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) ;; Don't know at this time if the test have been launched at some time in the past ;; i.e. is this a re-launch? (debug:print-info 4 "run-limits-info = " run-limits-info) @@ -548,11 +590,12 @@ (list hed tal reg reruns)) ;; If no resources are available just kill time and loop again ;; ((not have-resources) ;; simply try again after waiting a second - (debug:print-info 1 "no resources to run new tests, waiting ...") + (if (runs:lownoise "no resources" 60) + (debug:print-info 1 "no resources to run new tests, waiting ...")) ;; Have gone back and forth on this but db starvation is an issue. ;; wait one second before looking again to run jobs. (thread-sleep! 1) ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (list (car newtal)(cdr newtal) reg reruns)) @@ -757,11 +800,11 @@ ;; EXPAND ITEMS ((or (procedure? items)(eq? items 'have-procedure)) (let ((can-run-more (runs:can-run-more-tests jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) - (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen))) + (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry))) (if loop-list (apply loop loop-list))) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) @@ -970,11 +1013,13 @@ 600) ;; i.e. no update for more than 600 seconds (begin (debug:print 0 "WARNING: Test " test-name " appears to be dead. Forcing it to state INCOMPLETE and status STUCK/DEAD") (tests:test-set-status! test-id "INCOMPLETE" "STUCK/DEAD" "Test is stuck or dead" #f)) (debug:print 2 "NOTE: " test-name " is already running"))) - (else (debug:print 0 "ERROR: Failed to launch test " full-test-name ". Unrecognised state " (test:get-state testdat))))))) + (else + (debug:print 0 "ERROR: Failed to launch test " full-test-name ". Unrecognised state " (test:get-state testdat)) + (hash-table-set! test-registry (runs:make-full-test-name test-name test-path)(string->symbol (test:get-state testdat)))))))) ;;====================================================================== ;; END OF NEW STUFF ;;======================================================================