Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -55,10 +55,13 @@ db.o ezsteps.o keys.o launch.o megatest.o monitor.o runs-for-ref.o runs.o tests.o : key_records.scm tests.o tasks.o dashboard-tasks.o : task_records.scm runs.o : test_records.scm megatest.o : megatest-fossil-hash.scm +# Temporary while transitioning to new routine +runs.o : run-tests-queue-classic.scm run-tests-queue-new.scm + megatest-fossil-hash.scm : $(SRCFILES) megatest.scm *_records.scm echo "(define megatest-fossil-hash \"$(MTESTHASH)\")" > megatest-fossil-hash.new if ! diff -q megatest-fossil-hash.new megatest-fossil-hash.scm ; then echo copying .new to .scm;cp -f megatest-fossil-hash.new megatest-fossil-hash.scm;fi $(OFILES) $(GOFILES) : common_records.scm Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -126,11 +126,11 @@ (change-directory *toppath*) (set-megatest-env-vars run-id) ;; these may be needed by the launching process (change-directory work-area) - (open-run-close set-run-config-vars #f run-id keys keyvals) + (set-run-config-vars run-id keys keyvals target) ;; (db:get-target db run-id)) ;; environment overrides are done *before* the remaining critical envars. (alist->env-vars env-ovrd) (set-megatest-env-vars run-id) (set-item-env-vars itemdat) (save-environment-as-files "megatest") @@ -336,11 +336,11 @@ (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN")) (else "FAIL")) (args:get-arg "-m") #f))) ;; for automated creation of the rollup html file this is a good place... (if (not (equal? item-path "")) - (open-run-close tests:summarize-items #f run-id test-name #f)) ;; don't force - just update if no + (tests:summarize-items #f run-id test-name #f)) ;; don't force - just update if no ) (mutex-unlock! m) ;; (exec-results (cmd-run->list fullrunscript)) ;; (list ">" (conc test-name "-run.log")))) ;; (success exec-results)) ;; (eq? 
(cadr exec-results) 0))) (debug:print 2 "Output from running " fullrunscript ", pid " (vector-ref exit-info 0) " in work area " @@ -406,18 +406,16 @@ ;; ;; All log file links should be stored relative to the top of link path ;; ;; - [ - ] ;; -(define (create-work-area db run-id test-id test-src-path disk-path testname itemdat) - (let* ((run-info (cdb:remote-run db:get-run-info #f run-id)) - (item-path (item-list->path itemdat)) +(define (create-work-area run-id run-info key-vals test-id test-src-path disk-path testname itemdat) + (let* ((item-path (item-list->path itemdat)) (runname (db:get-value-by-header (db:get-row run-info) (db:get-header run-info) "runname")) ;; convert back to db: from rdb: - this is always run at server end - (key-vals (cdb:remote-run db:get-key-vals #f run-id)) (target (string-intersperse key-vals "/")) (not-iterated (equal? "" item-path)) ;; all tests are found at /test-base or /test-base @@ -556,11 +554,11 @@ ;; 3. create link from run dir to megatest runs area ;; 4. 
remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) -(define (launch-test db run-id runname test-conf keyvallst test-name test-path itemdat params) +(define (launch-test test-id run-id run-info key-vals runname test-conf keyvallst test-name test-path itemdat params) (change-directory *toppath*) (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute" (list ;; (list "MT_TEST_RUN_DIR" work-area) (list "MT_RUN_AREA_HOME" *toppath*) (list "MT_TEST_NAME" test-name) @@ -595,11 +593,11 @@ (diskpath #f) (cmdparms #f) (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) (mt-bindir-path #f) (item-path (item-list->path itemdat)) - (test-id (cdb:remote-run db:get-test-id #f run-id test-name item-path)) + ;; (test-id (cdb:remote-run db:get-test-id #f run-id test-name item-path)) (testinfo (cdb:get-test-info-by-id *runremote* test-id)) (mt_target (string-intersperse (map cadr keyvallst) "/")) (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) (if (args:get-arg "-logging")(list "-logging") '())))) (if hosts (set! hosts (string-split hosts))) @@ -608,11 +606,11 @@ (set! mt-bindir-path (pathname-directory remote-megatest)) (if launcher (set! launcher (string-split launcher))) ;; set up the run work area for this test (set! diskpath (get-best-disk *configdat*)) (if diskpath - (let ((dat (open-run-close create-work-area db run-id test-id test-path diskpath test-name itemdat))) + (let ((dat (create-work-area run-id run-info key-vals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) (debug:print-info 2 "Using work area " work-area)) (begin (set! 
work-area (conc test-path "/tmp_run")) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -1,7 +1,7 @@ ;; Always use two digit decimal ;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.5418) +(define megatest-version 1.5419) ADDED run-tests-queue-classic.scm Index: run-tests-queue-classic.scm ================================================================== --- /dev/null +++ run-tests-queue-classic.scm @@ -0,0 +1,298 @@ + +;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... > +(define (runs:run-tests-queue-classic run-id runname test-records keyvallst flags test-patts) + ;; At this point the list of parent tests is expanded + ;; NB// Should expand items here and then insert into the run queue. + (debug:print 5 "test-records: " test-records ", keyvallst: " keyvallst " flags: " (hash-table->alist flags)) + (let ((run-info (cdb:remote-run db:get-run-info #f run-id)) + (key-vals (cdb:remote-run db:get-key-vals #f run-id)) + (sorted-test-names (tests:sort-by-priority-and-waiton test-records)) + (test-registry (make-hash-table)) + (registry-mutex (make-mutex)) + (num-retries 0) + (max-retries (config-lookup *configdat* "setup" "maxretries")) + (max-concurrent-jobs (let ((mcj (config-lookup *configdat* "setup" "max_concurrent_jobs"))) + (if (and mcj (string->number mcj)) + (string->number mcj) + 1)))) + (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100)) + (if (not (null? sorted-test-names)) + (let loop ((hed (car sorted-test-names)) + (tal (cdr sorted-test-names)) + (reruns '())) + (if (not (null? 
reruns))(debug:print-info 4 "reruns=" reruns)) + ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns) + (let* ((test-record (hash-table-ref test-records hed)) + (test-name (tests:testqueue-get-testname test-record)) + (tconfig (tests:testqueue-get-testconfig test-record)) + (testmode (let ((m (config-lookup tconfig "requirements" "mode"))) + (if m (string->symbol m) 'normal))) + (waitons (tests:testqueue-get-waitons test-record)) + (priority (tests:testqueue-get-priority test-record)) + (itemdat (tests:testqueue-get-itemdat test-record)) ;; itemdat can be a string, list or #f + (items (tests:testqueue-get-items test-record)) + (item-path (item-list->path itemdat)) + (newtal (append tal (list hed)))) + + (debug:print 6 + "test-name: " test-name + "\n hed: " hed + "\n itemdat: " itemdat + "\n items: " items + "\n item-path: " item-path + "\n waitons: " waitons + "\n num-retries: " num-retries + "\n tal: " tal + "\n reruns: " reruns) + + ;; a test that names itself in waitons is circular: log an error and strip the + ;; self-reference from waitons so the prereq check below cannot wait on itself + (if (member test-name waitons) + (begin + (debug:print 0 "ERROR: test " test-name " has listed itself as a waiton, please correct this!") + (set! waitons (filter (lambda (x)(not (equal? 
x test-name))) waitons)))) + + (cond ;; OUTER COND + ((not items) ;; when false the test is ok to be handed off to launch (but not before) + (let* ((run-limits-info (runs:can-run-more-tests test-record max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running + (have-resources (car run-limits-info)) + (num-running (list-ref run-limits-info 1)) + (num-running-in-jobgroup (list-ref run-limits-info 2)) + (max-concurrent-jobs (list-ref run-limits-info 3)) + (job-group-limit (list-ref run-limits-info 4)) + (prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode)) + (fails (runs:calc-fails prereqs-not-met)) + (non-completed (runs:calc-not-completed prereqs-not-met))) + (debug:print-info 8 "have-resources: " have-resources " prereqs-not-met: " + (string-intersperse + (map (lambda (t) + (if (vector? t) + (conc (db:test-get-state t) "/" (db:test-get-status t)) + (conc " WARNING: t is not a vector=" t ))) + prereqs-not-met) ", ") " fails: " fails) + (debug:print-info 4 "hed=" hed "\n test-record=" test-record "\n test-name: " test-name "\n item-path: " item-path "\n test-patts: " test-patts) + + ;; Don't know at this time if the test have been launched at some time in the past + ;; i.e. is this a re-launch? + (debug:print-info 4 "run-limits-info = " run-limits-info) + (cond ;; INNER COND #1 for a launchable test + ;; Check item path against item-patts + ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path)) ;; This test/itempath is not to be run + ;; else the run is stuck, temporarily or permanently + ;; but should check if it is due to lack of resources vs. prerequisites + (debug:print-info 1 "Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " test-patts) + ;; (thread-sleep! *global-delta*) + (if (not (null? 
tal)) + (loop (car tal)(cdr tal) reruns))) + ;; Registry has been started for this test but has not yet completed + ;; this should be rare, the case where there are only a couple of tests and the db is slow + ;; delay a short while and continue + ;; ((eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f) 'start) + ;; (thread-sleep! 0.01) + ;; (loop (car newtal)(cdr newtal) reruns)) + ;; count number of 'done, if more than 100 then skip on through. + (;; (and (< (length (filter (lambda (x)(eq? x 'done))(hash-table-values test-registry))) 100) ;; why get more than 200 ahead? + (not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) ;; ) ;; too many changes required. Implement later. + (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) + ;; NEED TO THREADIFY THIS + (let ((th (make-thread (lambda () + (mutex-lock! registry-mutex) + (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'start) + (mutex-unlock! registry-mutex) + ;; If haven't done it before register a top level test if this is an itemized test + (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done)) + (cdb:tests-register-test *runremote* run-id test-name "")) + (cdb:tests-register-test *runremote* run-id test-name item-path) + (mutex-lock! registry-mutex) + (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done) + (mutex-unlock! registry-mutex)) + (conc test-name "/" item-path)))) + (thread-start! th)) + ;; TRY (thread-sleep! *global-delta*) + (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + (loop (car newtal)(cdr newtal) reruns)) + ;; At this point *all* test registrations must be completed. + ((not (null? (filter (lambda (x)(eq? 
'start x))(hash-table-values test-registry)))) + (debug:print-info 0 "Waiting on test registrations: " (string-intersperse + (filter (lambda (x) + (eq? (hash-table-ref/default test-registry x #f) 'start)) + (hash-table-keys test-registry)) + ", ")) + (thread-sleep! 0.1) + (loop hed tal reruns)) + ((not have-resources) ;; simply try again after waiting a second + (debug:print-info 1 "no resources to run new tests, waiting ...") + ;; Have gone back and forth on this but db starvation is an issue. + ;; wait one second before looking again to run jobs. + (thread-sleep! 1) ;; (+ 2 *global-delta*)) + ;; could have done hed tal here but doing car/cdr of newtal to rotate tests + (loop (car newtal)(cdr newtal) reruns)) + ((and have-resources + (or (null? prereqs-not-met) + (and (eq? testmode 'toplevel) + (null? non-completed)))) + (run:test run-id run-info key-vals runname keyvallst test-record flags #f) + (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'running) + (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + ;; (thread-sleep! *global-delta*) + (if (not (null? tal)) + (loop (car tal)(cdr tal) reruns))) + (else ;; must be we have unmet prerequisites + (debug:print 4 "FAILS: " fails) + ;; If one or more of the prereqs-not-met are FAIL then we can issue + ;; a message and drop hed from the items to be processed. + (if (null? fails) + (begin + ;; couldn't run, take a breather + (debug:print-info 4 "Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...") + ;; (thread-sleep! (+ 0.01 *global-delta*)) ;; long sleep here - no resources, may as well be patient + ;; we made new tal by sticking hed at the back of the list + (loop (car newtal)(cdr newtal) reruns)) + ;; the waiton is FAIL so no point in trying to run hed ever again + (if (not (null? tal)) + (if (vector? 
hed) + (begin + (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed) + " from the launch list as it has prerequistes that are FAIL") + (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + ;; (thread-sleep! *global-delta*) + (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'removed) + (loop (car tal)(cdr tal) (cons hed reruns))) + (begin + (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") + (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + ;; (thread-sleep! (+ 0.01 *global-delta*)) + (loop hed tal reruns))))))))) ;; END OF INNER COND + + ;; case where an items came in as a list been processed + ((and (list? items) ;; thus we know our items are already calculated + (not itemdat)) ;; and not yet expanded into the list of things to be done + (if (and (debug:debug-mode 1) ;; (>= *verbosity* 1) + (> (length items) 0) + (> (length (car items)) 0)) + (pp items)) + (for-each + (lambda (my-itemdat) + (let* ((new-test-record (let ((newrec (make-tests:testqueue))) + (vector-copy! test-record newrec) + newrec)) + (my-item-path (item-list->path my-itemdat))) + (if (tests:match test-patts hed my-item-path) ;; (patt-list-match my-item-path item-patts) ;; yes, we want to process this item, NOTE: Should not need this check here! + (let ((newtestname (runs:make-full-test-name hed my-item-path))) ;; test names are unique on testname/item-path + (tests:testqueue-set-items! new-test-record #f) + (tests:testqueue-set-itemdat! new-test-record my-itemdat) + (tests:testqueue-set-item_path! new-test-record my-item-path) + (hash-table-set! test-records newtestname new-test-record) + (set! tal (cons newtestname tal)))))) ;; since these are itemized create new test names testname/itempath + items) + (if (not (null? 
tal)) + (begin + (debug:print-info 4 "End of items list, looping with next after short delay") + ;; (thread-sleep! (+ 0.01 *global-delta*)) + (loop (car tal)(cdr tal) reruns)))) + + ;; if items is a proc then need to run items:get-items-from-config, get the list and loop + ;; - but only do that if resources exist to kick off the job + ((or (procedure? items)(eq? items 'have-procedure)) + (let ((can-run-more (runs:can-run-more-tests test-record max-concurrent-jobs))) + (if (and (list? can-run-more) + (car can-run-more)) + (let* ((prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode)) + (fails (runs:calc-fails prereqs-not-met)) + (non-completed (runs:calc-not-completed prereqs-not-met))) + (debug:print-info 8 "can-run-more: " can-run-more + "\n testname: " hed + "\n prereqs-not-met: " (runs:pretty-string prereqs-not-met) + "\n non-completed: " (runs:pretty-string non-completed) + "\n fails: " (runs:pretty-string fails) + "\n testmode: " testmode + "\n num-retries: " num-retries + "\n (eq? testmode 'toplevel): " (eq? testmode 'toplevel) + "\n (null? non-completed): " (null? non-completed) + "\n reruns: " reruns + "\n items: " items + "\n can-run-more: " can-run-more) + ;; (thread-sleep! (+ 0.01 *global-delta*)) + (cond ;; INNER COND #2 + ((or (null? prereqs-not-met) ;; all prereqs met, fire off the test + ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch + (and (eq? testmode 'toplevel) + (null? non-completed))) + (let ((test-name (tests:testqueue-get-testname test-record))) + (setenv "MT_TEST_NAME" test-name) ;; + (setenv "MT_RUNNAME" runname) + (set-megatest-env-vars run-id) ;; these may be needed by the launching process + (let ((items-list (items:get-items-from-config tconfig))) + (if (list? items-list) + (begin + (tests:testqueue-set-items! test-record items-list) + ;; (thread-sleep! 
*global-delta*) + (loop hed tal reruns)) + (begin + (debug:print 0 "ERROR: The proc from reading the setup did not yield a list - please report this") + (exit 1)))))) + ((null? fails) + (debug:print-info 4 "fails is null, moving on in the queue but keeping " hed " for now") + ;; only increment num-retries when there are no tests runing + (if (eq? 0 (list-ref can-run-more 1)) + (begin + ;; TRY (if (> num-retries 100) ;; first 100 retries are low time cost + ;; TRY (thread-sleep! (+ 2 *global-delta*)) + ;; TRY (thread-sleep! (+ 0.01 *global-delta*))) + (set! num-retries (+ num-retries 1)))) + (if (> num-retries max-retries) + (if (not (null? tal)) + (loop (car tal)(cdr tal) reruns)) + (loop (car newtal)(cdr newtal) reruns))) ;; an issue with prereqs not yet met? + ((and (not (null? fails))(eq? testmode 'normal)) + (debug:print-info 1 "test " hed " (mode=" testmode ") has failed prerequisite(s); " + (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ") + ", removing it from to-do list") + (if (not (null? tal)) + (begin + ;; (thread-sleep! *global-delta*) + (loop (car tal)(cdr tal)(cons hed reruns))))) + (else + (debug:print 8 "ERROR: No handler for this condition.") + ;; TRY (thread-sleep! (+ 1 *global-delta*)) + (loop (car newtal)(cdr newtal) reruns)))) ;; END OF IF CAN RUN MORE + + ;; if can't run more just loop with next possible test + (begin + (debug:print-info 4 "processing the case with a lambda for items or 'have-procedure. Moving through the queue without dropping " hed) + ;; (thread-sleep! (+ 2 *global-delta*)) + (loop (car newtal)(cdr newtal) reruns))))) ;; END OF (or (procedure? items)(eq? items 'have-procedure)) + + ;; this case should not happen, added to help catch any bugs + ((and (list? items) itemdat) + (debug:print 0 "ERROR: Should not have a list of items in a test and the itemspath set - please report this") + (exit 1)) + ((not (null? 
reruns)) + (let* ((newlst (tests:filter-non-runnable run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED, + (junked (lset-difference equal? tal newlst))) + (debug:print-info 4 "full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal) + (if (< num-retries max-retries) + (set! newlst (append reruns newlst))) + (set! num-retries (+ num-retries 1)) + ;; (thread-sleep! (+ 1 *global-delta*)) + (if (not (null? newlst)) + ;; since reruns have been tacked on to newlst create new reruns from junked + (loop (car newlst)(cdr newlst)(delete-duplicates junked))))) + ((not (null? tal)) + (debug:print-info 4 "I'm pretty sure I shouldn't get here.")) + (else + (debug:print-info 4 "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) + )))) ;; LET* ((test-record + + ;; we get here on "drop through" - loop for next test in queue + ;; FIXME!!!! THIS SHOULD NOT REQUIRE AN EXIT!!!!!!! + + (debug:print-info 1 "All tests launched") + (thread-sleep! 0.5) + ;; FIXME! This harsh exit should not be necessary.... + ;; (if (not *runremote*)(exit)) ;; + #f)) ;; return a #f as a hint that we are done + ;; Here we need to check that all the tests remaining to be run are eligible to run + ;; and are not blocked by failed + + ADDED run-tests-queue-new.scm Index: run-tests-queue-new.scm ================================================================== --- /dev/null +++ run-tests-queue-new.scm @@ -0,0 +1,337 @@ +;; (use trace) +;; (trace +;; runs:queue-next-hed +;; runs:queue-next-tal +;; runs:queue-next-reg +;; ) + +;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... > +(define (runs:run-tests-queue-new run-id runname test-records keyvallst flags test-patts reglen) + ;; At this point the list of parent tests is expanded + ;; NB// Should expand items here and then insert into the run queue. 
+ (debug:print 5 "test-records: " test-records ", keyvallst: " keyvallst " flags: " (hash-table->alist flags)) + (let ((run-info (cdb:remote-run db:get-run-info #f run-id)) + (key-vals (cdb:remote-run db:get-key-vals #f run-id)) + (sorted-test-names (tests:sort-by-priority-and-waiton test-records)) + (test-registry (make-hash-table)) + (registry-mutex (make-mutex)) + (num-retries 0) + (max-retries (config-lookup *configdat* "setup" "maxretries")) + (max-concurrent-jobs (let ((mcj (config-lookup *configdat* "setup" "max_concurrent_jobs"))) + (if (and mcj (string->number mcj)) + (string->number mcj) + 1)))) ;; length of the register queue ahead + (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100)) + (if (not (null? sorted-test-names)) + (let loop ((hed (car sorted-test-names)) + (tal (cdr sorted-test-names)) + (reg '()) ;; registered, put these at the head of tal + (reruns '())) + (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns)) + ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns) + (let* ((test-record (hash-table-ref test-records hed)) + (test-name (tests:testqueue-get-testname test-record)) + (tconfig (tests:testqueue-get-testconfig test-record)) + (testmode (let ((m (config-lookup tconfig "requirements" "mode"))) + (if m (string->symbol m) 'normal))) + (waitons (tests:testqueue-get-waitons test-record)) + (priority (tests:testqueue-get-priority test-record)) + (itemdat (tests:testqueue-get-itemdat test-record)) ;; itemdat can be a string, list or #f + (items (tests:testqueue-get-items test-record)) + (item-path (item-list->path itemdat)) + (newtal (append tal (list hed))) + (regfull (> (length reg) reglen))) + ;; (if (> (length reg) 10) + ;; (begin + ;; (set! tal (cons hed tal)) + ;; (set! hed (car reg)) + ;; (set! reg (cdr reg)) + ;; (set! 
newtal tal))) + (debug:print 6 + "test-name: " test-name + "\n hed: " hed + "\n itemdat: " itemdat + "\n items: " items + "\n item-path: " item-path + "\n waitons: " waitons + "\n num-retries: " num-retries + "\n tal: " tal + "\n reruns: " reruns) + + ;; a test that names itself in waitons is circular: log an error and strip the + ;; self-reference from waitons so the prereq check below cannot wait on itself + (if (member test-name waitons) + (begin + (debug:print 0 "ERROR: test " test-name " has listed itself as a waiton, please correct this!") + (set! waitons (filter (lambda (x)(not (equal? x test-name))) waitons)))) + + (cond ;; OUTER COND + ((not items) ;; when false the test is ok to be handed off to launch (but not before) + (let* ((run-limits-info (runs:can-run-more-tests test-record max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running + (have-resources (car run-limits-info)) + (num-running (list-ref run-limits-info 1)) + (num-running-in-jobgroup (list-ref run-limits-info 2)) + (max-concurrent-jobs (list-ref run-limits-info 3)) + (job-group-limit (list-ref run-limits-info 4)) + (prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode)) + (fails (runs:calc-fails prereqs-not-met)) + (non-completed (runs:calc-not-completed prereqs-not-met))) + (debug:print-info 8 "have-resources: " have-resources " prereqs-not-met: " + (string-intersperse + (map (lambda (t) + (if (vector? t) + (conc (db:test-get-state t) "/" (db:test-get-status t)) + (conc " WARNING: t is not a vector=" t ))) + prereqs-not-met) ", ") " fails: " fails) + (debug:print-info 4 "hed=" hed "\n test-record=" test-record "\n test-name: " test-name "\n item-path: " item-path "\n test-patts: " test-patts) + + ;; Don't know at this time if the test have been launched at some time in the past + ;; i.e. is this a re-launch? 
+ (debug:print-info 4 "run-limits-info = " run-limits-info) + (cond ;; INNER COND #1 for a launchable test + ;; Check item path against item-patts + ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path)) ;; This test/itempath is not to be run + ;; else the run is stuck, temporarily or permanently + ;; but should check if it is due to lack of resources vs. prerequisites + (debug:print-info 1 "Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " test-patts) + ;; (thread-sleep! *global-delta*) + (if (not (null? tal)) + (loop (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns))) + ;; Registry has been started for this test but has not yet completed + ;; this should be rare, the case where there are only a couple of tests and the db is slow + ;; delay a short while and continue + ;; ((eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f) 'start) + ;; (thread-sleep! 0.01) + ;; (loop (car newtal)(cdr newtal) reruns)) + ;; count number of 'done, if more than 100 then skip on through. + ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) ;; ) ;; too many changes required. Implement later. + (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) + (let ((th (make-thread (lambda () + (mutex-lock! registry-mutex) + (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'start) + (mutex-unlock! registry-mutex) + ;; If haven't done it before register a top level test if this is an itemized test + (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done)) + (cdb:tests-register-test *runremote* run-id test-name "")) + (cdb:tests-register-test *runremote* run-id test-name item-path) + (mutex-lock! registry-mutex) + (hash-table-set! 
test-registry (runs:make-full-test-name test-name item-path) 'done) + (mutex-unlock! registry-mutex)) + (conc test-name "/" item-path)))) + (thread-start! th)) + (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + (if (and (null? tal)(null? reg)) + (loop hed tal reg reruns) + (loop (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (let ((newl (append reg (list hed)))) + (if regfull + (cdr newl) + newl)) + reruns))) + ;; At this point hed test registration must be completed. + ((eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f) + 'start) + (debug:print-info 0 "Waiting on test registration(s): " (string-intersperse + (filter (lambda (x) + (eq? (hash-table-ref/default test-registry x #f) 'start)) + (hash-table-keys test-registry)) + ", ")) + (thread-sleep! 0.1) + (loop hed tal reg reruns)) + ((not have-resources) ;; simply try again after waiting a second + (debug:print-info 1 "no resources to run new tests, waiting ...") + ;; Have gone back and forth on this but db starvation is an issue. + ;; wait one second before looking again to run jobs. + (thread-sleep! 1) ;; (+ 2 *global-delta*)) + ;; could have done hed tal here but doing car/cdr of newtal to rotate tests + (loop (car newtal)(cdr newtal) reg reruns)) + ((and have-resources + (or (null? prereqs-not-met) + (and (eq? testmode 'toplevel) + (null? non-completed)))) + (run:test run-id run-info key-vals runname keyvallst test-record flags #f) + (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'running) + (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + ;; (thread-sleep! *global-delta*) + (if (not (null? 
tal)) + (loop (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns))) + (else ;; must be we have unmet prerequisites + (debug:print 4 "FAILS: " fails) + ;; If one or more of the prereqs-not-met are FAIL then we can issue + ;; a message and drop hed from the items to be processed. + (if (null? fails) + (begin + ;; couldn't run, take a breather + (debug:print-info 4 "Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...") + ;; (thread-sleep! (+ 0.01 *global-delta*)) ;; long sleep here - no resources, may as well be patient + ;; we made new tal by sticking hed at the back of the list + (loop (car newtal)(cdr newtal) reg reruns)) + ;; the waiton is FAIL so no point in trying to run hed ever again + (if (not (null? tal)) + (if (vector? hed) + (begin + (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed) + " from the launch list as it has prerequistes that are FAIL") + (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + ;; (thread-sleep! *global-delta*) + (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'removed) + (loop (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + (cons hed reruns))) + (begin + (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") + (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + ;; (thread-sleep! (+ 0.01 *global-delta*)) + (loop hed tal reg reruns))))))))) ;; END OF INNER COND + + ;; case where an items came in as a list been processed + ((and (list? 
items) ;; thus we know our items are already calculated + (not itemdat)) ;; and not yet expanded into the list of things to be done + (if (and (debug:debug-mode 1) ;; (>= *verbosity* 1) + (> (length items) 0) + (> (length (car items)) 0)) + (pp items)) + (for-each + (lambda (my-itemdat) + (let* ((new-test-record (let ((newrec (make-tests:testqueue))) + (vector-copy! test-record newrec) + newrec)) + (my-item-path (item-list->path my-itemdat))) + (if (tests:match test-patts hed my-item-path) ;; (patt-list-match my-item-path item-patts) ;; yes, we want to process this item, NOTE: Should not need this check here! + (let ((newtestname (runs:make-full-test-name hed my-item-path))) ;; test names are unique on testname/item-path + (tests:testqueue-set-items! new-test-record #f) + (tests:testqueue-set-itemdat! new-test-record my-itemdat) + (tests:testqueue-set-item_path! new-test-record my-item-path) + (hash-table-set! test-records newtestname new-test-record) + (set! tal (cons newtestname tal)))))) ;; since these are itemized create new test names testname/itempath + items) + (if (not (null? tal)) + (begin + (debug:print-info 4 "End of items list, looping with next after short delay") + ;; (thread-sleep! (+ 0.01 *global-delta*)) + (loop (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns)))) + + ;; if items is a proc then need to run items:get-items-from-config, get the list and loop + ;; - but only do that if resources exist to kick off the job + ((or (procedure? items)(eq? items 'have-procedure)) + (let ((can-run-more (runs:can-run-more-tests test-record max-concurrent-jobs))) + (if (and (list? 
can-run-more) + (car can-run-more)) + (let* ((prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode)) + (fails (runs:calc-fails prereqs-not-met)) + (non-completed (runs:calc-not-completed prereqs-not-met))) + (debug:print-info 8 "can-run-more: " can-run-more + "\n testname: " hed + "\n prereqs-not-met: " (runs:pretty-string prereqs-not-met) + "\n non-completed: " (runs:pretty-string non-completed) + "\n fails: " (runs:pretty-string fails) + "\n testmode: " testmode + "\n num-retries: " num-retries + "\n (eq? testmode 'toplevel): " (eq? testmode 'toplevel) + "\n (null? non-completed): " (null? non-completed) + "\n reruns: " reruns + "\n items: " items + "\n can-run-more: " can-run-more) + ;; (thread-sleep! (+ 0.01 *global-delta*)) + (cond ;; INNER COND #2 + ((or (null? prereqs-not-met) ;; all prereqs met, fire off the test + ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch + (and (eq? testmode 'toplevel) + (null? non-completed))) + (let ((test-name (tests:testqueue-get-testname test-record))) + (setenv "MT_TEST_NAME" test-name) ;; + (setenv "MT_RUNNAME" runname) + (set-megatest-env-vars run-id) ;; these may be needed by the launching process + (let ((items-list (items:get-items-from-config tconfig))) + (if (list? items-list) + (begin + (tests:testqueue-set-items! test-record items-list) + ;; (thread-sleep! *global-delta*) + (loop hed tal reg reruns)) + (begin + (debug:print 0 "ERROR: The proc from reading the setup did not yield a list - please report this") + (exit 1)))))) + ((null? fails) + (debug:print-info 4 "fails is null, moving on in the queue but keeping " hed " for now") + ;; only increment num-retries when there are no tests runing + (if (eq? 0 (list-ref can-run-more 1)) + (begin + ;; TRY (if (> num-retries 100) ;; first 100 retries are low time cost + ;; TRY (thread-sleep! (+ 2 *global-delta*)) + ;; TRY (thread-sleep! (+ 0.01 *global-delta*))) + (set! 
num-retries (+ num-retries 1)))) + (if (> num-retries max-retries) + (if (not (null? tal)) + (loop (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns)) + (loop (car newtal)(cdr newtal) reg reruns))) ;; an issue with prereqs not yet met? + ((and (not (null? fails))(eq? testmode 'normal)) + (debug:print-info 1 "test " hed " (mode=" testmode ") has failed prerequisite(s); " + (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ") + ", removing it from to-do list") + (if (not (null? tal)) + (begin + ;; (thread-sleep! *global-delta*) + (loop (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + (cons hed reruns))))) + (else + (debug:print 8 "ERROR: No handler for this condition.") + ;; TRY (thread-sleep! (+ 1 *global-delta*)) + (loop (car newtal)(cdr newtal) reg reruns)))) ;; END OF IF CAN RUN MORE + + ;; if can't run more just loop with next possible test + (begin + (debug:print-info 4 "processing the case with a lambda for items or 'have-procedure. Moving through the queue without dropping " hed) + ;; (thread-sleep! (+ 2 *global-delta*)) + (loop (car newtal)(cdr newtal) reg reruns))))) ;; END OF (or (procedure? items)(eq? items 'have-procedure)) + + ;; this case should not happen, added to help catch any bugs + ((and (list? items) itemdat) + (debug:print 0 "ERROR: Should not have a list of items in a test and the itemspath set - please report this") + (exit 1)) + ((not (null? reruns)) + (let* ((newlst (tests:filter-non-runnable run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED, + (junked (lset-difference equal? 
tal newlst))) + (debug:print-info 4 "full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal) + (if (< num-retries max-retries) + (set! newlst (append reruns newlst))) + (set! num-retries (+ num-retries 1)) + ;; (thread-sleep! (+ 1 *global-delta*)) + (if (not (null? newlst)) + ;; since reruns have been tacked on to newlst create new reruns from junked + (loop (car newlst)(cdr newlst) reg (delete-duplicates junked))))) + ((not (null? tal)) + (debug:print-info 4 "I'm pretty sure I shouldn't get here.")) + ((not (null? reg)) ;; could we get here with leftovers? + (debug:print-info 0 "Have leftovers!") + (loop (car reg)(cdr reg) '() reruns)) + (else + (debug:print-info 4 "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) + )))) ;; LET* ((test-record + + ;; we get here on "drop through" - loop for next test in queue + ;; FIXME!!!! THIS SHOULD NOT REQUIRE AN EXIT!!!!!!! + + (debug:print-info 1 "All tests launched") + (thread-sleep! 0.5) + ;; FIXME! This harsh exit should not be necessary.... + ;; (if (not *runremote*)(exit)) ;; + #f)) ;; return a #f as a hint that we are done + ;; Here we need to check that all the tests remaining to be run are eligible to run + ;; and are not blocked by failed + Index: runconfig.scm ================================================================== --- runconfig.scm +++ runconfig.scm @@ -59,16 +59,16 @@ sections) (debug:print 2 "---") (set! *already-seen-runconfig-info* #t))) finaldat)) -(define (set-run-config-vars db run-id keys keyvals) +(define (set-run-config-vars run-id keys keyvals targ-from-db) (push-directory *toppath*) (let ((runconfigf (conc *toppath* "/runconfigs.config")) (targ (or (args:get-arg "-target") (args:get-arg "-reqtarg") - (db:get-target db run-id)))) + targ-from-db))) (pop-directory) (if (file-exists? 
runconfigf) (setup-env-defaults runconfigf run-id #t keys keyvals environ-patt: (conc "(default" (if targ ADDED runs-launch-loop-test.scm Index: runs-launch-loop-test.scm ================================================================== --- /dev/null +++ runs-launch-loop-test.scm @@ -0,0 +1,59 @@ +(use srfi-69) + +(define (runs:queue-next-hed tal reg n regful) + (if regful + (car reg) + (car tal))) + +(define (runs:queue-next-tal tal reg n regful) + (if regful + tal + (let ((newtal (cdr tal))) + (if (null? newtal) + reg + newtal + )))) + +(define (runs:queue-next-reg tal reg n regful) + (if regful + (cdr reg) + (if (eq? (length tal) 1) + '() + reg))) + +(use trace) +(trace runs:queue-next-hed + runs:queue-next-tal + runs:queue-next-reg) + + +(define tests '(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20)) + +(define test-registry (make-hash-table)) + +(define n 3) + +(let loop ((hed (car tests)) + (tal (cdr tests)) + (reg '())) + (let* ((reglen (length reg)) + (regful (> reglen n))) + (print "hed=" hed ", length reg=" (length reg) ", (> lenreg n)=" (> (length reg) n)) + (let ((newtal (append tal (list hed)))) ;; used if we are not done with this test + (cond + ((not (hash-table-ref/default test-registry hed #f)) + (hash-table-set! test-registry hed #t) + (print "Registering #" hed) + (if (not (null? tal)) + (loop (runs:queue-next-hed tal reg n regful) + (runs:queue-next-tal tal reg n regful) + (let ((newl (append reg (list hed)))) + (if regful + (cdr newl) + newl))))) + (else + (print "Running #" hed) + (if (not (null? tal)) + (loop (runs:queue-next-hed tal reg n regful) + (runs:queue-next-tal tal reg n regful) + (runs:queue-next-reg tal reg n regful)))))))) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -293,11 +293,14 @@ (if (not (null? required-tests)) (debug:print-info 1 "Adding " required-tests " to the run queue")) ;; NOTE: these are all parent tests, items are not expanded yet. 
(debug:print-info 4 "test-records=" (hash-table->alist test-records)) - (runs:run-tests-queue run-id runname test-records keyvallst flags test-patts) + (let ((reglen (any->number (configf:lookup *configdat* "setup" "runqueue")))) + (if reglen + (runs:run-tests-queue-new run-id runname test-records keyvallst flags test-patts reglen) + (runs:run-tests-queue-classic run-id runname test-records keyvallst flags test-patts))) (debug:print-info 4 "All done by here"))) (define (runs:calc-fails prereqs-not-met) (filter (lambda (test) (and (vector? test) ;; not (string? test)) @@ -321,307 +324,38 @@ lst)) (define (runs:make-full-test-name testname itempath) (if (equal? itempath "") testname (conc testname "/" itempath))) -;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... > -(define (runs:run-tests-queue run-id runname test-records keyvallst flags test-patts) - ;; At this point the list of parent tests is expanded - ;; NB// Should expand items here and then insert into the run queue. - (debug:print 5 "test-records: " test-records ", keyvallst: " keyvallst " flags: " (hash-table->alist flags)) - (let ((sorted-test-names (tests:sort-by-priority-and-waiton test-records)) - (test-registery (make-hash-table)) - (registery-mutex (make-mutex)) - (num-retries 0) - (max-retries (config-lookup *configdat* "setup" "maxretries")) - (max-concurrent-jobs (let ((mcj (config-lookup *configdat* "setup" "max_concurrent_jobs"))) - (if (and mcj (string->number mcj)) - (string->number mcj) - 1)))) - (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100)) - (if (not (null? sorted-test-names)) - (let loop ((hed (car sorted-test-names)) - (tal (cdr sorted-test-names)) - (reruns '())) - (if (not (null? 
reruns))(debug:print-info 4 "reruns=" reruns)) - ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns) - (let* ((test-record (hash-table-ref test-records hed)) - (test-name (tests:testqueue-get-testname test-record)) - (tconfig (tests:testqueue-get-testconfig test-record)) - (testmode (let ((m (config-lookup tconfig "requirements" "mode"))) - (if m (string->symbol m) 'normal))) - (waitons (tests:testqueue-get-waitons test-record)) - (priority (tests:testqueue-get-priority test-record)) - (itemdat (tests:testqueue-get-itemdat test-record)) ;; itemdat can be a string, list or #f - (items (tests:testqueue-get-items test-record)) - (item-path (item-list->path itemdat)) - (newtal (append tal (list hed)))) - - (debug:print 6 - "test-name: " test-name - "\n hed: " hed - "\n itemdat: " itemdat - "\n items: " items - "\n item-path: " item-path - "\n waitons: " waitons - "\n num-retries: " num-retries - "\n tal: " tal - "\n reruns: " reruns) - - ;; check for hed in waitons => this would be circular, remove it and issue an - ;; error - (if (member test-name waitons) - (begin - (debug:print 0 "ERROR: test " test-name " has listed itself as a waiton, please correct this!") - (set! waiton (filter (lambda (x)(not (equal? 
x hed))) waitons)))) - - (cond ;; OUTER COND - ((not items) ;; when false the test is ok to be handed off to launch (but not before) - (let* ((run-limits-info (runs:can-run-more-tests test-record max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running - (have-resources (car run-limits-info)) - (num-running (list-ref run-limits-info 1)) - (num-running-in-jobgroup (list-ref run-limits-info 2)) - (max-concurrent-jobs (list-ref run-limits-info 3)) - (job-group-limit (list-ref run-limits-info 4)) - (prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode)) - (fails (runs:calc-fails prereqs-not-met)) - (non-completed (runs:calc-not-completed prereqs-not-met))) - (debug:print-info 8 "have-resources: " have-resources " prereqs-not-met: " - (string-intersperse - (map (lambda (t) - (if (vector? t) - (conc (db:test-get-state t) "/" (db:test-get-status t)) - (conc " WARNING: t is not a vector=" t ))) - prereqs-not-met) ", ") " fails: " fails) - (debug:print-info 4 "hed=" hed "\n test-record=" test-record "\n test-name: " test-name "\n item-path: " item-path "\n test-patts: " test-patts) - - ;; Don't know at this time if the test have been launched at some time in the past - ;; i.e. is this a re-launch? - (debug:print-info 4 "run-limits-info = " run-limits-info) - (cond ;; INNER COND #1 for a launchable test - ;; Check item path against item-patts - ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path)) ;; This test/itempath is not to be run - ;; else the run is stuck, temporarily or permanently - ;; but should check if it is due to lack of resources vs. prerequisites - (debug:print-info 1 "Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " test-patts) - ;; (thread-sleep! *global-delta*) - (if (not (null? 
tal)) - (loop (car tal)(cdr tal) reruns))) - ;; Registery has been started for this test but has not yet completed - ;; this should be rare, the case where there are only a couple of tests and the db is slow - ;; delay a short while and continue - ;; ((eq? (hash-table-ref/default test-registery (runs:make-full-test-name test-name item-path) #f) 'start) - ;; (thread-sleep! 0.01) - ;; (loop (car newtal)(cdr newtal) reruns)) - ;; count number of 'done, if more than 100 then skip on through. - (;; (and (< (length (filter (lambda (x)(eq? x 'done))(hash-table-values test-registery))) 100) ;; why get more than 200 ahead? - (not (hash-table-ref/default test-registery (runs:make-full-test-name test-name item-path) #f)) ;; ) ;; too many changes required. Implement later. - (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) - ;; NEED TO THREADIFY THIS - (let ((th (make-thread (lambda () - (mutex-lock! registery-mutex) - (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) 'start) - (mutex-unlock! registery-mutex) - ;; If haven't done it before register a top level test if this is an itemized test - (if (not (eq? (hash-table-ref/default test-registery (runs:make-full-test-name test-name "") #f) 'done)) - (cdb:tests-register-test *runremote* run-id test-name "")) - (cdb:tests-register-test *runremote* run-id test-name item-path) - (mutex-lock! registery-mutex) - (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) 'done) - (mutex-unlock! registery-mutex)) - (conc test-name "/" item-path)))) - (thread-start! th)) - ;; TRY (thread-sleep! *global-delta*) - (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) - (loop (car newtal)(cdr newtal) reruns)) - ;; At this point *all* test registrations must be completed. - ((not (null? (filter (lambda (x)(eq? 
'start x))(hash-table-values test-registery)))) - (debug:print-info 0 "Waiting on test registrations: " (string-intersperse - (filter (lambda (x) - (eq? (hash-table-ref/default test-registery x #f) 'start)) - (hash-table-keys test-registery)) - ", ")) - (thread-sleep! 0.1) - (loop hed tal reruns)) - ((not have-resources) ;; simply try again after waiting a second - (debug:print-info 1 "no resources to run new tests, waiting ...") - ;; Have gone back and forth on this but db starvation is an issue. - ;; wait one second before looking again to run jobs. - (thread-sleep! 1) ;; (+ 2 *global-delta*)) - ;; could have done hed tal here but doing car/cdr of newtal to rotate tests - (loop (car newtal)(cdr newtal) reruns)) - ((and have-resources - (or (null? prereqs-not-met) - (and (eq? testmode 'toplevel) - (null? non-completed)))) - (run:test run-id runname keyvallst test-record flags #f) - (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) 'running) - (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) - ;; (thread-sleep! *global-delta*) - (if (not (null? tal)) - (loop (car tal)(cdr tal) reruns))) - (else ;; must be we have unmet prerequisites - (debug:print 4 "FAILS: " fails) - ;; If one or more of the prereqs-not-met are FAIL then we can issue - ;; a message and drop hed from the items to be processed. - (if (null? fails) - (begin - ;; couldn't run, take a breather - (debug:print-info 4 "Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...") - ;; (thread-sleep! (+ 0.01 *global-delta*)) ;; long sleep here - no resources, may as well be patient - ;; we made new tal by sticking hed at the back of the list - (loop (car newtal)(cdr newtal) reruns)) - ;; the waiton is FAIL so no point in trying to run hed ever again - (if (not (null? tal)) - (if (vector? 
hed) - (begin - (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed) - " from the launch list as it has prerequistes that are FAIL") - (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) - ;; (thread-sleep! *global-delta*) - (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) 'removed) - (loop (car tal)(cdr tal) (cons hed reruns))) - (begin - (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") - (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) - ;; (thread-sleep! (+ 0.01 *global-delta*)) - (loop hed tal reruns))))))))) ;; END OF INNER COND - - ;; case where an items came in as a list been processed - ((and (list? items) ;; thus we know our items are already calculated - (not itemdat)) ;; and not yet expanded into the list of things to be done - (if (and (debug:debug-mode 1) ;; (>= *verbosity* 1) - (> (length items) 0) - (> (length (car items)) 0)) - (pp items)) - (for-each - (lambda (my-itemdat) - (let* ((new-test-record (let ((newrec (make-tests:testqueue))) - (vector-copy! test-record newrec) - newrec)) - (my-item-path (item-list->path my-itemdat))) - (if (tests:match test-patts hed my-item-path) ;; (patt-list-match my-item-path item-patts) ;; yes, we want to process this item, NOTE: Should not need this check here! - (let ((newtestname (runs:make-full-test-name hed my-item-path))) ;; test names are unique on testname/item-path - (tests:testqueue-set-items! new-test-record #f) - (tests:testqueue-set-itemdat! new-test-record my-itemdat) - (tests:testqueue-set-item_path! new-test-record my-item-path) - (hash-table-set! test-records newtestname new-test-record) - (set! tal (cons newtestname tal)))))) ;; since these are itemized create new test names testname/itempath - items) - (if (not (null? 
tal)) - (begin - (debug:print-info 4 "End of items list, looping with next after short delay") - ;; (thread-sleep! (+ 0.01 *global-delta*)) - (loop (car tal)(cdr tal) reruns)))) - - ;; if items is a proc then need to run items:get-items-from-config, get the list and loop - ;; - but only do that if resources exist to kick off the job - ((or (procedure? items)(eq? items 'have-procedure)) - (let ((can-run-more (runs:can-run-more-tests test-record max-concurrent-jobs))) - (if (and (list? can-run-more) - (car can-run-more)) - (let* ((prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode)) - (fails (runs:calc-fails prereqs-not-met)) - (non-completed (runs:calc-not-completed prereqs-not-met))) - (debug:print-info 8 "can-run-more: " can-run-more - "\n testname: " hed - "\n prereqs-not-met: " (runs:pretty-string prereqs-not-met) - "\n non-completed: " (runs:pretty-string non-completed) - "\n fails: " (runs:pretty-string fails) - "\n testmode: " testmode - "\n num-retries: " num-retries - "\n (eq? testmode 'toplevel): " (eq? testmode 'toplevel) - "\n (null? non-completed): " (null? non-completed) - "\n reruns: " reruns - "\n items: " items - "\n can-run-more: " can-run-more) - ;; (thread-sleep! (+ 0.01 *global-delta*)) - (cond ;; INNER COND #2 - ((or (null? prereqs-not-met) ;; all prereqs met, fire off the test - ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch - (and (eq? testmode 'toplevel) - (null? non-completed))) - (let ((test-name (tests:testqueue-get-testname test-record))) - (setenv "MT_TEST_NAME" test-name) ;; - (setenv "MT_RUNNAME" runname) - (set-megatest-env-vars run-id) ;; these may be needed by the launching process - (let ((items-list (items:get-items-from-config tconfig))) - (if (list? items-list) - (begin - (tests:testqueue-set-items! test-record items-list) - ;; (thread-sleep! 
*global-delta*) - (loop hed tal reruns)) - (begin - (debug:print 0 "ERROR: The proc from reading the setup did not yield a list - please report this") - (exit 1)))))) - ((null? fails) - (debug:print-info 4 "fails is null, moving on in the queue but keeping " hed " for now") - ;; only increment num-retries when there are no tests runing - (if (eq? 0 (list-ref can-run-more 1)) - (begin - ;; TRY (if (> num-retries 100) ;; first 100 retries are low time cost - ;; TRY (thread-sleep! (+ 2 *global-delta*)) - ;; TRY (thread-sleep! (+ 0.01 *global-delta*))) - (set! num-retries (+ num-retries 1)))) - (if (> num-retries max-retries) - (if (not (null? tal)) - (loop (car tal)(cdr tal) reruns)) - (loop (car newtal)(cdr newtal) reruns))) ;; an issue with prereqs not yet met? - ((and (not (null? fails))(eq? testmode 'normal)) - (debug:print-info 1 "test " hed " (mode=" testmode ") has failed prerequisite(s); " - (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ") - ", removing it from to-do list") - (if (not (null? tal)) - (begin - ;; (thread-sleep! *global-delta*) - (loop (car tal)(cdr tal)(cons hed reruns))))) - (else - (debug:print 8 "ERROR: No handler for this condition.") - ;; TRY (thread-sleep! (+ 1 *global-delta*)) - (loop (car newtal)(cdr newtal) reruns)))) ;; END OF IF CAN RUN MORE - - ;; if can't run more just loop with next possible test - (begin - (debug:print-info 4 "processing the case with a lambda for items or 'have-procedure. Moving through the queue without dropping " hed) - ;; (thread-sleep! (+ 2 *global-delta*)) - (loop (car newtal)(cdr newtal) reruns))))) ;; END OF (or (procedure? items)(eq? items 'have-procedure)) - - ;; this case should not happen, added to help catch any bugs - ((and (list? items) itemdat) - (debug:print 0 "ERROR: Should not have a list of items in a test and the itemspath set - please report this") - (exit 1)) - ((not (null? 
reruns)) - (let* ((newlst (tests:filter-non-runnable run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED, - (junked (lset-difference equal? tal newlst))) - (debug:print-info 4 "full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal) - (if (< num-retries max-retries) - (set! newlst (append reruns newlst))) - (set! num-retries (+ num-retries 1)) - ;; (thread-sleep! (+ 1 *global-delta*)) - (if (not (null? newlst)) - ;; since reruns have been tacked on to newlst create new reruns from junked - (loop (car newlst)(cdr newlst)(delete-duplicates junked))))) - ((not (null? tal)) - (debug:print-info 4 "I'm pretty sure I shouldn't get here.")) - (else - (debug:print-info 4 "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) - )))) ;; LET* ((test-record - - ;; we get here on "drop through" - loop for next test in queue - ;; FIXME!!!! THIS SHOULD NOT REQUIRE AN EXIT!!!!!!! - - (debug:print-info 1 "All tests launched") - (thread-sleep! 0.5) - ;; FIXME! This harsh exit should not be necessary.... - ;; (if (not *runremote*)(exit)) ;; - #f)) ;; return a #f as a hint that we are done - ;; Here we need to check that all the tests remaining to be run are eligible to run - ;; and are not blocked by failed - +(define (runs:queue-next-hed tal reg n regful) + (if regful + (if (null? reg) ;; doesn't make sense, this is probably NOT the problem of the car + (car tal) + (car reg)) + (car tal))) + +(define (runs:queue-next-tal tal reg n regful) + (if regful + tal + (let ((newtal (cdr tal))) + (if (null? newtal) + reg + newtal + )))) + +(define (runs:queue-next-reg tal reg n regful) + (if regful + (cdr reg) + (if (eq? 
(length tal) 1) + '() + reg))) + +(include "run-tests-queue-classic.scm") +(include "run-tests-queue-new.scm") ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step -(define (run:test run-id runname keyvallst test-record flags parent-test) +(define (run:test run-id run-info key-vals runname keyvallst test-record flags parent-test) ;; All these vars might be referenced by the testconfig file reader (let* ((test-name (tests:testqueue-get-testname test-record)) (test-waitons (tests:testqueue-get-waitons test-record)) (test-conf (tests:testqueue-get-testconfig test-record)) (itemdat (tests:testqueue-get-itemdat test-record)) @@ -647,11 +381,11 @@ ;; Here is where the test_meta table is best updated ;; Yes, another use of a global for caching. Need a better way? (if (not (hash-table-ref/default *test-meta-updated* test-name #f)) (begin (hash-table-set! *test-meta-updated* test-name #t) - (open-run-close runs:update-test_meta db test-name test-conf))) + (runs:update-test_meta test-name test-conf))) ;; (lambda (itemdat) ;;; ((ripeness "overripe") (temperature "cool") (season "summer")) (let* ((new-test-path (string-intersperse (cons test-path (map cadr itemdat)) "/")) (new-test-name (if (equal? item-path "") test-name (conc test-name "/" item-path))) ;; just need it to be unique (test-id (cdb:remote-run db:get-test-id #f run-id test-name item-path)) @@ -721,11 +455,11 @@ "\" and status \"" (test:get-status testdat) "\", use -rerun \"" (test:get-status testdat) "\" or -force to override")) ;; NOTE: No longer be checking prerequisites here! Will never get here unless prereqs are ;; already met. ;; This would be a great place to do the process-fork - (if (not (launch-test #f run-id runname test-conf keyvallst test-name test-path itemdat flags)) + (if (not (launch-test test-id run-id run-info key-vals runname test-conf keyvallst test-name test-path itemdat flags)) (begin (print "ERROR: Failed to launch the test. 
Exiting as soon as possible") (set! *globalexitstatus* 1) ;; (process-signal (current-process-id) signal/kill)))))) ((KILLED) @@ -959,26 +693,26 @@ ;;====================================================================== ;; Rollup runs ;;====================================================================== ;; Update the test_meta table for this test -(define (runs:update-test_meta db test-name test-conf) - (let ((currrecord (cdb:remote-run db:testmeta-get-record db test-name))) +(define (runs:update-test_meta test-name test-conf) + (let ((currrecord (cdb:remote-run db:testmeta-get-record #f test-name))) (if (not currrecord) (begin (set! currrecord (make-vector 10 #f)) - (cdb:remote-run db:testmeta-add-record db test-name))) + (cdb:remote-run db:testmeta-add-record #f test-name))) (for-each (lambda (key) (let* ((idx (cadr key)) (fld (car key)) (val (config-lookup test-conf "test_meta" fld))) ;; (debug:print 5 "idx: " idx " fld: " fld " val: " val) (if (and val (not (equal? (vector-ref currrecord idx) val))) (begin (print "Updating " test-name " " fld " to " val) - (cdb:remote-run db:testmeta-update-field db test-name fld val))))) + (cdb:remote-run db:testmeta-update-field #f test-name fld val))))) '(("author" 2)("owner" 3)("description" 4)("reviewed" 5)("tags" 9))))) ;; Update test_meta for all tests (define (runs:update-all-test_meta db) (let ((test-names (get-all-legal-tests))) @@ -988,11 +722,11 @@ (test-configf (conc test-path "/testconfig")) (testexists (and (file-exists? test-configf)(file-read-access? test-configf))) ;; read configs with tricks turned off (i.e. no system) (test-conf (if testexists (read-config test-configf #f #f)(make-hash-table)))) ;; use the open-run-close instead of passing in db - (runs:update-test_meta #f test-name test-conf))) + (runs:update-test_meta test-name test-conf))) test-names))) ;; This could probably be refactored into one complex query ... 
(define (runs:rollup-run keys keyvallst runname user) ;; was target, now keyvallst (debug:print 4 "runs:rollup-run, keys: " keys " keyvallst: " keyvallst " :runname " runname " user: " user) ADDED tests/fdktestqa/testqa/Makefile Index: tests/fdktestqa/testqa/Makefile ================================================================== --- /dev/null +++ tests/fdktestqa/testqa/Makefile @@ -0,0 +1,3 @@ + +all : + megatest -runtests % -target a/b :runname c Index: tests/fdktestqa/testqa/megatest.config ================================================================== --- tests/fdktestqa/testqa/megatest.config +++ tests/fdktestqa/testqa/megatest.config @@ -1,7 +1,8 @@ [setup] testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log +# runqueue 2 [include ../fdk.config] [server] timeout 0.01 Index: tests/fdktestqa/testqa/tests/bigrun/testconfig ================================================================== --- tests/fdktestqa/testqa/tests/bigrun/testconfig +++ tests/fdktestqa/testqa/tests/bigrun/testconfig @@ -7,11 +7,11 @@ # waiton setup priority 0 # Iteration for your tests are controlled by the items section [items] -NUMBER #{scheme (string-intersperse (map number->string (sort (let loop ((a 0)(res '()))(if (< a 150)(loop (+ a 1)(cons a res)) res)) >)) " ")} +NUMBER #{scheme (string-intersperse (map number->string (sort (let loop ((a 0)(res '()))(if (< a (or (any->number (get-environment-variable "NUMTESTS")) 1100))(loop (+ a 1)(cons a res)) res)) >)) " ")} # test_meta is a section for storing additional data on your test [test_meta] author matt owner matt Index: tests/fullrun/megatest.config ================================================================== --- tests/fullrun/megatest.config +++ tests/fullrun/megatest.config @@ -13,10 +13,14 @@ [setup] # Set launchwait to yes to use the old launch run code that waits for the launch process to return before # proceeding. 
# launchwait yes +# If defined the runs:run-tests-queue-new queue code is used with the register test depth +# given. Otherwise the old code is used. The old code will be removed in the future and +# a default of 10 used. +# runqueue 2 # It is possible (but not recommended) to override the rsync command used # to populate the test directories. For test development the following # example can be useful #