Index: configf.scm ================================================================== --- configf.scm +++ configf.scm @@ -14,10 +14,11 @@ ;;====================================================================== (use regex regex-case) ;; directory-utils) (declare (unit configf)) (declare (uses process)) +(declare (uses env)) (include "common_records.scm") ;; return list (path fullpath configname) (define (find-config configname #!key (toppath #f)) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -33,12 +33,23 @@ ;; (include "debugger.scm") ;; use this struct to facilitate refactoring ;; -(defstruct runs:dat hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode test-patts required-tests test-registry registry-mutex flags keyvals run-info newtal all-tests-registry itemmaps) +(defstruct runs:dat + reglen regfull + runname max-concurrent-jobs run-id + test-patts required-tests test-registry + registry-mutex flags keyvals run-info all-tests-registry + can-run-more-tests + ((can-run-more-tests-count 0) : fixnum)) + +(defstruct runs:testdat + hed tal reg reruns test-record + test-name item-path jobgroup + waitons testmode newtal itemmaps prereqs-not-met) ;; set up needed environment variables given a run-id and optionally a target, itempath etc. ;; (define (runs:set-megatest-env-vars run-id #!key (inkeys #f)(inrunname #f)(inkeyvals #f)(intarget #f)(testname #f)(itempath #f)) (let* ((target (or intarget @@ -99,13 +110,20 @@ ;; Every time can-run-more-tests is called increment the delay ;; ;; NOTE: We run this server-side!! Do not use this global except in the runs:can-run-more-tests routine ;; (define *last-num-running-tests* 0) -(define *runs:can-run-more-tests-count* 0) -(define (runs:shrink-can-run-more-tests-count) - (set! *runs:can-run-more-tests-count* 0)) ;; (/ *runs:can-run-more-tests-count* 2))) +;; (define *runs:can-run-more-tests-count* 0) +(define (runs:shrink-can-run-more-tests-count runsdat) + (runs:dat-can-run-more-tests-count-set! runsdat 0)) + +(define (runs:inc-can-run-more-tests-count runsdat) + (runs:dat-can-run-more-tests-count-set! + runsdat + (+ (runs:dat-can-run-more-tests-count runsdat) 1))) + +;; (set! *runs:can-run-more-tests-count* 0)) ;; (/ *runs:can-run-more-tests-count* 2))) ;; Temporary globals. Move these into the logic or into common ;; (define *seen-cant-run-tests* (make-hash-table)) ;; use to track tests that we suspect cannot be run (define (runs:inc-cant-run-tests testname) @@ -130,13 +148,17 @@ (begin (hash-table-set! *runs:denoise* key currtime) #t) #f))) -(define (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) +(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) + ;; Take advantage of a good place to exit if running the one-pass methodology + (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) + (args:get-arg "-one-pass")) + (exit 0)) (thread-sleep! (cond - ((> *runs:can-run-more-tests-count* 20) + ((> (runs:dat-can-run-more-tests-count runsdat) 20) (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ...")) 2);; obviously haven't had any work to do for a while (else 0))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) @@ -143,11 +165,11 @@ (job-group-limit (let ((jobg-count (config-lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0) - (set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1))) + (runs:inc-can-run-more-tests-count runsdat)) ;; (set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1))) (if (not (eq? *last-num-running-tests* num-running)) (begin (debug:print 2 *default-log-port* "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running) (set! *last-num-running-tests* num-running))) (if (not (eq? 0 *globalexitstatus*)) @@ -652,46 +674,50 @@ (conc t)))) inlst))) ;; hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode test-patts required-tests test-registry registry-mutex flags keyvals run-info newtal all-tests-registry itemmaps) -(define (runs:process-expanded-tests runsdat) - (let* ((hed (runs:dat-hed runsdat)) - (tal (runs:dat-tal runsdat)) - (reg (runs:dat-reg runsdat)) - (reruns (runs:dat-reruns runsdat)) - (reglen (runs:dat-reglen runsdat)) +(define (runs:process-expanded-tests runsdat testdat) + ;; unroll the contents of runsdat and testdat (due to ongoing refactoring). + (let* ((hed (runs:testdat-hed testdat)) + (tal (runs:testdat-tal testdat)) + (reg (runs:testdat-reg testdat)) + (reruns (runs:testdat-reruns testdat)) + (test-name (runs:testdat-test-name testdat)) + (item-path (runs:testdat-item-path testdat)) + (jobgroup (runs:testdat-jobgroup testdat)) + (waitons (runs:testdat-waitons testdat)) + (item-path (runs:testdat-item-path testdat)) + (testmode (runs:testdat-testmode testdat)) + (newtal (runs:testdat-newtal testdat)) + (itemmaps (runs:testdat-itemmaps testdat)) + (test-record (runs:testdat-test-record testdat)) + (prereqs-not-met (runs:testdat-prereqs-not-met testdat)) + + (reglen (runs:dat-reglen runsdat)) (regfull (runs:dat-regfull runsdat)) - (test-record (runs:dat-test-record runsdat)) - (runname (runs:dat-runname runsdat)) - (test-name (runs:dat-test-name runsdat)) - (item-path (runs:dat-item-path runsdat)) - (jobgroup (runs:dat-jobgroup runsdat)) - (max-concurrent-jobs (runs:dat-max-concurrent-jobs runsdat)) - (run-id (runs:dat-run-id runsdat)) - (waitons (runs:dat-waitons runsdat)) - (item-path (runs:dat-item-path runsdat)) - (testmode (runs:dat-testmode runsdat)) - (test-patts (runs:dat-test-patts runsdat)) - (required-tests (runs:dat-required-tests runsdat)) - (test-registry (runs:dat-test-registry runsdat)) - (registry-mutex (runs:dat-registry-mutex runsdat)) + (runname (runs:dat-runname runsdat)) + (max-concurrent-jobs (runs:dat-max-concurrent-jobs runsdat)) + (run-id (runs:dat-run-id runsdat)) + (test-patts (runs:dat-test-patts runsdat)) + (required-tests (runs:dat-required-tests runsdat)) + (test-registry (runs:dat-test-registry runsdat)) + (registry-mutex (runs:dat-registry-mutex runsdat)) (flags (runs:dat-flags runsdat)) (keyvals (runs:dat-keyvals runsdat)) (run-info (runs:dat-run-info runsdat)) - (newtal (runs:dat-newtal runsdat)) - (all-tests-registry (runs:dat-all-tests-registry runsdat)) - (itemmaps (runs:dat-itemmaps runsdat)) - (run-limits-info (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running - (have-resources (car run-limits-info)) - (num-running (list-ref run-limits-info 1)) - (num-running-in-jobgroup (list-ref run-limits-info 2)) - (max-concurrent-jobs (list-ref run-limits-info 3)) - (job-group-limit (list-ref run-limits-info 4)) - (prereqs-not-met (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) + (all-tests-registry (runs:dat-all-tests-registry runsdat)) + (run-limits-info (runs:dat-can-run-more-tests runsdat)) + ;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running + (have-resources (car run-limits-info)) + (num-running (list-ref run-limits-info 1)) + (num-running-in-jobgroup(list-ref run-limits-info 2)) + (max-concurrent-jobs (list-ref run-limits-info 3)) + (job-group-limit (list-ref run-limits-info 4)) + ;; (prereqs-not-met (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap)) - (fails (if (list? prereqs-not-met) + (fails (if (list? prereqs-not-met) (runs:calc-fails prereqs-not-met) (begin (debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met) '()))) (non-completed (filter (lambda (x) ;; remove hed from not completed list, duh, of course it is not completed! @@ -754,11 +780,11 @@ (if (not (eq? (hash-table-ref/default test-registry (db:test-make-full-name test-name "") #f) 'done)) (begin (rmt:register-test run-id test-name "") (if (rmt:get-test-id run-id test-name "") (hash-table-set! test-registry (db:test-make-full-name test-name "") 'done)))) - (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) (if (and (null? tal)(null? reg)) (list hed tal (append reg (list hed)) reruns) (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) ;; NB// Here we are building reg as we register tests @@ -807,11 +833,11 @@ (if (configf:lookup *configdat* "jobtools" "maxload") ;; only gate if maxload is specified (common:wait-for-cpuload maxload numcpus waitdelay)) (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) - (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) (if (or (not (null? tal))(not (null? reg))) (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) @@ -844,11 +870,11 @@ (begin (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path " from the launch list as it has prerequistes that are FAIL") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites"))) - (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) ;; This next is for the items (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed) (list (runs:queue-next-hed tal reg reglen regfull) @@ -873,11 +899,11 @@ (+ nth-try 1) 0)) (if (runs:lownoise (conc "not removing test " hed) 60) (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites")) ;; may not have processed correctly. Could be a race condition in your test implementation? Dropping test " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") - (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) + (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (list hed tal reg reruns) ;; (list (car newtal)(cdr newtal) reg reruns) ;; (hash-table-set! test-registry hed 'removed) (list (runs:queue-next-hed newtal reg reglen regfull) (runs:queue-next-tal newtal reg reglen regfull) @@ -1016,11 +1042,11 @@ ;; Do mark-and-find clean up of db before starting runing of quue ;; ;; (rmt:find-and-mark-incomplete) - (let ((run-info (rmt:get-run-info run-id)) + (let* ((run-info (rmt:get-run-info run-id)) (tests-info (mt:get-tests-for-run run-id #f '() '())) ;; qryvals: "id,testname,item_path")) (sorted-test-names (tests:sort-by-priority-and-waiton test-records)) (test-registry (make-hash-table)) (registry-mutex (make-mutex)) (num-retries 0) @@ -1030,11 +1056,40 @@ (string->number mcj) 1))) ;; length of the register queue ahead (reglen (if (number? reglen-in) reglen-in 1)) (last-time-incomplete (- (current-seconds) 900)) ;; force at least one clean up cycle (last-time-some-running (current-seconds)) - (tdbdat (tasks:open-db))) + (tdbdat (tasks:open-db)) + (runsdat (make-runs:dat + ;; hed: hed + ;; tal: tal + ;; reg: reg + ;; reruns: reruns + reglen: reglen + regfull: #f ;; regfull + ;; test-record: test-record + runname: runname + ;; test-name: test-name + ;; item-path: item-path + ;; jobgroup: jobgroup + max-concurrent-jobs: max-concurrent-jobs + run-id: run-id + ;; waitons: waitons + ;; testmode: testmode + test-patts: test-patts + required-tests: required-tests + test-registry: test-registry + registry-mutex: registry-mutex + flags: flags + keyvals: keyvals + run-info: run-info + ;; newtal: newtal + all-tests-registry: all-tests-registry + ;; itemmaps: itemmaps + ;; prereqs-not-met: (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps) + ;; can-run-more-tests: (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) ;; look at the test jobgroup and tot jobs running + ))) ;; Initialize the test-registery hash with tests that already have a record ;; convert state to symbol and use that as the hash value (for-each (lambda (trec) (let ((id (db:test-get-id trec)) @@ -1078,12 +1133,27 @@ (items (tests:testqueue-get-items test-record)) (item-path (item-list->path itemdat)) (tfullname (db:test-make-full-name test-name item-path)) (newtal (append tal (list hed))) (regfull (>= (length reg) reglen)) - (num-running (rmt:get-count-tests-running-for-run-id run-id))) - + (num-running (rmt:get-count-tests-running-for-run-id run-id)) + (testdat (make-runs:testdat + hed: hed + tal: tal + reg: reg + reruns: reruns + test-record: test-record + test-name: test-name + item-path: item-path + jobgroup: jobgroup + waitons: waitons + testmode: testmode + newtal: newtal + itemmaps: itemmaps + ;; prereqs-not-met: prereqs-not-met + ))) + (runs:dat-regfull-set! runsdat regfull) ;; every couple minutes verify the server is there for this run (if (and (common:low-noise-print 60 "try start server" run-id) (tasks:need-server run-id)) (tasks:start-and-wait-for-server tdbdat run-id 10)) ;; NOTE: delay and wait is done under the hood @@ -1129,21 +1199,10 @@ "\n reruns: " reruns "\n regfull: " regfull "\n reglen: " reglen "\n length reg: " (length reg) "\n reg: " reg) - - ;; lets use the debugger eh? -;; (debugger-start start: 7) -;; (debugger-trace-var "runs:run-tests-queue" "") -;; (debugger-trace-var "hed" hed) -;; (debugger-trace-var "tal" tal) -;; (debugger-trace-var "items" items) -;; (debugger-trace-var "item-path" item-path) -;; (debugger-trace-var "waitons" waitons) -;; (debugger-pauser) - ;; check for hed in waitons => this would be circular, remove it and issue an ;; error (if (member test-name waitons) (begin @@ -1173,39 +1232,14 @@ ((not items) (debug:print-info 4 *default-log-port* "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) - (let ((runsdat (make-runs:dat - hed: hed - tal: tal - reg: reg - reruns: reruns - reglen: reglen - regfull: regfull - test-record: test-record - runname: runname - test-name: test-name - item-path: item-path - jobgroup: jobgroup - max-concurrent-jobs: max-concurrent-jobs - run-id: run-id - waitons: waitons - item-path: item-path - testmode: testmode - test-patts: test-patts - required-tests: required-tests - test-registry: test-registry - registry-mutex: registry-mutex - flags: flags - keyvals: keyvals - run-info: run-info - newtal: newtal - all-tests-registry: all-tests-registry - itemmaps: itemmaps))) - (let ((loop-list (runs:process-expanded-tests runsdat))) - (if loop-list (apply loop loop-list))))) + (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) + (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running + (let ((loop-list (runs:process-expanded-tests runsdat testdat))) + (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated (not itemdat)) ;; and not yet expanded into the list of things to be done @@ -1252,11 +1286,11 @@ ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS ((or (procedure? items)(eq? items 'have-procedure)) - (let ((can-run-more (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs))) + (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) (if loop-list (apply loop loop-list)))