@@ -50,17 +50,18 @@ waitons testmode newtal itemmaps prereqs-not-met) ;; set up needed environment variables given a run-id and optionally a target, itempath etc. ;; (define (runs:set-megatest-env-vars run-id #!key (inkeys #f)(inrunname #f)(inkeyvals #f)(intarget #f)(testname #f)(itempath #f)) + ;;(bb-check-path msg: "runs:set-megatest-env-vars entry") (let* ((target (or intarget (common:args-get-target) (get-environment-variable "MT_TARGET"))) (keys (if inkeys inkeys (rmt:get-keys))) (keyvals (if inkeyvals inkeyvals (keys:target->keyval keys target))) (vals (hash-table-ref/default *env-vars-by-run-id* run-id #f)) - (link-tree (configf:lookup *configdat* "setup" "linktree"))) + (link-tree (common:get-linktree))) ;; (configf:lookup *configdat* "setup" "linktree"))) (if testname (setenv "MT_TEST_NAME" testname)) (if itempath (setenv "MT_ITEMPATH" itempath)) ;; get the info from the db and put it in the cache (if link-tree @@ -73,26 +74,61 @@ (for-each (lambda (key) (hash-table-set! vals (car key) (cadr key))) keyvals))) ;; from the cached data set the vars + (hash-table-for-each vals (lambda (key val) (debug:print 2 *default-log-port* "setenv " key " " val) (safe-setenv key val))) + ;;(bb-check-path msg: "runs:set-megatest-env-vars block 1") + ;;(BB> "*env-vars-by-run-id*/runid("run-id" vals="(hash-table->alist vals)) + (if (not (get-environment-variable "MT_TARGET"))(setenv "MT_TARGET" target)) - (alist->env-vars (hash-table-ref/default *configdat* "env-override" '())) + ;; we had a case where there was an exception generated by the hash-table-ref + ;; due to *configdat* being #f Adding a handle and exit + (let fatal-loop ((count 0)) + (handle-exceptions + exn + (let ((call-chain (get-call-chain)) + (msg ((condition-property-accessor 'exn 'message) exn))) + (if (< count 5) + (begin ;; this call is colliding, do some crude stuff to fix it. + (debug:print 0 *default-log-port* "ERROR: *configdat* was inaccessible! This should never happen. Retry #" count) + (launch:setup force-reread: #t) + (fatal-loop (+ count 1))) + (begin + (debug:print 0 *default-log-port* "FATAL: *configdat* was inaccessible! This should never happen. Retried " count " times. Message: " msg) + (debug:print 0 *default-log-port* "Call chain:") + (with-output-to-port *default-log-port* + + (lambda () + (print "*configdat* is >>"*configdat*"<<") + (pp *configdat*) + (pp call-chain))) + + (exit 1)))) + ;;(bb-check-path msg: "runs:set-megatest-env-vars block 1.5") + (when (or (not *configdat*) (not (hash-table? *configdat*))) + (debug:print 0 *default-log-port* "WARNING: *configdat* was inaccessible! This should never happen. Brute force reread.") + ;;(BB> "ERROR: *configdat* was inaccessible! This should never happen. Brute force reread.") + (thread-sleep! 2) ;; assuming nfs lag. + (launch:setup force-reread: #t)) + (alist->env-vars (hash-table-ref/default *configdat* "env-override" '())))) ;;;; environment is tainted HERE in this let block. + ;;(bb-check-path msg: "runs:set-megatest-env-vars block 2") ;; Lets use this as an opportunity to put MT_RUNNAME in the environment (let ((runname (if inrunname inrunname (rmt:get-run-name-from-id run-id)))) (if runname (setenv "MT_RUNNAME" runname) (debug:print-error 0 *default-log-port* "no value for runname for id " run-id))) (setenv "MT_RUN_AREA_HOME" *toppath*) ;; if a testname and itempath are available set the remaining appropriate variables (if testname (setenv "MT_TEST_NAME" testname)) (if itempath (setenv "MT_ITEMPATH" itempath)) + ;;(bb-check-path msg: "runs:set-megatest-env-vars block 3") (if (and testname link-tree) (setenv "MT_TEST_RUN_DIR" (conc (getenv "MT_LINKTREE") "/" (getenv "MT_TARGET") "/" (getenv "MT_RUNNAME") "/" (getenv "MT_TEST_NAME") @@ -191,36 +227,121 @@ " in jobgroup \"" jobgroup "\" exceeds limit of " job-group-limit)) #t) (else #f)))) (list (not can-not-run-more) num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit))))) +(define (runs:run-pre-hook run-id) + (let* ((run-pre-hook (configf:lookup *configdat* "runs" "pre-hook")) + (existing-tests (if run-pre-hook + (rmt:get-tests-for-run run-id "%" '() '() ;; run-id testpatt states statuses + #f #f ;; offset limit + #f ;; not-in + #f ;; sort-by + #f ;; sort-order + #f ;; get full data (not 'shortlist) + 0 ;; (runs:gendat-inc-results-last-update *runs:general-data*) ;; last update time + 'dashboard) + '())) + (log-dir (conc *toppath* "/logs")) + (log-file (conc "pre-hook-" (string-translate (getenv "MT_TARGET") "/" "-") "-" (getenv "MT_RUNNAME") ".log")) + (full-log-fname (conc log-dir "/" log-file))) + (if run-pre-hook + (if (null? existing-tests) + (let* ((use-log-dir (if (not (directory-exists? log-dir)) + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir) + #f) + (create-directory log-dir #t) + #t) + #t)) + (start-time (current-seconds)) + (actual-logf (if use-log-dir full-log-fname log-file))) + (handle-exceptions + exn + (begin + (print-call-chain *default-log-port*) + (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "ERROR: failed to run pre-hook " run-pre-hook ", check the log " log-file)) + (debug:print-info 0 *default-log-port* "running run-pre-hook: \"" run-pre-hook "\", log is " actual-logf) + (system (conc run-pre-hook " >> " actual-logf " 2>&1")) + (debug:print-info 0 *default-log-port* "pre-hook \"" run-pre-hook "\" took " (- (current-seconds) start-time) " seconds to run."))) + (debug:print 0 *default-log-port* "Skipping pre-hook call \"" run-pre-hook "\" as there are existing tests for this run."))))) + +(define (runs:run-post-hook run-id) + (let* ((run-post-hook (configf:lookup *configdat* "runs" "post-hook")) + (existing-tests (if run-post-hook + (rmt:get-tests-for-run run-id "%" '() '() ;; run-id testpatt states statuses + #f #f ;; offset limit + #f ;; not-in + #f ;; sort-by + #f ;; sort-order + #f ;; get full data (not 'shortlist) + 0 ;; (runs:gendat-inc-results-last-update *runs:general-data*) ;; last update time + 'dashboard) + '())) + (log-dir (conc *toppath* "/logs")) + (log-file (conc "post-hook-" (string-translate (getenv "MT_TARGET") "/" "-") "-" (getenv "MT_RUNNAME") ".log")) + (full-log-fname (conc log-dir "/" log-file))) + (if run-post-hook + ;; (if (null? existing-tests) + ;; (debug:print 0 *default-log-port* "Skipping post-hook call \"" run-post-hook "\" as there are existing tests for this run."))))) + (let* ((use-log-dir (if (not (directory-exists? log-dir)) + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir) + #f) + (create-directory log-dir #t) + #t) + #t)) + (start-time (current-seconds)) + (actual-logf (if use-log-dir full-log-fname log-file))) + (handle-exceptions + exn + (begin + (print-call-chain *default-log-port*) + (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "ERROR: failed to run post-hook " run-post-hook ", check the log " log-file)) + (debug:print-info 0 *default-log-port* "running run-post-hook: \"" run-post-hook "\", log is " actual-logf) + (system (conc run-post-hook " >> " actual-logf " 2>&1")) + (debug:print-info 0 *default-log-port* "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run.")))))) ;; test-names: Comma separated patterns same as test-patts but used in selection ;; of tests to run. The item portions are not respected. ;; FIXME: error out if /patt specified ;; (define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names (let* ((keys (keys:config-get-fields *configdat*)) (keyvals (keys:target->keyval keys target)) - (run-id (rmt:register-run keyvals runname "new" "n/a" user)) ;; test-name))) + (run-id (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour"))) ;; test-name))) ;; (deferred '()) ;; delay running these since they have a waiton clause (runconfigf (conc *toppath* "/runconfigs.config")) + (dbfile (conc *toppath* "/megatest.db")) + (readonly-mode (not (file-write-access? dbfile))) (test-records (make-hash-table)) ;; need to process runconfigs before generating these lists (all-tests-registry #f) ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names (all-test-names #f) ;; (hash-table-keys all-tests-registry)) (test-names #f) ;; Generated by a call to (tests:filter-test-names all-test-names test-patts)) (required-tests #f) ;; Put fully qualified test/testpath names in this list to be done (task-key (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id))) - (tdbdat (tasks:open-db)) + ;; (tdbdat (tasks:open-db)) (config-reruns (let ((x (configf:lookup *configdat* "setup" "reruns"))) - (if x (string->number x) #f)))) + (if x (string->number x) #f))) + (allowed-tests #f)) + + ;; check if readonly + (when readonly-mode + (debug:print-error 0 *default-log-port* "megatest.db is readonly. Cannot proceed.") + (exit 1)) ;; per user request. If less than 100Meg space on dbdir partition, bail out with error ;; this will reduce issues in database corruption (common:check-db-dir-and-exit-if-insufficient) - + ;; override the number of reruns from the configs (if (and config-reruns (> run-count config-reruns)) (set! run-count config-reruns)) @@ -229,12 +350,12 @@ (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (set! *time-to-exit* #t) (print "Received signal " signum ", cleaning up before exit. Please wait...") (let ((th1 (make-thread (lambda () - (let ((tdbdat (tasks:open-db))) - (rmt:tasks-set-state-given-param-key task-key "killed")) + ;; (let ((tdbdat (tasks:open-db))) + (rmt:tasks-set-state-given-param-key task-key "killed") ;; ) (print "Killed by signal " signum ". Exiting") (thread-sleep! 3) (exit)))) (th2 (make-thread (lambda () (thread-sleep! 5) @@ -244,28 +365,48 @@ (thread-start! th1) (thread-join! th2))))) (set-signal-handler! signal/int sighand) (set-signal-handler! signal/term sighand)) + ;; force the starting of a server -- removed BB 17ww28 - no longer needed. + ;;(debug:print 0 *default-log-port* "waiting on server...") + ;;(server:start-and-wait *toppath*) + (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process - (set! runconf (if (file-exists? runconfigf) + (set! runconf (if (common:file-exists? runconfigf) (setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target) (begin (debug:print 0 *default-log-port* "WARNING: You do not have a run config file: " runconfigf) #f))) (if (not test-patts) ;; first time in - adjust testpatt (set! test-patts (common:args-get-testpatt runconf))) + ;; if test-patts is #f at this point there is something wrong and we need to bail out + (if (not test-patts) + (begin + (debug:print 0 *default-log-port* "WARNING: there is no test pattern for this run. Exiting now.") + (exit 0))) + + (if (args:get-arg "-tagexpr") + (begin + (set! allowed-tests (string-join (runs:get-tests-matching-tags (args:get-arg "-tagexpr")) ",")) + (debug:print-info 0 *default-log-port* "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests) + ));; tests will be ANDed with this list ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") ;; Now generate all the tests lists (set! all-tests-registry (tests:get-all)) ;; hash of testname => path-to-test (set! all-test-names (hash-table-keys all-tests-registry)) - (set! test-names (tests:filter-test-names all-test-names test-patts)) + ;; filter first for allowed-tests (from -tagexpr) then for test-patts. + (set! test-names (tests:filter-test-names + (if allowed-tests + (tests:filter-test-names all-test-names allowed-tests) + all-test-names) + test-patts)) ;; I think seeding required-tests with all test-names makes sense but lack analysis to back that up. ;; NEW STRATEGY HERE: ;; 1. fill required tests with test-patts @@ -300,17 +441,26 @@ ;; ;; (rmt:general-call 'delete-tests-in-state run-id "NOT_STARTED") ;; Now convert anything in allow-auto-rerun to NOT_STARTED ;; - (for-each (lambda (state) - (rmt:set-tests-state-status run-id test-names state #f "NOT_STARTED" state)) - (string-split (or (configf:lookup *configdat* "setup" "allow-auto-rerun") ""))))) + (for-each + (lambda (state-status) + (let* ((ss-lst (string-split-fields "/" state-status #:infix)) + (state (if (> (length ss-lst) 0)(car ss-lst) #f)) + (status (if (> (length ss-lst) 1)(cadr ss-lst) #f))) + (rmt:set-tests-state-status run-id test-names state status "NOT_STARTED" status))) + ;; list of state/status pairs separated by spaces + (string-split (or (configf:lookup *configdat* "setup" "allow-auto-rerun") ""))))) ;; Ensure all tests are registered in the test_meta table (runs:update-all-test_meta #f) + ;; run the run prehook if there are no tests yet run for this run: + ;; + (runs:run-pre-hook run-id) + ;; now add non-directly referenced dependencies (i.e. waiton) ;;====================================================================== ;; refactoring this block into tests:get-full-data ;; ;; What happended, this code is now duplicated in tests!? @@ -403,21 +553,17 @@ (let ((reglen (configf:lookup *configdat* "setup" "runqueue"))) (if (> (length (hash-table-keys test-records)) 0) (let* ((keep-going #t) (run-queue-retries 5) (th1 (make-thread (lambda () - (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry)) - ;; (handle-exceptions - ;; exn - ;; (begin - ;; (print-call-chain (current-error-port)) - ;; (debug:print-error 0 *default-log-port* "failure in runs:run-tests-queue thread, error: " ((condition-property-accessor 'exn 'message) exn)) - ;; (if (> run-queue-retries 0) - ;; (begin - ;; (set! run-queue-retries (- run-queue-retries 1)) - ;; (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry)))) - ;; (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry))) + (handle-exceptions + exn + (begin + (print-call-chain) + (print " message: " ((condition-property-accessor 'exn 'message) exn))) + (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests + (any->number reglen) all-tests-registry))) "runs:run-tests-queue")) (th2 (make-thread (lambda () ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) @@ -477,12 +623,21 @@ (cdr reg) (if (null? tal) ;; if tal is null and reg not full then '() as reg contents moved to tal '() reg))) +;; this is the list of parameters to the named loop "loop" near the top of runs:run-tests-queue, look around line 1216 +;; +(define (runs:loop-values tal reg reglen regfull reruns) + (list (runs:queue-next-hed tal reg reglen regfull) ;; hed + (runs:queue-next-tal tal reg reglen regfull) ;; tal + (runs:queue-next-reg tal reg reglen regfull) ;; reg + reruns)) ;; reruns + (define runs:nothing-left-in-queue-count 0) +;; BB: for future reference - suspect target vars are not expanded to env vars at this point (item expansion using [items]\nwhatever [system echo $TARGETVAR] doesnt work right whereas [system echo #{targetvar}] does.. Tal and Randy have tix on this. on first pass, var not set, on second pass, ok. (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps) (let* ((loop-list (list hed tal reg reruns)) (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))) (if (list? res) res @@ -518,14 +673,11 @@ (member (hash-table-ref/default test-registry (db:test-make-full-name hed item-path) 'n/a) '(DONOTRUN removed CANNOTRUN))) ;; *common:cant-run-states-sym*) ;; '(COMPLETED KILLED WAIVED UNKNOWN INCOMPLETE)) ;; try to catch repeat processing of COMPLETED tests here (debug:print-info 1 *default-log-port* "Test " hed " set to \"" (hash-table-ref test-registry (db:test-make-full-name hed item-path)) "\". Removing it from the queue") (if (or (not (null? tal)) (not (null? reg))) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns) + (runs:loop-values tal reg reglen regfull reruns) (begin (debug:print-info 0 *default-log-port* "Nothing left in the queue!") ;; If get here twice then we know we've tried to expand all items ;; since there must be a logic issue with the handling of loops in the ;; items expand phase we will brute force an exit here. @@ -594,14 +746,12 @@ (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_DISCARDED" "Failed to run due to discarded prerequisites"))) (if (and (null? trimmed-tal) (null? trimmed-reg)) #f - (list (runs:queue-next-hed trimmed-tal trimmed-reg reglen regfull) - (runs:queue-next-tal trimmed-tal trimmed-reg reglen regfull) - (runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull) - reruns))) + (runs:loop-values trimmed-tal trimmed-reg reglen regfull reruns) + )) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) ((and (null? fails) (null? prereq-fails) (null? non-completed)) @@ -616,14 +766,12 @@ (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while."))) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns)))) + (runs:loop-values tal reg reglen regfull reruns) + ))) ((and (or (not (null? fails)) (not (null? prereq-fails))) (member 'normal testmode)) @@ -636,27 +784,21 @@ (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_DISCARDED" "Failed to run due to prior failed prerequisites") (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))) (if (or (not (null? reg))(not (null? tal))) (begin (hash-table-set! test-registry hed 'CANNOTRUN) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - (cons hed reruns))) + (runs:loop-values tal reg reglen regfull (cons hed reruns)) + ) #f)) ;; #f flags do not loop ((and (not (null? fails))(member 'toplevel testmode)) (if (or (not (null? reg))(not (null? tal))) (list (car newtal)(append (cdr newtal) reg) '() reruns) #f)) - ((null? runnables) #f) ;; if we get here and non-completed is null the it's all over. + ((null? runnables) #f) ;; if we get here and non-completed is null then it is all over. (else (debug:print 0 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now") - ;; (list (runs:queue-next-hed tal reg reglen regfull) - ;; (runs:queue-next-tal tal reg reglen regfull) - ;; (runs:queue-next-reg tal reg reglen regfull) - ;; reruns) (list (car newtal)(cdr newtal) reg reruns))))) (define (runs:mixed-list-testname-and-testrec->list-of-strings inlst) (if (null? inlst) '() @@ -724,12 +866,13 @@ (not (equal? x hed))) (runs:calc-not-completed prereqs-not-met))) (loop-list (list hed tal reg reruns)) ;; configure the load runner (numcpus (common:get-num-cpus #f)) - (maxload (string->number (or (configf:lookup *configdat* "jobtools" "maxload") "3"))) - (waitdelay (string->number (or (configf:lookup *configdat* "jobtools" "waitdelay") "60")))) + (maxload (string->number (or (configf:lookup *configdat* "jobtools" "maxload") "3.0"))) ;; use a non-number string to disable + (maxhomehostload (string->number (or (configf:lookup *configdat* "jobtools" "maxhomehostload") "1.2"))) ;; use a non-number string to disable + (waitdelay (string->number (or (configf:lookup *configdat* "jobtools" "waitdelay") "60")))) (debug:print-info 4 *default-log-port* "have-resources: " have-resources " prereqs-not-met: (" (string-intersperse (map (lambda (t) (if (vector? t) (conc (db:test-get-state t) "/" (db:test-get-status t)) @@ -755,14 +898,11 @@ ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) ;; This test/itempath is not to be run ;; else the run is stuck, temporarily or permanently ;; but should check if it is due to lack of resources vs. prerequisites (debug:print-info 1 *default-log-port* "Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " test-patts) (if (or (not (null? tal))(not (null? reg))) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns) + (runs:loop-values tal reg reglen regfull reruns) #f)) ;; Register tests ;; ((not (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) @@ -783,11 +923,11 @@ (if (rmt:get-test-id run-id test-name "") (hash-table-set! test-registry (db:test-make-full-name test-name "") 'done)))) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) (if (and (null? tal)(null? reg)) (list hed tal (append reg (list hed)) reruns) - (list (runs:queue-next-hed tal reg reglen regfull) + (list (runs:queue-next-hed tal reg reglen regfull) ;; cannot replace with a call to runs:loop-values as the logic is different for reg (runs:queue-next-tal tal reg reglen regfull) ;; NB// Here we are building reg as we register tests ;; if regfull we must pop the front item off reg (if regfull (append (cdr reg) (list hed)) @@ -821,29 +961,30 @@ ;; This is the final stage, everything is in place so launch the test ;; ((and have-resources (or (null? prereqs-not-met) (and (member 'toplevel testmode) ;; 'toplevel) - (null? non-completed)))) + (null? non-completed) + (not (member 'exclusive testmode))))) ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! *max-tries-hash* (make-hash-table)) ;; well, first lets see if cpu load throttling is enabled. If so wait around until the ;; average cpu load is under the threshold before continuing - (if (configf:lookup *configdat* "jobtools" "maxload") ;; only gate if maxload is specified - (common:wait-for-cpuload maxload numcpus waitdelay)) + (if maxload ;; only gate if maxload is specified + (common:wait-for-cpuload maxload numcpus waitdelay)) + (if maxhomehostload + (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) + (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) (if (or (not (null? tal))(not (null? reg))) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns) + (runs:loop-values tal reg reglen regfull reruns) #f)) ;; must be we have unmet prerequisites ;; (else @@ -875,25 +1016,18 @@ (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) ;; This next is for the items (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed) - (list (runs:queue-next-hed tal reg reglen regfull) - (runs:queue-next-tal tal reg reglen regfull) - (runs:queue-next-reg tal reg reglen regfull) - reruns ;; WAS: (cons hed reruns) ;; but that makes no sense? - )) + (runs:loop-values tal reg reglen regfull reruns)) (let ((nth-try (hash-table-ref/default test-registry hed 0))) (cond ((member "RUNNING" (map db:test-get-state prereqs-not-met)) (if (runs:lownoise (conc "possible RUNNING prerequistes " hed) 60) (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites, don't give up on it yet.")) (thread-sleep! 4) - (list (runs:queue-next-hed newtal reg reglen regfull) - (runs:queue-next-tal newtal reg reglen regfull) - (runs:queue-next-reg newtal reg reglen regfull) - reruns)) + (runs:loop-values tal reg reglen regfull reruns)) ((or (not nth-try) (and (number? nth-try) (< nth-try 10))) (hash-table-set! test-registry hed (if (number? nth-try) (+ nth-try 1) @@ -900,17 +1034,11 @@ 0)) (if (runs:lownoise (conc "not removing test " hed) 60) (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites")) ;; may not have processed correctly. Could be a race condition in your test implementation? Dropping test " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) - ;; (list hed tal reg reruns) - ;; (list (car newtal)(cdr newtal) reg reruns) - ;; (hash-table-set! test-registry hed 'removed) - (list (runs:queue-next-hed newtal reg reglen regfull) - (runs:queue-next-tal newtal reg reglen regfull) - (runs:queue-next-reg newtal reg reglen regfull) - reruns)) + (runs:loop-values newtal reg reglen regfull reruns)) ((symbol? nth-try) (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW (if (null? tal) #f ;; yes, really (list (car tal)(cdr tal) reg reruns)) @@ -917,14 +1045,11 @@ (begin (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60) (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state " nth-try " will be overridden and we'll retry.")) (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f) (hash-table-set! test-registry hed 0) - (list (runs:queue-next-hed newtal reg reglen regfull) - (runs:queue-next-tal newtal reg reglen regfull) - (runs:queue-next-reg newtal reg reglen regfull) - reruns)))) + (runs:loop-values newtal reg reglen regfull)))) (else (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60) (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests and we've tried at least 10 times to run it. Giving up now.")) ;; (debug:print 0 *default-log-port* " prereqs: " prereqs-not-met) (hash-table-set! test-registry hed 'removed) @@ -934,17 +1059,14 @@ (list (if (null? tal)(car newtal)(car tal)) tal reg reruns))))) ;; can't drop this - maybe running? Just keep trying - (let ((runable-tests (runs:runable-tests prereqs-not-met))) + (let ((runable-tests (runs:runable-tests prereqs-not-met))) ;; SUSPICIOUS: Should look at more than just prereqs-not-met? (if (null? runable-tests) #f ;; I think we are truly done here - (list (runs:queue-next-hed newtal reg reglen regfull) - (runs:queue-next-tal newtal reg reglen regfull) - (runs:queue-next-reg newtal reg reglen regfull) - reruns))))))))) + (runs:loop-values newtal reg reglen regfull reruns))))))))) ;; scan a list of tests looking to see if any are potentially runnable ;; (define (runs:runable-tests tests) (filter (lambda (t) @@ -1043,25 +1165,22 @@ ;; Do mark-and-find clean up of db before starting runing of quue ;; ;; (rmt:find-and-mark-incomplete) - (let* ((run-info (rmt:get-run-info run-id)) + (let* ((run-info (rmt:get-run-info run-id)) (tests-info (mt:get-tests-for-run run-id #f '() '())) ;; qryvals: "id,testname,item_path")) (sorted-test-names (tests:sort-by-priority-and-waiton test-records)) (test-registry (make-hash-table)) (registry-mutex (make-mutex)) (num-retries 0) (max-retries (config-lookup *configdat* "setup" "maxretries")) - (max-concurrent-jobs (let ((mcj (config-lookup *configdat* "setup" "max_concurrent_jobs"))) - (if (and mcj (string->number mcj)) - (string->number mcj) - 1))) ;; length of the register queue ahead - (reglen (if (number? reglen-in) reglen-in 1)) + (max-concurrent-jobs (configf:lookup-number *configdat* "setup" "max_concurrent_jobs" default: 50)) + (reglen (if (number? reglen-in) reglen-in 1)) (last-time-incomplete (- (current-seconds) 900)) ;; force at least one clean up cycle (last-time-some-running (current-seconds)) - (tdbdat (tasks:open-db)) + ;; (tdbdat (tasks:open-db)) (runsdat (make-runs:dat ;; hed: hed ;; tal: tal ;; reg: reg ;; reruns: reruns @@ -1151,14 +1270,19 @@ newtal: newtal itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) - ;; every couple minutes verify the server is there for this run - (if (and (common:low-noise-print 60 "try start server" run-id) - (tasks:need-server run-id)) - (tasks:start-and-wait-for-server tdbdat run-id 10)) ;; NOTE: delay and wait is done under the hood + + ;; -- removed BB 17ww28 - no longer needed. + ;; every 15 minutes verify the server is there for this run + ;; (if (and (common:low-noise-print 240 "try start server" run-id) + ;; (not (or (and *runremote* + ;; (remote-server-url *runremote*) + ;; (server:ping (remote-server-url *runremote*))) + ;; (server:check-if-running *toppath*)))) + ;; (server:kind-run *toppath*)) (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) @@ -1286,11 +1410,11 @@ (loop (car tal)(cdr tal) reg reruns))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS - ((or (procedure? items)(eq? items 'have-procedure)) + ((or (procedure? items)(eq? items 'have-procedure)) ;; BB - target vars are env vars here? to allow expansion of [items]\nsomething [system echo $SOMETARGVAR], which is wonky (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) (if loop-list @@ -1324,11 +1448,11 @@ ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) (thread-sleep! 5) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) (prev-num-running 0)) - ;; (BB> "num-running=" num-running ", prev-num-running=" prev-num-running) + ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup *configdat* "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes @@ -1343,10 +1467,11 @@ (thread-sleep! 5) ;; (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! + (runs:run-post-hook run-id) (debug:print-info 1 *default-log-port* "All tests launched"))) (define (runs:calc-fails prereqs-not-met) (filter (lambda (test) (and (vector? test) ;; not (string? test)) @@ -1381,11 +1506,12 @@ (filter (lambda (t) (or (not (vector? t)) (and (equal? "NOT_STARTED" (db:test-get-state t)) (member (db:test-get-status t) - '("n/a" "KEEP_TRYING"))))) + '("n/a" "KEEP_TRYING"))) + (and (equal? "RUNNING" (db:test-get-state t))))) ;; account for a test that is running prereqs-not-met)) (define (runs:pretty-string lst) (map (lambda (t) (if (not (vector? t)) @@ -1466,11 +1592,11 @@ (thread-sleep! 1) (loop))))) (if (not testdat) ;; should NOT happen (debug:print-error 0 *default-log-port* "failed to get test record for test-id " test-id)) (set! test-id (db:test-get-id testdat)) - (if (file-exists? test-path) + (if (common:file-exists? test-path) (change-directory test-path) (begin (debug:print-error 0 *default-log-port* "test run path not created before attempting to run the test. Perhaps you are running -remove-runs at the same time?") (change-directory *toppath*))) (case (if force ;; (args:get-arg "-force") @@ -1534,13 +1660,12 @@ (let ((running-tests (rmt:get-tests-for-runs-mindata #f full-test-name '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED") '() #f))) (if (not (null? running-tests)) ;; have to skip (set! skip-test "Skipping due to previous tests running")))) ((and skip-check (configf:lookup test-conf "skip" "fileexists")) - (if (file-exists? (configf:lookup test-conf "skip" "fileexists")) + (if (common:file-exists? (configf:lookup test-conf "skip" "fileexists")) (set! skip-test (conc "Skipping due to existance of file " (configf:lookup test-conf "skip" "fileexists"))))) - ((and skip-check (configf:lookup test-conf "skip" "rundelay")) ;; run-ids = #f means *all* runs (let* ((numseconds (common:hms-string->seconds (configf:lookup test-conf "skip" "rundelay"))) (running-tests (rmt:get-tests-for-runs-mindata #f full-test-name '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED") '() #f)) @@ -1626,23 +1751,33 @@ ;; 'remove-runs ;; 'set-state-status ;; ;; NB// should pass in keys? ;; -(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(mode 'remove-all)(options '())) +(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(mode #f)(options '())) (common:clear-caches) ;; clear all caches (let* ((db #f) - (tdbdat (tasks:open-db)) + ;; (tdbdat (tasks:open-db)) (keys (rmt:get-keys)) (rundat (mt:get-runs-by-patt keys runnamepatt target)) (header (vector-ref rundat 0)) (runs (vector-ref rundat 1)) (states (if state (string-split state ",") '())) (statuses (if status (string-split status ",") '())) (state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f))) (rp-mutex (make-mutex)) - (bup-mutex (make-mutex))) + (bup-mutex (make-mutex)) + (keep-records (args:get-arg "-keep-records"))) ;; used in conjunction with -remove-runs to keep the records, TODO: consolidate this with "mode". + + (let* ((write-access-actions '(remove-runs set-state-status archive run-wait)) + (dbfile (conc *toppath* "/megatest.db")) + (readonly-mode (not (file-write-access? dbfile)))) + (when (and readonly-mode + (member action write-access-actions)) + (debug:print-error 0 *default-log-port* "megatest.db is readonly. Cannot proceed with action ["action"] in which write-access isrequired .") + (exit 1))) + (debug:print-info 4 *default-log-port* "runs:operate-on => Header: " header " action: " action " new-state-status: " new-state-status) (if (> 2 (length state-status)) (begin (debug:print-error 0 *default-log-port* "the parameter to -set-state-status is a comma delimited string. E.g. COMPLETED,FAIL") (exit))) @@ -1677,11 +1812,11 @@ (tasks:kill-runner target run-name testpatt) ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt)) (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((set-state-status) - (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) + ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) (debug:print 1 *default-log-port* "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((print-run) (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header) action) ((run-wait) @@ -1788,29 +1923,32 @@ ((archive) (if (and run-dir (not toplevel-with-children)) (let ((ddir (conc run-dir "/"))) (case (string->symbol (args:get-arg "-archive")) ((save save-remove keep-html) - (if (file-exists? ddir) + (if (common:file-exists? ddir) (debug:print-info 0 *default-log-port* "Estimating disk space usage for " test-fulln ": " (common:get-disk-space-used ddir))))))) (if (not (null? tal)) (loop (car tal)(cdr tal)))) ))) ) (if worker-thread (thread-join! worker-thread)))))) ;; remove the run if zero tests remain (if (eq? action 'remove-runs) - (let ((remtests (mt:get-tests-for-run (db:get-value-by-header run header "id") #f '("DELETED") '("n/a") not-in: #t))) + (let* ((run-id (db:get-value-by-header run header "id")) ;; NB// masks run-id from above? + (remtests (mt:get-tests-for-run run-id #f '("DELETED") '("n/a") not-in: #t))) (if (null? remtests) ;; no more tests remaining (let* ((dparts (string-split lasttpath "/")) (runpath (conc "/" (string-intersperse (take dparts (- (length dparts) 1)) "/")))) (debug:print 1 *default-log-port* "Removing run: " runkey " " (db:get-value-by-header run header "runname") " and related record") - (rmt:delete-run run-id) - (rmt:delete-old-deleted-test-records) - ;; (rmt:set-var "DELETED_TESTS" (current-seconds)) + (if (not keep-records) + (begin + (rmt:delete-run run-id) + (rmt:delete-old-deleted-test-records))) + ;; (rmt:set-var "DELETED_TESTS" (current-seconds)) ;; need to figure out the path to the run dir and remove it if empty ;; (if (null? (glob (conc runpath "/*"))) ;; (begin ;; (debug:print 1 *default-log-port* "Removing run dir " runpath) ;; (system (conc "rmdir -p " runpath)))) @@ -1821,25 +1959,41 @@ ) #t) (define (runs:remove-test-directory test mode) ;; remove-data-only) (let* ((run-dir (db:test-get-rundir test)) ;; run dir is from the link tree - (real-dir (if (file-exists? run-dir) + (real-dir (if (common:file-exists? run-dir) ;; (resolve-pathname run-dir) (common:nice-path run-dir) - #f))) - (case mode + #f)) + (clean-mode (or mode 'remove-all)) + (test-id (db:test-get-id test)) + ;; (lock-key (conc "test-" test-id)) + ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) + ;; (expire-time (+ (current-seconds) 30))) ;; give up on getting the lock and steal it after 15 seconds + ;; (if (car lock) + ;; #t + ;; (if (> (current-seconds) expire-time) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to clean test with id " test-id) + ;; (rmt:no-sync-del! lock-key) ;; destroy the lock + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; + ;; (begin + ;; (thread-sleep! 1) + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time))))))) + ) + (case clean-mode ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f)) ((remove-all) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f)) ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f))) (debug:print-info 1 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) (if (and real-dir (> (string-length real-dir) 5) - (file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc. + (common:file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc. (begin ;; let* ((realpath (resolve-pathname run-dir))) (debug:print-info 1 *default-log-port* "Recursively removing " real-dir) - (if (file-exists? real-dir) + (if (common:file-exists? real-dir) (runs:safe-delete-test-dir real-dir) (debug:print 0 *default-log-port* "WARNING: test dir " real-dir " appears to not exist or is not readable"))) (if real-dir (debug:print 0 *default-log-port* "WARNING: directory " real-dir " does not exist") (debug:print 0 *default-log-port* "WARNING: no real directory corrosponding to link " run-dir ", nothing done"))) @@ -1861,14 +2015,16 @@ (not (member run-dir (list "n/a" "/tmp/badname")))) (debug:print 0 *default-log-port* "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink") (debug:print 0 *default-log-port* "NOTE: the run dir for this test is undefined. Test may have already been deleted.")) )) ;; Only delete the records *after* removing the directory. If things fail we have a record - (case mode - ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "NOT_STARTED" "n/a" #f)) + (case clean-mode + ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) (db:test-get-state test)(db:test-get-status test) #f)) ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVED" #f #f)) - (else (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test)))))) + (else (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test)))) + ;; (rmt:no-sync-del! lock-key) + )) ;;====================================================================== ;; Routines for manipulating runs ;;====================================================================== @@ -1888,14 +2044,17 @@ (let (;; (db #f) (keys #f)) (if (launch:setup) (begin (full-runconfigs-read) ;; cache the run config - (launch:cache-config)) ;; do not cache here - need to be sure runconfigs is processed + ;; (launch:cache-config) ;; there are two independent config cache locations, turning this one off for now. MRW. + ) ;; do not cache here - need to be sure runconfigs is processed (begin (debug:print 0 *default-log-port* "Failed to setup, exiting") (exit 1))) + + (set! keys (keys:config-get-fields *configdat*)) ;; have enough to process -target or -reqtarg here (if (args:get-arg "-reqtarg") (let* ((runconfigf (conc *toppath* "/runconfigs.config")) ;; DO NOT EVALUATE ALL (runconfig (read-config runconfigf #f #t environ-patt: #f))) @@ -1968,11 +2127,11 @@ (let* ((tagdata (rmt:get-tests-tags)) (res '())) ;; list of tests that match one or more tags (for-each (lambda (tag) (if (patt-list-match tag tagpatt) - (set! res (append (hash-table-ref tagdata tag))))) + (set! res (append (hash-table-ref tagdata tag) res)))) (hash-table-keys tagdata)) res)) ;; Update test_meta for all tests @@ -1989,11 +2148,11 @@ ;; (define (runs:rollup-run keys runname user keyvals) (debug:print 4 *default-log-port* "runs:rollup-run, keys: " keys " -runname " runname " user: " user) (let* ((db #f) ;; register run operates on the main db - (new-run-id (rmt:register-run keyvals runname "new" "n/a" user)) + (new-run-id (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour"))) (prev-tests (rmt:get-matching-previous-test-run-records new-run-id "%" "%")) (curr-tests (mt:get-tests-for-run new-run-id "%/%" '() '())) (curr-tests-hash (make-hash-table))) (rmt:update-run-event_time new-run-id) ;; index the already saved tests by testname and itemdat in curr-tests-hash @@ -2041,5 +2200,28 @@ (db:test-get-id testdat)))) )) prev-tests))) +;; clean cache files +(define (runs:clean-cache target runname toppath) + (if target + (if runname + (let* ((linktree (common:get-linktree)) ;; (if toppath (configf:lookup *configdat* "setup" "linktree"))) + (runtop (conc linktree "/" target "/" runname)) + (files (if (common:file-exists? runtop) + (append (glob (conc runtop "/.megatest*")) + (glob (conc runtop "/.runconfig*"))) + '()))) + (if (null? files) + (debug:print-info 0 *default-log-port* "No cached megatest or runconfigs files found. None removed.") + (begin + (debug:print-info 0 *default-log-port* "Removing cached files:\n " (string-intersperse files "\n ")) + (for-each + (lambda (f) + (handle-exceptions + exn + (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f) + (delete-file f))) + files)))) + (debug:print-error 0 *default-log-port* "-clean-cache requires -runname.")) + (debug:print-error 0 *default-log-port* "-clean-cache requires -target or -reqtarg")))