@@ -11,12 +11,12 @@ ;;====================================================================== ;; launch a task - this runs on the originating host, tests themselves ;; ;;====================================================================== -(use regex regex-case base64 sqlite3 srfi-18 directory-utils posix-extras z3 call-with-environment-variables) -(use defstruct) +(use regex regex-case base64 sqlite3 srfi-18 directory-utils posix-extras z3 call-with-environment-variables csv) +(use defstruct pathname-expand) (import (prefix base64 base64:)) (import (prefix sqlite3 sqlite3:)) (declare (unit launch)) @@ -55,10 +55,29 @@ (common:read-encoded-string enccmd) '()))) ;; 0 1 2 3 (defstruct launch:einf (pid #t)(exit-status #t)(exit-code #t)(rollup-status 0)) + +;; return (conc status ": " comment) from the final section so that +;; the comment can be set in the step record in launch.scm +;; +(define (launch:load-logpro-dat run-id test-id stepname) + (let ((cname (conc stepname ".dat"))) + (if (file-exists? cname) + (let* ((dat (read-config cname #f #f)) + (csvr (db:logpro-dat->csv dat stepname)) + (csvt (let-values (( (fmt-cell fmt-record fmt-csv) (make-format ","))) + (fmt-csv (map list->csv-record csvr)))) + (status (configf:lookup dat "final" "exit-status")) + (msg (configf:lookup dat "final" "message"))) + (rmt:csv->test-data run-id test-id csvt) + (cond + ((equal? status "PASS") "PASS") ;; skip the message part if status is pass + (status (conc (configf:lookup dat "final" "exit-status") ": " (if msg msg "no message"))) + (else #f))) + #f))) (define (launch:runstep ezstep run-id test-id exit-info m tal testconfig) (let* ((stepname (car ezstep)) ;; do stuff to run the step (stepinfo (cadr ezstep)) (stepparts (string-match (regexp "^(\\{([^\\}]*)\\}\\s*|)(.*)$") stepinfo)) @@ -65,10 +84,11 @@ (stepparms (list-ref stepparts 2)) ;; for future use, {VAR=1,2,3}, run step for each (stepcmd (list-ref stepparts 3)) (script "") ; "#!/bin/bash\n") ;; yep, we depend on bin/bash FIXME!!!\ (logpro-file (conc stepname ".logpro")) (html-file (conc stepname ".html")) + (dat-file (conc stepname ".dat")) (tconfig-logpro (configf:lookup testconfig "logpro" stepname)) (logpro-used (file-exists? logpro-file))) (if (and tconfig-logpro (not logpro-used)) ;; no logpro file found but have a defn in the testconfig @@ -79,11 +99,11 @@ ";;") (print tconfig-logpro))) (set! logpro-used #t))) ;; NB// can safely assume we are in test-area directory - (debug:print 4 "ezsteps:\n stepname: " stepname " stepinfo: " stepinfo " stepparts: " stepparts + (debug:print 4 *default-log-port* "ezsteps:\n stepname: " stepname " stepinfo: " stepinfo " stepparts: " stepparts " stepparms: " stepparms " stepcmd: " stepcmd) ;; ;; first source the previous environment ;; (let ((prev-env (conc ".ezsteps/" prevstep (if (string-search (regexp "csh") ;; (get-environment-variable "SHELL")) ".csh" ".sh")))) @@ -91,11 +111,11 @@ ;; (set! script (conc script "source " prev-env)))) ;; call the command using mt_ezstep ;; (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "x") " " stepcmd)) - (debug:print 4 "script: " script) + (debug:print 4 *default-log-port* "script: " script) (rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f) ;; now launch the actual process (call-with-environment-variables (list (cons "PATH" (conc (get-environment-variable "PATH") ":."))) (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1") @@ -112,11 +132,11 @@ (if (eq? pid-val 0) (begin (thread-sleep! 2) (processloop (+ i 1)))) ))))) - (debug:print-info 0 "step " stepname " completed with exit code " (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2)) + (debug:print-info 0 *default-log-port* "step " stepname " completed with exit code " (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2)) ;; now run logpro if needed (if logpro-used (let ((pid (process-run (conc "logpro " logpro-file " " (conc stepname ".html") " < " stepname ".log")))) (let processloop ((i 0)) (let-values (((pid-val exit-status exit-code)(process-wait pid #t))) @@ -128,25 +148,31 @@ (mutex-unlock! m) (if (eq? pid-val 0) (begin (thread-sleep! 2) (processloop (+ i 1))))) - (debug:print-info 0 "logpro for step " stepname " exited with code " (launch:einf-exit-code exit-info))))) ;; (vector-ref exit-info 2))))) + (debug:print-info 0 *default-log-port* "logpro for step " stepname " exited with code " (launch:einf-exit-code exit-info))))) ;; (vector-ref exit-info 2))))) (let ((exinfo (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2)) - (logfna (if logpro-used (conc stepname ".html") ""))) - (rmt:teststep-set-status! run-id test-id stepname "end" exinfo #f logfna)) - (if logpro-used - (rmt:test-set-log! run-id test-id (conc stepname ".html"))) + (logfna (if logpro-used (conc stepname ".html") "")) + (comment #f)) + (if logpro-used + (let ((datfile (conc stepname ".dat"))) + ;; load the .dat file into the test_data table if it exists + (if (file-exists? datfile) + (set! comment (launch:load-logpro-dat run-id test-id stepname))) + (rmt:test-set-log! run-id test-id (conc stepname ".html")))) + (rmt:teststep-set-status! run-id test-id stepname "end" exinfo comment logfna)) ;; set the test final status (let* ((process-exit-status (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2)) (this-step-status (cond - ((and (eq? process-exit-status 2) logpro-used) 'warn) ;; logpro 2 = warnings - ((and (eq? process-exit-status 3) logpro-used) 'check) ;; logpro 3 = check - ((and (eq? process-exit-status 4) logpro-used) 'waived) ;; logpro 4 = abort - ((and (eq? process-exit-status 5) logpro-used) 'abort) ;; logpro 4 = abort - ((eq? process-exit-status 0) 'pass) ;; logpro 0 = pass + ((and (eq? process-exit-status 2) logpro-used) 'warn) ;; logpro 2 = warnings + ((and (eq? process-exit-status 3) logpro-used) 'check) ;; logpro 3 = check + ((and (eq? process-exit-status 4) logpro-used) 'waived) ;; logpro 4 = waived + ((and (eq? process-exit-status 5) logpro-used) 'abort) ;; logpro 5 = abort + ((and (eq? process-exit-status 6) logpro-used) 'skip) ;; logpro 6 = skip + ((eq? process-exit-status 0) 'pass) ;; logpro 0 = pass (else 'fail))) (overall-status (cond ((eq? (launch:einf-rollup-status exit-info) 2) 'warn) ;; rollup-status (vector-ref exit-info 3) ((eq? (launch:einf-rollup-status exit-info) 0) 'pass) ;; (vector-ref exit-info 3) (else 'fail))) @@ -158,13 +184,12 @@ (else 'fail))) (next-state ;; "RUNNING") ;; WHY WAS THIS CHANGED TO NOT USE (null? tal) ?? (cond ((null? tal) ;; more to run? "COMPLETED") - (else "RUNNING"))) - ) - (debug:print 4 "Exit value received: " (launch:einf-exit-code exit-info) " logpro-used: " logpro-used + (else "RUNNING")))) + (debug:print 4 *default-log-port* "Exit value received: " (launch:einf-exit-code exit-info) " logpro-used: " logpro-used " this-step-status: " this-step-status " overall-status: " overall-status " next-status: " next-status " rollup-status: " (launch:einf-rollup-status exit-info)) ;; (vector-ref exit-info 3)) (case next-status ((warn) (launch:einf-rollup-status-set! exit-info 2) ;; (vector-set! exit-info 3 2) ;; rollup-status @@ -176,27 +201,195 @@ (launch:einf-rollup-status-set! exit-info 3) ;; (vector-set! exit-info 3 3) ;; rollup-status ;; NB// test-set-status! does rdb calls under the hood (tests:test-set-status! run-id test-id next-state "CHECK" (if (eq? this-step-status 'check) "Logpro check found" #f) #f)) + ((waived) + (launch:einf-rollup-status-set! exit-info 4) ;; (vector-set! exit-info 3 3) ;; rollup-status + ;; NB// test-set-status! does rdb calls under the hood + (tests:test-set-status! run-id test-id next-state "WAIVED" + (if (eq? this-step-status 'check) "Logpro waived found" #f) + #f)) ((abort) - (launch:einf-rollup-status-set! exit-info 4) ;; (vector-set! exit-info 3 4) ;; rollup-status + (launch:einf-rollup-status-set! exit-info 5) ;; (vector-set! exit-info 3 4) ;; rollup-status ;; NB// test-set-status! does rdb calls under the hood (tests:test-set-status! run-id test-id next-state "ABORT" (if (eq? this-step-status 'abort) "Logpro abort found" #f) #f)) + ((skip) + (launch:einf-rollup-status-set! exit-info 6) ;; (vector-set! exit-info 3 4) ;; rollup-status + ;; NB// test-set-status! does rdb calls under the hood + (tests:test-set-status! run-id test-id next-state "SKIP" + (if (eq? this-step-status 'skip) "Logpro skip found" #f) + #f)) ((pass) (tests:test-set-status! run-id test-id next-state "PASS" #f #f)) (else ;; 'fail (launch:einf-rollup-status-set! exit-info 1) ;; (vector-set! exit-info 3 1) ;; force fail, this used to be next-state but that doesn't make sense. should always be "COMPLETED" (tests:test-set-status! run-id test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f) ))) logpro-used)) +(define (launch:manage-steps run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m) + ;; (let-values + ;; (((pid exit-status exit-code) + ;; (run-n-wait fullrunscript))) + ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) + ;; Since we should have a clean slate at this time there is no need to do + ;; any of the other stuff that tests:test-set-status! does. Let's just + ;; force RUNNING/n/a + + ;; (thread-sleep! 0.3) + (tests:test-force-state-status! run-id test-id "RUNNING" "n/a") + (rmt:roll-up-pass-fail-counts run-id test-name item-path #f "RUNNING") + ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here + + ;; if there is a runscript do it first + (if fullrunscript + (let ((pid (process-run fullrunscript))) + (rmt:test-set-top-process-pid run-id test-id pid) + (let loop ((i 0)) + (let-values + (((pid-val exit-status exit-code) (process-wait pid #t))) + (mutex-lock! m) + (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) + (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) + (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code) + (launch:einf-rollup-status-set! exit-info exit-code) ;; (vector-set! exit-info 3 exit-code) ;; rollup status + (mutex-unlock! m) + (if (eq? pid-val 0) + (begin + (thread-sleep! 2) + (loop (+ i 1))) + ))))) + ;; then, if runscript ran ok (or did not get called) + ;; do all the ezsteps (if any) + (if ezsteps + (let* ((testconfig ;; (read-config (conc work-area "/testconfig") #f #t environ-patt: "pre-launch-env-vars")) ;; FIXME??? is allow-system ok here? + ;; NOTE: it is tempting to turn off force-create of testconfig but dynamic + ;; ezstep names need a full re-eval here. + (tests:get-testconfig test-name tconfigreg #t force-create: #t)) ;; 'return-procs))) + (ezstepslst (if (hash-table? testconfig) + (hash-table-ref/default testconfig "ezsteps" '()) + #f))) + (if testconfig + (hash-table-set! *testconfigs* test-name testconfig) ;; cached for lazy reads later ... + (begin + (launch:setup) + (debug:print 0 *default-log-port* "WARNING: no testconfig found for " test-name " in search path:\n " + (string-intersperse (tests:get-tests-search-path *configdat*) "\n ")))) + ;; after all that, still no testconfig? Time to abort + (if (not testconfig) + (begin + (debug:print-error 0 *default-log-port* "Failed to resolve megatest.config, runconfigs.config and testconfig issues. Giving up now") + (exit 1))) + (if (not (file-exists? ".ezsteps"))(create-directory ".ezsteps")) + ;; if ezsteps was defined then we are sure to have at least one step but check anyway + (if (not (> (length ezstepslst) 0)) + (debug:print-error 0 *default-log-port* "ezsteps defined but ezstepslst is zero length") + (let loop ((ezstep (car ezstepslst)) + (tal (cdr ezstepslst)) + (prevstep #f)) + ;; check exit-info (vector-ref exit-info 1) + (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) + (let ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig)) + (stepname (car ezstep))) + ;; if logpro-used read in the stepname.dat file + (if (and logpro-used (file-exists? (conc stepname ".dat"))) + (launch:load-logpro-dat run-id test-id stepname)) + (if (steprun-good? logpro-used (launch:einf-exit-code exit-info)) + (if (not (null? tal)) + (loop (car tal) (cdr tal) stepname)) + (debug:print 4 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping"))) + (debug:print 4 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep))))))) + +(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) + (let* ((start-seconds (current-seconds)) + (calc-minutes (lambda () + (inexact->exact + (round + (- + (current-seconds) + start-seconds))))) + (kill-tries 0)) + ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) + ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) + (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) + (let loop ((minutes (calc-minutes)) + (cpu-load (get-cpu-load)) + (disk-free (get-df (current-directory)))) + (let ((new-cpu-load (let* ((load (get-cpu-load)) + (delta (abs (- load cpu-load)))) + (if (> delta 0.6) ;; don't bother updating with small changes + load + #f))) + (new-disk-free (let* ((df (get-df (current-directory))) + (delta (abs (- df disk-free)))) + (if (> delta 200) ;; ignore changes under 200 Meg + df + #f)))) + (set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)) + (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) + (time-exceeded (> run-seconds runtlim))) + (if time-exceeded + (begin + (debug:print-info 0 *default-log-port* "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim) + #t) + #f))))) + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) + (if kill-job? + (begin + (mutex-lock! m) + ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this + ;; section and the runit section? Or add a loop that tries three times with a 1/4 second + ;; between tries? + (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0)) + (pid2 (rmt:test-get-top-process-pid run-id test-id)) + (pids (delete-duplicates (filter number? (list pid1 pid2))))) + (if (not (null? pids)) + (begin + (for-each + (lambda (pid) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Unable to kill process with pid " pid ", possibly already killed.") + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) + (debug:print 0 *default-log-port* "WARNING: Request received to kill job " pid) ;; " (attempt # " kill-tries ")") + (debug:print-info 0 *default-log-port* "Signal mask=" (signal-mask)) + ;; (if (process:alive? pid) + ;; (begin + (map (lambda (pid-num) + (process-signal pid-num signal/term)) + (process:get-sub-pids pid)) + (thread-sleep! 5) + ;; (if (process:process-alive? pid) + (map (lambda (pid-num) + (handle-exceptions + exn + #f + (process-signal pid-num signal/kill))) + (process:get-sub-pids pid)))) + ;; (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive")))) + pids) + (tests:test-set-status! run-id test-id "KILLED" "KILLED" (args:get-arg "-m") #f)) + (begin + (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2) + (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (args:get-arg "-m") #f) + ))) + (mutex-unlock! m) + ;; no point in sticking around. Exit now. + (exit))) + (if (hash-table-ref/default misc-flags 'keep-going #f) + (begin + (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses + (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta + (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free))))))) + (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional + (define (launch:execute encoded-cmd) - - (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) + (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg (tests:get-all))) (setenv "MT_CMDINFO" encoded-cmd) (if (list? cmdinfo) ;; ((testpath /tmp/mrwellan/jazzmind/src/example_run/tests/sqlitespeed) ;; (test-name sqlitespeed) (runscript runscript.rb) (db-host localhost) (run-id 1)) (let* ((testpath (assoc/default 'testpath cmdinfo)) ;; testpath is the test spec area @@ -229,77 +422,81 @@ (let ((fulln (conc testpath "/" runscript))) (if (and (file-exists? fulln) (file-execute-access? fulln)) fulln runscript))))) ;; assume it is on the path - ;; (rollup-status 0) - ) + ) ;; (rollup-status 0) ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) (if (or (file-exists? top-path) (> count 10)) (change-directory top-path) (begin - (debug:print 0 "INFO: Not starting job yet - directory " top-path " not found") + (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " top-path " not found") (thread-sleep! 10) (loop (+ count 1))))) (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (if (eq? signum signal/stop) - (debug:print 0 "ERROR: attempt to STOP process. Exiting.")) + (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting.")) (set! *time-to-exit* #t) (print "Received signal " signum ", cleaning up before exit. Please wait...") (let ((th1 (make-thread (lambda () (tests:test-force-state-status! run-id test-id "INCOMPLETE" "KILLED") (print "Killed by signal " signum ". Exiting") (thread-sleep! 1) (exit 1)))) (th2 (make-thread (lambda () (thread-sleep! 2) - (debug:print 0 "Done") + (debug:print 0 *default-log-port* "Done") (exit 4))))) (thread-start! th2) (thread-start! th1) (thread-join! th2))))) (set-signal-handler! signal/int sighand) (set-signal-handler! signal/term sighand) - (set-signal-handler! signal/stop sighand)) + ) ;; (set-signal-handler! signal/stop sighand) - ;; (set-signal-handler! signal/int (lambda () - ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* ;; - (let ((test-info (rmt:get-testinfo-state-status run-id test-id))) + (let* ((test-info (rmt:get-test-info-by-id run-id test-id)) + (test-host (db:test-get-host test-info)) + (test-pid (db:test-get-process_id test-info))) (cond ((member (dbr:test-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "KILLREQ" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun - (debug:print 0 "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") + (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")) ;; prime it for running ((not (member (dbr:test-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) + ;; ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART")) + (if (process:alive-on-host? test-host test-pid) + (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed") + (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a"))) (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")) (else ;; (member (dbr:test-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) - (debug:print 0 "ERROR: test state is " (dbr:test-state test-info) ", cannot proceed") + ;; (debug:print 0 "ERROR: test state is " (dbr:test-state test-info) ", cannot proceed") + (debug:print-error 0 *default-log-port* "test state is " (db:test-state test-info) ", cannot proceed") (exit)))) - (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name)) + (debug:print 2 *default-log-port* "Exectuing " test-name " (id: " test-id ") on " (get-host-name)) (set! keys (rmt:get-keys)) ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process ;; one of these is defunct/redundant ... - (if (not (launch:setup-for-run force: #t)) + (if (not (launch:setup force: #t)) (begin - (debug:print 0 "Failed to setup, exiting") + (debug:print 0 *default-log-port* "Failed to setup, exiting") ;; (sqlite3:finalize! db) ;; (sqlite3:finalize! tdb) (exit 1))) (change-directory *toppath*) ;; NOTE: Current order is to process runconfigs *before* setting the MT_ vars. This ;; seems non-ideal but could well break stuff ;; BUG? BUG? BUG? - + (let ((rconfig (full-runconfigs-read))) ;; (read-config (conc *toppath* "/runconfigs.config") #f #t sections: (list "default" target)))) ;; (setup-env-defaults (conc *toppath* "/runconfigs.config") run-id (make-hash-table) keyvals target) ;; (set-run-config-vars run-id keyvals target) ;; (db:get-target db run-id)) ;; Now have runconfigs data loaded, set environment vars (for-each (lambda (section) @@ -307,47 +504,47 @@ (let ((var (car varval)) (val (cadr varval))) (if (and (string? var)(string? val)) (begin (setenv var (config:eval-string-in-environment val))) ;; val) - (debug:print 0 "ERROR: bad variable spec, " var "=" val)))) + (debug:print-error 0 *default-log-port* "bad variable spec, " var "=" val)))) (configf:get-section rconfig section))) (list "default" target))) ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) (if (or (file-exists? work-area) (> count 10)) (change-directory work-area) (begin - (debug:print 0 "INFO: Not starting job yet - directory " work-area " not found") + (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " work-area " not found") (thread-sleep! 10) (loop (+ count 1))))) ;; (change-directory work-area) (set! keyvals (keys:target->keyval keys target)) ;; apply pre-overrides before other variables. The pre-override vars must not ;; clobbers things from the official sources such as megatest.config and runconfigs.config (if (string? set-vars) (let ((varpairs (string-split set-vars ","))) - (debug:print 4 "varpairs: " varpairs) + (debug:print 4 *default-log-port* "varpairs: " varpairs) (map (lambda (varpair) (let ((varval (string-split varpair "="))) (if (eq? (length varval) 2) (let ((var (car varval)) (val (cadr varval))) - (debug:print 1 "Adding pre-var/val " var " = " val " to the environment") + (debug:print 1 *default-log-port* "Adding pre-var/val " var " = " val " to the environment") (setenv var val))))) varpairs))) (for-each (lambda (varval) (let ((var (car varval)) (val (cadr varval))) (if val (setenv var val) (begin - (debug:print 0 "ERROR: required variable " var " does not have a valid value. Exiting") + (debug:print-error 0 *default-log-port* "required variable " var " does not have a valid value. Exiting") (exit))))) (list (list "MT_TEST_RUN_DIR" work-area) (list "MT_TEST_NAME" test-name) (list "MT_ITEM_INFO" (conc itemdat)) @@ -372,11 +569,11 @@ ;; open-run-close not needed for test-set-meta-info ;; (tests:set-full-meta-info #f test-id run-id 0 work-area) ;; (tests:set-full-meta-info test-id run-id 0 work-area) (tests:set-full-meta-info #f test-id run-id 0 work-area 10) - (thread-sleep! 0.3) ;; NFS slowness has caused grief here + ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here (if (args:get-arg "-xterm") (set! fullrunscript "xterm") (if (and fullrunscript (file-exists? fullrunscript) @@ -385,163 +582,31 @@ ;; We are about to actually kick off the test ;; so this is a good place to remove the records for ;; any previous runs ;; (db:test-remove-steps db run-id testname itemdat) - + ;; (let* ((m (make-mutex)) (kill-job? #f) (exit-info (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status (job-thread #f) - (keep-going #t) + ;; (keep-going #t) + (misc-flags (let ((ht (make-hash-table))) + (hash-table-set! ht 'keep-going #t) + ht)) (runit (lambda () - ;; (let-values - ;; (((pid exit-status exit-code) - ;; (run-n-wait fullrunscript))) - ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) - ;; Since we should have a clean slate at this time there is no need to do - ;; any of the other stuff that tests:test-set-status! does. Let's just - ;; force RUNNING/n/a - - - ;; (thread-sleep! 0.3) - (tests:test-force-state-status! run-id test-id "RUNNING" "n/a") - (rmt:roll-up-pass-fail-counts run-id test-name item-path #f "RUNNING") - ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here - - ;; if there is a runscript do it first - (if fullrunscript - (let ((pid (process-run fullrunscript))) - (rmt:test-set-top-process-pid run-id test-id pid) - (let loop ((i 0)) - (let-values - (((pid-val exit-status exit-code) (process-wait pid #t))) - (mutex-lock! m) - (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) - (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) - (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code) - (launch:einf-rollup-status-set! exit-info exit-code) ;; (vector-set! exit-info 3 exit-code) ;; rollup status - (mutex-unlock! m) - (if (eq? pid-val 0) - (begin - (thread-sleep! 2) - (loop (+ i 1))) - ))))) - ;; then, if runscript ran ok (or did not get called) - ;; do all the ezsteps (if any) - (if ezsteps - (let* ((testconfig ;; (read-config (conc work-area "/testconfig") #f #t environ-patt: "pre-launch-env-vars")) ;; FIXME??? is allow-system ok here? - ;; NOTE: it is tempting to turn off force-create of testconfig but dynamic - ;; ezstep names need a full re-eval here. - (tests:get-testconfig test-name tconfigreg #t force-create: #t)) ;; 'return-procs))) - (ezstepslst (hash-table-ref/default testconfig "ezsteps" '()))) - (hash-table-set! *testconfigs* test-name testconfig) ;; cached for lazy reads later ... - (if (not (file-exists? ".ezsteps"))(create-directory ".ezsteps")) - ;; if ezsteps was defined then we are sure to have at least one step but check anyway - (if (not (> (length ezstepslst) 0)) - (debug:print 0 "ERROR: ezsteps defined but ezstepslst is zero length") - (let loop ((ezstep (car ezstepslst)) - (tal (cdr ezstepslst)) - (prevstep #f)) - ;; check exit-info (vector-ref exit-info 1) - (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) - (let ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig))) - (if (and (steprun-good? logpro-used (launch:einf-exit-code exit-info)) - (not (null? tal))) - (loop (car tal) (cdr tal) stepname))) - (debug:print 4 "WARNING: a prior step failed, stopping at " ezstep)))))))) + (launch:manage-steps run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m))) (monitorjob (lambda () - (let* ((start-seconds (current-seconds)) - (calc-minutes (lambda () - (inexact->exact - (round - (- - (current-seconds) - start-seconds))))) - (kill-tries 0)) - ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) - ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) - (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) - (let loop ((minutes (calc-minutes)) - (cpu-load (get-cpu-load)) - (disk-free (get-df (current-directory)))) - (let ((new-cpu-load (let* ((load (get-cpu-load)) - (delta (abs (- load cpu-load)))) - (if (> delta 0.6) ;; don't bother updating with small changes - load - #f))) - (new-disk-free (let* ((df (get-df (current-directory))) - (delta (abs (- df disk-free)))) - (if (> delta 200) ;; ignore changes under 200 Meg - df - #f)))) - (set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)) - (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) - (time-exceeded (> run-seconds runtlim))) - (if time-exceeded - (begin - (debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim) - #t) - #f))))) - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) - (if kill-job? - (begin - (mutex-lock! m) - ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this - ;; section and the runit section? Or add a loop that tries three times with a 1/4 second - ;; between tries? - (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0)) - (pid2 (rmt:test-get-top-process-pid run-id test-id)) - (pids (delete-duplicates (filter number? (list pid1 pid2))))) - (if (not (null? pids)) - (begin - (for-each - (lambda (pid) - (handle-exceptions - exn - (begin - (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.") - (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))) - (debug:print 0 "WARNING: Request received to kill job " pid) ;; " (attempt # " kill-tries ")") - (debug:print-info 0 "Signal mask=" (signal-mask)) - ;; (if (process:alive? pid) - ;; (begin - (map (lambda (pid-num) - (process-signal pid-num signal/term)) - (process:get-sub-pids pid)) - (thread-sleep! 5) - ;; (if (process:process-alive? pid) - (map (lambda (pid-num) - (handle-exceptions - exn - #f - (process-signal pid-num signal/kill))) - (process:get-sub-pids pid)))) - ;; (debug:print-info 0 "not killing process " pid " as it is not alive")))) - pids) - (tests:test-set-status! run-id test-id "KILLED" "KILLED" (args:get-arg "-m") #f)) - (begin - (debug:print 0 "ERROR: Nothing to kill, pid1=" pid1 ", pid2=" pid2) - (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (args:get-arg "-m") #f) - ))) - (mutex-unlock! m) - ;; no point in sticking around. Exit now. - (exit))) - (if keep-going - (begin - (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses - (if keep-going ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta - (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free))))))) - (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f)))) ;; NOTE: Checking twice for keep-going is intentional + (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags))) (th1 (make-thread monitorjob "monitor job")) (th2 (make-thread runit "run job"))) (set! job-thread th2) (thread-start! th1) (thread-start! th2) (thread-join! th2) - (debug:print-info 0 "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...") - (set! keep-going #f) + (debug:print-info 0 *default-log-port* "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...") + (hash-table-set! misc-flags 'keep-going #f) (thread-join! th1) (thread-sleep! 1) ;; givbe thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. (mutex-lock! m) (let* ((item-path (item-list->path itemdat)) ;; only state and status needed - use lazy routine @@ -559,12 +624,15 @@ (if (equal? (dbr:test-status testinfo) "AUTO") "AUTO" "PASS")) ((eq? (launch:einf-rollup-status exit-info) 1) "FAIL") ;; (vector-ref exit-info 3) ((eq? (launch:einf-rollup-status exit-info) 2) ;; (vector-ref exit-info 3) ;; if the current status is AUTO the defer to the calculated value but qualify (i.e. make this AUTO-WARN) (if (equal? (dbr:test-status testinfo) "AUTO") "AUTO-WARN" "WARN")) - (else "FAIL")))) ;; (dbr:test-status testinfo))) - (debug:print-info 1 "Test exited in state=" (dbr:test-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info)) + ((eq? (launch:einf-rollup-status exit-info) 3) "CHECK") + ((eq? (launch:einf-rollup-status exit-info) 4) "WAIVED") + ((eq? (launch:einf-rollup-status exit-info) 5) "ABORT") + ((eq? (launch:einf-rollup-status exit-info) 6) "SKIP")))) + (debug:print-info 1 *default-log-port* "Test exited in state=" (db:test-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info)) (tests:test-set-status! run-id test-id new-state new-status (args:get-arg "-m") #f) @@ -573,113 +641,196 @@ )) ;; for automated creation of the rollup html file this is a good place... (if (not (equal? item-path "")) (tests:summarize-items run-id test-id test-name #f)) (tests:summarize-test run-id test-id) ;; don't force - just update if no - ) + (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) (mutex-unlock! m) - (debug:print 2 "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " + (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n") (if (not (launch:einf-exit-status exit-info)) (exit 4))))))) -;; set up the very basics needed for doing anything here. -(define (launch:setup-for-run #!key (force #f)) - ;; would set values for KEYS in the environment here for better support of env-override but - ;; have chicken/egg scenario. need to read megatest.config then read it again. Going to - ;; pass on that idea for now - ;; special case - (if (or force (not (hash-table? *configdat*))) ;; no need to re-open on every call - (begin - (set! *configinfo* (or (if (get-environment-variable "MT_CMDINFO") ;; we are inside a test - do not reprocess configs - (let ((alistconfig (conc (get-environment-variable "MT_LINKTREE") "/" - (get-environment-variable "MT_TARGET") "/" - (get-environment-variable "MT_RUNNAME") "/" - ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) - (if (file-exists? alistconfig) - (list (configf:read-alist alistconfig) - (get-environment-variable "MT_RUN_AREA_HOME")) - #f)) - #f) ;; no config cached - give up - (let ((runname (or (args:get-arg "-runname")(args:get-arg ":runname")))) - (if runname (setenv "MT_RUNNAME" runname)) - (find-and-read-config - (if (args:get-arg "-config")(args:get-arg "-config") "megatest.config") - environ-patt: "env-override" - given-toppath: (get-environment-variable "MT_RUN_AREA_HOME") - pathenvvar: "MT_RUN_AREA_HOME")))) - (set! *configdat* (if (car *configinfo*)(car *configinfo*) #f)) - (set! *toppath* (if (car *configinfo*)(cadr *configinfo*) #f)) - (let* ((tmptransport (configf:lookup *configdat* "server" "transport")) - (transport (if tmptransport (string->symbol tmptransport) 'http))) - (if (member transport '(http rpc nmsg)) - (set! *transport-type* transport) - (begin - (debug:print 0 "ERROR: Unrecognised transport " transport) - (exit)))) - (let ((linktree (configf:lookup *configdat* "setup" "linktree"))) ;; link tree is critical - (if linktree - (if (not (file-exists? linktree)) - (begin - (handle-exceptions - exn - (begin - (debug:print 0 "ERROR: Something went wrong when trying to create linktree dir at " linktree) - (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) - (exit 1)) - (create-directory linktree #t)))) - (begin - (debug:print 0 "ERROR: linktree not defined in [setup] section of megatest.config") - (exit 1))) - (if linktree - (let ((dbdir (conc linktree "/.db"))) - (handle-exceptions - exn - (begin - (debug:print 0 "ERROR: failed to create the " dbdir " area for your database files") - (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))) - (if (not (directory-exists? dbdir))(create-directory dbdir))) - (setenv "MT_LINKTREE" linktree)) - (begin - (debug:print 0 "ERROR: linktree is required in your megatest.config [setup] section") - (exit 1))) - (if (and *toppath* - (directory-exists? *toppath*)) - (setenv "MT_RUN_AREA_HOME" *toppath*) - (begin - (debug:print 0 "ERROR: failed to find the top path to your Megatest area.") - (exit 1))) - ))) - *toppath*) - (define (launch:cache-config) ;; if we have a linktree and -runtests and -target and the directory exists dump the config ;; to megatest-(current-seconds).cfg and symlink it to megatest.cfg (if (and *configdat* (or (args:get-arg "-run") - (args:get-arg "-runtests"))) + (args:get-arg "-runtests") + (args:get-arg "-execute"))) (let* ((linktree (get-environment-variable "MT_LINKTREE")) (target (common:args-get-target)) (runname (or (args:get-arg "-runname") - (args:get-arg ":runname"))) + (args:get-arg ":runname") + (getenv "MT_RUNNAME"))) (fulldir (conc linktree "/" target "/" runname))) - (debug:print-info 0 "Have -runtests with target=" target ", runname=" runname ", fulldir=" fulldir ", testpatt=" (or (args:get-arg "-testpatt") "%")) - (if (file-exists? linktree) ;; can't proceed without linktree + (if (and linktree (file-exists? linktree)) ;; can't proceed without linktree (begin + (debug:print-info 0 *default-log-port* "Have -run with target=" target ", runname=" runname ", fulldir=" fulldir ", testpatt=" (or (args:get-arg "-testpatt") "%")) (if (not (file-exists? fulldir)) (create-directory fulldir #t)) ;; need to protect with exception handler (if (and target runname (file-exists? fulldir)) (let ((tmpfile (conc fulldir "/.megatest.cfg." (current-seconds))) - (targfile (conc fulldir "/.megatest.cfg-" megatest-version "-" megatest-fossil-hash))) - (debug:print-info 0 "Caching megatest.config in " fulldir "/.megatest.cfg") - (configf:write-alist *configdat* tmpfile) - (system (conc "ln -sf " tmpfile " " targfile)) - ))))))) + (targfile (conc fulldir "/.megatest.cfg-" megatest-version "-" megatest-fossil-hash)) + (rconfig (conc fulldir "/.runconfig." megatest-version "-" megatest-fossil-hash))) + (if (file-exists? rconfig) ;; only cache megatest.config AFTER runconfigs has been cached + (begin + (debug:print-info 0 *default-log-port* "Caching megatest.config in " tmpfile) + (configf:write-alist *configdat* tmpfile) + (system (conc "ln -sf " tmpfile " " targfile)))) + ))) + (debug:print-info 1 *default-log-port* "No linktree yet, no caching configs."))))) + + +;; gather available information, if legit read configs in this order: +;; +;; if have cache; +;; read it a return it +;; else +;; megatest.config (do not cache) +;; runconfigs.config (cache if all vars avail) +;; megatest.config (cache if all vars avail) +;; returns: +;; *toppath* +;; side effects: +;; sets; *configdat* (megatest.config info) +;; *runconfigdat* (runconfigs.config info) +;; *configstatus* (status of the read data) +;; +(define (launch:setup-new #!key (force #f)) + (let* ((toppath (or *toppath* (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath + (runname (common:args-get-runname)) + (target (common:args-get-target)) + (linktree (common:get-linktree)) + (sections (if target (list "default" target) #f)) ;; for runconfigs + (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config + (rundir (if (and runname target linktree)(conc linktree "/" target "/" runname) #f)) + (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) + (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) + (cancreate (and rundir (file-exists? rundir)(file-write-access? rundir)))) + ;; (print "runname: " runname " target: " target " mtcachef: " mtcachef " rccachef: " rccachef) + (set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource + (cond + ;; data was read and cached and available in *configstatus*, toppath has already been set + ((eq? *configstatus* 'fulldata) + *toppath*) + ;; if mtcachef exists just read it, however we need to assume toppath is available in $MT_RUN_AREA_HOME + ((and mtcachef (file-exists? mtcachef) (get-environment-variable "MT_RUN_AREA_HOME")) + (set! *configdat* (configf:read-alist mtcachef)) + (set! *runconfigdat* (configf:read-alist rccachef)) + (set! *configinfo* (list *configdat* (get-environment-variable "MT_RUN_AREA_HOME"))) + (set! *configstatus* 'fulldata) + (set! *toppath* (get-environment-variable "MT_RUN_AREA_HOME")) + *toppath*) + ;; we have all the info needed to fully process runconfigs and megatest.config + (mtcachef + (let* ((first-pass (find-and-read-config ;; NB// sets MT_RUN_AREA_HOME as side effect + mtconfig + environ-patt: "env-override" + given-toppath: toppath + pathenvvar: "MT_RUN_AREA_HOME")) + (first-rundat (let ((toppath (if toppath + toppath + (car first-pass)))) + (read-config ;; (conc toppath "/runconfigs.config") + (conc (if (string? toppath) + toppath + (get-environment-variable "MT_RUN_AREA_HOME")) + "/runconfigs.config") + *runconfigdat* #t + sections: sections)))) + (set! *runconfigdat* first-rundat) + (if first-pass ;; + (begin + (set! *configdat* (car first-pass)) + (set! *configinfo* first-pass) + (set! *toppath* (or toppath (cadr first-pass))) ;; use the gathered data unless already have it + (set! toppath *toppath*) + (if (not *toppath*) + (begin + (debug:print-error 0 *default-log-port* "you are not in a megatest area!") + (exit 1))) + (setenv "MT_RUN_AREA_HOME" *toppath*) + ;; the seed read is done, now read runconfigs, cache it then read megatest.config one more time and cache it + (let* ((keys (rmt:get-keys)) + (key-vals (keys:target->keyval keys target)) + (linktree (or (getenv "MT_LINKTREE") + (if *configdat* (configf:lookup *configdat* "setup" "linktree") #f))) + (second-pass (find-and-read-config + mtconfig + environ-patt: "env-override" + given-toppath: toppath + pathenvvar: "MT_RUN_AREA_HOME")) + (runconfigdat (begin ;; this read of the runconfigs will see any adjustments made by re-reading megatest.config + (for-each (lambda (kt) + (setenv (car kt) (cadr kt))) + key-vals) + (read-config (conc toppath "/runconfigs.config") *runconfigdat* #t + sections: sections)))) + (if cancreate (configf:write-alist runconfigdat rccachef)) + (set! *runconfigdat* runconfigdat) + (if cancreate (configf:write-alist *configdat* mtcachef)) + (if cancreate (set! *configstatus* 'fulldata)))) + ;; no configs found? should not happen but let's try to recover gracefully, return an empty hash-table + (set! *configdat* (make-hash-table)) + ))) + ;; else read what you can and set the flag accordingly + (else + (let* ((cfgdat (find-and-read-config + (or (args:get-arg "-config") "megatest.config") + environ-patt: "env-override" + given-toppath: (get-environment-variable "MT_RUN_AREA_HOME") + pathenvvar: "MT_RUN_AREA_HOME"))) + (if cfgdat + (let* ((toppath (or (get-environment-variable "MT_RUN_AREA_HOME")(cadr cfgdat))) + (rdat (read-config (conc toppath + "/runconfigs.config") *runconfigdat* #t sections: sections))) + (set! *configinfo* cfgdat) + (set! *configdat* (car cfgdat)) + (set! *runconfigdat* rdat) + (set! *toppath* toppath) + (set! *configstatus* 'partial)) + (begin + (debug:print-error 0 *default-log-port* "No " mtconfig " file found. Giving up.") + (exit 2)))))) + ;; additional house keeping + (let* ((linktree (or (getenv "MT_LINKTREE") + (if *configdat* (configf:lookup *configdat* "setup" "linktree") #f)))) + (if linktree + (begin + (if (not (file-exists? linktree)) + (begin + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* "Something went wrong when trying to create linktree dir at " linktree) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (exit 1)) + (create-directory linktree #t)))) + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* "Something went wrong when trying to create link to linktree at " *toppath*) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) + (let ((tlink (conc *toppath* "/lt"))) + (if (not (file-exists? tlink)) + (create-symbolic-link linktree tlink))))) + (begin + (debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config") + ))) + (if (and *toppath* + (directory-exists? *toppath*)) + (setenv "MT_RUN_AREA_HOME" *toppath*) + (begin + (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area."))) + *toppath*)) + +(define launch:setup launch:setup-new) (define (get-best-disk confdat testconfig) (let* ((disks (or (and testconfig (hash-table-ref/default testconfig "disks" #f)) (hash-table-ref/default confdat "disks" #f))) (minspace (let ((m (configf:lookup confdat "setup" "minspace"))) @@ -687,12 +838,12 @@ (if disks (let ((res (common:get-disk-with-most-free-space disks minspace))) ;; min size of 1000, seems tad dumb (if res (cdr res) (begin - (if (common:low-noise-print 20 "no valid disks") - (debug:print 0 "ERROR: No valid disks found in megatest.config. Please add some to your [disks] section and ensure the directory exists!")) + (if (common:low-noise-print 20 "No valid disks or no disk with enough space") + (debug:print-error 0 *default-log-port* "No valid disks found in megatest.config. Please add some to your [disks] section and ensure the directory exists and has enough space!\n You can change minspace in the [setup] section of megatest.config. Current setting is: " minspace)) (exit 1))))))) ;; Desired directory structure: ;; ;; - - -. @@ -739,22 +890,22 @@ ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical ;; rundir shortdir (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path) - (debug:print 2 "INFO:\n lnkbase=" lnkbase "\n lnkpath=" lnkpath "\n toptest-path=" toptest-path "\n test-path=" test-path) + (debug:print 2 *default-log-port* "INFO:\n lnkbase=" lnkbase "\n lnkpath=" lnkpath "\n toptest-path=" toptest-path "\n test-path=" test-path) (if (not (file-exists? linktree)) (begin - (debug:print 0 "WARNING: linktree did not exist! Creating it now at " linktree) + (debug:print 0 *default-log-port* "WARNING: linktree did not exist! Creating it now at " linktree) (create-directory linktree #t))) ;; (system (conc "mkdir -p " linktree)))) ;; create the directory for the tests dir links, this is needed no matter what... (if (and (not (directory-exists? lnkbase)) (not (file-exists? lnkbase))) (handle-exceptions exn (begin - (debug:print "ERROR: Problem creating linktree base at " lnkbase) + (debug:print-error 0 *default-log-port* "Problem creating linktree base at " lnkbase) (print-error-message exn (current-error-port))) (create-directory lnkbase #t))) ;; update the toptest record with its location rundir, cache the path ;; This wass highly inefficient, one db write for every subtest, potentially @@ -765,32 +916,32 @@ ;; if the test is iterated it is necessary to create the parent path ;; to the iteration. use pathname-directory to trim the path by one ;; level (if (not not-iterated) ;; i.e. iterated (let ((iterated-parent (pathname-directory (conc lnkpath "/" item-path)))) - (debug:print-info 2 "Creating iterated parent " iterated-parent) + (debug:print-info 2 *default-log-port* "Creating iterated parent " iterated-parent) (handle-exceptions exn (begin - (debug:print 0 "ERROR: Failed to create directory " iterated-parent ((condition-property-accessor 'exn 'message) exn) ", exiting") + (debug:print-error 0 *default-log-port* " Failed to create directory " iterated-parent ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit 1)) (create-directory iterated-parent #t)))) (if (symbolic-link? lnkpath) (handle-exceptions exn (begin - (debug:print 0 "ERROR: Failed to remove symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", exiting") + (debug:print-error 0 *default-log-port* " Failed to remove symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit 1)) (delete-file lnkpath))) (if (not (or (file-exists? lnkpath) (symbolic-link? lnkpath))) (handle-exceptions exn (begin - (debug:print 0 "ERROR: Failed to create symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", exiting") + (debug:print-error 0 *default-log-port* " Failed to create symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit 1)) (create-symbolic-link toptest-path lnkpath))) ;; NB - This was not working right - some top tests are not getting the path set!!! ;; @@ -807,18 +958,19 @@ #f))) (hash-table-set! *toptest-paths* testname curr-test-path) ;; NB// Was this for the test or for the parent in an iterated test? (rmt:general-call 'test-set-rundir-shortdir run-id lnkpath (if (file-exists? lnkpath) - (resolve-pathname lnkpath) + ;; (resolve-pathname lnkpath) + (common:nice-path lnkpath) lnkpath) testname "") ;; (rmt:general-call 'test-set-rundir run-id lnkpath testname "") ;; toptest-path) (if (or (not curr-test-path) (not (directory-exists? toptest-path))) (begin - (debug:print-info 2 "Creating " toptest-path " and link " lnkpath) + (debug:print-info 2 *default-log-port* "Creating " toptest-path " and link " lnkpath) (handle-exceptions exn #f ;; don't care to catch and deal with errors here for now. (create-directory toptest-path #t)) (hash-table-set! *toptest-paths* testname toptest-path))))) @@ -825,27 +977,27 @@ ;; The toptest path has been created, the link to the test in the linktree has ;; been created. Now, if this is an iterated test the real test dir must be created (if (not not-iterated) ;; this is an iterated test (begin ;; (let ((lnktarget (conc lnkpath "/" item-path))) - (debug:print 2 "Setting up sub test run area") - (debug:print 2 " - creating run area in " test-path) + (debug:print 2 *default-log-port* "Setting up sub test run area") + (debug:print 2 *default-log-port* " - creating run area in " test-path) (handle-exceptions exn (begin - (debug:print 0 "ERROR: Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ", exiting") + (debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit 1)) (create-directory test-path #t)) - (debug:print 2 + (debug:print 2 *default-log-port* " - creating link from: " test-path "\n" " to: " lnktarget) ;; If there is already a symlink delete it and recreate it. (handle-exceptions exn (begin - (debug:print 0 "ERROR: Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting") + (debug:print-error 0 *default-log-port* " Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit)) (if (symbolic-link? lnktarget) (delete-file lnktarget)) (if (not (file-exists? lnktarget)) (create-symbolic-link test-path lnktarget))))) (if (not (directory? test-path)) @@ -863,15 +1015,15 @@ ovrcmd (conc "rsync -av" (if (debug:debug-mode 1) "" "q") " " test-src-path "/ " test-path "/" " >> " test-path "/mt_launch.log 2>> " test-path "/mt_launch.log"))) (status (system cmd))) (if (not (eq? status 0)) - (debug:print 2 "ERROR: problem with running \"" cmd "\""))) + (debug:print 2 *default-log-port* "ERROR: problem with running \"" cmd "\""))) (list lnkpathf lnkpath )) (if (and test-src-path (> remtries 0)) (begin - (debug:print 0 "ERROR: Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) + (debug:print-error 0 *default-log-port* "Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) ;; (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1))) (list #f #f))))) ;; 1. look though disks list for disk with most space @@ -945,11 +1097,12 @@ (if launcher (set! launcher (string-split launcher))) ;; set up the run work area for this test (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (dbr:test-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin - (debug:print-info 0 "attempting to preclean directory " (dbr:test-rundir testinfo) " for test " test-name "/" item-path) + ;; (debug:print-info 0 "attempting to preclean directory " (dbr:test-rundir testinfo) " for test " test-name "/" item-path) + (debug:print-info 0 *default-log-port* "attempting to preclean directory " (dbr:test-rundir testinfo) " for test " test-name "/" item-path) (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record ;; prevent overlapping actions - set to LAUNCHED as early as possible ;; (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) @@ -957,15 +1110,15 @@ (set! diskpath (get-best-disk *configdat* tconfig)) (if diskpath (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) - (debug:print-info 2 "Using work area " work-area)) + (debug:print-info 2 *default-log-port* "Using work area " work-area)) (begin (set! work-area (conc test-path "/tmp_run")) (create-directory work-area #t) - (debug:print 0 "WARNING: No disk work area specified - running in the test directory under tmp_run"))) + (debug:print 0 *default-log-port* "WARNING: No disk work area specified - running in the test directory under tmp_run"))) (set! cmdparms (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) @@ -999,17 +1152,17 @@ ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms)))) (launcher (set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) ;; (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms)))) (else - (if (not useshell)(debug:print 0 "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section")) + (if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section")) (set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" "")))))) ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" ""))))) (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm")))) - (debug:print 1 "Launching " work-area) + (debug:print 1 *default-log-port* "Launching " work-area) ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done - (debug:print 4 "fullcmd: " fullcmd) + (debug:print 4 *default-log-port* "fullcmd: " fullcmd) (let* ((commonprevvals (alist->env-vars (hash-table-ref/default *configdat* "env-override" '()))) (testprevvals (alist->env-vars (hash-table-ref/default tconfig "pre-launch-env-overrides" '()))) (miscprevvals (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute" @@ -1022,11 +1175,11 @@ ) itemdat))) ;; Launchwait defaults to true, must override it to turn off wait (launchwait (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t)) (launch-results (apply (if launchwait - cmd-run-with-stderr->list + process:cmd-run-with-stderr->list process-run) (if useshell (let ((cmdstr (string-intersperse fullcmd " "))) (if launchwait cmdstr @@ -1042,12 +1195,12 @@ (print "LAUNCHCMD: " (string-intersperse fullcmd " ")) (if (list? launch-results) (apply print launch-results) (print "NOTE: launched \"" fullcmd "\"\n but did not wait for it to proceed. Add the following to megatest.config \n[setup]\nlaunchwait yes\n if you have problems with this")) #:append)) - (debug:print 2 "Launching completed, updating db") - (debug:print 2 "Launch results: " launch-results) + (debug:print 2 *default-log-port* "Launching completed, updating db") + (debug:print 2 *default-log-port* "Launch results: " launch-results) (if (not launch-results) (begin (print "ERROR: Failed to run " (string-intersperse fullcmd " ") ", exiting now") ;; (sqlite3:finalize! db) ;; good ole "exit" seems not to work