@@ -19,10 +19,11 @@ (declare (unit launch)) (declare (uses common)) (declare (uses configf)) (declare (uses db)) +(declare (uses tdb)) (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") @@ -91,11 +92,11 @@ (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name)) ;; Setup the *runremote* global var (if *runremote* (debug:print 2 "ERROR: I'm not expecting *runremote* to be set at this time")) ;; (set! *runremote* runremote) ;; (set! *transport-type* (string->symbol transport)) - (set! keys (cdb:remote-run db:get-keys #f)) + (set! keys (rmt:get-keys)) (set! keyvals (keys:target->keyval keys target)) ;; apply pre-overrides before other variables. The pre-override vars must not ;; clobbers things from the official sources such as megatest.config and runconfigs.config (if (string? set-vars) (let ((varpairs (string-split set-vars ","))) @@ -106,17 +107,27 @@ (let ((var (car varval)) (val (cadr varval))) (debug:print 1 "Adding pre-var/val " var " = " val " to the environment") (setenv var val))))) varpairs))) - (setenv "MT_TEST_RUN_DIR" work-area) - (setenv "MT_TEST_NAME" test-name) - (setenv "MT_ITEM_INFO" (conc itemdat)) - (setenv "MT_RUNNAME" runname) - (setenv "MT_MEGATEST" megatest) - (setenv "MT_TARGET" target) - (setenv "MT_LINKTREE" (configf:lookup *configdat* "setup" "linktree")) + (for-each + (lambda (varval) + (let ((var (car varval)) + (val (cadr varval))) + (if val + (setenv var val) + (begin + (debug:print 0 "ERROR: required variable " var " does not have a valid value. Exiting") + (exit))))) + (list + (list "MT_TEST_RUN_DIR" work-area) + (list "MT_TEST_NAME" test-name) + (list "MT_ITEM_INFO" (conc itemdat)) + (list "MT_RUNNAME" runname) + (list "MT_MEGATEST" megatest) + (list "MT_TARGET" target) + (list "MT_LINKTREE" (configf:lookup *configdat* "setup" "linktree")))) (if mt-bindir-path (setenv "PATH" (conc (getenv "PATH") ":" mt-bindir-path))) ;; (change-directory top-path) (if (not (setup-for-run)) (begin (debug:print 0 "Failed to setup, exiting") @@ -125,21 +136,21 @@ (exit 1))) ;; Can setup as client for server mode now ;; (client:setup) (change-directory *toppath*) - (set-megatest-env-vars run-id) ;; these may be needed by the launching process + (set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process (change-directory work-area) (set-run-config-vars run-id keyvals target) ;; (db:get-target db run-id)) ;; environment overrides are done *before* the remaining critical envars. (alist->env-vars env-ovrd) - (set-megatest-env-vars run-id) + (set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) (set-item-env-vars itemdat) (save-environment-as-files "megatest") ;; open-run-close not needed for test-set-meta-info - (tests:set-full-meta-info #f test-id run-id 0 work-area) + (tests:set-full-meta-info test-id run-id 0 work-area) ;; (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f) (tests:test-force-state-status! test-id "REMOTEHOSTSTART" "n/a") (thread-sleep! 0.3) ;; NFS slowness has caused grief here @@ -225,12 +236,11 @@ ;; call the command using mt_ezstep (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "-") " " stepcmd)) (debug:print 4 "script: " script) - ;; DO NOT remote - (db:teststep-set-status! #f test-id stepname "start" "-" #f #f work-area: work-area) + (rmt:teststep-set-status! test-id stepname "start" "-" #f #f) ;; now launch (let ((pid (process-run script))) (let processloop ((i 0)) (let-values (((pid-val exit-status exit-code)(process-wait pid #t))) (mutex-lock! m) @@ -243,14 +253,13 @@ (thread-sleep! 2) (processloop (+ i 1)))) )) (let ((exinfo (vector-ref exit-info 2)) (logfna (if logpro-used (conc stepname ".html") ""))) - ;; testing if procedures called in a remote call cause problems (ans: no or so I suspect) - (db:teststep-set-status! #f test-id stepname "end" exinfo #f logfna work-area: work-area)) + (rmt:teststep-set-status! test-id stepname "end" exinfo #f logfna)) (if logpro-used - (cdb:test-set-log! *runremote* test-id (conc stepname ".html"))) + (rmt:test-set-log! test-id (conc stepname ".html"))) ;; set the test final status (let* ((this-step-status (cond ((and (eq? (vector-ref exit-info 2) 2) logpro-used) 'warn) ((eq? (vector-ref exit-info 2) 0) 'pass) (else 'fail))) @@ -261,15 +270,15 @@ (next-status (cond ((eq? overall-status 'pass) this-step-status) ((eq? overall-status 'warn) (if (eq? this-step-status 'fail) 'fail 'warn)) (else 'fail))) - (next-state "RUNNING") - ;; (cond - ;; ((null? tal) ;; more to run? - ;; "COMPLETED") - ;; (else "RUNNING")) + (next-state ;; "RUNNING") ;; WHY WAS THIS CHANGED TO NOT USE (null? tal) ?? + (cond + ((null? tal) ;; more to run? + "COMPLETED") + (else "RUNNING"))) ) (debug:print 4 "Exit value received: " (vector-ref exit-info 2) " logpro-used: " logpro-used " this-step-status: " this-step-status " overall-status: " overall-status " next-status: " next-status " rollup-status: " rollup-status) (case next-status @@ -280,12 +289,12 @@ (if (eq? this-step-status 'warn) "Logpro warning found" #f) #f)) ((pass) (tests:test-set-status! test-id next-state "PASS" #f #f)) (else ;; 'fail - (set! rollup-status 1) ;; force fail - (tests:test-set-status! test-id next-state "FAIL" (conc "Failed at step " stepname) #f) + (set! rollup-status 1) ;; force fail, this used to be next-state but that doesn't make sense. should always be "COMPLETED" + (tests:test-set-status! test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f) )))) (if (and (steprun-good? logpro-used (vector-ref exit-info 2)) (not (null? tal))) (loop (car tal) (cdr tal) stepname))) (debug:print 4 "WARNING: a prior step failed, stopping at " ezstep)))))))) @@ -296,11 +305,11 @@ (round (- (current-seconds) start-seconds))))) (kill-tries 0)) - (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) + (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (let loop ((minutes (calc-minutes))) (begin (set! kill-job? (or (test-get-kill-request test-id) ;; run-id test-name itemdat)) (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) (time-exceeded (> run-seconds runtlim))) @@ -308,11 +317,11 @@ (begin (debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim) #t) #f))))) ;; open-run-close not needed for test-set-meta-info - (tests:set-partial-meta-info #f test-id run-id minutes work-area) + (tests:set-partial-meta-info test-id run-id minutes work-area) (if kill-job? (begin (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second @@ -337,34 +346,33 @@ ;; (system (conc "kill -9 -" pid)))) (begin (debug:print 0 "WARNING: Request received to kill job but problem with process, attempting to kill manager process") (tests:test-set-status! test-id "KILLED" "FAIL" (args:get-arg "-m") #f) - (sqlite3:finalize! tdb) - (exit 1)))) + (exit 1) ;; IS THIS NECESSARY OR WISE??? + ))) (set! kill-tries (+ 1 kill-tries)) (mutex-unlock! m))) - ;; (sqlite3:finalize! db) (if keep-going (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if keep-going - (loop (calc-minutes)))))))))) ;; NOTE: Checking twice for keep-going is intentional + (loop (calc-minutes))))))) + (tests:update-central-meta-info test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f)))) ;; NOTE: Checking twice for keep-going is intentional (th1 (make-thread monitorjob "monitor job")) (th2 (make-thread runit "run job"))) (set! job-thread th2) (thread-start! th1) (thread-start! th2) (thread-join! th2) (set! keep-going #f) (thread-join! th1) - ;; (thread-sleep! 1) - ;; (thread-terminate! th1) ;; Not sure if this is a good idea (thread-sleep! 1) ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. (mutex-lock! m) (let* ((item-path (item-list->path itemdat)) - (testinfo (cdb:get-test-info-by-id *runremote* test-id))) ;; )) ;; run-id test-name item-path))) + ;; only state and status needed - use lazy routine + (testinfo (rmt:get-testinfo-state-status test-id))) ;;;(cdb:get-test-info-by-id *runremote* test-id))) ;; )) ;; run-id test-name item-path))) ;; Am I completed? (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should *not* be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED")) (let ((new-state (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status ;; "COMPLETED" ;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test @@ -391,11 +399,11 @@ ;; (thread-sleep! 0.1) ;; give other processes an opportunity to access the db as rollup is lower priority ;; (cdb:roll-up-pass-fail-counts *runremote* run-id test-name item-path new-status))) )) ;; for automated creation of the rollup html file this is a good place... (if (not (equal? item-path "")) - (tests:summarize-items #f run-id test-id test-name #f))) ;; don't force - just update if no + (tests:summarize-items run-id test-id test-name #f))) ;; don't force - just update if no (mutex-unlock! m) (debug:print 2 "Output from running " fullrunscript ", pid " (vector-ref exit-info 0) " in work area " work-area ":\n====\n exit code " (vector-ref exit-info 2) "\n" "====\n") (if (not (vector-ref exit-info 1)) (exit 4))))))) @@ -414,11 +422,16 @@ given-toppath: (get-environment-variable "MT_RUN_AREA_HOME") pathenvvar: "MT_RUN_AREA_HOME")) (set! *configdat* (if (car *configinfo*)(car *configinfo*) #f)) (set! *toppath* (if (car *configinfo*)(cadr *configinfo*) #f)) (if *toppath* - (setenv "MT_RUN_AREA_HOME" *toppath*) ;; to be deprecated + (let ((dbdir (conc *toppath* "/db"))) + (handle-exceptions + exn + (debug:print 0 "ERROR: failed to create the " dbdir " area for your database files") + (if (not (directory-exists? dbdir))(create-directory dbdir))) + (setenv "MT_RUN_AREA_HOME" *toppath*)) (debug:print 0 "ERROR: failed to find the top path to your Megatest area.")))) *toppath*) (define (get-best-disk confdat) (let* ((disks (hash-table-ref/default confdat "disks" #f)) @@ -485,11 +498,11 @@ (lnkbase (conc linktree "/" target "/" runname)) (lnkpath (conc lnkbase "/" testname)) (lnkpathf (conc lnkpath (if not-iterated "" "/") item-path))) ;; Update the rundir path in the test record for all - (cdb:test-set-rundir-by-test-id *runremote* test-id lnkpathf) + (rmt:general-call 'test-set-rundir-by-test-id lnkpathf test-id) (debug:print 2 "INFO:\n lnkbase=" lnkbase "\n lnkpath=" lnkpath "\n toptest-path=" toptest-path "\n test-path=" test-path) (if (not (file-exists? linktree)) (begin (debug:print 0 "WARNING: linktree did not exist! Creating it now at " linktree) @@ -505,15 +518,15 @@ ;; again. ;; NB - This is not working right - some top tests are not getting the path set!!! (if (not (hash-table-ref/default *toptest-paths* testname #f)) - (let* ((testinfo (cdb:get-test-info-by-id *runremote* test-id)) ;; run-id testname item-path)) + (let* ((testinfo (rmt:get-test-info-by-id test-id)) ;; run-id testname item-path)) (curr-test-path (if testinfo (db:test-get-rundir testinfo) #f))) (hash-table-set! *toptest-paths* testname curr-test-path) ;; NB// Was this for the test or for the parent in an iterated test? - (cdb:test-set-rundir! *runremote* run-id testname "" lnkpath) ;; toptest-path) + (rmt:general-call 'test-set-rundir lnkpath run-id testname "") ;; toptest-path) (if (or (not curr-test-path) (not (directory-exists? toptest-path))) (begin (debug:print-info 2 "Creating " toptest-path " and link " lnkpath) (create-directory toptest-path #t) @@ -568,24 +581,15 @@ ;; If there is already a symlink delete it and recreate it. (handle-exceptions exn (begin - (debug:print 0 "ERROR: Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting") + (debug:print 0 "ERROR: Failed to re-create link " linktarget ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit)) (if (symbolic-link? lnktarget) (delete-file lnktarget)) (if (not (file-exists? lnktarget)) (create-symbolic-link test-path lnktarget))))) - ;; I suspect this section was deleting test directories under some - ;; wierd sitations? This doesn't make sense - reenabling the rm -f - ;; I honestly don't remember *why* this chunk was needed... - ;; (let ((testlink (conc lnkpath "/" testname))) - ;; (if (and (file-exists? testlink) - ;; (or (regular-file? testlink) - ;; (symbolic-link? testlink))) - ;; (system (conc "rm -f " testlink))) - ;; (system (conc "ln -sf " test-path " " testlink))) (if (directory? test-path) (begin (let* ((ovrcmd (let ((cmd (config-lookup *configdat* "setup" "testcopycmd"))) (if cmd ;; substitute the TEST_SRC_PATH and TEST_TARG_PATH @@ -654,11 +658,11 @@ (cmdparms #f) (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) (mt-bindir-path #f) (item-path (item-list->path itemdat)) ;; (test-id (cdb:remote-run db:get-test-id #f run-id test-name item-path)) - (testinfo (cdb:get-test-info-by-id *runremote* test-id)) + (testinfo (rmt:get-test-info-by-id test-id)) (mt_target (string-intersperse (map cadr keyvals) "/")) (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) (if (args:get-arg "-logging")(list "-logging") '())))) (setenv "MT_ITEMPATH" item-path) (if hosts (set! hosts (string-split hosts)))