@@ -136,11 +136,15 @@ (set-megatest-env-vars run-id) (set-item-env-vars itemdat) (save-environment-as-files "megatest") ;; open-run-close not needed for test-set-meta-info (tests:set-full-meta-info #f test-id run-id 0 work-area) - (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f) + + ;; (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f) + (tests:test-force-state-status! test-id "REMOTEHOSTSTART" "n/a") + (thread-sleep! 0.3) ;; NFS slowness has caused grief here + (if (args:get-arg "-xterm") (set! fullrunscript "xterm") (if (and fullrunscript (not (file-execute-access? fullrunscript))) (system (conc "chmod ug+x " fullrunscript)))) ;; We are about to actually kick off the test @@ -155,11 +159,20 @@ (keep-going #t) (runit (lambda () ;; (let-values ;; (((pid exit-status exit-code) ;; (run-n-wait fullrunscript))) - (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) + ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) + ;; Since we should have a clean slate at this time there is no need to do + ;; any of the other stuff that tests:test-set-status! does. Let's just + ;; force RUNNING/n/a + + + (thread-sleep! 0.3) + (tests:test-force-state-status! test-id "RUNNING" "n/a") + (thread-sleep! 0.3) ;; NFS slowness has caused grief here + ;; if there is a runscript do it first (if fullrunscript (let ((pid (process-run fullrunscript))) (let loop ((i 0)) (let-values @@ -170,11 +183,11 @@ (vector-set! exit-info 2 exit-code) (set! rollup-status exit-code) (mutex-unlock! m) (if (eq? pid-val 0) (begin - (thread-sleep! 1) + (thread-sleep! 2) (loop (+ i 1))) ))))) ;; then, if runscript ran ok (or did not get called) ;; do all the ezsteps (if any) (if ezsteps @@ -224,11 +237,11 @@ (vector-set! exit-info 1 exit-status) (vector-set! exit-info 2 exit-code) (mutex-unlock! m) (if (eq? pid-val 0) (begin - (thread-sleep! 1) + (thread-sleep! 2) (processloop (+ i 1)))) )) (let ((exinfo (vector-ref exit-info 2)) (logfna (if logpro-used (conc stepname ".html") ""))) ;; testing if procedures called in a remote call cause problems (ans: no or so I suspect) @@ -330,11 +343,11 @@ (set! kill-tries (+ 1 kill-tries)) (mutex-unlock! m))) ;; (sqlite3:finalize! db) (if keep-going (begin - (thread-sleep! (+ 10 (random 10))) ;; add some jitter to the call home time to spread out the db accesses + (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if keep-going (loop (calc-minutes)))))))))) ;; NOTE: Checking twice for keep-going is intentional (th1 (make-thread monitorjob "monitor job")) (th2 (make-thread runit "run job"))) (set! job-thread th2) @@ -341,17 +354,17 @@ (thread-start! th1) (thread-start! th2) (thread-join! th2) (set! keep-going #f) (thread-sleep! 1) - (thread-terminate! th1) ;; Not sure if this is a good idea - (thread-sleep! 0.1) ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. + ;; (thread-terminate! th1) ;; Not sure if this is a good idea + (thread-sleep! 1) ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. (mutex-lock! m) (let* ((item-path (item-list->path itemdat)) (testinfo (cdb:get-test-info-by-id *runremote* test-id))) ;; )) ;; run-id test-name item-path))) ;; Am I completed? - (if (equal? (db:test-get-state testinfo) "RUNNING") ;; (not (equal? (db:test-get-state testinfo) "COMPLETED")) + (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should *not* be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED")) (let ((new-state (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status ;; "COMPLETED" ;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test ) (new-status (cond