Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -136,11 +136,15 @@ (set-megatest-env-vars run-id) (set-item-env-vars itemdat) (save-environment-as-files "megatest") ;; open-run-close not needed for test-set-meta-info (tests:set-full-meta-info #f test-id run-id 0 work-area) - (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f) + + ;; (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f) + (tests:test-force-state-status! test-id "REMOTEHOSTSTART" "n/a") + (thread-sleep! 0.3) ;; NFS slowness has caused grief here + (if (args:get-arg "-xterm") (set! fullrunscript "xterm") (if (and fullrunscript (not (file-execute-access? fullrunscript))) (system (conc "chmod ug+x " fullrunscript)))) ;; We are about to actually kick off the test @@ -155,11 +159,20 @@ (keep-going #t) (runit (lambda () ;; (let-values ;; (((pid exit-status exit-code) ;; (run-n-wait fullrunscript))) - (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) + ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) + ;; Since we should have a clean slate at this time there is no need to do + ;; any of the other stuff that tests:test-set-status! does. Let's just + ;; force RUNNING/n/a + + + (thread-sleep! 0.3) + (tests:test-force-state-status! test-id "RUNNING" "n/a") + (thread-sleep! 0.3) ;; NFS slowness has caused grief here + ;; if there is a runscript do it first (if fullrunscript (let ((pid (process-run fullrunscript))) (let loop ((i 0)) (let-values @@ -170,11 +183,11 @@ (vector-set! exit-info 2 exit-code) (set! rollup-status exit-code) (mutex-unlock! m) (if (eq? pid-val 0) (begin - (thread-sleep! 1) + (thread-sleep! 2) (loop (+ i 1))) ))))) ;; then, if runscript ran ok (or did not get called) ;; do all the ezsteps (if any) (if ezsteps @@ -224,11 +237,11 @@ (vector-set! exit-info 1 exit-status) (vector-set! exit-info 2 exit-code) (mutex-unlock! m) (if (eq? pid-val 0) (begin - (thread-sleep! 1) + (thread-sleep! 2) (processloop (+ i 1)))) )) (let ((exinfo (vector-ref exit-info 2)) (logfna (if logpro-used (conc stepname ".html") ""))) ;; testing if procedures called in a remote call cause problems (ans: no or so I suspect) @@ -330,11 +343,11 @@ (set! kill-tries (+ 1 kill-tries)) (mutex-unlock! m))) ;; (sqlite3:finalize! db) (if keep-going (begin - (thread-sleep! (+ 10 (random 10))) ;; add some jitter to the call home time to spread out the db accesses + (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if keep-going (loop (calc-minutes)))))))))) ;; NOTE: Checking twice for keep-going is intentional (th1 (make-thread monitorjob "monitor job")) (th2 (make-thread runit "run job"))) (set! job-thread th2) @@ -341,17 +354,17 @@ (thread-start! th1) (thread-start! th2) (thread-join! th2) (set! keep-going #f) (thread-sleep! 1) - (thread-terminate! th1) ;; Not sure if this is a good idea - (thread-sleep! 0.1) ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. + ;; (thread-terminate! th1) ;; Not sure if this is a good idea + (thread-sleep! 1) ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. (mutex-lock! m) (let* ((item-path (item-list->path itemdat)) (testinfo (cdb:get-test-info-by-id *runremote* test-id))) ;; )) ;; run-id test-name item-path))) ;; Am I completed? - (if (equal? (db:test-get-state testinfo) "RUNNING") ;; (not (equal? (db:test-get-state testinfo) "COMPLETED")) + (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should *not* be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED")) (let ((new-state (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status ;; "COMPLETED" ;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test ) (new-status (cond Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -34,28 +34,10 @@ (include "key_records.scm") (include "db_records.scm") (include "run_records.scm") (include "megatest-fossil-hash.scm") -;; Overall exit handling setup immediately -;; -(let ((original-exit (exit-handler))) - (exit-handler (lambda (#!optional (exit-code 0)) - (printf "Preparing to exit with exit code ~A ...\n" exit-code) - (children - (lambda (pid) - (handle-exceptions - exn - #t - (let-values (((pid-val exit-status exit-code) (process-wait pid #t))) - (if (or (eq? pid-val pid) - (eq? pid-val 0)) - (begin - (printf "Sending signal/term to ~A\n" pid) - (process-signal pid signal/term))))))) - (original-exit exit-code)))) - (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc"))) (if (file-exists? debugcontrolf) (load debugcontrolf))) @@ -270,10 +252,34 @@ (begin (print megatest-version) (exit))) (define *didsomething* #f) + +;; Overall exit handling setup immediately +;; +(if (or (args:get-arg "-process-reap")) + ;; (args:get-arg "-runtests") + ;; (args:get-arg "-execute") + ;; (args:get-arg "-remove-runs") + ;; (args:get-arg "-runstep")) + (let ((original-exit (exit-handler))) + (exit-handler (lambda (#!optional (exit-code 0)) + (printf "Preparing to exit with exit code ~A ...\n" exit-code) + (for-each + (lambda (pid) + (handle-exceptions + exn + #t + (let-values (((pid-val exit-status exit-code) (process-wait pid #t))) + (if (or (eq? pid-val pid) + (eq? pid-val 0)) + (begin + (printf "Sending signal/term to ~A\n" pid) + (process-signal pid signal/term)))))) + (process:children #f)) + (original-exit exit-code))))) ;; Force default transport to fs ;; (if ;; (and (or (args:get-arg "-list-targets") ;; ;; (args:get-arg "-list-db-targets")) ;; (not (args:get-arg "-transport")) Index: process.scm ================================================================== --- process.scm +++ process.scm @@ -113,11 +113,11 @@ ;;====================================================================== ;; MISC PROCESS RELATED STUFF ;;====================================================================== -(define (children proc) +(define (process:children proc) (with-input-from-pipe (conc "ps h --ppid " (current-process-id) " -o pid") (lambda () (let loop ((inl (read-line)) (res '())) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -278,10 +278,13 @@ (loop (car tal)(cdr tal))) #f)))))) (pop-directory) result))))) +(define (tests:test-force-state-status! test-id state status) + (cdb:test-set-status-state *runremote* test-id status state #f) + (mt:process-triggers test-id state status)) ;; Do not rpc this one, do the underlying calls!!! (define (tests:test-set-status! test-id state status comment dat #!key (work-area #f)) (debug:print-info 4 "tests:test-set-status! test-id=" test-id ", state=" state ", status=" status ", dat=" dat) (let* ((db #f) Index: tests/fullrun/config/mt_include_1.config ================================================================== --- tests/fullrun/config/mt_include_1.config +++ tests/fullrun/config/mt_include_1.config @@ -1,17 +1,18 @@ [setup] # exectutable /path/to/megatest -max_concurrent_jobs 25 +max_concurrent_jobs 50 linktree #{getenv MT_RUN_AREA_HOME}/tmp/mt_links [jobtools] useshell yes # ## launcher launches jobs, the job is managed on the target host ## by megatest, comment out launcher to run local # workhosts localhost hermes -launcher exec nbfake +# launcher exec nbfake +launcher nbfind # launcher nodanggood ## use "xterm -e csi -- " as a launcher to examine the launch environment. ## exit with (exit) ## get a shell with (system "bash") Index: tests/fullrun/megatest.config ================================================================== --- tests/fullrun/megatest.config +++ tests/fullrun/megatest.config @@ -22,10 +22,11 @@ # this may save a few milliseconds on launching tests # launchwait no # Use http instead of direct filesystem access # transport http +transport fs # If set to "default" the old code is used. Otherwise defaults to 200 or uses # numeric value given. # runqueue 20 Index: utils/nbfake ================================================================== --- utils/nbfake +++ utils/nbfake @@ -4,9 +4,9 @@ # Can't always trust $PWD CURRWD=`pwd` if [[ $TARGETHOST == "" ]]; then - exec sh -c "cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* > NBFAKE-`date +%GWW%V.%u_%T` 2>&1 &" + sh -c "cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* > NBFAKE-`date +%GWW%V.%u_%T` 2>&1 &" else - exec ssh -n -f $TARGETHOST "sh -c \"cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* > NBFAKE-`date +%GWW%V.%u_%T` 2>&1 &\"" + ssh -n -f $TARGETHOST "sh -c \"cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* > NBFAKE-`date +%GWW%V.%u_%T` 2>&1 &\"" fi Index: utils/nbfind ================================================================== --- utils/nbfind +++ utils/nbfind @@ -13,10 +13,12 @@ max_load=50 else max_load=$MAX_ALLOWED_LOAD fi if [[ $lperc -lt $max_load ]];then - nbfake "$@" + echo "$@" | at now + 0 minutes +elif [[ "x$NBLAUNCHER" == "x" ]];then + echo "nbfind $@" | at now + 2 minutes else $NBLAUNCHER "$@" fi