Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -215,13 +215,169 @@
 	 (launch:einf-rollup-status-set! exit-info 1) ;; (vector-set! exit-info 3 1) ;; force fail, this used to be next-state but that doesn't make sense. should always be "COMPLETED" 
 	 (tests:test-set-status! run-id test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f)
 	 )))
     logpro-used))
 
+(define (launch:manage-steps run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m)
+  ;; (let-values
+  ;;  (((pid exit-status exit-code)
+  ;;    (run-n-wait fullrunscript)))
+  ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f)
+  ;; Since we should have a clean slate at this time there is no need to do 
+  ;; any of the other stuff that tests:test-set-status! does. Let's just 
+  ;; force RUNNING/n/a
+
+  ;; (thread-sleep! 0.3)
+  (tests:test-force-state-status! run-id test-id "RUNNING" "n/a")
+  (rmt:roll-up-pass-fail-counts run-id test-name item-path #f "RUNNING")
+  ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here
+
+  ;; if there is a runscript do it first
+  (if fullrunscript
+      (let ((pid (process-run fullrunscript)))
+	(rmt:test-set-top-process-pid run-id test-id pid)
+	(let loop ((i 0))
+	  (let-values
+	   (((pid-val exit-status exit-code) (process-wait pid #t)))
+	   (mutex-lock! m)
+	   (launch:einf-pid-set!           exit-info  pid)         ;; (vector-set! exit-info 0 pid)
+	   (launch:einf-exit-status-set!   exit-info  exit-status) ;; (vector-set! exit-info 1 exit-status)
+	   (launch:einf-exit-code-set!     exit-info  exit-code)   ;; (vector-set! exit-info 2 exit-code)
+	   (launch:einf-rollup-status-set! exit-info  exit-code)   ;; (vector-set! exit-info 3 exit-code)  ;; rollup status
+	   (mutex-unlock! m)
+	   (if (eq? pid-val 0)
+	       (begin
+		 (thread-sleep! 2)
+		 (loop (+ i 1)))
+	       )))))
+  ;; then, if runscript ran ok (or did not get called)
+  ;; do all the ezsteps (if any)
+  (if ezsteps
+      (let* ((testconfig ;; (read-config (conc work-area "/testconfig") #f #t environ-patt: "pre-launch-env-vars")) ;; FIXME??? is allow-system ok here?
+	      ;; NOTE: it is tempting to turn off force-create of testconfig but dynamic
+	      ;;       ezstep names need a full re-eval here.
+	      (tests:get-testconfig test-name tconfigreg #t force-create: #t)) ;; 'return-procs)))
+	     (ezstepslst (if (hash-table? testconfig)
+			     (hash-table-ref/default testconfig "ezsteps" '())
+			     #f)))
+	(if testconfig
+	    (hash-table-set! *testconfigs* test-name testconfig) ;; cached for lazy reads later ...
+	    (begin
+	      (launch:setup)
+	      (debug:print 0 "WARNING: no testconfig found for " test-name " in search path:\n  "
+			   (string-intersperse (tests:get-tests-search-path *configdat*) "\n  "))))
+	;; after all that, still no testconfig? Time to abort
+	(if (not testconfig)
+	    (begin
+	      (debug:print 0 "ERROR: Failed to resolve megatest.config, runconfigs.config and testconfig issues. Giving up now")
+	      (exit 1)))
+	(if (not (file-exists? ".ezsteps"))(create-directory ".ezsteps"))
+	;; if ezsteps was defined then we are sure to have at least one step but check anyway
+	(if (not (> (length ezstepslst) 0))
+	    (debug:print 0 "ERROR: ezsteps defined but ezstepslst is zero length")
+	    (let loop ((ezstep (car ezstepslst))
+		       (tal    (cdr ezstepslst))
+		       (prevstep #f))
+	      ;; check exit-info (vector-ref exit-info 1)
+	      (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1)
+		  (let ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig))
+			(stepname    (car ezstep)))
+		    ;; if logpro-used read in the stepname.dat file
+		    (if (and logpro-used (file-exists? (conc stepname ".dat")))
+			(launch:load-logpro-dat stepname))
+		    (if (steprun-good? logpro-used (launch:einf-exit-code exit-info))
+			(if (not (null? tal))
+			    (loop (car tal) (cdr tal) stepname))
+			(debug:print 4 "WARNING: step " (car ezstep) " failed. Stopping")))
+		  (debug:print 4 "WARNING: a prior step failed, stopping at " ezstep)))))))
+
+(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
+  (let* ((start-seconds (current-seconds))
+	 (calc-minutes  (lambda ()
+			  (inexact->exact 
+			   (round 
+			    (- 
+			     (current-seconds) 
+			     start-seconds)))))
+	 (kill-tries 0))
+    ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
+    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
+    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
+    (let loop ((minutes   (calc-minutes))
+	       (cpu-load  (get-cpu-load))
+	       (disk-free (get-df (current-directory))))
+      (let ((new-cpu-load (let* ((load  (get-cpu-load))
+				 (delta (abs (- load cpu-load))))
+			    (if (> delta 0.6) ;; don't bother updating with small changes
+				load
+				#f)))
+	    (new-disk-free (let* ((df    (get-df (current-directory)))
+				  (delta (abs (- df disk-free))))
+			     (if (> delta 200) ;; ignore changes under 200 Meg
+				 df
+				 #f))))
+	(set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat))
+			    (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
+						(time-exceeded (> run-seconds runtlim)))
+					   (if time-exceeded
+					       (begin
+						 (debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)
+						 #t)
+					       #f)))))
+	(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
+	(if kill-job? 
+	    (begin
+	      (mutex-lock! m)
+	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
+	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
+	      ;;       between tries?
+	      (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0))
+		     (pid2 (rmt:test-get-top-process-pid run-id test-id))
+		     (pids (delete-duplicates (filter number? (list pid1 pid2)))))
+		(if (not (null? pids))
+		    (begin
+		      (for-each
+		       (lambda (pid)
+			 (handle-exceptions
+			  exn
+			  (begin
+			    (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.")
+			    (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)))
+			  (debug:print 0 "WARNING: Request received to kill job " pid) ;;  " (attempt # " kill-tries ")")
+			  (debug:print-info 0 "Signal mask=" (signal-mask))
+			  ;; (if (process:alive? pid)
+			  ;;     (begin
+			  (map (lambda (pid-num)
+				 (process-signal pid-num signal/term))
+			       (process:get-sub-pids pid))
+			  (thread-sleep! 5)
+			  ;; (if (process:process-alive? pid)
+			  (map (lambda (pid-num)
+				 (handle-exceptions
+				  exn
+				  #f
+				  (process-signal pid-num signal/kill)))
+			       (process:get-sub-pids pid))))
+		       ;;    (debug:print-info 0 "not killing process " pid " as it is not alive"))))
+		       pids)
+		      (tests:test-set-status! run-id test-id "KILLED"  "KILLED" (args:get-arg "-m") #f))
+		    (begin
+		      (debug:print 0 "ERROR: Nothing to kill, pid1=" pid1 ", pid2=" pid2)
+		      (tests:test-set-status! run-id test-id "KILLED"  "FAILED TO KILL" (args:get-arg "-m") #f)
+		      )))
+	      (mutex-unlock! m)
+	      ;; no point in sticking around. Exit now.
+	      (exit)))
+	(if (hash-table-ref/default misc-flags 'keep-going #f)
+	    (begin
+	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
+	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
+		  (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free)))))))
+    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
+
 (define (launch:execute encoded-cmd)
-  
-   (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
+     (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
 	  (tconfigreg (tests:get-all)))
     (setenv "MT_CMDINFO" encoded-cmd)
     (if (list? cmdinfo) ;; ((testpath /tmp/mrwellan/jazzmind/src/example_run/tests/sqlitespeed)
 	;; (test-name sqlitespeed) (runscript runscript.rb) (db-host localhost) (run-id 1))
 	(let* ((testpath  (assoc/default 'testpath  cmdinfo))  ;; testpath is the test spec area
@@ -254,12 +410,11 @@
                                       (let ((fulln (conc testpath "/" runscript)))
 	                                  (if (and (file-exists? fulln)
                                                    (file-execute-access? fulln))
                                               fulln
                                               runscript))))) ;; assume it is on the path
-	       ;; (rollup-status 0)
-	       )
+	       ) ;; (rollup-status 0)
 
 	  ;; NFS might not have propagated the directory meta data to the run host - give it time if needed
 	  (let loop ((count 0))
 	    (if (or (file-exists? top-path)
 		    (> count 10))
@@ -419,175 +574,26 @@
 	  ;; 
 	  (let* ((m            (make-mutex))
 		 (kill-job?    #f)
 		 (exit-info    (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status
 		 (job-thread   #f)
-		 (keep-going   #t)
+		 ;; (keep-going   #t)
+		 (misc-flags   (let ((ht (make-hash-table)))
+				 (hash-table-set! ht 'keep-going #t)
+				 ht))
 		 (runit        (lambda ()
-				 ;; (let-values
-				 ;;  (((pid exit-status exit-code)
-				 ;;    (run-n-wait fullrunscript)))
-				 ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f)
-				 ;; Since we should have a clean slate at this time there is no need to do 
-				 ;; any of the other stuff that tests:test-set-status! does. Let's just 
-				 ;; force RUNNING/n/a
-				 
-
-				 ;; (thread-sleep! 0.3)
-				 (tests:test-force-state-status! run-id test-id "RUNNING" "n/a")
-				 (rmt:roll-up-pass-fail-counts run-id test-name item-path #f "RUNNING")
-				 ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here
-
-				 ;; if there is a runscript do it first
-				 (if fullrunscript
-				     (let ((pid (process-run fullrunscript)))
-				       (rmt:test-set-top-process-pid run-id test-id pid)
-				       (let loop ((i 0))
-					 (let-values
-					  (((pid-val exit-status exit-code) (process-wait pid #t)))
-					  (mutex-lock! m)
-					  (launch:einf-pid-set!           exit-info  pid)         ;; (vector-set! exit-info 0 pid)
-					  (launch:einf-exit-status-set!   exit-info  exit-status) ;; (vector-set! exit-info 1 exit-status)
-					  (launch:einf-exit-code-set!     exit-info  exit-code)   ;; (vector-set! exit-info 2 exit-code)
-					  (launch:einf-rollup-status-set! exit-info  exit-code)   ;; (vector-set! exit-info 3 exit-code)  ;; rollup status
-					  (mutex-unlock! m)
-					  (if (eq? pid-val 0)
-					      (begin
-						(thread-sleep! 2)
-						(loop (+ i 1)))
-					      )))))
-				 ;; then, if runscript ran ok (or did not get called)
-				 ;; do all the ezsteps (if any)
-				 (if ezsteps
-				     (let* ((testconfig ;; (read-config (conc work-area "/testconfig") #f #t environ-patt: "pre-launch-env-vars")) ;; FIXME??? is allow-system ok here?
-					     ;; NOTE: it is tempting to turn off force-create of testconfig but dynamic
-					     ;;       ezstep names need a full re-eval here.
-					     (tests:get-testconfig test-name tconfigreg #t force-create: #t)) ;; 'return-procs)))
-					    (ezstepslst (if (hash-table? testconfig)
-							    (hash-table-ref/default testconfig "ezsteps" '())
-							    #f)))
-				       (if testconfig
-					   (hash-table-set! *testconfigs* test-name testconfig) ;; cached for lazy reads later ...
-					   (begin
-					     (launch:setup)
-					     (debug:print 0 "WARNING: no testconfig found for " test-name " in search path:\n  "
-							  (string-intersperse (tests:get-tests-search-path *configdat*) "\n  "))))
-				       ;; after all that, still no testconfig? Time to abort
-				       (if (not testconfig)
-					   (begin
-					     (debug:print 0 "ERROR: Failed to resolve megatest.config, runconfigs.config and testconfig issues. Giving up now")
-					     (exit 1)))
-				       (if (not (file-exists? ".ezsteps"))(create-directory ".ezsteps"))
-				       ;; if ezsteps was defined then we are sure to have at least one step but check anyway
-				       (if (not (> (length ezstepslst) 0))
-					   (debug:print 0 "ERROR: ezsteps defined but ezstepslst is zero length")
-					   (let loop ((ezstep (car ezstepslst))
-						      (tal    (cdr ezstepslst))
-						      (prevstep #f))
-					     ;; check exit-info (vector-ref exit-info 1)
-					     (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1)
-						 (let ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig))
-						       (stepname    (car ezstep)))
-						   ;; if logpro-used read in the stepname.dat file
-						   (if (and logpro-used (file-exists? (conc stepname ".dat")))
-						       (launch:load-logpro-dat stepname))
-						   (if (steprun-good? logpro-used (launch:einf-exit-code exit-info))
-						       (if (not (null? tal))
-							   (loop (car tal) (cdr tal) stepname))
-						       (debug:print 4 "WARNING: step " (car ezstep) " failed. Stopping")))
-						 (debug:print 4 "WARNING: a prior step failed, stopping at " ezstep))))))))
+				 (launch:manage-steps run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m)))
 		 (monitorjob   (lambda ()
-				 (let* ((start-seconds (current-seconds))
-					(calc-minutes  (lambda ()
-							 (inexact->exact 
-							  (round 
-							   (- 
-							    (current-seconds) 
-							    start-seconds)))))
-					(kill-tries 0))
-				   ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
-				   ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
-				   (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
-				   (let loop ((minutes   (calc-minutes))
-					      (cpu-load  (get-cpu-load))
-					      (disk-free (get-df (current-directory))))
-				     (let ((new-cpu-load (let* ((load  (get-cpu-load))
-								(delta (abs (- load cpu-load))))
-							   (if (> delta 0.6) ;; don't bother updating with small changes
-							       load
-							       #f)))
-					   (new-disk-free (let* ((df    (get-df (current-directory)))
-								 (delta (abs (- df disk-free))))
-							    (if (> delta 200) ;; ignore changes under 200 Meg
-								df
-								#f))))
-				       (set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat))
-							   (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
-									       (time-exceeded (> run-seconds runtlim)))
-									  (if time-exceeded
-									      (begin
-										(debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)
-										#t)
-									      #f)))))
-				       (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
-				       (if kill-job? 
-					   (begin
-					     (mutex-lock! m)
-					     ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
-					     ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
-					     ;;       between tries?
-					     (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0))
-						    (pid2 (rmt:test-get-top-process-pid run-id test-id))
-						    (pids (delete-duplicates (filter number? (list pid1 pid2)))))
-					       (if (not (null? pids))
-						   (begin
-						     (for-each
-						      (lambda (pid)
-							(handle-exceptions
-							 exn
-							 (begin
-							   (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.")
-							   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)))
-							 (debug:print 0 "WARNING: Request received to kill job " pid) ;;  " (attempt # " kill-tries ")")
-							 (debug:print-info 0 "Signal mask=" (signal-mask))
-							 ;; (if (process:alive? pid)
-							 ;;     (begin
-							 (map (lambda (pid-num)
-								(process-signal pid-num signal/term))
-							      (process:get-sub-pids pid))
-							 (thread-sleep! 5)
-							 ;; (if (process:process-alive? pid)
-							 (map (lambda (pid-num)
-								(handle-exceptions
-								 exn
-								 #f
-								 (process-signal pid-num signal/kill)))
-							      (process:get-sub-pids pid))))
-							 ;;    (debug:print-info 0 "not killing process " pid " as it is not alive"))))
-						      pids)
-						     (tests:test-set-status! run-id test-id "KILLED"  "KILLED" (args:get-arg "-m") #f))
-						   (begin
-						     (debug:print 0 "ERROR: Nothing to kill, pid1=" pid1 ", pid2=" pid2)
-						     (tests:test-set-status! run-id test-id "KILLED"  "FAILED TO KILL" (args:get-arg "-m") #f)
-						     )))
-					     (mutex-unlock! m)
-					     ;; no point in sticking around. Exit now.
-					     (exit)))
-				       (if keep-going
-					   (begin
-					     (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
-					     (if keep-going    ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
-						 (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free)))))))
-				   (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f)))) ;; NOTE: Checking twice for keep-going is intentional
+				 (launch:monitor-job  run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)))
 		 (th1          (make-thread monitorjob "monitor job"))
 		 (th2          (make-thread runit "run job")))
 	    (set! job-thread th2)
 	    (thread-start! th1)
 	    (thread-start! th2)
 	    (thread-join! th2)
 	    (debug:print-info 0 "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...")
-	    (set! keep-going #f)
+	    (hash-table-set! misc-flags 'keep-going #f)
 	    (thread-join! th1)
 	    (thread-sleep! 1)       ;; givbe thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec.
 	    (mutex-lock! m)
 	    (let* ((item-path (item-list->path itemdat))
 		   ;; only state and status needed - use lazy routine