Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -11,11 +11,12 @@
 ;;======================================================================
 ;; launch a task - this runs on the originating host, tests themselves
 ;;
 ;;======================================================================
 
-(use regex regex-case base64 sqlite3 srfi-18 directory-utils posix-extras z3)
+(use regex regex-case base64 sqlite3 srfi-18 directory-utils posix-extras z3 call-with-environment-variables)
+
 (import (prefix base64 base64:))
 (import (prefix sqlite3 sqlite3:))
 
 (declare (unit launch))
 (declare (uses common))
@@ -51,10 +52,115 @@
   (let ((enccmd (if encoded-cmd encoded-cmd (getenv "MT_CMDINFO"))))
     (if enccmd
 	(common:read-encoded-string enccmd)
 	'())))
 
+
+;; Run one ezstep of a test and fold its result into exit-info.
+;;
+;;   ezstep    - list, car is the step name, cadr is the step info string
+;;               "{params} command" where the {params} prefix is optional
+;;   run-id    - passed through to the rmt: and tests: calls
+;;   test-id   - passed through to the rmt: and tests: calls
+;;   exit-info - vector, slots 0-2 are overwritten with the pid, exit status
+;;               and exit code of the last process waited on, slot 3 holds
+;;               the rollup status (0 = pass, 2 = warn, anything else = fail)
+;;   m         - mutex locked while slots 0-2 of exit-info are written
+;;   tal       - the steps still to run, when null a passing or warning step
+;;               sets the test state to "COMPLETED" instead of "RUNNING"
+;;
+;; The step command is run through process-run with stdout redirected to
+;; <stepname>.log, logpro (if used) is fed that log on stdin.
+;;
+;; Returns the result of (file-exists? "<stepname>.logpro") taken before the
+;; step was launched, i.e. whether logpro was used for this step.
+;;
+;; NB// can safely assume we are in test-area directory
+;;
+(define (launch:runstep ezstep run-id test-id exit-info m tal)
+  (let* ((stepname    (car ezstep))
+         (stepinfo    (cadr ezstep))
+         (stepparts   (string-match (regexp "^(\\{([^\\}]*)\\}\\s*|)(.*)$") stepinfo))
+         (stepparms   (list-ref stepparts 2)) ;; for future use, {VAR=1,2,3}, run step for each
+         (stepcmd     (list-ref stepparts 3))
+         (log-file    (conc stepname ".log"))
+         (logpro-file (conc stepname ".logpro"))
+         (html-file   (conc stepname ".html"))
+         (logpro-used (file-exists? logpro-file))
+         (cmd         (conc stepcmd " > " log-file)) ;; only stdout goes to the log
+         ;; poll pid every two seconds until process-wait stops returning 0 for
+         ;; it, every poll result is written to exit-info under the mutex
+         (wait-for    (lambda (pid)
+                        (let processloop ()
+                          (let-values (((pid-val exit-status exit-code) (process-wait pid #t)))
+                            (mutex-lock! m)
+                            (vector-set! exit-info 0 pid)
+                            (vector-set! exit-info 1 exit-status)
+                            (vector-set! exit-info 2 exit-code)
+                            (mutex-unlock! m)
+                            (if (eq? pid-val 0)
+                                (begin
+                                  (thread-sleep! 2)
+                                  (processloop))))))))
+    (debug:print 4 "ezsteps:\n stepname: " stepname " stepinfo: " stepinfo " stepparts: " stepparts
+                 " stepparms: " stepparms " stepcmd: " stepcmd)
+    (debug:print 4 "script: " cmd)
+    (rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f)
+    ;; now launch the actual process, "." is appended to PATH while it runs
+    (call-with-environment-variables
+     (list (cons "PATH" (conc (get-environment-variable "PATH") ":.")))
+     (lambda ()
+       (let ((pid (process-run cmd)))
+         (rmt:test-set-top-process-pid run-id test-id pid)
+         (wait-for pid))))
+    (debug:print-info 0 "step " stepname " completed with exit code " (vector-ref exit-info 2))
+    ;; now run logpro if needed, its exit code replaces the one of the step
+    (if logpro-used
+        (begin
+          (wait-for (process-run (conc "logpro " logpro-file " " html-file " < " log-file)))
+          ;; report once, after the wait is over, not once per poll
+          (debug:print-info 0 "logpro for step " stepname " exited with code " (vector-ref exit-info 2))))
+    (rmt:teststep-set-status! run-id test-id stepname "end" (vector-ref exit-info 2) #f
+                              (if logpro-used html-file ""))
+    (if logpro-used
+        (rmt:test-set-log! run-id test-id html-file))
+    ;; set the test final status
+    (let* ((this-step-status (cond
+                              ((and (eq? (vector-ref exit-info 2) 2) logpro-used) 'warn)
+                              ((eq? (vector-ref exit-info 2) 0)                   'pass)
+                              (else 'fail)))
+           (overall-status   (cond
+                              ((eq? (vector-ref exit-info 3) 2) 'warn) ;; rollup-status
+                              ((eq? (vector-ref exit-info 3) 0) 'pass)
+                              (else 'fail)))
+           ;; a step can only keep or worsen the status rolled up so far
+           (next-status      (cond
+                              ((eq? overall-status 'pass) this-step-status)
+                              ((eq? overall-status 'warn)
+                               (if (eq? this-step-status 'fail) 'fail 'warn))
+                              (else 'fail)))
+           (next-state       (if (null? tal) ;; more to run?
+                                 "COMPLETED"
+                                 "RUNNING")))
+      (debug:print 4 "Exit value received: " (vector-ref exit-info 2) " logpro-used: " logpro-used
+                   " this-step-status: " this-step-status " overall-status: " overall-status
+                   " next-status: " next-status " rollup-status: " (vector-ref exit-info 3))
+      (case next-status
+        ((warn)
+         (vector-set! exit-info 3 2) ;; rollup-status
+         ;; NB// test-set-status! does rdb calls under the hood
+         (tests:test-set-status! run-id test-id next-state "WARN"
+                                 (if (eq? this-step-status 'warn) "Logpro warning found" #f)
+                                 #f))
+        ((pass)
+         (tests:test-set-status! run-id test-id next-state "PASS" #f #f))
+        (else ;; 'fail
+         ;; force fail, the state is always "COMPLETED" here whatever tal holds
+         (vector-set! exit-info 3 1)
+         (tests:test-set-status! run-id test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f))))
+    logpro-used))
+
 (define (launch:execute encoded-cmd)
   (let* ((cmdinfo   (common:read-encoded-string encoded-cmd)))
     (setenv "MT_CMDINFO" encoded-cmd)
     (if (list? cmdinfo) ;; ((testpath /tmp/mrwellan/jazzmind/src/example_run/tests/sqlitespeed)
 	;; (test-name sqlitespeed) (runscript runscript.rb) (db-host localhost) (run-id 1))
@@ -88,11 +194,12 @@
                                       (let ((fulln (conc testpath "/" runscript)))
 	                                  (if (and (file-exists? fulln)
                                                    (file-execute-access? fulln))
                                               fulln
                                               runscript))))) ;; assume it is on the path
-	       (rollup-status 0))
+	       ;; (rollup-status 0)
+	       )
 	  (change-directory top-path)
 
 	  ;; (set-signal-handler! signal/int (lambda ()
 					    
 	  ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART,
@@ -197,11 +304,11 @@
 	  ;; any previous runs
 	  ;; (db:test-remove-steps db run-id testname itemdat)
 	  
 	  (let* ((m            (make-mutex))
 		 (kill-job?    #f)
-		 (exit-info    (vector #t #t #t))
+		 (exit-info    (vector #t #t #t 0))
 		 (job-thread   #f)
 		 (keep-going   #t)
 		 (runit        (lambda ()
 				 ;; (let-values
 				 ;;  (((pid exit-status exit-code)
@@ -210,14 +317,14 @@
 				 ;; Since we should have a clean slate at this time there is no need to do 
 				 ;; any of the other stuff that tests:test-set-status! does. Let's just 
 				 ;; force RUNNING/n/a
 				 
 
-				 (thread-sleep! 0.3)
+				 ;; (thread-sleep! 0.3)
 				 (tests:test-force-state-status! run-id test-id "RUNNING" "n/a")
 				 (rmt:roll-up-pass-fail-counts run-id test-name item-path "RUNNING")
-				 (thread-sleep! 0.3) ;; NFS slowness has caused grief here
+				 ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here
 
 				 ;; if there is a runscript do it first
 				 (if fullrunscript
 				     (let ((pid (process-run fullrunscript)))
 				       (rmt:test-set-top-process-pid run-id test-id pid)
@@ -226,11 +333,11 @@
 					  (((pid-val exit-status exit-code) (process-wait pid #t)))
 					  (mutex-lock! m)
 					  (vector-set! exit-info 0 pid)
 					  (vector-set! exit-info 1 exit-status)
 					  (vector-set! exit-info 2 exit-code)
-					  (set! rollup-status exit-code) 
+					  (vector-set! exit-info 3 exit-code)  ;; rollup status
 					  (mutex-unlock! m)
 					  (if (eq? pid-val 0)
 					      (begin
 						(thread-sleep! 2)
 						(loop (+ i 1)))
@@ -248,90 +355,11 @@
 					   (let loop ((ezstep (car ezstepslst))
 						      (tal    (cdr ezstepslst))
 						      (prevstep #f))
 					     ;; check exit-info (vector-ref exit-info 1)
 					     (if (vector-ref exit-info 1)
-						 (let* ((stepname  (car ezstep))  ;; do stuff to run the step
-							(stepinfo  (cadr ezstep))
-							(stepparts (string-match (regexp "^(\\{([^\\}]*)\\}\\s*|)(.*)$") stepinfo))
-							(stepparms (list-ref stepparts 2)) ;; for future use, {VAR=1,2,3}, run step for each 
-							(stepcmd   (list-ref stepparts 3))
-							(script    "") ; "#!/bin/bash\n") ;; yep, we depend on bin/bash FIXME!!!
-							(logpro-used #f))
-						   ;; NB// can safely assume we are in test-area directory
-						   (debug:print 4 "ezsteps:\n stepname: " stepname " stepinfo: " stepinfo " stepparts: " stepparts
-								" stepparms: " stepparms " stepcmd: " stepcmd)
-						   
-						   (if (file-exists? (conc stepname ".logpro"))(set! logpro-used #t))
-
-						   ;; ;; first source the previous environment
-						   ;; (let ((prev-env (conc ".ezsteps/" prevstep (if (string-search (regexp "csh") 
-						   ;;      							 (get-environment-variable "SHELL")) ".csh" ".sh"))))
-						   ;;   (if (and prevstep (file-exists? prev-env))
-						   ;;       (set! script (conc script "source " prev-env))))
-						   
-						   ;; call the command using mt_ezstep
-						   (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "x") " " stepcmd))
-
-						   (debug:print 4 "script: " script)
-						   (rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f)
-						   ;; now launch
-						   (let ((pid (process-run script)))
-						     (rmt:test-set-top-process-pid run-id test-id pid)
-						     (let processloop ((i 0))
-						       (let-values (((pid-val exit-status exit-code)(process-wait pid #t)))
-								   (mutex-lock! m)
-								   (vector-set! exit-info 0 pid)
-								   (vector-set! exit-info 1 exit-status)
-								   (vector-set! exit-info 2 exit-code)
-								   (mutex-unlock! m)
-								   (if (eq? pid-val 0)
-								       (begin
-									 (thread-sleep! 2)
-									 (processloop (+ i 1))))
-								   ))
-                                                     (let ((exinfo (vector-ref exit-info 2))
-                                                           (logfna (if logpro-used (conc stepname ".html") "")))
-						       (rmt:teststep-set-status! run-id test-id stepname "end" exinfo #f logfna))
-						     (if logpro-used
-							 (rmt:test-set-log! run-id test-id (conc stepname ".html")))
-						     ;; set the test final status
-						     (let* ((this-step-status (cond
-									       ((and (eq? (vector-ref exit-info 2) 2) logpro-used) 'warn)
-									       ((eq? (vector-ref exit-info 2) 0)                   'pass)
-									       (else 'fail)))
-							    (overall-status   (cond
-									       ((eq? rollup-status 2) 'warn)
-									       ((eq? rollup-status 0) 'pass)
-									       (else 'fail)))
-							    (next-status      (cond 
-									       ((eq? overall-status 'pass) this-step-status)
-									       ((eq? overall-status 'warn)
-										(if (eq? this-step-status 'fail) 'fail 'warn))
-									       (else 'fail)))
-							    (next-state       ;; "RUNNING") ;; WHY WAS THIS CHANGED TO NOT USE (null? tal) ??
-							                       (cond
-									       ((null? tal) ;; more to run?
-									        "COMPLETED")
-									       (else "RUNNING")))
-							    )
-						       (debug:print 4 "Exit value received: " (vector-ref exit-info 2) " logpro-used: " logpro-used 
-								    " this-step-status: " this-step-status " overall-status: " overall-status 
-								    " next-status: " next-status " rollup-status: " rollup-status)
-						       (case next-status
-							 ((warn)
-							  (set! rollup-status 2)
-							  ;; NB// test-set-status! does rdb calls under the hood
-							  (tests:test-set-status! run-id test-id next-state "WARN" 
-									  (if (eq? this-step-status 'warn) "Logpro warning found" #f)
-									  #f))
-							 ((pass)
-							  (tests:test-set-status! run-id test-id next-state "PASS" #f #f))
-							 (else ;; 'fail
-							  (set! rollup-status 1) ;; force fail, this used to be next-state but that doesn't make sense. should always be "COMPLETED" 
-							  (tests:test-set-status! run-id test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f)
-							  ))))
+						 (let ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal)))
 						   (if (and (steprun-good? logpro-used (vector-ref exit-info 2))
 							    (not (null? tal)))
 						       (loop (car tal) (cdr tal) stepname)))
 						 (debug:print 4 "WARNING: a prior step failed, stopping at " ezstep))))))))
 		 (monitorjob   (lambda ()
@@ -374,16 +402,25 @@
 							 exn
 							 (begin
 							   (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.")
 							   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)))
 							 (debug:print 0 "WARNING: Request received to kill job " pid) ;;  " (attempt # " kill-tries ")")
-							 (if (process:alive? pid)
-							     (begin
-							       (process-signal pid signal/int)
-							       (thread-sleep! 5)
-							       (if (process:process-alive? pid)
-								   (process-signal pid signal/kill))))))
+							 (debug:print-info 0 "Signal mask=" (signal-mask))
+							 ;; (if (process:alive? pid)
+							 ;;     (begin
+							 (map (lambda (pid-num)
+								(process-signal pid-num signal/term))
+							      (process:get-sub-pids pid))
+							 (thread-sleep! 5)
+							 ;; (if (process:process-alive? pid)
+							 (map (lambda (pid-num)
+								(handle-exceptions
+								 exn
+								 #f
+								 (process-signal pid-num signal/kill)))
+							      (process:get-sub-pids pid))))
+							 ;;    (debug:print-info 0 "not killing process " pid " as it is not alive"))))
 						      pids)
 						     (tests:test-set-status! run-id test-id "KILLED"  "KILLED" (args:get-arg "-m") #f))
 						   (begin
 						     (debug:print 0 "ERROR: Nothing to kill, pid1=" pid1 ", pid2=" pid2)
 						     (tests:test-set-status! run-id test-id "KILLED"  "FAILED TO KILL" (args:get-arg "-m") #f)
@@ -417,19 +454,19 @@
 				                                        ;; "COMPLETED"
 							                ;; (db:test-get-state testinfo)))   ;; else preseve the state as set within the test
 				    )
 			(new-status (cond
 				     ((not (vector-ref exit-info 1)) "FAIL") ;; job failed to run
-				     ((eq? rollup-status 0)
+				     ((eq? (vector-ref exit-info 3) 0)
 				      ;; if the current status is AUTO then defer to the calculated value (i.e. leave this AUTO)
 				      (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO" "PASS"))
-				     ((eq? rollup-status 1) "FAIL")
-				     ((eq? rollup-status 2)
+				     ((eq? (vector-ref exit-info 3) 1) "FAIL")
+				     ((eq? (vector-ref exit-info 3) 2)
 				      ;; if the current status is AUTO the defer to the calculated value but qualify (i.e. make this AUTO-WARN)
 				      (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN"))
 				     (else "FAIL")))) ;; (db:test-get-status testinfo)))
-		    (debug:print-info 1 "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (vector-ref exit-info 1) " and rollup-status of " rollup-status)
+		    (debug:print-info 1 "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (vector-ref exit-info 1) " and rollup-status of " (vector-ref exit-info 3))
 		    (tests:test-set-status! run-id 
 					    test-id 
 					    new-state
 					    new-status
 					    (args:get-arg "-m") #f)

Index: process.scm
==================================================================
--- process.scm
+++ process.scm
@@ -11,10 +11,11 @@
 
 ;;======================================================================
 ;; Process convience utils
 ;;======================================================================
 
+(use regex)
 (declare (unit process))
 (declare (uses common))
 
 (define (conservative-read port)
   (let loop ((res ""))
@@ -147,5 +148,17 @@
    (file-exists? (conc "/proc/" pid))
    (let-values (((rpid exit-type exit-signal)(process-wait pid #t)))
        (and (number? rpid)
 	    (equal? rpid pid)))))
 	 
+;; Pids shown as "(digits)" by "pstree -A -p <pid>", last printed first. Bare
+;; digits in a process name (e.g. "z3") are skipped, they are not pids.
+(define (process:get-sub-pids pid)
+  (with-input-from-pipe
+   (conc "pstree -A -p " pid)
+   (lambda ()
+     (let loop ((inl (read-line)) (res '()))
+       (if (eof-object? inl)
+           res
+           (loop (read-line) ;; drop the parens around each field, newest go in front
+                 (append (reverse (map (lambda (f) (string->number (substring f 1 (- (string-length f) 1))))
+                                       (string-split-fields "\\(\\d+\\)" inl))) res)))))))

Index: tests/fullrun/tests/runfirst/main.sh
==================================================================
--- tests/fullrun/tests/runfirst/main.sh
+++ tests/fullrun/tests/runfirst/main.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
+
+# (export DISPLAY=:0;xterm) 
 
 # megatest -step wasting_time :state start :status n/a -m "This is a test step comment"
 # sleep 20
 # megatest -step wasting_time :state end :status $?