@@ -31,14 +31,16 @@
 (declare (unit launch))
 (declare (uses subrun))
 (declare (uses common))
 (declare (uses configf))
 (declare (uses db))
+(declare (uses ezsteps))
 
 (include "common_records.scm")
 (include "key_records.scm")
 (include "db_records.scm")
+(include "megatest-fossil-hash.scm")
 
 ;;======================================================================
 ;; ezsteps
 ;;======================================================================
 
@@ -49,13 +51,19 @@
 ;; stepname {VAR=first,second,third ...} command ...
 ;; where the {VAR=first,second,third ...} is optional.
 
 ;; given an exit code and whether or not logpro was used calculate OK/BAD
 ;; return #t if we are ok, #f otherwise
-(define (steprun-good? logpro exitcode)
+(define (steprun-good? logpro exitcode stepparms) ;; stepparms: alist of data for this step; its 'params entry holds the {k=v;...} pairs
   (or (eq? exitcode 0)
-      (and logpro (eq? exitcode 2))))
+      (and logpro (eq? exitcode 2)) ;; logpro exit code 2 = warnings; open question: should the other logpro ok codes be accepted too?
+      (let* ((params (and (list? stepparms) (alist-ref 'params stepparms))) ;; params section of this step's data, #f if absent
+	     (keep-going (and (list? params)
+			      (alist-ref "keep-going" params equal?)))) ;; cdr of the "keep-going" entry, #f if not given
+	;; keep-going can be '() (truthy in Scheme) when the entry carries no value, so require a pair before taking car
+	(debug:print 0 *default-log-port* "keep-going=" keep-going)
+	(and (pair? keep-going) (equal? (car keep-going) "yes")))))
 
 ;; if handed a string, process it, else look for MT_CMDINFO
 (define (launch:get-cmdinfo-assoc-list #!key (encoded-cmd #f))
   (let ((enccmd (if encoded-cmd encoded-cmd (getenv "MT_CMDINFO"))))
     (if enccmd
@@ -86,199 +94,10 @@
 	   ((equal? status "PASS") "PASS") ;; skip the message part if status is pass
 	   (status (conc (configf:lookup dat "final" "exit-status") ": " (if msg msg "no message")))
 	   (else #f)))
 	#f)))
 
-(define (launch:runstep ezstep run-id test-id exit-info m tal testconfig) ;;; TODO: deprecate me in favor of ezsteps.scm
-  (let* ((stepname       (car ezstep))  ;; do stuff to run the step
-	 (stepinfo       (cadr ezstep))
-	;; (let ((info (cadr ezstep)))
-	;; 		   (if (proc? info) "" info)))
-	;; (stepproc       (let ((info (cadr ezstep)))
-	;; 		   (if (proc? info) info #f)))
-	 (stepparts      (string-match (regexp "^(\\{([^\\}\\{]*)\\}\\s*|)(.*)$") stepinfo))
-	 (stepparams     (if (and (list? stepparts)
-				  (> (length stepparts) 1))
-			     (list-ref stepparts 2)
-			     #f)) ;; for future use, {VAR=1,2,3}, run step for each
-	 (paramparts     (if (string? stepparams)
-			     (map (lambda (x)(string-split x "=")) (string-split-fields "[^;]*=[^;]*" stepparams))
-			     '()))
-	 (subrun         (alist-ref "subrun" paramparts equal?))
-	 (stepcmd        (if (and (list? stepparts)
-				  (> (length stepparts) 2))
-			     (list-ref stepparts 3)
-			     (conc "# error, no command for step "stepname)))
-	 (script         "") ; "#!/bin/bash\n") ;; yep, we depend on bin/bash FIXME!!!\
-	 (logpro-file    (conc stepname ".logpro"))
-	 (html-file      (conc stepname ".html"))
-	 (dat-file       (conc stepname ".dat"))
-	 (tconfig-logpro (configf:lookup testconfig "logpro" stepname))
-	 (logpro-used    (common:file-exists? logpro-file)))
-
-    (debug:print 0 *default-log-port* "stepparts: " stepparts ", stepparams: " stepparams
-                 ", paramparts: " paramparts ", subrun: " subrun ", stepcmd: " stepcmd)
-    
-    (if (and tconfig-logpro
-	     (not logpro-used)) ;; no logpro file found but have a defn in the testconfig
-	(begin
-	  (with-output-to-file logpro-file
-	    (lambda ()
-	      (print ";; logpro file extracted from testconfig\n"
-		     ";;")
-	      (print tconfig-logpro)))
-	  (set! logpro-used #t)))
-    
-    ;; NB// can safely assume we are in test-area directory
-    (debug:print 4 *default-log-port* "ezsteps:\n stepname: " stepname " stepinfo: " stepinfo " stepparts: " stepparts
-		 " stepparams: " stepparams " stepcmd: " stepcmd)
-    
-    ;; ;; first source the previous environment
-    ;; (let ((prev-env (conc ".ezsteps/" prevstep (if (string-search (regexp "csh") 
-    ;;      							 (get-environment-variable "SHELL")) ".csh" ".sh"))))
-    ;;   (if (and prevstep (common:file-exists? prev-env))
-    ;;       (set! script (conc script "source " prev-env))))
-    
-    ;; call the command using mt_ezstep
-    ;; (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "x") " " stepcmd))
-    
-    (debug:print 4 *default-log-port* "script: " script)
-    (rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f)
-    ;; now launch the actual process
-    (call-with-environment-variables 
-     (list (cons "PATH" (conc (get-environment-variable "PATH") ":.")))
-     (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1")
-       (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 
-	      (pid #f))
-	 (let ((proc (lambda ()
-		       (set! pid (process-run "/bin/bash" (list "-c" cmd))))))
-	   (if subrun
-               (begin
-                 (debug:print-info 0 *default-log-port* "Running without MT_.* environment variables.")
-                 (common:without-vars proc "^MT_.*"))
-	       (proc)))
-	 
-         (with-output-to-file "Makefile.ezsteps"
-           (lambda ()
-             (print stepname ".log :")
-             (print "\t" cmd)
-             (if (common:file-exists? (conc stepname ".logpro"))
-                 (print "\tlogpro " stepname ".logpro " stepname ".html < " stepname ".log"))
-             (print)
-             (print stepname " : " stepname ".log")
-             (print))
-           #:append)
-
-	 (rmt:test-set-top-process-pid run-id test-id pid)
-	 (let processloop ((i 0))
-	   (let-values (((pid-val exit-status exit-code)(process-wait pid #t)))
-		       (mutex-lock! m)
-		       (launch:einf-pid-set!         exit-info pid)         ;; (vector-set! exit-info 0 pid)
-		       (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status)
-		       (launch:einf-exit-code-set!   exit-info exit-code)   ;; (vector-set! exit-info 2 exit-code)
-		       (mutex-unlock! m)
-		       (if (eq? pid-val 0)
-			   (begin
-			     (thread-sleep! 2)
-			     (processloop (+ i 1))))
-		       )))))
-    (debug:print-info 0 *default-log-port* "step " stepname " completed with exit code " (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2))
-    ;; now run logpro if needed
-    (if logpro-used
-	(let* ((logpro-exe (or (getenv "LOGPRO_EXE") "logpro"))
-               (pid        (process-run (conc "/bin/sh -c '"logpro-exe" "logpro-file " " (conc stepname ".html") " < " stepname ".log > /dev/null'"))))
-	  (let processloop ((i 0))
-	    (let-values (((pid-val exit-status exit-code)(process-wait pid #t)))
-			(mutex-lock! m)
-			;; (make-launch:einf pid: pid exit-status: exit-status exit-code: exit-code)
-			(launch:einf-pid-set!         exit-info pid)         ;; (vector-set! exit-info 0 pid)
-			(launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status)
-			(launch:einf-exit-code-set!   exit-info exit-code)   ;; (vector-set! exit-info 2 exit-code)
-			(mutex-unlock! m)
-			(if (eq? pid-val 0)
-			    (begin
-			      (thread-sleep! 2)
-			      (processloop (+ i 1)))))
-	    (debug:print-info 0 *default-log-port* "logpro for step " stepname " exited with code " (launch:einf-exit-code exit-info))))) ;; (vector-ref exit-info 2)))))
-    
-    (let ((exinfo (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2))
-	  (logfna (if logpro-used (conc stepname ".html") ""))
-	  (comment #f))
-      (if logpro-used
-	  (let ((datfile (conc stepname ".dat")))
-	    ;; load the .dat file into the test_data table if it exists
-	    (if (common:file-exists? datfile)
-		(set! comment (launch:load-logpro-dat run-id test-id stepname)))
-	    (rmt:test-set-log! run-id test-id (conc stepname ".html"))))
-      (rmt:teststep-set-status! run-id test-id stepname "end" exinfo comment logfna))
-    ;; set the test final status
-    (let* ((process-exit-status (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2))
-	   (this-step-status (cond
-			      ((and (eq? process-exit-status 2) logpro-used) 'warn)   ;; logpro 2 = warnings
-			      ((and (eq? process-exit-status 3) logpro-used) 'check)  ;; logpro 3 = check
-			      ((and (eq? process-exit-status 4) logpro-used) 'waived) ;; logpro 4 = waived
-			      ((and (eq? process-exit-status 5) logpro-used) 'abort)  ;; logpro 5 = abort
-			      ((and (eq? process-exit-status 6) logpro-used) 'skip)   ;; logpro 6 = skip
-			      ((eq? process-exit-status 0)                   'pass)   ;; logpro 0 = pass
-			      (else 'fail)))
-	   (overall-status   (cond
-			      ((eq? (launch:einf-rollup-status exit-info) 2) 'warn) ;; rollup-status (vector-ref exit-info 3)
-			      ((eq? (launch:einf-rollup-status exit-info) 0) 'pass) ;; (vector-ref exit-info 3)
-			      (else 'fail)))
-	   (next-status      (cond 
-			      ((eq? overall-status 'pass) this-step-status)
-			      ((eq? overall-status 'warn)
-			       (if (eq? this-step-status 'fail) 'fail 'warn))
-			      ((eq? overall-status 'abort) 'abort)
-			      (else 'fail)))
-	   (next-state       ;; "RUNNING") ;; WHY WAS THIS CHANGED TO NOT USE (null? tal) ??
-	    (cond
-	     ((null? tal) ;; more to run?
-	      "COMPLETED")
-	     (else "RUNNING"))))
-      (debug:print 4 *default-log-port* "Exit value received: " (launch:einf-exit-code exit-info) " logpro-used: " logpro-used 
-		   " this-step-status: " this-step-status " overall-status: " overall-status 
-		   " next-status: " next-status " rollup-status: "  (launch:einf-rollup-status exit-info)) ;; (vector-ref exit-info 3))
-      (case next-status
-	((warn)
-	 (launch:einf-rollup-status-set! exit-info 2) ;; (vector-set! exit-info 3 2) ;; rollup-status
-	 ;; NB// test-set-status! does rdb calls under the hood
-	 (tests:test-set-status! run-id test-id next-state "WARN" 
-				 (if (eq? this-step-status 'warn) "Logpro warning found" #f)
-				 #f))
-	((check)
-	 (launch:einf-rollup-status-set! exit-info 3) ;; (vector-set! exit-info 3 3) ;; rollup-status
-	 ;; NB// test-set-status! does rdb calls under the hood
-	 (tests:test-set-status! run-id test-id next-state "CHECK" 
-				 (if (eq? this-step-status 'check) "Logpro check found" #f)
-				 #f))
-	((waived)
-	 (launch:einf-rollup-status-set! exit-info 4) ;; (vector-set! exit-info 3 3) ;; rollup-status
-	 ;; NB// test-set-status! does rdb calls under the hood
-	 (tests:test-set-status! run-id test-id next-state "WAIVED" 
-				 (if (eq? this-step-status 'check) "Logpro waived found" #f)
-				 #f))
-	((abort)
-	 (launch:einf-rollup-status-set! exit-info 5) ;; (vector-set! exit-info 3 4) ;; rollup-status
-	 ;; NB// test-set-status! does rdb calls under the hood
-	 (tests:test-set-status! run-id test-id next-state "ABORT" 
-				 (if (eq? this-step-status 'abort) "Logpro abort found" #f)
-				 #f))
-	((skip)
-	 (launch:einf-rollup-status-set! exit-info 6) ;; (vector-set! exit-info 3 4) ;; rollup-status
-	 ;; NB// test-set-status! does rdb calls under the hood
-	 (tests:test-set-status! run-id test-id next-state "SKIP" 
-				 (if (eq? this-step-status 'skip) "Logpro skip found" #f)
-				 #f))
-	((pass)
-	 (tests:test-set-status! run-id test-id next-state "PASS" #f #f))
-	(else ;; 'fail
-	 (launch:einf-rollup-status-set! exit-info 1) ;; (vector-set! exit-info 3 1) ;; force fail, this used to be next-state but that doesn't make sense. should always be "COMPLETED" 
-	 (tests:test-set-status! run-id test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f)
-	 )))
-    logpro-used))
-
 (define (launch:manage-steps run-id test-id item-path fullrunscript ezsteps subrun test-name tconfigreg exit-info m)
   ;; (let-values
   ;;  (((pid exit-status exit-code)
   ;;    (run-n-wait fullrunscript)))
   ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f)
@@ -349,11 +168,11 @@
                     (append (or ezstepslst '())
                             (list (list "subrun" (conc "{subrun=true} " mt-cmd)))))))
 
 	;; process the ezsteps
 	(if ezsteps
-	    (begin
+	    (let* ((all-steps-dat (make-hash-table))) ;; keep all the info around as stepname ==> alist; where  'params is the params list (add other stuff as needed)
 	      (if (not (common:file-exists? ".ezsteps"))(create-directory ".ezsteps"))
 	      ;; if ezsteps was defined then we are sure to have at least one step but check anyway
 	      (if (not (> (length ezstepslst) 0))
 		  (debug:print-error 0 *default-log-port* "ezsteps defined but ezstepslst is zero length")
 		  (let loop ((ezstep (car ezstepslst))
@@ -360,17 +179,19 @@
 			     (tal    (cdr ezstepslst))
 			     (prevstep #f))
                     (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"")
 		    ;; check exit-info (vector-ref exit-info 1)
 		    (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1)
-			(let ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig))
-			      (stepname    (car ezstep)))
+			(let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat))
+			       (stepname    (car ezstep))
+			       (stepparms   (hash-table-ref all-steps-dat stepname)))
 			  (setenv "MT_STEP_NAME" stepname)
+			  (pp (hash-table->alist all-steps-dat))
 			  ;; if logpro-used read in the stepname.dat file
 			  (if (and logpro-used (common:file-exists? (conc stepname ".dat")))
 			      (launch:load-logpro-dat run-id test-id stepname))
-			  (if (steprun-good? logpro-used (launch:einf-exit-code exit-info))
+			  (if (steprun-good? logpro-used (launch:einf-exit-code exit-info) stepparms)
 			      (if (not (null? tal))
 				  (loop (car tal) (cdr tal) stepname))
 			      (debug:print 0 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping")))
 			(debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))))))))
 
@@ -384,17 +205,19 @@
 			     (current-seconds) 
 			     start-seconds)))))
 	 (kill-tries 0))
     ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
     ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
-    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
+    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t)
 
     (let loop ((minutes   (calc-minutes))
 	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
 	       (disk-free (get-df (current-directory)))
                (last-sync (current-seconds)))
-      (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
+      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
+      ;; top of loop encountered at "(current-seconds)" with
+      ;; last-sync="last-sync))
       (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
              (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                    (delta (abs (- load cpu-load))))
                               (if (> delta 0.1) ;; don't bother updating with small changes
                                   load
@@ -412,33 +235,28 @@
              (test-info   (rmt:get-test-info-by-id run-id test-id))
              (state       (db:test-get-state test-info))
              (status      (db:test-get-status test-info))
              (kill-reason  "no kill reason specified")
              (kill-job?    #f))
-        (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
+        #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
         (cond
          ((test-get-kill-request run-id test-id)
           (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
           (set! kill-job? #t))
          ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
           (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
           (set! kill-job? #t))
          ((equal? status "DEAD")
-          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
+          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t)
           (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
           ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
           (set! kill-job? #f)))
 
         (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
         (launch:handle-zombie-tests run-id)
-        (when do-sync
-          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
-          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
-          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
-          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
-          (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
-        
+        (if do-sync ;; save meta data about the running of this test
+	    (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
 	(if kill-job? 
 	    (begin
               (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
 	      (mutex-lock! m)
 	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
@@ -453,11 +271,11 @@
 		       (lambda (pid)
 			 (handle-exceptions
 			  exn
 			  (begin
 			    (debug:print-info 0 *default-log-port* "Unable to kill process with pid " pid ", possibly already killed.")
-			    (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)))
+			    (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
 			  (debug:print 0 *default-log-port* "WARNING: Request received to kill job " pid) ;;  " (attempt # " kill-tries ")")
 			  (debug:print-info 0 *default-log-port* "Signal mask=" (signal-mask))
 			  ;; (if (process:alive? pid)
 			  ;;     (begin
 			  (map (lambda (pid-num)
@@ -465,13 +283,15 @@
 			       (process:get-sub-pids pid))
 			  (thread-sleep! 5)
 			  ;; (if (process:process-alive? pid)
 			  (map (lambda (pid-num)
 				 (handle-exceptions
-				  exn
-				  #f
-				  (process-signal pid-num signal/kill)))
+				     exn
+				   (begin
+				     (debug:print 0 *default-log-port* " .... had trouble sending kill to " pid-num ", exn=" exn)
+				     #f)
+				   (process-signal pid-num signal/kill)))
 			       (process:get-sub-pids pid))))
 		       ;;    (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive"))))
 		       pids)
                       ;; BB: question to Matt -- does the tests:test-state-status! encompass rollup to toplevel?  If not, should it?
 		      (tests:test-set-status! run-id test-id "KILLED"  "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt
@@ -489,11 +309,11 @@
 	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
 		  (loop (calc-minutes)
                         (or new-cpu-load cpu-load)
                         (or new-disk-free disk-free)
                         (if do-sync (current-seconds) last-sync)))))))
-    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
+    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional
 
 
 (define (launch:execute encoded-cmd)
   (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
 	 (tconfigreg #f))
@@ -642,11 +462,13 @@
 				  (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
 				  (exit))))
 		 (test-pid  (db:test-get-process_id  test-info)))
 	    (cond
              ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.
-	     ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
+	     ((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
+		  (and (equal? (db:test-get-state test-info) "COMPLETED")                           ;; completed/abort => rerun if asked
+		       (member (db:test-get-status test-info) '("ABORT"))))
 	      (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
 	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
 
               (rmt:general-call 'set-test-start-time #f test-id)
               (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
@@ -653,15 +475,17 @@
 	      ) ;; prime it for running
 	     ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART"))
 	      (if (process:alive-on-host? test-host test-pid)
 		  (debug:print-error 0 *default-log-port* "test state is "  (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed")
 		  (exit)))
+	     ((member (db:test-get-state test-info) '("COMPLETED"))  ;; we do NOT want to re-run COMPLETED jobs. Mark as NOT_STARTED to run!
+	      (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed")
+	      (exit))
 	     ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")))
 	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
               (rmt:general-call 'set-test-start-time #f test-id)
-	      (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
-	      )
+	      (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f))
 	     (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))
 	      (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed")
 	      (exit))))
 
           ;; cleanup prior execution's steps
@@ -1220,18 +1044,18 @@
 		    (begin
 		      (handle-exceptions
 			  exn
 			  (begin
 			    (debug:print-error 0 *default-log-port* "Something went wrong when trying to create linktree dir at " linktree)
-			    (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
+			    (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)
 			    (exit 1))
 			(create-directory linktree #t))))
 		(handle-exceptions
 		    exn
 		    (begin
 		      (debug:print-error 0 *default-log-port* "Something went wrong when trying to create link to linktree at " *toppath*)
-		      (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)))
+		      (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
 		  (let ((tlink (conc *toppath* "/lt")))
 		    (if (not (common:file-exists? tlink))
 			(create-symbolic-link linktree tlink)))))
 	      (begin
 		(debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config")
@@ -1292,11 +1116,15 @@
 		;;(exit 1)
                  (if (null? disks)
                      (cons 1 (conc *toppath* "/runs"))
                      (let ((paths (sort disks (lambda (x y) (> (string-length (cadr x)) (string-length (cadr y)))))))
                        (let loop ((head (car paths)) (tail (cdr paths)))
-                         (let ((result (handle-exceptions exn #f (create-directory (cadr head) #t))))
+                         (let ((result (handle-exceptions exn
+					 (begin
+					   (debug:print 0 *default-log-port* "failed to create dir " (cadr head) ", exn=" exn)
+					   #f)
+					 (create-directory (cadr head) #t))))
                            (if result
                                result
                                (if (null? tail)
                                    (cons 1 (conc *toppath* "/runs"))
                                    (loop (car tail) (cdr tail)))))))))))
@@ -1388,11 +1216,11 @@
       (let ((success (if (and (not (common:directory-exists? lnkbase))
 			      (not (common:file-exists? lnkbase)))
 			 (handle-exceptions
 			  exn
 			  (begin
-			    (debug:print-error 0 *default-log-port* "Problem creating linktree base at " lnkbase)
+			    (debug:print-error 0 *default-log-port* "Problem creating linktree base at " lnkbase ", exn=" exn)
 			    (print-error-message exn (current-error-port))
 			    #t)
 			  (create-directory lnkbase #t)
 			  #f))))
 	(if (and (not success)(> done 0))
@@ -1411,28 +1239,31 @@
 	(let ((iterated-parent  (pathname-directory (conc lnkpath "/" item-path))))
 	  (debug:print-info 2 *default-log-port* "Creating iterated parent " iterated-parent)
 	  (handle-exceptions
 	   exn
 	   (begin
-	     (debug:print-error 0 *default-log-port* " Failed to create directory " iterated-parent ((condition-property-accessor 'exn 'message) exn) ", continuing but link tree may be corrupted")
+	     (debug:print-error 0 *default-log-port* " Failed to create directory " iterated-parent ((condition-property-accessor 'exn 'message) exn)
+				", continuing but link tree may be corrupted, exn=" exn)
 	     #;(exit 1))
 	   (create-directory iterated-parent #t))))
 
     (if (symbolic-link? lnkpath) 
 	(handle-exceptions
 	 exn
 	 (begin
-	   (debug:print-error 0 *default-log-port* " Failed to remove symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", continuing but link tree may be corrupted.")
+	   (debug:print-error 0 *default-log-port* " Failed to remove symlink " lnkpath ((condition-property-accessor 'exn 'message) exn)
+			      ", continuing but link tree may be corrupted. exn=" exn)
 	   #;(exit 1))
 	 (delete-file lnkpath)))
 
     (if (not (or (common:file-exists? lnkpath)
 		 (symbolic-link? lnkpath)))
 	(handle-exceptions
 	 exn
 	 (begin
-	   (debug:print-error 0 *default-log-port* " Failed to create symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", continuing but link tree may be corrupted.")
+	   (debug:print-error 0 *default-log-port* " Failed to create symlink " lnkpath ((condition-property-accessor 'exn 'message) exn)
+			      ", continuing but link tree may be corrupted. exn=" exn)
 	   #;(exit 1))
 	 (create-symbolic-link toptest-path lnkpath)))
     
     ;; NB - This was not working right - some top tests are not getting the path set!!!
     ;;
@@ -1459,12 +1290,14 @@
 	  (if (or (not curr-test-path)
 		  (not (directory-exists? toptest-path)))
 	      (begin
 		(debug:print-info 2 *default-log-port* "Creating " toptest-path " and link " lnkpath)
 		(handle-exceptions
-		 exn
-		 #f ;; don't care to catch and deal with errors here for now.
+		    exn
+		  (begin
+		    (debug:print 0 *default-log-port* "failed to create directory " toptest-path ", exn=" exn)
+		    #f)
 		 (create-directory toptest-path #t))
 		(hash-table-set! *toptest-paths* testname toptest-path)))))
 
     ;; The toptest path has been created, the link to the test in the linktree has
     ;; been created. Now, if this is an iterated test the real test dir must be created
@@ -1473,22 +1306,23 @@
 	  (debug:print 2 *default-log-port* "Setting up sub test run area")
 	  (debug:print 2 *default-log-port* " - creating run area in " test-path)
 	  (handle-exceptions
 	   exn
 	   (begin
-	     (debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ", exiting")
-	     (exit 1))
+	     (debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn)
+				", continuing (might cause downstream issues?), exn=" exn)
+	     #f)
 	   (create-directory test-path #t))
 	  (debug:print 2 *default-log-port* 
 		       " - creating link from: " test-path "\n"
 		       "                   to: " lnktarget)
 
 	  ;; If there is already a symlink delete it and recreate it.
 	  (handle-exceptions
 	   exn
 	   (begin
-	     (debug:print-error 0 *default-log-port* " Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting")
+	     (debug:print-error 0 *default-log-port* " Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting, exn=" exn)
 	     (exit))
 	   (if (symbolic-link? lnktarget)     (delete-file lnktarget))
 	   (if (not (common:file-exists? lnktarget)) (create-symbolic-link test-path lnktarget)))))
 
     (if (not (directory? test-path))
@@ -1550,11 +1384,11 @@
 	;; 			      (thread-sleep! 1)
 	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time))))))
 	 (item-path       (item-list->path itemdat))
 	 (contour         #f)) ;; NOT READY FOR THIS (args:get-arg "-contour")))
     (let loop ((delta        (- (current-seconds) *last-launch*))
-	       (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1)))
+	       (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 0)))
       (if (> launch-delay delta)
 	  (begin
 	    (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay.
 		(debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds"))
 	    (thread-sleep! (- launch-delay delta))
@@ -1593,11 +1427,12 @@
 				(configf:lookup  *configdat* "setup" "runtimelim")))
 	   ;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to 
 	   ;;                allow running from dashboard. Extract the path
 	   ;;                from the called megatest and convert dashboard
 	   ;;             	  or dboard to megatest
-	   (local-megatest  (let* ((lm  (car (argv)))
+	   (local-megatest  (common:find-local-megatest))
+	   #;(local-megatest  (let* ((lm  (car (argv)))
 				   (dir (pathname-directory lm))
 				   (exe (pathname-strip-directory lm)))
 			      (conc (if dir (conc dir "/") "")
 				    (case (string->symbol exe)
 				      ((dboard)    "../megatest")
@@ -1716,21 +1551,21 @@
 	     (testprevvals   (alist->env-vars
 			      (hash-table-ref/default tconfig "pre-launch-env-overrides" '())))
 	     ;; Launchwait defaults to true, must override it to turn off wait
 	     (launchwait     (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t))
 	     (launch-results-prev (apply (if launchwait ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed.
-					process:cmd-run-with-stderr-and-exitcode->list
-					process-run)
-				    (if useshell
-					(let ((cmdstr (string-intersperse fullcmd " ")))
-					  (if launchwait
-					      cmdstr
-					      (conc cmdstr " >> mt_launch.log 2>&1 &")))
-					(car fullcmd))
-				    (if useshell
-					'()
-					(cdr fullcmd))))
+					     process:cmd-run-with-stderr-and-exitcode->list
+					     process-run)
+					 (if useshell
+					     (let ((cmdstr (string-intersperse fullcmd " ")))
+					       (if launchwait
+						   cmdstr
+						   (conc cmdstr " >> mt_launch.log 2>&1 &")))
+					     (car fullcmd))
+					 (if useshell
+					     '()
+					     (cdr fullcmd))))
              (success        (if launchwait (equal? 0 (cadr launch-results-prev)) #t))
              (launch-results (if launchwait (car launch-results-prev) launch-results-prev)))
         (if (not success)
             (tests:test-set-status! run-id test-id "COMPLETED" "DEAD" "launcher failed; exited non-zero; check mt_launch.log" #f)) ;; (if launch-results launch-results "FAILED"))
         (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork.