@@ -168,34 +168,41 @@
                     (append (or ezstepslst '())
                             (list (list "subrun" (conc "{subrun=true} " mt-cmd)))))))
 
 	;; process the ezsteps
 	(if ezsteps
-	    (let* ((all-steps-dat (make-hash-table))) ;; keep all the info around as stepname ==> alist; where  'params is the params list (add other stuff as needed)
+	    (let* ((envdbf        (conc "/tmp/."(current-user-name)"-"(current-process-id)"-"run-id"-"test-id".db"))
+		   (all-steps-dat (make-hash-table))) ;; keep all the info around as stepname ==> alist;
+	                                              ;;; where  'params is the params list (add other
+	                                              ;;; stuff as needed)
 	      (if (not (common:file-exists? ".ezsteps"))(create-directory ".ezsteps"))
 	      ;; if ezsteps was defined then we are sure to have at least one step but check anyway
 	      (if (not (> (length ezstepslst) 0))
 		  (debug:print-error 0 *default-log-port* "ezsteps defined but ezstepslst is zero length")
-		  (let loop ((ezstep (car ezstepslst))
-			     (tal    (cdr ezstepslst))
-			     (prevstep #f))
-                    (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"")
-		    ;; check exit-info (vector-ref exit-info 1)
-		    (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1)
-			(let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat))
-			       (stepname    (car ezstep))
-			       (stepparms   (hash-table-ref all-steps-dat stepname)))
-			  (setenv "MT_STEP_NAME" stepname)
-			  (pp (hash-table->alist all-steps-dat))
-			  ;; if logpro-used read in the stepname.dat file
-			  (if (and logpro-used (common:file-exists? (conc stepname ".dat")))
-			      (launch:load-logpro-dat run-id test-id stepname))
-			  (if (steprun-good? logpro-used (launch:einf-exit-code exit-info) stepparms)
-			      (if (not (null? tal))
-				  (loop (car tal) (cdr tal) stepname))
-			      (debug:print 0 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping")))
-			(debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))))))))
+		  (let ((all-step-names (map car ezstepslst)))
+		    (setenv "MT_STEP_NAMES" (string-intersperse all-step-names " "))
+		    (let loop ((ezstep (car ezstepslst))
+			       (tal    (cdr ezstepslst))
+			       (prevstep #f))
+		      (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"")
+		      ;; check exit-info (vector-ref exit-info 1)
+		      (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1)
+			  (let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m
+							      tal testconfig all-steps-dat prevstep envdbf))
+				 (stepname    (car ezstep))
+				 (stepparms   (hash-table-ref all-steps-dat stepname)))
+			    (setenv "MT_STEP_NAME" stepname)
+			    (pp (hash-table->alist all-steps-dat))
+			    ;; if logpro-used read in the stepname.dat file
+			    (if (and logpro-used (common:file-exists? (conc stepname ".dat")))
+				(launch:load-logpro-dat run-id test-id stepname))
+			    (if (steprun-good? logpro-used (launch:einf-exit-code exit-info) stepparms)
+				(if (not (null? tal))
+				    (loop (car tal) (cdr tal) stepname))
+				(debug:print 0 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping")))
+			  (debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep))
+		      ))))))))
 
 (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
   (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30")))
          (start-seconds (current-seconds))
 	 (calc-minutes  (lambda ()
@@ -205,17 +212,19 @@
 			     (current-seconds) 
 			     start-seconds)))))
 	 (kill-tries 0))
     ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
     ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
-    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
+    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t)
 
     (let loop ((minutes   (calc-minutes))
 	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
 	       (disk-free (get-df (current-directory)))
                (last-sync (current-seconds)))
-      (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
+      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
+      ;; top of loop encountered at "(current-seconds)" with
+      ;; last-sync="last-sync))
       (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
              (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                    (delta (abs (- load cpu-load))))
                               (if (> delta 0.1) ;; don't bother updating with small changes
                                   load
@@ -233,33 +242,28 @@
              (test-info   (rmt:get-test-info-by-id run-id test-id))
              (state       (db:test-get-state test-info))
              (status      (db:test-get-status test-info))
              (kill-reason  "no kill reason specified")
              (kill-job?    #f))
-        (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
+        #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
         (cond
          ((test-get-kill-request run-id test-id)
           (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
           (set! kill-job? #t))
          ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
           (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
           (set! kill-job? #t))
          ((equal? status "DEAD")
-          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
+          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t)
           (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
           ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
           (set! kill-job? #f)))
 
         (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
         (launch:handle-zombie-tests run-id)
-        (when do-sync
-          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
-          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
-          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
-          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
-          (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
-        
+        (if do-sync ;; save meta data about the running of this test
+	    (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
 	(if kill-job? 
 	    (begin
               (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
 	      (mutex-lock! m)
 	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
@@ -312,11 +316,11 @@
 	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
 		  (loop (calc-minutes)
                         (or new-cpu-load cpu-load)
                         (or new-disk-free disk-free)
                         (if do-sync (current-seconds) last-sync)))))))
-    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
+    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional
 
 
 (define (launch:execute encoded-cmd)
   (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
 	 (tconfigreg #f))
@@ -398,11 +402,10 @@
                                             ;; one more time, change to the work-area directory
                                             (change-directory work-area)))
 	       ) ;; let*
 
 	  (if contour (setenv "MT_CONTOUR" contour))
-	  
 	  ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ...
 	  ;;
 	  (setenv "MT_TESTSUITENAME" areaname)
 	  (setenv "MT_RUN_AREA_HOME" top-path)
 	  (set! *toppath* top-path)
@@ -465,10 +468,13 @@
 				  (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
 				  (exit))))
 		 (test-pid  (db:test-get-process_id  test-info)))
 	    (cond
              ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.
+	     ;;((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
+	     ;;	  (and (equal? (db:test-get-state test-info) "COMPLETED")                           ;; completed/abort => rerun if asked
+	     ;;	       (member (db:test-get-status test-info) '("ABORT"))))
 	     ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
 	      (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
 	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
 
               (rmt:general-call 'set-test-start-time #f test-id)
@@ -731,11 +737,11 @@
 ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
 ;; 0 RUNNING ==> this is actually the first condition, should not get here
 
 (define (launch:end-of-run-check run-id )
     (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
-           (running-cnt (rmt:get-count-tests-running-for-run-id run-id #f)) ;; fastmode=no
+           (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
            (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
            (current-state (rmt:get-run-state run-id))
            (current-status (rmt:get-run-status run-id)))
      ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
      (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)