Index: TODO
==================================================================
--- TODO
+++ TODO
@@ -16,10 +16,17 @@
 #     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 TODO
 ====
 
+WW38
+. Add test_rundat to no-sync ==> correction, put in <testdir>/.meta/test-run.dat
+. Add STATE/STATUS transitions to .meta/test-run.dat or similar
+. Swizzle update-test-rundat to operate on no-sync
+. Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync
+. On state/status change update tests table with duration
+
 WW15
 . fill newview matrix with data, filter pipeline gui elements
 . improve [script], especially indent handling
 
 WW16
@@ -35,11 +42,10 @@
 . break command line into sections; all, run control, queries, utilities etc.
 . pull in ftfplan (not integrated, just code pulled in)
 
 WW20
 . ./configure => ubuntu, sles11, sles12, rh7
-. Jenkins junit XML support
 . Add output flushing in teamcity support
 . Switch to using simple runs query everywhere
 . Add end_time to runs and add a rollup call that sets state, status and end_time
 
 Future

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -543,11 +543,11 @@
 		  (handle-exceptions
 		   exn
 		   #f
 		   (if (directory? fullname)
 		       (begin
-			 (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
+			 (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
 			 (inc-stat "directories"))
 		       (begin
 			 (delete-file* fullname)
 			 (inc-stat "deleted")))
 		   (hash-table-delete! all-files file)))))))
@@ -567,11 +567,11 @@
 			     (- num-logs max-allowed))))
 	    (for-each
 	     (lambda (file)
 	       (let* ((fullname (conc "logs/" file)))
 		 (if (directory? fullname)
-		     (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
+		     (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
 		     (handle-exceptions
 		      exn
 		      (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn)
 		      (delete-file* fullname)))))
 	     files)
@@ -1766,24 +1766,37 @@
 ;;      - load-in, load as from uptime, NOT normalized
 ;;      - numcpus, number of cpus, ideally use the real cpus, not threads
 ;;
 (define (common:get-delay load-in numcpus)
   (let* ((ratio (/ load-in numcpus))
-         (new-option (configf:lookup *configdat* "load" "new-load-method")))
+         (new-option (configf:lookup *configdat* "load" "new-load-method"))
+	 (paramstr   (or (configf:lookup *configdat* "load" "exp-params")
+			 "15 12 1281453987.9543 0.75")) ;; 5 4 10 1"))
+	 (paramlst   (map string->number (string-split paramstr))))
     (if new-option
-      (begin
-        (cond ((and (>= ratio 0) (< ratio .5))
-          0
-        )
-        ((and (>= ratio 0.5) (<= ratio .9))
-          (* ratio (/ 5 .9)))
-        ((and (> ratio .9) (<= ratio 1.1))
-          (+ 5 (* (- ratio .9) (/ 55 .2)))) 
-        ((> ratio 1.1)
-          60)))
-      (max (/ (expt 5 (* 4 ratio)) 10) 0))))
-
+	(begin
+          (cond ((and (>= ratio 0) (< ratio .5))
+		 0)
+		((and (>= ratio 0.5) (<= ratio .9))
+		 (* ratio (/ 5 .9)))
+		((and (> ratio .9) (<= ratio 1.1))
+		 (+ 5 (* (- ratio .9) (/ 55 .2)))) 
+		((> ratio 1.1)
+		 60)))
+	(match paramlst
+	  ((r1 r2 s1 s2)
+	   (debug:print 3 *default-log-port* "Using params r1=" r1 " r2=" r2 " s1=" s1 " s2=" s2)
+	   (min (max (/ (expt r1 (* r2 s2 ratio)) s1) 0) 30))
+	  (else
+	   (debug:print 0 *default-log-port* "BAD exp-params, should be \"r1 r2 s1 s2\" but got " paramstr)
+	   30)))))
+
+(define (common:print-delay-table)
+  (let loop ((x 0))
+    (print x "," (common:get-delay x 1))
+    (if (< x 2)
+	(loop (+ x 0.1)))))
 
 (define (get-cpu-load #!key (remote-host #f))
   (car (common:get-cpu-load remote-host)))
 ;;   (let* ((load-res (process:cmd-run->list "uptime"))
 ;; 	 (load-rx  (regexp "load average:\\s+(\\d+)"))
@@ -1800,30 +1813,44 @@
 ;; get values from cached info from dropping file in logs dir
 ;;  e.g. key is host and dtype is normalized-load
 ;;
 (define (common:get-cached-info key dtype #!key (age 10))
   (if *toppath*
-      (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))
+      (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))
+	     (delfile  (lambda ()
+			 (debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn)
+			 (delete-file* fullpath)
+			 #f)))
 	(if (and (file-exists? fullpath)
 		 (file-read-access? fullpath))
 	    (handle-exceptions
 		exn
 	      (begin
 		(debug:print 0 *default-log-port* "failed to get cached info from " fullpath ", exn=" exn)
 		#f)
 	     (debug:print 2 *default-log-port* "reading file " fullpath)
-	     (let ((real-age (- (current-seconds)(file-change-time fullpath)))) 
+	     (let ((real-age (- (current-seconds)
+				(handle-exceptions
+				    exn
+				  (begin
+				    (debug:print 1 *default-log-port* "Failed to read mod time on file "
+						 fullpath ", using 0, exn=" exn)
+				    0)
+				  (file-change-time fullpath)))))
 	       (if (< real-age age)
 		   (handle-exceptions
 		       exn
-		     (begin
-		       (debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn)
-		       (delete-file* fullpath)
-		       #f)
-		     (with-input-from-file fullpath read))
+		     (delfile)
+		     (let* ((res (with-input-from-file fullpath read)))
+		       (if (eof-object? res)
+			   (begin
+			     (delfile)
+			     #f)
+			   res)))
 		   (begin
-		     (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
+		     (debug:print-info 2 *default-log-port* "file " fullpath
+				       " is too old (" real-age" seconds) to trust, skipping reading it")
 		     #f))))
 	    (begin
 	      (debug:print 2 *default-log-port* "not reading file " fullpath)
 	      #f)))
       #f))
@@ -2089,33 +2116,38 @@
                      #f
                      (common:get-homehost)))
          (hh     (if hh-dat (car hh-dat) #f)))
     (common:wait-for-normalized-load maxnormload msg hh)))
 
+(define *numcpus-cache* (make-hash-table))
 (define (common:get-num-cpus remote-host)
   (let* ((actual-host (or remote-host (get-host-name))))
-    (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often!
-	(let* ((proc   (lambda ()
-			 (let loop ((numcpu 0)
-				    (inl    (read-line)))
-			   (if (eof-object? inl)
-			       (if (> numcpu 0)
-				   numcpu
-				   #f) ;; if zero return #f so caller knows that things are not working
-			       (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
-					 (+ numcpu 1)
-					 numcpu)
-				     (read-line))))))
-	       (result (if remote-host
-			   (with-input-from-pipe 
-			       (conc "ssh " remote-host " cat /proc/cpuinfo")
-			     proc)
-			   (with-input-from-file "/proc/cpuinfo" proc))))
-	  (if (and (number? result)
-		   (> result 0))
-	      (common:write-cached-info actual-host "num-cpus" result))
-	  result))))
+    ;; hosts had better not be changing the number of cpus too often!
+    (or (hash-table-ref/default *numcpus-cache* actual-host #f)
+	(let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600)))
+			    (let* ((proc   (lambda ()
+					     (let loop ((numcpu 0)
+							(inl    (read-line)))
+					       (if (eof-object? inl)
+						   (if (> numcpu 0)
+						       numcpu
+						       #f) ;; if zero return #f so caller knows that things are not working
+						   (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
+							     (+ numcpu 1)
+							     numcpu)
+							 (read-line))))))
+				   (result (if remote-host
+					       (with-input-from-pipe 
+						(conc "ssh " remote-host " cat /proc/cpuinfo")
+						proc)
+					       (with-input-from-file "/proc/cpuinfo" proc))))
+			      (if (and (number? result)
+				       (> result 0))
+				  (common:write-cached-info actual-host "num-cpus" result))
+			      result))))
+	  (hash-table-set! *numcpus-cache* actual-host numcpus)
+	  numcpus))))
 
 ;; wait for normalized cpu load to drop below maxload
 ;;
 (define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5))
   (let ((num-cpus (common:get-num-cpus remote-host)))
@@ -2155,11 +2187,14 @@
 	 (effective-host    (or remote-host "localhost"))
 	 (normalized-effective-load (/ effective-load numcpus))
 	 (will-wait                 (> normalized-effective-load maxnormload)))
     (if (> recommended-delay 0)
 	(let* ((actual-delay (min recommended-delay 30)))
-	  (debug:print-info 0 *default-log-port* "Load control, delaying " actual-delay " seconds to maintain safe load.")
+	  (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
+	      (debug:print-info 0 *default-log-port* "Load control, delaying "
+				actual-delay " seconds to maintain safe load. current normalized effective load is "
+				normalized-effective-load"."))
 	  (thread-sleep! actual-delay)))
     
     (cond
      ;; bad data, try again to get the data
      ((not will-wait)

Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -212,10 +212,12 @@
   (hash-table-set!
    (dboard:commondat-tabdats commondat)
    tabnum
    tabdat))
 
+(define *updater-running* #f) ;; move this into one of the stucts
+
 ;; gets and calls updater list based on curr-tab-num
 ;;
 (define (dboard:common-run-curr-updaters commondat #!key (tab-num #f))
   (if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat
       (let* ((tnum     (or tab-num (dboard:commondat-curr-tab-num commondat)))
@@ -240,11 +242,13 @@
 		     tnum
 		     (cons updater curr-updaters))))
 
 ;; data for each specific tab goes here
 ;;
-(defstruct dboard:tabdat 
+(defstruct dboard:tabdat
+  ;; flags
+  ((already-running #f)                  : boolean)     ;; the updater is already running. skip
   ;; runs
   ((allruns         '())                 : list)        ;; list of dboard:rundat records
   ((allruns-by-id    (make-hash-table))  : hash-table)  ;; hash of run-id -> dboard:rundat records
   ((done-runs       '())                 : list)        ;; list of runs already drawn
   ((not-done-runs   '())                 : list)        ;; list of runs not yet drawn
@@ -645,11 +649,11 @@
 ;;
 (define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals)
   (let* ((start-time   (current-seconds))
 	 (access-mode  (dboard:tabdat-access-mode tabdat))
          (num-to-get   (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get")
-                                           "200")))
+                                           "50"))) ;; was 200, which is fine in a normal run area.
 	 (states       (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat)))
 	 (statuses     (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat)))
          (do-not-use-db-file-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab
          (do-not-use-query-timestamps   #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab
 	 (sort-info    (get-curr-sort))
@@ -716,10 +720,11 @@
     (if got-all
 	(begin
 	  (dboard:rundat-last-update-set!     run-dat (- start-time 2))
 	  (dboard:rundat-run-data-offset-set! run-dat 0))
 	(begin
+	  ;;; (thread-sleep! 0.25) ;; give the rest of the gui some time to update. <-- this did NOT help
 	  (dboard:rundat-run-data-offset-set! run-dat
 					      (+ num-to-get (dboard:rundat-run-data-offset run-dat)))))
 
     (for-each 
      (lambda (tdat)
@@ -833,11 +838,13 @@
 ;;
 ;; create a virtual table of all the tests
 ;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") )
 ;;
 (define (dboard:update-rundat tabdat runnamepatt numruns testnamepatt keypatts)
-  (let* ((access-mode      (dboard:tabdat-access-mode tabdat))
+  (dboard:tabdat-already-running-set! tabdat #t)
+  (let* (;; (already-running  (dboard:tabdat-already-running tabdat))
+	 (access-mode      (dboard:tabdat-access-mode tabdat))
          (keys             (dboard:tabdat-keys tabdat)) ;; (db:dispatch-query access-mode rmt:get-keys db:get-keys)))
 	 (last-runs-update (- (dboard:tabdat-last-runs-update tabdat) 2))
          (allruns          (rmt:get-runs runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts))
          ;;(allruns-tree (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f))
          (allruns-tree    (rmt:get-runs-by-patt keys "%" #f #f #f #f 0)) ;; last-runs-update));;'("id" "runname")
@@ -901,27 +908,26 @@
 		  (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id)
 		  (hash-table-set!    (dboard:tabdat-allruns-by-id tabdat) run-id run-struct))
 	      (if (or (null? tal)
 		      (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update
 		  (begin
-		    (when (> elapsed-time 2)   
+		    #;(when (> elapsed-time 2)   
                       (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
                       (let* ((old-val (iup:attribute *tim* "TIME"))
                              (new-val (number->string (inexact->exact (floor (* 2  (string->number old-val)))))))
                         (if (< (string->number new-val) 5000)
-                            ((debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
-                            (iup:attribute-set! *tim* "TIME" new-val))))
-
-
-                      )
+                            (begin
+			      (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
+			      (iup:attribute-set! *tim* "TIME" new-val)))))
 		    (dboard:tabdat-allruns-set! tabdat new-res)
 		    maxtests)
 		  (if (> (dboard:rundat-run-data-offset run-struct) 0)
 		      (loop run tal new-res newmaxtests) ;; not done getting data for this run
 		      (loop (car tal)(cdr tal) new-res newmaxtests)))))))
     (dboard:tabdat-filters-changed-set! tabdat #f)
-    (dboard:update-tree tabdat runs-hash header tb)))
+    (dboard:update-tree tabdat runs-hash header tb)
+    (dboard:tabdat-already-running-set! tabdat #f)))
 
 (define *collapsed* (make-hash-table))
 
 (define (toggle-hide lnum uidat) ; fulltestname)
   (let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum))
@@ -2505,14 +2511,10 @@
 
         ;; insert extra widget here
         (if extra-widget
             extra-widget
             (iup:hbox)) ;; empty widget
-
-        
-
-        
         )))
 
      (let* ((status-toggles (map (lambda (status)
 				   (iup:toggle (conc status)
 					       #:fontsize 8 ;; btn-fontsz ;; "10"
@@ -3725,28 +3727,36 @@
 ;; (define (tabdat-values tabdat)
 
 ;; runs update-rundat using the various filters from the gui
 ;;
 (define (dashboard:do-update-rundat tabdat)
+  ;; this seems like a good place to check for already running and skip if so
+  ;;
+  ;; (set! *updater-running* #t)
+;;(if (dboard:tabdat-already-running tabdat)
+;;    (begin
+;;	(debug:print-info 0 *default-log-port* "Dashboard overloaded - updates will be slow, skipping update.")
+;;	(dboard:tabdat-target tabdat))
   (dboard:update-rundat
    tabdat
    (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%")
    (dboard:tabdat-numruns tabdat)
    (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%")
    ;; generate key patterns from the target stored in tabdat
    (let* ((dbkeys (dboard:tabdat-dbkeys tabdat)))
      (let ((fres   (if (dboard:tabdat-target tabdat)
-                       (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
-                         (map (lambda (k v)(list k v)) dbkeys ptparts))
-                       (let ((res '()))
-                         (for-each (lambda (key)
-                                     (if (not (equal? key "runname"))
-                                         (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
-                                           (if val (set! res (cons (list key val) res))))))
-                                   dbkeys)
-                         res))))
-       fres))))
+		       (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
+			 (map (lambda (k v)(list k v)) dbkeys ptparts))
+		       (let ((res '()))
+			 (for-each (lambda (key)
+				     (if (not (equal? key "runname"))
+					 (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
+					   (if val (set! res (cons (list key val) res))))))
+				   dbkeys)
+			 res))))
+       fres)))
+  #;(set! *updater-running* #f))
 
 (define (dashboard:runs-tab-updater commondat tab-num)
   (debug:catch-and-dump 
    (lambda ()
      (let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num))
@@ -3801,22 +3811,25 @@
 	   (dashboard:runs-tab-updater commondat 1))
 	 tab-num: 2)
 	(iup:callback-set! *tim*
 			   "ACTION_CB"
 			   (lambda (time-obj)
-			     (let ((update-is-running #f))
-			     (mutex-lock! (dboard:commondat-update-mutex commondat))
-			     (set! update-is-running (dboard:commondat-updating commondat))
-			     (if (not update-is-running)
-			     (dboard:commondat-updating-set! commondat #t))
-			     (mutex-unlock! (dboard:commondat-update-mutex commondat))
-			     (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
-			     (begin
-			     (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
-			     (mutex-lock! (dboard:commondat-update-mutex commondat))
-			     (dboard:commondat-updating-set! commondat #f)
-			     (mutex-unlock! (dboard:commondat-update-mutex commondat)))
+			     (if (not *updater-running*)
+				 (begin
+				   ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
+				   ;; (set! update-is-running (dboard:commondat-updating commondat))
+				   ;;(if (not update-is-running)
+				   ;;	   (dboard:commondat-updating-set! commondat #t))
+				   ;;(mutex-unlock! (dboard:commondat-update-mutex commondat))
+				   ;;(if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
+				   ;;	   (begin
+				   (set! *updater-running* #t)
+				   (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
+				   (set! *updater-running* #f)
+				   ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
+				   ;;  (dboard:commondat-updating-set! commondat #f)
+				   ;;   (mutex-unlock! (dboard:commondat-update-mutex commondat)))
 				   ))
 			     1))))
       
       (let ((th1 (make-thread (lambda ()
 				(thread-sleep! 1)

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -459,11 +459,11 @@
 	  (print "Attempt to safely close sqlite3 db failed. Trying again. exn=" exn)
 	  (thread-sleep! 3)
 	  (sqlite3:interrupt! db)
 	  (db:safely-close-sqlite3-db db stmtcache try-num: (- try-num 1)))
 	(if (sqlite3:database? db)
-	    (let* ((stmts (hash-table-ref/default stmt-cache db #f)))
+	    (let* ((stmts (and stmt-cache (hash-table-ref/default stmt-cache db #f))))
 	      (if stmts (map sqlite3:finalize! (hash-table-values stmts)))
 	      (sqlite3:finalize! db)
 	      #t)
 	    #f))))
 
@@ -826,16 +826,17 @@
               )
             
 	    ;; first pass implementation, just insert all changed rows
 	    (for-each 
 	     (lambda (targdb)
-	       (let* ((db     (db:dbdat-get-db targdb))
-                      (drp-trigger (if (member "last_update" field-names)
-                                      (db:drop-trigger db tablename) 
-                                       #f))
-                       (is-trigger-dropped (if (member "last_update" field-names)
-                                              (db:is-trigger-dropped db tablename) #f)) 
+	       (let* ((db                 (db:dbdat-get-db targdb))
+                      (drp-trigger        (if (member "last_update" field-names)
+					      (db:drop-trigger db tablename) 
+					      #f))
+                      (is-trigger-dropped (if (member "last_update" field-names)
+                                              (db:is-trigger-dropped db tablename)
+					      #f)) 
 		      (stmth  (sqlite3:prepare db full-ins)))
 		 ;; (db:delay-if-busy targdb) ;; NO WAITING
                  (if (member "last_update" field-names)
                      (debug:print-info 3 *default-log-port* "is-trigger-dropped: " is-trigger-dropped)) 
 		 (for-each
@@ -1224,29 +1225,29 @@
                              FOR EACH ROW
                                BEGIN 
                                  UPDATE runs SET last_update=(strftime('%s','now'))
                                    WHERE id=old.id;
                                END;" ) 
-       (list "update_run_stats_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_run_stats_trigger AFTER UPDATE ON run_stats
+	   (list "update_run_stats_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_run_stats_trigger AFTER UPDATE ON run_stats
                              FOR EACH ROW
                                BEGIN 
                                  UPDATE run_stats SET last_update=(strftime('%s','now'))
                                    WHERE id=old.id;
                                END;" )
-       (list "update_tests_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_tests_trigger AFTER UPDATE ON tests
+	   (list "update_tests_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_tests_trigger AFTER UPDATE ON tests
                              FOR EACH ROW
                                BEGIN 
                                  UPDATE tests SET last_update=(strftime('%s','now'))
                                    WHERE id=old.id;
                                END;" )
-       (list "update_teststeps_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_teststeps_trigger AFTER UPDATE ON test_steps
+	   (list "update_teststeps_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_teststeps_trigger AFTER UPDATE ON test_steps
                              FOR EACH ROW
                                BEGIN 
                                  UPDATE test_steps SET last_update=(strftime('%s','now'))
                                    WHERE id=old.id;
                                END;" )
-       (list "update_test_data_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_test_data_trigger AFTER UPDATE ON test_data
+	   (list "update_test_data_trigger"  "CREATE TRIGGER  IF NOT EXISTS update_test_data_trigger AFTER UPDATE ON test_data
                              FOR EACH ROW
                                BEGIN 
                                  UPDATE test_data SET last_update=(strftime('%s','now'))
                                    WHERE id=old.id;
                                END;" )))
@@ -1261,40 +1262,43 @@
     (for-each (lambda (key)
               (sqlite3:execute db (cadr key)))
           db:trigger-list))
 
 (define (db:drop-all-triggers dbstruct)
-(db:with-db
+  (db:with-db
    dbstruct #f #f
    (lambda (db)
-(db:drop-triggers db))))
+     (db:drop-triggers db))))
 
 (define (db:is-trigger-dropped db tbl-name)
-     (let* ((trigger-name (if (equal? tbl-name "test_steps")
-				"update_teststeps_trigger" 
-                                (conc "update_" tbl-name "_trigger"))))
-            (sqlite3:for-each-row
-		   (lambda (name)
-                     ;(print name)
-		     (set! res (vector name)))
-		   db 
-		   "select name  from sqlite_master where type = 'trigger' ;" 
-		   )))
+  (let* ((trigger-name (if (equal? tbl-name "test_steps")
+			   "update_teststeps_trigger" 
+                           (conc "update_" tbl-name "_trigger")))
+	 (res          #f))
+    (sqlite3:for-each-row
+     (lambda (name)
+       (if (equal? name trigger-name)
+	   (set! res #t)))
+     db 
+     "SELECT name FROM sqlite_master WHERE type = 'trigger' ;" 
+     )))
 
 (define (db:drop-triggers db)
-  (for-each (lambda (key) 
-              (sqlite3:execute db (conc "drop trigger " (car key))))
-          db:trigger-list))
+  (for-each
+   (lambda (key) 
+     (sqlite3:execute db (conc "drop trigger if exists " (car key))))
+   db:trigger-list))
 
 (define  (db:drop-trigger db tbl-name)
-      (let* ((trigger-name (if (equal? tbl-name "test_steps")
-				"update_teststeps_trigger" 
-                                (conc "update_" tbl-name "_trigger"))))
-       (for-each (lambda (key) 
-             (if (equal? (car key) trigger-name)
-             (sqlite3:execute db (conc "drop trigger " trigger-name))))
-      db:trigger-list)))
+  (let* ((trigger-name (if (equal? tbl-name "test_steps")
+			   "update_teststeps_trigger" 
+                           (conc "update_" tbl-name "_trigger"))))
+    (for-each
+     (lambda (key) 
+       (if (equal? (car key) trigger-name)
+           (sqlite3:execute db (conc "drop trigger if exists " trigger-name))))
+     db:trigger-list)))
 
 (define  (db:create-trigger db tbl-name)
       (let* ((trigger-name (if (equal? tbl-name "test_steps")
                               "update_teststeps_trigger" 
                               (conc "update_" tbl-name "_trigger"))))
@@ -1530,11 +1534,11 @@
                               state        TEXT DEFAULT 'new',
                               status       TEXT DEFAULT 'n/a',
                               archive_type TEXT DEFAULT 'bup',
                               du           INTEGER,
                               archive_path TEXT);")))
-         (print "creating trigges from init") 
+         ;; (print "creating trigges from init") 
         (db:create-triggers db)    
      db)) ;; )
 
 ;;======================================================================
 ;; A R C H I V E S
@@ -1772,15 +1776,33 @@
           #f
           )
         (with-input-from-file infile read-lines)
 	)))
 
-;;  select end_time-now from
-;;      (select testname,item_path,event_time+run_duration as
-;;                          end_time,strftime('%s','now') as now from tests where state in
-;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));
-
+;; check duration against test-run.dat file if it exists and update the value in
+;; the db if necessary
+;;
+(define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)
+  (let* ((datf             (conc run-dir ".mt_data/test-run.dat"))
+	 (modt             (if (and (file-exists? datf)
+				    (file-read-access? datf))
+			       (file-modification-time datf)
+			       #f)) ;; (+ event-time run-duration))))
+	 (alt-run-duration (if modt
+			       (- modt event-time)
+			       #f)))
+    (if (and alt-run-duration
+	     (> alt-run-duration run-duration))
+	(begin
+	  (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration)
+	  (db:with-db
+	   dbstruct #f #f
+	   (lambda (db)
+	     (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id)
+	     #t)))
+	#f))) ;; #f = we did NOT adjust the time
+	      
 (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
   (let* ((incompleted '())
 	 (oldlaunched '())
 	 (toplevels   '())
           ;; The default running-deadtime is 720 seconds = 12 minutes.
@@ -1824,37 +1846,39 @@
 	 ;;                     (db:test-get-run_duration testdat)))
 	 ;;                    600) 
 	 ;; (db:delay-if-busy dbdat)
 	 (sqlite3:for-each-row 
 	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
-	    (if (and (equal? uname "n/a")
-		     (equal? item-path "")) ;; this is a toplevel test
-		;; what to do with toplevel? call rollup?
-		(begin
-		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
-		(begin
-		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
-		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
-				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
-				    " event-time="event-time" run-duration="run-duration))))
+	    (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
+		(if (and (equal? uname "n/a")
+			 (equal? item-path "")) ;; this is a toplevel test
+		    ;; what to do with toplevel? call rollup?
+		    (begin
+		      (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+		      (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+		    (begin
+		      (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
+		      (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
+					test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
+					" event-time="event-time" run-duration="run-duration)))))
 	  stmth1
 	  run-id running-deadtime) ;; default time 720 seconds
-       
+	    
 	 (sqlite3:for-each-row 
 	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
-	    (if (and (equal? uname "n/a")
-		     (equal? item-path "")) ;; this is a toplevel test
-		;; what to do with toplevel? call rollup?
-		(begin
-		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
-		(begin
-		  (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
-				    " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
-				    " run-duration="run-duration)
-		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
+	    (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
+		(if (and (equal? uname "n/a")
+			 (equal? item-path "")) ;; this is a toplevel test
+		    ;; what to do with toplevel? call rollup?
+		    (begin
+		      (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+		      (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+		    (begin
+		      (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
+					" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
+					" run-duration="run-duration)
+		      (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))))
 	  stmth2
 	  run-id remotehoststart-deadtime) ;; default time 230 seconds
 	 
 	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
 	 ;;
@@ -2180,12 +2204,12 @@
           (if newres
               newres
               res))
         res)))
 
-(define (db:no-sync-close-db db stmtcache)
-  (db:safely-close-sqlite3-db db stmtcache))
+(define (db:no-sync-close-db db stmt-cache)
+  (db:safely-close-sqlite3-db db stmt-cache))
 
 ;; transaction protected lock aquisition
 ;; either:
 ;;    fails    returns  (#f . lock-creation-time)
 ;;    succeeds (returns (#t . lock-creation-time)
@@ -2196,13 +2220,14 @@
     (sqlite3:with-transaction
      db
      (lambda ()
        (handle-exceptions
 	   exn
-	   (let ((lock-time (current-seconds)))
-	     (sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time)
-	     `(#t . ,lock-time))
+	 (let ((lock-time (current-seconds)))
+	   (debug:print-info 2 *default-log-port* "db:no-sync-get-lock keyname=" keyname ", lock-time=" lock-time ", exn=" exn)
+	   (sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time)
+	   `(#t . ,lock-time))
 	 `(#f . ,(sqlite3:first-result db "SELECT val FROM no_sync_metadat WHERE var=?;" keyname)))))))
 
 
 
 ;; use a global for some primitive caching, it is just silly to
@@ -2238,11 +2263,12 @@
                  (n   0))
         (if (equal? hed field)
             (handle-exceptions
              exn
              (begin
-               (debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row=" row " header=" header " field=" field ", exn=" exn)
+               (debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row="
+			    row " header=" header " field=" field ", exn=" exn)
                #f)
              (vector-ref row n))
 	    (if (null? tal) #f (loop (car tal)(cdr tal)(+ n 1)))))))
 
 ;; Accessors for the header/data structure
@@ -3461,11 +3487,11 @@
      (lambda (run-id)
        (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
 	 (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs)))
      run-ids)))
 
-;; Get test data using test_id, run-id is not used
+;; Get test data using test_id, run-id is not used - but it will be!
 ;; 
 (define (db:get-test-info-by-id dbstruct run-id test-id)
   (db:with-db
    dbstruct
    #f ;; run-id
@@ -4479,11 +4505,11 @@
 		 (db     (db:dbdat-get-db   dbdat)) ;; we'll return this so (db:delay--if-busy can be called inline
 		 (dbfj   (conc dbpath "-journal")))
 	    (if (handle-exceptions
 		 exn
 		 (begin
-		   (debug:print-info 0 *default-log-port* "WARNING: failed to test for existance of " dbfj)
+		   (debug:print-info 0 *default-log-port* "WARNING: failed to test for existance of " dbfj ", exn=" exn)
 		   (thread-sleep! 1)
 		   (db:delay-if-busy count (- count 1))) 
 		 (common:file-exists? dbfj))
 		(case count
 		  ((6)

Index: http-transport.scm
==================================================================
--- http-transport.scm
+++ http-transport.scm
@@ -566,17 +566,17 @@
                                   (debug:print 0 *default-log-port* msg)
                                   (if (common:file-exists? full-serv-fname)
                                       (system (conc "sleep 1;mv -f " full-serv-fname " " new-serv-fname))
                                       (debug:print 0 *default-log-port* "INFO: cannot move " full-serv-fname " to " new-serv-fname))
                                   (exit)))))
-    (if (and (not start-time-old) ;; last server start try was less than five seconds ago
+    #;(if (and (not start-time-old) ;; last server start try was less than five seconds ago
 	     (not server-starting))
 	(begin
 	  (cleanup-proc "NOT starting server, there is either a recently started server or a server in process of starting")
 	  (exit)))
     ;; lets not even bother to start if there are already three or more server files ready to go
-    (let* ((num-alive   (server:get-num-alive (server:get-list *toppath*))))
+    #;(let* ((num-alive   (server:get-num-alive (server:get-list *toppath*))))
       (if (> num-alive 3)
           (begin
             (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up"))
             (exit))))
   (common:save-pkt `((action . start)

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -205,17 +205,19 @@
 			     (current-seconds) 
 			     start-seconds)))))
 	 (kill-tries 0))
     ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
     ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
-    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
+    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t)
 
     (let loop ((minutes   (calc-minutes))
 	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
 	       (disk-free (get-df (current-directory)))
                (last-sync (current-seconds)))
-      (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
+      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
+      ;; top of loop encountered at "(current-seconds)" with
+      ;; last-sync="last-sync))
       (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
              (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                    (delta (abs (- load cpu-load))))
                               (if (> delta 0.1) ;; don't bother updating with small changes
                                   load
@@ -233,33 +235,28 @@
              (test-info   (rmt:get-test-info-by-id run-id test-id))
              (state       (db:test-get-state test-info))
              (status      (db:test-get-status test-info))
              (kill-reason  "no kill reason specified")
              (kill-job?    #f))
-        (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
+        #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
         (cond
          ((test-get-kill-request run-id test-id)
           (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
           (set! kill-job? #t))
          ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
           (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
           (set! kill-job? #t))
          ((equal? status "DEAD")
-          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
+          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t)
           (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
           ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
           (set! kill-job? #f)))
 
         (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
         (launch:handle-zombie-tests run-id)
-        (when do-sync
-          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
-          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
-          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
-          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
-          (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
-        
+        (if do-sync ;; save meta data about the running of this test
+	    (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
 	(if kill-job? 
 	    (begin
               (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
 	      (mutex-lock! m)
 	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
@@ -312,11 +309,11 @@
 	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
 		  (loop (calc-minutes)
                         (or new-cpu-load cpu-load)
                         (or new-disk-free disk-free)
                         (if do-sync (current-seconds) last-sync)))))))
-    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
+    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional
 
 
 (define (launch:execute encoded-cmd)
   (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
 	 (tconfigreg #f))
@@ -1304,12 +1301,12 @@
 	  (debug:print 2 *default-log-port* " - creating run area in " test-path)
 	  (handle-exceptions
 	   exn
 	   (begin
 	     (debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn)
-				", exiting, exn=" exn)
-	     (exit 1))
+				", continuing (might cause downstream issues?), exn=" exn)
+	     #f)
 	   (create-directory test-path #t))
 	  (debug:print 2 *default-log-port* 
 		       " - creating link from: " test-path "\n"
 		       "                   to: " lnktarget)
 

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -22,10 +22,12 @@
 
 (declare (unit rmt))
 (declare (uses api))
 (declare (uses http-transport))
 (include "common_records.scm")
+(include "db_records.scm")
+
 ;; (declare (uses rmtmod))
 
 ;; (import rmtmod)
 
 ;;
@@ -525,15 +527,24 @@
   (rmt:general-call 'register-test run-id run-id test-name item-path))
 
 (define (rmt:get-test-id run-id testname item-path)
   (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))
 
-;; run-id is NOT used
+;; run-id is NOT used - but it will be! 
 ;;
 (define (rmt:get-test-info-by-id run-id test-id)
   (if (number? test-id)
-      (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
+      (let* ((testdat  (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)))
+	     (trundatf (conc (db:test-get-rundir testdat) "/.mt_data/test-run.dat")))
+	;; now we can update a couple fields from the filesystem
+	(if (and (db:test-get-rundir testdat)
+		 (file-exists? trundatf))
+	    (let* ((duration   (db:test-get-run_duration testdat))
+		   (event-time (db:test-get-event_time   testdat))
+		   (last-touch (file-modification-time trundatf)))
+	      (db:test-set-run_duration! testdat (max duration (- last-touch event-time)))))
+	testdat)
       (begin
 	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
 	(print-call-chain (current-error-port))
 	#f)))
 

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -16,11 +16,12 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;  strftime('%m/%d/%Y %H:%M:%S','now','localtime')
 
 (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) 
-     posix-extras directory-utils pathname-expand typed-records format  sxml-serializer sxml-modifications)
+     posix-extras directory-utils pathname-expand typed-records format  sxml-serializer
+     sxml-modifications matchable)
 
 (declare (unit runs))
 (declare (uses db))
 (declare (uses common))
 (declare (uses items))
@@ -45,14 +46,19 @@
 (defstruct runs:dat
   reglen regfull
   runname max-concurrent-jobs run-id
   test-patts required-tests test-registry
   registry-mutex flags keyvals run-info all-tests-registry
-  can-run-more-tests
+  ;; stores results from last runs:can-run-more-tests 
+  (can-run-more-tests      #f) ;; (list can-run-more-flag num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)
   ((can-run-more-tests-count 0) : fixnum)
   (last-fuel-check         0)  ;; time when we last checked fuel
   (beginning-of-time       (current-seconds))
+  (load-mgmt-function      #f)
+  (wait-for-jobs-function  #f)
+  (last-load-check-time    0)
+  (last-jobs-check-time    0)
   )
 
 (defstruct runs:testdat
   hed tal reg reruns  test-record
   test-name item-path jobgroup
@@ -89,11 +95,11 @@
 				    lock-files)))
 	  (if fresh-locks
 	      (begin
 		(if (runs:lownoise "runners-softlock-wait" 360)
 		    (debug:print-info 0 *default-log-port* "Other runners in flight, giving up some time..."))
-		(thread-sleep! 10))
+		(thread-sleep! 2))
 	      (begin
 		(if (runs:lownoise "runners-softlock-nowait" 360)
 		    (debug:print-info 0 *default-log-port* "No runners in flight, updating softlock"))
 		(let* ((ouf (open-output-file my-lock-file)))
 		  (with-output-to-port ouf
@@ -313,25 +319,12 @@
   ;; Take advantage of a good place to exit if running the one-pass methodology
   (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
 	   (args:get-arg "-one-pass"))
       (exit 0))
 
-  (thread-sleep! (cond ;; BB: check with Matt.  Should this sleep move
-		       ;; to cond clauses below where we determine we
-		       ;; have too many jobs running rather than each
-		       ;; time the and condition above is true (which
-		  ;; seems like always)?
-		  ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way
-		   0)
-        	  ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
-		   (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
-		   10)  ;; obviously haven't had any work to do for a while
-		  (else 0)))
-;;		   ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
-;;		   (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)
-;;		   )))
-  
+  (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+
   (let* ((num-running             (rmt:get-count-tests-running run-id #f)) ;; fastmode=no
 	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
 	 (job-group-limit         (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
 				    (if (string? jobg-count)
 					(string->number jobg-count)
@@ -539,11 +532,11 @@
 
     ;; register this run in monitor.db
     (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params)
     (rmt:tasks-set-state-given-param-key task-key "running")
 
-    (common:telemetry-log "run-tests"
+    #;(common:telemetry-log "run-tests"
                           payload:
                           `( (target . ,target)
                              (run-name . ,runname)
                              (test-patts . ,test-patts) ) )
 
@@ -991,13 +984,17 @@
 	   (null? non-completed))
      (debug:print-info 4 *default-log-port* "cond branch - "  "ei-4")
       (if  (runs:can-keep-running? hed 20)
 	  (begin
 	    (runs:inc-cant-run-tests hed)
-	    (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; 
+	    (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;; 
 	    ;; getting here likely means the system is way overloaded, kill a full minute before continuing
-	    (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
+	    ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!!
+	    ;; No runsdat, can't do this yet
+	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+	    ;;
+	    (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
 	    ;; num-retries code was here
 	    ;; we use this opportunity to move contents of reg to tal
 	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
 	  (begin
 	    (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
@@ -1123,11 +1120,29 @@
 				  (conc " WARNING: t is not a vector=" t )))
 			    prereqs-not-met)
 		       ", ") ") fails: " fails
 		       "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f))
 			    
-
+    ;; well, first lets see if cpu load throttling is enabled. If so wait around until the
+    ;; average cpu load is under the threshold before continuing
+    ;;
+    (if (runs:dat-load-mgmt-function runsdat)
+	((runs:dat-load-mgmt-function runsdat))
+	(runs:dat-load-mgmt-function-set!
+	 runsdat
+	 (lambda ()
+	   ;; jobtools maxload is useful for where the full Megatest run is done on one machine
+	   (if (and (not (common:on-homehost?))
+		    maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
+	       (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
+	   
+	   ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
+	   (if maxhomehostload
+	       (common:wait-for-homehost-load maxhomehostload
+					      (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))))))
+    
+ 
     
     (if (and (not (null? prereqs-not-met))
 	     (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
 	(debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", ")))
 
@@ -1212,29 +1227,17 @@
       ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path))
       ;; we are going to reset all the counters for test retries by setting a new hash table
       ;; this means they will increment only when nothing can be run
       (set! *max-tries-hash* (make-hash-table))
       
-      ;; well, first lets see if cpu load throttling is enabled. If so wait around until the
-      ;; average cpu load is under the threshold before continuing
-
-      ;; jobtools maxload is useful for where the full Megatest run is done on one machine
-      (if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
-          (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
-
-      ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
-      (if maxhomehostload
-          (common:wait-for-homehost-load maxhomehostload
-					 (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))
-      
-      (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry)
+      (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat)
       (runs:incremental-print-results run-id)
       (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running)
       (runs:shrink-can-run-more-tests-count runsdat)  ;; DELAY TWEAKER (still needed?)
       ;; (thread-sleep! *global-delta*)
       (if (or (not (null? tal))(not (null? reg)))
-	  (runs:loop-values tal reg reglen regfull reruns)
+	  (runs:loop-values tal reg reglen regfull reruns) ;; hed should be dropped at this time
 	  #f))
      
      ;; must be we have unmet prerequisites
      ;;
      (else
@@ -1251,19 +1254,18 @@
 	      (member 'toplevel testmode))
 	  (begin
 	    ;; couldn't run, take a breather
 	    (if  (runs:lownoise "Waiting for more work to do..." 60)
 		 (debug:print-info 0 *default-log-port* "Waiting for more work to do..."))
-	    (thread-sleep! 1)
+	    
+	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+	    (thread-sleep! 5)
 	    (list (car newtal)(cdr newtal) reg reruns))
 	  ;; the waiton is FAIL so no point in trying to run hed ever again
 	  (begin
             (let ((my-test-id (rmt:get-test-id run-id test-name item-path)))
-              (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2"))
-
-
-            
+              (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2"))            
             (if (or (not (null? reg))(not (null? tal)))
                 (if (vector? hed)
 		  (begin
 		    (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path
 				 " from the launch list as it has prerequistes that are FAIL")
@@ -1581,19 +1583,10 @@
 			   itemmaps:    itemmaps
 			   ;; prereqs-not-met: prereqs-not-met
 			   )))
 	(runs:dat-regfull-set! runsdat regfull)
     
-        ;; -- removed BB 17ww28 - no longer needed.
-	;; every 15 minutes verify the server is there for this run
-	;; (if (and (common:low-noise-print 240 "try start server"  run-id)
-	;; 	 (not (or (and *runremote*
-	;; 		       (remote-server-url *runremote*)
-	;; 		       (server:ping (remote-server-url *runremote*)))
-	;; 		  (server:check-if-running *toppath*))))
-	;;     (server:kind-run *toppath*))
-	
 	(if (> num-running 0)
             (set! last-time-some-running (current-seconds)))
 
         (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
             (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
@@ -1636,11 +1629,11 @@
 		     "\n  regfull:     " regfull
 		     "\n  reglen:      " reglen
 		     "\n  length reg:  " (length reg)
                      )
 
-	(runs:parallel-runners-mgmt runsdat)
+	;; (runs:parallel-runners-mgmt runsdat)
 
 	;; check for hed in waitons => this would be circular, remove it and issue an
 	;; error
 	(if (member test-name waitons)
 	    (begin
@@ -1672,19 +1665,67 @@
           (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-2")
 	  (debug:print-info 4 *default-log-port* "OUTER COND: (not items)")
 	  (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
 		   (not (null? tal)))
 	      (loop (car tal)(cdr tal) reg reruns))
+	  ;;(runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
+
+	  ;; gonna try a strategy change here.
+	  ;;
+	  ;; check if can run more tests. if yes, continue, if no, rest until can run more
+	  ;; look at the test jobgroup and tot jobs running
+	  ;;
+	  ;; NOTE: This does NOT actually gate here, only captures the proc to be called later
+	  ;; 
+	  (if (not (runs:dat-wait-for-jobs-function runsdat))
+	      (runs:dat-wait-for-jobs-function-set!
+	       runsdat 
+	       (lambda (testdat-in)
+		 (let* ((jobgroup              (runs:testdat-jobgroup testdat-in))
+			(can-run-more-tests    (runs:dat-can-run-more-tests runsdat))
+			(last-jobs-check-time  (runs:dat-last-jobs-check-time runsdat))
+			(should-check-jobs     (match can-run-more-tests
+						 ((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params)
+						  (if (< (- max-concurrent-jobs num-running) 25)
+						      (begin
+							(debug:print-info 0 *default-log-port*
+									  "less than 20 jobs headroom, ("max-concurrent-jobs
+									  "-"num-running")>20. Forcing prelaunch check.")
+							#t)
+						      #f))
+						 (else #f)))) ;; no record yet
+		   (if should-check-jobs
+		       (let loop-can-run-more
+			   ((res      (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
+			    (remtries 1440)) ;; we can wait for up to two hours for jobs to get done
+			 (match res
+			   ((run-more num-running . rem)
+			    (if (or run-more
+				    (< remtries 1))
+				(begin
+				  (if (runs:lownoise "num-running" 30)
+				      (debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs))
+				  (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through
+				(begin
+				  (if (runs:lownoise "num-running" 10)
+				      (debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of "
+							max-concurrent-jobs " allowed."))
+				  (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable
+				  
+				  ;; wait for load here
+				  (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+				  (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
+						     (- remtries 1)))))))
+		       )))))
+
+	  ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed
 	  (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
 
-	  ;; This would be a good place to block on homehost load
-
+	  ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed
+	  (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
 
-
-	  
-	  (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
-	  (let ((loop-list (runs:process-expanded-tests runsdat testdat)))
+	  (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running
             (if loop-list (apply loop loop-list))))
 
 	 ;; items processed into a list but not came in as a list been processed
 	 ;;
 	 ((and (list? items)     ;; thus we know our items are already calculated
@@ -1743,19 +1784,20 @@
 	 ;; if items is a proc then need to run items:get-items-from-config, get the list and loop 
 	 ;;    - but only do that if resources exist to kick off the job
 	 ;; EXPAND ITEMS
 	 ((or (procedure? items)(eq? items 'have-procedure))
           (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-4")
-	  (let ((can-run-more    (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))
+	  (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))
 	    (if (and (list? can-run-more)
-		     (car can-run-more))
-		(let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here
+		     (car can-run-more)) ;; itemized test expanded here
+		(let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup
+						    max-concurrent-jobs run-id waitons item-path
+						    testmode test-record can-run-more items runname
+						    tconfig reglen test-registry test-records itemmaps)))
 		  (if loop-list
 		      (apply loop loop-list)
-                      (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)
-                      )
-                  )
+                      (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)))
 		;; if can't run more just loop with next possible test
 		(loop (car newtal)(cdr newtal) reg reruns))))
          
 	 ;; this case should not happen, added to help catch any bugs
 	 ((and (list? items) itemdat)
@@ -1788,11 +1830,13 @@
     ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched 
     (rmt:set-var (conc "lunch-complete-" run-id) "yes")  
         
     ;; now *if* -run-wait we wait for all tests to be done
     ;; Now wait for any RUNNING tests to complete (if in run-wait mode)
+    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
     (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle
+    
     (let wait-loop ((num-running      (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes
 		    (prev-num-running 0))
       ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running)
       (if (and (or (args:get-arg "-run-wait")
 		   (equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
@@ -1807,11 +1851,12 @@
 		  (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
 		  (rmt:find-and-mark-incomplete run-id #f)
 		  (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
 				    " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
 				    (time->string (seconds->local-time (current-seconds))))))
-	    (thread-sleep! 5)
+	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+	    (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1))
 	    (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes
 		       num-running))))
     ;; LET* ((test-record
     ;; we get here on "drop through". All done!
     ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. 
@@ -1866,11 +1911,11 @@
 	     (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t))))
        lst))
 
 ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step
 ;;
-(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry)
+(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry runsdat testdat-rec)
   ;; All these vars might be referenced by the testconfig file reader
   ;; 
   ;; NEED to reprocess testconfig here, ensuring that item variables are available.
   ;; This is for Tal's issue with item-specific env vars not being set for use in skip.
   ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273
@@ -1944,12 +1989,13 @@
 		  (set! test-id (rmt:get-test-id run-id test-name item-path))))
 	    (debug:print-info 4 *default-log-port* "test-id=" test-id ", run-id=" run-id ", test-name=" test-name ", item-path=\"" item-path "\"")
 	    (set! testdat (rmt:get-test-info-by-id run-id test-id))
 	    (if (not testdat)
 		(begin
-		  (debug:print-info 0 *default-log-port* "WARNING: server is overloaded, trying again in one second")
-		  (thread-sleep! 1)
+		  (debug:print-info 0 *default-log-port* "WARNING: server is overloaded, trying again in two seconds")
+		  ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+		  (thread-sleep! 2)
 		  (loop)))))
       (if (not testdat) ;; should NOT happen
 	  (debug:print-error 0 *default-log-port* "failed to get test record for test-id " test-id))
       (set! test-id (db:test-get-id testdat))
       (if (common:file-exists? test-path)
@@ -2056,15 +2102,23 @@
 		       (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test)
 		       (debug:print-info 1 *default-log-port* "SKIPPING Test " full-test-name " due to " skip-test))
 		     ;;
 		     ;; Here the test is handed off to launch.scm for launch-test to complete the launch process
 		     ;;
-		     (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
-			 (begin
-			   (print "ERROR: Failed to launch the test. Exiting as soon as possible")
-			   (set! *globalexitstatus* 1) ;; 
-			   (process-signal (current-process-id) signal/kill))))))))
+		     (begin
+		       ;; wait for less than max jobs here
+		       (if (runs:dat-wait-for-jobs-function runsdat)
+			   ((runs:dat-wait-for-jobs-function runsdat) testdat-rec))
+		       
+		       (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
+			   (begin
+			     (print "ERROR: Failed to launch the test. Exiting as soon as possible")
+			     (set! *globalexitstatus* 1) ;; 
+			     (process-signal (current-process-id) signal/kill))
+			   )
+		       ;; wait again here?
+		       ))))))
 	((KILLED) 
 	 (debug:print 1 *default-log-port* "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.")
 	 (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED))
 	((LAUNCHED REMOTEHOSTSTART RUNNING)  
 	 (debug:print 2 *default-log-port* "NOTE: " test-name " is already running"))
@@ -2535,11 +2589,11 @@
 				(if (not (null? tal))
 				    (loop (car tal)(cdr tal))))
 			       ((run-wait)
                                 ;; BB TODO - manage has-subrun case
 				(debug:print-info 2 *default-log-port* "still waiting, " (length tests) " tests still running")
-				(thread-sleep! 10)
+				(thread-sleep! 5)
 				(let ((new-tests (proc-get-tests run-id)))
 				  (if (null? new-tests)
 				      (debug:print-info 1 *default-log-port* "Run completed according to zero tests matching provided criteria.")
 				      (loop (car new-tests)(cdr new-tests)))))
 			       ((archive)

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -202,13 +202,13 @@
     (if (if (directory-exists? (conc areapath "/logs"))
 	    '()
 	    (if (file-write-access? areapath)
 		(begin
 		  (condition-case
-		      (create-directory (conc areapath "/logs") #t)
-		    (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
-		    (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list.")))
+		   (create-directory (conc areapath "/logs") #t)
+		   (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
+		   (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
 		  (directory-exists? (conc areapath "/logs")))
 		'()))
 	(let* ((server-logs   (glob (conc areapath "/logs/server-*.log")))
 	       (num-serv-logs (length server-logs)))
 	  (if (null? server-logs)
@@ -215,15 +215,15 @@
 	      '()
 	      (let loop ((hed  (car server-logs))
 			 (tal  (cdr server-logs))
 			 (res '()))
 		(let* ((mod-time  (handle-exceptions
-				      exn
-				    (begin
-				      (print "failed to get modification time on " hed ", exn=" exn)
-				      (current-seconds)) ;; 0
-				    (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
+				   exn
+				   (begin
+				     (print "failed to get modification time on " hed ", exn=" exn)
+				     (current-seconds)) ;; 0
+				   (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
 		       (down-time (- (current-seconds) mod-time))
 		       (serv-dat  (if (or (< num-serv-logs 10)
 				  	  (< down-time 900)) ;; day-seconds))
 				      (server:logf-get-start-info hed)
 				      '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at
@@ -231,16 +231,16 @@
 		       (fmatch   (string-match fname-rx hed))
 		       (pid      (if fmatch (string->number (list-ref fmatch 2)) #f))
 		       (new-res  (if (null? serv-dat)
 				     res
 				     (cons (append serv-rec (list pid)) res))))
-		(if (null? tal)
-		    (if (and limit
-			     (> (length new-res) limit))
-			new-res ;; (take new-res limit)  <= need intelligent sorting before this will work
-			new-res)
-		    (loop (car tal)(cdr tal) new-res)))))))))
+		  (if (null? tal)
+		      (if (and limit
+			       (> (length new-res) limit))
+			  new-res ;; (take new-res limit)  <= need intelligent sorting before this will work
+			  new-res)
+		      (loop (car tal)(cdr tal) new-res)))))))))
 
 (define (server:get-num-alive srvlst)
   (let ((num-alive 0))
     (for-each
      (lambda (server)
@@ -274,14 +274,15 @@
 				    (mod-time   (list-ref rec 0)))
 				;; (print "start-time: " start-time " mod-time: " mod-time)
 				(and start-time mod-time
 				     (> (- now start-time) 0)    ;; been running at least 0 seconds
 				     (< (- now mod-time)   16)   ;; still alive - file touched in last 16 seconds
-				     (< (- now start-time)       
-					(+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600"))
-					      180)
-					   (random 360))) ;; under one hour running time +/- 180
+				     (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set
+					 (< (- now start-time)       
+					    (+ (- (string->number (configf:lookup *configdat* "server" "runtime"))
+					       180)
+					    (random 360)))) ;; under one hour running time +/- 180
 				     ))
 			      #f))
 			srvlst)
 		(lambda (a b)
 		  (< (list-ref a 3)
@@ -321,25 +322,39 @@
         *my-client-signature*)))
 
 ;; wait for server=start-last to be three seconds old
 ;;
 (define (server:wait-for-server-start-last-flag areapath)
-  (let* ((start-flag (conc areapath "/logs/server-start-last")))
+  (let* ((start-flag (conc areapath "/logs/server-start-last"))
+	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
+	 (reftime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
+	 (server-key (conc (get-host-name) "-" (current-process-id))))
     (if (file-exists? start-flag)
 	(let* ((fmodtime (file-modification-time start-flag))
-	       (reftime  (+ 3 (random 5)))
-	       (delta    (- (current-seconds) fmodtime)))
-	  (if (> delta reftime) ;; good enough
-	      (begin
-		(debug:print-info 0 *default-log-port* "Ready to start server, last start: "
-				  fmodtime ", delta: " delta ", reftime: " reftime)
-		(system (conc "touch " start-flag))) ;; lazy but safe
-	      (begin
-		(thread-sleep! 5)
-		(server:wait-for-server-start-last-flag areapath))))
-	(system (conc "touch " start-flag)))))
-	      
+	       (delta    (- (current-seconds) fmodtime))
+	       (all-go   (> delta reftime)))
+	  (if (handle-exceptions
+                 exn
+                 (begin
+                    (debug:print-info 0 *default-log-port* "Something not right with soft server locking: exn=" exn)
+                    #t)
+                 (and all-go
+		   (begin
+		     (with-output-to-file start-flag
+		       (lambda ()
+			 (print server-key)))
+		     (thread-sleep! 0.25)
+		     (let ((res (with-input-from-file start-flag
+				  (lambda ()
+				    (read-line)))))
+		       (equal? server-key res)))))
+	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe
+	      (begin
+		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
+				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
+		(thread-sleep! reftime)
+		(server:wait-for-server-start-last-flag areapath)))))))
 
 ;; kind start up of servers, wait 40 seconds before allowing another server for a given
 ;; run-id to be launched
 ;;
 (define (server:kind-run areapath)
@@ -356,12 +371,13 @@
 				((2)  300)
 				(else 600))
 			      (random 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
 	     (lock-file    (conc areapath "/logs/server-start.lock")))
 	(if	(> (- (current-seconds) when-run) run-delay)
-		(begin
+		(let* ((start-flag (conc areapath "/logs/server-start-last")))
 		  (common:simple-file-lock-and-wait lock-file expire-time: 15)
+		  (system (conc "touch " start-flag)) ;; lazy but safe
 		  (server:run areapath)
 		  (thread-sleep! 2) ;; don't release the lock for at least a few seconds
 		  (common:simple-file-release-lock lock-file)))
 	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))
 
@@ -574,18 +590,26 @@
                                (if (not (configf:lookup *configdat* "server" "disable-db-snapshot"))
                                    (common:snapshot-file mtdbfile subdir: ".db-snapshot"))
 		               (delete-file* staging-file)
 		               (let* ((start-time (current-milliseconds))
                                       (res (system sync-cmd))
+                                      (dbbackupfile (conc mtdbfile ".backup"))
                                       (res2 
                                        (cond
-                                        ((eq? 0 res)
-		                         (delete-file* (conc mtdbfile ".backup"))
+                                        ((eq? 0 res )
+                                          (handle-exceptions
+                                              exn
+                                              (begin
+                                                 (debug:print-info 0 *default-log-port* "Problem making backup db file, exn=" exn)
+                                                 #t)
+                                         (if (file-exists? dbbackupfile)
+		                           (delete-file* dbbackupfile)
+                                         )
                                          (if (eq? 0 (file-size sync-log))
                                              (delete-file sync-log))
 		                         (system (conc "/bin/mv " staging-file " " mtdbfile))
-                                         
+                                         )
                                          (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000))
                                          (set! off-time (calculate-off-time
                                                          last-sync-seconds
                                                          (cond
                                                           ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1))

Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -1944,28 +1944,52 @@
 	 tdb
 	 "SELECT count(id) FROM test_rundat;")
 	res))
   0)
 
-(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)
-  (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))
-  (if (and cpuload diskfree)
-      (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
-  (if minutes 
-      (rmt:general-call 'update-run-duration run-id minutes test-id))
-  (if (and uname hostname)
-      (rmt:general-call 'update-uname-host run-id uname hostname test-id)))
+;; 
+(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)(tmpfree #f))
+  (if (get-environment-variable "MT_TEST_RUN_DIR")
+      (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data"))
+	     (or-dash  (lambda (instr)
+			 (cond
+			  ((not instr) "") ;; #f -> blank, indicates value unchanged since last measurement taken
+			  ((string? instr)(if (string-search " " instr) (conc "\"" instr "\"") instr))
+			  (else instr))))
+	     (file-new (not (directory-exists? dest-dir))))
+	(if file-new (create-directory dest-dir #t))
+	(let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append)))
+	  (with-output-to-port outp
+	    (lambda ()
+	      (if file-new
+		  (print "epoch_time,run_id,test_id,cpuload,diskfree,tmpfree,run_minutes,hostname,uname"))
+	      (print (current-seconds) "," (or-dash run-id)   "," (or-dash test-id)  ","
+		     (or-dash cpuload) "," (or-dash diskfree) "," (or-dash tmpfree)  ","
+		     (or-dash minutes) "," (or-dash hostname) ","
+		     (or-dash uname)))) ;; put uname last as it has spaces in it
+	  (close-output-port outp)))
+      (begin
+	(rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))))
+  (if update-db
+      (begin
+	(if (and cpuload diskfree)
+	    (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
+	(if minutes 
+	    (rmt:general-call 'update-run-duration run-id minutes test-id))
+	(if (and uname hostname)
+	    (rmt:general-call 'update-uname-host run-id uname hostname test-id)))))
   
 ;; This one is for running with no db access (i.e. via rmt: internally)
-(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries)
+(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f))
 ;; (define (tests:set-full-meta-info test-id run-id minutes work-area)
 ;;  (let ((remtries 10))
   (let* ((cpuload  (get-cpu-load))
 	 (diskfree (get-df (current-directory)))
+	 (tmpfree  (get-df "/tmp"))
 	 (uname    (get-uname "-srvpio"))
 	 (hostname (get-host-name)))
-    (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)))
+    (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db tmpfree: tmpfree)))
     
 ;; (define (tests:set-partial-meta-info test-id run-id minutes work-area)
 #;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries)
   (let* ((cpuload  (get-cpu-load))
 	 (diskfree (get-df (current-directory)))