Index: Makefile
==================================================================
--- Makefile
+++ Makefile
@@ -131,10 +131,14 @@
 	chmod a+x $@
 
 $(PREFIX)/bin/nbfake : utils/nbfake
 	$(INSTALL) $< $@
 	chmod a+x $@
+
+$(PREFIX)/bin/remrun : utils/remrun
+	$(INSTALL) $< $@
+	chmod a+x $@
 
 $(PREFIX)/bin/viewscreen : utils/viewscreen
 	$(INSTALL) $< $@
 	chmod a+x $@
 
@@ -159,10 +163,15 @@
 	chmod a+x $@
 
 deploytarg/nbfind : utils/nbfind
 	$(INSTALL) $< $@
 	chmod a+x $@
+
+$(PREFIX)/bin/mtest-reaper: helpers/mtest-reaper.scm helpers/ducttape-lib.scm helpers/inteldate.scm helpers/mimetypes.scm
+	$(MAKE) -C helpers $@ PREFIX=$(PREFIX) INSTALL=$(INSTALL) ARCHSTR=$(ARCHSTR)
+
+mtest-reaper: $(PREFIX)/bin/mtest-reaper
 
 # install dashboard as dboard so wrapper script can be called dashboard
 $(PREFIX)/bin/.$(ARCHSTR)/dboard : dboard $(FILES) utils/mk_wrapper
 	utils/mk_wrapper $(PREFIX) dboard $(PREFIX)/bin/dashboard
 	chmod a+x $(PREFIX)/bin/dashboard
@@ -169,14 +178,15 @@
 	$(INSTALL) dboard $(PREFIX)/bin/.$(ARCHSTR)/dboard
 
 install : $(PREFIX)/bin/.$(ARCHSTR) $(PREFIX)/bin/.$(ARCHSTR)/mtest $(PREFIX)/bin/megatest \
           $(PREFIX)/bin/.$(ARCHSTR)/dboard $(PREFIX)/bin/dashboard $(HELPERS) $(PREFIX)/bin/nbfake \
 	  $(PREFIX)/bin/nbfind $(PREFIX)/bin/loadrunner $(PREFIX)/bin/viewscreen $(PREFIX)/bin/mt_xterm \
-	  $(PREFIX)/share/docs/megatest_manual.html 
+	  $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun
 
 $(PREFIX)/bin/.$(ARCHSTR) : 
 	mkdir -p $(PREFIX)/bin/.$(ARCHSTR)
+	mkdir -p $(PREFIX)/bin/.$(ARCHSTR)/lib
 
 test: tests/tests.scm
 	cd tests;csi -I .. -b -n tests.scm
 
 ext-tests/.fslckout : $(MTQA_FOSSIL)
@@ -211,11 +221,11 @@
 #	chicken-install -prefix deploytarg -deploy $$i;done
 
 # deploytarg/libsqlite3.so : 
 # 	CSC_OPTIONS="-Ideploytarg -Ldeploytarg" $CHICKEN_INSTALL -prefix deploytarg -deploy sqlite3
 
-deploy : deploytarg/mtest deploytarg/dboard $(DEPLOYHELPERS) deploytarg/nbfake deploytarg/viewsceen deploytarg/nbfind deploytarg/apropos.so
+deploy : deploytarg/mtest deploytarg/dboard $(DEPLOYHELPERS) deploytarg/nbfake deploytarg/remrun deploytarg/viewsceen deploytarg/nbfind deploytarg/apropos.so
 
 # deploytarg/libiupcd.so : $(CKPATH)/lib/libiupcd.so
 # 	for i in iup im cd av call sqlite; do \
 # 	  cp $(CKPATH)/lib/lib$$i* deploytarg/ ; \
 # 	done
@@ -278,5 +288,6 @@
 	   echo "(use postgresql)(hash-table-set! *available-db* 'postgresql #t)" >> altdb.scm;\
 	fi
 
 portlogger-example : portlogger-example.scm api.o archive.o client.o common.o configf.o daemon.o dashboard-tests.o db.o dcommon.o ezsteps.o filedb.o genexample.o gutils.o http-transport.o items.o keys.o launch.o lock-queue.o margs.o megatest-version.o mt.o ods.o portlogger.o process.o rmt.o rpc-transport.o runconfig.o runs.o sdb.o server.o synchash.o tasks.o tdb.o tests.o tree.o
 	csc $(CSCOPTS) portlogger-example.scm api.o archive.o client.o common.o configf.o daemon.o dashboard-tests.o db.o dcommon.o ezsteps.o filedb.o genexample.o gutils.o http-transport.o items.o keys.o launch.o lock-queue.o margs.o megatest-version.o mt.o ods.o portlogger.o process.o rmt.o rpc-transport.o runconfig.o runs.o sdb.o server.o synchash.o tasks.o tdb.o tests.o tree.o
+

Index: NOTES
==================================================================
--- NOTES
+++ NOTES
@@ -1,13 +1,19 @@
+=====================================================================
+NOTES from looking at branch v1.62-rpc
+=====================================================================
+
+*last-db-access* or *db-last-access* ==> which is it to be?
+seen in singletest: ERROR: Unrecognised arguments: :first_err This is the first error
 
 ======================================================================
 New way of launching needed to accomodate different target hosttypes
 for items
 ======================================================================
 
 [flavors]
-general ssh #{getbgesthost general}
+general ssh #{getbesthost general}
 nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo
 
 [hosts]
 general cubian xena
 

Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -39,10 +39,11 @@
     get-run-status
     get-run-stats
     get-targets
     get-target
     ;; register-run
+    get-tests-tags
     get-tests-for-run
     get-test-id
     get-tests-for-runs-mindata
     get-run-name-from-id
     get-runs
@@ -61,10 +62,12 @@
     synchash-get
     ))
 
 (define api:write-queries
   '(
+    get-keys-write ;; dummy "write" query to force server start
+
     ;; SERVERS
     start-server
     kill-server
 
     ;; TESTS
@@ -111,10 +114,11 @@
 ;;
 (define (api:execute-requests dbstruct dat)
   (handle-exceptions
    exn
    (let ((call-chain (get-call-chain)))
+     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer")
      (print-call-chain (current-error-port))
      (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
      (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
    (if (not (vector? dat))                    ;; it is an error to not receive a vector
        (vector #f #f "remote must be called with a vector")       
@@ -167,10 +171,11 @@
 	    ((mark-incomplete)              (apply db:find-and-mark-incomplete dbstruct params))
 
 	    ;; TESTMETA
 	    ((testmeta-add-record)       (apply db:testmeta-add-record dbstruct params))
 	    ((testmeta-update-field)     (apply db:testmeta-update-field dbstruct params))
+            ((get-tests-tags)            (db:get-tests-tags dbstruct))
 
 	    ;; TASKS
 	    ((tasks-add)                 (apply tasks:add dbstruct params))   
 	    ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params))
 	    ((tasks-get-last)            (apply tasks:get-last dbstruct params))
@@ -186,10 +191,11 @@
 	    ;;======================================================================
 
 	    ;; KEYS
 	    ((get-key-val-pairs)               (apply db:get-key-val-pairs dbstruct params))
 	    ((get-keys)                        (db:get-keys dbstruct))
+            ((get-keys-write)                        (db:get-keys dbstruct)) ;; force a dummy "write" query to force server
 	    ((get-key-vals)                    (apply db:get-key-vals dbstruct params))
 	    ((get-target)                      (apply db:get-target dbstruct params))
 	    ((get-targets)                     (db:get-targets dbstruct))
 
 	    ;; ARCHIVES
@@ -239,10 +245,11 @@
 
 	    ;; TEST DATA
 	    ((read-test-data)               (apply db:read-test-data dbstruct params))
 
 	    ;; MISC
+            ((get-latest-host-load)         (apply db:get-latest-host-load dbstruct params))
 	    ((have-incompletes?)            (apply db:have-incompletes? dbstruct params))
 	    ((login)                        (apply db:login dbstruct params))
 	    ((general-call)                 (let ((stmtname   (car params))
 						  (run-id     (cadr params))
 						  (realparams (cddr params)))

Index: client.scm
==================================================================
--- client.scm
+++ client.scm
@@ -197,19 +197,19 @@
 							   (tasks:hostinfo-get-port      server-dat)
 							   " client:setup (server-dat = #t)")
 		      (if (> remaining-tries 8)
 			  (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
 			  (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time
-		      (server:try-running run-id)
+		      (server:try-running *toppath*)
 		      (thread-sleep! 5)   ;; give server a little time to start up
 		      (client:setup run-id remaining-tries: (- remaining-tries 1))
 		      )))
 	      (begin    ;; no server registered
 		(let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id)))
 		  (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
 		  (if (< num-available 2)
-		      (server:try-running run-id))
+		      (server:try-running *toppath*))
 		  (thread-sleep! (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
 		  (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))
 
 ;; keep this as a function to ease future 
 (define (client:start run-id server-info)

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -90,14 +90,13 @@
 (define *db-stats-mutex*      (make-mutex))
 ;; db access
 (define *db-last-access*      (current-seconds)) ;; last db access, used in server
 (define *db-write-access*     #t)
 ;; db sync
-(define *db-last-write*       0)                 ;; used to record last touch of db
 (define *db-last-sync*        0)                 ;; last time the sync to megatest.db happened
 (define *db-sync-in-progress* #f)                ;; if there is a sync in progress do not try to start another
-(define *db-multi-sync-mutex* (make-mutex))      ;; protect access to *db-sync-in-progress*, *db-last-sync* and *db-last-write*
+(define *db-multi-sync-mutex* (make-mutex))      ;; protect access to *db-sync-in-progress*, *db-last-sync*
 ;; task db
 (define *task-db*             #f) ;; (vector db path-to-db)
 (define *db-access-allowed*   #t) ;; flag to allow access
 (define *db-access-mutex*     (make-mutex))
 (define *db-cache-path*       #f)
@@ -131,28 +130,45 @@
 (define *toptest-paths*     (make-hash-table)) ;; cache toptest path settings here
 (define *test-paths*        (make-hash-table)) ;; cache test-id to test run paths here
 (define *test-ids*          (make-hash-table)) ;; cache run-id, testname, and item-path => test-id
 (define *test-info*         (make-hash-table)) ;; cache the test info records, update the state, status, run_duration etc. from testdat.db
 
-(define *run-info-cache*    (make-hash-table)) ;; run info is stable, no need to reget
+(define *run-info-cache*     (make-hash-table)) ;; run info is stable, no need to reget
 (define *launch-setup-mutex* (make-mutex))     ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set
 (define *homehost-mutex*     (make-mutex))
 
+(defstruct remote
+  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
+  (server-url        (if *toppath* (server:read-dotserver->url *toppath*) #f)) ;; explicit #f when *toppath* is unset; a one-armed if would give an unspecified, truthy value
+  (last-server-check 0)  ;; last time we checked to see if the server was alive
+  (conndat           #f)
+  (transport         *transport-type*)
+  (server-timeout    (or (server:get-timeout) 100))) ;; default to 100 seconds
+
+;; launching and hosts: one record per hostname, stored in the *host-loads* hash table
+(defstruct host
+  (reachable    #f) ;; set from the first element of the list returned by common:get-host-info
+  (last-update  0)  ;; set from the second element of that list (a time in seconds; 0 in its unreachable result)
+  (last-used    0)  ;; NOTE(review): not read or written in the code visible here - confirm intended use
+  (last-cpuload 1)) ;; set from the third element of that list (-1 in its unreachable result)
+
+(define *host-loads*         (make-hash-table))
+
 ;; cache environment vars for each run here
 (define *env-vars-by-run-id* (make-hash-table))
 
 ;; Testconfig and runconfig caches. 
-(define *testconfigs*       (make-hash-table)) ;; test-name => testconfig
-(define *runconfigs*        (make-hash-table)) ;; target    => runconfig
+(define *testconfigs*        (make-hash-table)) ;; test-name => testconfig
+(define *runconfigs*         (make-hash-table)) ;; target    => runconfig
 
 ;; This is a cache of pre-reqs met, don't re-calc in cases where called with same params less than
 ;; five seconds ago
 (define *pre-reqs-met-cache* (make-hash-table))
 
 ;; cache of verbosity given string
 ;;
-(define *verbosity-cache* (make-hash-table))
+(define *verbosity-cache*    (make-hash-table))
 
 (define (common:clear-caches)
   (set! *target*             (make-hash-table))
   (set! *keys*               (make-hash-table))
   (set! *keyvals*            (make-hash-table))
@@ -522,47 +538,59 @@
 ;;======================================================================
 ;; E X I T   H A N D L I N G
 ;;======================================================================
 
 (define (common:run-sync?)
-  (let ((ohh (common:on-homehost?))
-	(srv (args:get-arg "-server")))
-    ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv)
     (and (common:on-homehost?)
-	 (args:get-arg "-server"))))
+	 (args:get-arg "-server")))
+
+;;   (let ((ohh (common:on-homehost?))
+;; 	(srv (args:get-arg "-server")))
+;;     (and ohh srv)))
+    ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv)
 
 ;;;; run-ids
 ;;    if #f use *db-local-sync* : or 'local-sync-flags
 ;;    if #t use timestamps      : or 'timestamps
 (define (common:sync-to-megatest.db dbstruct) 
   (let ((start-time         (current-seconds))
 	(res                (db:multi-db-sync dbstruct 'new2old)))
     (let ((sync-time (- (current-seconds) start-time)))
-      (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds")
+      (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id))
       (if (common:low-noise-print 30 "sync new to old")
-	  (debug:print-info 0 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds")))
+	  (debug:print-info 0 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id))))
     res))
 
+
+
+
+(define *wdnum* 0)
+(define *wdnum*mutex (make-mutex))
 ;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp
 ;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server)
 ;;
 (define (common:watchdog)
+  
   (thread-sleep! 0.05) ;; delay for startup
   (let ((legacy-sync (common:run-sync?))
 	(debug-mode  (debug:debug-mode 1))
-	(last-time   (current-seconds)))
-    (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync)
-    (if legacy-sync
+	(last-time   (current-seconds))
+        (this-wd-num     (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))
+        )
+    (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num)
+    (if (and legacy-sync (not *time-to-exit*))
 	(let ((dbstruct (db:setup)))
 	  (debug:print-info 0 *default-log-port* "Server running, periodic sync started.")
 	  (let loop ()
+            ;;(BB> "watchdog loop.  pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*)
 	    ;; sync for filesystem local db writes
 	    ;;
 	    (mutex-lock! *db-multi-sync-mutex*)
-	    (let* ((need-sync        (>= *db-last-write* *db-last-sync*)) ;; no sync since last write
+	    (let* ((need-sync        (>= *db-last-access* *db-last-sync*)) ;; no sync since last write
 		   (sync-in-progress *db-sync-in-progress*)
-		   (should-sync      (> (- (current-seconds) *db-last-sync*) 5)) ;; sync every five seconds minimum
+		   (should-sync      (and (not *time-to-exit*)
+                                          (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum
 		   (will-sync        (and (or need-sync should-sync)
 					  (not sync-in-progress)))
 		   (start-time       (current-seconds)))
 	      ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync)
 	      (if will-sync (set! *db-sync-in-progress* #t))
@@ -590,55 +618,71 @@
 	    
 	    ;; keep going unless time to exit
 	    ;;
 	    (if (not *time-to-exit*)
 		(let delay-loop ((count 0))
+                  ;;(BB> "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*)
+                                                            
 		  (if (and (not *time-to-exit*)
 			   (< count 4)) ;; was 11, changing to 4. 
 		      (begin
 			(thread-sleep! 1)
 			(delay-loop (+ count 1))))
-		  (loop)))
+		  (if (not *time-to-exit*) (loop))))
 	    (if (common:low-noise-print 30)
-		(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*)))))))
+		(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num)))))))
 
 (define (std-exit-procedure)
+  (on-exit (lambda () 0))
+  ;;(BB> "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
   (let ((no-hurry  (if *time-to-exit* ;; hurry up
 		       #f
 		       (begin
 			 (set! *time-to-exit* #t)
 			 #t))))
     (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
     (if (and no-hurry (debug:debug-mode 18))
 	(rmt:print-db-stats))
     (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds
-			      (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated
+                              (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated
 			      (if *task-db*    
 				  (let ((db (cdr *task-db*)))
 				    (if (sqlite3:database? db)
 					(begin
 					  (sqlite3:interrupt! db)
 					  (sqlite3:finalize! db #t)
 					  ;; (vector-set! *task-db* 0 #f)
 					  (set! *task-db* #f)))))
-			      (close-output-port *default-log-port*)
+                              (if (and *runremote*
+                                       (remote-conndat *runremote*))
+                                  (begin
+                                    (http-client#close-all-connections!))) ;; for http-client
+                              (if (not (eq? *default-log-port* (current-error-port)))
+                                  (close-output-port *default-log-port*))
 			      (set! *default-log-port* (current-error-port))) "Cleanup db exit thread"))
 	  (th2 (make-thread (lambda ()
 			      (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...")
 			      (if no-hurry
-				  (thread-sleep! 5) ;; give the clean up few seconds to do it's stuff
-				  (thread-sleep! 2))
-			      (debug:print 4 *default-log-port* " ... done")
-			      )
+                                  (begin
+                                    (thread-sleep! 5)) ;; give the clean up few seconds to do it's stuff
+                                  (begin
+      				  (thread-sleep! 2)))
+      			      (debug:print 4 *default-log-port* " ... done")
+      			      )
 			    "clean exit")))
       (thread-start! th1)
       (thread-start! th2)
-      (thread-join! th1))))
+      (thread-join! th1)
+      )
+    )
+
+  0)
 
 (define (std-signal-handler signum)
   ;; (signal-mask! signum)
   (set! *time-to-exit* #t)
+  ;;(BB> "got signal "signum)
   (debug:print-error 0 *default-log-port* "Received signal " signum " exiting promptly")
   ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway
   (exit))
 
 (set-signal-handler! signal/int  std-signal-handler)  ;; ^C
@@ -770,20 +814,24 @@
 
 (define (common:args-get-status)
   (or (args:get-arg "-status")(args:get-arg ":status")))
 
 (define (common:args-get-testpatt rconf)
-  (let* ((rtestpatt     (if rconf (runconfigs-get rconf "TESTPATT") #f))
-	 (args-testpatt (or (args:get-arg "-testpatt")
-			    (args:get-arg "-runtests")
-			    "%"))
-	 (testpatt    (or (and (equal? args-testpatt "%")
-			       rtestpatt)
-			  args-testpatt)))
-    (if rtestpatt (debug:print-info 0 *default-log-port* "TESTPATT from runconfigs: " rtestpatt))
-    testpatt))
-
+  (let* ((tagexpr (args:get-arg "-tagexpr"))
+         (tags-testpatt (if tagexpr (string-join (runs:get-tests-matching-tags tagexpr) ",") #f))
+         (testpatt-key  (if (args:get-arg "-mode") (args:get-arg "-mode") "TESTPATT"))
+         (args-testpatt (or (args:get-arg "-testpatt") (args:get-arg "-runtests") "%"))
+         (rtestpatt     (if rconf (runconfigs-get rconf testpatt-key) #f)))
+    (cond
+     (tags-testpatt
+      (debug:print-info 0 *default-log-port* "-tagexpr "tagexpr" selects testpatt "tags-testpatt)
+      tags-testpatt)
+     ((and (equal? args-testpatt "%") rtestpatt)
+      (debug:print-info 0 *default-log-port* "testpatt defined in "testpatt-key" from runconfigs: " rtestpatt)
+      rtestpatt)
+     (else args-testpatt))))
+     
 (define (common:get-linktree)
   (or (getenv "MT_LINKTREE")
       (if *configdat*
 	  (configf:lookup *configdat* "setup" "linktree"))))
 
@@ -1073,10 +1121,151 @@
 	   (with-input-from-pipe 
 	    (conc "ssh " remote-host " cat /proc/loadavg")
 	    (lambda ()(list (read)(read)(read)))))
       (with-input-from-file "/proc/loadavg" 
 	(lambda ()(list (read)(read)(read))))))
+
+;; get normalized cpu load by reading /proc/loadavg and /proc/cpuinfo, locally or (when remote-host is given) over ssh
+;; returns #f when no load data could be read, otherwise an alist with the keys:
+;;   adj-proc-load, adj-core-load (1m load / processor count, 1m load / core count), 1m-load, 5m-load, 15m-load, proc, core, phys
+;;
+(define (common:get-normalized-cpu-load remote-host)
+  (let ((data (if remote-host
+                  (with-input-from-pipe ;; the remote command is quoted so that all of it runs on remote-host, not in the local shell
+                   (conc "ssh " remote-host " \"cat /proc/loadavg;cat /proc/cpuinfo;echo end\"")
+                   read-lines)
+                  (append
+                   (with-input-from-file "/proc/loadavg"
+                     read-lines)
+                   (with-input-from-file "/proc/cpuinfo"
+                     read-lines)
+                   (list "end"))))
+        (load-rx  (regexp "^([\\d\\.]+)\\s+([\\d\\.]+)\\s+([\\d\\.]+)\\s+.*$"))
+        (proc-rx  (regexp "^processor\\s+:\\s+(\\d+)\\s*$"))
+        (core-rx  (regexp "^core id\\s+:\\s+(\\d+)\\s*$"))
+        (phys-rx  (regexp "^physical id\\s+:\\s+(\\d+)\\s*$"))
+        (max-num  (lambda (p n)(max (string->number p) n))))
+    ;; the loop below stops without parsing the last element, hence the trailing "end" sentinel line
+    (if (null? data) ;; something went wrong
+        #f
+        (let loop ((hed      (car data))
+                   (tal      (cdr data))
+                   (loads    #f)
+                   (proc-num 0)  ;; processor includes threads
+                   (phys-num 0)  ;; physical chip on motherboard
+                   (core-num 0)) ;; core
+          (if (null? tal) ;; have all our data, calculate normalized load and return result
+              (and loads  ;; no loadavg line was seen (e.g. ssh failed): return #f rather than fail in (car loads)
+                   (let* ((act-proc (+ proc-num 1)) ;; the ids read from cpuinfo are zero based, so count = highest id + 1
+                          (act-phys (+ phys-num 1))
+                          (act-core (+ core-num 1))
+                          (adj-proc-load (/ (car loads) act-proc))
+                          (adj-core-load (/ (car loads) act-core)))
+                     (append (list (cons 'adj-proc-load adj-proc-load)
+                                   (cons 'adj-core-load adj-core-load))
+                             (list (cons '1m-load (car loads))
+                                   (cons '5m-load (cadr loads))
+                                   (cons '15m-load (caddr loads)))
+                             (list (cons 'proc act-proc)
+                                   (cons 'core act-core)
+                                   (cons 'phys act-phys)))))
+              (regex-case
+               hed
+               (load-rx  ( x l1 l5 l15 ) (loop (car tal)(cdr tal)(map string->number (list l1 l5 l15)) proc-num phys-num core-num))
+               (proc-rx  ( x p         ) (loop (car tal)(cdr tal) loads           (max-num p proc-num) phys-num core-num))
+               (phys-rx  ( x p         ) (loop (car tal)(cdr tal) loads           proc-num (max-num p phys-num) core-num))
+               (core-rx  ( x c         ) (loop (car tal)(cdr tal) loads           proc-num phys-num (max-num c core-num)))
+               (else 
+                (begin
+                  ;; any other line carries nothing we need, move on to the next one
+                  (loop (car tal)(cdr tal) loads proc-num phys-num core-num)))))))))
+
+(define (common:unix-ping hostname)
+  ;; #t when a single "ping -c 1" of hostname exits with status 0, #f otherwise
+  (eq? (system (conc "ping -c 1 " hostname " > /dev/null")) 0))
+
+;; ideally put all this info into the db, no need to preserve it across moving homehost
+;;
+;; return list of
+;;  ( reachable? update-time cpuload )   ;; (#f 0 -1) when the host cannot be reached or its load cannot be read
+(define (common:get-host-info hostname)
+  (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; used below as a pair ( load . sample-time )
+         (load (car loadinfo))
+         (load-sample-time (cdr loadinfo))
+         (load-sample-age (- (current-seconds) load-sample-time))
+         (loadinfo-timeout-seconds 20)
+         (host-last-update-timeout-seconds 10)
+         (host-rec (hash-table-ref/default *host-loads* hostname #f)))
+    (cond
+     ((< load-sample-age loadinfo-timeout-seconds) ;; the sample from rmt is recent enough, use it
+      (list #t
+            load-sample-time
+            load))
+     ((and host-rec                                ;; else reuse the cached record if it was updated recently
+           (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds)))
+      (list #t
+            (host-last-update host-rec)
+            (host-last-cpuload host-rec )))
+     ((common:unix-ping hostname)                  ;; else measure the load directly
+      (let ((cpu-info (common:get-normalized-cpu-load hostname))) ;; may be #f when nothing could be read
+        (if cpu-info
+            (list #t (current-seconds) (alist-ref 'adj-core-load cpu-info))
+            (list #f 0 -1)))) ;; answers ping but no load data: report as unreachable instead of failing in alist-ref
+     (else
+      (list #f 0 -1)))))
+    
+(define (common:update-host-loads-table hosts-raw)
+  ;; Refresh the *host-loads* record of every entry of hosts-raw that is a
+  ;; single whitespace-free token, creating the record when it is missing.
+  ;; Each record is updated from the list returned by common:get-host-info.
+  ;; Entries of hosts-raw that do not match are ignored.
+  (for-each
+   (lambda (hostname)
+     (let* ((cached    (hash-table-ref/default *host-loads* hostname #f))
+            (rec       (or cached
+                           (let ((fresh (make-host)))
+                             (hash-table-set! *host-loads* hostname fresh)
+                             fresh)))
+            (host-info (common:get-host-info hostname)))
+       ;; host-info elements are used positionally: reachable flag, time, load
+       (host-reachable-set!    rec (car   host-info))
+       (host-last-update-set!  rec (cadr  host-info))
+       (host-last-cpuload-set! rec (caddr host-info))))
+   (filter (lambda (candidate)
+             (string-match (regexp "^\\S+$") candidate))
+           hosts-raw)))
+
+(define (common:get-least-loaded-host hosts-raw)
+  ;; returns the hostname with the lowest recorded cpu load (with some random jitter), or #f when no host is reachable
+  (let* ((hosts (filter (lambda (x)
+                          (string-match (regexp "^\\S+$") x))
+                        hosts-raw))
+         (best-host #f)
+         (best-load 99999))
+    (common:update-host-loads-table hosts)
+    (for-each
+     (lambda (hostname)
+       (let* ((rec
+               (let ((h (hash-table-ref/default *host-loads* hostname #f)))
+                 (if h
+                     h
+                     (let ((h (make-host)))
+                       (hash-table-set! *host-loads* hostname h)
+                       h))))
+              (reachable (host-reachable rec))
+              (load      (host-last-cpuload   rec)))
+         (cond
+          ((not reachable) #f)
+          ((< (+ load (/ (random 250) 1000))         ;; add a random factor to keep from getting in a rut
+              (+ best-load (/ (random 250) 1000))  )
+           (set! best-load load)
+           (set! best-host hostname)))))
+     hosts)
+    best-host))
+
+
+
 
 (define (common:wait-for-cpuload maxload numcpus waitdelay #!key (count 1000) (msg #f)(remote-host #f))
   (let* ((loadavg (common:get-cpu-load remote-host))
 	 (first   (car loadavg))
 	 (next    (cadr loadavg))
@@ -1575,28 +1764,30 @@
     
 ;;======================================================================
 ;;  T E S T   L A U N C H I N G   P E R   I T E M   W I T H   H O S T   T Y P E S
 ;;======================================================================
 ;; 
-;; [host-types]
-;; general ssh #{getbgesthost general}
-;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo
+;; [hosts]
+;; arm cubie01 cubie02
+;; x86_64 zeus xena myth01
+;; allhosts #{g hosts arm} #{g hosts x86_64}
 ;; 
-;; [hosts]
-;; general cubian xena
+;; [host-types]
+;; general #MTLOWESTLOAD #{g hosts allhosts}
+;; arm     #MTLOWESTLOAD #{g hosts arm}
+;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo
 ;; 
 ;; [launchers]
 ;; envsetup general
 ;; xor/%/n 4C16G
 ;; % nbgeneral
 ;; 
 ;; [jobtools]
-;; launcher bsub
-;; # if defined and not "no" flexi-launcher will bypass launcher unless there is no
-;; # match.
+;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match.
 ;; flexi-launcher yes  
-
+;; launcher nbfake
+;;
 (define (common:get-launcher configdat testname itempath)
   (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher")))
     (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher
 	     (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no")))
 	(let* ((launchers         (hash-table-ref/default configdat "launchers" '())))
@@ -1609,11 +1800,16 @@
 		  (if (tests:match patt testname itempath)
 		      (begin
 			(debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type)
 			(let ((launcher (configf:lookup configdat "host-types" host-type)))
 			  (if launcher
-			      launcher
+			      (let* ((launcher-parts (string-split launcher))
+				     (launcher-exe   (car launcher-parts)))
+				(if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline
+				    (let ((targ-host (common:get-least-loaded-host (cdr launcher-parts))))
+				      (conc "remrun " targ-host))
+				    launcher))
 			      (begin
 				(debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type)
 				(if (null? tal)
 				    fallback-launcher
 				    (loop (car tal)(cdr tal)))))))

Index: common_records.scm
==================================================================
--- common_records.scm
+++ common_records.scm
@@ -121,10 +121,11 @@
 	      (db:log-event (apply conc params))
 	      (apply print params)
 	      )))))
 
 ;; Brandon's debug printer shortcut (indulge me :)
+(define *BB-process-starttime* (current-milliseconds))
 (define (BB> . in-args)
   (let* ((stack (get-call-chain))
          (location #f))
     (for-each
      (lambda (frame)
@@ -131,12 +132,59 @@
        (let* ((this-loc (vector-ref frame 0))
               (this-func (cadr (string-split this-loc " "))))
          (if (equal? this-func "BB>")
              (set! location this-loc))))
      stack)
-    (let ((dp-args (append (list 0 *default-log-port* location"   "  ) in-args)))
+    (let ((dp-args (append (list 0 *default-log-port* (conc location "@"(/ (- (current-milliseconds) *BB-process-starttime*) 1000)"   ")  ) in-args)))
       (apply debug:print dp-args))))
+
+(define *BBpp_custom_expanders_list* (make-hash-table))
+
+
+
+;; register hash tables with BBpp.
+(hash-table-set! *BBpp_custom_expanders_list* HASH_TABLE:
+                 (cons hash-table? hash-table->alist))
+
+;; convert arg with a registered expander whose predicate accepts it; returns arg unchanged when none applies
+(define (BBpp_custom_converter arg)
+  (let ((res #f))
+    (for-each
+     (lambda (custom-type-name)
+       (let* ((custom-type-info      (hash-table-ref *BBpp_custom_expanders_list* custom-type-name)) ;; entries are ( predicate . converter ) pairs
+              (custom-type-test      (car custom-type-info))
+              (custom-type-converter (cdr custom-type-info)))
+         (when (and (not res) (custom-type-test arg)) ;; only convert while no earlier expander has produced a result
+           (set! res (custom-type-converter arg)))))
+     (hash-table-keys *BBpp_custom_expanders_list*))
+    (if res (BBpp_ res) arg))) ;; a converted value is expanded again by BBpp_
+
+(define (BBpp_ arg)
+  (cond
+   ;;((SOMESTRUCT? arg) (cons SOMESTRUCT: (SOMESTRUCT->alist arg)))
+   ;;((dboard:tabdat? arg) (cons dboard:tabdat: (dboard:tabdat->alist arg)))
+   ((hash-table? arg)
+    (let ((al (hash-table->alist arg)))
+      (BBpp_ (cons HASH_TABLE: al))))
+   ((null? arg) '())
+   ;;((list? arg) (cons (BBpp_ (car arg)) (BBpp_ (cdr arg))))
+   ((pair? arg) (cons (BBpp_ (car arg)) (BBpp_ (cdr arg))))
+   (else (BBpp_custom_converter arg))))
+
+;; Brandon's pretty printer.  It expands hashes and custom types in addition to regular pp
+(define (BBpp arg)
+  (pp (BBpp_ arg)))
+
+;(use define-macro)
+(define-syntax inspect
+  (syntax-rules ()
+    [(_ x)
+    ;; (with-output-to-port (current-error-port)
+       (printf "~a is: ~a\n" 'x (with-output-to-string (lambda () (BBpp x))))
+     ;;  )
+     ]
+    [(_ x y ...) (begin (inspect x) (inspect y ...))]))
 
 (define (debug:print-error n e . params)
   ;; normal print
   (if (debug:debug-mode n)
       (with-output-to-port (or e (current-error-port))

Index: configf.scm
==================================================================
--- configf.scm
+++ configf.scm
@@ -57,10 +57,11 @@
 ;;======================================================================
 ;; Make the regexp's needed globally available
 ;;======================================================================
 
 (define configf:include-rx (regexp "^\\[include\\s+(.*)\\]\\s*$"))
+(define configf:script-rx  (regexp "^\\[scriptinc\\s+(.*)\\]\\s*$")) ;; include output from a script
 (define configf:section-rx (regexp "^\\[(.*)\\]\\s*$"))
 (define configf:blank-l-rx (regexp "^\\s*$"))
 (define configf:key-sys-pr (regexp "^(\\S+)\\s+\\[system\\s+(\\S+.*)\\]\\s*$"))
 (define configf:key-val-pr (regexp "^(\\S+)(\\s+(.*)|())$"))
 (define configf:key-no-val (regexp "^(\\S+)(\\s*)$"))
@@ -68,11 +69,11 @@
 (define configf:cont-ln-rx (regexp "^(\\s+)(\\S+.*)$"))
 (define configf:settings   (regexp "^\\[configf:settings\\s+(\\S+)\\s+(\\S+)]\\s*$"))
 
 ;; read a line and process any #{ ... } constructs
 
-(define configf:var-expand-regex (regexp "^(.*)#\\{(scheme|system|shell|getenv|get|runconfigs-get|rget)\\s+([^\\}\\{]*)\\}(.*)"))
+(define configf:var-expand-regex (regexp "^(.*)#\\{(scheme|system|shell|realpath|getenv|get|runconfigs-get|rget|scm|sh|rp|gv|g|mtrah)\\s+([^\\}\\{]*)\\}(.*)")) ;; every command name dispatched on in configf:process-line must be listed here
 
 (define (configf:process-line l ht allow-system #!key (linenum #f))
   (let loop ((res l))
     (if (string? res)
 	(let ((matchdat (string-search configf:var-expand-regex res)))
@@ -83,36 +84,42 @@
 		     (poststr (list-ref matchdat 4))
 		     (result  #f)
 		     (start-time (current-seconds))
 		     (cmdsym  (string->symbol cmdtype))
 		     (fullcmd (case cmdsym
-				((scheme)(conc "(lambda (ht)" cmd ")"))
-				((system)(conc "(lambda (ht)(system \"" cmd "\"))"))
-				((shell) (conc "(lambda (ht)(shell \""  cmd "\"))"))
-				((getenv)(conc "(lambda (ht)(get-environment-variable \"" cmd "\"))"))
-				((get)   
+				((scheme scm) (conc "(lambda (ht)" cmd ")"))
+				((system)     (conc "(lambda (ht)(system \"" cmd "\"))"))
+				((shell sh)   (conc "(lambda (ht)(string-translate (shell \""  cmd "\") \"\n\" \" \"))"))
+				((realpath rp)(conc "(lambda (ht)(common:nice-path \"" cmd "\"))"))
+				((getenv gv)  (conc "(lambda (ht)(get-environment-variable \"" cmd "\"))"))
+				((mtrah)      (conc "(lambda (ht)"
+                                                    "    (let ((extra \"" cmd "\"))"
+						    "       (conc (or *toppath* (get-environment-variable \"MT_RUN_AREA_HOME\"))"
+						    "             (if (string-null? extra) \"\" \"/\")"
+						    "             extra)))"))
+				((get g)   
 				 (let* ((parts (string-split cmd))
 					(sect  (car parts))
 					(var   (cadr parts)))
 				   (conc "(lambda (ht)(config-lookup ht \"" sect "\" \"" var "\"))")))
-				((runconfigs-get) (conc "(lambda (ht)(runconfigs-get ht \"" cmd "\"))"))
-				((rget)           (conc "(lambda (ht)(runconfigs-get ht \"" cmd "\"))"))
+				((runconfigs-get rget) (conc "(lambda (ht)(runconfigs-get ht \"" cmd "\"))"))
+				;; ((rget)           (conc "(lambda (ht)(runconfigs-get ht \"" cmd "\"))"))
 				(else "(lambda (ht)(print \"ERROR\") \"ERROR\")"))))
 		;; (print "fullcmd=" fullcmd)
 		(handle-exceptions
 		 exn
 		 (begin
 		   (debug:print 0 *default-log-port* "WARNING: failed to process config input \"" l "\"")
 		   (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
 		   ;; (print "exn=" (condition->list exn))
-		   (set! result (conc "#{( " cmdtype ") " cmd"}")))
+		   (set! result (conc "#{( " cmdtype ") " cmd "}, full expansion: " fullcmd)))
 		 (if (or allow-system
-			 (not (member cmdtype '("system" "shell"))))
+			 (not (member cmdtype '("system" "shell" "sh"))))
 		     (with-input-from-string fullcmd
 		       (lambda ()
 			 (set! result ((eval (read)) ht))))
-		    (set! result (conc "#{(" cmdtype ") "  cmd "}"))))
+		     (set! result (conc "#{(" cmdtype ") "  cmd "}"))))
 		(case cmdsym
 		  ((system shell scheme)
 		   (let ((delta (- (current-seconds) start-time)))
 		     (if (> delta 2)
 			 (debug:print-info 0 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)
@@ -182,16 +189,19 @@
 ;; post-section-procs alist of section-pattern => proc, where: (proc section-name next-section-name ht curr-path)
 ;;
 (define (read-config path ht allow-system #!key (environ-patt #f)(curr-section #f)(sections #f)(settings (make-hash-table))(keep-filenames #f)(post-section-procs '()))
   (debug:print-info 5 *default-log-port* "read-config " path " allow-system " allow-system " environ-patt " environ-patt " curr-section: " curr-section " sections: " sections " pwd: " (current-directory))
   (debug:print 9 *default-log-port* "START: " path)
-  (if (not (file-exists? path))
+  (if (and (not (port? path))
+	   (not (file-exists? path))) ;; for case where we are handed a port
       (begin 
 	(debug:print-info 1 *default-log-port* "read-config - file not found " path " current path: " (current-directory))
 	;; WARNING: This is a risky change but really, we should not return an empty hash table if no file read?
 	#f) ;; (if (not ht)(make-hash-table) ht))
-      (let ((inp        (open-input-file path))
+      (let ((inp        (if (string? path)
+			    (open-input-file path)
+			      path)) ;; we can be handed a port
 	    (res        (if (not ht)(make-hash-table) ht))
 	    (metapath   (if (or (debug:debug-mode 9)
 				keep-filenames)
 			    path #f)))
 	(let loop ((inl               (configf:read-line inp res (calc-allow-system allow-system curr-section sections) settings)) ;; (read-line inp))
@@ -199,11 +209,12 @@
 		   (var-flag #f);; turn on for key-var-pr and cont-ln-rx, turn off elsewhere
 		   (lead     #f))
 	  (debug:print-info 8 *default-log-port* "curr-section-name: " curr-section-name " var-flag: " var-flag "\n   inl: \"" inl "\"")
 	  (if (eof-object? inl) 
 	      (begin
-		(close-input-port inp)
+		(if (string? path) ;; we received a path, not a port, thus we are responsible for closing it.
+		    (close-input-port inp))
 		(hash-table-delete! res "") ;; we are using "" as a dumping ground and must remove it before returning the ht
 		(debug:print 9 *default-log-port* "END: " path)
 		res)
 	      (regex-case 
 	       inl 
@@ -229,10 +240,26 @@
 							      (loop (configf:read-line inp res (calc-allow-system allow-system curr-section-name sections) settings) curr-section-name #f #f))
 							    (begin
 							      (debug:print '(2 9) #f "INFO: include file " include-file " not found (called from " path ")")
 							      (debug:print 2 *default-log-port* "        " full-conf)
 							      (loop (configf:read-line inp res (calc-allow-system allow-system curr-section-name sections) settings) curr-section-name #f #f)))))
+	       (configf:script-rx ( x include-script ) ;; [scriptinc <script>]: parse the script's stdout as config text
+						      ;; The script must exist and be executable. Its output is parsed by a
+						      ;; nested read-config call that fills the same hash table (res) and
+						      ;; starts in the current section; reading of the enclosing input then
+						      ;; resumes with curr-section-name unchanged.
+							 (if (and (file-exists? include-script)(file-execute-access? include-script))
+							     (let* ((new-inp-port (open-input-pipe include-script)))
+							       (debug:print '(2 9) *default-log-port* "Including from script output: " include-script)
+							       ;; read-config only closes ports it opened itself, so the pipe is closed here
+							       (read-config new-inp-port res allow-system environ-patt: environ-patt curr-section: curr-section-name sections: sections settings: settings keep-filenames: keep-filenames)
+							       (close-input-pipe new-inp-port) ;; pipe ports are closed with close-input-pipe, not close-input-port
+							       (loop (configf:read-line inp res (calc-allow-system allow-system curr-section-name sections) settings) curr-section-name #f #f))
+							     (begin
+							       (debug:print 0 *default-log-port* "Script not found or not executable: " include-script)
+							       (loop (configf:read-line inp res (calc-allow-system allow-system curr-section-name sections) settings) curr-section-name #f #f)))
+							 )
 	       (configf:section-rx ( x section-name ) (begin
 							;; call post-section-procs
 							(for-each 
 							 (lambda (dat)
 							   (let ((patt (car dat))

Index: dashboard-tests.scm
==================================================================
--- dashboard-tests.scm
+++ dashboard-tests.scm
@@ -473,11 +473,11 @@
 	       (testconfig    (begin
 				;; (runs:set-megatest-env-vars run-id inrunname: runname testname: test-name itempath: item-path)
 				(runs:set-megatest-env-vars run-id inkeyvals: keydat inrunname: runname intarget: keystring testname: testname itempath: item-path) ;; these may be needed by the launching process
 				(handle-exceptions
 				 exn
-				 (tests:get-testconfig (db:test-get-testname testdat) test-registry #f)
+				 (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f)
 				 (tests:get-testconfig (db:test-get-testname testdat) test-registry #t))))
 	       (viewlog    (lambda (x)
 			     (if (file-exists? logfile)
 					;(system (conc "firefox " logfile "&"))
 				 (dashboard-tests:run-html-viewer logfile)

Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -290,10 +290,21 @@
   ;; runs summary view
   
   tests-tree       ;; used in newdashboard
   )
 
+;; Teach BBpp (Brandon's pretty printer) how to display a dboard:tabdat record:
+;; the record is flattened to an alist and only the fields of interest are kept for pp.
+(hash-table-set! *BBpp_custom_expanders_list* TABDAT:
+                 (let ((fields-of-interest '(allruns-by-id allruns)))
+                   (cons dboard:tabdat?
+                         (lambda (tabdat-item)
+                           (filter
+                            (lambda (field)
+                              (memq (car field) fields-of-interest))
+                            (dboard:tabdat->alist tabdat-item))))))
+
 (define (dboard:tabdat-target-string vec)
   (let ((targ (dboard:tabdat-target vec)))
     (if (list? targ)(string-intersperse targ "/") "no-target-specified")))
 
 (define (dboard:tabdat-test-patts-use vec)    
@@ -360,10 +371,24 @@
   ((last-update   0)                 : fixnum) ;; last query to db got records from before last-update
   ((data-changed  #f)                : boolean)
   ((run-data-offset  0)              : number)      ;; get only 100 items per call, set back to zero when received less that 100 items
   (db-path #f)
   )
+
+;; Teach BBpp (Brandon's pretty printer) how to display a dboard:rundat record:
+;; the record is flattened to an alist and only the fields of interest are kept for pp.
+(hash-table-set! *BBpp_custom_expanders_list* RUNDAT:
+                 (let ((fields-of-interest '(run run-data-offset)))
+                   (cons dboard:rundat?
+                         (lambda (rundat-item)
+                           (filter
+                            (lambda (field)
+                              (memq (car field) fields-of-interest))
+                            (dboard:rundat->alist rundat-item))))))
+
+
+
 
 (define (dboard:rundat-make-init #!key (run #f)(key-vals #f)(tests #f));; -100 is before time began
   (make-dboard:rundat 
    run: run
    tests: (or tests (make-hash-table))
@@ -623,10 +648,12 @@
 			 (for-each (lambda (run)
 				     (hash-table-set! ht (db:get-value-by-header run header "id") run))
 				   runs-tree) ;; (vector-ref runs-dat 1))
 			 ht))
 	 (tb          (dboard:tabdat-runs-tree tabdat)))
+    ;;(BB> "In update-rundat")
+    ;;(inspect allruns runs-hash)
     (dboard:tabdat-last-runs-update-set! tabdat (- (current-seconds) 2))
     (dboard:tabdat-header-set! tabdat header)
     ;; 
     ;; trim runs to only those that are changing often here
     ;; 
@@ -740,11 +767,17 @@
 		   (run-struct  (or run-struct
 				    (dboard:rundat-make-init
 				     run:         run 
 				     tests:       tests-ht
 				     key-vals:    key-vals)))
-		   (new-res     (if (null? all-test-ids) res (cons run-struct res)))
+		   (new-res     (if (null? all-test-ids)
+                                    res ;; run has no tests: leave the accumulated list untouched
+                                    (delete-duplicates ;; the first occurrence wins, so the fresh run-struct replaces an older one
+                                     (cons run-struct res)
+                                     (lambda (a b) ;; same run when the "id" values match; equal? because eq? is unspecified on numbers
+                                       (equal? (db:get-value-by-header (dboard:rundat-run a) header "id")
+                                               (db:get-value-by-header (dboard:rundat-run b) header "id"))))))
 		   (elapsed-time (- (current-seconds) start-time)))
 	      (if (null? all-test-ids)
 		  (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id)
 		  (hash-table-set!    (dboard:tabdat-allruns-by-id tabdat) run-id run-struct))
 	      (if (or (null? tal)
@@ -3391,10 +3424,13 @@
        ;;(print "RA => calling runs-tab-updater with commondat " commondat " tab-num " tab-num)
        ;;(tabdat-values tabdat) ;;RA added 
        ;; (pp (dboard:tabdat->alist tabdat))
        ;; (if (dashboard:database-changed? commondat tabdat context-key: 'runs-rundat)      
        (dashboard:do-update-rundat tabdat)
+       ;;(BB> "dashboard:runs-tab-updater")
+       ;;(inspect tabdat)
+
        (let ((uidat (dboard:commondat-uidat commondat)))
 	 ;;(print "RA => Calling update-buttons with tabdat : " tabdat " uidat " uidat)
 	 (update-buttons tabdat uidat (dboard:tabdat-numruns tabdat) (dboard:tabdat-num-tests tabdat)))
        ))
    "dashboard:runs-tab-updater"))

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -193,11 +193,11 @@
 ;;  (or (configf:lookup *configdat* "setup" "dbdir")
 ;;      (conc (configf:lookup *configdat* "setup" "linktree") "/.db")))
 	       
 (define (db:set-sync db)
   (let ((syncprag (configf:lookup *configdat* "setup" "sychronous")))
-    (sqlite3:execute db (conc "PRAGMA synchronous = " (or syncprag 1) ";")))) 
+    (sqlite3:execute db (conc "PRAGMA synchronous = " (or syncprag 0) ";")))) 
 
 ;; open an sql database inside a file lock
 ;; returns: db existed-prior-to-opening
 ;; RA => Returns a db handler; sets the lock if opened in writable mode
 ;;
@@ -211,11 +211,11 @@
     (if file-write ;; dir-writable
 	(let (;; (lock    (obtain-dot-lock fname 1 5 10))
 	      (db      (sqlite3:open-database fname)))
 	  (sqlite3:set-busy-handler! db (make-busy-timeout 136000))
 	  ;; (db:set-sync db)
-	  (sqlite3:execute db "PRAGMA synchronous = NORMAL;")
+	  (sqlite3:execute db "PRAGMA synchronous = 0;")
 	  (if (not file-exists)
 	      (begin
 		(if (string-match "^/tmp/.*" fname) ;; this is a file in /tmp
 		    (sqlite3:execute db "PRAGMA journal_mode=WAL;")
 		    (print "Creating " fname " in NON-WAL mode."))
@@ -335,10 +335,11 @@
     (let ((update_info (cons (if force-sync 0 *db-last-sync*) "last_update")))
       (mutex-unlock! *db-multi-sync-mutex*)
       (db:sync-tables (db:sync-all-tables-list dbstruct) update_info tmpdb refndb mtdb))
     (mutex-lock! *db-multi-sync-mutex*)
     (set! *db-last-sync* start-t)
+    (set! *db-last-access* start-t)
     (mutex-unlock! *db-multi-sync-mutex*)))
 
 ;; close all opened run-id dbs
 (define (db:close-all dbstruct)
   (if (dbr:dbstruct? dbstruct)
@@ -739,11 +740,11 @@
 
 ;; Add db direct
 ;;
 (define (db:dispatch-query access-mode rmt-cmd db-cmd . params)
   (if (eq? access-mode 'cached)
-      (print "not doing cached calls right now"))
+      (debug:print 2 *default-log-port* "not doing cached calls right now"))
 ;;      (apply db:call-with-cached-db db-cmd params)
       (apply rmt-cmd params))
 ;;)
 
 ;; return the target db handle so it can be used
@@ -1497,25 +1498,33 @@
 	   (min-incompleted-ids (map car incompleted)) ;; do 'em all
 	   (all-ids             (append min-incompleted-ids (map car oldlaunched))))
       (if (> (length all-ids) 0)
 	  (begin
 	    (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE")
-	    (sqlite3:execute 
-	     db
-	     (conc "UPDATE tests SET state='INCOMPLETE' WHERE id IN (" 
-		   (string-intersperse (map conc all-ids) ",")
-		   ");")))))
-
-    ;; Now do rollups for the toplevel tests
-    ;;
-    ;; (db:delay-if-busy dbdat)
-    (for-each
-     (lambda (toptest)
-       (let ((test-name (list-ref toptest 3)))
-;;	     (run-id    (list-ref toptest 5)))
-	 (db:top-test-set-per-pf-counts dbstruct run-id test-name)))
-     toplevels)))
+            (for-each
+             (lambda (test-id)
+               (db:test-set-status-state dbstruct run-id test-id "COMPLETE" "DEAD" "Test failed to complete"))
+             all-ids))))))
+
+;; ALL REPLACED BY THE BLOCK ABOVE
+;;
+;; 	    (sqlite3:execute 
+;; 	     db
+;; 	     (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN (" 
+;; 		   (string-intersperse (map conc all-ids) ",")
+;; 		   ");")
+;;              run-id))))
+;; 
+;;     ;; Now do rollups for the toplevel tests
+;;     ;;
+;;     ;; (db:delay-if-busy dbdat)
+;;     (for-each
+;;      (lambda (toptest)
+;;        (let ((test-name (list-ref toptest 3)))
+;; ;;	     (run-id    (list-ref toptest 5)))
+;; 	 (db:top-test-set-per-pf-counts dbstruct run-id test-name)))
+;;      toplevels)))
 
 ;; BUG: Probably broken - does not explicitly use run-id in the query
 ;;
 (define (db:top-test-set-per-pf-counts dbstruct run-id test-name)
   (db:general-call (db:get-db dbstruct run-id) 'top-test-set-per-pf-counts (list test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name))) 
@@ -2038,11 +2047,12 @@
 		 (if (string? netstate)
 		     (begin
 		       (hash-table-set! totals netstate (+ (hash-table-ref/default totals netstate 0) count))
 		       (hash-table-set! curr   netstate (+ (hash-table-ref/default curr   netstate 0) count))))))
 	     db
-	     "SELECT state,status,count(id) FROM tests AS t GROUP BY state,status ORDER BY state,status DESC;")
+	     "SELECT state,status,count(id) FROM tests AS t WHERE run_id=? GROUP BY state,status ORDER BY state,status DESC;"
+             run-id)
 	    ;; add the per run counts to res
 	    (for-each (lambda (state)
 			(set! res (cons (list run-name state (hash-table-ref curr state)) res)))
 		      (sort (hash-table-keys curr) string>=))
 	    (set! curr (make-hash-table))))))
@@ -2587,10 +2597,13 @@
 	       ))
 	    0)))))
              ;; DEBUG FIXME - need to merge this v.155 query correctly   
              ;; AND testname in (SELECT testname FROM test_meta WHERE jobgroup=?)
              ;; AND NOT (uname = 'n/a' AND item_path = '');"
+
+;; tags: '("tag%" "tag2" "%ag6")
+;;
 
 ;; done with run when:
 ;;   0 tests in LAUNCHED, NOT_STARTED, REMOTEHOSTSTART, RUNNING
 (define (db:estimated-tests-remaining dbstruct run-id)
   (db:with-db
@@ -3054,23 +3067,26 @@
        (set! row-ids (cons rid row-ids)))
      runsqry)
     (sqlite3:finalize! runsqry)
     row-ids))
 
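+;; collects rundir for tests of run-id matching the test/state/status patterns; rows arrive oldest first and are consed onto res, so the latest ends up at the head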
+;;
 (define (db:test-get-paths-matching-keynames-target-new dbstruct run-id keynames target res testpatt statepatt statuspatt runname)
   (let* ((testqry (tests:match->sqlqry testpatt))
-	 (tstsqry (conc "SELECT rundir FROM tests WHERE " testqry " AND state LIKE '" statepatt "' AND status LIKE '" statuspatt "' ORDER BY event_time ASC;")))
+	 (tstsqry (conc "SELECT rundir FROM tests WHERE run_id=? AND " testqry " AND state LIKE '" statepatt "' AND status LIKE '" statuspatt "' ORDER BY event_time ASC;")))
     (db:with-db
      dbstruct
      run-id
      #f
      (lambda (db)
        (sqlite3:for-each-row 
 	(lambda (p)
 	  (set! res (cons p res)))
 	db
-	tstsqry)
+	tstsqry
+	run-id)
        res))))
 
 (define (db:test-toplevel-num-items dbstruct run-id testname)
   (db:with-db
    dbstruct
@@ -3310,10 +3326,11 @@
 	'(delete-tests-in-state   ;; "DELETE FROM tests WHERE state=?;")                  ;; DONE
 	  "UPDATE tests SET state='DELETED' WHERE state=?")
 	'(tests:test-set-toplog   "UPDATE tests SET final_logf=? WHERE run_id=? AND testname=? AND item_path='';")
 	'(update-cpuload-diskfree "UPDATE tests SET cpuload=?,diskfree=? WHERE id=?;") ;; DONE
 	'(update-uname-host       "UPDATE tests SET uname=?,host=? WHERE id=?;")       ;; DONE
+        '(update-test-rundat      "INSERT INTO test_rundat (test_id,update_time,cpuload,diskfree,diskusage,run_duration) VALUES (?,?,?,?,?,?);")
 	'(update-test-state       "UPDATE tests SET state=? WHERE state=? AND run_id=? AND testname=? AND NOT (item_path='' AND testname IN (SELECT DISTINCT testname FROM tests WHERE testname=? AND item_path != ''));")
 	'(update-test-status      "UPDATE tests SET status=? WHERE status like ? AND run_id=? AND testname=? AND NOT (item_path='' AND testname IN (SELECT DISTINCT testname FROM tests WHERE testname=? AND item_path != ''));")
 	;; stuff for roll-up-pass-fail-counts
 	'(update-pass-fail-counts "UPDATE tests 
              SET fail_count=(SELECT count(id) FROM tests WHERE testname=? AND item_path != '' AND status IN ('FAIL','CHECK','INCOMPLETE','ABORT')),
@@ -3464,10 +3481,28 @@
        (set! res (cons (vector state status count) res)))
      db
      "SELECT state,status,count(state) FROM tests WHERE run_id=? AND testname=? AND item_path='' GROUP BY state,status;"
      run-id testname)
     res))
+
+
+;; Newest recorded cpu load for a host, as (cpuload . update-time); (-1 . 0) when the query yields no row.
+(define (db:get-latest-host-load dbstruct raw-hostname)
+  (let ((hostname (string-substitute "\\..*$" "" raw-hostname)) ;; cut the name at its first "."
+        (res      (cons -1 0)))                                  ;; default, kept when nothing matches
+    (db:with-db
+     dbstruct
+     0  ;; passed in the run-id position of db:with-db
+     #f
+     (lambda (db)
+       (sqlite3:for-each-row
+        (lambda (cpuload update-time) (set! res (cons cpuload update-time)))
+        db
+        "SELECT tr.cpuload, tr.update_time FROM test_rundat tr, tests t WHERE t.host=? AND tr.cpuload != -1  AND tr.test_id=t.id ORDER BY tr.update_time DESC LIMIT 1;"
+        hostname)))
+    res))
+
 
 (define (db:set-top-level-from-items dbstruct run-id testname)
   (let* ((dbdat (db:get-db dbstruct run-id))
 	 (db    (db:dbdat-get-db dbdat))
 	 (summ  (db:get-state-status-summary db run-id testname))
@@ -3604,10 +3639,29 @@
        res))))
 
 ;;======================================================================
 ;; Tests meta data
 ;;======================================================================
+
+;; Returns a hash table mapping each tag to the list of test names that carry it.
+;; Tags are read from the comma-separated test_meta.tags column; rows whose tags value is not a string are skipped.
+(define (db:get-tests-tags dbstruct)
+  (let* ((dbdat   (db:get-db dbstruct #f))
+         (db      (db:dbdat-get-db dbdat))
+         (res     (make-hash-table)))
+    (sqlite3:for-each-row
+     (lambda (testname tags-in)
+       (if (string? tags-in) ;; string-split would raise on a NULL (non-string) column value
+           (for-each
+            (lambda (tag)
+              (hash-table-set! res tag
+                               (delete-duplicates ;; list each test at most once per tag
+                                (cons testname (hash-table-ref/default res tag '())))))
+            (string-split tags-in ","))))
+     db
+     "SELECT testname,tags FROM test_meta")
+    res))
 
 ;; read the record given a testname
 (define (db:testmeta-get-record dbstruct testname)
   (let ((res   #f))
     (db:with-db

Index: docs/manual/megatest_manual.html
==================================================================
--- docs/manual/megatest_manual.html
+++ docs/manual/megatest_manual.html
@@ -1325,11 +1325,105 @@
 </div>
 <div class="sect1">
 <h2 id="_reference">Reference</h2>
 <div class="sectionbody">
 <div class="sect2">
-<h3 id="_megatest_config_file_settings">Megatest Config File Settings</h3>
+<h3 id="_config_file_helpers">Config File Helpers</h3>
+<div class="paragraph"><p>Various helpers for more advanced config files.</p></div>
+<table class="tableblock frame-topbot grid-all"
+style="
+width:80%;
+">
+<caption class="title">Table 2. Helpers</caption>
+<col style="width:14%;">
+<col style="width:28%;">
+<col style="width:28%;">
+<col style="width:28%;">
+<thead>
+<tr>
+<th class="tableblock halign-center valign-top" >Helper                      </th>
+<th class="tableblock halign-left valign-top" > Purpose                       </th>
+<th class="tableblock halign-left valign-top" > Valid values            </th>
+<th class="tableblock halign-left valign-top" > Comments</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td class="tableblock halign-center valign-top" ><p class="tableblock">#{scheme (scheme code&#8230;)}</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Execute arbitrary scheme code</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Any valid scheme</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Value returned from the call is converted to a string and processed as part of the config file</p></td>
+</tr>
+<tr>
+<td class="tableblock halign-center valign-top" ><p class="tableblock">#{system command}</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Execute program, inserts exit code</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Any valid Unix command</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Discards the output from the program</p></td>
+</tr>
+<tr>
+<td class="tableblock halign-center valign-top" ><p class="tableblock">#{shell  command} or #{sh &#8230;}</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Execute program, inserts result from stdout</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Any valid Unix command</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Value returned from the call is converted to a string and processed as part of the config file</p></td>
+</tr>
+<tr>
+<td class="tableblock halign-center valign-top" ><p class="tableblock">#{realpath path} or #{rp &#8230;}</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Replace with normalized path</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Must be a valid path</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced"></p></td>
+</tr>
+<tr>
+<td class="tableblock halign-center valign-top" ><p class="tableblock">#{getenv VAR} or #{gv VAR}</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Replace with content of env variable</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Must be a valid var</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced"></p></td>
+</tr>
+<tr>
+<td class="tableblock halign-center valign-top" ><p class="tableblock">#{get s v} or #{g s v}</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Replace with variable v from section s</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Variable must be defined before use</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced"></p></td>
+</tr>
+<tr>
+<td class="tableblock halign-center valign-top" ><p class="tableblock">#{rget v}</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Replace with variable v from target or default of runconfigs file</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced"></p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced"></p></td>
+</tr>
+<tr>
+<td class="tableblock halign-center valign-top" ><p class="tableblock">#{mtrah}</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced">Replace with the path to the megatest testsuite area</p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced"></p></td>
+<td class="tableblock halign-left valign-top" ><p class="tableblock monospaced"></p></td>
+</tr>
+</tbody>
+</table>
+</div>
+<div class="sect2">
+<h3 id="_config_file_settings">Config File Settings</h3>
+<div class="paragraph"><p>Settings in megatest.config</p></div>
+</div>
+<div class="sect2">
+<h3 id="_config_file_additional_features">Config File Additional Features</h3>
+<div class="paragraph"><p>Including output from a script as if it was inline to the config file:</p></div>
+<div class="listingblock">
+<div class="content monospaced">
+<pre>[scriptinc myscript.sh]</pre>
+</div></div>
+<div class="paragraph"><p>If the script outputs:</p></div>
+<div class="listingblock">
+<div class="content monospaced">
+<pre>[items]
+A a b c
+B d e f</pre>
+</div></div>
+<div class="paragraph"><p>Then the config file would effectively appear to contain an items section
+exactly like the output from the script. This is extremely useful when
+dynamically creating items, itemstables and other config structures. You can
+see the expansion of the call by looking in the cached files (look in your
+linktree for megatest.config and runconfigs.config cache files and in your
+test run areas for the expanded and cached testconfig).</p></div>
 <div class="sect3">
 <h4 id="_disk_space_checks">Disk Space Checks</h4>
 <div class="paragraph"><p>Some parameters you can put in the [setup] section of megatest.config:</p></div>
 <div class="listingblock">
 <div class="content monospaced">
@@ -1448,11 +1542,11 @@
 <h3 id="_database_settings">Database settings</h3>
 <table class="tableblock frame-topbot grid-all"
 style="
 width:70%;
 ">
-<caption class="title">Table 2. Database config settings in [setup] section of megatest.config</caption>
+<caption class="title">Table 3. Database config settings in [setup] section of megatest.config</caption>
 <col style="width:14%;">
 <col style="width:28%;">
 <col style="width:28%;">
 <col style="width:28%;">
 <thead>
@@ -1917,11 +2011,11 @@
 <div class="paragraph"><p>These routines can be called from the megatest repl.</p></div>
 <table class="tableblock frame-topbot grid-all"
 style="
 width:70%;
 ">
-<caption class="title">Table 3. API Keys Related Calls</caption>
+<caption class="title">Table 4. API Keys Related Calls</caption>
 <col style="width:14%;">
 <col style="width:28%;">
 <col style="width:28%;">
 <col style="width:28%;">
 <thead>
@@ -1969,10 +2063,10 @@
 </div>
 <div id="footnotes"><hr></div>
 <div id="footer">
 <div id="footer-text">
 Version 1.0<br>
-Last updated 2016-10-19 10:23:07 PDT
+Last updated 2016-12-12 13:03:08 PST
 </div>
 </div>
 </body>
 </html>

Index: docs/manual/reference.txt
==================================================================
--- docs/manual/reference.txt
+++ docs/manual/reference.txt
@@ -1,11 +1,56 @@
 
 Reference
 ---------
 
-Megatest Config File Settings
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Config File Helpers
+~~~~~~~~~~~~~~~~~~~
+
+Various helpers for more advanced config files.
+
+.Helpers
+[width="80%",cols="^,2m,2m,2m",frame="topbot",options="header"]
+|======================
+|Helper                      | Purpose                       | Valid values            | Comments
+| #{scheme (scheme code...)} | Execute arbitrary scheme code | Any valid scheme        | Value returned from the call is converted to a string and processed as part of the config file
+| #{system command}          | Execute program, inserts exit code  | Any valid Unix command  | Discards the output from the program
+| #{shell  command} or #{sh ...}  | Execute program, inserts result from stdout | Any valid Unix command | Value returned from the call is converted to a string and processed as part of the config file
+| #{realpath path} or #{rp ...}   | Replace with normalized path | Must be a valid path |
+| #{getenv VAR} or #{gv VAR}      | Replace with content of env variable | Must be a valid var |
+| #{get s v} or #{g s v}     | Replace with variable v from section s | Variable must be defined before use |
+| #{rget v}                  | Replace with variable v from target or default of runconfigs file | |
+| #{mtrah}                   | Replace with the path to the megatest testsuite area | | 
+|======================
+
+Config File Settings
+~~~~~~~~~~~~~~~~~~~~
+
+Settings in megatest.config
+
+Config File Additional Features
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Including output from a script as if it were inline in the config file:
+
+-------------------------
+[scriptinc myscript.sh]
+-------------------------
+
+If the script outputs:
+
+-------------------------
+[items]
+A a b c
+B d e f
+-------------------------
+
+Then the config file would effectively appear to contain an items section
+exactly like the output from the script. This is extremely useful when
+dynamically creating items, itemstables and other config structures. You can
+see the expansion of the call by looking in the cached files (look in your
+linktree for megatest.config and runconfigs.config cache files and in your
+test run areas for the expanded and cached testconfig).
 
 Disk Space Checks
 ^^^^^^^^^^^^^^^^^
 
 Some parameters you can put in the [setup] section of megatest.config:

Index: http-transport.scm
==================================================================
--- http-transport.scm
+++ http-transport.scm
@@ -217,11 +217,11 @@
   (let* ((fullurl    (if (vector? serverdat)
 			 (http-transport:server-dat-get-api-req serverdat)
 			 (begin
 			   (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info")
 			   (exit 1))))
-	 (res        #f)
+	 (res        (vector #f "uninitialized"))
 	 (success    #t)
 	 (sparams    (db:obj->string params transport: 'http)))
        (debug:print-info 11 *default-log-port* "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n")
        ;; set up the http-client here
        (max-retry-attempts 1)
@@ -383,30 +383,34 @@
 	 (server-going  #f))
     (let loop ((count         0)
 	       (server-state 'available)
 	       (bad-sync-count 0)
 	       (start-time     (current-milliseconds)))
-
+      ;;(BB> "http-transport: top of loop; count="count" server-state="server-state" bad-sync-count="bad-sync-count" server-going="server-going)
       ;; Use this opportunity to sync the tmp db to megatest.db
       (if (not server-going) ;; *dbstruct-db* 
 	    ;; Removed code is pasted below (keeping it around until we are clear it is not needed).
 	    ;; no *dbstruct-db* yet, set running after our first pass through and start the db
 	    (if (eq? server-state 'available)
 		(let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers
 		  (if (equal? new-server-id server-id)
 		      (begin
 			(tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
+                        ;;(BB> "http-transport: ->dbprep")
 			(thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access
 			(set! *dbstruct-db*  (db:setup)) ;;  run-id))
 			(set! server-going #t)
 			(tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")
-			(server:write-dotserver *toppath* (conc iface ":" port))
-			(delete-file* (conc *toppath* "/.starting-server")))
+                        ;;(BB> "http-transport: ->running")
+			(server:write-dotserver *toppath* iface port (current-process-id) 'http)
+                        (thread-start! *watchdog*)
+                        (server:complete-attempt *toppath*))
 		      (begin ;; gotta exit nicely
+                        ;;(BB> "http-transport: ->collision")
 			(tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision")
 			(http-transport:server-shutdown server-id port))))))
-
+      
       ;; when things go wrong we don't want to be doing the various queries too often
       ;; so we strive to run this stuff only every four seconds or so.
       (let* ((sync-time (- (current-milliseconds) start-time))
 	    (rem-time  (quotient (- 4000 sync-time) 1000)))
 	(if (and (<= rem-time 4)
@@ -424,11 +428,12 @@
       (if (or (not (equal? sdat (list iface port)))
 	      (not server-id))
 	  (begin 
 	    (debug:print-info 0 *default-log-port* "interface changed, refreshing iface and port info")
 	    (set! iface (car sdat))
-	    (set! port  (cadr sdat))))
+	    (set! port  (cadr sdat))
+            (server:write-dotserver *toppath* iface port (current-process-id) 'http)))
       
       ;; Transfer *db-last-access* to last-access to use in checking that we are still alive
       (mutex-lock! *heartbeat-mutex*)
       (set! last-access *db-last-access*)
       (mutex-unlock! *heartbeat-mutex*)
@@ -443,25 +448,30 @@
 	     (adjusted-timeout (if (> hrs-since-start 1)
 				   (- server-timeout (inexact->exact (round (* hrs-since-start 60))))  ;; subtract 60 seconds per hour
 				   server-timeout)))
 	(if (common:low-noise-print 120 "server timeout")
 	    (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout))
-	(if (and *server-run*
+	(cond
+         ((not (server:confirm-dotserver *toppath* iface port (current-process-id) 'http))
+          (debug:print-info 0 *default-log-port* "Server .server file does not exist or contents do not match.  Initiate server shutdown.")
+          (http-transport:server-shutdown server-id port))
+         ((and *server-run*
 		 (> (+ last-access server-timeout)
 		    (current-seconds)))
-	    (begin
-	      (if (common:low-noise-print 120 "server continuing")
-		  (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)))
-	      ;;
-	      ;; Consider implementing some smarts here to re-insert the record or kill self is
-	      ;; the db indicates so
-	      ;;
-	      ;; (if (tasks:server-am-i-the-server? tdb run-id)
-	      ;;     (tasks:server-set-state! tdb server-id "running"))
-	      ;;
-	      (loop 0 server-state bad-sync-count (current-milliseconds)))
-	    (http-transport:server-shutdown server-id port))))))
+          (if (common:low-noise-print 120 "server continuing")
+              (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)))
+          ;;
+          ;; Consider implementing some smarts here to re-insert the record or kill self if
+          ;; the db indicates so
+          ;;
+          ;; (if (tasks:server-am-i-the-server? tdb run-id)
+          ;;     (tasks:server-set-state! tdb server-id "running"))
+          ;;
+          (loop 0 server-state bad-sync-count (current-milliseconds)))
+         (else
+          (debug:print-info 0 *default-log-port* "Server timeed out. seconds since last db access: " (- (current-seconds) last-access))
+          (http-transport:server-shutdown server-id port)))))))
 
 ;; code cut out from above
 ;;
 ;; (condition-case
 ;;  ;; (if (and (member (mutex-state *db-sync-mutex*) '(abandoned not-abandoned))
@@ -486,11 +496,12 @@
 ;; 	(thread-sleep! rem-time)
 ;; 	(thread-sleep! 4))) ;; fallback for if the math is changed ...
 
 (define (http-transport:server-shutdown server-id port)
   (let ((tdbdat (tasks:open-db)))
-    (debug:print-info 0 *default-log-port* "Starting to shutdown the server.")
+    ;;(BB> "http-transport:server-shutdown called")
+    (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id))
     ;;
     ;; start_shutdown
     ;;
     (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down")
     (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up
@@ -513,21 +524,19 @@
 		      " ms")
     (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting")
     (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete")
     ;; if the .server file contained :myport then we can remove it
     (server:remove-dotserver-file *toppath* port)
+    ;;(BB> "http-transport:server-shutdown -> exit")
     (exit)))
 
 ;; all routes though here end in exit ...
 ;;
 ;; start_server? 
 ;;
 (define (http-transport:launch run-id)
-  (with-output-to-file
-      (conc *toppath* "/.starting-server")
-    (lambda ()
-      (print (current-process-id) " on " (get-host-name))))
+  (server:attempting-start *toppath*)
   (let* ((tdbdat (tasks:open-db)))
     (set! *run-id*   run-id)
     (if (args:get-arg "-daemonize")
 	(begin
 	  (daemon:ize)
@@ -539,11 +548,11 @@
              (server:check-if-running run-id))
 	(begin
 	  (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running")
 	  (exit 0))
 	(begin ;; ok, no server detected, clean out any lingering records
-	   (tasks:server-force-clean-running-records-for-run-id  (db:delay-if-busy tdbdat) run-id "notresponding")))
+          (tasks:server-force-clean-running-records-for-run-id  (db:delay-if-busy tdbdat) run-id "notresponding")))
     (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
 	       (remtries  4))
       (if (not server-id)
 	  (if (> remtries 0)
 	      (begin
@@ -552,11 +561,11 @@
 		      (- remtries 1)))
 	      (begin
 		;; since we didn't get the server lock we are going to clean up and bail out
 		(debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
 		(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch")
-		(delete-file* (conc *toppath* "/.starting-server"))
+                (server:complete-attempt *toppath*)
 		))
 	  (let* ((th2 (make-thread (lambda ()
 				     (debug:print-info 0 *default-log-port* "Server run thread started")
 				     (http-transport:run 
 				      (if (args:get-arg "-server")

DELETED inteldate.scm
Index: inteldate.scm
==================================================================
--- inteldate.scm
+++ /dev/null
@@ -1,180 +0,0 @@
-(use srfi-19)
-(use test)
-(use format)
-(use regex)
-(declare (unit inteldate))
-;; utility procedures to convert among
-;; different ways to express date (inteldate, seconds since epoch, isodate)
-;;
-;; samples:
-;; isodate   -> "2016-01-01"
-;; inteldate -> "16ww01.5"
-;; seconds   -> 1451631600
-
-;; procedures provided:
-;; ====================
-;; seconds->isodate
-;; seconds->inteldate
-;;
-;; isodate->seconds
-;; isodate->inteldate
-;;
-;; inteldate->seconds
-;; inteldate->isodate
-
-;; srfi-19 used extensively; this doc is better tha the eggref:
-;; http://srfi.schemers.org/srfi-19/srfi-19.html
-
-;; Author: brandon.j.barclay@intel.com 16ww18.6
-
-(define (date->seconds date)
-  (inexact->exact
-   (string->number
-    (date->string date "~s"))))
-
-(define (seconds->isodate seconds)
-  (let* ((date (seconds->date seconds))
-         (result (date->string date "~Y-~m-~d")))
-    result))
-
-(define (isodate->seconds isodate)
-  "Takes a string input of the form 'YY-MM-DD' or 'YYYY-MM-DD' and returns epoch time; for YY, assume after Y2K"
-  (let* ((numlist (map string->number (string-split isodate "-")))
-        (raw-year (car numlist))
-        (year (if (< raw-year 100) (+ raw-year 2000) raw-year))
-        (month (list-ref numlist 1))
-        (day (list-ref numlist 2))
-        (date (make-date 0 0 0 0 day month year))
-        (seconds (date->seconds date)))
-
-    seconds))
-
-;; adapted from perl Intel::WorkWeek perl module
-;; intel year consists of numbered weeks starting from week 1
-;;   week 1 is the week containing jan 1 of the year
-;;   days of week are numbered starting from 0 on sunday
-;;   intel year does not match calendar year in workweek 1
-;;     before jan1.
-(define (seconds->inteldate-values seconds)
-  (define (date-difference->seconds d1 d2)
-    (- (date->seconds d1) (date->seconds d2)))
-
-  (let* ((thisdate (seconds->date seconds))
-         (thisdow (string->number (date->string thisdate "~w")))
-
-         (year (date-year thisdate))
-         ;; intel workweek 1 begins on sunday of week containing jan1
-         (jan1 (make-date 0 0 0 0 1 1 year))
-         (jan1dow (date-week-day jan1))
-         (ww01 (date-subtract-duration jan1 (seconds->time (* 60 60 24 jan1dow))))
-
-         (ww01_delta_seconds (date-difference->seconds thisdate ww01))
-         (wwnum_initial (inexact->exact (add1 (floor (/ ww01_delta_seconds 24 3600 7) ))))
-         
-         ;; we could be in ww1 of next year
-         (this-saturday (seconds->date
-                         (+ seconds
-                            (* 60 60 24 (- 6 thisdow)))))
-         (this-week-ends-next-year?
-          (> (date-year this-saturday) year))
-         (intelyear
-          (if this-week-ends-next-year?
-              (add1 year)
-              year))
-         (intelweek
-          (if this-week-ends-next-year?
-              1
-              wwnum_initial)))
-   (values intelyear intelweek thisdow)))
-
-(define (seconds->inteldate seconds)
-  (define (string-leftpad in width pad-char)
-    (let* ((unpadded-str (->string in))
-           (padlen_temp (- width (string-length unpadded-str)))
-           (padlen (if (< padlen_temp 0) 0 padlen_temp))
-           (padding
-            (fold conc ""
-                  (map (lambda (x) (->string pad-char)) (iota padlen)))))
-      (conc padding unpadded-str)))
-  (define (zeropad num width)
-    (string-leftpad num width #:0))
-
-  (let-values (((intelyear intelweek day-of-week-num)
-                (seconds->inteldate-values seconds)))
-    (let ((intelyear-str
-           (zeropad
-            (->string
-             (if (> intelyear 1999)
-                 (- intelyear 2000) intelyear))
-            2))
-          (intelweek-str
-           (zeropad (->string intelweek) 2))
-          (dow-str (->string day-of-week-num)))
-      (conc intelyear-str "ww" intelweek-str "." dow-str))))
-
-(define (isodate->inteldate isodate)
-  (seconds->inteldate
-   (isodate->seconds isodate)))
-
-(define (inteldate->seconds inteldate)
-  (let ((match (string-match "^(\\d+)ww(\\d+).(\\d)$" inteldate)))
-    (if
-     (not match)
-     #f
-     (let* (
-            (intelyear-raw (string->number (list-ref match 1)))
-            (intelyear (if (< intelyear-raw 100)
-                           (+ intelyear-raw 2000)
-                           intelyear-raw))
-            (intelww (string->number (list-ref match 2)))
-            (dayofweek (string->number (list-ref match 3)))
-
-            (day-of-seconds (* 60 60 24 ))
-            (week-of-seconds (* day-of-seconds 7))
-            
-
-            ;; get seconds at ww1.0
-            (new-years-date (make-date 0 0 0 0 1 1 intelyear))
-            (new-years-seconds
-             (date->seconds new-years-date))
-            (new-years-dayofweek (date-week-day new-years-date))
-            (ww1.0_seconds (- new-years-seconds
-                              (* day-of-seconds
-                                 new-years-dayofweek)))
-            (workweek-adjustment (* week-of-seconds (sub1 intelww)))
-            (weekday-adjustment (* dayofweek day-of-seconds))
-
-            (result (+ ww1.0_seconds workweek-adjustment weekday-adjustment)))
-       result))))
-
-(define (inteldate->isodate inteldate)
-  (seconds->isodate (inteldate->seconds inteldate)))
-
-(define (inteldate-tests)
-  (test-group
-   "date conversion tests"
-   (let ((test-table
-          '(("16ww01.5" . "2016-01-01")
-            ("16ww18.5" . "2016-04-29")
-            ("1999ww33.5" . "1999-08-13")
-            ("16ww18.4" . "2016-04-28")
-            ("16ww18.3" . "2016-04-27")
-            ("13ww01.0" . "2012-12-30")
-            ("13ww52.6" . "2013-12-28")
-            ("16ww53.3" . "2016-12-28"))))
-     (for-each
-      (lambda (test-pair)
-        (let ((inteldate (car test-pair))
-              (isodate (cdr test-pair)))
-          (test
-           (conc "(isodate->inteldate "isodate ") => "inteldate)
-           inteldate
-           (isodate->inteldate isodate))
-          
-          (test
-           (conc "(inteldate->isodate "inteldate ")   => "isodate)
-           isodate
-           (inteldate->isodate inteldate))))
-      test-table))))
-
-;(inteldate-tests)

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -122,10 +122,22 @@
     (call-with-environment-variables 
      (list (cons "PATH" (conc (get-environment-variable "PATH") ":.")))
      (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1")
        (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 
 	      (pid (process-run "/bin/bash" (list "-c" cmd))))
+
+         (with-output-to-file "Makefile.ezsteps"
+           (lambda ()
+             (print stepname ".log :")
+             (print "\t" cmd)
+             (if (file-exists? (conc stepname ".logpro"))
+                 (print "\tlogpro " stepname ".logpro " stepname ".html < " stepname ".log"))
+             (print)
+             (print stepname " : " stepname ".log")
+             (print))
+           #:append)
+
 	 (rmt:test-set-top-process-pid run-id test-id pid)
 	 (let processloop ((i 0))
 	   (let-values (((pid-val exit-status exit-code)(process-wait pid #t)))
 		       (mutex-lock! m)
 		       (launch:einf-pid-set!         exit-info pid)         ;; (vector-set! exit-info 0 pid)
@@ -268,11 +280,11 @@
   ;; do all the ezsteps (if any)
   (if ezsteps
       (let* ((testconfig ;; (read-config (conc work-area "/testconfig") #f #t environ-patt: "pre-launch-env-vars")) ;; FIXME??? is allow-system ok here?
 	      ;; NOTE: it is tempting to turn off force-create of testconfig but dynamic
 	      ;;       ezstep names need a full re-eval here.
-	      (tests:get-testconfig test-name tconfigreg #t force-create: #t)) ;; 'return-procs)))
+	      (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs)))
 	     (ezstepslst (if (hash-table? testconfig)
 			     (hash-table-ref/default testconfig "ezsteps" '())
 			     #f)))
 	(if testconfig
 	    (hash-table-set! *testconfigs* test-name testconfig) ;; cached for lazy reads later ...
@@ -316,15 +328,15 @@
 	 (kill-tries 0))
     ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
     ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
     (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
     (let loop ((minutes   (calc-minutes))
-	       (cpu-load  (get-cpu-load))
+	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
 	       (disk-free (get-df (current-directory))))
-      (let ((new-cpu-load (let* ((load  (get-cpu-load))
+      (let ((new-cpu-load (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
 				 (delta (abs (- load cpu-load))))
-			    (if (> delta 0.6) ;; don't bother updating with small changes
+			    (if (> delta 0.1) ;; don't bother updating with small changes
 				load
 				#f)))
 	    (new-disk-free (let* ((df    (get-df (current-directory)))
 				  (delta (abs (- df disk-free))))
 			     (if (> delta 200) ;; ignore changes under 200 Meg
@@ -846,11 +858,14 @@
 	     (directory-exists? *toppath*))
 	(begin
 	  (setenv "MT_RUN_AREA_HOME" *toppath*)
 	  (setenv "MT_TESTSUITE_NAME" (common:get-testsuite-name)))
 	(begin
-	  (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.")))
+	  (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.")
+          ;;(exit 1)
+          #f
+          ))
     *toppath*))
 
 (define (get-best-disk confdat testconfig)
   (let* ((disks   (or (and testconfig (hash-table-ref/default testconfig "disks" #f))
 		      (hash-table-ref/default confdat "disks" #f)))
@@ -861,11 +876,11 @@
 	  (if res
 	      (cdr res)
 	      (begin
 		(if (common:low-noise-print 20 "No valid disks or no disk with enough space")
 		    (debug:print-error 0 *default-log-port* "No valid disks found in megatest.config. Please add some to your [disks] section and ensure the directory exists and has enough space!\n    You can change minspace in the [setup] section of megatest.config. Current setting is: " minspace))
-		(exit 1)))))))
+		(exit 1))))))) ;; TODO - move the exit to the calling location and return #f
 
 ;; Desired directory structure:
 ;;
 ;;  <linkdir> - <target> - <testname> -.
 ;;                                     |
@@ -1053,198 +1068,201 @@
 ;; 4. remotely run the test on allocated host
 ;;    - could be ssh to host from hosts table (update regularly with load)
 ;;    - could be netbatch
 ;;      (launch-test db (cadr status) test-conf))
 (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params)
-  (let loop ((delta        (- (current-seconds) *last-launch*))
-	     (launch-delay (string->number (or (configf:lookup *configdat* "setup" "launch-delay") "5"))))
-    (if (> launch-delay delta)
-	(begin
-	  (debug:print-info 0 *default-log-port* "Delaying launch of " test-name " for " (- launch-delay delta) " seconds")
-	  (thread-sleep! (- launch-delay delta))
-	  (loop (- (current-seconds) *last-launch*) launch-delay))))
-  (set! *last-launch* (current-seconds))
-  (change-directory *toppath*)
-  (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute"
-   (list ;; (list "MT_TEST_RUN_DIR" work-area)
-    (list "MT_RUN_AREA_HOME" *toppath*)
-    (list "MT_TEST_NAME" test-name)
-    ;; (list "MT_ITEM_INFO" (conc itemdat)) 
-    (list "MT_RUNNAME"   runname)
-    ;; (list "MT_TARGET"    mt_target)
-    ))
-  (let* ((tregistry       (tests:get-all))
-	 (item-path       (let ((ip (item-list->path itemdat)))
-			    (alist->env-vars (list (list "MT_ITEMPATH" ip)))
-			    ip))
-	 (tconfig         (or (tests:get-testconfig test-name tregistry #t force-create: #t)
-			      test-conf)) ;; force re-read now that all vars are set
-	 (useshell        (let ((ush (config-lookup *configdat* "jobtools"     "useshell")))
-			    (if ush 
-				(if (equal? ush "no") ;; must use "no" to NOT use shell
-				    #f
-				    ush)
-				#t)))     ;; default is yes
-	 (runscript       (config-lookup tconfig   "setup"        "runscript"))
-	 (ezsteps         (> (length (hash-table-ref/default tconfig "ezsteps" '())) 0)) ;; don't send all the steps, could be big
-	 (diskspace       (config-lookup tconfig   "requirements" "diskspace"))
-	 (memory          (config-lookup tconfig   "requirements" "memory"))
-	 (hosts           (config-lookup *configdat* "jobtools"     "workhosts"))
-	 (remote-megatest (config-lookup *configdat* "setup" "executable"))
-	 (run-time-limit  (or (configf:lookup  tconfig   "requirements" "runtimelim")
-			      (configf:lookup  *configdat* "setup" "runtimelim")))
-	 ;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to 
-	 ;;                allow running from dashboard. Extract the path
-	 ;;                from the called megatest and convert dashboard
-	 ;;             	  or dboard to megatest
-	 (local-megatest  (let* ((lm  (car (argv)))
-				 (dir (pathname-directory lm))
-				 (exe (pathname-strip-directory lm)))
-			    (conc (if dir (conc dir "/") "")
-				  (case (string->symbol exe)
-				    ((dboard)    "../megatest")
-				    ((mtest)     "../megatest")
-				    ((dashboard) "megatest")
-				    (else exe)))))
-	 (launcher        (common:get-launcher *configdat* test-name item-path)) ;; (config-lookup *configdat* "jobtools"     "launcher"))
-	 (test-sig   (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path
-	 (work-area  #f)
-	 (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all
-	 (diskpath   #f)
-	 (cmdparms   #f)
-	 (fullcmd    #f) ;; (define a (with-output-to-string (lambda ()(write x))))
-	 (mt-bindir-path #f)
-	 (testinfo   (rmt:get-test-info-by-id run-id test-id))
-	 (mt_target  (string-intersperse (map cadr keyvals) "/"))
-	 (debug-param (append (if (args:get-arg "-debug")  (list "-debug" (args:get-arg "-debug")) '())
-			      (if (args:get-arg "-logging")(list "-logging") '()))))
-
-    (setenv "MT_ITEMPATH" item-path)
-    (if hosts (set! hosts (string-split hosts)))
-    ;; set the megatest to be called on the remote host
-    (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest"))
-    (set! mt-bindir-path (pathname-directory remote-megatest))
-    (if launcher (set! launcher (string-split launcher)))
-    ;; set up the run work area for this test
-    (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run
-	     (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir
-	(begin
-	  (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path)
-	  (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record
-
-    ;; prevent overlapping actions - set to LAUNCHED as early as possible
-    ;;
-    ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail
-    (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED"))
-    (rmt:roll-up-pass-fail-counts run-id test-name item-path #f "LAUNCHED" #f)
-    (set! diskpath (get-best-disk *configdat* tconfig))
-    (if diskpath
-	(let ((dat  (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat)))
-	  (set! work-area (car dat))
-	  (set! toptest-work-area (cadr dat))
-	  (debug:print-info 2 *default-log-port* "Using work area " work-area))
-	(begin
-	  (set! work-area (conc test-path "/tmp_run"))
-	  (create-directory work-area #t)
-	  (debug:print 0 *default-log-port* "WARNING: No disk work area specified - running in the test directory under tmp_run")))
-    (set! cmdparms (base64:base64-encode 
-		    (z3:encode-buffer 
-		     (with-output-to-string
-		       (lambda () ;; (list 'hosts     hosts)
-			 (write (list (list 'testpath  test-path)
-				      (list 'transport (conc *transport-type*))
-				      ;; (list 'serverinf *server-info*)
-				      (list 'toppath   *toppath*)
-				      (list 'work-area work-area)
-				      (list 'test-name test-name) 
-				      (list 'runscript runscript) 
-				      (list 'run-id    run-id   )
-				      (list 'test-id   test-id  )
-				      ;; (list 'item-path item-path )
-				      (list 'itemdat   itemdat  )
-				      (list 'megatest  remote-megatest)
-				      (list 'ezsteps   ezsteps) 
-				      (list 'target    mt_target)
-				      (list 'runtlim   (if run-time-limit (common:hms-string->seconds run-time-limit) #f))
-				      (list 'env-ovrd  (hash-table-ref/default *configdat* "env-override" '())) 
-				      (list 'set-vars  (if params (hash-table-ref/default params "-setvars" #f)))
-				      (list 'runname   runname)
-				      (list 'mt-bindir-path mt-bindir-path))))))))
-
-    ;; clean out step records from previous run if they exist
-    ;; (rmt:delete-test-step-records run-id test-id)
-    ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway
-    (if (file-exists? work-area)
-	(change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir
-    (cond
-     ((and launcher hosts) ;; must be using ssh hostname
-      (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param)))
-     ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms))))
-     (launcher
-      (set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param)))
-     ;; (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms))))
-     (else
-      (if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section"))
-      (set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" ""))))))
-    ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" "")))))
-    (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm"))))
-    (debug:print 1 *default-log-port* "Launching " work-area)
-    ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done
-    (debug:print 4 *default-log-port* "fullcmd: " fullcmd)
-    (let* ((commonprevvals (alist->env-vars
-			    (hash-table-ref/default *configdat* "env-override" '())))
-	   (testprevvals   (alist->env-vars
-			    (hash-table-ref/default tconfig "pre-launch-env-overrides" '())))
-	   (miscprevvals   (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute"
-			    (append (list (list "MT_TEST_RUN_DIR" work-area)
-					  (list "MT_TEST_NAME" test-name)
-					  (list "MT_ITEM_INFO" (conc itemdat)) 
-					  (list "MT_RUNNAME"   runname)
-					  (list "MT_TARGET"    mt_target)
-					  (list "MT_ITEMPATH"  item-path)
-					  )
-				    itemdat)))
-	   ;; Launchwait defaults to true, must override it to turn off wait
-	   (launchwait     (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t))
-	   (launch-results (apply (if launchwait
-				      process:cmd-run-with-stderr->list
-				      process-run)
-				  (if useshell
-				      (let ((cmdstr (string-intersperse fullcmd " ")))
-					(if launchwait
-					    cmdstr
-					    (conc cmdstr " >> mt_launch.log 2>&1")))
-				      (car fullcmd))
-				  (if useshell
-				      '()
-				      (cdr fullcmd)))))
-      (if (not launchwait) ;; give the OS a little time to allow the process to start
-	  (thread-sleep! 0.01))
-      (with-output-to-file "mt_launch.log"
-	(lambda ()
-	  (print "LAUNCHCMD: " (string-intersperse fullcmd " "))
-	  (if (list? launch-results)
-	      (apply print launch-results)
-	      (print "NOTE: launched \"" fullcmd "\"\n  but did not wait for it to proceed. Add the following to megatest.config \n[setup]\nlaunchwait yes\n  if you have problems with this"))
-	  #:append))
-      (debug:print 2 *default-log-port* "Launching completed, updating db")
-      (debug:print 2 *default-log-port* "Launch results: " launch-results)
-      (if (not launch-results)
-          (begin
-            (print "ERROR: Failed to run " (string-intersperse fullcmd " ") ", exiting now")
-            ;; (sqlite3:finalize! db)
-            ;; good ole "exit" seems not to work
-            ;; (_exit 9)
-            ;; but this hack will work! Thanks go to Alan Post of the Chicken email list
-            ;; NB// Is this still needed? Should be safe to go back to "exit" now?
-            (process-signal (current-process-id) signal/kill)
-            ))
-      (alist->env-vars miscprevvals)
-      (alist->env-vars testprevvals)
-      (alist->env-vars commonprevvals)
-      launch-results))
-  (change-directory *toppath*))
+  (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex
+  (let* ((item-path       (item-list->path itemdat)))
+    (let loop ((delta        (- (current-seconds) *last-launch*))
+	       (launch-delay (string->number (or (configf:lookup *configdat* "setup" "launch-delay") "5"))))
+      (if (> launch-delay delta)
+	  (begin
+	    (debug:print-info 0 *default-log-port* "Delaying launch of " test-name " for " (- launch-delay delta) " seconds")
+	    (thread-sleep! (- launch-delay delta))
+	    (loop (- (current-seconds) *last-launch*) launch-delay))))
+    (change-directory *toppath*)
+    (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars)
+     (append
+      (list
+       (list "MT_RUN_AREA_HOME" *toppath*)
+       (list "MT_TEST_NAME" test-name)
+       (list "MT_RUNNAME"   runname)
+       (list "MT_ITEMPATH"  item-path)
+       )
+      itemdat))
+    (let* ((tregistry       (tests:get-all)) ;; third param (below) is system-allowed
+           ;; for tconfig, why do we allow fallback to test-conf?
+	   (tconfig         (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t)
+				(begin
+                                  (debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. This is likely not desired.")
+                                  test-conf))) ;; force re-read now that all vars are set
+	   (useshell        (let ((ush (config-lookup *configdat* "jobtools"     "useshell")))
+			      (if ush 
+				  (if (equal? ush "no") ;; must use "no" to NOT use shell
+				      #f
+				      ush)
+				  #t)))     ;; default is yes
+	   (runscript       (config-lookup tconfig   "setup"        "runscript"))
+	   (ezsteps         (> (length (hash-table-ref/default tconfig "ezsteps" '())) 0)) ;; don't send all the steps, could be big
+	   ;; (diskspace       (config-lookup tconfig   "requirements" "diskspace"))
+	   ;; (memory          (config-lookup tconfig   "requirements" "memory"))
+	   ;; (hosts           (config-lookup *configdat* "jobtools"     "workhosts")) ;; I'm pretty sure this was never completed
+	   (remote-megatest (config-lookup *configdat* "setup" "executable"))
+	   (run-time-limit  (or (configf:lookup  tconfig   "requirements" "runtimelim")
+				(configf:lookup  *configdat* "setup" "runtimelim")))
+	   ;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to 
+	   ;;                allow running from dashboard. Extract the path
+	   ;;                from the called megatest and convert dashboard
+	   ;;             	  or dboard to megatest
+	   (local-megatest  (let* ((lm  (car (argv)))
+				   (dir (pathname-directory lm))
+				   (exe (pathname-strip-directory lm)))
+			      (conc (if dir (conc dir "/") "")
+				    (case (string->symbol exe)
+				      ((dboard)    "../megatest")
+				      ((mtest)     "../megatest")
+				      ((dashboard) "megatest")
+				      (else exe)))))
+	   (launcher        (common:get-launcher *configdat* test-name item-path)) ;; (config-lookup *configdat* "jobtools"     "launcher"))
+	   (test-sig   (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path
+	   (work-area  #f)
+	   (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all
+	   (diskpath   #f)
+	   (cmdparms   #f)
+	   (fullcmd    #f) ;; (define a (with-output-to-string (lambda ()(write x))))
+	   (mt-bindir-path #f)
+	   (testinfo   (rmt:get-test-info-by-id run-id test-id))
+	   (mt_target  (string-intersperse (map cadr keyvals) "/"))
+	   (debug-param (append (if (args:get-arg "-debug")  (list "-debug" (args:get-arg "-debug")) '())
+				(if (args:get-arg "-logging")(list "-logging") '()))))
+      ;; (if hosts (set! hosts (string-split hosts)))
+      ;; set the megatest to be called on the remote host
+      (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest"))
+      (set! mt-bindir-path (pathname-directory remote-megatest))
+      (if launcher (set! launcher (string-split launcher)))
+      ;; set up the run work area for this test
+      (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run
+	       (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir
+	  (begin
+	    (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path)
+	    (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record
+      
+      ;; prevent overlapping actions - set to LAUNCHED as early as possible
+      ;;
+      ;; The following call handles waiver propagation; it cannot yet be condensed into roll-up-pass-fail.
+      (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED"))
+      (rmt:roll-up-pass-fail-counts run-id test-name item-path #f "LAUNCHED" #f)
+      ;; (pp (hash-table->alist tconfig))
+      (set! diskpath (get-best-disk *configdat* tconfig))
+      (if diskpath
+	  (let ((dat  (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat)))
+	    (set! work-area (car dat))
+	    (set! toptest-work-area (cadr dat))
+	    (debug:print-info 2 *default-log-port* "Using work area " work-area))
+	  (begin
+	    (set! work-area (conc test-path "/tmp_run"))
+	    (create-directory work-area #t)
+	    (debug:print 0 *default-log-port* "WARNING: No disk work area specified - running in the test directory under tmp_run")))
+      (set! cmdparms (base64:base64-encode 
+		      (z3:encode-buffer 
+		       (with-output-to-string
+			 (lambda () ;; (list 'hosts     hosts)
+			   (write (list (list 'testpath  test-path)
+					(list 'transport (conc *transport-type*))
+					;; (list 'serverinf *server-info*)
+					(list 'toppath   *toppath*)
+					(list 'work-area work-area)
+					(list 'test-name test-name) 
+					(list 'runscript runscript) 
+					(list 'run-id    run-id   )
+					(list 'test-id   test-id  )
+					;; (list 'item-path item-path )
+					(list 'itemdat   itemdat  )
+					(list 'megatest  remote-megatest)
+					(list 'ezsteps   ezsteps) 
+					(list 'target    mt_target)
+					(list 'runtlim   (if run-time-limit (common:hms-string->seconds run-time-limit) #f))
+					(list 'env-ovrd  (hash-table-ref/default *configdat* "env-override" '())) 
+					(list 'set-vars  (if params (hash-table-ref/default params "-setvars" #f)))
+					(list 'runname   runname)
+					(list 'mt-bindir-path mt-bindir-path))))))))
+      
+      ;; clean out step records from previous run if they exist
+      ;; (rmt:delete-test-step-records run-id test-id)
+      ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway
+      (if (file-exists? work-area)
+	  (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir
+      (cond
+       ;; ((and launcher hosts) ;; must be using ssh hostname
+       ;;    (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param)))
+       ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms))))
+       (launcher
+	(set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param)))
+       ;; (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms))))
+       (else
+	(if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section"))
+	(set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" ""))))))
+      ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" "")))))
+      (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm"))))
+      (debug:print 1 *default-log-port* "Launching " work-area)
+      ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the environment back when done
+      (debug:print 4 *default-log-port* "fullcmd: " fullcmd)
+      (set! *last-launch* (current-seconds)) ;; all that junk above takes time, set this as late as possible.
+      (let* ((commonprevvals (alist->env-vars
+			      (hash-table-ref/default *configdat* "env-override" '())))
+	     (miscprevvals   (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute"
+			      (append (list (list "MT_TEST_RUN_DIR" work-area)
+					    (list "MT_TEST_NAME" test-name)
+					    (list "MT_ITEM_INFO" (conc itemdat)) 
+					    (list "MT_RUNNAME"   runname)
+					    (list "MT_TARGET"    mt_target)
+					    (list "MT_ITEMPATH"  item-path)
+					    )
+				      itemdat)))
+	     (testprevvals   (alist->env-vars
+			      (hash-table-ref/default tconfig "pre-launch-env-overrides" '())))
+	     ;; Launchwait defaults to true, must override it to turn off wait
+	     (launchwait     (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t))
+	     (launch-results (apply (if launchwait
+					process:cmd-run-with-stderr->list
+					process-run)
+				    (if useshell
+					(let ((cmdstr (string-intersperse fullcmd " ")))
+					  (if launchwait
+					      cmdstr
+					      (conc cmdstr " >> mt_launch.log 2>&1")))
+					(car fullcmd))
+				    (if useshell
+					'()
+					(cdr fullcmd)))))
+        (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork.
+	(if (not launchwait) ;; give the OS a little time to allow the process to start
+	    (thread-sleep! 0.01))
+	(with-output-to-file "mt_launch.log"
+	  (lambda ()
+	    (print "LAUNCHCMD: " (string-intersperse fullcmd " "))
+	    (if (list? launch-results)
+		(apply print launch-results)
+		(print "NOTE: launched \"" fullcmd "\"\n  but did not wait for it to proceed. Add the following to megatest.config \n[setup]\nlaunchwait yes\n  if you have problems with this"))
+	    #:append))
+	(debug:print 2 *default-log-port* "Launching completed, updating db")
+	(debug:print 2 *default-log-port* "Launch results: " launch-results)
+	(if (not launch-results)
+	    (begin
+	      (print "ERROR: Failed to run " (string-intersperse fullcmd " ") ", exiting now")
+	      ;; (sqlite3:finalize! db)
+	      ;; good ole "exit" seems not to work
+	      ;; (_exit 9)
+	      ;; but this hack will work! Thanks go to Alan Post of the Chicken email list
+	      ;; NB// Is this still needed? Should be safe to go back to "exit" now?
+	      (process-signal (current-process-id) signal/kill)
+	      ))
+	(alist->env-vars miscprevvals)
+	(alist->env-vars testprevvals)
+	(alist->env-vars commonprevvals)
+	launch-results))
+    (change-directory *toppath*)))
 
 ;; recover a test where the top controlling mtest may have died
 ;;
 (define (launch:recover-test run-id test-id)
   ;; this function is called on the test run host via ssh

Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -1,7 +1,7 @@
 ;; Always use two or four digit decimal
 ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
 
 (declare (unit megatest-version))
 
-(define megatest-version 1.6208)
+(define megatest-version 1.6303)
 

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -93,10 +93,12 @@
   -reqtarg key1/key2/...  : run for key1, key2, etc. but key1/key2 must be in runconfig
   -testpatt patt1/patt2,patt3/...  : % is wildcard
   -runname                : required, name for this particular test run
   -state                  : Applies to runs, tests or steps depending on context
   -status                 : Applies to runs, tests or steps depending on context
+  -mode key               : load testpatt from <key> in runconfigs instead of default TESTPATT
+  -tagexpr tag1,tag2%,..  : select tests with tags matching expression
 
 Test helpers (for use inside tests)
   -step stepname
   -test-status            : set the state and status of a test (use :state and :status)
   -setlog logfname        : set the path/filename to the final log relative to the test
@@ -118,11 +120,11 @@
                             fields category,variable,value,comment
 
 Queries
   -list-runs patt         : list runs matching pattern \"patt\", % is the wildcard
   -show-keys              : show the keys used in this megatest setup
-  -test-files targpatt    : get the most recent test path/file matching targpatt e.g. %/%... 
+  -test-files targpatt    : get the most recent test path/file matching targpatt e.g. %/% or '*.log'
                             returns list sorted by age ascending, see examples below
   -test-paths             : get the test paths matching target, runname, item and test
                             patterns.
   -list-disks             : list the disks available for storing runs
   -list-targets           : list the targets in runconfigs.config
@@ -209,11 +211,13 @@
 			":state"  
 			"-state"
 			":status"
 			"-status"
 			"-list-runs"
-			"-testpatt" 
+			"-testpatt"
+                        "-mode"
+                        "-tagexpr"
 			"-itempatt"
 			"-setlog"
 			"-set-toplog"
 			"-runstep"
 			"-logpro"
@@ -345,11 +349,13 @@
 
 ;; The watchdog is to keep an eye on things like db sync etc.
 ;;
 (define *watchdog* (make-thread common:watchdog "Watchdog thread"))
 
-(thread-start! *watchdog*)
+(if (not (args:get-arg "-server"))
+    (thread-start! *watchdog*)) ;; not started here when -server is given: a server waits until it reaches running state before kicking off the watchdog
+;;(BB> "thread-start! watchdog")
 
 (if (args:get-arg "-log")
     (let ((oup (open-output-file (args:get-arg "-log"))))
       (debug:print-info 0 *default-log-port* "Sending log output to " (args:get-arg "-log"))
       (set! *default-log-port* oup)))
@@ -790,23 +796,24 @@
 ;;======================================================================
 ;; Weird special calls that need to run *after* the server has started?
 ;;======================================================================
 
 (if (args:get-arg "-list-targets")
-    (let ((targets (common:get-runconfig-targets)))
-      (debug:print 1 *default-log-port* "Found "(length targets) " targets")
-      (case (string->symbol (or (args:get-arg "-dumpmode") "alist"))
-	((alist)
-	 (for-each (lambda (x)
-		     ;; (print "[" x "]"))
-		     (print x))
-		   targets))
-	((json)
-	 (json-write targets))
-	(else
-	 (debug:print-error 0 *default-log-port* "dump output format " (args:get-arg "-dumpmode") " not supported for -list-targets")))
-      (set! *didsomething* #t)))
+    (if (launch:setup)
+        (let ((targets (common:get-runconfig-targets)))
+          (debug:print 1 *default-log-port* "Found "(length targets) " targets")
+          (case (string->symbol (or (args:get-arg "-dumpmode") "alist"))
+            ((alist)
+             (for-each (lambda (x)
+                         ;; (print "[" x "]"))
+                         (print x))
+                       targets))
+            ((json)
+             (json-write targets))
+            (else
+             (debug:print-error 0 *default-log-port* "dump output format " (args:get-arg "-dumpmode") " not supported for -list-targets")))
+          (set! *didsomething* #t))))
 
 ;; cache the runconfigs in $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME/.runconfig
 ;;
 (define (full-runconfigs-read)
 ;; in the envprocessing branch the below code replaces the further below code
@@ -1018,29 +1025,30 @@
 	       ;;  	        "%"))
 	       (keys        (rmt:get-keys)) ;; (db:get-keys dbstruct))
 	       ;; (runsdat  (db:get-runs dbstruct runpatt #f #f '()))
 	;; (runsdat     (rmt:get-runs-by-patt keys (or runpatt "%") (common:args-get-target) ;; (db:get-runs-by-patt dbstruct keys (or runpatt "%") (common:args-get-target)
 	;; 		           	 #f #f '("id" "runname" "state" "status" "owner" "event_time" "comment") 0))
-	       (runsdat     (db:dispatch-query access-mode rmt:get-runs-by-patt db:get-runs-by-patt keys (or runpatt "%") 
-                                            (common:args-get-target) #f #f '("id" "runname" "state" "status" "owner" "event_time" "comment") 0))
+	       (runsdat     (rmt:get-runs-by-patt keys (or runpatt "%") 
+                                                  (common:args-get-target) #f #f '("id" "runname" "state" "status" "owner" "event_time" "comment") 0))
 	       (runstmp     (db:get-rows runsdat))
 	       (header      (db:get-header runsdat))
 	       ;; this is "-since" support. This looks at last mod times of <run-id>.db files
 	       ;; and collects those modified since the -since time.
-	       (runs        (if (and (not (null? runstmp))
-				     (args:get-arg "-since"))
-				(let ((changed-ids (db:get-changed-run-ids (string->number (args:get-arg "-since")))))
-				  (let loop ((hed (car runstmp))
-					     (tal (cdr runstmp))
-					     (res '()))
-				    (let ((new-res (if (member (db:get-value-by-header hed header "id") changed-ids)
-						       (cons hed res)
-						       res)))
-				      (if (null? tal)
-					  (reverse new-res)
-					  (loop (car tal)(cdr tal) new-res)))))
-				runstmp))
+	       (runs        runstmp)
+                        ;; (if (and (not (null? runstmp))
+			;;        (args:get-arg "-since"))
+			;;   (let ((changed-ids (db:get-changed-run-ids (string->number (args:get-arg "-since")))))
+			;;     (let loop ((hed (car runstmp))
+			;;   	     (tal (cdr runstmp))
+			;;   	     (res '()))
+			;;       (let ((new-res (if (member (db:get-value-by-header hed header "id") changed-ids)
+			;;   		       (cons hed res)
+			;;   		       res)))
+			;;         (if (null? tal)
+			;;   	  (reverse new-res)
+			;;   	  (loop (car tal)(cdr tal) new-res)))))
+			;;   runstmp))
 	       (db-targets  (args:get-arg "-list-db-targets"))
 	       (seen        (make-hash-table))
 	       (dmode       (let ((d (args:get-arg "-dumpmode")))
 			      (if d (string->symbol d) #f)))
 	       (data        (make-hash-table))
@@ -1527,11 +1535,12 @@
 	  (let* ((keys     (rmt:get-keys))
 		 ;; db:test-get-paths must not be run remote
 		 (paths    (tests:test-get-paths-matching keys target (args:get-arg "-test-files"))))
 	    (set! *didsomething* #t)
 	    (for-each (lambda (path)
-			(print path))
+			(if (file-exists? path)
+			(print path)))	
 		      paths)))
 	;; else do a general-run-call
 	(general-run-call 
 	 "-test-files"
 	 "Get paths to test"
@@ -1825,11 +1834,12 @@
     (begin
       (if (not (launch:setup))
 	  (begin
 	    (debug:print 0 *default-log-port* "Failed to setup, exiting") 
 	    (exit 1)))
-      (common:cleanup-db)
+      (let ((dbstruct (db:setup *toppath*)))
+        (common:cleanup-db dbstruct))
       (set! *didsomething* #t)))
 
 (if (args:get-arg "-mark-incompletes")
     (begin
       (if (not (launch:setup))
@@ -1847,13 +1857,11 @@
     (begin
       (if (not (launch:setup))
 	  (begin
 	    (debug:print 0 *default-log-port* "Failed to setup, exiting") 
 	    (exit 1)))
-      ;; now can find our db
-      ;; keep this one local
-      (open-run-close runs:update-all-test_meta #f)
+      (runs:update-all-test_meta #f)
       (set! *didsomething* #t)))
 
 ;;======================================================================
 ;; Start a repl
 ;;======================================================================
@@ -1982,17 +1990,22 @@
 
 ;;======================================================================
 ;; Exit and clean up
 ;;======================================================================
 
-(if *runremote* (close-all-connections!)) ;; for http-client
-
 (if (not *didsomething*)
     (debug:print 0 *default-log-port* help))
+;;(BB> "thread-join! watchdog")
+
+;; Join the watchdog thread only if it has been thread-start!ed (it may not have been started in the case of a server that never enters running state); the states created and suspended are not joined.
+;;   NOTE(review): this join now runs before (set! *time-to-exit* #t) below - if common:watchdog only returns once that flag is set this could block; confirm.  thread-state symbols: created ready running blocked suspended sleeping terminated dead
+(if (thread? *watchdog*)
+    (case (thread-state *watchdog*)
+      ((ready running blocked sleeping terminated dead)
+       (thread-join! *watchdog*))))
 
 (set! *time-to-exit* #t)
-(thread-join! *watchdog*)
 
 (if (not (eq? *globalexitstatus* 0))
     (if (or (args:get-arg "-run")(args:get-arg "-runtests")(args:get-arg "-runall"))
         (begin
            (debug:print 0 *default-log-port* "NOTE: Subprocesses with non-zero exit code detected: " *globalexitstatus*)

ADDED   minimal/manyservers.sh
Index: minimal/manyservers.sh
==================================================================
--- /dev/null
+++ minimal/manyservers.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+
+echo manyservers.sh pid $$
+
+logdir=$PWD/log-manysrv
+
+
+function reset {  # delete the homehost/server marker files and links/.db/monitor.db (paths relative to the current directory); -f ignores files that do not exist
+    rm -f .homehost .server .server.lock links/.db/monitor.db .starting-server
+    }
+
+function launch_many_servers {
+    # $1 = count, $2 = logdir, $3 = log name prefix. Starts $1 "megatest -server" daemons in parallel,
+    # each given its own log file $2/$3-srv-<i>.log as the argument of -log. The values reach perl through
+    # @ARGV (not spliced into the perl source) and the names are NUL-delimited, so @, $ or blanks in a path stay literal.
+  perl -e 'my ($n, $dir, $pfx) = @ARGV; print "${dir}/${pfx}-srv-${_}.log\0" foreach 1 .. $n' "$1" "$2" "$3" | \
+     xargs -0 -P "$1" -n 1 megatest -server - -run-id 0 -daemonize -log
+}
+
+    
+function get_srv_pids {  # print the pid (ps column 2) of every "mtest -server" process whose command line contains $logdir
+    ps auwx | grep "mtest -server" | grep -F -- "$logdir" | grep -v grep | awk '{print $2}'
+}
+
+
+if [[ -e $logdir ]]; then rm -rf -- "$logdir"; fi  # recreate the log directory empty; quoted so a path containing blanks stays one argument
+if [[ ! -e $logdir ]]; then mkdir -- "$logdir"; fi
+
+reset  # remove server/homehost marker files before the first launch
+
+simultaneous_servers=20               # servers started per volley
+server_collision_resolution_delay=15  # seconds slept after each volley before counting survivors
+server_timeout_delay=65               # seconds slept before expecting the last server to have exited
+
+echo "Launching $simultaneous_servers simultaneous servers"
+launch_many_servers "$simultaneous_servers" "$logdir" "first"
+echo "Sleeping $server_collision_resolution_delay seconds to allow new servers to die because one is already running."
+sleep "$server_collision_resolution_delay"
+
+pids=$(get_srv_pids)
+pids_left=$(echo $pids | wc -w)  # $pids is deliberately unquoted: the pids are counted as words
+echo "pids_left=$pids_left"
+echo "after $server_collision_resolution_delay seconds: servers remaining=$pids_left; expecting 1"
+if [[ $pids_left == 1 ]]; then
+    echo "All servers but 1 terminated. Still good."
+else
+    if [[ $pids_left == 0 ]]; then
+        echo "All servers died too soon.  Not good. Aborting."
+        echo "TEST FAIL"
+        exit 1
+    else
+        echo "Too many servers left.  Not good.  Aborting."
+        echo "TEST FAIL"
+        echo $pids | xargs kill  # polite kill first, then kill -9 whatever is still listed after 5 seconds
+        sleep 5
+        pids=$(get_srv_pids)
+        pids_left=$(echo $pids | wc -w)
+        if [[ ! ( $pids_left == 0 ) ]]; then
+            echo $pids | xargs kill -9
+        fi
+        exit 1
+    fi
+fi
+
+
+
+echo "launching another volley of $simultaneous_servers.  THey should all perish. right away, leaving the one server running."
+launch_many_servers "$simultaneous_servers" "$logdir" "second"
+sleep "$server_collision_resolution_delay"
+
+pids=$(get_srv_pids)
+pids_left=$(echo $pids | wc -w)  # $pids is deliberately unquoted: the pids are counted as words
+echo "pids_left=$pids_left"
+echo "after $server_collision_resolution_delay seconds: servers remaining=$pids_left; expecting 1"
+if [[ $pids_left == 1 ]]; then
+    echo "All servers but 1 terminated. So far so good."
+else
+    if [[ $pids_left == 0 ]]; then
+        echo "All servers died too soon.  Not good. Aborting."
+        echo "TEST FAIL"
+        exit 1
+    else
+        echo "Too many servers left.  Not good.  Aborting."
+        echo "TEST FAIL"
+        echo $pids | xargs kill  # polite kill first, then kill -9 whatever is still listed after 5 seconds
+        sleep 5
+        pids=$(get_srv_pids)
+        pids_left=$(echo $pids | wc -w)
+        if [[ ! ( $pids_left == 0 ) ]]; then
+            echo $pids | xargs kill -9
+        fi
+        exit 1
+    fi
+fi
+
+
+
+echo "sleeping for awhile ($server_timeout_delay seconds) to let server exit on its own for no-request timeout"
+sleep $server_timeout_delay
+pids=$(get_srv_pids)
+pids_left=$(echo $pids | wc -w)
+echo "after $server_timeout_delay seconds: servers remaining=$pids_left; expecting 0"
+
+if [[ $pids_left != 0 ]]; then
+    echo "Too many servers left.  Not good.  Aborting."
+    echo "TEST FAIL"
+    echo $pids | xargs kill
+    sleep 5
+    pids=$(get_srv_pids)
+    pids_left=$(echo $pids | wc -w)
+    if [[ $pids_left != 0 ]]; then
+        echo $pids | xargs kill -9
+    fi
+    exit 1
+else
+    echo "No servers remain. This is good."
+    echo "TEST PASS"
+    exit 0
+fi

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -24,27 +24,20 @@
 
 ;; generate entries for ~/.megatestrc with the following
 ;;
 ;;  grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u
 
-(defstruct remote
-  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
-  (server-url        (if *toppath* (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*) #f))
-  (last-server-check 0)  ;; last time we checked to see if the server was alive
-  (conndat           #f)
-  (transport         *transport-type*)
-  (server-timeout    (or (server:get-timeout) 100))) ;; default to 100 seconds
-
 ;;======================================================================
 ;;  S U P P O R T   F U N C T I O N S
 ;;======================================================================
 
 ;; if a server is either running or in the process of starting call client:setup
 ;; else return #f to let the calling proc know that there is no server available
 ;;
-(define (rmt:get-connection-info run-id)
-  (let ((cinfo (remote-conndat *runremote*)))
+(define (rmt:get-connection-info areapath) ;; TODO: push areapath down.
+  (let ((cinfo (remote-conndat *runremote*))
+        (run-id 0))
     (if cinfo
 	cinfo
 	(if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id)
 	    (client:setup run-id)
 	    #f))))
@@ -94,56 +87,85 @@
      ((and (cdr (remote-hh-dat *runremote*))   ;; on homehost
            (member cmd api:read-only-queries)) ;; this is a read
       (mutex-unlock! *rmt-mutex*)
       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  3")
       (rmt:open-qry-close-locally cmd 0 params))
+
+     ;; on homehost and this is a write, we already have a server, but server has died
+     ((and (cdr (remote-hh-dat *runremote*))         ;; on homehost
+           (not (member cmd api:read-only-queries))  ;; this is a write
+           (remote-server-url *runremote*)           ;; have a server
+           (not (server:read-dotserver *toppath*)))  ;; server has died.
+      (set! *runremote* #f)
+      (mutex-unlock! *rmt-mutex*)
+      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
+      (rmt:send-receive cmd rid params attemptnum: attemptnum))
+
      ;; on homehost and this is a write, we already have a server
      ((and (cdr (remote-hh-dat *runremote*))         ;; on homehost
            (not (member cmd api:read-only-queries))  ;; this is a write
            (remote-server-url *runremote*))          ;; have a server
       (mutex-unlock! *rmt-mutex*)
       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4")
       (rmt:open-qry-close-locally cmd 0 params))
-     ;; on homehost and this is a write, we have a server (we know because case 4 checked)
-     ((and (cdr (remote-hh-dat *runremote*))         ;; on homehost
-	   (not (member cmd api:read-only-queries)))
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
-      (rmt:open-qry-close-locally cmd 0 params))
-     ;; no server contact made and this is a write, passively start a server 
-     ((and (not (remote-server-url *runremote*))
+
+     ;; commented by bb; this was blocking server passive start on write on homehost (case 5)
+     ;; ;; on homehost and this is a write, we have a server (we know because case 4 checked)
+     ;; ((and (cdr (remote-hh-dat *runremote*))         ;; on homehost
+     ;;       (not (member cmd api:read-only-queries)))
+     ;;  (mutex-unlock! *rmt-mutex*)
+     ;;  (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
+     ;;  (rmt:open-qry-close-locally cmd 0 params))
+
+     
+     ;;  on homehost, no server contact made and this is a write, passively start a server 
+     ((and (cdr (remote-hh-dat *runremote*)) ; new
+           (not (remote-server-url *runremote*))
 	   (not (member cmd api:read-only-queries)))
       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
-      (let ((serverconn (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
-	(if serverconn
-	    (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed
+      (let ((server-url (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
+	(if server-url
+	    (remote-server-url-set! *runremote* server-url) ;; the string can be consumed by the client setup if needed
 	    (if (not (server:start-attempted? *toppath*))
 		(server:kind-run *toppath*))))
-      (if (cdr (remote-hh-dat *runremote*)) ;; we are on the homehost, just do the call
-          (begin
-            (mutex-unlock! *rmt-mutex*)
-	    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.1")
-            (rmt:open-qry-close-locally cmd 0 params))
-          (begin                            ;; not on homehost, start server and wait
-            (mutex-unlock! *rmt-mutex*)
-	    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.2")
-	    (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
-            (rmt:send-receive cmd rid params attemptnum: attemptnum))))
+             (mutex-unlock! *rmt-mutex*)
+             (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.1")
+             (rmt:open-qry-close-locally cmd 0 params))
+
+
+
+     ;;;
+           ;;     (begin                            ;; not on homehost, start server and wait
+            ;; (mutex-unlock! *rmt-mutex*)
+	    ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.2")
+	    ;; (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
+            ;; (rmt:send-receive cmd rid params attemptnum: attemptnum))   ;)  ;)
+;;;;
+     
      ;; if not on homehost ensure we have a connection to a live server
      ;; NOTE: we *have* a homehost record by now
-     ((and (not (cdr (remote-hh-dat *runremote*)))        ;; are we on a homehost?
+     ((and (not (cdr (remote-hh-dat *runremote*)))        ;; not on a homehost 
+           (not (remote-conndat *runremote*))             ;; and no connection
+           (server:read-dotserver *toppath*))             ;; .server file exists
+      ;; something caused the server entry in tdb to disappear, but the server is still running
+      (server:remove-dotserver-file *toppath* ".*")
+      (mutex-unlock! *rmt-mutex*)
+      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  20")
+      (rmt:send-receive cmd rid params attemptnum: (add1 attemptnum)))
+     ((and (not (cdr (remote-hh-dat *runremote*)))        ;; not on a homehost 
            (not (remote-conndat *runremote*)))            ;; and no connection
       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6  hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*))
       (mutex-unlock! *rmt-mutex*)
       (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
-      (remote-conndat-set! *runremote* (rmt:get-connection-info 0)) ;; calls client:setup which calls client:setup-http
-      (rmt:send-receive cmd rid params attemptnum: attemptnum))
+      (remote-conndat-set! *runremote* (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
+      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
      ;; all set up if get this far, dispatch the query
      ((cdr (remote-hh-dat *runremote*)) ;; we are on homehost
       (mutex-unlock! *rmt-mutex*)
       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  7")
       (rmt:open-qry-close-locally cmd (if rid rid 0) params))
+
      ;; not on homehost, do server query
      (else
       (mutex-unlock! *rmt-mutex*)
       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9")
       (let* ((conninfo (remote-conndat *runremote*))
@@ -257,11 +279,11 @@
 	  ;; (rmt:update-db-stats run-id cmd params duration)
 	  ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it
 	  (if qry-is-write
 	      (let ((start-time (current-seconds)))
 		(mutex-lock! *db-multi-sync-mutex*)
-		(set! *db-last-write* start-time) ;; the oldest "write"
+		(set! *db-last-access* start-time)  ;; THIS IS PROBABLY USELESS? (we are on a client)
                 (mutex-unlock! *db-multi-sync-mutex*)))))
     res))
 
 (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
   (let* ((run-id   (if run-id run-id 0))
@@ -320,10 +342,15 @@
 ;; added run-id to make looking up the correct db possible 
 ;;
 (define (rmt:general-call stmtname run-id . params)
   (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params)))
 
+
+;; Given a hostname, return a pair of cpu load and update time: the latest load information reported by tests running on that host.
+(define (rmt:get-latest-host-load hostname)
+  (rmt:send-receive 'get-latest-host-load 0 (list hostname))) ;; sent with run-id 0
+
 ;; (define (rmt:sync-inmem->db run-id)
 ;;   (rmt:send-receive 'sync-inmem->db run-id '()))
 
 (define (rmt:sdb-qry qry val run-id)
   ;; add caching if qry is 'getid or 'getstr
@@ -330,10 +357,17 @@
   (rmt:send-receive 'sdb-qry run-id (list qry val)))
 
 ;; NOT COMPLETED
 (define (rmt:runtests user run-id testpatt params)
   (rmt:send-receive 'runtests run-id testpatt))
+
+;;======================================================================
+;;  T E S T   M E T A 
+;;======================================================================
+
+(define (rmt:get-tests-tags) ;; query the server for test tag data; runs:get-tests-matching-tags reads the result as a hash table keyed by tag
+  (rmt:send-receive 'get-tests-tags #f '())) ;; no run-id and no parameters
 
 ;;======================================================================
 ;;  K E Y S 
 ;;======================================================================
 
@@ -345,10 +379,15 @@
 (define (rmt:get-keys)
   (if *db-keys* *db-keys* 
      (let ((res (rmt:send-receive 'get-keys #f '())))
        (set! *db-keys* res)
        res)))
+
+(define (rmt:get-keys-write) ;; dummy query to force server start; unlike rmt:get-keys it always sends the query, never answering from *db-keys*
+  (let ((res (rmt:send-receive 'get-keys-write #f '())))
+    (set! *db-keys* res) ;; refresh the cache that rmt:get-keys consults
+    res))
 
 ;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe
 ;; to cache the resuls in a hash
 ;;
 (define (rmt:get-key-vals run-id)
@@ -588,12 +627,12 @@
 
 (define (rmt:get-runs-by-patt  keys runnamepatt targpatt offset limit fields last-runs-update) ;; fields of #f uses default
   (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields last-runs-update)))
 
 (define (rmt:find-and-mark-incomplete run-id ovr-deadtime)
-  (if (rmt:send-receive 'have-incompletes? run-id (list run-id ovr-deadtime))
-      (rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime))))
+  ;; (if (rmt:send-receive 'have-incompletes? run-id (list run-id ovr-deadtime))
+  (rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime))) ;; )
 
 (define (rmt:get-main-run-stats run-id)
   (rmt:send-receive 'get-main-run-stats #f (list run-id)))
 
 (define (rmt:get-var varname)

Index: rpc-transport.scm
==================================================================
--- rpc-transport.scm
+++ rpc-transport.scm
@@ -176,11 +176,11 @@
 	      (if ping-res
 		  (let ((server-dat (list iface port #f #f #f)))
 		    (hash-table-set! *runremote* run-id server-dat)
 		    server-dat)
 		  (begin
-		    (server:try-running run-id)
+		    (server:try-running *toppath*)
 		    (thread-sleep! 2)
 		    (rpc-transport:client-setup run-id (- remtries 1)))))
  	    (let* ((server-db-info (open-run-close tasks:get-server tasks:open-db run-id)))
  	      (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
 	      (if server-db-info
@@ -191,15 +191,15 @@
  		    (if start-res
  			(begin
  			  (hash-table-set! *runremote* run-id server-dat)
 			  server-dat)
 			(begin
-			  (server:try-running run-id)
+			  (server:try-running *toppath*)
 			  (thread-sleep! 2)
 			  (rpc-transport:client-setup run-id (- remtries 1)))))
 		  (begin
-		    (server:try-running run-id)
+		    (server:try-running *toppath*)
 		    (thread-sleep! 2)
 		    (rpc-transport:client-setup run-id (- remtries 1)))))))))
 ;; 
 ;; 	     (port     (if (and hostinfo (> (length hostdat) 1))(cadr hostdat) #f)))
 ;; 	(if (and port

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -1958,10 +1958,23 @@
 	 (if (and val (not (equal? (vector-ref currrecord idx) val)))
 	     (begin
 	       (print "Updating " test-name " " fld " to " val)
 	       (rmt:testmeta-update-field test-name fld val)))))
      '(("author" 2)("owner" 3)("description" 4)("reviewed" 5)("tags" 9)("jobgroup" 10)))))
+
+;; Return the tests whose tags match tagpatt, a comma separated string of patterns
+;; such as "tagpatt1,tagpatt2%". The tests of every matching tag are accumulated.
+(define (runs:get-tests-matching-tags tagpatt)
+  (let* ((tagdata (rmt:get-tests-tags)) ;; used below as a hash table: tag -> list of tests
+         (res     '())) ;; list of tests that match one or more tags
+    (for-each
+     (lambda (tag)
+       (if (patt-list-match tag tagpatt)
+           (set! res (append res (hash-table-ref tagdata tag))))) ;; keep earlier matches; NOTE(review): a test carrying several matching tags is listed once per tag
+     (hash-table-keys tagdata))
+    res))
+    
 
 ;; Update test_meta for all tests
 (define (runs:update-all-test_meta db)
   (let ((test-names (tests:get-all))) ;; (tests:get-valid-tests)))
     (for-each 

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -8,11 +8,11 @@
 ;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 ;;  PURPOSE.
 
 (require-extension (srfi 18) extras tcp s11n)
 
-(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils)
+(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils posix-extras)
 ;; (use zmq)
 
 (use spiffy uri-common intarweb http-client spiffy-request-vars)
 
 (declare (unit server))
@@ -47,11 +47,23 @@
 ;; all routes though here end in exit ...
 ;;
 ;; start_server
 ;;
 (define (server:launch run-id transport-type)
-  (BB> "server:launch fired for run-id="run-id" transport-type="transport-type)
+  ;;(BB> "server:launch fired for run-id="run-id" transport-type="transport-type)
+
+  (let ((attempt-in-progress (server:start-attempted? *toppath*))) ; check for .server-starting
+    (when attempt-in-progress
+      (debug:print-info 0 *default-log-port* "Server start attempt in progress in other process (=> "attempt-in-progress"<=).  Aborting server launch attempt in this process ("(current-process-id)")")
+      (exit)))
+      
+  (let ((dotserver-url (server:check-if-running *toppath*))) ;; check for .server  
+    (when dotserver-url
+      (debug:print-info 0 *default-log-port* "Server already running (=> "dotserver-url"<=).  Aborting server launch attempt in this process ("(current-process-id)")")
+      (exit)
+      ))
+  
   (case transport-type
     ((http)(http-transport:launch run-id))
     ;;((nmsg)(nmsg-transport:launch run-id))
     ((rpc)  (rpc-transport:launch run-id))
     (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type))))
@@ -103,52 +115,60 @@
 ;; Given a run id start a server process    ### NOTE ### > file 2>&1 
 ;; if the run-id is zero and the target-host is set 
 ;; try running on that host
 ;;   incidental: rotate logs in logs/ dir.
 ;;
-(define  (server:run areapath) ;; areapath is ignored for now.
+(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
   (let* ((curr-host   (get-host-name))
+         (attempt-in-progress (server:start-attempted? areapath))
+         (dot-server-url (server:check-if-running areapath))
 	 (curr-ip     (server:get-best-guess-address curr-host))
 	 (curr-pid    (current-process-id))
 	 (homehost    (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
 	 (target-host (car homehost))
 	 (testsuite   (common:get-testsuite-name))
-	 (logfile     (conc *toppath* "/logs/server.log"))
+	 (logfile     (conc areapath "/logs/server.log"))
 	 (cmdln (conc (common:get-megatest-exe)
 		      " -server " (or target-host "-") " -run-id " 0 (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
 									      (conc " -daemonize -log " logfile)
 									      "")
 		      " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &")))))
 	 (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread")))
     ;; we want the remote server to start in *toppath* so push there
-    (push-directory *toppath*)
-    (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
-    (thread-start! log-rotate)
-
-    ;; host.domain.tld match host?
-    (if (and target-host 
-	     ;; look at target host, is it host.domain.tld or ip address and does it 
-	     ;; match current ip or hostname
-	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
-	     (not (equal? curr-ip target-host)))
-	(begin
-	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
-	  (setenv "TARGETHOST" target-host)))
-    
-    (setenv "TARGETHOST_LOGF" logfile)
-    (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever
-    (system (conc "nbfake " cmdln))
-    (unsetenv "TARGETHOST_LOGF")
-    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
-    (thread-join! log-rotate)
-    (pop-directory)))
-
+    (push-directory areapath) ;; balanced by the pop-directory after the cond, whichever branch runs
+    (cond
+     (attempt-in-progress ;; server:start-attempted? found a .starting-server flag less than 15 seconds old
+      (debug:print 0 *default-log-port* "INFO: Not trying to start server because attempt is in progress: "attempt-in-progress))
+     (dot-server-url ;; server:check-if-running returned the url recorded in .server
+      (debug:print 0 *default-log-port* "INFO: Not trying to start server because one is already running : "dot-server-url))
+     (else
+      (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
+      (thread-start! log-rotate)
+
+      ;; host.domain.tld match host?
+      (if (and target-host 
+               ;; look at target host, is it host.domain.tld or ip address and does it 
+               ;; match current ip or hostname
+               (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
+               (not (equal? curr-ip target-host)))
+          (begin
+            (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
+            (setenv "TARGETHOST" target-host)))
+      
+      (setenv "TARGETHOST_LOGF" logfile)
+      (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever
+      (system (conc "nbfake " cmdln))
+      (unsetenv "TARGETHOST_LOGF")
+      (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
+      (thread-join! log-rotate)))
+    (pop-directory))) ;; moved out of the else branch: the early-exit branches used to leave the process in areapath
+    
 (define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
   (if *my-client-signature* *my-client-signature*
       (let ((sig (server:mk-signature)))
-	(set! *my-client-signature* sig)
-	*my-client-signature*)))
+        (set! *my-client-signature* sig)
+        *my-client-signature*)))
 
 ;; kind start up of servers, wait 40 seconds before allowing another server for a given
 ;; run-id to be launched
 (define (server:kind-run areapath)
   (let ((last-run-time (hash-table-ref/default *server-kind-run* areapath #f)))
@@ -156,96 +176,168 @@
 	    (> (- (current-seconds) last-run-time) 30))
 	(begin
 	  (server:run areapath)
 	  (hash-table-set! *server-kind-run* areapath (current-seconds))))))
 
-;; The generic run a server command. Dispatches the call to server 0 if run-id != 0
-;; 
-;;  (define (server:try-running run-id)
-;;    (if (eq? run-id 0)
-;;        (server:run run-id)
-;;        (rmt:start-server run-id)))
 (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG.
 
+(define (server:attempting-start areapath) ;; record that this process is trying to start a server for areapath
+  (with-output-to-file
+      (conc areapath "/.starting-server") ;; same flag file that server:start-attempted? checks
+    (lambda ()
+      (print (current-process-id) " on " (get-host-name))))) ;; file content is "<pid> on <hostname>"
+  
+(define (server:complete-attempt areapath) ;; clear the start-attempt flag written by server:attempting-start
+  (delete-file* (conc areapath "/.starting-server")))
+  
 (define (server:start-attempted? areapath)
   (let ((flagfile (conc areapath "/.starting-server")))
     (handle-exceptions
      exn
      #f  ;; if things go wrong pretend we can't see the file
-     (and (file-exists? flagfile)
-	  (< (- (current-seconds)
-		(file-modification-time flagfile))
-	     15))))) ;; exists and less than 15 seconds old
-    
+     (cond
+      ((and (file-exists? flagfile)
+            (< (- (current-seconds)
+                  (file-modification-time flagfile))
+               15)) ;; exists and less than 15 seconds old
+       (with-input-from-file flagfile (lambda () (read-line))))
+      ((file-exists? flagfile) ;; it is stale.
+       (server:complete-attempt areapath)
+       #f)
+      (else #f)))))
+
 (define (server:read-dotserver areapath)
   (let ((dotfile (conc areapath "/.server")))
     (handle-exceptions
      exn
      #f  ;; if things go wrong pretend we can't see the file
-     (if (and (file-exists? dotfile)
-	      (file-read-access? dotfile))
-	 (with-input-from-file
-	     dotfile
-	   (lambda ()
-	     (read-line)))
-	 #f))))
+     (cond
+      ((not (file-exists? dotfile))
+       #f)
+      ((not (file-read-access? dotfile))
+       #f)
+      ((> (server:dotserver-age-seconds areapath) (+ 5 (server:get-timeout)))
+       (server:remove-dotserver-file areapath ".*")
+       #f)
+      (else
+       (let* ((line
+               (with-input-from-file
+                   dotfile
+                 (lambda ()
+                   (read-line))))
+              (tokens (if (string? line) (string-split line ":") #f)))
+         (cond
+          ((eq? 4 (length tokens))
+           tokens)
+          (else #f))))))))
+       
+(define (server:read-dotserver->url areapath) ;; return "host:port" taken from the .server file in areapath, or #f
+  (let ((dotserver-tokens (server:read-dotserver areapath))) ;; list of tokens, or #f when no usable file
+    (if dotserver-tokens
+        (conc (list-ref dotserver-tokens 0) ":" (list-ref dotserver-tokens 1)) ;; first two tokens; server:write-dotserver writes host:port:pid:transport
+        #f)))
 
 ;; write a .server file in *toppath* with hostport
 ;; return #t on success, #f otherwise
 ;;
-(define (server:write-dotserver areapath hostport)
+(define (server:write-dotserver areapath host port pid transport)
   (let ((lock-file   (conc areapath "/.server.lock"))
 	(server-file (conc areapath "/.server")))
     (if (common:simple-file-lock lock-file)
 	(let ((res (handle-exceptions
 		    exn
 		    #f ;; failed for some reason, for the moment simply return #f
 		    (with-output-to-file server-file
 		      (lambda ()
-			(print hostport)))
+			(print (conc host ":" port ":" pid ":" transport))))
 		    #t)))
-	  (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " created")
+	  (debug:print-info 0 *default-log-port* "server file " server-file " for " host ":" port " created pid="pid)
 	  (common:simple-file-release-lock lock-file)
 	  res)
 	#f)))
 
+
+;; Check that the .server file in areapath describes the server calling this procedure.
+;; All four tokens (iface, port, pid, transport) are compared; if every one matches the file is
+;; touched and #t is returned, otherwise the first mismatch is logged and #f is returned.
+(define (server:confirm-dotserver areapath this-iface this-port this-pid this-transport)
+  (let* ((tokens (server:read-dotserver areapath)))
+    (cond
+     ((not tokens) ;; server:read-dotserver also gives #f for an unreadable, too old or malformed file
+      (debug:print-info 0 *default-log-port* "INFO: .server file does not exist.")
+      #f)
+     ((not (eq? 4 (length tokens))) ;; defensive: server:read-dotserver only returns lists of 4 tokens
+      (debug:print-info 0 *default-log-port* "INFO: .server file is corrupt.  There are not 4 tokens as expeted; there are "(length tokens)".")
+      #f)
+     ((not (equal? this-iface (list-ref tokens 0)))
+      (debug:print-info 0 *default-log-port* "INFO: .server file mismatch.  for iface, server has value >"(list-ref tokens 0)"< but this server's value is >"this-iface"<")
+      #f)
+     ((not (equal? (->string this-port)  (list-ref tokens 1))) ;; tokens are strings, so compare string forms
+      (debug:print-info 0 *default-log-port* "INFO: .server file mismatch.  for port, .server has value >"(list-ref tokens 1)"< but this server's value is >"(->string this-port)"<")
+      #f)
+     ((not (equal? (->string this-pid)   (list-ref tokens 2)))
+      (debug:print-info 0 *default-log-port* "INFO: .server file mismatch.  for pid, .server has value >"(list-ref tokens 2)"< but this server's value is >"(->string this-pid)"<")
+      #f)
+     ((not (equal? (->string this-transport) (->string (list-ref tokens 3))))
+      (debug:print-info 0 *default-log-port* "INFO: .server file mismatch.  for transport, .server has value >"(list-ref tokens 3)"< but this server's value is >"this-transport"<")
+      #f)
+     (else (server:touch-dotserver areapath) ;; everything matched: refresh the file's timestamp
+      #t))))
+
+(define (server:touch-dotserver areapath) ;; stamp the .server file in areapath with the current time
+  (let ((server-file (conc areapath "/.server")))
+    (change-file-times server-file (current-seconds) (current-seconds)))) ;; both times set to now; no exception handling here
+
+(define (server:dotserver-age-seconds areapath) ;; seconds since the .server file in areapath was last modified
+  (let ((server-file (conc areapath "/.server")))
+    (begin
+      (handle-exceptions
+       exn
+       #f ;; any exception while reading the modification time yields #f rather than a number
+       (- (current-seconds)
+          (file-modification-time server-file))))))
+    
 (define (server:remove-dotserver-file areapath hostport)
-  (let ((dotserver   (server:read-dotserver areapath))
+  (let ((dotserver-url   (server:read-dotserver->url areapath))
 	(server-file (conc areapath "/.server"))
 	(lock-file   (conc areapath "/.server.lock")))
-    (if (and dotserver (string-match (conc ".*:" hostport "$") dotserver)) ;; port matches, good enough info to decide to remove the file
+    (if (and dotserver-url (string-match (conc ".*:" hostport "$") dotserver-url)) ;; port matches, good enough info to decide to remove the file
 	(if (common:simple-file-lock lock-file)
 	    (begin
 	      (handle-exceptions
 	       exn
 	       #f
 	       (delete-file* server-file))
 	      (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " removed")
-	      (common:simple-file-release-lock lock-file))))))
+	      (common:simple-file-release-lock lock-file))
+            (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - could not get lock."))
+        (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - dotserver-url("dotserver-url") did not match hostport pattern ("hostport")"))))
 
 ;; no longer care if multiple servers are started by accident. older servers will drop off in time.
 ;;
 (define (server:check-if-running areapath)
-  (let* ((dotserver (server:read-dotserver areapath))) ;; tdbdat (tasks:open-db)))
-    (if dotserver
+  (let* ((dotserver-url (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db)))
+    (if dotserver-url
 	(let* ((res (case *transport-type*
-		      ((http)(server:ping-server dotserver))
+		      ((http)(server:ping-server dotserver-url))
 		      ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
 		      )))
 	  (if res
-	      dotserver
-	      #f))
+	      dotserver-url
+	      (begin
+                (server:remove-dotserver-file areapath ".*") ;; remove stale dotserver
+                #f)))
 	#f)))
 
 ;; called in megatest.scm, host-port is string hostname:port
 ;;
 ;; NOTE: This is NOT called directly from clients as not all transports support a client running
 ;;       in the same process as the server.
 ;;
 (define (server:ping host-port-in #!key (do-exit #f))
   (let ((host:port (if (not host-port-in) ;; use read-dotserver to find
-		       (server:read-dotserver *toppath*)
+		       (server:read-dotserver->url *toppath*)
 		       (if (number? host-port-in) ;; we were handed a server-id
 			   (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in)))
 			     ;; (print "srec: " srec " host-port-in: " host-port-in)
 			     (if srec
 				 (conc (vector-ref srec 3) ":" (vector-ref srec 4))

Index: tasks.scm
==================================================================
--- tasks.scm
+++ tasks.scm
@@ -323,12 +323,12 @@
 	  (res    '()))
     (sqlite3:for-each-row
      (lambda (a . b)
        (set! res (cons (apply vector a b) res)))
      mdb
-     (conc "SELECT " selstr " FROM servers WHERE run_id=? AND state in ('available','running','dbprep') ORDER BY start_time DESC;")
-     run-id)
+     (conc "SELECT " selstr " FROM servers WHERE state in ('available','running','dbprep') ORDER BY start_time DESC;")
+     )
     (vector header res)))
 
 (define (tasks:get-server mdb run-id #!key (retries 10))
   (let ((res  #f)
 	(best #f))
@@ -402,11 +402,11 @@
 	       (< delay-time delay-max-tries))
 	  (begin
 	    (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)
 		(debug:print 0 *default-log-port* "Try starting server for run-id " run-id))
 	    (thread-sleep! (/ (random 2000) 1000))
-	    (server:kind-run run-id)
+	    (server:kind-run *toppath*)
 	    (thread-sleep! (min delay-time 1))
             (if (not (or (server:start-attempted? *toppath*)
                          (server:read-dotserver *toppath*))) ;; no point in trying
                 (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1))
                 #f))

Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -141,11 +141,11 @@
 
 
 ;; returns waitons waitors tconfigdat
 ;;
 (define (tests:get-waitons test-name all-tests-registry)
-   (let* ((config  (tests:get-testconfig test-name all-tests-registry 'return-procs)))
+   (let* ((config  (tests:get-testconfig test-name #f all-tests-registry 'return-procs)))
      (let ((instr (if config 
 		      (config-lookup config "requirements" "waiton")
 		      (begin ;; No config means this is a non-existant test
 			(debug:print-error 0 *default-log-port* "non-existent required test \"" test-name "\"")
 			(exit 1))))
@@ -291,11 +291,11 @@
 
 ;; Check for waiver eligibility
 ;;
 (define (tests:check-waiver-eligibility testdat prev-testdat)
   (let* ((test-registry (make-hash-table))
-	 (testconfig  (tests:get-testconfig (db:test-get-testname testdat) test-registry #f))
+	 (testconfig  (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f))
 	 (test-rundir ;; (sdb:qry 'passstr 
 	  (db:test-get-rundir testdat)) ;; )
 	 (prev-rundir ;; (sdb:qry 'passstr 
 	  (db:test-get-rundir prev-testdat)) ;; )
 	 (waivers     (if testconfig (configf:section-vars testconfig "waivers") '()))
@@ -920,24 +920,30 @@
 	  
 ;; MUST BE CALLED local!
 ;;
 (define (tests:test-get-paths-matching keynames target fnamepatt #!key (res '()))
   ;; BUG: Move the values derived from args to parameters and push to megatest.scm
-  (let* ((testpatt   (if (args:get-arg "-testpatt")(args:get-arg "-testpatt") "%"))
-	 (statepatt  (if (args:get-arg ":state")   (args:get-arg ":state")    "%"))
-	 (statuspatt (if (args:get-arg ":status")  (args:get-arg ":status")   "%"))
-	 (runname    (if (args:get-arg ":runname") (args:get-arg ":runname")  "%"))
+  (let* ((testpatt   (or (args:get-arg "-testpatt")(args:get-arg "-testpatt") "%"))
+	 (statepatt  (or (args:get-arg "-state")   (args:get-arg ":state")    "%"))
+	 (statuspatt (or (args:get-arg "-status")  (args:get-arg ":status")   "%"))
+	 (runname    (or (args:get-arg "-runname") (args:get-arg ":runname")  "%"))
 	 (paths-from-db (rmt:test-get-paths-matching-keynames-target-new keynames target res
 					testpatt
 					statepatt
 					statuspatt
 					runname)))
     (if fnamepatt
 	(apply append 
 	       (map (lambda (p)
 		      (if (directory-exists? p)
-			  (glob (conc p "/" fnamepatt))
+			  (let ((glob-query (conc p "/" fnamepatt)))
+			    (handle-exceptions
+				exn
+				(with-input-from-pipe
+				    (conc "echo " glob-query)
+				  read-lines)  ;; we aren't going to try too hard. If glob breaks it is likely because someone tried to do */*/*.log or similar
+			      (glob glob-query)))
 			  '()))
 		    paths-from-db))
 	paths-from-db)))
 
 			      
@@ -973,11 +979,11 @@
 ;; if .testconfig exists in test directory read and return it
 ;; else if have cached copy in *testconfigs* return it IFF there is a section "have fulldata"
 ;; else read the testconfig file
 ;;   if have path to test directory save the config as .testconfig and return it
 ;;
-(define (tests:get-testconfig test-name test-registry system-allowed #!key (force-create #f))
+(define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f))
   (let* ((cache-path   (tests:get-test-path-from-environment))
 	 (cache-file   (and cache-path (conc cache-path "/.testconfig")))
 	 (cache-exists (and cache-file
 			    (not force-create)  ;; if force-create then pretend there is no cache to read
 			    (file-exists? cache-file)))
@@ -985,14 +991,17 @@
 				cache-exists)
 			   (handle-exceptions
 			    exn
 			    #f ;; any issues, just give up with the cached version and re-read
 			    (configf:read-alist cache-file))
-			   #f)))
+			   #f))
+         (test-full-name (if (and item-path (not (string-null? item-path)))
+                             (conc test-name "/" item-path)
+                             test-name)))
     (if cached-dat
 	cached-dat
-	(let ((dat (hash-table-ref/default *testconfigs* test-name #f)))
+	(let ((dat (hash-table-ref/default *testconfigs* test-full-name #f)))
 	  (if (and  dat ;; have a locally cached version
 		    (hash-table-ref/default dat "have fulldata" #f)) ;; marked as good data?
 	      dat
 	      ;; no cached data available
 	      (let* ((treg         (or test-registry
@@ -1006,11 +1015,11 @@
 						    environ-patt: (if system-allowed
 								      "pre-launch-env-vars"
 								      #f))
 				       #f)))
 		(if (and tcfg cache-file) (hash-table-set! tcfg "have fulldata" #t)) ;; mark this as fully read data
-		(if tcfg (hash-table-set! *testconfigs* test-name tcfg))
+		(if tcfg (hash-table-set! *testconfigs* test-full-name tcfg))
 		(if (and testexists
 			 cache-file
 			 (file-write-access? cache-path))
 		    (let ((tpath (conc cache-path "/.testconfig")))
 		      (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath)
@@ -1234,11 +1243,12 @@
 (define (tests:get-full-data test-names test-records required-tests all-tests-registry)
   (if (not (null? test-names))
       (let loop ((hed (car test-names))
 		 (tal (cdr test-names)))         ;; 'return-procs tells the config reader to prep running system but return a proc
 	(debug:print-info 4 *default-log-port* "hed=" hed " at top of loop")
-	(let* ((config  (tests:get-testconfig hed all-tests-registry 'return-procs))
+        ;; don't know item-path at this time, let the testconfig get the top level testconfig
+	(let* ((config  (tests:get-testconfig hed #f all-tests-registry 'return-procs))
 	       (waitons (let ((instr (if config 
 					 (config-lookup config "requirements" "waiton")
 					 (begin ;; No config means this is a non-existant test
 					   (debug:print-error 0 *default-log-port* "non-existent required test \"" hed "\", grep through your testconfigs to find and remove or create the test. Discarding and continuing.")
 					     ""))))
@@ -1332,10 +1342,11 @@
 	 "SELECT count(id) FROM test_rundat;")
 	res))
   0)
 
 (define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)
+  (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))
   (if (and cpuload diskfree)
       (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
   (if minutes 
       (rmt:general-call 'update-run-duration run-id minutes test-id))
   (if (and uname hostname)

Index: tests/fullrun/tests/all_toplevel/testconfig
==================================================================
--- tests/fullrun/tests/all_toplevel/testconfig
+++ tests/fullrun/tests/all_toplevel/testconfig
@@ -1,8 +1,8 @@
 [ezsteps]
 calcresults megatest -list-runs $MT_RUNNAME -target $MT_TARGET
-check_triggers  cat $MT_RUN_AREA_HOME/triggers_$MT_RUN_NAME.dat
+check_triggers  cat $MT_RUN_AREA_HOME/triggers_$MT_RUNNAME.dat
 
 [logpro]
 check_triggers ;;
   (expect:error in "LogFileBody" = 0 "No errors" #/error/i)
 

ADDED   utils/remrun
Index: utils/remrun
==================================================================
--- /dev/null
+++ utils/remrun
@@ -0,0 +1,28 @@
+#!/bin/bash
+###############################################################################
+#
+# remrun - same behavior as nbfake but first param is a hostname
+#          (capture command output in a logfile)
+#
+# remrun behavior can be changed by setting the following env var:
+#   NBFAKE_LOG        Logfile for nbfake output
+#
+###############################################################################
+
+if [[ -z "$*" ]]; then   # nothing to run: print usage and stop
+  cat <<__EOF
+
+remrun usage:
+
+remrun hostname <command to run>
+
+remrun behavior can be changed by setting the following env vars:
+   NBFAKE_LOG        Logfile for remrun output
+
+__EOF
+  exit
+fi
+
+export NBFAKE_HOST="$1"   # exported so the nbfake process exec'd below inherits it
+shift
+exec nbfake "$@"          # "$@" passes each argument intact; unquoted $* re-split and glob-expanded them