Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -758,22 +758,23 @@
 		(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num)))))))
 
 ;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
 (define (common:watchdog)
   (debug:print-info 13 *default-log-port* "common:watchdog entered.")
-  (if (common:on-homehost?)
-      (let ((dbstruct (db:setup #t)))
-	(debug:print-info 13 *default-log-port* "after db:setup with dbstruct="dbstruct)
-	(cond
-	 ((dbr:dbstruct-read-only dbstruct)
-	  (debug:print-info 13 *default-log-port* "loading read-only watchdog")
-	  (common:readonly-watchdog dbstruct))
-	 (else
-	  (debug:print-info 13 *default-log-port* "loading writable-watchdog.")
-	  (common:writable-watchdog dbstruct)))
-	(debug:print-info 13 *default-log-port* "watchdog done."))
-      (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost")))
+  (cond ;; guard clauses first: every case that skips the watchdog now logs why
+   ((not (launch:setup)) ;; setup failed; this case used to return without any message
+    (debug:print-info 13 *default-log-port* "launch:setup failed, not starting watchdog"))
+   ((not (common:on-homehost?)) (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))
+   (else
+    (let ((dbstruct (db:setup #t)))
+      (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct)
+      (cond ;; choose the watchdog variant from the read-only flag of dbstruct
+       ((dbr:dbstruct-read-only dbstruct)
+        (debug:print-info 13 *default-log-port* "loading read-only watchdog")
+        (common:readonly-watchdog dbstruct))
+       (else (debug:print-info 13 *default-log-port* "loading writable-watchdog.") (common:writable-watchdog dbstruct)))
+      (debug:print-info 13 *default-log-port* "watchdog done.")))))
 
 
 (define (std-exit-procedure)
   (on-exit (lambda () 0))
   ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
@@ -1756,11 +1757,12 @@
 		      (let* ((key   (car keyval))
 			     (val   (cdr keyval))
 			     (delim (if (string-search whitesp val) 
 					"\""
 					"")))
-			(print (if (member key ignorevars)
+			(print (if (or (member key ignorevars)
+				       (string-search whitesp key))
 				   "# setenv "
 				   "setenv ")
 			       key " " delim (mungeval val) delim)))
 		    envvars)))
      (with-output-to-file (conc fname ".sh")
@@ -1770,10 +1772,11 @@
 			     (val (cdr keyval))
 			     (delim (if (string-search whitesp val) 
 					"\""
 					"")))
 			(print (if (or (member key ignorevars)
+				       (string-search whitesp key)
 				       (string-search ":" key)) ;; internal only values to be skipped.
 				   "# export "
 				   "export ")
 			       key "=" delim (mungeval val) delim)))
                     envvars)))))
@@ -1787,11 +1790,11 @@
 		    (let* ((var (car  p))
 			   (val (cadr p))
 			   (prv (get-environment-variable var)))
 		      (set! res (cons (list var prv) res))
 		      (if val 
-			  (setenv var (->string val))
+			  (safe-setenv var (->string val))
 			  (unsetenv var))))
 		  lst)
 	res)
       '()))
 

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -937,71 +937,71 @@
 ;;  'closeall     - close all opened dbs
 ;;  'schema       - attempt to apply schema changes
 ;;  run-ids: '(1 2 3 ...) or #f (for all)
 ;;
 (define (db:multi-db-sync dbstruct . options)
-  (if (not (launch:setup))
-      (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.")
-      (let* ((mtdb     (dbr:dbstruct-mtdb dbstruct))
-	     (tmpdb    (db:get-db dbstruct))
-             (refndb   (dbr:dbstruct-refndb dbstruct))
-	     (allow-cleanup #t) ;; (if run-ids #f #t))
-	     (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat)))
-	     (data-synced 0)) ;; count of changed records (I hope)
-
-	(for-each
-	 (lambda (option)
-
-	   (case option
-	     ;; kill servers
-	     ((killservers)
-	      (for-each
-	       (lambda (server)
-		 (match-let (((mod-time host port start-time pid) server))
-			    (if (and host pid)
-				(tasks:kill-server host pid))))
-	       servers))
-
-	     ;; clear out junk records
-	     ;;
-	     ((dejunk)
-	      (db:delay-if-busy mtdb) ;; ok to delay on mtdb
-	      (db:clean-up mtdb)
-	      (db:clean-up tmpdb)
-              (db:clean-up refndb))
-
-	     ;; sync runs, test_meta etc.
-	     ;;
-	     ((old2new)
-	      (set! data-synced
-		    (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb tmpdb refndb)
-		       data-synced)))
-	     
-	     ;; now ensure all newdb data are synced to megatest.db
-	     ;; do not use the run-ids list passed in to the function
-	     ;;
-	     ((new2old)
-	      (set! data-synced
-		    (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb)
-		       data-synced)))
-
-	     ((adj-target)
-	      (db:adj-target (db:dbdat-get-db mtdb))
-	      (db:adj-target (db:dbdat-get-db tmpdb))
-	      (db:adj-target (db:dbdat-get-db refndb)))
-	   
-	     ((schema)
-              (db:patch-schema-maindb (db:dbdat-get-db mtdb))
-              (db:patch-schema-maindb (db:dbdat-get-db tmpdb))
-              (db:patch-schema-maindb (db:dbdat-get-db refndb))
-              (db:patch-schema-rundb  (db:dbdat-get-db mtdb))
-              (db:patch-schema-rundb  (db:dbdat-get-db tmpdb))
-              (db:patch-schema-rundb  (db:dbdat-get-db refndb))))
-	
-	   (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb))
-	 options)
-	data-synced)))
+  ;; (if (not (launch:setup))   ;; <- guard disabled: the body below now runs unconditionally
+  ;;    (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.")
+  (let* ((mtdb     (dbr:dbstruct-mtdb dbstruct))
+	 (tmpdb    (db:get-db dbstruct))
+	 (refndb   (dbr:dbstruct-refndb dbstruct))
+	 (allow-cleanup #t) ;; (if run-ids #f #t)) -- not referenced anywhere in this body
+	 (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat)))
+	 (data-synced 0)) ;; count of changed records (I hope)
+    
+    (for-each
+     (lambda (option)
+       
+       (case option ;; options are symbols; one that matches no clause performs no action
+	 ;; kill servers
+	 ((killservers)
+	  (for-each
+	   (lambda (server)
+	     (match-let (((mod-time host port start-time pid) server))
+	       (if (and host pid)
+		   (tasks:kill-server host pid))))
+	   servers))
+	 
+	 ;; clear out junk records
+	 ;;
+	 ((dejunk)
+	  (db:delay-if-busy mtdb) ;; ok to delay on mtdb
+	  (db:clean-up mtdb)
+	  (db:clean-up tmpdb)
+	  (db:clean-up refndb))
+
+	 ;; sync runs, test_meta etc.
+	 ;;
+	 ((old2new)
+	  (set! data-synced
+	    (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb tmpdb refndb)
+	       data-synced)))
+	 
+	 ;; now ensure all newdb data are synced to megatest.db
+	 ;; do not use the run-ids list passed in to the function
+	 ;;
+	 ((new2old)
+	  (set! data-synced
+	    (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb)
+	       data-synced)))
+
+	 ((adj-target)
+	  (db:adj-target (db:dbdat-get-db mtdb))
+	  (db:adj-target (db:dbdat-get-db tmpdb))
+	  (db:adj-target (db:dbdat-get-db refndb)))
+	 
+	 ((schema)
+	  (db:patch-schema-maindb (db:dbdat-get-db mtdb))
+	  (db:patch-schema-maindb (db:dbdat-get-db tmpdb))
+	  (db:patch-schema-maindb (db:dbdat-get-db refndb))
+	  (db:patch-schema-rundb  (db:dbdat-get-db mtdb))
+	  (db:patch-schema-rundb  (db:dbdat-get-db tmpdb))
+	  (db:patch-schema-rundb  (db:dbdat-get-db refndb))))
+       
+       (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb)) ;; NOTE(review): runs once per option, while db:get-db is called once above -- confirm intended
+     options)
+    data-synced)) ;; only the old2new / new2old branches add to this total
 
 ;; keeping it around for debugging purposes only
 (define (open-run-close-no-exception-handling  proc idb . params)
   (debug:print-info 11 *default-log-port* "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)
   (print "I don't work anymore. open-run-close-no-exception-handling needs fixing or removing...")
@@ -3259,11 +3259,13 @@
 	 (test-name    (if (number? test-name)
 			   (db:test-get-testname testdat)
 			   test-name))
 	 (item-path    (db:test-get-item-path testdat))
          (tl-testdat   (db:get-test-info dbstruct run-id test-name ""))
-         (tl-test-id   (db:test-get-id tl-testdat)))
+         (tl-test-id   (if tl-testdat
+			   (db:test-get-id tl-testdat)
+			   #f)))
     (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
 	(db:general-call dbstruct 'set-test-start-time (list test-id)))
     (mutex-lock! *db-transaction-mutex*)
     (db:with-db
      dbstruct #f #f
@@ -3325,11 +3327,12 @@
 						      "STARTED"
                                                       (car all-curr-statuses))))
                        ;; (print "bad-not-supported: " bad-not-support " all-curr-states: " all-curr-states " all-curr-statuses: " all-curr-states)
                        ;;      " newstate: " newstate " newstatus: " newstatus)
                        ;; NB// Pass the db so it is part of the transaction
-                       (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)))))))
+                       (if tl-test-id
+			   (db:test-set-state-status db run-id tl-test-id newstate newstatus #f))))))))
          (mutex-unlock! *db-transaction-mutex*)
          (if (and test-id state status (equal? status "AUTO")) 
              (db:test-data-rollup dbstruct run-id test-id status))
          tr-res)))))
 ;; BBnote: db:get-all-state-status-counts-for-test returns dbr:counts object aggregating state and status of items of a given test, *not including rollup state/status*

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -314,11 +314,12 @@
 			    (loop (car tal) (cdr tal) stepname))
 			(debug:print 4 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping")))
 		  (debug:print 4 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))))))
 
 (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
-  (let* ((start-seconds (current-seconds))
+  (let* ((update-period (or (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30")) 30)) ;; seconds between forced syncs; a non-numeric setting falls back to 30 rather than yielding #f
+         (start-seconds (current-seconds)) ;; reference point for the elapsed-time calculations below
 	 (calc-minutes  (lambda ()
 			  (inexact->exact 
 			   (round 
 			    (- 
 			     (current-seconds) 
@@ -327,30 +328,38 @@
     ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
     ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
     (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
     (let loop ((minutes   (calc-minutes))
 	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
-	       (disk-free (get-df (current-directory))))
-      (let ((new-cpu-load (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
-				 (delta (abs (- load cpu-load))))
-			    (if (> delta 0.1) ;; don't bother updating with small changes
-				load
-				#f)))
-	    (new-disk-free (let* ((df    (get-df (current-directory)))
-				  (delta (abs (- df disk-free))))
-			     (if (> delta 200) ;; ignore changes under 200 Meg
-				 df
-				 #f))))
+	       (disk-free (get-df (current-directory)))
+               (last-sync (current-seconds)))
+      (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
+             (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
+                                   (delta (abs (- load cpu-load))))
+                              (if (> delta 0.1) ;; don't bother updating with small changes
+                                  load
+                                  #f)))
+             (new-disk-free (let* ((df    (if over-time ;; only get df every 30 seconds
+                                              (get-df (current-directory))
+                                              disk-free))
+                                   (delta (abs (- df disk-free))))
+                              (if (and (> df 0)
+                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
+                                  df
+                                  #f)))
+             (do-sync       (or new-cpu-load new-disk-free over-time)))
+        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
 	(set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat))
 			    (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
 						(time-exceeded (> run-seconds runtlim)))
 					   (if time-exceeded
 					       (begin
 						 (debug:print-info 0 *default-log-port* "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)
 						 #t)
 					       #f)))))
-	(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
+        (if do-sync
+            (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
 	(if kill-job? 
 	    (begin
 	      (mutex-lock! m)
 	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
 	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
@@ -394,11 +403,14 @@
 	      (exit)))
 	(if (hash-table-ref/default misc-flags 'keep-going #f)
 	    (begin
 	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
 	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
-		  (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free)))))))
+		  (loop (calc-minutes)
+                        (or new-cpu-load cpu-load)
+                        (or new-disk-free disk-free)
+                        (if do-sync (current-seconds) last-sync)))))))
     (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
 
 
 (define (launch:execute encoded-cmd)
   (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
@@ -842,13 +854,18 @@
 	     (toppath  (or *toppath* areapath (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath
 	     (target   (common:args-get-target))
 	     (sections (if target (list "default" target) #f)) ;; for runconfigs
 	     (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config 
              (cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig))
-	     (mtcachef   (car cachefiles)) ;; (and cachedir (conc cachedir "/" ".megatest.cfg-"  megatest-version "-" megatest-fossil-hash)))
-	     (rccachef   (cdr cachefiles)) ;; (and cachedir (conc cachedir "/" ".runconfigs.cfg-"  megatest-version "-" megatest-fossil-hash)))
-	     ) ;; (cancreate (and cachedir (common:file-exists? cachedir)(file-write-access? cachedir) (not (common:in-running-test?)))))
+	     ;; Defensive guard: "car of '()" was seen here once (cause unknown, possibly a red herring), so any non-pair cachefiles is treated as "no cache files"
+	     (mtcachef   (if (pair? cachefiles)
+			     (car cachefiles)
+			     #f)) ;; (and cachedir (conc cachedir "/" ".megatest.cfg-"  megatest-version "-" megatest-fossil-hash)))
+	     (rccachef   (if (pair? cachefiles)
+			     (cdr cachefiles)
+			     #f))) ;; (and cachedir (conc cachedir "/" ".runconfigs.cfg-"  megatest-version "-" megatest-fossil-hash)))
+	      ;; (cancreate (and cachedir (common:file-exists? cachedir)(file-write-access? cachedir) (not (common:in-running-test?)))))
 	(set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource
         ;;(BB> "launch:setup-body -- cachefiles="cachefiles)
 	(cond
 	 ;; if mtcachef exists just read it, however we need to assume toppath is available in $MT_RUN_AREA_HOME
 	 ((and (not force-reread) mtcachef (common:file-exists? mtcachef) (get-environment-variable "MT_RUN_AREA_HOME") use-cache)

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -387,11 +387,19 @@
 
 ;; The watchdog is to keep an eye on things like db sync etc.
 ;;
 
 ;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
-(define *watchdog* (make-thread common:watchdog "Watchdog thread"))
+(define *watchdog* (make-thread ;; the thread is only created here, not started
+		    (lambda ()
+		      (handle-exceptions ;; report anything raised by common:watchdog instead of letting it escape the thread
+			  exn
+			  (begin
+			    (print-call-chain)
+			    (print " message: " ((condition-property-accessor 'exn 'message "(no message)") exn))) ;; default avoids a second error when the condition has no exn/message property
+			(common:watchdog)))
+		    "Watchdog thread"))
 
 ;;(if (not (args:get-arg "-server"))
 ;;    (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog
 (let* ((no-watchdog-args
        '("-list-runs"

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -513,11 +513,17 @@
     (let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
       (if (> (length (hash-table-keys test-records)) 0)
 	  (let* ((keep-going        #t)
 		 (run-queue-retries 5)
 		 (th1        (make-thread (lambda ()
-					    (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry))
+					    (handle-exceptions ;; report anything raised by the queue runner instead of letting it escape the thread
+						exn
+						(begin
+						  (print-call-chain)
+						  (print " message: " ((condition-property-accessor 'exn 'message "(no message)") exn))) ;; default avoids a second error when the condition has no exn/message property
+					      (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
+								    (any->number reglen) all-tests-registry)))
 					    ;; (handle-exceptions
 					    ;;  exn
 					    ;;  (begin
 					    ;;    (print-call-chain (current-error-port))
 					    ;;    (debug:print-error 0 *default-log-port* "failure in runs:run-tests-queue thread, error: " ((condition-property-accessor 'exn 'message) exn))

Index: tests/tests.scm
==================================================================
--- tests/tests.scm
+++ tests/tests.scm
@@ -17,10 +17,26 @@
 (import srfi-18)
 ;; (require-extension zmq)
 ;; (import zmq)
 
 (define test-work-dir (current-directory))
+
+;; Run PROC once per spec in INLST and report each result through `test`.
+;;   inlst:     list of specs, each ( msg expected param1 param2 ... )
+;;   pname:     label prepended (followed by a space) to each spec's msg
+;;   post-proc: optional; when given, the value returned by PROC is passed
+;;              through it before being compared against the expected value
+(define (test-batch proc pname inlst #!key (post-proc #f))
+  (for-each
+   (lambda (spec)
+     (let ((msg    (conc pname " " (car spec)))
+           (result (cadr spec))
+           (params (cddr spec)))
+       (if post-proc
+           (test msg result (post-proc (apply proc params)))
+           (test msg result (apply proc params)))))
+   inlst))
 
 ;; read in all the _record files
 (let ((files (glob "*_records.scm")))
   (for-each
    (lambda (file)

Index: tests/unittests/all-rmt.scm
==================================================================
--- tests/unittests/all-rmt.scm
+++ tests/unittests/all-rmt.scm
@@ -28,13 +28,25 @@
 ;; DEF (rmt:kill-server run-id)
 ;; DEF (rmt:start-server run-id)
 (test #f '(#t "successful login")(rmt:login #f))
 ;; DEF (rmt:login-no-auto-client-setup connection-info)
 (test #f #t (pair? (rmt:get-latest-host-load (get-host-name))))
+
+;; get-latest-host-load does a lookup in the db; it won't return a useful value unless
+;; a test ran recently on the host
+(test-batch rmt:get-latest-host-load
+            "rmt:get-latest-host-load"
+            (list (list "localhost"  #t (get-host-name))
+                  (list "not-a-host" #t "not-a-host"  ))
+            post-proc: pair?)
+                                           
 (test #f #t (list? (rmt:get-changed-record-ids 0)))
+
 (test #f #f (begin (runs:update-all-test_meta #f) #f))
+
 (test #f '("test1" "test2")(sort (alist-ref "tagtwo" (hash-table->alist (rmt:get-tests-tags)) equal?) string<=))
+
 (test #f '() (rmt:get-key-val-pairs 0))
 (test #f '("SYSTEM" "RELEASE") (rmt:get-keys))
 (test #f '("SYSTEM" "RELEASE") (rmt:get-keys-write)) ;; dummy query to force server start
 (test #f '() (rmt:get-key-vals 1))
 (test #f (vector '("SYSTEM" "RELEASE") '()) (rmt:get-targets))
@@ -82,11 +94,27 @@
 (test #f '()(rmt:get-prev-run-ids 1))
 (test #f #t (begin (rmt:lock/unlock-run 1 #t #f "mikey") #t))
 (test #f "JUSTFINE" (rmt:get-run-status 1))
 (test #f #t (begin (rmt:set-run-status 1 "NOTFINE" msg: "A message") #t))
 (test #f #t (begin (rmt:update-run-event_time 1) #t))
+
 ;; (rmt:get-runs-by-patt  keys runnamepatt targpatt offset limit fields last-runs-update) ;; fields of #f uses default
+;; Each spec passes the seven arguments listed above; only last-runs-update (the final one) differs between the two specs.
+(let ((keys (rmt:get-keys))
+      (rnp  "%")    ;; run name patt
+      (tpt  "%/%")) ;; target patt
+  (test-batch rmt:get-runs-by-patt
+              "rmt:get-runs-by-patt"
+              (list (list "t=0" #t keys rnp tpt #f #f #f 0)
+                    (list "t=current" #f keys rnp tpt #f #f #f (+ 100 (current-seconds))) ;; should be no records from the future
+                    )
+              post-proc: (lambda (res) ;; reduces the reply to a boolean: a vector whose slot 1 holds at least one row
+                           ;; (print "rmt:get-runs-by-patt returned: " res)
+                           (and (vector? res)
+                                (let ((rows (vector-ref res 1)))
+                                  (> (length rows) 0))))))
+
 ;; (rmt:find-and-mark-incomplete run-id ovr-deadtime)
 ;; (rmt:get-main-run-stats run-id)
 ;; (rmt:get-var varname)
 ;; (rmt:set-var varname value)
 ;; (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f))