Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -39,10 +39,11 @@
     get-var
     get-keys
     get-key-vals
     test-toplevel-num-items
     get-test-info-by-id
+    get-test-state-status-by-id
     get-steps-info-by-id
     get-data-info-by-id
     test-get-rundir-from-test-id
     get-count-tests-running-for-testname
     get-count-tests-running
@@ -328,10 +329,11 @@
     ((test-get-archive-block-info)     (apply db:test-get-archive-block-info dbstruct params))
     
     ;; TESTS
     ((test-toplevel-num-items)         (apply db:test-toplevel-num-items dbstruct params))
     ((get-test-info-by-id)	       (apply db:get-test-info-by-id dbstruct params))
+    ((get-test-state-status-by-id)     (apply db:get-test-state-status-by-id dbstruct params))
     ((test-get-rundir-from-test-id)    (apply db:test-get-rundir-from-test-id dbstruct params))
     ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params))
     ((get-count-tests-running)         (apply db:get-count-tests-running dbstruct params))
     ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params))
     ;; ((delete-test-step-records)        (apply db:delete-test-step-records dbstruct params))
@@ -351,10 +353,11 @@
 
     ;; RUNS
     ((get-run-info)                 (apply db:get-run-info dbstruct params))
     ((get-run-status)               (apply db:get-run-status dbstruct params))
     ((get-run-state)                (apply db:get-run-state dbstruct params))
+    ((get-run-state-status)         (apply db:get-run-state-status dbstruct params))
     ((set-run-status)               (apply db:set-run-status dbstruct params))
     ((set-run-state-status)  			 (apply db:set-run-state-status dbstruct params))
     ((update-tesdata-on-repilcate-db) (apply db:update-tesdata-on-repilcate-db  dbstruct params)) 
     ((get-tests-for-run)            (apply db:get-tests-for-run dbstruct params))
     ((get-tests-for-run-state-status) (apply db:get-tests-for-run-state-status dbstruct params))

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -317,10 +317,15 @@
 (define (common:logpro-exit-code->test-status exit-code)
   (status-sym->string (common:logpro-exit-code->status-sym exit-code)))
 
 ;; 
 (defstruct remote
+
+  ;; transport to be used
+  ;; http              - use http-transport
+  ;; http-read-cached  - use http-transport for writes but in-mem cached for reads
+  (rmode            'http)
   (hh-dat            (let ((res (or (server:choose-server *toppath* 'homehost)
 				    (cons #f #f))))
 		       (assert (pair? res)(conc "FATAL: hh-dat should be a pair, got "res))
 		       res))
   (server-url        #f) ;; (server:check-if-running *toppath*) #f))
@@ -1367,11 +1372,11 @@
 			  (else
 			   (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.")
 			   #t)))) ;; default to requiring server
     (if force-result
 	(begin
-	  (debug:print-info 0 *default-log-port* "forcing use of server, force setting is \"" force-setting "\".")
+	  (debug:print-info 0 *default-log-port* "ATTENTION! Forcing use of server, force setting is \"" force-setting "\".")
 	  #t)
 	#f)))
 
 ;;======================================================================
 ;; M I S C   L I S T S

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -468,11 +468,13 @@
 	       (hash-table-set! sync-durations (conc fname".db")
 				(- (current-milliseconds) start-time)))
 	     (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date")
          )))
      dbfiles)
-    (if dbdat (dbfile:add-dbdat dbstruct #f dbdat)))
+    ;; WHY does the dbdat need to be added back?
+    (if dbdat (dbfile:add-dbdat dbstruct #f dbdat))
+    )
   #t)
 
 ;; options:
 ;;
 ;;  'killservers  - kills all servers
@@ -598,10 +600,11 @@
 	      (mtdb   (dbr:subdb-mtdb subdb))
 	      (tmpdb  (db:get-subdb dbstruct run-id))
 	      (refndb (dbr:subdb-refndb subdb))
 	      (newres (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) last-update tmpdb refndb mtdb)))
 	 ;; (stack-push! (dbr:subdb-dbstack subdb) tmpdb)
+	 ;; BUG: verify this is really needed
 	 (dbfile:add-dbdat dbstruct run-id tmpdb)
 	 (set! res (cons newres res))))
      subdbs)
     res))
 
@@ -904,10 +907,11 @@
 	  "SELECT d.id,d.archive_area_name,disk_path,last_df,last_df_time FROM archive_disks AS d
              INNER JOIN archive_blocks AS b ON d.id=b.archive_disk_id
              WHERE b.id IN (" (string-intersperse (map conc res) ",") ") AND
          last_df > ?;")
 	 dneeded))
+    ;; BUG: Verify this is really needed
     (dbfile:add-dbdat dbstruct #f dbdat)
     blocks))
     
 ;; returns id of the record, register a disk allocated to archiving and record it's last known
 ;; available space
@@ -2066,12 +2070,13 @@
      dbstruct #f #f
      (lambda (dbdat db)
        (sqlite3:for-each-row 
 	(lambda (status)
 	  (set! res status))
-	db
-	"SELECT status FROM runs WHERE id=?;" 
+	(db:get-cache-stmth
+	 dbdat db
+	 "SELECT status FROM runs WHERE id=?;" )
 	run-id)
        res))))
 
 (define (db:get-run-state dbstruct run-id)
   (let ((res "n/a"))
@@ -2079,12 +2084,27 @@
      dbstruct #f #f
      (lambda (dbdat db)
        (sqlite3:for-each-row 
 	(lambda (status)
 	  (set! res status))
-	db
-	"SELECT state FROM runs WHERE id=?;" 
+	(db:get-cache-stmth
+	 dbdat db
+	 "SELECT state FROM runs WHERE id=?;" )
+	run-id)
+       res))))
+
+(define (db:get-run-state-status dbstruct run-id) ;; => (state . status) pair read from the runs table
+  (let ((res (cons "n/a" "n/a"))) ;; returned unchanged when no row has id = run-id
+    (db:with-db
+     dbstruct #f #f
+     (lambda (dbdat db)
+       (sqlite3:for-each-row ;; callback overwrites res with each row's values
+	(lambda (state status)
+	  (set! res (cons state status)))
+	(db:get-cache-stmth
+	 dbdat db
+	 "SELECT state,status FROM runs WHERE id=?;" )
 	run-id)
        res))))
 
 
 ;;======================================================================
@@ -2696,11 +2716,11 @@
      (lambda (run-id)
        (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
 	 (db:prep-megatest.db-adj-test-ids (dbr:dbdat-dbh mtdb) run-id testrecs)))
      run-ids)))
 
-;; Get test data using test_id, run-id is not used
+;; Get test data using test_id
 ;; 
 (define (db:get-test-info-by-id dbstruct run-id test-id)
   (db:with-db
    dbstruct
    run-id
@@ -2713,10 +2733,26 @@
 	  (set! res (vector id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update)))
 	(db:get-cache-stmth dbdat db
 			    (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id=?;"))
 	test-id)
        res))))
+
+;; Get test state and status using test_id.
+;; Returns a pair (state . status), or (#f . #f) when no test row has this id.
+(define (db:get-test-state-status-by-id dbstruct run-id test-id)
+  (db:with-db
+   dbstruct
+   run-id
+   #f
+   (lambda (dbdat db)
+     (let ((res (cons #f #f))) ;; default result when the query yields no row
+       (sqlite3:for-each-row
+	(lambda (state status)
+	  (set! res (cons state status))) ;; must store into res: the let returns res, not the callback's value
+	(db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=?;")
+	test-id)
+       res))))
 
 ;; Use db:test-get* to access
 ;; Get test data using test_ids. NB// Only works within a single run!!
 ;;
 (define (db:get-test-info-by-ids dbstruct run-id test-ids)

Index: dbfile.scm
==================================================================
--- dbfile.scm
+++ dbfile.scm
@@ -37,13 +37,15 @@
 	stack
 	files
 	ports
 
 	commonmod
+	;; debugprint
 	)
 
-;; (import debugprint)
+(define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic
+(define num-run-dbs (make-parameter 10))     ;; number of db's in .megatest
 
 ;;======================================================================
 ;;  R E C O R D S
 ;;======================================================================
 
@@ -191,17 +193,21 @@
 (define (dbfile:run-id->path apath run-id)
   (conc apath"/"(dbfile:run-id->dbname run-id)))
 
 (define (db:dbname->path apath dbname)
   (conc apath"/"dbname))
+
+(define (dbfile:run-id->dbnum run-id) ;; map run-id to the name stem used for the db file and the qif directory
+  (cond
+   ((number? run-id)
+    (modulo run-id (num-run-dbs))) ;; numeric run-ids are folded onto (num-run-dbs) stems
+   ((not run-id) "main")   ;; 0 or main?
+   (else run-id))) ;; any other value is passed through unchanged
 
 ;; POTENTIAL BUG: this implementation could produce a db file if run-id is neither #f or a number
 (define (dbfile:run-id->dbname run-id)
-  (cond
-   ((number? run-id) (conc ".megatest/" (modulo run-id 100) ".db"))
-   ((not run-id)     (conc ".megatest/main.db"))
-   (else             run-id)))
+  (conc ".megatest/"(dbfile:run-id->dbnum run-id)".db")) ;; NOTE(review): modulus is now (num-run-dbs) instead of 100, and a non-number, non-#f run-id is now wrapped in .megatest/….db instead of returned verbatim -- confirm both are intended
 
 ;; Make the dbstruct, setup up auxillary db's and call for main db at least once
 ;;
 ;; called in http-transport and replicated in rmt.scm for *local* access. 
 ;;
@@ -241,12 +247,16 @@
 	(begin
 	  (stack-pop! (dbr:subdb-dbstack subdb))))))
 
 ;; return a previously opened db handle to the stack of available handles
 (define (dbfile:add-dbdat dbstruct run-id dbdat)
-  (let* ((subdb (dbfile:get-subdb dbstruct run-id)))
-    (stack-push! (dbr:subdb-dbstack subdb) dbdat)
+  (let* ((subdb (dbfile:get-subdb dbstruct run-id))
+	 (dbstk (dbr:subdb-dbstack subdb))
+	 (count (stack-count dbstk))) ;; handles already on the stack, counted before this push
+    (if (> count 15) ;; warn only; the handle is still pushed below
+	(dbfile:print-err "WARNING: stack for "run-id".db is "count"."))
+    (stack-push! dbstk dbdat)
     dbdat))
 
 ;; set up a subdb
 ;;
 (define (dbfile:init-subdb dbstruct run-id init-proc)
@@ -886,11 +896,12 @@
           )
         )
 	tbls)
        (let* ((runtime      (- (current-milliseconds) start-time))
 	      (should-print (or ;; (debug:debug-mode 12)
-				(common:low-noise-print 120 "db sync" (> runtime 500))))) ;; low and high sync times treated as separate.
+			     (common:low-noise-print 120 "db sync")
+			     (> runtime 500)))) ;; low and high sync times treated as separate.
 	 (for-each 
 	  (lambda (dat)
 	    (let ((tblname (car dat))
 		  (count   (cdr dat)))
 	      (set! tot-count (+ tot-count count))
@@ -1004,53 +1015,53 @@
     ;; (mutex-unlock! *db-open-mutex*)
     dbdat))
 
 (define dbfile:db-init-proc (make-parameter #f))
 
-(define keep-age-param (make-parameter 10))
-(define qif-slope      (make-parameter 100))
+;; in xmaxima this gives a curve close to what I want:
+;;    plot2d ((exp(x/1.2)-1)/300, [x, 0, 10])$
+;;    plot2d ((exp(x/1.5)-1)/40, [x, 0, 10])$
+;;    plot2d ((exp(x/5)-1)/40, [x, 0, 20])$
+(define (dbfile:droop x) ;; delay curve; dbfile:wait-for-qif passes the result to thread-sleep!
+  (/ (- (exp (/ x 5)) 1) 40)) ;; (e^(x/5) - 1)/40: 0 at x=0, about 0.16 at x=10, about 3.7 at x=25
+  ;; previously a linear ramp: (* numqrys (/ 1 (qif-slope)))
 
 ;; create a dropping near the db file in a qif dir
 ;; use count of such files to gate queries (queries in flight)
 ;;
 (define (dbfile:wait-for-qif fname run-id params)
   (let* ((thedir  (pathname-directory fname))
-	 (destdir (conc thedir"/qif-"run-id))
-	 (uniqn   (get-area-path-signature (conc (or run-id "main") params)))
+	 (dbnum   (dbfile:run-id->dbnum run-id))
+	 (destdir (conc thedir"/qif-"dbnum))
+	 (uniqn   (get-area-path-signature (conc dbnum params)))
 	 (crumbn  (conc destdir"/"(current-seconds)"-"uniqn"."(current-process-id))))
     (if (not (file-exists? destdir))(create-directory (conc destdir"/attic") #t))
     (let loop ((count 0))
       (let* ((currlks (glob (conc destdir"/*")))
 	     (numqrys (length currlks))
 	     (delayval (cond ;; do a droopish curve
-			((> numqrys 50)
-			 (if (> numqrys 50)
-			     (for-each
-			      (lambda (f)
-				(if (> (- (current-seconds)
-					  (handle-exceptions
-					      exn
-					    (current-seconds) ;; file is likely gone, just fake out
-					    (file-modification-time f)))
-				       (keep-age-param))
-				    (let* ((basedir (pathname-directory f))
-					   (filen   (pathname-file f))
-					   (destf   (conc basedir"/attic/"filen)))
-				      (dbfile:print-err "Moving qif file "f" older than 10 seconds to "destf)
-				      ;; (delete-file* f)
-				      (handle-exceptions
-					  exn
-					#t
-					(file-move f destf #t)))))
-			      currlks))
-			 1) ;; 50 and above => 1
-			((> numqrys 10) (* numqrys (/ 1 (qif-slope)))) ;; slope of 1/100
-			;; ((> numqrys 30) 0.50)
-			;; ((> numqrys 25) 0.20)
-			;; ((> numqrys 20) 0.10)
-			;; ((> numqrys 15) 0.05)
-			;; ((> numqrys 10) 0.01)
+			((> numqrys 25)
+			 (for-each
+			  (lambda (f)
+			    (if (> (- (current-seconds)
+				      (handle-exceptions
+					  exn
+					(current-seconds) ;; file is likely gone, just fake out
+					(file-modification-time f)))
+				   (keep-age-param))
+				(let* ((basedir (pathname-directory f))
+				       (filen   (pathname-file f))
+				       (destf   (conc basedir"/attic/"filen)))
+				  (dbfile:print-err "Moving qif file "f" older than 10 seconds to "destf)
+				  ;; (delete-file* f)
+				  (handle-exceptions
+				      exn
+				    #t
+				    (file-move f destf #t)))))
+			  currlks)
+			 4)
+			((> numqrys 0)  (dbfile:droop numqrys)) ;; 1..25 in flight: exponential droop delay (no longer the 1/100 slope)
 			(else #f))))
 	(if (and delayval
 		 (< count 5))
 	    (begin
 	      (thread-sleep! delayval)

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -29,18 +29,21 @@
 (import (prefix sqlite3 sqlite3:))
 
 (declare (unit launch))
 (declare (uses subrun))
 (declare (uses common))
+(declare (uses commonmod))
 (declare (uses configf))
 (declare (uses db))
 (declare (uses ezsteps))
 
 (include "common_records.scm")
 (include "key_records.scm")
 (include "db_records.scm")
 (include "megatest-fossil-hash.scm")
+
+(import commonmod)
 
 ;;======================================================================
 ;; ezsteps
 ;;======================================================================
 
@@ -205,11 +208,11 @@
                         )
 
                         ))))))
 
 (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
-  (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30")))
+  (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "60")))
          (start-seconds (current-seconds))
 	 (calc-minutes  (lambda ()
 			  (inexact->exact 
 			   (round 
 			    (- 
@@ -239,13 +242,13 @@
                                        (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                   df
                                   #f)))
              (do-sync       (or new-cpu-load new-disk-free over-time))
 
-             (test-info   (rmt:get-test-info-by-id run-id test-id))
-             (state       (db:test-get-state test-info))
-             (status      (db:test-get-status test-info))
+             (test-info   (rmt:get-test-state-status-by-id run-id test-id))
+             (state       (car test-info));; (db:test-get-state test-info))
+             (status      (cdr test-info));; (db:test-get-status test-info))
              (kill-reason  "no kill reason specified")
              (kill-job?    #f))
         ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
         (cond
          ((test-get-kill-request run-id test-id)
@@ -259,11 +262,12 @@
           (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
           ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
           (set! kill-job? #f)))
 
         (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
-        (launch:handle-zombie-tests run-id)
+        (if (common:low-noise-print 600 "run zombie") ;; rate-limit the zombie check; NOTE(review): 600 looks like seconds (ten minutes, not five) -- confirm
+	    (launch:handle-zombie-tests run-id))
         (when do-sync
           ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
           ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
           ;; (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
           (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
@@ -765,20 +769,28 @@
 ;; new
 ;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup
 ;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
 ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
 ;; 0 RUNNING ==> this is actually the first condition, should not get here
-
+(define *last-rollup* 0)
 (define (launch:end-of-run-check run-id )
     (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
-           (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
+           (running-cnt       (rmt:get-count-tests-running-for-run-id run-id))
            (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
-           (current-state (rmt:get-run-state run-id))
-           (current-status (rmt:get-run-status run-id)))
-     ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
-     (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)                      
-     (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
+	   (current-state-status (rmt:get-run-state-status run-id))
+           (current-state        (car current-state-status))  ;; (rmt:get-run-state run-id))
+           (current-status       (cdr current-state-status))) ;; (rmt:get-run-status run-id)))
+      ;; get-vars run-id queries the metadata table to check whether everything has completed: only when all-test-launched = yes does not-completed-cnt = 0 mean everything is completed; if no entry is found in the table, do nothing
+      (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)
+      ;;
+      ;; TODO: add a final rollup when run is done (if there isn't one already)
+      ;;
+      (if (or (< running-cnt 3)                              ;; have only few running
+	      (> (- (current-seconds) *last-rollup*) 10))    ;; or haven't rolled up in past ten seconds
+	  (begin
+	    (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
+	    (set! *last-rollup* (current-seconds))))
      (runs:update-junit-test-reporter-xml run-id) 
      (cond 
        ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" ))
                 (if (and (equal? (rmt:get-var (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id)))
                 (begin

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -118,176 +118,179 @@
             (if server-info
 		(begin
 			(remote-server-url-set! *runremote* (server:record->url server-info))
 			(remote-server-id-set! *runremote* (server:record->id server-info)))))  
 	  (set! runremote   *runremote*))) ;; new runremote will come from this on next iteration
-    
-    ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
-    ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
-    ;; DOT SET_HOMEHOST -> MUTEXLOCK;
-    ;; ensure we have a homehost record
-    (if (or (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
-	    (not (cdr (remote-hh-dat runremote))))   ;; not on homehost
-	(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
-	(let ((hh-data (server:choose-server areapath 'homehost)))
-	  (remote-hh-dat-set! runremote (or hh-data (cons #f #f)))))
-    
-    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
-    (cond
-     #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds
-      (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.")
-      (set! *runremote* #f)
-      ;; BUG: close-connections should go here?
-      (mutex-unlock! *rmt-mutex*)
-      (rmt:send-receive cmd rid params attemptnum: 1 area-dat: area-dat))
-     
-     ;;DOT EXIT;
-     ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" }
-     ;; give up if more than 150 attempts
-     ((> attemptnum 150)
-      (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.")
-      (exit 1))
-
-     ;;DOT CASE2 [label="local\nreadonly\nquery"];
-     ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2}
-     ;;DOT CASE2 -> "rmt:open-qry-close-locally";
-     ;; readonly mode, read request-  handle it - case 2
-     ((and readonly-mode
-           (member cmd api:read-only-queries)) 
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2")
-      (rmt:open-qry-close-locally cmd 0 params)
-      )
-
-     ;;DOT CASE3 [label="write in\nread-only mode"];
-     ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3}
-     ;;DOT CASE3 -> "#f";
-     ;; readonly mode, write request.  Do nothing, return #f
-     (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params))
-
-     ;; This block was for pre-emptively resetting the connection if there had been no communication for some time.
-     ;; I don't think it adds any value. If the server is not there, just fail and start a new connection.
-     ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout)
-     ;;
-     ;;DOT CASE4 [label="reset\nconnection"];
-     ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4}
-     ;;DOT CASE4 -> "rmt:send-receive";
-     ;; reset the connection if it has been unused too long
-     ((and runremote
-           (remote-api-url runremote)
-	   (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on
-	      (+ (remote-last-access runremote)
-		 (remote-server-timeout runremote))))
-      (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses in " (remote-server-timeout runremote) " seconds, forcing new connection.")
-      (http-transport:close-connections runremote)
-      ;; moving this setting of runremote conndat to #f to inside the http-transport:close-connections
-      ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
-      (mutex-unlock! *rmt-mutex*)
-      (rmt:send-receive cmd rid params attemptnum: attemptnum))
-     
-     ;;DOT CASE5 [label="local\nread"];
-     ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5};
-     ;;DOT CASE5 -> "rmt:open-qry-close-locally";
-
-     ;; on homehost and this is a read
-     ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
-	   (rmt:on-homehost? runremote)
-           (member cmd api:read-only-queries))   ;; this is a read
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
-      (rmt:open-qry-close-locally cmd 0 params))
-
-     ;;DOT CASE6 [label="init\nremote"];
-     ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6};
-     ;;DOT CASE6 -> "rmt:send-receive";
-     ;; on homehost and this is a write, we already have a server, but server has died
-
-     ;; reinstate this keep-alive section but inject a time condition into the (add ...
-     ;;
-     ;; ((and (cdr (remote-hh-dat runremote))           ;; on homehost
-     ;;       (not (member cmd api:read-only-queries))  ;; this is a write
-     ;;       (remote-server-url runremote)             ;; have a server
-     ;;       (not (server:ping (remote-server-url runremote) (remote-server-id runremote))))  ;; server has died. NOTE: this is not a cheap call! Need better approach.
-     ;;  (debug:print 0 *default-log-port* "WARNING: server appears to have died, trying to reconnect, case 6")
-     ;;  (http-transport:close-connections area-dat: runremote) ;; make sure to clean up
-     ;;  (set! *runremote* (make-remote))
-     ;;  (let* ((server-info (remote-server-info *runremote*))) 
-     ;;        (if server-info
-     ;; 		(begin
-     ;; 		  (remote-server-url-set! *runremote* (server:record->url server-info))
-     ;;              (remote-server-id-set! *runremote* (server:record->id server-info)))))
-     ;;  (remote-force-server-set! runremote (common:force-server?))
-     ;;  (mutex-unlock! *rmt-mutex*)
-     ;;  (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6")
-     ;;  (rmt:send-receive cmd rid params attemptnum: attemptnum))
-
-     ;;DOT CASE7 [label="homehost\nwrite"];
-     ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7};
-     ;;DOT CASE7 -> "rmt:open-qry-close-locally";
-     ;; on homehost and this is a write, we already have a server
-     ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
-	   (cdr (remote-hh-dat runremote))           ;; on homehost
-           (not (member cmd api:read-only-queries))  ;; this is a write
-           (remote-server-url runremote))            ;; have a server (needed to sync written data back)
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
-      (rmt:open-qry-close-locally cmd 0 params))
-
-     ;;DOT CASE8 [label="force\nserver"];
-     ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8};
-     ;;DOT CASE8 -> "rmt:open-qry-close-locally";
-     ;;  on homehost, no server contact made and this is a write, passively start a server 
-     ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
-	   (cdr (remote-hh-dat runremote))           ;; have homehost
-           (not (remote-server-url runremote))       ;; no connection yet
-	   (not (member cmd api:read-only-queries))) ;; not a read-only query
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
-      (let ((server-info  (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
-	(if server-info
-	    (begin
-              (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed
-              (remote-server-id-set! runremote (server:record->id server-info)))  
-	    (if (common:force-server?)
-		(server:start-and-wait *toppath*)
-		(server:kind-run *toppath*)))
+
+    (http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode)))
+
+(define (http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode)
+  ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
+  ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
+  ;; DOT SET_HOMEHOST -> MUTEXLOCK;
+  ;; ensure we have a homehost record
+  (if (or (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
+	  (not (cdr (remote-hh-dat runremote))))   ;; not on homehost
+      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
+      (let ((hh-data (server:choose-server areapath 'homehost)))
+	(remote-hh-dat-set! runremote (or hh-data (cons #f #f)))))
+  
+  ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
+  (cond
+   #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds
+   (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.")
+   (set! *runremote* #f)
+   ;; BUG: close-connections should go here?
+   (mutex-unlock! *rmt-mutex*)
+   (rmt:send-receive cmd rid params attemptnum: 1 area-dat: area-dat))
+   
+   ;;DOT EXIT;
+   ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" }
+   ;; give up if more than 150 attempts
+   ((> attemptnum 150)
+    (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.")
+    (exit 1))
+
+   ;;DOT CASE2 [label="local\nreadonly\nquery"];
+   ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2}
+   ;;DOT CASE2 -> "rmt:open-qry-close-locally";
+   ;; readonly mode, read request-  handle it - case 2
+   ((and readonly-mode
+         (member cmd api:read-only-queries)) 
+    (mutex-unlock! *rmt-mutex*)
+    (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2")
+    (rmt:open-qry-close-locally cmd 0 params)
+    )
+
+   ;;DOT CASE3 [label="write in\nread-only mode"];
+   ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3}
+   ;;DOT CASE3 -> "#f";
+   ;; readonly mode, write request.  Do nothing, return #f
+   (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params))
+
+   ;; This block was for pre-emptively resetting the connection if there had been no communication for some time.
+   ;; I don't think it adds any value. If the server is not there, just fail and start a new connection.
+   ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout)
+   ;;
+   ;;DOT CASE4 [label="reset\nconnection"];
+   ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4}
+   ;;DOT CASE4 -> "rmt:send-receive";
+   ;; reset the connection if it has been unused too long
+   ((and runremote
+         (remote-api-url runremote)
+	 (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on
+	    (+ (remote-last-access runremote)
+	       (remote-server-timeout runremote))))
+    (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses in " (remote-server-timeout runremote) " seconds, forcing new connection.")
+    (http-transport:close-connections runremote)
+    ;; moving this setting of runremote conndat to #f to inside the http-transport:close-connections
+    ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
+    (mutex-unlock! *rmt-mutex*)
+    (rmt:send-receive cmd rid params attemptnum: attemptnum))
+   
+   ;;DOT CASE5 [label="local\nread"];
+   ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5};
+   ;;DOT CASE5 -> "rmt:open-qry-close-locally";
+
+   ;; on homehost and this is a read
+   ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
+	 (rmt:on-homehost? runremote)
+         (member cmd api:read-only-queries))   ;; this is a read
+    (mutex-unlock! *rmt-mutex*)
+    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
+    (rmt:open-qry-close-locally cmd 0 params))
+
+   ;;DOT CASE6 [label="init\nremote"];
+   ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6};
+   ;;DOT CASE6 -> "rmt:send-receive";
+   ;; on homehost and this is a write, we already have a server, but server has died
+
+   ;; reinstate this keep-alive section but inject a time condition into the (and ...
+   ;;
+   ;; ((and (cdr (remote-hh-dat runremote))           ;; on homehost
+   ;;       (not (member cmd api:read-only-queries))  ;; this is a write
+   ;;       (remote-server-url runremote)             ;; have a server
+   ;;       (not (server:ping (remote-server-url runremote) (remote-server-id runremote))))  ;; server has died. NOTE: this is not a cheap call! Need better approach.
+   ;;  (debug:print 0 *default-log-port* "WARNING: server appears to have died, trying to reconnect, case 6")
+   ;;  (http-transport:close-connections area-dat: runremote) ;; make sure to clean up
+   ;;  (set! *runremote* (make-remote))
+   ;;  (let* ((server-info (remote-server-info *runremote*))) 
+   ;;        (if server-info
+   ;; 		(begin
+   ;; 		  (remote-server-url-set! *runremote* (server:record->url server-info))
+   ;;              (remote-server-id-set! *runremote* (server:record->id server-info)))))
+   ;;  (remote-force-server-set! runremote (common:force-server?))
+   ;;  (mutex-unlock! *rmt-mutex*)
+   ;;  (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6")
+   ;;  (rmt:send-receive cmd rid params attemptnum: attemptnum))
+
+   ;;DOT CASE7 [label="homehost\nwrite"];
+   ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7};
+   ;;DOT CASE7 -> "rmt:open-qry-close-locally";
+   ;; on homehost and this is a write, we already have a server
+   ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
+	 (cdr (remote-hh-dat runremote))           ;; on homehost
+         (not (member cmd api:read-only-queries))  ;; this is a write
+         (remote-server-url runremote))            ;; have a server (needed to sync written data back)
+    (mutex-unlock! *rmt-mutex*)
+    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
+    (rmt:open-qry-close-locally cmd 0 params))
+
+   ;;DOT CASE8 [label="force\nserver"];
+   ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8};
+   ;;DOT CASE8 -> "rmt:open-qry-close-locally";
+   ;;  on homehost, no server contact made and this is a write, passively start a server 
+   ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
+	 (cdr (remote-hh-dat runremote))           ;; have homehost
+         (not (remote-server-url runremote))       ;; no connection yet
+	 (not (member cmd api:read-only-queries))) ;; not a read-only query
+    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
+    (let ((server-info  (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
+      (if server-info
+	  (begin
+            (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed
+            (remote-server-id-set! runremote (server:record->id server-info)))  
+	  (if (common:force-server?)
+	      (server:start-and-wait *toppath*)
+	      (server:kind-run *toppath*)))
       (remote-force-server-set! runremote (common:force-server?))
       (mutex-unlock! *rmt-mutex*)
       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8.1")
       (rmt:open-qry-close-locally cmd 0 params)))
 
-     ;;DOT CASE9 [label="force server\nnot on homehost"];
-     ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9};
-     ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive";
-     ((or (and (remote-force-server runremote)              ;; we are forcing a server and don't yet have a connection to one
-	       (not (remote-api-url runremote)))
-	  (and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
-	       (not (remote-api-url runremote))))           ;; and no connection
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " runremote: " (remote->alist runremote))
-      (mutex-unlock! *rmt-mutex*)
-      (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up?
-	  (server:start-and-wait *toppath*))
-      ;; was: (remote-conndat-set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http
-      (set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http
-      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
-
-     ;;DOT CASE10 [label="on homehost"];
-     ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10};
-     ;;DOT CASE10 -> "rmt:open-qry-close-locally";
-     ;; all set up if get this far, dispatch the query
-     ((and (not (remote-force-server runremote))
-	   (cdr (remote-hh-dat runremote))) ;; we are on homehost
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10")
-      (rmt:open-qry-close-locally cmd (if rid rid 0) params))
-
-     ;;DOT CASE11 [label="send_receive"];
-     ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11};
-     ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"];
-     ;;DOT CASE11 -> "RESULT" [label="call succeeded"];
-     ;; not on homehost, do server query
-     (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid)))))
-    ;;DOT }
+   ;;DOT CASE9 [label="force server\nnot on homehost"];
+   ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9};
+   ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive";
+   ((or (and (remote-force-server runremote)              ;; we are forcing a server and don't yet have a connection to one
+	     (not (remote-api-url runremote)))
+	(and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
+	     (not (remote-api-url runremote))))           ;; and no connection
+    (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " runremote: " (remote->alist runremote))
+    (mutex-unlock! *rmt-mutex*)
+    (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up?
+	(server:start-and-wait *toppath*))
+    ;; was: (remote-conndat-set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http
+    (set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http
+    (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
+
+   ;;DOT CASE10 [label="on homehost"];
+   ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10};
+   ;;DOT CASE10 -> "rmt:open-qry-close-locally";
+   ;; all set up if get this far, dispatch the query
+   ((and (not (remote-force-server runremote))
+	 (cdr (remote-hh-dat runremote))) ;; we are on homehost
+    (mutex-unlock! *rmt-mutex*)
+    (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10")
+    (rmt:open-qry-close-locally cmd (if rid rid 0) params))
+
+   ;;DOT CASE11 [label="send_receive"];
+   ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11};
+   ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"];
+   ;;DOT CASE11 -> "RESULT" [label="call succeeded"];
+   ;; not on homehost, do server query
+   (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid))))
+;;DOT }
 
 ;; bunch of small functions factored out of send-receive to make debug easier
 ;;
 
 (define (extras-case-11 *default-log-port* runremote cmd params attemptnum rid)
@@ -537,20 +540,21 @@
 
 (define (rmt:get-test-id run-id testname item-path)
   (assert (number? run-id) "FATAL: Run id required.")
   (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))
 
-;; run-id is NOT used
-;;
 (define (rmt:get-test-info-by-id run-id test-id)
   (if (number? test-id)
       (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
       (begin
 	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
 	(print-call-chain (current-error-port))
 	#f)))
 
+(define (rmt:get-test-state-status-by-id run-id test-id) ;; forwards 'get-test-state-status-by-id with params (run-id test-id); returns whatever rmt:send-receive returns
+  (rmt:send-receive 'get-test-state-status-by-id run-id (list run-id test-id))) ;; NOTE(review): unlike rmt:get-test-info-by-id, test-id is not checked with number? here - confirm callers always pass a number
+
 (define (rmt:test-get-rundir-from-test-id run-id test-id)
   (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))
 
 (define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f))
   (assert (number? run-id) "FATAL: Run id required.")
@@ -799,10 +803,13 @@
 
 (define (rmt:get-run-state run-id)
   (assert (number? run-id) "FATAL: Run id required.")
   (rmt:send-receive 'get-run-state #f (list run-id)))
 
+(define (rmt:get-run-state-status run-id) ;; forwards 'get-run-state-status for run-id; returns whatever rmt:send-receive returns
+  (assert (number? run-id) "FATAL: Run id required.") ;; assert that run-id is a number before issuing the query
+  (rmt:send-receive 'get-run-state-status #f (list run-id))) ;; rid argument is #f, same as rmt:get-run-state; run-id travels only in the params list
 
 (define (rmt:set-run-status run-id run-status #!key (msg #f))
   (assert (number? run-id) "FATAL: Run id required.")
   (rmt:send-receive 'set-run-status #f (list run-id run-status msg)))
 

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -1863,11 +1863,12 @@
                                    (newtestname (db:test-make-full-name hed my-item-path)))    ;; test names are unique on testname/item-path
                               (tests:testqueue-set-items!     new-test-record #f)
                               (tests:testqueue-set-itemdat!   new-test-record my-itemdat)
                               (tests:testqueue-set-item_path! new-test-record my-item-path)
                               (hash-table-set! test-records newtestname new-test-record)
-                              (set! tal (append tal (list newtestname)))))  ;; since these are itemized create new test names testname/itempath
+			      ;; BUG: This next line sucks up a lot of horsepower: append copies all of tal for every item added
+			      (set! tal (append tal (list newtestname)))))  ;; since these are itemized create new test names testname/itempath
                           items-in-testpatt)))
           
           
 
 	  ;; At this point we have possibly added items to tal but all must be handed off to 

Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -1966,13 +1966,13 @@
 ;;======================================================================
 
 ;; teststep-set-status! used to be here
 
 (define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
-  (let* ((testdat   (rmt:get-test-info-by-id run-id test-id)))
+  (let* ((testdat   (rmt:get-test-state-status-by-id run-id test-id))) ;; queries via the state/status call rather than rmt:get-test-info-by-id
     (and testdat
-	 (equal? (test:get-state testdat) "KILLREQ"))))
+	 (equal? (car testdat) "KILLREQ")))) ;; true only when the car of the reply equals "KILLREQ"; presumably the reply is a (state . status) pair - confirm against db:get-test-state-status-by-id
 
 (define (test:tdb-get-rundat-count tdb)
   (if tdb
       (let ((res 0))
 	(sqlite3:for-each-row