Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -7,11 +7,11 @@
 ;;  This program is distributed WITHOUT ANY WARRANTY; without even the
 ;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 ;;  PURPOSE.
 ;;======================================================================
 
-(use srfi-1 posix regex-case base64 format dot-locking csv-xml z3 sql-de-lite hostinfo md5 message-digest typed-records directory-utils)
+(use srfi-1 posix regex-case base64 format dot-locking csv-xml z3 sql-de-lite hostinfo md5 message-digest typed-records directory-utils stack)
 (require-extension regex posix)
 
 (require-extension (srfi 18) extras tcp rpc)
 
 (import (prefix sqlite3 sqlite3:))

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -13,11 +13,11 @@
 ;; Database access
 ;;======================================================================
 
 ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc
 
-(use (srfi 18) extras tcp) ;; RADT => use of require-extension?
+(use (srfi 18) extras tcp stack) ;; RADT => use of require-extension?
 (use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest base64 format dot-locking z3 typed-records)
 (import (prefix sqlite3 sqlite3:))
 (import (prefix base64 base64:)) ;; RADT => prefix??
 
 (declare (unit db))
@@ -42,11 +42,12 @@
 
 ;; each db entry is a pair ( db . dbfilepath )
 ;; I propose this record evolves into the area record
 ;;
 (defstruct dbr:dbstruct 
-  (tmpdb       #f)
+  ;; (tmpdb       #f)
+  (dbstack     #f) ;; stack for tmp db handles, do not initialize with a stack
   (mtdb        #f)
   (refndb      #f)
   (homehost    #f) ;; not used yet
   (on-homehost #f) ;; not used yet
   )                ;; goal is to converge on one struct for an area but for now it is too confusing
@@ -91,12 +92,17 @@
 ;;    if #f     => get main db
 ;;    if db already open - return inmem
 ;;    if db not open, open inmem, rundb and sync then return inmem
 ;;    inuse gets set automatically for rundb's
 ;;
-(define (db:get-db dbstruct . blah) ;;  run-id) 
-  (or (dbr:dbstruct-tmpdb dbstruct)
+(define (db:get-db dbstruct) ;;  run-id) 
+  (if (stack? (dbr:dbstruct-dbstack dbstruct))
+      (if (stack-empty? (dbr:dbstruct-dbstack dbstruct))
+          (let ((newdb (db:open-megatest-db path: (db:dbfile-path))))
+            (stack-push! (dbr:dbstruct-dbstack dbstruct) newdb)
+            newdb)
+          (stack-pop! (dbr:dbstruct-dbstack dbstruct)))
       (db:open-db dbstruct)))
 
 ;; ;; legacy handling of structure for managing db's. Refactor this into dbr:?
 (define (db:dbdat-get-db dbdat)
   (if (pair? dbdat)
@@ -125,16 +131,11 @@
 
 ;; (db:with-db dbstruct run-id sqlite3:exec "select blah fgrom blaz;")
 ;; r/w is a flag to indicate if the db is modified by this query #t = yes, #f = no
 ;;
 (define (db:with-db dbstruct run-id r/w proc . params)
-  (let* ((dbdat (if (dbr:dbstruct? dbstruct)
-		    (db:get-db dbstruct run-id)
-		    (begin
-		      (print-call-chain)
-		      (print "db:with-db called with dbdat instead of dbstruct, FIXME!!")
-		      dbstruct))) ;; cheat, allow for passing in a dbdat
+  (let* ((dbdat (db:get-db dbstruct))
 	 (db    (db:dbdat-get-db dbdat))
 	 (use-mutex (> *api-process-request-count* 25)))
     (if (and use-mutex
 	     (common:low-noise-print 120 "over-50-parallel-api-requests"))
 	(debug:print-info 0 *default-log-port* *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access"))
@@ -143,15 +144,17 @@
     (handle-exceptions
      exn
      (begin
        (print-call-chain (current-error-port))
        (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn))
-       )
+       ;; there is no recovering at this time. exit
+       (exit 50))
      (if use-mutex (mutex-lock! *db-with-db-mutex*))
      (let ((res (apply proc db params)))
        (if use-mutex (mutex-unlock! *db-with-db-mutex*))
        ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
+       (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat)
        res))))
 
 ;;======================================================================
 ;; K E E P   F I L E D B   I N   dbstruct
 ;;======================================================================
@@ -180,30 +183,19 @@
 ;;      (was planned to be;  zeroth db with name=main.db)
 ;; 
 ;; If run-id is #f return to create and retrieve the path where the db will live.
 ;;
 (define (db:dbfile-path . junk) ;;  run-id)
-  (let* ((dbdir           (common:get-db-tmp-area))) ;; (db:get-dbdir))
-;; 	 (fname           (if run-id
-;; 			      (if (eq? run-id 0) "main.db" (conc run-id ".db"))
-;; 			      #f)))
+  (let* ((dbdir           (common:get-db-tmp-area)))
     (handle-exceptions
      exn
      (begin
        (debug:print-error 0 *default-log-port* "Couldn't create path to " dbdir)
        (exit 1))
      (if (not (directory? dbdir))(create-directory dbdir #t)))
-    dbdir)) ;; (if fname
-;;	(conc dbdir "/" fname) 
-;;	dbdir)))
-
-;; Returns the database location as specified in config file
-;;
-;; (define db:get-dbdir common:get-db-tmp-area)
-;;  (or (configf:lookup *configdat* "setup" "dbdir")
-;;      (conc (configf:lookup *configdat* "setup" "linktree") "/.db")))
-	       
+    dbdir))
+
 (define (db:set-sync db)
   (let ((syncprag (configf:lookup *configdat* "setup" "sychronous")))
     (sqlite3:execute db (conc "PRAGMA synchronous = " (or syncprag 0) ";")))) 
 
 ;; open an sql database inside a file lock
@@ -273,14 +265,13 @@
 ;;     db))
 
 ;; This routine creates the db if not already present. It is only called if the db is not already opened
 ;;
 (define (db:open-db dbstruct #!key (areapath #f))
-  (let ((tmpdb (dbr:dbstruct-tmpdb dbstruct))) ;; RA => Returns the first reference in dbstruct
-    (if tmpdb
-	tmpdb
-        ;; (mutex-lock! *rundb-mutex*)
+  (let ((tmpdb-stack (dbr:dbstruct-dbstack dbstruct))) ;; RA => Returns the first reference in dbstruct
+    (if (stack? tmpdb-stack)
+	(db:get-db tmpdb-stack) ;; get previously opened db (will create new db handle if all in the stack are already used
         (let* ((dbpath       (db:dbfile-path)) ;;  0))
                (dbexists     (file-exists? dbpath))
 	       (dbfexists    (file-exists? (conc dbpath "/megatest.db")))
                (tmpdb        (db:open-megatest-db path: dbpath)) ;; lock-create-open dbpath db:initialize-main-db))
                (mtdb         (db:open-megatest-db))
@@ -287,11 +278,12 @@
                (refndb       (db:open-megatest-db path: dbpath name: "megatest_ref.db"))
                (write-access (file-write-access? dbpath)))
           (if (and dbexists (not write-access))
               (set! *db-write-access* #f))
           (dbr:dbstruct-mtdb-set!   dbstruct mtdb)
-          (dbr:dbstruct-tmpdb-set!  dbstruct tmpdb) ;; olddb is already a (cons db path)
+          (dbr:dbstruct-dbstack-set! dbstruct (make-stack))
+          (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb) ;; olddb is already a (cons db path)
           (dbr:dbstruct-refndb-set! dbstruct refndb)
           ;;	    (mutex-unlock! *rundb-mutex*)
           (if (and (not dbfexists)
                    write-access) ;; *db-write-access*) ;; did not have a prior db and do have write access
 	      (begin
@@ -333,11 +325,11 @@
     (cons db dbpath)))
 
 ;; sync run to disk if touched
 ;;
 (define (db:sync-touched dbstruct run-id #!key (force-sync #f))
-  (let ((tmpdb   (dbr:dbstruct-tmpdb  dbstruct))
+  (let ((tmpdb   (db:get-db dbstruct))
 	(mtdb    (dbr:dbstruct-mtdb   dbstruct))
         (refndb  (dbr:dbstruct-refndb dbstruct))
 	(start-t (current-seconds)))
     (debug:print-info 4 *default-log-port* "Syncing for run-id: " run-id)
     (mutex-lock! *db-multi-sync-mutex*)
@@ -345,21 +337,23 @@
       (mutex-unlock! *db-multi-sync-mutex*)
       (db:sync-tables (db:sync-all-tables-list dbstruct) update_info tmpdb refndb mtdb))
     (mutex-lock! *db-multi-sync-mutex*)
     (set! *db-last-sync* start-t)
     (set! *db-last-access* start-t)
-    (mutex-unlock! *db-multi-sync-mutex*)))
+    (mutex-unlock! *db-multi-sync-mutex*)
+    (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb)))
 
 ;; close all opened run-id dbs
 (define (db:close-all dbstruct)
   (if (dbr:dbstruct? dbstruct)
       (begin
         ;; (db:sync-touched dbstruct 0 force-sync: #t) ;; NO. Do not do this here. Instead we rely on a server to be started when there are writes, even if the server itself is not going to be used as a server.
-        (let ((tdb (db:dbdat-get-db (dbr:dbstruct-tmpdb  dbstruct)))
+        (let ((tdbs (map db:dbdat-get-db 
+                         (stack->list (dbr:dbstruct-dbstack dbstruct))))
               (mdb (db:dbdat-get-db (dbr:dbstruct-mtdb   dbstruct)))
               (rdb (db:dbdat-get-db (dbr:dbstruct-refndb dbstruct))))
-          (if tdb (sqlite3:finalize! tdb))
+          (map sqlite3:finalize! tdbs)
           (if mdb (sqlite3:finalize! mdb))
           (if rdb (sqlite3:finalize! rdb))))))
   
 ;;   (let ((locdbs (dbr:dbstruct-locdbs dbstruct)))
 ;;     (if (hash-table? locdbs)
@@ -823,11 +817,11 @@
 ;;
 (define (db:multi-db-sync dbstruct . options)
   (if (not (launch:setup))
       (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.")
       (let* ((mtdb     (dbr:dbstruct-mtdb dbstruct))
-	     (tmpdb    (dbr:dbstruct-tmpdb dbstruct))
+	     (tmpdb    (db:get-db dbstruct))
              (refndb   (dbr:dbstruct-refndb dbstruct))
 	     (allow-cleanup #t) ;; (if run-ids #f #t))
 	     (tdbdat  (tasks:open-db))
 	     (servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))
 	     (data-synced 0)) ;; count of changed records (I hope)
@@ -952,10 +946,11 @@
 ;; 		      (delete-file fullname)))))
 ;; 	    dead-runs))))
 ;; 
 	;; (db:close-all dbstruct)
 	;; (sqlite3:finalize! mdb)
+        (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb)
 	data-synced)))
 
 ;; keeping it around for debugging purposes only
 (define (open-run-close-no-exception-handling  proc idb . params)
   (debug:print-info 11 *default-log-port* "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)
@@ -1220,11 +1215,11 @@
 ;; dneeded is minimum space needed, scan for existing archives that 
 ;; are on disks with adequate space and already have this test/itempath
 ;; archived
 ;;
 (define (db:archive-get-allocations dbstruct testname itempath dneeded)
-  (let* ((dbdat        (db:get-db dbstruct #f)) ;; archive tables are in main.db
+  (let* ((dbdat        (db:get-db dbstruct)) ;; archive tables are in main.db
 	 (db           (db:dbdat-get-db dbdat))
 	 (res          '())
 	 (blocks       '())) ;; a block is an archive chunck that can be added too if there is space
     (sqlite3:for-each-row
      (lambda (id archive-disk-id disk-path last-du last-du-time)
@@ -1251,11 +1246,11 @@
     
 ;; returns id of the record, register a disk allocated to archiving and record it's last known
 ;; available space
 ;;
 (define (db:archive-register-disk dbstruct bdisk-name bdisk-path df)
-  (let* ((dbdat        (db:get-db dbstruct #f)) ;; archive tables are in main.db
+  (let* ((dbdat        (db:get-db dbstruct)) ;; archive tables are in main.db
 	 (db           (db:dbdat-get-db dbdat))
 	 (res          #f))
     (sqlite3:for-each-row
      (lambda (id)
        (set! res id))
@@ -1265,25 +1260,27 @@
     (if res ;; record exists, update df and return id
 	(begin
 	  (sqlite3:execute db "UPDATE archive_disks SET last_df=?,last_df_time=(strftime('%s','now'))
                                   WHERE archive_area_name=? AND disk_path=?;"
 			   df bdisk-name bdisk-path)
+          (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat)
 	  res)
 	(begin
 	  (sqlite3:execute
 	   db
 	   "INSERT OR REPLACE INTO archive_disks (archive_area_name,disk_path,last_df)
                 VALUES (?,?,?);"
 	   bdisk-name bdisk-path df)
+          (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat)
 	  (db:archive-register-disk dbstruct bdisk-name bdisk-path df)))))
 
 ;; record an archive path created on a given archive disk (identified by it's bdisk-id)
 ;; if path starts with / then it is full, otherwise it is relative to the archive disk
 ;; preference is to store the relative path.
 ;;
 (define (db:archive-register-block-name dbstruct bdisk-id archive-path #!key (du #f))
-  (let* ((dbdat        (db:get-db dbstruct #f)) ;; archive tables are in main.db
+  (let* ((dbdat        (db:get-db dbstruct)) ;; archive tables are in main.db
 	 (db           (db:dbdat-get-db dbdat))
 	 (res          #f))
     ;; first look to see if this path is already registered
     (sqlite3:for-each-row
      (lambda (id)
@@ -1380,62 +1377,63 @@
 ;;======================================================================
 ;; M A I N T E N A N C E
 ;;======================================================================
 
 (define (db:have-incompletes? dbstruct run-id ovr-deadtime)
-  (let* ((dbdat        (db:get-db dbstruct run-id))
-	 (db           (db:dbdat-get-db dbdat))
-	 (incompleted '())
+  (let* ((incompleted '())
 	 (oldlaunched '())
 	 (toplevels   '())
 	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
 	 (deadtime     (if (and deadtime-str
 				(string->number deadtime-str))
 			   (string->number deadtime-str)
 			   7200))) ;; two hours
-    (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
-    
-    ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
-    ;;
-    ;; HOWEVER: this code in run:test seems to work fine
-    ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
-    ;;                     (db:test-get-run_duration testdat)))
-    ;;                    600) 
-    ;; (db:delay-if-busy dbdat)
-    (sqlite3:for-each-row 
-     (lambda (test-id run-dir uname testname item-path)
-       (if (and (equal? uname "n/a")
-		(equal? item-path "")) ;; this is a toplevel test
-	   ;; what to do with toplevel? call rollup?
-	   (begin
-	     (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-	     (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
-	   (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
-     db
-     "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');"
-     run-id deadtime)
-
-    ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
-    ;;
-    ;; (db:delay-if-busy dbdat)
-    (sqlite3:for-each-row
-     (lambda (test-id run-dir uname testname item-path)
-       (if (and (equal? uname "n/a")
-		(equal? item-path "")) ;; this is a toplevel test
-	   ;; what to do with toplevel? call rollup?
-	   (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-	   (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
-     db
-     "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
-     run-id)
-    
-    (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
-    (if (and (null? incompleted)
-	     (null? oldlaunched)
-	     (null? toplevels))
-	#f
-	#t)))
+    (db:with-db
+     dbstruct #f #f
+     (lambda (db)
+       (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
+       
+       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
+       ;;
+       ;; HOWEVER: this code in run:test seems to work fine
+       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
+       ;;                     (db:test-get-run_duration testdat)))
+       ;;                    600) 
+       ;; (db:delay-if-busy dbdat)
+       (sqlite3:for-each-row 
+        (lambda (test-id run-dir uname testname item-path)
+          (if (and (equal? uname "n/a")
+                   (equal? item-path "")) ;; this is a toplevel test
+              ;; what to do with toplevel? call rollup?
+              (begin
+                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+              (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
+        db
+        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');"
+        run-id deadtime)
+
+       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
+       ;;
+       ;; (db:delay-if-busy dbdat)
+       (sqlite3:for-each-row
+        (lambda (test-id run-dir uname testname item-path)
+          (if (and (equal? uname "n/a")
+                   (equal? item-path "")) ;; this is a toplevel test
+              ;; what to do with toplevel? call rollup?
+              (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+              (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
+        db
+        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
+        run-id)
+       
+       (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
+       (if (and (null? incompleted)
+                (null? oldlaunched)
+                (null? toplevels))
+           #f
+           #t)))))
 
 ;; given a launch delay (minimum time from last launch) return amount of time to wait
 ;;
 ;; (define (db:launch-delay-left dbstruct run-id launch-delay)
   
@@ -1444,77 +1442,78 @@
 ;;      (select testname,item_path,event_time+run_duration as
 ;;                          end_time,strftime('%s','now') as now from tests where state in
 ;;      ('RUNNING','REMOTEHOSTSTART','LAUNCED'));
 
 (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
-  (let* ((dbdat        (db:get-db dbstruct run-id))
-	 (db           (db:dbdat-get-db dbdat))
-	 (incompleted '())
+  (let* ((incompleted '())
 	 (oldlaunched '())
 	 (toplevels   '())
 	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
 	 (deadtime     (if (and deadtime-str
 				(string->number deadtime-str))
 			   (string->number deadtime-str)
 			   7200))) ;; two hours
-    (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
-    
-    ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
-    ;;
-    ;; HOWEVER: this code in run:test seems to work fine
-    ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
-    ;;                     (db:test-get-run_duration testdat)))
-    ;;                    600) 
-    ;; (db:delay-if-busy dbdat)
-    (sqlite3:for-each-row 
-     (lambda (test-id run-dir uname testname item-path)
-       (if (and (equal? uname "n/a")
-		(equal? item-path "")) ;; this is a toplevel test
-	   ;; what to do with toplevel? call rollup?
-	   (begin
-	     (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-	     (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
-	   (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
-     db
-     "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');"
-     run-id deadtime)
-
-    ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
-    ;;
-    ;; (db:delay-if-busy dbdat)
-    (sqlite3:for-each-row
-     (lambda (test-id run-dir uname testname item-path)
-       (if (and (equal? uname "n/a")
-		(equal? item-path "")) ;; this is a toplevel test
-	   ;; what to do with toplevel? call rollup?
-	   (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-	   (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
-     db
-     "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
-     run-id)
-    
-    (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
-
-    ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
-    ;;
-    ;; (db:delay-if-busy dbdat)
-    (let* (;; (min-incompleted (filter (lambda (x)
-	   ;;      		      (let* ((testpath (cadr x))
-	   ;;      			     (tdatpath (conc testpath "/testdat.db"))
-	   ;;      			     (dbexists (file-exists? tdatpath)))
-	   ;;      			(or (not dbexists) ;; if no file then something wrong - mark as incomplete
-	   ;;      			    (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim
-	   ;;      		    incompleted))
-	   (min-incompleted-ids (map car incompleted)) ;; do 'em all
-	   (all-ids             (append min-incompleted-ids (map car oldlaunched))))
-      (if (> (length all-ids) 0)
-	  (begin
-	    (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE")
-            (for-each
-             (lambda (test-id)
-               (db:test-set-state-status dbstruct run-id test-id "COMPLETE" "DEAD" "Test failed to complete"))
-             all-ids))))))
+    (db:with-db 
+     dbstruct #f #f
+     (lambda (db)
+       (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
+       
+       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
+       ;;
+       ;; HOWEVER: this code in run:test seems to work fine
+       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
+       ;;                     (db:test-get-run_duration testdat)))
+       ;;                    600) 
+       ;; (db:delay-if-busy dbdat)
+       (sqlite3:for-each-row 
+        (lambda (test-id run-dir uname testname item-path)
+          (if (and (equal? uname "n/a")
+                   (equal? item-path "")) ;; this is a toplevel test
+              ;; what to do with toplevel? call rollup?
+              (begin
+                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+              (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
+        db
+        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');"
+        run-id deadtime)
+
+       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
+       ;;
+       ;; (db:delay-if-busy dbdat)
+       (sqlite3:for-each-row
+        (lambda (test-id run-dir uname testname item-path)
+          (if (and (equal? uname "n/a")
+                   (equal? item-path "")) ;; this is a toplevel test
+              ;; what to do with toplevel? call rollup?
+              (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+              (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
+        db
+        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
+        run-id)
+       
+       (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
+
+       ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
+       ;;
+       ;; (db:delay-if-busy dbdat)
+       (let* (;; (min-incompleted (filter (lambda (x)
+              ;;      		      (let* ((testpath (cadr x))
+              ;;      			     (tdatpath (conc testpath "/testdat.db"))
+              ;;      			     (dbexists (file-exists? tdatpath)))
+              ;;      			(or (not dbexists) ;; if no file then something wrong - mark as incomplete
+              ;;      			    (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim
+              ;;      		    incompleted))
+              (min-incompleted-ids (map car incompleted)) ;; do 'em all
+              (all-ids             (append min-incompleted-ids (map car oldlaunched))))
+         (if (> (length all-ids) 0)
+             (begin
+               (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE")
+               (for-each
+                (lambda (test-id)
+                  (db:test-set-state-status dbstruct run-id test-id "COMPLETE" "DEAD" "Test failed to complete"))
+                all-ids))))))))
 
 ;; ALL REPLACED BY THE BLOCK ABOVE
 ;;
 ;; 	    (sqlite3:execute 
 ;; 	     db
@@ -1680,23 +1679,24 @@
 
 ;; returns number if string->number is successful, string otherwise
 ;; also updates *global-delta*
 ;;
 (define (db:get-var dbstruct var)
-  (let* ((res      #f)
-	 (dbdat    (db:get-db dbstruct #f))
-	 (db       (db:dbdat-get-db dbdat)))
-    (sqlite3:for-each-row
-     (lambda (val)
-       (set! res val))
-     db
-     "SELECT val FROM metadat WHERE var=?;" var)
-    ;; convert to number if can
-    (if (string? res)
-	(let ((valnum (string->number res)))
-	  (if valnum (set! res valnum))))
-    res))
+  (let* ((res      #f))
+    (db:with-db
+     dbstruct #f #f
+     (lambda (db)
+       (sqlite3:for-each-row
+        (lambda (val)
+          (set! res val))
+        db
+        "SELECT val FROM metadat WHERE var=?;" var)
+       ;; convert to number if can
+       (if (string? res)
+           (let ((valnum (string->number res)))
+             (if valnum (set! res valnum))))
+       res))))
 
 ;; This was part of db:get-var. It was used to estimate the load on
 ;; the database files.
 ;;
 ;; scale by 10, average with current value.
@@ -1811,13 +1811,11 @@
 
 ;; register a test run with the db, this accesses the main.db and does NOT
 ;; use server api
 ;;
 (define (db:register-run dbstruct keyvals runname state status user)
-  (let* ((dbdat     (db:get-db dbstruct #f))
-	 (db        (db:dbdat-get-db dbdat))
-	 (keys      (map car keyvals))
+  (let* ((keys      (map car keyvals))
 	 (keystr    (keys->keystr keys))	 
 	 (comma     (if (> (length keys) 0) "," ""))
 	 (andstr    (if (> (length keys) 0) " AND " ""))
 	 (valslots  (keys->valslots keys)) ;; ?,?,? ...
 	 (allvals   (append (list runname state status user) (map cadr keyvals)))
@@ -1828,20 +1826,17 @@
     (if (and runname (null? (filter (lambda (x)(not x)) keyvals))) ;; there must be a better way to "apply and"
 	(db:with-db
 	 dbstruct #f #f
 	 (lambda (db)
 	   (let ((res #f))
-	     ;; (db:delay-if-busy dbdat)
 	     (apply sqlite3:execute db (conc "INSERT OR IGNORE INTO runs (runname,state,status,owner,event_time" comma keystr ") VALUES (?,?,?,?,strftime('%s','now')" comma valslots ");")
 		    allvals)
-	     ;; (db:delay-if-busy dbdat)
 	     (apply sqlite3:for-each-row 
 		    (lambda (id)
 		      (set! res id))
 		    db
 		    (let ((qry (conc "SELECT id FROM runs WHERE (runname=? " andstr key=?str ");")))
-					;(debug:print 4 *default-log-port* "qry: " qry) 
 		      qry)
 		    qryvals)
 	     (sqlite3:execute db "UPDATE runs SET state=?,status=?,event_time=strftime('%s','now') WHERE id=? AND state='deleted';" state status res)
 	     res))) 
 	(begin
@@ -2162,47 +2157,38 @@
       ;; (hash-table-set! *run-info-cache* run-id finalres)
       finalres)))
 
 (define (db:set-comment-for-run dbstruct run-id comment)
   (db:with-db
-   dbstruct
-   #f
-   #f
+   dbstruct #f #f
    (lambda (db)
      (sqlite3:execute db "UPDATE runs SET comment=? WHERE id=?;" comment ;; (sdb:qry 'getid comment)
 		      run-id))))
 
 ;; does not (obviously!) removed dependent data. But why not!!?
 (define (db:delete-run dbstruct run-id)
-  ;; First set any related tests to DELETED
-  (let* ((rdbdat (db:get-db dbstruct run-id))
-	 (rdb    (db:dbdat-get-db rdbdat))
-	 (dbdat  (db:get-db dbstruct #f))
-	 (db     (db:dbdat-get-db dbdat)))
-    ;; (db:delay-if-busy rdbdat)
-    (sqlite3:with-transaction
-     db
-     (lambda ()
-       (sqlite3:execute rdb "DELETE FROM test_steps WHERE test_id IN (SELECT id FROM tests WHERE run_id=?);" run-id)
-       (sqlite3:execute rdb "DELETE FROM test_data WHERE test_id IN (SELECT id FROM tests WHERE run_id=?);"  run-id)
-       (sqlite3:execute rdb "UPDATE tests SET state='DELETED',comment='' WHERE run_id=?;" run-id)
-       ;; (db:delay-if-busy dbdat)
-       (sqlite3:execute db "UPDATE runs SET state='deleted',comment='' WHERE id=?;" run-id)))))
+  (db:with-db
+   dbstruct #f #f
+   (lambda (db)
+     (sqlite3:with-transaction
+      db
+      (lambda ()
+        (sqlite3:execute db "DELETE FROM test_steps WHERE test_id IN (SELECT id FROM tests WHERE run_id=?);" run-id)
+        (sqlite3:execute db "DELETE FROM test_data WHERE test_id IN (SELECT id FROM tests WHERE run_id=?);"  run-id)
+        (sqlite3:execute db "UPDATE tests SET state='DELETED',comment='' WHERE run_id=?;" run-id)
+        ;; (db:delay-if-busy dbdat)
+        (sqlite3:execute db "UPDATE runs SET state='deleted',comment='' WHERE id=?;" run-id))))))
 
 (define (db:update-run-event_time dbstruct run-id)
   (db:with-db
-   dbstruct
-   #f
-   #t
+   dbstruct #f #t
    (lambda (db)
      (sqlite3:execute db "UPDATE runs SET event_time=strftime('%s','now') WHERE id=?;" run-id))))
 
 (define (db:lock/unlock-run dbstruct run-id lock unlock user)
   (db:with-db
-   dbstruct
-   #f
-   #t
+   dbstruct #f #t
    (lambda (db)
      (let ((newlockval (if lock "locked"
 			   (if unlock
 			       "unlocked"
 			       "locked")))) ;; semi-failsafe
@@ -2210,23 +2196,21 @@
        (sqlite3:execute db "INSERT INTO access_log (user,accessed,args) VALUES(?,strftime('%s','now'),?);"
 			user (conc newlockval " " run-id))
        (debug:print-info 1 *default-log-port* "" newlockval " run number " run-id)))))
 
 (define (db:set-run-status dbstruct run-id status msg)
-  (let* ((dbdat (db:get-db dbstruct #f))
-	 (db    (db:dbdat-get-db dbdat)))
-    ;; (db:delay-if-busy dbdat)
-    (if msg
-	(sqlite3:execute db "UPDATE runs SET status=?,comment=? WHERE id=?;" status msg run-id)
-	(sqlite3:execute db "UPDATE runs SET status=? WHERE id=?;" status run-id))))
+  (db:with-db
+   dbstruct #f #f
+   (lambda (db)
+     (if msg
+         (sqlite3:execute db "UPDATE runs SET status=?,comment=? WHERE id=?;" status msg run-id)
+         (sqlite3:execute db "UPDATE runs SET status=? WHERE id=?;" status run-id)))))
 
 (define (db:get-run-status dbstruct run-id)
   (let ((res "n/a"))
     (db:with-db
-     dbstruct
-     #f
-     #f
+     dbstruct #f #f
      (lambda (db)
        (sqlite3:for-each-row 
 	(lambda (status)
 	  (set! res status))
 	db
@@ -2897,13 +2881,11 @@
 ;; Roll up test_data pass/fail results
 ;; look at the test_data status field, 
 ;;    if all are pass (any case) and the test status is PASS or NULL or '' then set test status to PASS.
 ;;    if one or more are fail (any case) then set test status to PASS, non "pass" or "fail" are ignored
 (define (db:test-data-rollup dbstruct run-id test-id status)
-  (let* ((dbdat      (db:get-db dbstruct run-id))
-	 (db         (db:dbdat-get-db dbdat))
-	 (fail-count 0)
+  (let* ((fail-count 0)
 	 (pass-count 0))
     (db:with-db
      dbstruct #f #f
      (lambda (db)
        (sqlite3:for-each-row
@@ -3200,53 +3182,55 @@
 	 (test-name    (if (number? test-name)
 			   (db:test-get-testname testdat)
 			   test-name))
 	 (item-path    (db:test-get-item-path testdat))
          (tl-testdat   (db:get-test-info dbstruct run-id test-name ""))
-         (tl-test-id   (db:test-get-id tl-testdat))
-	 (db           (db:dbdat-get-db  (db:get-db dbstruct #f))))
+         (tl-test-id   (db:test-get-id tl-testdat)))
     (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
 	(db:general-call dbstruct 'set-test-start-time (list test-id)))
-    (mutex-lock! *db-transaction-mutex*)
-    (let ((tr-res
-	   (sqlite3:with-transaction
-	    db
-	    (lambda ()
-	      ;; (db:test-set-state-status-by-id dbstruct run-id test-id state status comment)
-	      (db:test-set-state-status dbstruct run-id test-id state status comment)
-	      (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
-		  (let* ((state-status-counts  (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path)) ;; item-path is used to exclude current state/status of THIS test
-			 (running              (length (filter (lambda (x)
-								 (member (dbr:counts-state x) *common:running-states*))
-							       state-status-counts)))
-			 (bad-not-started      (length (filter (lambda (x)
-								 (and (equal? (dbr:counts-state x) "NOT_STARTED")
-								      (not (member (dbr:counts-status x)
-										   *common:not-started-ok-statuses*))))
-								  state-status-counts)))
-			 (all-curr-states   (common:special-sort  ;; worst -> best (sort of)
-					     (delete-duplicates
-					      (cons state (map dbr:counts-state state-status-counts)))
-					     *common:std-states* >))
-			 (all-curr-statuses (common:special-sort  ;; worst -> best
-					     (delete-duplicates
-					      (cons status (map dbr:counts-status state-status-counts)))
-					     *common:std-statuses* >))
-			 (newstate          (if (> running 0)
-						"RUNNING"
-						(if (> bad-not-started 0)
-						    "COMPLETED"
-						    (car all-curr-states))))
-			 (newstatus         (if (> bad-not-started 0)
-						"CHECK"
-						(car all-curr-statuses))))
-		    ;; (print "Setting toplevel to: " newstate "/" newstatus)
-		    (db:test-set-state-status dbstruct run-id tl-test-id newstate newstatus #f)))))))
-      (mutex-unlock! *db-transaction-mutex*)
-      (if (and test-id state status (equal? status "AUTO")) 
-	  (db:test-data-rollup dbstruct run-id test-id status))
-      tr-res)))
+    ;; (mutex-lock! *db-transaction-mutex*)
+    (db:with-db
+     dbstruct #f #f
+     (lambda (db)
+       (let ((tr-res
+              (sqlite3:with-transaction
+               db
+               (lambda ()
+                 ;; (db:test-set-state-status-by-id dbstruct run-id test-id state status comment)
+                 (db:test-set-state-status dbstruct run-id test-id state status comment)
+                 (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
+                     (let* ((state-status-counts  (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path)) ;; item-path is used to exclude current state/status of THIS test
+                            (running              (length (filter (lambda (x)
+                                                                    (member (dbr:counts-state x) *common:running-states*))
+                                                                  state-status-counts)))
+                            (bad-not-started      (length (filter (lambda (x)
+                                                                    (and (equal? (dbr:counts-state x) "NOT_STARTED")
+                                                                         (not (member (dbr:counts-status x)
+                                                                                      *common:not-started-ok-statuses*))))
+								  state-status-counts)))
+                            (all-curr-states   (common:special-sort  ;; worst -> best (sort of)
+                                                (delete-duplicates
+                                                 (cons state (map dbr:counts-state state-status-counts)))
+                                                *common:std-states* >))
+                            (all-curr-statuses (common:special-sort  ;; worst -> best
+                                                (delete-duplicates
+                                                 (cons status (map dbr:counts-status state-status-counts)))
+                                                *common:std-statuses* >))
+                            (newstate          (if (> running 0)
+                                                   "RUNNING"
+                                                   (if (> bad-not-started 0)
+                                                       "COMPLETED"
+                                                       (car all-curr-states))))
+                            (newstatus         (if (> bad-not-started 0)
+                                                   "CHECK"
+                                                   (car all-curr-statuses))))
+                       ;; (print "Setting toplevel to: " newstate "/" newstatus)
+                       (db:test-set-state-status dbstruct run-id tl-test-id newstate newstatus #f)))))))
+         ;; (mutex-unlock! *db-transaction-mutex*)
+         (if (and test-id state status (equal? status "AUTO")) 
+             (db:test-data-rollup dbstruct run-id test-id status))
+         tr-res)))))
 
 (define (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path)
   (db:with-db
    dbstruct #f #f
    (lambda (db)
@@ -3484,27 +3468,21 @@
       db
       "SELECT state,status,count(state) FROM tests WHERE run_id=? AND testname=? AND item_path='' GROUP BY state,status;"
       run-id testname)
      res)))
 
-
 (define (db:get-latest-host-load dbstruct raw-hostname)
   (let* ((hostname (string-substitute "\\..*$" "" raw-hostname))
-        (res  (cons -1 0))
-        (mydb (db:dbdat-get-db (db:get-db dbstruct 0)))
-        )
+         (res  (cons -1 0)))
     (db:with-db
-     dbstruct
-     0
-     #f
+     dbstruct #f #f
      (lambda (db)
        (sqlite3:for-each-row
         (lambda (cpuload update-time)  (set! res (cons cpuload update-time)))
         db
         "SELECT tr.cpuload, tr.update_time FROM test_rundat tr, tests t WHERE t.host=? AND tr.cpuload != -1  AND tr.test_id=t.id ORDER BY tr.update_time DESC LIMIT 1;"
         hostname))) res ))
-
 
 (define (db:set-top-level-from-items dbstruct run-id testname)
   (let* ((summ  (db:get-state-status-summary dbstruct run-id testname))
 	 (find  (lambda (state status)
 		  (if (null? summ) 
@@ -3849,11 +3827,11 @@
 (define (db:extract-ods-file dbstruct outputfile keypatt-alist runspatt pathmod)
   (let* ((keysstr  (string-intersperse (map car keypatt-alist) ","))
 	 (keyqry   (string-intersperse (map (lambda (p)(conc (car p) " LIKE ? ")) keypatt-alist) " AND "))
 	 (numkeys  (length keypatt-alist))
 	 (test-ids '())
-	 (dbdat    (db:get-db dbstruct #f))
+	 (dbdat    (db:get-db dbstruct))
 	 (db       (db:dbdat-get-db dbdat))
 	 (windows  (and pathmod (substring-index "\\" pathmod)))
 	 (tempdir  (conc "/tmp/" (current-user-name) "/" runspatt "_" (random 10000) "_" (current-process-id)))
 	 (runsheader (append (list "Run Id" "Runname") ; 0 1
 			     (map car keypatt-alist)   ; + N = length keypatt-alist

Index: tasks.scm
==================================================================
--- tasks.scm
+++ tasks.scm
@@ -783,11 +783,11 @@
 (define (tasks:find-task-queue-records dbstruct target run-name test-patt state-patt action-patt)
   ;; (handle-exceptions
   ;;  exn
   ;;  '()
   ;;  (sqlite3:first-row
-  (let ((db (db:delay-if-busy (db:get-db dbstruct #f)))
+  (let ((db (db:delay-if-busy (db:get-db dbstruct)))
 	(res '()))
     (sqlite3:for-each-row 
      (lambda (a . b)
        (set! res (cons (cons a b) res)))
      db "SELECT id,action,owner,state,target,name,testpatt,keylock,params FROM tasks_queue