Index: Makefile
==================================================================
--- Makefile
+++ Makefile
@@ -28,11 +28,11 @@
            ezsteps.scm lock-queue.scm sdb.scm rmt.scm api.scm		\
            subrun.scm portlogger.scm archive.scm env.scm		\
            diff-report.scm cgisetup/models/pgdb.scm
 
 # module source files
-MSRCFILES = 
+MSRCFILES = adjutant.scm mutils.scm
 # ftail.scm rmtmod.scm commonmod.scm removed
 # MSRCFILES = ducttape-lib.scm pkts.scm stml2.scm cookie.scm mutils.scm	\
 #             mtargs.scm commonmod.scm dbmod.scm adjutant.scm ulex.scm	\
 #             rmtmod.scm apimod.scm
 

Index: TODO
==================================================================
--- TODO
+++ TODO
@@ -16,10 +16,17 @@
 #     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 TODO
 ====
 
+WW38
+. Add test_rundat to no-sync ==> correction, put in <testdir>/.meta/test-run.dat
+. Add STATE/STATUS transitions to .meta/test-run.dat or similar
+. Swizzle update-test-rundat to operate on no-sync
+. Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync
+. On state/status change update tests table with duration
+
 WW15
 . fill newview matrix with data, filter pipeline gui elements
 . improve [script], especially indent handling
 
 WW16
@@ -35,11 +42,10 @@
 . break command line into sections; all, run control, queries, utilities etc.
 . pull in ftfplan (not integrated, just code pulled in)
 
 WW20
 . ./configure => ubuntu, sles11, sles12, rh7
-. Jenkins junit XML support
 . Add output flushing in teamcity support
 . Switch to using simple runs query everywhere
 . Add end_time to runs and add a rollup call that sets state, status and end_time
 
 Future

Index: adjutant.scm
==================================================================
--- adjutant.scm
+++ adjutant.scm
@@ -22,12 +22,23 @@
 
 (module adjutant *
 
 (import scheme chicken data-structures extras files)
 (import (prefix sqlite3 sqlite3:) posix typed-records srfi-18 srfi-69
-	md5 message-digest
+	md5 message-digest matchable
 	regex srfi-1)
 
-(define (adjutant-run)
-  (print "Running the adjutant!"))
+(define (adjutant-run host-type rmt:no-sync-take-job) ;; rmt:no-sync-take-job: procedure called with host-type to fetch the next job
+  (print "Running the adjutant!")
+  (let loop ((wait-count 0)) ;; wait-count = consecutive polls that did not yield a job
+    (if (< wait-count 10) ;; up to 10 idle polls, each followed by a 10 second sleep, before giving up
+	(let* ((dat (rmt:no-sync-take-job host-type)))
+	  (match dat
+	    ((id ht vars exekey cmdline state event-time last-update) ;; an eight element list is treated as a job record
+	     (system cmdline) ;; execute the job's command line
+	     (loop 0)) ;; got work, so reset the idle counter
+	    (else ;; catch-all: anything that is not an eight element list means no job was available
+	     (thread-sleep! 10)
+	     (loop (+ wait-count 1)))))
+	(print "I'm bored. Exiting."))))
 
 )

Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -251,17 +251,20 @@
 		   ;; NO SYNC DB
 		   ((no-sync-set)               (apply db:no-sync-set         *no-sync-db* params))
 		   ((no-sync-get/default)       (apply db:no-sync-get/default *no-sync-db* params))
 		   ((no-sync-del!)              (apply db:no-sync-del!        *no-sync-db* params))
 		   ((no-sync-get-lock)          (apply db:no-sync-get-lock    *no-sync-db* params))
+		   ((no-sync-add-job)           (apply db:no-sync-add-job     *no-sync-db* params))
+		   ((no-sync-take-job)          (apply db:no-sync-take-job    *no-sync-db* params))
+		   ((no-sync-job-records-clean) (apply db:no-sync-job-records-clean *no-sync-db* params))
 		   
                    ;; ARCHIVES
                    ;; ((archive-get-allocations)   
                    ((archive-register-disk)     (apply db:archive-register-disk dbstruct params))
                    ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params))
                    ;; ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey))
-
+		   
                    ;;======================================================================
                    ;; READ ONLY QUERIES
                    ;;======================================================================
 
                    ;; KEYS

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -567,11 +567,11 @@
 			     (- num-logs max-allowed))))
 	    (for-each
 	     (lambda (file)
 	       (let* ((fullname (conc "logs/" file)))
 		 (if (directory? fullname)
-		     (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
+		     (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
 		     (handle-exceptions
 		      exn
 		      (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn)
 		      (delete-file* fullname)))))
 	     files)
@@ -3171,31 +3171,39 @@
 ;; arm cubie01 cubie02
 ;; x86_64 zeus xena myth01
 ;; allhosts #{g hosts arm} #{g hosts x86_64}
 ;; 
 ;; [host-types]
+;; C/M/A      lets megatest know this launcher provides C cores, M bytes memory for architecture A 
+;; 2/2G/arm   smart -cores 2 -memory 2G -arch arm
 ;; general #MTLOWESTLOAD #{g hosts allhosts}
 ;; arm     #MTLOWESTLOAD #{g hosts arm}
 ;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo
+;;
+;; NOTE: host-rules is ONLY used for MTLOWESTLOAD
 ;;
 ;; [host-rules]
 ;; # maxnload   => max normalized load
 ;; # maxnjobs   => max jobs per cpu
 ;; # maxjobrate => max jobs per second
 ;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 
 ;; 
 ;; [launchers]
 ;; envsetup general
-;; xor/%/n 4C16G
+;; xor/%/n 2/2G/arm
 ;; % nbgeneral
 ;; 
 ;; [jobtools]
 ;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match.
 ;; flexi-launcher yes  
 ;; launcher nbfake
+;; mode adjutant|normal (default is normal)
+;; 
+;;
+;; mode is 'normal (i.e. directly use launcher) or 'adjutant (i.e. use adjutant)
 ;;
-(define (common:get-launcher configdat testname itempath)
+(define (common:get-launcher configdat testname itempath mode)
   (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher")))
     (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher
 	     (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no")))
 	(let* ((launchers         (hash-table-ref/default configdat "launchers" '())))
 	  (if (null? launchers)
@@ -3202,32 +3210,36 @@
 	      fallback-launcher
 	      (let loop ((hed (car launchers))
 			 (tal (cdr launchers)))
 		(let ((patt      (car hed))
 		      (host-type (cadr hed)))
-		  (if (tests:match patt testname itempath)
+		  (if (tests:match patt testname itempath) ;; have a launcher match for this test
 		      (begin
 			(debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type)
-			(let ((launcher (configf:lookup configdat "host-types" host-type)))
+			(let ((launcher (configf:lookup configdat "host-types" host-type))) ;; find the actual launcher from the host-types table
+			  ;; if we are in adjutant mode then we want to return both host-type and launcher
 			  (if launcher
 			      (let* ((launcher-parts (string-split launcher))
 				     (launcher-exe   (car launcher-parts)))
 				(if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline
 				    (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat))
 						    (count     100))
 				      (if targ-host
 					  (conc "remrun " targ-host)
 					  (if (> count 0)
+					      
 					      (begin
 						(debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type)
 						(thread-sleep! (- 101 count))
 						(host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)
 							   (- count 1)))
 					      (begin
 						(debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type)
 						(exit)))))
-				    launcher))
+				    (case mode
+				      ((adjutant) (list host-type launcher))
+				      (else       launcher))))
 			      (begin
 				(debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type)
 				(if (null? tal)
 				    fallback-launcher
 				    (loop (car tal)(cdr tal)))))))

Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -212,10 +212,12 @@
   (hash-table-set!
    (dboard:commondat-tabdats commondat)
    tabnum
    tabdat))
 
+(define *updater-running* #f) ;; move this into one of the structs
+
 ;; gets and calls updater list based on curr-tab-num
 ;;
 (define (dboard:common-run-curr-updaters commondat #!key (tab-num #f))
   (if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat
       (let* ((tnum     (or tab-num (dboard:commondat-curr-tab-num commondat)))
@@ -240,11 +242,13 @@
 		     tnum
 		     (cons updater curr-updaters))))
 
 ;; data for each specific tab goes here
 ;;
-(defstruct dboard:tabdat 
+(defstruct dboard:tabdat
+  ;; flags
+  ((already-running #f)                  : boolean)     ;; the updater is already running. skip
   ;; runs
   ((allruns         '())                 : list)        ;; list of dboard:rundat records
   ((allruns-by-id    (make-hash-table))  : hash-table)  ;; hash of run-id -> dboard:rundat records
   ((done-runs       '())                 : list)        ;; list of runs already drawn
   ((not-done-runs   '())                 : list)        ;; list of runs not yet drawn
@@ -645,11 +649,11 @@
 ;;
 (define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals)
   (let* ((start-time   (current-seconds))
 	 (access-mode  (dboard:tabdat-access-mode tabdat))
          (num-to-get   (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get")
-                                           "200")))
+                                           "60"))) ;; was 200, which is fine in a normal run area.
 	 (states       (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat)))
 	 (statuses     (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat)))
          (do-not-use-db-file-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab
          (do-not-use-query-timestamps   #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab
 	 (sort-info    (get-curr-sort))
@@ -716,10 +720,11 @@
     (if got-all
 	(begin
 	  (dboard:rundat-last-update-set!     run-dat (- start-time 2))
 	  (dboard:rundat-run-data-offset-set! run-dat 0))
 	(begin
+	  ;;; (thread-sleep! 0.25) ;; give the rest of the gui some time to update. <-- this did NOT help
 	  (dboard:rundat-run-data-offset-set! run-dat
 					      (+ num-to-get (dboard:rundat-run-data-offset run-dat)))))
 
     (for-each 
      (lambda (tdat)
@@ -833,11 +838,13 @@
 ;;
 ;; create a virtual table of all the tests
 ;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") )
 ;;
 (define (dboard:update-rundat tabdat runnamepatt numruns testnamepatt keypatts)
-  (let* ((access-mode      (dboard:tabdat-access-mode tabdat))
+  (dboard:tabdat-already-running-set! tabdat #t)
+  (let* (;; (already-running  (dboard:tabdat-already-running tabdat))
+	 (access-mode      (dboard:tabdat-access-mode tabdat))
          (keys             (dboard:tabdat-keys tabdat)) ;; (db:dispatch-query access-mode rmt:get-keys db:get-keys)))
 	 (last-runs-update (- (dboard:tabdat-last-runs-update tabdat) 2))
          (allruns          (rmt:get-runs runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts))
          ;;(allruns-tree (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f))
          (allruns-tree    (rmt:get-runs-by-patt keys "%" #f #f #f #f 0)) ;; last-runs-update));;'("id" "runname")
@@ -901,27 +908,26 @@
 		  (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id)
 		  (hash-table-set!    (dboard:tabdat-allruns-by-id tabdat) run-id run-struct))
 	      (if (or (null? tal)
 		      (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update
 		  (begin
-		    (when (> elapsed-time 2)   
+		    #;(when (> elapsed-time 2)   
                       (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
                       (let* ((old-val (iup:attribute *tim* "TIME"))
                              (new-val (number->string (inexact->exact (floor (* 2  (string->number old-val)))))))
                         (if (< (string->number new-val) 5000)
-                            ((debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
-                            (iup:attribute-set! *tim* "TIME" new-val))))
-
-
-                      )
+                            (begin
+			      (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
+			      (iup:attribute-set! *tim* "TIME" new-val)))))
 		    (dboard:tabdat-allruns-set! tabdat new-res)
 		    maxtests)
 		  (if (> (dboard:rundat-run-data-offset run-struct) 0)
 		      (loop run tal new-res newmaxtests) ;; not done getting data for this run
 		      (loop (car tal)(cdr tal) new-res newmaxtests)))))))
     (dboard:tabdat-filters-changed-set! tabdat #f)
-    (dboard:update-tree tabdat runs-hash header tb)))
+    (dboard:update-tree tabdat runs-hash header tb)
+    (dboard:tabdat-already-running-set! tabdat #f)))
 
 (define *collapsed* (make-hash-table))
 
 (define (toggle-hide lnum uidat) ; fulltestname)
   (let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum))
@@ -2505,14 +2511,10 @@
 
         ;; insert extra widget here
         (if extra-widget
             extra-widget
             (iup:hbox)) ;; empty widget
-
-        
-
-        
         )))
 
      (let* ((status-toggles (map (lambda (status)
 				   (iup:toggle (conc status)
 					       #:fontsize 8 ;; btn-fontsz ;; "10"
@@ -3725,28 +3727,36 @@
 ;; (define (tabdat-values tabdat)
 
 ;; runs update-rundat using the various filters from the gui
 ;;
 (define (dashboard:do-update-rundat tabdat)
+  ;; this seems like a good place to check for already running and skip if so
+  ;;
+  ;; (set! *updater-running* #t)
+;;(if (dboard:tabdat-already-running tabdat)
+;;    (begin
+;;	(debug:print-info 0 *default-log-port* "Dashboard overloaded - updates will be slow, skipping update.")
+;;	(dboard:tabdat-target tabdat))
   (dboard:update-rundat
    tabdat
    (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%")
    (dboard:tabdat-numruns tabdat)
    (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%")
    ;; generate key patterns from the target stored in tabdat
    (let* ((dbkeys (dboard:tabdat-dbkeys tabdat)))
      (let ((fres   (if (dboard:tabdat-target tabdat)
-                       (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
-                         (map (lambda (k v)(list k v)) dbkeys ptparts))
-                       (let ((res '()))
-                         (for-each (lambda (key)
-                                     (if (not (equal? key "runname"))
-                                         (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
-                                           (if val (set! res (cons (list key val) res))))))
-                                   dbkeys)
-                         res))))
-       fres))))
+		       (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
+			 (map (lambda (k v)(list k v)) dbkeys ptparts))
+		       (let ((res '()))
+			 (for-each (lambda (key)
+				     (if (not (equal? key "runname"))
+					 (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
+					   (if val (set! res (cons (list key val) res))))))
+				   dbkeys)
+			 res))))
+       fres)))
+  #;(set! *updater-running* #f))
 
 (define (dashboard:runs-tab-updater commondat tab-num)
   (debug:catch-and-dump 
    (lambda ()
      (let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num))
@@ -3801,22 +3811,25 @@
 	   (dashboard:runs-tab-updater commondat 1))
 	 tab-num: 2)
 	(iup:callback-set! *tim*
 			   "ACTION_CB"
 			   (lambda (time-obj)
-			     (let ((update-is-running #f))
-			     (mutex-lock! (dboard:commondat-update-mutex commondat))
-			     (set! update-is-running (dboard:commondat-updating commondat))
-			     (if (not update-is-running)
-			     (dboard:commondat-updating-set! commondat #t))
-			     (mutex-unlock! (dboard:commondat-update-mutex commondat))
-			     (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
-			     (begin
-			     (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
-			     (mutex-lock! (dboard:commondat-update-mutex commondat))
-			     (dboard:commondat-updating-set! commondat #f)
-			     (mutex-unlock! (dboard:commondat-update-mutex commondat)))
+			     (if (not *updater-running*)
+				 (begin
+				   ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
+				   ;; (set! update-is-running (dboard:commondat-updating commondat))
+				   ;;(if (not update-is-running)
+				   ;;	   (dboard:commondat-updating-set! commondat #t))
+				   ;;(mutex-unlock! (dboard:commondat-update-mutex commondat))
+				   ;;(if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
+				   ;;	   (begin
+				   (set! *updater-running* #t)
+				   (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
+				   (set! *updater-running* #f)
+				   ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
+				   ;;  (dboard:commondat-updating-set! commondat #f)
+				   ;;   (mutex-unlock! (dboard:commondat-update-mutex commondat)))
 				   ))
 			     1))))
       
       (let ((th1 (make-thread (lambda ()
 				(thread-sleep! 1)

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -1534,11 +1534,11 @@
                               state        TEXT DEFAULT 'new',
                               status       TEXT DEFAULT 'n/a',
                               archive_type TEXT DEFAULT 'bup',
                               du           INTEGER,
                               archive_path TEXT);")))
-         (print "creating trigges from init") 
+         ;; (print "creating trigges from init") 
         (db:create-triggers db)    
      db)) ;; )
 
 ;;======================================================================
 ;; A R C H I V E S
@@ -2142,14 +2142,62 @@
 	 (db-exists (common:file-exists? dbname))
 	 (db     (sqlite3:open-database dbname)))
     (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000))
     (if (not db-exists)
 	(begin
-	  (sqlite3:execute db "PRAGMA synchronous = 0;")
 	  (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));")
 	  (sqlite3:execute db "PRAGMA journal_mode=WAL;")))
+    ;; MOVE THIS TABLE CREATION TO THE (begin above in about six months (it is Sep 2020 right now).
+    (sqlite3:execute db "CREATE TABLE IF NOT EXISTS jobs_queue (id INTEGER PRIMARY KEY, host_type TEXT, cores INTEGER, memory TEXT, vars TEXT, exekey TEXT, cmdline TEXT, state TEXT, event_time INTEGER, last_update INTEGER);")
+    ;; not sure I'll use this next one. I prefer if tests simply append to a file:
+    ;;    last-update-seconds cpuload tmpspace rundirspace
+    (sqlite3:execute db "CREATE TABLE IF NOT EXISTS test_extra_data (id INTEGER PRIMARY KEY, run_id INTEGER, test_id INTEGER, last_seen_running INTEGER);") 
+    (sqlite3:execute db "PRAGMA synchronous = 0;")
     db))
+
+(define (db:no-sync-add-job db-in host-type vars-list exekey cmdline) ;; queue a job in jobs_queue with state "waiting"
+  (sqlite3:execute (db:no-sync-db db-in) "INSERT INTO jobs_queue (host_type,vars,exekey,cmdline,state,event_time,last_update) VALUES (?,?,?,?,?,?,?);"
+		   host-type vars-list exekey cmdline "waiting" (current-seconds)(current-seconds))) ;; event_time and last_update both start at the current time; cores and memory are not set here
+
+;; find next job (waiting longest) that matches host-type - future, we'll find jobs that fit if no exact match
+;; returns the row as a list (id host-type vars exekey cmdline state event-time last-update) or #f if nothing is waiting
+(define (db:no-sync-take-job db-in host-type)
+  (let* ((db   (db:no-sync-db db-in))
+	 (stmt1 "SELECT id,host_type,vars,exekey,cmdline,state,event_time,last_update FROM jobs_queue WHERE host_type=? AND state != 'taken' ORDER BY event_time ASC;")
+	 (stmt1h (sqlite3:prepare db stmt1))
+	 (stmt2  "UPDATE jobs_queue SET state='taken',last_update=? WHERE id=?;")
+	 (stmt2h (sqlite3:prepare db stmt2))
+	 (res    (sqlite3:with-transaction ;; select and mark inside a single transaction
+		  db
+		  (lambda ()
+		    ;; fold-row conses each row onto the front, which would leave the newest job first;
+		    ;; reverse so the list is oldest-first, matching the ORDER BY in stmt1
+		    (let* ((matching-jobs (reverse
+					   (sqlite3:fold-row
+					    (lambda (res . row) (cons row res)) ;; row = id host-type vars exekey cmdline state event-time last-update
+					    '()
+					    stmt1h
+					    host-type))))
+		      (if (null? matching-jobs)
+			  #f
+			  (let ((choosen-one  (let loop ((tal matching-jobs)
+							 (res #f)) ;; put bestest one in here
+						(if (null? tal)
+						    res
+						    (car tal))))) ;; future: compare with res, if better candidate then loop with it else loop with res
+			    (if choosen-one ;; we need to mark it as taken
+				(sqlite3:execute stmt2h (current-seconds) (car choosen-one))) ;; car of the row is the job id
+			    choosen-one)))))))
+    (sqlite3:finalize! stmt1h) ;; it'd be nice to cache these and finalize on exit.
+    (sqlite3:finalize! stmt2h)
+    res))
+
+;; clean out old jobs in queue, i.e. those in state 'taken' whose event_time is more than 24 hrs ago
+;;
+(define (db:no-sync-job-records-clean db)
+  (sqlite3:execute (db:no-sync-db db) "DELETE FROM jobs_queue WHERE state='taken' AND event_time < ?;" (- (current-seconds)(* 24 3600)))) ;; cutoff = now minus 24 hours, in seconds
+		 
 
 ;; if we are not a server create a db handle. this is not finalized
 ;; so watch for problems. I'm still not clear if it is needed to manually
 ;; finalize sqlite3 dbs with the sqlite3 egg.
 ;;
@@ -2167,12 +2215,13 @@
   (sqlite3:execute (db:no-sync-db db) "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES (?,?);" var val))
 
 (define (db:no-sync-del! db var)
   (sqlite3:execute (db:no-sync-db db) "DELETE FROM no_sync_metadat WHERE var=?;" var))
 
-(define (db:no-sync-get/default db var default)
-  (let ((res default))
+(define (db:no-sync-get/default db-in var default)
+  (let ((db  (db:no-sync-db db-in))
+	(res default))
     (sqlite3:for-each-row
      (lambda (val)
        (set! res val))
      (db:no-sync-db db)
      "SELECT val FROM no_sync_metadat WHERE var=?;"
@@ -3467,11 +3516,11 @@
      (lambda (run-id)
        (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
 	 (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs)))
      run-ids)))
 
-;; Get test data using test_id, run-id is not used
+;; Get test data using test_id, run-id is not used - but it will be!
 ;; 
 (define (db:get-test-info-by-id dbstruct run-id test-id)
   (db:with-db
    dbstruct
    #f ;; run-id

Index: docs/manual/megatest_manual.html
==================================================================
--- docs/manual/megatest_manual.html
+++ docs/manual/megatest_manual.html
@@ -769,13 +769,13 @@
 <h1>The Megatest Users Manual</h1>
 <span id="author">Matt Welland</span><br>
 <span id="email" class="monospaced">&lt;<a href="mailto:matt@kiatoa.com">matt@kiatoa.com</a>&gt;</span><br>
 <span id="revnumber">version 1.5,</span>
 <span id="revdate">June 2020</span>
-<div id="toc">
-  <div id="toctitle">Table of Contents</div>
-  <noscript><p><b>JavaScript must be enabled in your browser to display the table of contents.</b></p></noscript>
+<div id="toc">
+  <div id="toctitle">Table of Contents</div>
+  <noscript><p><b>JavaScript must be enabled in your browser to display the table of contents.</b></p></noscript>
 </div>
 </div>
 <div id="content">
 <div class="sect1">
 <h2 id="_preface">Preface</h2>
@@ -3440,10 +3440,48 @@
 <td class="tableblock halign-left valign-top" ><p class="tableblock monospaced"></p></td>
 </tr>
 </tbody>
 </table>
 </div>
+</div>
+<div class="sect1">
+<h2 id="_test_plan">Test Plan</h2>
+<div class="sectionbody">
+<div class="sect2">
+<h3 id="_tests">Tests</h3>
+<div class="paragraph"><p>itemwait|33</p></div>
+<div class="paragraph"><p>rerun-downstream-item|20</p></div>
+<div class="paragraph"><p>rerunclean|20</p></div>
+<div class="paragraph"><p>fullrun|18</p></div>
+<div class="paragraph"><p>goodtests|18</p></div>
+<div class="paragraph"><p>kill-rerun|17</p></div>
+<div class="paragraph"><p>items-runconfigvars|16</p></div>
+<div class="paragraph"><p>ro_test|16</p></div>
+<div class="paragraph"><p>runconfig-tests|16</p></div>
+<div class="paragraph"><p>env-pollution|13</p></div>
+<div class="paragraph"><p>itemmap|11</p></div>
+<div class="paragraph"><p>testpatt_envvar|10</p></div>
+<div class="paragraph"><p>toprun|10</p></div>
+<div class="paragraph"><p>chained-waiton|8</p></div>
+<div class="paragraph"><p>skip-on-fileexists|8</p></div>
+<div class="paragraph"><p>killrun_preqfail|7</p></div>
+<div class="paragraph"><p>subrun|6</p></div>
+<div class="paragraph"><p>dependencies|5</p></div>
+<div class="paragraph"><p>itemwait-simple|4</p></div>
+<div class="paragraph"><p>rollup|4</p></div>
+<div class="paragraph"><p>end-of-run|3</p></div>
+<div class="paragraph"><p>killrun|3</p></div>
+<div class="paragraph"><p>listener|3</p></div>
+<div class="paragraph"><p>test2|3</p></div>
+<div class="paragraph"><p>testpatt|3</p></div>
+<div class="paragraph"><p>env-pollution-usecacheno|2</p></div>
+<div class="paragraph"><p>set-values|2
+envvars|1
+listruns-tests|1
+subrun-usecases|1</p></div>
+</div>
+</div>
 </div>
 <div class="sect1">
 <h2 id="_megatest_internals">Megatest Internals</h2>
 <div class="sectionbody">
 <div class="imageblock graphviz">
@@ -3461,11 +3499,11 @@
 </div>
 <div id="footnotes"><hr></div>
 <div id="footer">
 <div id="footer-text">
 Version 1.5<br>
-Last updated
+Last updated 2020-09-08 08:39:29 PDT
  2020-08-22 12:47:36 MST
 </div>
 </div>
 </body>
 </html>

Index: docs/manual/megatest_manual.txt
==================================================================
--- docs/manual/megatest_manual.txt
+++ docs/manual/megatest_manual.txt
@@ -116,10 +116,12 @@
 include::writing_tests.txt[]
 
 include::howto.txt[]
 
 include::reference.txt[]
+
+include::testplan.txt[]
 
 Megatest Internals
 ------------------
 
 ["graphviz", "server.png"]

ADDED   docs/manual/testplan.txt
Index: docs/manual/testplan.txt
==================================================================
--- /dev/null
+++ docs/manual/testplan.txt
@@ -0,0 +1,81 @@
+// This file is part of Megatest.
+// 
+//     Megatest is free software: you can redistribute it and/or modify
+//     it under the terms of the GNU General Public License as published by
+//     the Free Software Foundation, either version 3 of the License, or
+//     (at your option) any later version.
+// 
+//     Megatest is distributed in the hope that it will be useful,
+//     but WITHOUT ANY WARRANTY; without even the implied warranty of
+//     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//     GNU General Public License for more details.
+// 
+//     You should have received a copy of the GNU General Public License
+//     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
+// Copyright 2006-2020, Matthew Welland.
+
+Test Plan
+---------
+
+Tests
+~~~~~
+
+itemwait|33
+
+rerun-downstream-item|20
+
+rerunclean|20
+
+fullrun|18
+
+goodtests|18
+
+kill-rerun|17
+
+items-runconfigvars|16
+
+ro_test|16
+
+runconfig-tests|16
+
+env-pollution|13
+
+itemmap|11
+
+testpatt_envvar|10
+
+toprun|10
+
+chained-waiton|8
+
+skip-on-fileexists|8
+
+killrun_preqfail|7
+
+subrun|6
+
+dependencies|5
+
+itemwait-simple|4
+
+rollup|4
+
+end-of-run|3
+
+killrun|3
+
+listener|3
+
+test2|3
+
+testpatt|3
+
+env-pollution-usecacheno|2
+
+set-values|2
+envvars|1
+listruns-tests|1
+subrun-usecases|1
+
+

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -205,17 +205,19 @@
 			     (current-seconds) 
 			     start-seconds)))))
 	 (kill-tries 0))
     ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
     ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
-    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
+    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t)
 
     (let loop ((minutes   (calc-minutes))
 	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
 	       (disk-free (get-df (current-directory)))
                (last-sync (current-seconds)))
-      (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
+      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
+      ;; top of loop encountered at "(current-seconds)" with
+      ;; last-sync="last-sync))
       (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
              (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                    (delta (abs (- load cpu-load))))
                               (if (> delta 0.1) ;; don't bother updating with small changes
                                   load
@@ -233,33 +235,28 @@
              (test-info   (rmt:get-test-info-by-id run-id test-id))
              (state       (db:test-get-state test-info))
              (status      (db:test-get-status test-info))
              (kill-reason  "no kill reason specified")
              (kill-job?    #f))
-        (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
+        #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
         (cond
          ((test-get-kill-request run-id test-id)
           (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
           (set! kill-job? #t))
          ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
           (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
           (set! kill-job? #t))
          ((equal? status "DEAD")
-          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
+          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t)
           (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
           ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
           (set! kill-job? #f)))
 
         (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
         (launch:handle-zombie-tests run-id)
-        (when do-sync
-          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
-          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
-          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
-          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
-          (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
-        
+        (if do-sync ;; save meta data about the running of this test
+	    (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
 	(if kill-job? 
 	    (begin
               (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
 	      (mutex-lock! m)
 	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
@@ -312,11 +309,11 @@
 	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
 		  (loop (calc-minutes)
                         (or new-cpu-load cpu-load)
                         (or new-disk-free disk-free)
                         (if do-sync (current-seconds) last-sync)))))))
-    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
+    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional
 
 
 (define (launch:execute encoded-cmd)
   (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
 	 (tconfigreg #f))
@@ -465,11 +462,13 @@
 				  (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
 				  (exit))))
 		 (test-pid  (db:test-get-process_id  test-info)))
 	    (cond
              ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.
-	     ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
+	     ((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
+		  (and (equal? (db:test-get-state test-info) "COMPLETED")                           ;; completed/abort => rerun if asked
+		       (member (db:test-get-status test-info) '("ABORT"))))
 	      (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
 	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
 
               (rmt:general-call 'set-test-start-time #f test-id)
               (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
@@ -1355,12 +1354,20 @@
       (debug:print 1 *default-log-port* "INFO: search and mark zombie tests")
       (rmt:set-var key (current-seconds))
       (rmt:find-and-mark-incomplete run-id #f))))
 
 
-
+(defstruct launch:ajt      ;; data collected by launch-test for a job handed to the adjutant
+  (vars '())               ;; list of env var entries, extended via launch:ajt-add-vars
+  (exekey #f)              ;; set from cmdparms in launch-test
+  (host-type #f)           ;; set from (car launcher) when launcher-mode is 'adjutant
+  (test-sig  #f)           ;; "<testsuite-name>:<test-name>:<item-path>"
+  (cmdline   #f))          ;; megatest "-execute" command line string assembled in launch-test
 
+;; extend the vars list stored in dat with the entries of vars (dat is updated in place)
+(define (launch:ajt-add-vars dat vars)
+  (let ((merged (append (launch:ajt-vars dat) vars))) (launch:ajt-vars-set! dat merged)))
 
 ;; 1. look though disks list for disk with most space
 ;; 2. create run dir on disk, path name is meaningful
 ;; 3. create link from run dir to megatest runs area 
 ;; 4. remotely run the test on allocated host
@@ -1367,44 +1374,41 @@
 ;;    - could be ssh to host from hosts table (update regularly with load)
 ;;    - could be netbatch
 ;;      (launch-test db (cadr status) test-conf))
 (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params)
   (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex
-  (let* ( ;; (lock-key        (conc "test-" test-id))
-	;; (got-lock        (let loop ((lock        (rmt:no-sync-get-lock lock-key))
-	;; 			     (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds
-	;; 		    (if (car lock)
-	;; 			#t
-	;; 			(if (> (current-seconds) expire-time)
-	;; 			    (begin
-	;; 			      (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path)
-	;; 			      (rmt:no-sync-del! lock-key) ;; destroy the lock
-	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; 
-	;; 			    (begin
-	;; 			      (thread-sleep! 1)
-	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time))))))
+  (let* (;; locking code removed from here commented out and pasted at end of file
 	 (item-path       (item-list->path itemdat))
-	 (contour         #f)) ;; NOT READY FOR THIS (args:get-arg "-contour")))
+	 (contour         #f)                         ;; NOT READY FOR THIS (args:get-arg "-contour")))
+	 ;; launcher-mode will be 'adjutant or 'normal
+	 (launcher-mode   (string->symbol (or (configf:lookup *configdat* "jobtools" "mode") "normal")))
+	 (ajtdat          (make-launch:ajt)))
     (let loop ((delta        (- (current-seconds) *last-launch*))
 	       (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 0)))
       (if (> launch-delay delta)
 	  (begin
 	    (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay.
 		(debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds"))
 	    (thread-sleep! (- launch-delay delta))
 	    (loop (- (current-seconds) *last-launch*) launch-delay))))
     (change-directory *toppath*)
-    (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars)
-     (append
-      (list
-       (list "MT_RUN_AREA_HOME" *toppath*)
-       (list "MT_TEST_NAME" test-name)
-       (list "MT_RUNNAME"   runname)
-       (list "MT_ITEMPATH"  item-path)
-       (list "MT_CONTOUR"   contour)
-       )
-      itemdat))
+    (let ((var-list (append
+		     (list
+		      (list "MT_RUN_AREA_HOME" *toppath*)
+		      (list "MT_TEST_NAME" test-name)
+		      (list "MT_RUNNAME"   runname)
+		      (list "MT_ITEMPATH"  item-path)
+		      (list "MT_CONTOUR"   contour)
+		      )
+		     itemdat)))
+       ;; consolidate this code with the code in megatest.scm for
+       ;; "-execute", *maybe* - the longer they are set the longer
+       ;; each launch takes (must be non-overlapping with the vars)
+      (alist->env-vars var-list)
+      ;; save the var-list into the ajtdat adjutant record whether it is needed or not.
+      (launch:ajt-add-vars ajtdat var-list))
+    
     (let* ((tregistry       (tests:get-all)) ;; third param (below) is system-allowed
            ;; for tconfig, why do we allow fallback to test-conf?
 	   (tconfig         (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t)
 				(begin
                                   (debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. This is likely not desired.")
@@ -1422,26 +1426,17 @@
 	   ;; (memory          (configf:lookup tconfig   "requirements" "memory"))
 	   ;; (hosts           (configf:lookup *configdat* "jobtools"     "workhosts")) ;; I'm pretty sure this was never completed
 	   (remote-megatest (configf:lookup *configdat* "setup" "executable"))
 	   (run-time-limit  (or (configf:lookup  tconfig   "requirements" "runtimelim")
 				(configf:lookup  *configdat* "setup" "runtimelim")))
-	   ;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to 
-	   ;;                allow running from dashboard. Extract the path
-	   ;;                from the called megatest and convert dashboard
-	   ;;             	  or dboard to megatest
 	   (local-megatest  (common:find-local-megatest))
-	   #;(local-megatest  (let* ((lm  (car (argv)))
-				   (dir (pathname-directory lm))
-				   (exe (pathname-strip-directory lm)))
-			      (conc (if dir (conc dir "/") "")
-				    (case (string->symbol exe)
-				      ((dboard)    "../megatest")
-				      ((mtest)     "../megatest")
-				      ((dashboard) "megatest")
-				      (else exe)))))
-	   (launcher        (common:get-launcher *configdat* test-name item-path)) ;; (configf:lookup *configdat* "jobtools"     "launcher"))
-	   (test-sig        (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path
+	   (launcher        (let ((l (common:get-launcher *configdat* test-name item-path launcher-mode)))
+			      (if (string? l)
+				  (string-split l)
+				  l))) ;; some nonhomogenuity here. '(cmd param1 param2 ...) OR '(host-type launcher)
+	    ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path
+	   (test-sig        (conc (common:get-testsuite-name) ":" test-name ":" item-path))
 	   (work-area       #f)
 	   (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all
 	   (diskpath   #f)
 	   (cmdparms   #f)
 	   (fullcmd    #f) ;; (define a (with-output-to-string (lambda ()(write x))))
@@ -1451,15 +1446,24 @@
 	   (debug-param (append (if (args:get-arg "-debug")  (list "-debug" (args:get-arg "-debug")) '())
 				(if (args:get-arg "-logging")(list "-logging") '())
 				(if (configf:lookup *configdat* "misc" "profilesw")
 				    (list (configf:lookup *configdat* "misc" "profilesw"))
 				    '()))))
+      ;; save the test-sig in the ajtdat record
+      (launch:ajt-test-sig-set! ajtdat test-sig)
+      ;; go ahead and figure out if we have a host-type from the
+      ;; launcher call above and save it in the ajtdat record
+      (if (and (eq? launcher-mode 'adjutant)
+	       (list? launcher)
+	       (> (length launcher) 1))
+	  (launch:ajt-host-type-set! ajtdat (car launcher)))
+ 
       ;; (if hosts (set! hosts (string-split hosts)))
       ;; set the megatest to be called on the remote host
       (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest"))
       (set! mt-bindir-path (pathname-directory remote-megatest))
-      (if launcher (set! launcher (string-split launcher)))
+      ;; (if launcher (set! launcher (string-split launcher)))           ;; yuk!
       ;; set up the run work area for this test
       (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run
 	       (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir
 	  (begin
 	    (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path)
@@ -1512,64 +1516,92 @@
 					(list 'runtlim   (if run-time-limit (common:hms-string->seconds run-time-limit) #f))
 					(list 'env-ovrd  (hash-table-ref/default *configdat* "env-override" '())) 
 					(list 'set-vars  (if params (hash-table-ref/default params "-setvars" #f)))
 					(list 'runname   runname)
 					(list 'mt-bindir-path mt-bindir-path))))))))
+      ;; save the cmdparms in the ajtdat
+      (launch:ajt-exekey-set! ajtdat cmdparms)
       
       ;; clean out step records from previous run if they exist
       ;; (rmt:delete-test-step-records run-id test-id)
       ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway
       (if (common:file-exists? work-area)
 	  (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir
-      (cond
-       ;; ((and launcher hosts) ;; must be using ssh hostname
-       ;;    (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param)))
-       ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms))))
+
+      ;; save the command line for adjutant mode (might never be needed but best to assemble it here)
+      (launch:ajt-cmdline-set! ajtdat (string-intersperse
+				       (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param)))
+      (cond       
        (launcher
 	(set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param)))
-       ;; (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms))))
        (else
 	(if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section"))
 	(set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" ""))))))
-      ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" "")))))
+      
       (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm"))))
       (debug:print 1 *default-log-port* "Launching " work-area)
       ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done
       (debug:print 4 *default-log-port* "fullcmd: " fullcmd)
       (set! *last-launch* (current-seconds)) ;; all that junk above takes time, set this as late as possible.
-      (let* ((commonprevvals (alist->env-vars
-			      (hash-table-ref/default *configdat* "env-override" '())))
-	     (miscprevvals   (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute"
-			      (append (list (list "MT_TEST_RUN_DIR" work-area)
-					    (list "MT_TEST_NAME" test-name)
-					    (list "MT_ITEM_INFO" (conc itemdat)) 
-					    (list "MT_RUNNAME"   runname)
-					    (list "MT_TARGET"    mt_target)
-					    (list "MT_ITEMPATH"  item-path)
-					    )
-				      itemdat)))
-	     (testprevvals   (alist->env-vars
-			      (hash-table-ref/default tconfig "pre-launch-env-overrides" '())))
+      (let* ((env-override-vars  (hash-table-ref/default *configdat* "env-override" '()))
+	     (commonprevvals     (alist->env-vars env-override-vars))
+	     (misc-vars          (append (list (list "MT_TEST_RUN_DIR" work-area)
+					       (list "MT_TEST_NAME" test-name)
+					       (list "MT_ITEM_INFO" (conc itemdat)) 
+					       (list "MT_RUNNAME"   runname)
+					       (list "MT_TARGET"    mt_target)
+					       (list "MT_ITEMPATH"  item-path))
+					 itemdat))
+	     (miscprevvals   (alist->env-vars misc-vars));; consolidate this code with the code in megatest.scm for "-execute"
+	     (test-vars      (hash-table-ref/default tconfig "pre-launch-env-overrides" '()))
+	     (testprevvals   (alist->env-vars test-vars))
+			      
 	     ;; Launchwait defaults to true, must override it to turn off wait
 	     (launchwait     (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t))
-	     (launch-results-prev (apply (if launchwait ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed.
-					     process:cmd-run-with-stderr-and-exitcode->list
-					     process-run)
-					 (if useshell
-					     (let ((cmdstr (string-intersperse fullcmd " ")))
-					       (if launchwait
-						   cmdstr
-						   (conc cmdstr " >> mt_launch.log 2>&1 &")))
-					     (car fullcmd))
-					 (if useshell
-					     '()
-					     (cdr fullcmd))))
+	     ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed.
+	     (launch-results-prev (if (eq? launcher-mode 'adjutant)
+				      '(#t 0) ;; just some fake data to fool downstream but non-applicable code
+				      (apply (if launchwait
+						 process:cmd-run-with-stderr-and-exitcode->list
+						 process-run)
+					     (if useshell
+						 (let ((cmdstr (string-intersperse fullcmd " ")))
+						   (if launchwait
+						       cmdstr
+						       (conc cmdstr " >> mt_launch.log 2>&1 &")))
+						 (car fullcmd))
+					     (if useshell
+						 '()
+						 (cdr fullcmd)))))
              (success        (if launchwait (equal? 0 (cadr launch-results-prev)) #t))
              (launch-results (if launchwait (car launch-results-prev) launch-results-prev)))
-        (if (not success)
+
+	(launch:ajt-add-vars ajtdat env-override-vars)
+	(launch:ajt-add-vars ajtdat misc-vars)
+	(launch:ajt-add-vars ajtdat test-vars)
+
+	;; if in adjutant mode we register the job in the jobs_queue
+	;; then fire off an adjutant runner
+	;;
+	(if (eq? launcher-mode 'adjutant)
+	    (let* ((adjutant-runner-cmd (append (cdr launcher)
+						(list remote-megatest "-adjutant"
+						      (launch:ajt-host-type ajtdat)
+						      "-start-dir" *toppath*)))
+		   (adj-cmd     (conc (string-intersperse (map conc adjutant-runner-cmd) " ")
+				      "&")))         
+	      (rmt:no-sync-add-job
+	       (launch:ajt-host-type  ajtdat)
+	       (conc (launch:ajt-vars ajtdat))
+	       (launch:ajt-exekey     ajtdat)
+	       (launch:ajt-cmdline    ajtdat))
+	      (print "adj-cmd: " adj-cmd)
+	      (system adj-cmd)
+	      ))
+	
+	(if (not success)
             (tests:test-set-status! run-id test-id "COMPLETED" "DEAD" "launcher failed; exited non-zero; check mt_launch.log" #f)) ;; (if launch-results launch-results "FAILED"))
-        (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork.
 	;; (rmt:no-sync-del! lock-key)         ;; release the lock for starting this test
 	(if (not launchwait) ;; give the OS a little time to allow the process to start
 	    (thread-sleep! 0.01))
 	(with-output-to-file "mt_launch.log"
 	  (lambda ()
@@ -1591,10 +1623,14 @@
 	      (process-signal (current-process-id) signal/kill)
 	      ))
 	(alist->env-vars miscprevvals)
 	(alist->env-vars testprevvals)
 	(alist->env-vars commonprevvals)
+	;; yes, really should mutex all the way to here. Need to put this entire process into a fork.
+	;; the unlock previously was further up. This seemed wrong as we should not proceed until the
+	;; vars have been reset.
+	(mutex-unlock! *launch-setup-mutex*)
 	launch-results))
     (change-directory *toppath*)
     (thread-sleep! (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.0))))
 
 ;; recover a test where the top controlling mtest may have died
@@ -1620,5 +1656,21 @@
     ;; now wait on that process if all is correct
     ;; periodically update the db with runtime
     ;; when the process exits look at the db, if still RUNNING after 10 seconds set
     ;; state/status appropriately
     (process-wait pid)))
+
+
+ ;; (lock-key        (conc "test-" test-id))
+	;; (got-lock        (let loop ((lock        (rmt:no-sync-get-lock lock-key))
+	;; 			     (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds
+	;; 		    (if (car lock)
+	;; 			#t
+	;; 			(if (> (current-seconds) expire-time)
+	;; 			    (begin
+	;; 			      (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path)
+	;; 			      (rmt:no-sync-del! lock-key) ;; destroy the lock
+	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; 
+	;; 			    (begin
+	;; 			      (thread-sleep! 1)
+	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time))))))
+	 

Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -18,6 +18,6 @@
 ;; Always use two or four digit decimal
 ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
 
 ;; (declare (unit megatest-version))
 
-(define megatest-version 1.6568)
+(define megatest-version 1.6569)

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -39,10 +39,17 @@
 (declare (uses mt))
 (declare (uses api))
 (declare (uses tasks)) ;; only used for debugging.
 (declare (uses env))
 (declare (uses diff-report))
+
+(declare (uses mutils))
+(import mutils)
+
+(declare (uses adjutant))
+(import adjutant)
+
 ;; (declare (uses ftail))
 ;; (import ftail)
 
 (define *db* #f) ;; this is only for the repl, do not use in general!!!!
 
@@ -51,18 +58,18 @@
 (include "db_records.scm")
 (include "run_records.scm")
 (include "megatest-fossil-hash.scm")
 
 (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (prefix base64 base64:)
-     readline apropos json http-client directory-utils typed-records
+     readline apropos json http-client directory-utils typed-records matchable
      http-client srfi-18 extras format)
 
 ;; Added for csv stuff - will be removed
 ;;
 (use sparse-vectors)
 
-(require-library mutils)
+;; (require-library mutils)
 
 (define *usage-log-file* #f)    ;; put path to file for logging usage in this var in the ~/.megatestrc file
 (define *usage-use-seconds* #t) ;; for Epoc seconds in usage logging change this to #t in ~/.megatestrc file
 
 ;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file
@@ -199,11 +206,11 @@
   -update-meta            : update the tests metadata for all tests
   -setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are
                                  overwritten by values set in config files.
   -server -|hostname      : start the server (reduces contention on megatest.db), use
                             - to automatically figure out hostname
-  -adjutant C,M           : start the server/adjutant with allocated cores C and Mem M (Gig), 
+  -adjutant host-type     : start the server/adjutant with given host-type
                             use 0,0 to auto use full machine
   -transport http|rpc     : use http or rpc for transport (default is http) 
   -log logfile            : send stdout and stderr to logfile
   -list-servers           : list the servers 
   -kill-servers           : kill all servers
@@ -912,12 +919,25 @@
 
 ;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to
 ;; a specific Megatest area. Detail are being hashed out and this may change.
 ;;
 (if (args:get-arg "-adjutant")
-    (begin
-      (adjutant-run)
+    (let* ((host-type (args:get-arg "-adjutant"))) ;; the -adjutant argument is the host-type to take jobs for
+      (launch:setup) ;; dang it, wish this wasn't needed
+      (print "Running the adjutant!")
+      (let loop ((wait-count 0))
+	(if (< wait-count 10) ;; give up after 10 consecutive empty polls, each followed by a 10 second sleep
+	    (let* ((dat (rmt:no-sync-take-job host-type)))
+	      (match dat
+		((id ht vars exekey cmdline state event-time last-update) ;; an eight element job record was returned
+		 (system cmdline) ;; NOTE(review): only cmdline is used; vars is not applied here - confirm intended
+		 (loop 0)) ;; a job was run, reset the idle counter
+		(else
+		 (thread-sleep! 10)
+		 (loop (+ wait-count 1)))))
+	    (print "I'm bored. Exiting.")))
+      ;; (adjutant-run (args:get-arg "-adjutant") rmt:no-sync-take-job)
       (set! *didsomething* #t)))
 
 (if (or (args:get-arg "-list-servers")
         (args:get-arg "-kill-servers"))
     (let ((tl (launch:setup)))

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -22,10 +22,12 @@
 
 (declare (unit rmt))
 (declare (uses api))
 (declare (uses http-transport))
 (include "common_records.scm")
+(include "db_records.scm")
+
 ;; (declare (uses rmtmod))
 
 ;; (import rmtmod)
 
 ;;
@@ -54,18 +56,39 @@
 		  (client:setup areapath)
 		  #f))))
 
 (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id
 
+(define *rmt-query-last-call-time* 0)
+(define *rmt-query-last-rest-time* 0) ;; last time there was at least a 1/2 second rest - giving other processes access to the db
+
+;; NOTE: This query rest algorithm will not adapt to long query times. REDESIGN NEEDED. TODO. FIXME.
+;;
+(define (rmt:query-rest)
+  ;; Throttle queries: block for 1/2 second when no gap of more than 500ms has been seen in the last 5 seconds.
+  (let* ((tnow          (current-milliseconds))
+         (since-call    (- tnow *rmt-query-last-call-time*))
+         (since-rest    (- tnow *rmt-query-last-rest-time*))
+         (natural-rest? (> since-call 500))                           ;; long enough gap since the previous query
+         (forced-rest?  (and (not natural-rest?) (> since-rest 5000)))) ;; no natural rests have happened
+    (if forced-rest?
+        (begin
+          (debug:print 0 *default-log-port* "query rest needed. blocking for 1/2 second.")
+          (thread-sleep! 0.5)))
+    (if (or natural-rest? forced-rest?) (set! *rmt-query-last-rest-time* tnow))
+    (set! *rmt-query-last-call-time* tnow)))
+
 ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
 ;;
 (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
 
   #;(common:telemetry-log (conc "rmt:"(->string cmd))
                         payload: `((rid . ,rid)
                                    (params . ,params)))
-                          
+  (if (not (equal? (configf:lookup *configdat* "setup" "query-rest") "no"))
+      (rmt:query-rest))
+  
   (if (> attemptnum 2)
       (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
     
   (cond
    ((> attemptnum 2) (thread-sleep! 0.05))
@@ -525,15 +548,24 @@
   (rmt:general-call 'register-test run-id run-id test-name item-path))
 
 (define (rmt:get-test-id run-id testname item-path)
   (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))
 
-;; run-id is NOT used
+;; run-id is NOT used - but it will be! 
 ;;
 (define (rmt:get-test-info-by-id run-id test-id)
   (if (number? test-id)
-      (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
+      (let* ((testdat  (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)))
+	     (rundir   (and testdat (db:test-get-rundir testdat))) ;; guard: testdat may be #f when no record is found
+	     (trundatf (and rundir (conc rundir "/.mt_data/test-run.dat"))))
+	;; refresh run_duration from the mtime of test-run.dat when that file exists
+	(if (and trundatf (file-exists? trundatf))
+	    (let* ((duration   (db:test-get-run_duration testdat))
+		   (event-time (db:test-get-event_time   testdat))
+		   (last-touch (file-modification-time trundatf)))
+	      (db:test-set-run_duration! testdat (max duration (- last-touch event-time)))))
+	testdat)
       (begin
 	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
 	(print-call-chain (current-error-port))
 	#f)))
 
@@ -927,10 +959,19 @@
 (define (rmt:no-sync-del! var)
   (rmt:send-receive 'no-sync-del! #f `(,var)))
 
 (define (rmt:no-sync-get-lock keyname)
   (rmt:send-receive 'no-sync-get-lock #f `(,keyname)))
+
+(define (rmt:no-sync-add-job host-type vars-list exekey cmdline) ;; send a 'no-sync-add-job request carrying the four job fields
+  (rmt:send-receive 'no-sync-add-job #f (list host-type vars-list exekey cmdline)))
+
+(define (rmt:no-sync-take-job host-type) ;; send a 'no-sync-take-job request for the given host-type
+  (rmt:send-receive 'no-sync-take-job #f (list host-type)))
+
+(define (rmt:no-sync-job-records-clean) ;; send a 'no-sync-job-records-clean request (takes no parameters)
+  (rmt:send-receive 'no-sync-job-records-clean #f '()))
 
 ;;======================================================================
 ;; A R C H I V E S
 ;;======================================================================
 

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -585,14 +585,17 @@
                                (if (not (configf:lookup *configdat* "server" "disable-db-snapshot"))
                                    (common:snapshot-file mtdbfile subdir: ".db-snapshot"))
 		               (delete-file* staging-file)
 		               (let* ((start-time (current-milliseconds))
                                       (res (system sync-cmd))
+                                      (dbbackupfile (conc mtdbfile ".backup"))
                                       (res2 
                                        (cond
-                                        ((eq? 0 res)
-		                         (delete-file* (conc mtdbfile ".backup"))
+                                        ((eq? 0 res )
+                                         (if (file-exists? dbbackupfile)
+		                           (delete-file* dbbackupfile)
+                                         )
                                          (if (eq? 0 (file-size sync-log))
                                              (delete-file sync-log))
 		                         (system (conc "/bin/mv " staging-file " " mtdbfile))
                                          
                                          (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000))

Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -1944,52 +1944,53 @@
 	 tdb
 	 "SELECT count(id) FROM test_rundat;")
 	res))
   0)
 
-(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)
-  (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))
-  (if (and cpuload diskfree)
-      (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
-  (if minutes 
-      (rmt:general-call 'update-run-duration run-id minutes test-id))
-  (if (and uname hostname)
-      (rmt:general-call 'update-uname-host run-id uname hostname test-id)))
+;; Append a CSV sample to $MT_TEST_RUN_DIR/.mt_data/test-run.dat (else send 'update-test-rundat); update-db: #t also issues the update-cpuload-diskfree/-run-duration/-uname-host calls.
+(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)(tmpfree #f))
+  (if (get-environment-variable "MT_TEST_RUN_DIR")
+      (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data"))
+	     (dat-file (conc dest-dir "/test-run.dat"))
+	     (or-dash  (lambda (instr)
+			 (cond
+			  ((not instr) "") ;; #f -> blank, indicates value unchanged since last measurement taken
+			  ((string? instr)(if (string-search " " instr) (conc "\"" instr "\"") instr))
+			  (else instr)))))
+	(if (not (directory-exists? dest-dir)) (create-directory dest-dir #t))
+	(let* ((file-new (not (file-exists? dat-file))) ;; header is keyed on the data file; the directory may already exist without it
+	       (outp     (open-output-file dat-file #:append)))
+	  (with-output-to-port outp
+	    (lambda ()
+	      (if file-new
+		  (print "epoch_time,run_id,test_id,cpuload,diskfree,tmpfree,run_minutes,hostname,uname"))
+	      (print (current-seconds) "," (or-dash run-id)   "," (or-dash test-id)  ","
+		     (or-dash cpuload) "," (or-dash diskfree) "," (or-dash tmpfree)  ","
+		     (or-dash minutes) "," (or-dash hostname) ","
+		     (or-dash uname)))) ;; put uname last as it has spaces in it
+	  (close-output-port outp)))
+      (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1)))
+  (if update-db
+      (begin
+	(if (and cpuload diskfree)
+	    (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
+	(if minutes 
+	    (rmt:general-call 'update-run-duration run-id minutes test-id))
+	(if (and uname hostname)
+	    (rmt:general-call 'update-uname-host run-id uname hostname test-id)))))
   
 ;; This one is for running with no db access (i.e. via rmt: internally)
-(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries)
+(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f)) ;; update-db is passed straight through to tests:update-central-meta-info
 ;; (define (tests:set-full-meta-info test-id run-id minutes work-area)
 ;;  (let ((remtries 10))
   (let* ((cpuload  (get-cpu-load))
 	 (diskfree (get-df (current-directory)))
+	 (tmpfree  (get-df "/tmp")) ;; get-df result for /tmp, forwarded below as tmpfree:
 	 (uname    (get-uname "-srvpio"))
 	 (hostname (get-host-name)))
-    (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)))
-    
-;; (define (tests:set-partial-meta-info test-id run-id minutes work-area)
-#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries)
-  (let* ((cpuload  (get-cpu-load))
-	 (diskfree (get-df (current-directory)))
-	 (remtries 10))
-    (handle-exceptions
-     exn
-     (if (> remtries 0)
-	 (begin
-	   (print-call-chain (current-error-port))
-	   (debug:print-info 0 *default-log-port* "WARNING: failed to set meta info. Will try " remtries " more times")
-	   (set! remtries (- remtries 1))
-	   (thread-sleep! 10)
-	   (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1)))
-	 (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn)))
-	   (debug:print-error 0 *default-log-port* "tried for over a minute to update meta info and failed. Giving up")
-	   (debug:print 0 *default-log-port* "EXCEPTION: database probably overloaded or unreadable.")
-	   (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
-	   (debug:print 5 *default-log-port* "exn=" (condition->list exn))
-	   (debug:print 0 *default-log-port* " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
-	   (print-call-chain (current-error-port))))
-     (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes)
-  )))
+    (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db tmpfree: tmpfree))) ;; NOTE(review): db, work-area and remtries are accepted but not referenced in this body
+    
 	 
 ;;======================================================================
 ;; A R C H I V I N G
 ;;======================================================================