Index: codescanlib.scm
==================================================================
--- codescanlib.scm
+++ codescanlib.scm
@@ -46,11 +46,11 @@
                          #f)]
                     [else #f] ) scm-tree))))
     procs))
 
 
-;; given a sexp, return a flat lost of atoms in that sexp
+;; given a sexp, return a flat list of atoms in that sexp
 (define (get-atoms-in-body body)
   (cond
    ((null? body) '())
    ((atom? body) (list body))
    (else

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -531,11 +531,12 @@
     (5 "WAIVED")
     (6 "CHECK")
     (7 "STUCK/DEAD")
     (8 "DEAD")
     (9 "FAIL")
-    (10 "ABORT")))
+    (10 "PREQ_FAIL")
+    (11 "ABORT")))
 
 (define *common:ended-states*       ;; states which indicate the test is stopped and will not proceed
   '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE"))
 
 (define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -1,6 +1,6 @@
-;;======================================================================
+;======================================================================
 ;; Copyright 2006-2016, Matthew Welland.
 ;; 
 ;;  This program is made available under the GNU GPL version 2.0 or
 ;;  greater. See the accompanying file COPYING for details.
 ;; 
@@ -198,21 +198,21 @@
 
 ;; open an sql database inside a file lock
 ;; returns: db existed-prior-to-opening
 ;; RA => Returns a db handler; sets the lock if opened in writable mode
 ;;
-;;(define *db-open-mutex* (make-mutex))
+;; (define *db-open-mutex* (make-mutex))
 
 (define (db:lock-create-open fname initproc)
   (let* ((parent-dir   (or (pathname-directory fname)(current-directory))) ;; no parent? go local
          (raw-fname    (pathname-file fname))
 	 (dir-writable (file-write-access? parent-dir))
 	 (file-exists  (common:file-exists? fname))
 	 (file-write   (if file-exists
 			   (file-write-access? fname)
 			   dir-writable )))
-    ;;(mutex-lock! *db-open-mutex*) ;; tried this mutex, not clear it helped.
+    ;; (mutex-lock! *db-open-mutex*) ;; tried this mutex, not clear it helped.
     (if file-write ;; dir-writable
 	(condition-case
          (let* ((lockfname   (conc fname ".lock"))
                 (readyfname  (conc parent-dir "/.ready-" raw-fname))
                 (readyexists (common:file-exists? readyfname)))
@@ -246,11 +246,11 @@
         
 	(condition-case
          (begin
            (debug:print 2 *default-log-port* "WARNING: opening db in non-writable dir " fname)
            (let ((db (sqlite3:open-database fname)))
-             ;;(mutex-unlock! *db-open-mutex*)
+             ;; (mutex-unlock! *db-open-mutex*)
              db))
          (exn (io-error)  (debug:print 0 *default-log-port* "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again."))
          (exn (corrupt)   (debug:print 0 *default-log-port* "ERROR: database " fname " is corrupt. Repair it to proceed."))
          (exn (busy)      (debug:print 0 *default-log-port* "ERROR: database " fname " is locked. Try copying to another location, remove original and copy back."))
          (exn (permission)(debug:print 0 *default-log-port* "ERROR: database " fname " has some permissions problem."))
@@ -1587,11 +1587,11 @@
 
 (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
   (let* ((incompleted '())
 	 (oldlaunched '())
 	 (toplevels   '())
-	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
+	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime")) ;; FIXME suspect test run time & deadtime are not well matched; resulting in COMPLETED/DEAD status of an a-ok running test
 	 (deadtime     (if (and deadtime-str
 				(string->number deadtime-str))
 			   (string->number deadtime-str)
 			   7200))) ;; two hours
     (db:with-db 
@@ -1650,11 +1650,12 @@
          (if (> (length all-ids) 0)
              (begin
                (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE")
                (for-each
                 (lambda (test-id)
-                  (db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test failed to complete")) ;; fix for one aspect of Randy's ticket 1405717332
+                  (db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test failed to complete")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS.  ref ticket 220546828
+
                 all-ids))))))))
 
 ;; ALL REPLACED BY THE BLOCK ABOVE
 ;;
 ;; 	    (sqlite3:execute 
@@ -3492,37 +3493,50 @@
                                                         (map dbr:counts-status state-status-counts)))
                                                    *common:std-statuses* >))
 			    (non-completes     (filter (lambda (x)
 							 (not (equal? x "COMPLETED")))
 						       all-curr-states))
-			    (num-non-completes (length non-completes))
-                            
+			    (preq-fails        (filter (lambda (x)
+							 (equal? x "PREQ_FAIL"))
+						       all-curr-statuses))
+                            (num-non-completes (length non-completes))
                             (newstate          (cond
-						((> running 0)
-						 "RUNNING") ;; anything running, call the situation running
-						((> bad-not-started 0)  ;; we have an ugly situation, it is completed in the sense we cannot do more.
-						 "COMPLETED") 
-						((> num-non-completes 0) ;;
-						 (car non-completes))  ;;  (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states)))
-                                                ;; only rollup DELETED if all DELETED
-						(else
-						 (car all-curr-states))))
+						((> running 0)           "RUNNING")            ;; anything running, call the situation running
+                                                ((> (length preq-fails) 0)
+                                                 "NOT_STARTED")
+						((> bad-not-started 0)   "COMPLETED")          ;; we have an ugly situation, it is completed in the sense we cannot do more.
+						((> num-non-completes 0) (car non-completes))  ;;  (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states))) ;; only rollup DELETED if all DELETED
+						(else                    (car all-curr-states))))
 			                       ;; (if (> running 0)
                                                ;;     "RUNNING"
                                                ;;     (if (> bad-not-started 0)
                                                ;;         "COMPLETED"
                                                ;;         (car all-curr-states))))
-                            (newstatus            (if (or (> bad-not-started 0)
-							  (and (equal? newstate "NOT_STARTED")
-							       (> num-non-completes 0)))
-						      "STARTED"
-                                                      (car all-curr-statuses))))
+                            (newstatus         (cond
+                                                ((> (length preq-fails) 0)
+                                                 "PREQ_FAIL")
+                                                ((or (> bad-not-started 0)
+                                                     (and (equal? newstate "NOT_STARTED")
+                                                          (> num-non-completes 0)))
+                                                 "STARTED")
+                                                (else
+                                                 (car all-curr-statuses)))))
+
                        ;; (print "bad-not-supported: " bad-not-support " all-curr-states: " all-curr-states " all-curr-statuses: " all-curr-states)
                        ;;      " newstate: " newstate " newstatus: " newstatus)
                        ;; NB// Pass the db so it is part of the transaction
+                       (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path"> bad-not-started="bad-not-started" newstate="newstate" newstatus="newstatus" num-non-completes="num-non-completes" non-completes="non-completes "len(sscs)="(length state-status-counts)  " state-status-counts: "
+                                    (apply conc
+                                           (map (lambda (x)
+                                                  (conc
+                                                   (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
+                                                state-status-counts))
+                                    
+                                    ); end debug:print
                        (if tl-test-id
-			   (db:test-set-state-status db run-id tl-test-id newstate newstatus #f))))))))
+			   (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
+		       ))))))
          (mutex-unlock! *db-transaction-mutex*)
          (if (and test-id state status (equal? status "AUTO")) 
              (db:test-data-rollup dbstruct run-id test-id status))
          tr-res)))))
 ;; BBnote: db:get-all-state-status-counts-for-test returns dbr:counts object aggregating state and status of items of a given test, *not including rollup state/status*

Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -1,7 +1,7 @@
 ;; Always use two or four digit decimal
 ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
 
 (declare (unit megatest-version))
 
-(define megatest-version 1.6429)
+(define megatest-version 1.6431)
 

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -465,11 +465,11 @@
     ;;
     ;; What happended, this code is now duplicated in tests!?
     ;;
     ;;======================================================================
     
-    (if (not (null? test-names))
+    (if (not (null? test-names)) ;; BEGIN test-names loop
 	(let loop ((hed (car test-names))   ;; NOTE: This is the main loop that iterates over the test-names
 		   (tal (cdr test-names)))         ;; 'return-procs tells the config reader to prep running system but return a proc
 	  (change-directory *toppath*) ;; PLEASE OPTIMIZE ME!!! I think this should be a no-op but there are several places where change-directories could be happening.
 	  (setenv "MT_TEST_NAME" hed) ;; 
 	  (let*-values (((waitons waitors config)(tests:get-waitons hed all-tests-registry)))
@@ -483,16 +483,16 @@
 		  (set! waitons (filter (lambda (x)(not (equal? x hed))) waitons))
 		  (set! waitors (filter (lambda (x)(not (equal? x hed))) waitors))))
 	    
 	    ;; (items   (items:get-items-from-config config)))
 	    (if (not (hash-table-ref/default test-records hed #f))
-		(hash-table-set! test-records
-				 hed (vector hed     ;; 0
-					     config  ;; 1
-					     waitons ;; 2
+		(hash-table-set! test-records ;; BB: we are doing a manual make-tests:testqueue
+				 hed (vector hed     ;; 0 ;; testname
+					     config  ;; 1 
+					     waitons ;; 2 
 					     (config-lookup config "requirements" "priority")     ;; priority 3
-					     (tests:get-items config) ;; expand the [items] and or [itemstable] into explict items
+					     (tests:get-items config) ;; 4 ;; expand the [items] and/or [itemstable] into explicit items
 					     #f      ;; itemsdat 5
 					     #f      ;; spare - used for item-path
 					     waitors ;; 
 					     )))
 	    (for-each 
@@ -502,11 +502,11 @@
 			  (waiton-tconfig  (if waiton-record (vector-ref waiton-record 1) #f))
 			  (waiton-itemized (and waiton-tconfig
 						(or (hash-table-ref/default waiton-tconfig "items" #f)
 						    (hash-table-ref/default waiton-tconfig "itemstable" #f))))
 			  (itemmaps        (tests:get-itemmaps config))  ;; (configf:lookup config "requirements" "itemmap"))
-			  (new-test-patts  (tests:extend-test-patts test-patts hed waiton itemmaps)))
+			  (new-test-patts  (tests:extend-test-patts test-patts hed waiton itemmaps)))   ;; BB:  items expanded here.
 		     (debug:print-info 0 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items")
 		     ;; need to account for test-patt here, if I am test "a", selected with a test-patt of "hed/b%"
 		     ;; and we are waiting on "waiton" we need to add "waiton/,waiton/b%" to test-patt
 		     ;; is this satisfied by merely appending "/" to the waiton name added to the list?
 		     ;;
@@ -542,11 +542,11 @@
 	     (delete-duplicates (append waitons waitors)))
 	    (let ((remtests (delete-duplicates (append waitons tal))))
 	      (if (not (null? remtests))
 		  (begin
 		    ;; (debug:print-info 0 *default-log-port* "Preprocessing continues for " (string-intersperse remtests ", "))
-		    (loop (car remtests)(cdr remtests))))))))
+		    (loop (car remtests)(cdr remtests)))))))) ;; END test-names loop
 
     (if (not (null? required-tests))
 	(debug:print-info 1 *default-log-port* "Adding \"" (string-intersperse required-tests " ") "\" to the run queue"))
     ;; NOTE: these are all parent tests, items are not expanded yet.
     (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records))
@@ -561,19 +561,19 @@
 						  (print-call-chain)
 						  (print " message: " ((condition-property-accessor 'exn 'message) exn)))
 					      (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
 								    (any->number reglen) all-tests-registry)))
 					  "runs:run-tests-queue"))
-		 (th2        (make-thread (lambda ()				    
+		 (th2        (make-thread (lambda ()			 ;; BBQ: why are we visiting ALL runs here?	    
 					    ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ...
 					    (let ((run-ids (rmt:get-all-run-ids)))
 					      (for-each (lambda (run-id)
 							  (if keep-going
 							      (handle-exceptions
 							       exn
 							       (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id)
-							       (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime)))
+							       (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
 							run-ids)))
 					  "runs: mark-incompletes")))
 	    (thread-start! th1)
 	    (thread-start! th2)
 	    (thread-join! th1)
@@ -750,19 +750,19 @@
 		  #f
                   (runs:loop-values trimmed-tal trimmed-reg reglen regfull reruns)
                   ))
 	      (list (car newtal)(append (cdr newtal) reg) '() reruns))))
 
-     ((and (null? fails)
+     ((and (null? fails) ;; have not-started tests, but unable to run them.  everything looks completed with no prospect of unsticking something that is stuck.  we should mark hed as moribund and exit or continue if there are more tests to consider
 	   (null? prereq-fails)
 	   (null? non-completed))
       (if  (runs:can-keep-running? hed 20)
 	  (begin
 	    (runs:inc-cant-run-tests hed)
-	    (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0))
+	    (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; 
 	    ;; getting here likely means the system is way overloaded, kill a full minute before continuing
-	    (thread-sleep! 60)
+	    (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
 	    ;; num-retries code was here
 	    ;; we use this opportunity to move contents of reg to tal
 	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
 	  (begin
 	    (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
@@ -780,11 +780,13 @@
 			", removing it from to-do list")
       (let ((test-id (rmt:get-test-id run-id hed "")))
 	(if test-id
 	    (if (not (null? prereq-fails))
 		(mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_DISCARDED" "Failed to run due to prior failed prerequisites")
-		(mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL"      "Failed to run due to failed prerequisites"))))
+                (begin
+                  (debug:print 4 *default-log-port*"BB> set PREQ_FAIL on "hed)
+                  (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL"      "Failed to run due to failed prerequisites"))))) ;; BB: this works, but equivalent for itemwait mode does not work.
       (if (or (not (null? reg))(not (null? tal)))
 	  (begin
 	    (hash-table-set! test-registry hed 'CANNOTRUN)
             (runs:loop-values tal reg reglen regfull (cons hed reruns))
             )
@@ -818,10 +820,13 @@
 
 
 ;;  hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode test-patts required-tests test-registry registry-mutex flags keyvals run-info newtal all-tests-registry itemmaps)
 (define (runs:process-expanded-tests runsdat testdat)
   ;; unroll the contents of runsdat and testdat (due to ongoing refactoring).
+  (debug:print 2 *default-log-port* "runs:process-expanded-tests; testdat:" )
+  (debug:print 2 *default-log-port* (with-output-to-string
+                                            (lambda () (pp (runs:testdat->alist testdat) ))))
   (let* ((hed                    (runs:testdat-hed testdat))
 	 (tal                    (runs:testdat-tal testdat))
 	 (reg                    (runs:testdat-reg testdat))
 	 (reruns                 (runs:testdat-reruns testdat))
 	 (test-name              (runs:testdat-test-name testdat))
@@ -889,11 +894,11 @@
 
     ;; Don't know at this time if the test have been launched at some time in the past
     ;; i.e. is this a re-launch?
     (debug:print-info 4 *default-log-port* "run-limits-info = " run-limits-info)
     
-    (cond
+    (cond ; cond 894- 1067
      
      ;; Check item path against item-patts, 
      ;;
      ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) ;; This test/itempath is not to be run
       ;; else the run is stuck, temporarily or permanently
@@ -1005,30 +1010,36 @@
 		 (debug:print-info 0 *default-log-port* "Waiting for more work to do..."))
 	    (thread-sleep! 1)
 	    (list (car newtal)(cdr newtal) reg reruns))
 	  ;; the waiton is FAIL so no point in trying to run hed ever again
 	  (if (or (not (null? reg))(not (null? tal)))
-	      (if (vector? hed)
+	      (if (or (vector? hed)  (not (null? fails))) ;; BB: why do we need a vector?  in my case, fails is populated (prereq failed), reg is not null, and we really want to drop this one
 		  (begin
 		    (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path
 				 " from the launch list as it has prerequistes that are FAIL")
 		    (let ((test-id (rmt:get-test-id run-id hed "")))
 		      (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))
 		    (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?)
 		    ;; (thread-sleep! *global-delta*)
 		    ;; This next is for the items
-		    (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f)
+
+                    (if (not (null? fails))
+                        ;;(mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" #f)
+                        (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" #f) 
+                        ;;(mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f)
+                        (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) )
 		    (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed)
 		    (runs:loop-values tal reg reglen regfull reruns))
-		  (let ((nth-try (hash-table-ref/default test-registry hed 0)))
+		  (let ((nth-try (hash-table-ref/default test-registry hed 0))) ;; hed not a vector...
+                    (debug:print 2 *default-log-port* "nth-try("hed")="nth-try)
 		    (cond
 		     ((member "RUNNING" (map db:test-get-state prereqs-not-met))
 		      (if (runs:lownoise (conc "possible RUNNING prerequistes " hed) 60)
 			  (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites, don't give up on it yet."))
 		      (thread-sleep! 4)
 		      (runs:loop-values tal reg reglen regfull reruns))
-		     ((or (not nth-try)
+		     ((or (not nth-try) ;; BB: condition on subsequent tries, condition below fires on first try 
 			  (and (number? nth-try)
 			       (< nth-try 10)))
 		      (hash-table-set! test-registry hed (if (number? nth-try)
 							     (+ nth-try 1)
 							     0))
@@ -1035,18 +1046,18 @@
 		      (if (runs:lownoise (conc "not removing test " hed) 60)
 			  (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites"))
 		      ;; may not have processed correctly. Could be a race condition in your test implementation? Dropping test " hed) ;;  " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)")
 		      (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?)
 		      (runs:loop-values newtal reg reglen regfull reruns))
-		     ((symbol? nth-try)
+		     ((symbol? nth-try) ;; BB: 'done matches here in one case where prereq itemwait failed.  This is first "try"
 		      (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW
 			  (if (null? tal)
 			      #f ;; yes, really
 			      (list (car tal)(cdr tal) reg reruns))
 			  (begin
 			    (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60)
-				(debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state " nth-try " will be overridden and we'll retry."))
+				(debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state >" nth-try "< will be overridden and we'll retry."))
 			    (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f)
 			    (hash-table-set! test-registry hed 0)
 			    (runs:loop-values newtal reg reglen regfull))))
 		     (else
 		      (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60)
@@ -1166,50 +1177,50 @@
   ;; Do mark-and-find clean up of db before starting runing of quue
   ;;
   ;; (rmt:find-and-mark-incomplete)
 
   (let* ((run-info             (rmt:get-run-info run-id))
-	(tests-info            (mt:get-tests-for-run run-id #f '() '())) ;;  qryvals: "id,testname,item_path"))
-	(sorted-test-names     (tests:sort-by-priority-and-waiton test-records))
-	(test-registry         (make-hash-table))
-	(registry-mutex        (make-mutex))
-	(num-retries           0)
-	(max-retries           (config-lookup *configdat* "setup" "maxretries"))
-	(max-concurrent-jobs   (configf:lookup-number *configdat* "setup" "max_concurrent_jobs" default: 50))
-        (reglen                (if (number? reglen-in) reglen-in 1))
-	(last-time-incomplete  (- (current-seconds) 900)) ;; force at least one clean up cycle
-	(last-time-some-running (current-seconds))
-	;; (tdbdat                (tasks:open-db))
-	(runsdat (make-runs:dat
-		  ;; hed: hed
-		  ;; tal: tal
-		  ;; reg: reg
-		  ;; reruns: reruns
-		  reglen: reglen
-		  regfull: #f ;; regfull
-		  ;; test-record: test-record
-		  runname: runname
-		  ;; test-name: test-name
-		  ;; item-path: item-path
-		  ;; jobgroup: jobgroup
-		  max-concurrent-jobs: max-concurrent-jobs
-		  run-id: run-id
-		  ;; waitons: waitons
-		  ;; testmode: testmode
-		  test-patts: test-patts
-		  required-tests: required-tests
-		  test-registry: test-registry
-		  registry-mutex: registry-mutex
-		  flags: flags
-		  keyvals: keyvals
-		  run-info: run-info
-		  ;; newtal: newtal
-		  all-tests-registry: all-tests-registry
-		  ;; itemmaps: itemmaps
-		  ;; prereqs-not-met: (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)
-		  ;; can-run-more-tests: (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) ;; look at the test jobgroup and tot jobs running
-		  )))
+         (tests-info            (mt:get-tests-for-run run-id #f '() '())) ;;  qryvals: "id,testname,item_path"))
+         (sorted-test-names     (tests:sort-by-priority-and-waiton test-records))
+         (test-registry         (make-hash-table))
+         (registry-mutex        (make-mutex))
+         (num-retries           0)
+         (max-retries           (config-lookup *configdat* "setup" "maxretries"))
+         (max-concurrent-jobs   (configf:lookup-number *configdat* "setup" "max_concurrent_jobs" default: 50))
+         (reglen                (if (number? reglen-in) reglen-in 1))
+         (last-time-incomplete  (- (current-seconds) 900)) ;; force at least one clean up cycle
+         (last-time-some-running (current-seconds))
+         ;; (tdbdat                (tasks:open-db))
+         (runsdat (make-runs:dat
+                   ;; hed: hed
+                   ;; tal: tal
+                   ;; reg: reg
+                   ;; reruns: reruns
+                   reglen: reglen
+                   regfull: #f ;; regfull
+                   ;; test-record: test-record
+                   runname: runname
+                   ;; test-name: test-name
+                   ;; item-path: item-path
+                   ;; jobgroup: jobgroup
+                   max-concurrent-jobs: max-concurrent-jobs
+                   run-id: run-id
+                   ;; waitons: waitons
+                   ;; testmode: testmode
+                   test-patts: test-patts
+                   required-tests: required-tests
+                   test-registry: test-registry
+                   registry-mutex: registry-mutex
+                   flags: flags
+                   keyvals: keyvals
+                   run-info: run-info
+                   ;; newtal: newtal
+                   all-tests-registry: all-tests-registry
+                   ;; itemmaps: itemmaps
+                   ;; prereqs-not-met: (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)
+                   ;; can-run-more-tests: (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) ;; look at the test jobgroup and tot jobs running
+                   )))
 
     ;; Initialize the test-registery hash with tests that already have a record
     ;; convert state to symbol and use that as the hash value
     (for-each (lambda (trec)
 		(let ((id (db:test-get-id        trec))
@@ -1283,12 +1294,12 @@
 	;;     (server:kind-run *toppath*))
 	
 	(if (> num-running 0)
 	  (set! last-time-some-running (current-seconds)))
 
-      (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
-	  (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
+        (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
+            (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
 	;; (debug:print 0 *default-log-port* "max-tries-hash: " (hash-table->alist *max-tries-hash*))
 
 	;; Ensure all top level tests get registered. This way they show up as "NOT_STARTED" on the dashboard
 	;; and it is clear they *should* have run but did not.
 	(if (not (hash-table-ref/default test-registry (db:test-make-full-name test-name "") #f))
@@ -1442,11 +1453,12 @@
 	 ((not (null? reg)) ;; could we get here with leftovers?
 	  (debug:print-info 0 *default-log-port* "Have leftovers!")
 	  (loop (car reg)(cdr reg) '() reruns))
 	 (else
 	  (debug:print-info 4 *default-log-port* "Exiting loop with...\n  hed=" hed "\n  tal=" tal "\n  reruns=" reruns))
-	 )))
+	 ))) ;; end loop on sorted test names
+    
     ;; now *if* -run-wait we wait for all tests to be done
     ;; Now wait for any RUNNING tests to complete (if in run-wait mode)
     (thread-sleep! 5) ;; I think there is a race condition here. Let states/statuses settle
     (let wait-loop ((num-running      (rmt:get-count-tests-running-for-run-id run-id))
 		    (prev-num-running 0))

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -458,14 +458,15 @@
         (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300))
 	(debug-mode   (debug:debug-mode 1))
 	(last-time    (current-seconds))
 	(no-sync-db   (db:open-no-sync-db))
         (sync-duration 0) ;; run time of the sync in milliseconds
-        (this-wd-num  (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))))
+        ;;(this-wd-num  (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))
+        )
     (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls
     (debug:print-info 2 *default-log-port* "Periodic sync thread started.")
-    (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num)
+    (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)  );;  " this-wd-num="this-wd-num)
     (if (and legacy-sync (not *time-to-exit*))
 	(let* (;;(dbstruct (db:setup))
 	       (mtdb       (dbr:dbstruct-mtdb dbstruct))
 	       (mtpath     (db:dbdat-get-path mtdb))
 	       (tmp-area   (common:get-db-tmp-area))
@@ -579,7 +580,7 @@
 			(delay-loop (+ count 1))))
 		  (if (not *time-to-exit*) (loop))))
 	    ;; time to exit, close the no-sync db here
 	    (db:no-sync-close-db no-sync-db)
 	    (if (common:low-noise-print 30)
-		(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num)))))))
+		(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))))) ;;" this-wd-num="this-wd-num)))))))