Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -1587,11 +1587,11 @@
 
 (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
   (let* ((incompleted '())
 	 (oldlaunched '())
 	 (toplevels   '())
-	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
+	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime")) ;; FIXME: suspect test run time and deadtime are not well matched, resulting in a COMPLETED/DEAD status for a test that is still running normally
 	 (deadtime     (if (and deadtime-str
 				(string->number deadtime-str))
 			   (string->number deadtime-str)
 			   7200))) ;; two hours
     (db:with-db 
@@ -1650,11 +1650,12 @@
          (if (> (length all-ids) 0)
              (begin
                (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE")
                (for-each
                 (lambda (test-id)
-                  (db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test failed to complete")) ;; fix for one aspect of Randy's ticket 1405717332
+                  (db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test failed to complete")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS.  ref ticket 220546828
+
                 all-ids))))))))
 
 ;; ALL REPLACED BY THE BLOCK ABOVE
 ;;
 ;; 	    (sqlite3:execute 

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -444,16 +444,16 @@
 		  (set! waitons (filter (lambda (x)(not (equal? x hed))) waitons))
 		  (set! waitors (filter (lambda (x)(not (equal? x hed))) waitors))))
 	    
 	    ;; (items   (items:get-items-from-config config)))
 	    (if (not (hash-table-ref/default test-records hed #f))
-		(hash-table-set! test-records
-				 hed (vector hed     ;; 0
-					     config  ;; 1
-					     waitons ;; 2
+		(hash-table-set! test-records ;; BB: we are doing a manual make-tests:testqueue
+				 hed (vector hed     ;; 0 ;; testname
+					     config  ;; 1 
+					     waitons ;; 2 
 					     (config-lookup config "requirements" "priority")     ;; priority 3
-					     (tests:get-items config) ;; expand the [items] and or [itemstable] into explict items
+					     (tests:get-items config) ;; 4 ;; expand the [items] and/or [itemstable] into explicit items
 					     #f      ;; itemsdat 5
 					     #f      ;; spare - used for item-path
 					     waitors ;; 
 					     )))
 	    (for-each 
@@ -540,11 +540,11 @@
 					      (for-each (lambda (run-id)
 							  (if keep-going
 							      (handle-exceptions
 							       exn
 							       (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id)
-							       (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime)))
+							       (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
 							run-ids)))
 					  "runs: mark-incompletes")))
 	    (thread-start! th1)
 	    (thread-start! th2)
 	    (thread-join! th1)
@@ -718,19 +718,19 @@
 			(runs:queue-next-tal trimmed-tal trimmed-reg reglen regfull)
 			(runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull)
 			reruns)))
 	      (list (car newtal)(append (cdr newtal) reg) '() reruns))))
 
-     ((and (null? fails)
+     ((and (null? fails) ;; Not-started tests remain but cannot be run: everything looks completed, with no prospect of unsticking whatever is stuck. We should mark hed as moribund and then either exit or continue if there are more tests to consider.
 	   (null? prereq-fails)
 	   (null? non-completed))
       (if  (runs:can-keep-running? hed 20)
 	  (begin
 	    (runs:inc-cant-run-tests hed)
-	    (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0))
+	    (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; 
 	    ;; getting here likely means the system is way overloaded, kill a full minute before continuing
-	    (thread-sleep! 60)
+	    (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
 	    ;; num-retries code was here
 	    ;; we use this opportunity to move contents of reg to tal
 	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
 	  (begin
 	    (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
@@ -1167,50 +1167,50 @@
   ;; Do mark-and-find clean up of db before starting runing of quue
   ;;
   ;; (rmt:find-and-mark-incomplete)
 
   (let* ((run-info             (rmt:get-run-info run-id))
-	(tests-info            (mt:get-tests-for-run run-id #f '() '())) ;;  qryvals: "id,testname,item_path"))
-	(sorted-test-names     (tests:sort-by-priority-and-waiton test-records))
-	(test-registry         (make-hash-table))
-	(registry-mutex        (make-mutex))
-	(num-retries           0)
-	(max-retries           (config-lookup *configdat* "setup" "maxretries"))
-	(max-concurrent-jobs   (configf:lookup-number *configdat* "setup" "max_concurrent_jobs" default: 50))
-        (reglen                (if (number? reglen-in) reglen-in 1))
-	(last-time-incomplete  (- (current-seconds) 900)) ;; force at least one clean up cycle
-	(last-time-some-running (current-seconds))
-	;; (tdbdat                (tasks:open-db))
-	(runsdat (make-runs:dat
-		  ;; hed: hed
-		  ;; tal: tal
-		  ;; reg: reg
-		  ;; reruns: reruns
-		  reglen: reglen
-		  regfull: #f ;; regfull
-		  ;; test-record: test-record
-		  runname: runname
-		  ;; test-name: test-name
-		  ;; item-path: item-path
-		  ;; jobgroup: jobgroup
-		  max-concurrent-jobs: max-concurrent-jobs
-		  run-id: run-id
-		  ;; waitons: waitons
-		  ;; testmode: testmode
-		  test-patts: test-patts
-		  required-tests: required-tests
-		  test-registry: test-registry
-		  registry-mutex: registry-mutex
-		  flags: flags
-		  keyvals: keyvals
-		  run-info: run-info
-		  ;; newtal: newtal
-		  all-tests-registry: all-tests-registry
-		  ;; itemmaps: itemmaps
-		  ;; prereqs-not-met: (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)
-		  ;; can-run-more-tests: (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) ;; look at the test jobgroup and tot jobs running
-		  )))
+         (tests-info            (mt:get-tests-for-run run-id #f '() '())) ;;  qryvals: "id,testname,item_path"))
+         (sorted-test-names     (tests:sort-by-priority-and-waiton test-records))
+         (test-registry         (make-hash-table))
+         (registry-mutex        (make-mutex))
+         (num-retries           0)
+         (max-retries           (config-lookup *configdat* "setup" "maxretries"))
+         (max-concurrent-jobs   (configf:lookup-number *configdat* "setup" "max_concurrent_jobs" default: 50))
+         (reglen                (if (number? reglen-in) reglen-in 1))
+         (last-time-incomplete  (- (current-seconds) 900)) ;; force at least one clean up cycle
+         (last-time-some-running (current-seconds))
+         ;; (tdbdat                (tasks:open-db))
+         (runsdat (make-runs:dat
+                   ;; hed: hed
+                   ;; tal: tal
+                   ;; reg: reg
+                   ;; reruns: reruns
+                   reglen: reglen
+                   regfull: #f ;; regfull
+                   ;; test-record: test-record
+                   runname: runname
+                   ;; test-name: test-name
+                   ;; item-path: item-path
+                   ;; jobgroup: jobgroup
+                   max-concurrent-jobs: max-concurrent-jobs
+                   run-id: run-id
+                   ;; waitons: waitons
+                   ;; testmode: testmode
+                   test-patts: test-patts
+                   required-tests: required-tests
+                   test-registry: test-registry
+                   registry-mutex: registry-mutex
+                   flags: flags
+                   keyvals: keyvals
+                   run-info: run-info
+                   ;; newtal: newtal
+                   all-tests-registry: all-tests-registry
+                   ;; itemmaps: itemmaps
+                   ;; prereqs-not-met: (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)
+                   ;; can-run-more-tests: (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) ;; look at the test jobgroup and tot jobs running
+                   )))
 
     ;; Initialize the test-registery hash with tests that already have a record
     ;; convert state to symbol and use that as the hash value
     (for-each (lambda (trec)
 		(let ((id (db:test-get-id        trec))
@@ -1284,12 +1284,12 @@
 	;;     (server:kind-run *toppath*))
 	
 	(if (> num-running 0)
 	  (set! last-time-some-running (current-seconds)))
 
-      (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
-	  (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
+        (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
+            (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
 	;; (debug:print 0 *default-log-port* "max-tries-hash: " (hash-table->alist *max-tries-hash*))
 
 	;; Ensure all top level tests get registered. This way they show up as "NOT_STARTED" on the dashboard
 	;; and it is clear they *should* have run but did not.
 	(if (not (hash-table-ref/default test-registry (db:test-make-full-name test-name "") #f))
@@ -1443,11 +1443,12 @@
 	 ((not (null? reg)) ;; could we get here with leftovers?
 	  (debug:print-info 0 *default-log-port* "Have leftovers!")
 	  (loop (car reg)(cdr reg) '() reruns))
 	 (else
 	  (debug:print-info 4 *default-log-port* "Exiting loop with...\n  hed=" hed "\n  tal=" tal "\n  reruns=" reruns))
-	 )))
+	 ))) ;; end loop on sorted test names
+    
     ;; now *if* -run-wait we wait for all tests to be done
     ;; Now wait for any RUNNING tests to complete (if in run-wait mode)
     (thread-sleep! 5) ;; I think there is a race condition here. Let states/statuses settle
     (let wait-loop ((num-running      (rmt:get-count-tests-running-for-run-id run-id))
 		    (prev-num-running 0))