@@ -16,11 +16,12 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;  strftime('%m/%d/%Y %H:%M:%S','now','localtime')
 
 (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) 
-     posix-extras directory-utils pathname-expand typed-records format  sxml-serializer sxml-modifications)
+     posix-extras directory-utils pathname-expand typed-records format  sxml-serializer
+     sxml-modifications matchable)
 
 (declare (unit runs))
 (declare (uses db))
 (declare (uses common))
 (declare (uses items))
@@ -45,20 +46,29 @@
 (defstruct runs:dat
   reglen regfull
   runname max-concurrent-jobs run-id
   test-patts required-tests test-registry
   registry-mutex flags keyvals run-info all-tests-registry
-  can-run-more-tests
+  ;; result of the most recent runs:can-run-more-tests call that was stored here; #f until then
+  (can-run-more-tests      #f) ;; (list can-run-more-flag num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)
   ((can-run-more-tests-count 0) : fixnum)
   (last-fuel-check         0)  ;; time when we last checked fuel
   (beginning-of-time       (current-seconds))
+  (load-mgmt-function      #f) ;; thunk that calls the common:wait-for-*-load gates; #f until it is first installed
+  (wait-for-jobs-function  #f) ;; proc of one runs:testdat arg, called just before launch-test; #f until installed
+  (last-load-check-time    0)  ;; NOTE(review): no reader or writer visible in this change - confirm it is used
+  (last-jobs-check-time    0)  ;; NOTE(review): read by the jobs gate but never updated in the visible code
   )
 
 (defstruct runs:testdat
   hed tal reg reruns  test-record
   test-name item-path jobgroup
-  waitons testmode  newtal itemmaps prereqs-not-met)
+  waitons testmode  newtal
+  itemmaps
+  (prereqs-not-met #f) ;; cached rmt:get-prereqs-not-met result; #f means nothing is cached yet
+  (last-update 0) ;; (current-seconds) at which prereqs-not-met was last refreshed
+  )
   
 ;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files
 ;;  - remove any that are over 3600 seconds old
 ;;  - if there are any that are younger than 10 seconds
 ;;      * sleep 10 seconds
@@ -89,11 +99,11 @@
 				    lock-files)))
 	  (if fresh-locks
 	      (begin
 		(if (runs:lownoise "runners-softlock-wait" 360)
 		    (debug:print-info 0 *default-log-port* "Other runners in flight, giving up some time..."))
-		(thread-sleep! 10))
+		(thread-sleep! 2))
 	      (begin
 		(if (runs:lownoise "runners-softlock-nowait" 360)
 		    (debug:print-info 0 *default-log-port* "No runners in flight, updating softlock"))
 		(let* ((ouf (open-output-file my-lock-file)))
 		  (with-output-to-port ouf
@@ -102,12 +112,12 @@
 	  (runs:dat-last-fuel-check-set! rdat (current-seconds))))))
   
 ;; Fourth try, do accounting through time
 ;;
 (define (runs:parallel-runners-mgmt rdat)
-  (let ((time-to-check 10) ;; 28
-	(time-to-wait  30)
+  (let ((time-to-check (configf:lookup-number *configdat* "runners" "time-to-check" default: 10)) ;; min seconds since last fuel check before the softlock is consulted again
+	(time-to-wait  (configf:lookup-number *configdat* "runners" "time-to-wait" default: 30)) ;; NOTE(review): bound but never referenced in this function
 	(now-time      (current-seconds)))
     (if (> (- now-time (runs:dat-last-fuel-check rdat)) time-to-check) ;; time to check
 	(runs:wait-on-softlock rdat "runners"))))
 
 ;; To test parallel-runners management start a repl:
@@ -213,15 +223,17 @@
 	  exn
 	  (let ((call-chain (get-call-chain))
 		(msg        ((condition-property-accessor 'exn 'message) exn)))
 	    (if (< count 5)
 		(begin ;; this call is colliding, do some crude stuff to fix it.
-		  (debug:print 0 *default-log-port* "ERROR: *configdat* was inaccessible! This should never happen. Retry #" count)
+		  (debug:print 0 *default-log-port* "ERROR: *configdat* was inaccessible! This should never happen. Retry #" count
+			       ", exn=" exn)
 		  (launch:setup force-reread: #t)
 		  (fatal-loop (+ count 1))) 
 		(begin
-		  (debug:print 0 *default-log-port* "FATAL: *configdat* was inaccessible! This should never happen. Retried " count " times. Message: " msg)
+		  (debug:print 0 *default-log-port* "FATAL: *configdat* was inaccessible! This should never happen. Retried " count
+			       " times. Message: " msg)
 		  (debug:print 0 *default-log-port* "Call chain:")
 		  (with-output-to-port *default-log-port*
 
                     (lambda ()
                       (print "*configdat* is >>"*configdat*"<<")
@@ -311,23 +323,12 @@
   ;; Take advantage of a good place to exit if running the one-pass methodology
   (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
 	   (args:get-arg "-one-pass"))
       (exit 0))
 
-  (thread-sleep! (cond ;; BB: check with Matt.  Should this sleep move
-		       ;; to cond clauses below where we determine we
-		       ;; have too many jobs running rather than each
-		       ;; time the and condition above is true (which
-		       ;; seems like always)?
-        	  ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
-		   (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
-		   10)  ;; obviously haven't had any work to do for a while
-		  (else 0)))
-;;		   ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
-;;		   (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)
-;;		   )))
-  
+  (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+
   (let* ((num-running             (rmt:get-count-tests-running run-id #f)) ;; fastmode=no
 	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
 	 (job-group-limit         (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
 				    (if (string? jobg-count)
 					(string->number jobg-count)
@@ -378,11 +379,11 @@
           (if (null? existing-tests)
               (let* ((use-log-dir (if (not (directory-exists? log-dir))
                                       (handle-exceptions
                                        exn
                                        (begin
-                                         (debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir)
+                                         (debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir ", exn=" exn)
                                          #f)
                                        (create-directory log-dir #t)
                                        #t)
                                       #t))
                      (start-time   (current-seconds))
@@ -389,11 +390,11 @@
                      (actual-logf  (if use-log-dir full-log-fname log-file)))
                 (handle-exceptions
                  exn
                  (begin
                    (print-call-chain *default-log-port*)
-                   (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn))
+                   (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)
                    (debug:print 0 *default-log-port* "ERROR: failed to run pre-hook " run-pre-hook ", check the log " log-file))
                  (debug:print-info 0 *default-log-port* "running run-pre-hook: \"" run-pre-hook "\", log is " actual-logf)
                  (system (conc run-pre-hook " >> " actual-logf " 2>&1"))
                  (debug:print-info 0 *default-log-port* "pre-hook \"" run-pre-hook "\" took " (- (current-seconds) start-time) " seconds to run.")))
               (debug:print 0 *default-log-port* "Skipping pre-hook call \"" run-pre-hook "\" as there are existing tests for this run.")))))
@@ -418,11 +419,11 @@
           ;;    (debug:print 0 *default-log-port* "Skipping post-hook call \"" run-post-hook "\" as there are existing tests for this run.")))))
 	  (let* ((use-log-dir (if (not (directory-exists? log-dir))
 				  (handle-exceptions
 				      exn
 				      (begin
-					(debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir)
+					(debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir ", exn=" exn)
 					#f)
 				    (create-directory log-dir #t)
 				    #t)
 				  #t))
 		 (start-time   (current-seconds))
@@ -429,11 +430,11 @@
 		 (actual-logf  (if use-log-dir full-log-fname log-file)))
 	    (handle-exceptions
 		exn
 		(begin
 		  (print-call-chain *default-log-port*)
-		  (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn))
+		  (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)
 		  (debug:print 0 *default-log-port* "ERROR: failed to run post-hook " run-post-hook ", check the log " log-file))
 	      (debug:print-info 0 *default-log-port* "running run-post-hook: \"" run-post-hook "\", log is " actual-logf)
 	      (system (conc run-post-hook " >> " actual-logf " 2>&1"))
 	      (debug:print-info 0 *default-log-port* "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run."))))))
 
@@ -535,11 +536,11 @@
 
     ;; register this run in monitor.db
     (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params)
     (rmt:tasks-set-state-given-param-key task-key "running")
 
-    (common:telemetry-log "run-tests"
+    #;(common:telemetry-log "run-tests"
                           payload:
                           `( (target . ,target)
                              (run-name . ,runname)
                              (test-patts . ,test-patts) ) )
 
@@ -749,11 +750,11 @@
 					    (let ((run-ids (rmt:get-all-run-ids)))
 					      (for-each (lambda (run-id)
 							  (if keep-going
 							      (handle-exceptions
 							       exn
-							       (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id)
+							       (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
 							       (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
 							run-ids)))
 					  "runs: mark-incompletes")))
 	    ;; (thread-start! th1)
 	    (thread-start! th2)
@@ -831,10 +832,24 @@
 ;; reg - list of previously visited tests
 ;; tal - list of never visited tests
 ;;   prefer next hed to be from reg than tal.
 
 (define runs:nothing-left-in-queue-count 0)
+
+;; Cached wrapper for rmt:get-prereqs-not-met: the answer is kept in testdat and reused until it is
+;; 10 seconds old (assumes testdat belongs to this one test). A non-list reply is logged and becomes '().
+(define (runs:lazy-get-prereqs-not-met  testdat run-id waitons hed item-path #!key (mode '(normal))(itemmaps #f))
+  (if (and (runs:testdat-prereqs-not-met testdat)
+	   (< (- (current-seconds) (runs:testdat-last-update testdat)) 10)) ;; cached and younger than 10 seconds
+      (runs:testdat-prereqs-not-met testdat)  ;; return the cached result
+      (let* ((raw (rmt:get-prereqs-not-met run-id waitons hed item-path mode: mode itemmaps: itemmaps))
+	     (res (if (list? raw) raw '()))) ;; callers hand this straight to the runs:calc-* list procedures
+	(if (not (list? raw))
+	    (debug:print 0 *default-log-port* "ERROR: rmt:get-prereqs-not-met returned non-list!\n" "  res=" raw " run-id=" run-id " waitons=" waitons " hed=" hed " item-path=" item-path " testmode=" mode " itemmaps=" itemmaps))
+	(runs:testdat-prereqs-not-met-set! testdat res)
+	(runs:testdat-last-update-set! testdat (current-seconds))
+	res)))
 
 ;;======================================================================
 ;; runs:expand-items is called by runs:run-tests-queue
 ;;======================================================================
 ;;
@@ -841,22 +856,18 @@
 ;; return value of runs:expand-items is passed back to runs-tests-queue and is fed to named loop with this signature:
 ;;    (let loop ((hed         (car sorted-test-names))
 ;;	         (tal         (cdr sorted-test-names))
 ;;	         (reg         '()) ;; registered, put these at the head of tal 
 ;;	         (reruns      '()))
-(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps)
+(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs
+			   run-id waitons item-path testmode test-record
+			   can-run-more items runname tconfig reglen test-registry
+			   test-records itemmaps testdat)
   (let* ((loop-list       (list hed tal reg reruns))
-	 (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)))
-			    (if (list? res)
-				res
-				(begin
-				  (debug:print 0 *default-log-port*
-					       "ERROR: rmt:get-prereqs-not-met returned non-list!\n"
-					       "  res=" res " run-id=" run-id " waitons=" waitons " hed=" hed " item-path=" item-path " testmode=" testmode " itemmaps=" itemmaps)
-				  '()))))
-         (have-itemized (not (null? (lset-intersection eq? testmode '(itemmatch itemwait)))))
-	 ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
+	 (prereqs-not-met (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path
+							 mode: testmode itemmaps: itemmaps))
+	 (have-itemized   (not (null? (lset-intersection eq? testmode '(itemmatch itemwait)))))
 	 (fails           (runs:calc-fails prereqs-not-met))
 	 (prereq-fails    (runs:calc-prereq-fail prereqs-not-met))
 	 (non-completed   (runs:calc-not-completed prereqs-not-met))
 	 (runnables       (runs:calc-runnable prereqs-not-met))
          (unexpanded-prereqs
@@ -987,13 +998,17 @@
 	   (null? non-completed))
      (debug:print-info 4 *default-log-port* "cond branch - "  "ei-4")
       (if  (runs:can-keep-running? hed 20)
 	  (begin
 	    (runs:inc-cant-run-tests hed)
-	    (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; 
+	    (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 5 sec.") ;; wait time quoted here must match the thread-sleep! that follows
 	    ;; getting here likely means the system is way overloaded, kill a full minute before continuing
-	    (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
+	    ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!!
+	    ;; No runsdat, can't do this yet
+	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+	    ;;
+	    (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
 	    ;; num-retries code was here
 	    ;; we use this opportunity to move contents of reg to tal
 	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
 	  (begin
 	    (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
@@ -1119,11 +1134,29 @@
 				  (conc " WARNING: t is not a vector=" t )))
 			    prereqs-not-met)
 		       ", ") ") fails: " fails
 		       "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f))
 			    
-
+    ;; well, first lets see if cpu load throttling is enabled. If so wait around until the
+    ;; average cpu load is under the threshold before continuing
+    ;;
+    (if (runs:dat-load-mgmt-function runsdat)
+	((runs:dat-load-mgmt-function runsdat))
+	(runs:dat-load-mgmt-function-set!
+	 runsdat
+	 (lambda ()
+	   ;; jobtools maxload is useful for where the full Megatest run is done on one machine
+	   (if (and (not (common:on-homehost?))
+		    maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
+	       (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
+	   
+	   ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
+	   (if maxhomehostload
+	       (common:wait-for-homehost-load maxhomehostload
+					      (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))))))
+    
+ 
     
     (if (and (not (null? prereqs-not-met))
 	     (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
 	(debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", ")))
 
@@ -1192,11 +1225,11 @@
      ((not have-resources) ;; simply try again after waiting a second
       (if (runs:lownoise "no resources" 60)
 	  (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ..."))
       ;; Have gone back and forth on this but db starvation is an issue.
       ;; wait one second before looking again to run jobs.
-      (thread-sleep! 1)
+      (thread-sleep! 0.25)
       ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
       (list (car newtal)(cdr newtal) reg reruns))
      
      ;; This is the final stage, everything is in place so launch the test
      ;;
@@ -1208,28 +1241,17 @@
       ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path))
       ;; we are going to reset all the counters for test retries by setting a new hash table
       ;; this means they will increment only when nothing can be run
       (set! *max-tries-hash* (make-hash-table))
       
-      ;; well, first lets see if cpu load throttling is enabled. If so wait around until the
-      ;; average cpu load is under the threshold before continuing
-
-      ;; jobtools maxload is useful for where the full Megatest run is done on one machine
-      (if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
-          (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
-
-      ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
-      (if maxhomehostload
-          (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))
-      
-      (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry)
+      (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat)
       (runs:incremental-print-results run-id)
       (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running)
       (runs:shrink-can-run-more-tests-count runsdat)  ;; DELAY TWEAKER (still needed?)
       ;; (thread-sleep! *global-delta*)
       (if (or (not (null? tal))(not (null? reg)))
-	  (runs:loop-values tal reg reglen regfull reruns)
+	  (runs:loop-values tal reg reglen regfull reruns) ;; hed should be dropped at this time
 	  #f))
      
      ;; must be we have unmet prerequisites
      ;;
      (else
@@ -1246,19 +1268,18 @@
 	      (member 'toplevel testmode))
 	  (begin
 	    ;; couldn't run, take a breather
 	    (if  (runs:lownoise "Waiting for more work to do..." 60)
 		 (debug:print-info 0 *default-log-port* "Waiting for more work to do..."))
-	    (thread-sleep! 1)
+	    
+	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+	    (thread-sleep! 5)
 	    (list (car newtal)(cdr newtal) reg reruns))
 	  ;; the waiton is FAIL so no point in trying to run hed ever again
 	  (begin
             (let ((my-test-id (rmt:get-test-id run-id test-name item-path)))
-              (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2"))
-
-
-            
+              (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2"))            
             (if (or (not (null? reg))(not (null? tal)))
                 (if (vector? hed)
 		  (begin
 		    (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path
 				 " from the launch list as it has prerequistes that are FAIL")
@@ -1434,10 +1455,13 @@
 ;; every time though the loop increment the test/itempatt val.
 ;; when the min is > max-allowed and none running then force exit
 ;;
 (define *max-tries-hash* (make-hash-table))
 
+(define (runs:pretty-long-list lst) ;; debug aid: more than 8 items => first 3 items followed by "..."
+  (cond ((<= (length lst) 8) lst) (else (append (take lst 3) (list "...")))))
+
 ;;======================================================================
 ;; runs:run-tests-queue is called by runs:run-tests
 ;;======================================================================
 ;;
 ;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... >
@@ -1573,19 +1597,10 @@
 			   itemmaps:    itemmaps
 			   ;; prereqs-not-met: prereqs-not-met
 			   )))
 	(runs:dat-regfull-set! runsdat regfull)
     
-        ;; -- removed BB 17ww28 - no longer needed.
-	;; every 15 minutes verify the server is there for this run
-	;; (if (and (common:low-noise-print 240 "try start server"  run-id)
-	;; 	 (not (or (and *runremote*
-	;; 		       (remote-server-url *runremote*)
-	;; 		       (server:ping (remote-server-url *runremote*)))
-	;; 		  (server:check-if-running *toppath*))))
-	;;     (server:kind-run *toppath*))
-	
 	(if (> num-running 0)
             (set! last-time-some-running (current-seconds)))
 
         (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
             (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
@@ -1614,11 +1629,11 @@
 
 	(runs:incremental-print-results run-id)
 	(debug:print 4 *default-log-port* "TOP OF LOOP => "
 		     "test-name: " test-name
 		     "\n  hed:         " hed
-		     "\n  tal:         " tal
+		     "\n  tal:         " (runs:pretty-long-list tal)
 		     "\n  reg:         " reg
                      "\n  test-record  " test-record
                      "\n  itemdat:     " itemdat
 		     "\n  items:       " items
 		     "\n  item-path:   " item-path
@@ -1628,11 +1643,11 @@
 		     "\n  regfull:     " regfull
 		     "\n  reglen:      " reglen
 		     "\n  length reg:  " (length reg)
                      )
 
-	(runs:parallel-runners-mgmt runsdat)
+	;; (runs:parallel-runners-mgmt runsdat)
 
 	;; check for hed in waitons => this would be circular, remove it and issue an
 	;; error
 	(if (member test-name waitons)
 	    (begin
@@ -1664,13 +1679,65 @@
           (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-2")
 	  (debug:print-info 4 *default-log-port* "OUTER COND: (not items)")
 	  (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
 		   (not (null? tal)))
 	      (loop (car tal)(cdr tal) reg reruns))
-	  (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
-	  (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
-	  (let ((loop-list (runs:process-expanded-tests runsdat testdat)))
+
+	  ;; gonna try a strategy change here.
+	  ;;
+	  ;; check if can run more tests. if yes, continue, if no, rest until can run more
+	  ;; look at the test jobgroup and tot jobs running
+	  ;;
+	  ;; NOTE: This does NOT actually gate here, only captures the proc to be called later
+	  ;; 
+	  (if (not (runs:dat-wait-for-jobs-function runsdat))
+	      (runs:dat-wait-for-jobs-function-set!
+	       runsdat 
+	       (lambda (testdat-in)
+		 (let* ((jobgroup              (runs:testdat-jobgroup testdat-in))
+			(can-run-more-tests    (runs:dat-can-run-more-tests runsdat))
+			(last-jobs-check-time  (runs:dat-last-jobs-check-time runsdat))
+			(should-check-jobs     (match can-run-more-tests
+						 ((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params)
+						  (if (< (- max-concurrent-jobs num-running) 25)
+						      (begin
+							(debug:print-info 0 *default-log-port*
+									  "less than 20 jobs headroom, ("max-concurrent-jobs
+									  "-"num-running")>20. Forcing prelaunch check.")
+							#t)
+						      #f))
+						 (else #f)))) ;; no record yet
+		   (if should-check-jobs
+		       (let loop-can-run-more
+			   ((res      (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
+			    (remtries 1440)) ;; we can wait for up to two hours for jobs to get done
+			 (match res
+			   ((run-more num-running . rem)
+			    (if (or run-more
+				    (< remtries 1))
+				(begin
+				  (if (runs:lownoise "num-running" 30)
+				      (debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs))
+				  (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through
+				(begin
+				  (if (runs:lownoise "num-running" 10)
+				      (debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of "
+							max-concurrent-jobs " allowed."))
+				  (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable
+				  
+				  ;; wait for load here
+				  (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+				  (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
+						     (- remtries 1)))))))
+		       )))))
+
+	  ;; I'm not clear on why prereqs are gathered here TODO: verify this is needed
+
+	  ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed
+	  (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
+
+	  (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running
             (if loop-list (apply loop loop-list))))
 
 	 ;; items processed into a list but not came in as a list been processed
 	 ;;
 	 ((and (list? items)     ;; thus we know our items are already calculated
@@ -1729,19 +1796,20 @@
 	 ;; if items is a proc then need to run items:get-items-from-config, get the list and loop 
 	 ;;    - but only do that if resources exist to kick off the job
 	 ;; EXPAND ITEMS
 	 ((or (procedure? items)(eq? items 'have-procedure))
           (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-4")
-	  (let ((can-run-more    (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))
+	  (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))
 	    (if (and (list? can-run-more)
-		     (car can-run-more))
-		(let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here
+		     (car can-run-more)) ;; itemized test expanded here
+		(let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup
+						    max-concurrent-jobs run-id waitons item-path
+						    testmode test-record can-run-more items runname
+						    tconfig reglen test-registry test-records itemmaps testdat))) ;; testdat: last required arg of runs:expand-items (holds the prereqs cache)
 		  (if loop-list
 		      (apply loop loop-list)
-                      (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)
-                      )
-                  )
+                      (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)))
 		;; if can't run more just loop with next possible test
 		(loop (car newtal)(cdr newtal) reg reruns))))
          
 	 ;; this case should not happen, added to help catch any bugs
 	 ((and (list? items) itemdat)
@@ -1774,11 +1842,13 @@
     ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched 
     (rmt:set-var (conc "lunch-complete-" run-id) "yes")  
         
     ;; now *if* -run-wait we wait for all tests to be done
     ;; Now wait for any RUNNING tests to complete (if in run-wait mode)
+    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
     (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle
+    
     (let wait-loop ((num-running      (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes
 		    (prev-num-running 0))
       ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running)
       (if (and (or (args:get-arg "-run-wait")
 		   (equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
@@ -1793,11 +1863,12 @@
 		  (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
 		  (rmt:find-and-mark-incomplete run-id #f)
 		  (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
 				    " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
 				    (time->string (seconds->local-time (current-seconds))))))
-	    (thread-sleep! 5)
+	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+	    (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1))
 	    (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes
 		       num-running))))
     ;; LET* ((test-record
     ;; we get here on "drop through". All done!
     ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. 
@@ -1852,11 +1923,11 @@
 	     (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t))))
        lst))
 
 ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step
 ;;
-(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry)
+(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry runsdat testdat-rec)
   ;; All these vars might be referenced by the testconfig file reader
   ;; 
   ;; NEED to reprocess testconfig here, ensuring that item variables are available.
   ;; This is for Tal's issue with item-specific env vars not being set for use in skip.
   ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273
@@ -1930,12 +2001,13 @@
 		  (set! test-id (rmt:get-test-id run-id test-name item-path))))
 	    (debug:print-info 4 *default-log-port* "test-id=" test-id ", run-id=" run-id ", test-name=" test-name ", item-path=\"" item-path "\"")
 	    (set! testdat (rmt:get-test-info-by-id run-id test-id))
 	    (if (not testdat)
 		(begin
-		  (debug:print-info 0 *default-log-port* "WARNING: server is overloaded, trying again in one second")
-		  (thread-sleep! 1)
+		  (debug:print-info 0 *default-log-port* "WARNING: server is overloaded, trying again in two seconds")
+		  ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+		  (thread-sleep! 2)
 		  (loop)))))
       (if (not testdat) ;; should NOT happen
 	  (debug:print-error 0 *default-log-port* "failed to get test record for test-id " test-id))
       (set! test-id (db:test-get-id testdat))
       (if (common:file-exists? test-path)
@@ -2042,15 +2114,23 @@
 		       (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test)
 		       (debug:print-info 1 *default-log-port* "SKIPPING Test " full-test-name " due to " skip-test))
 		     ;;
 		     ;; Here the test is handed off to launch.scm for launch-test to complete the launch process
 		     ;;
-		     (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
-			 (begin
-			   (print "ERROR: Failed to launch the test. Exiting as soon as possible")
-			   (set! *globalexitstatus* 1) ;; 
-			   (process-signal (current-process-id) signal/kill))))))))
+		     (begin
+		       ;; wait here until fewer than the max number of jobs are running
+		       (if (runs:dat-wait-for-jobs-function runsdat)
+			   ((runs:dat-wait-for-jobs-function runsdat) testdat-rec))
+		       
+		       (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
+			   (begin
+			     (print "ERROR: Failed to launch the test. Exiting as soon as possible")
+			     (set! *globalexitstatus* 1) ;; 
+			     (process-signal (current-process-id) signal/kill))
+			   )
+		       ;; wait again here?
+		       ))))))
 	((KILLED) 
 	 (debug:print 1 *default-log-port* "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.")
 	 (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED))
 	((LAUNCHED REMOTEHOSTSTART RUNNING)  
 	 (debug:print 2 *default-log-port* "NOTE: " test-name " is already running"))
@@ -2119,11 +2199,11 @@
   (let* ((targets (string-split target-patts ","))
 	 (keys    (rmt:get-keys))
 	 (res-ht  (make-hash-table))) ;; target -> ( runrecord1 runrecord2 ... )
     (for-each
      (lambda (target-patt)
-       (let ((runs     (rmt:simple-get-runs runpatt #f #f target-patt)))
+       (let ((runs     (rmt:simple-get-runs runpatt #f #f target-patt #f)))
 	 (for-each
 	  (lambda (run)
 	    (let ((target (simple-run-target run)))
 	      (hash-table-set! res-ht target (cons run (hash-table-ref/default res-ht target '())))))
 	  runs)))
@@ -2521,11 +2601,11 @@
 				(if (not (null? tal))
 				    (loop (car tal)(cdr tal))))
 			       ((run-wait)
                                 ;; BB TODO - manage has-subrun case
 				(debug:print-info 2 *default-log-port* "still waiting, " (length tests) " tests still running")
-				(thread-sleep! 10)
+				(thread-sleep! 5)
 				(let ((new-tests (proc-get-tests run-id)))
 				  (if (null? new-tests)
 				      (debug:print-info 1 *default-log-port* "Run completed according to zero tests matching provided criteria.")
 				      (loop (car new-tests)(cdr new-tests)))))
 			       ((archive)
@@ -2628,18 +2708,18 @@
     (if (symbolic-link? run-dir)
 	(begin
 	  (debug:print-info 1 *default-log-port* "Removing symlink " run-dir)
 	  (handle-exceptions
 	   exn
-	   (debug:print-error 0 *default-log-port* " Failed to remove symlink " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue")
+	   (debug:print-error 0 *default-log-port* " Failed to remove symlink " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue, exn=" exn)
 	   (delete-file run-dir)))
 	(if (directory? run-dir)
 	    (if (> (directory-fold (lambda (f x)(+ 1 x)) 0 run-dir) 0)
 		(debug:print 0 *default-log-port* "WARNING: refusing to remove " run-dir " as it is not empty")
 		(handle-exceptions
 		 exn
-		 (debug:print-error 0 *default-log-port* " Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue")
+		 (debug:print-error 0 *default-log-port* " Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue, exn=" exn)
 		 (delete-directory run-dir)))
 	    (if (and run-dir
 		     (not (member run-dir (list "n/a" "/tmp/badname"))))
 		(debug:print 0 *default-log-port* "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink")
 		(debug:print 0 *default-log-port* "NOTE: the run dir for this test is undefined. Test may have already been deleted."))
@@ -2912,11 +2992,11 @@
 		    (let* ((final-doc ((sxml-modify `("testsuite" insert-into (@ (name ,xml-ts-name) (tests ,tests-count) (errors ,error-cnt) (failures ,fail-cnt)))) new-doc)))
 		    (debug:print 0 *default-log-port* "modify attrib error=" error-cnt " fail= " fail-cnt)
 		    (handle-exceptions
 		     exn
 		     (let*	((msg	((condition-property-accessor 'exn 'message) exn)))
-		       (debug:print 0 *default-log-port* (conc "WARNING: Failed to update file" xml-path". Message:" msg)))
+		       (debug:print 0 *default-log-port* (conc "WARNING: Failed to update file" xml-path". Message:" msg ", exn=" exn)))
 		     		   
 		     (if (not (file-exists? xml-dir)) 
 			 (create-directory xml-dir #t))
                      (if (not (rmt:no-sync-get/default keyname #f)) 
                        (begin
@@ -2946,10 +3026,10 @@
 		  (debug:print-info 0 *default-log-port* "Removing cached files:\n    " (string-intersperse files "\n    "))
 		  (for-each 
 		   (lambda (f)
 		     (handle-exceptions
 			 exn
-			 (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f)
+			 (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f ", exn=" exn)
 		       (delete-file f)))
 		   files))))
 	  (debug:print-error 0 *default-log-port* "-clean-cache requires -runname."))
       (debug:print-error 0 *default-log-port* "-clean-cache requires -target or -reqtarg")))