Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -1750,19 +1750,26 @@
     (with-input-from-pipe
 	(conc "/bin/readlink -f " path)
       (lambda ()
 	(read-line)))))
 
+;; returns *effective load* (not normalized): onemin itself when onemin < fivemin (load falling),
+;; otherwise onemin plus 60s of the (onemin - fivemin) slope taken over (300 - 60)s, floored at 0
 (define (common:get-intercept onemin fivemin)
-  (let* ((load-change (- onemin fivemin))
-         (tchange     (- 300 60)))
-  (max (+ onemin (* 60 (/ load-change tchange)))0))
-)
+  (if (< onemin fivemin) ;; load is decreasing, just use the onemin load
+      onemin
+      (let* ((load-change (- onemin fivemin))
+	     (tchange     (- 300 60)))
+	(max (+ onemin (* 60 (/ load-change tchange))) 0))))
 
+;; calculate a delay number based on a droop curve: (5 ^ (4 * load-in / numcpus)) / 10
+;;    inputs are:
+;;      - load-in, load as from uptime, NOT normalized
+;;      - numcpus, number of cpus, ideally use the real cpus, not threads; it divides load-in so it must not be zero
+;;
 (define (common:get-delay load-in numcpus)
-  (max (/ (expt 5 (* 4 (/ load-in numcpus))) 10) 0)
-)
+  (max (/ (expt 5 (* 4 (/ load-in numcpus))) 10) 0))
 
 
 (define (get-cpu-load #!key (remote-host #f))
   (car (common:get-cpu-load remote-host)))
 ;;   (let* ((load-res (process:cmd-run->list "uptime"))
@@ -2062,16 +2069,16 @@
 		    (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
 		    (host-last-used-set! rec curr-time)
 		    new-best)
 		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))
 
-(define (common:wait-for-homehost-load maxload msg)
+(define (common:wait-for-homehost-load maxnormload msg) ;; maxnormload and msg are forwarded to common:wait-for-normalized-load
   (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                      #f
                      (common:get-homehost)))
          (hh     (if hh-dat (car hh-dat) #f)))
-    (common:wait-for-normalized-load maxload msg hh)))
+    (common:wait-for-normalized-load maxnormload msg hh)))
 
 (define (common:get-num-cpus remote-host)
   (let* ((actual-host (or remote-host (get-host-name))))
     (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often!
 	(let* ((proc   (lambda ()
@@ -2095,88 +2102,177 @@
 	      (common:write-cached-info actual-host "num-cpus" result))
 	  result))))
 
 ;; wait for normalized cpu load to drop below maxload
 ;;
-(define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5))
+(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5)) ;; rem-tries: retries left for fetching num-cpus, #f is returned once they run out
   (let ((num-cpus (common:get-num-cpus remote-host)))
     (if num-cpus
-	(common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host)
+	(common:wait-for-cpuload maxnormload num-cpus msg: msg remote-host: remote-host) ;; only maxnormload and numcpus-in are positional, there is no waitdelay argument
 	(begin
 	  (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again
 	  (if (> rem-tries 0)
-	      (common:wait-for-normalized-load maxload msg remote-host (- rem-tries 1))
+	      (common:wait-for-normalized-load maxnormload msg remote-host (- rem-tries 1))
 	      #f)))))
 
 ;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
+;;   maxnormload - normalized load limit, numcpus-in - cpu count, msg - text appended to the delay message, remote-host - host to check (#f is reported as localhost)
+;;   count - counts down once per 15s load wait, num-tries - counts down once per retry after bad (negative) load data
 ;;
-(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5))
-  (let* ((loadavg (common:get-cpu-load remote-host))
-	 (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
-		      (common:get-num-cpus remote-host)
-		      numcpus-in))
-	 (maxload (if force-maxload
-		      maxload-in
-		      (if (number? maxload-in)
-			  (max maxload-in 0.5)
-			  0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
-	 (first   (car loadavg))
-	 (next    (cadr loadavg))
-	 (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where
-					       ;; numcpus (or could be
-					       ;; maxload) is zero,
-					       ;; crude fallback is to
-					       ;; at least use 1
-	 (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next?
-			       0
-			       next))) ;; we will force a conservative calculation any time next is large.
-	 (first-next-avg    (/ (+ first next) 2))
-	 ;; add some randomness to the time to break any alignment
-	 ;; where netbatch dumps many jobs to machines simultaneously
-         (adjwait           (min (+ 300 (random 10)) (abs (* (+ (random 10)
-								(/ (- 1000 count) 10)
-								waitdelay)
-							     (- first adjmaxload) ))))
-	 (load-jump-limit   (configf:lookup-number *configdat* "setup" "load-jump-limit")))
-    ;; let's let the user know once in a long while that load checking
-    ;; is happening but not constantly report it
-    (if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
-	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
-			  ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
+(define (common:wait-for-cpuload maxnormload numcpus-in
+				 #!key (count 1000)
+				 (msg #f)(remote-host #f)(num-tries 5))
+  (let* ((loadavg     (common:get-cpu-load remote-host))
+	  ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
+	 (numcpus     (if (<= 1 numcpus-in)
+			  (common:get-num-cpus remote-host)
+			  numcpus-in))
+	 (first       (car loadavg))
+	 (next        (cadr loadavg))
+	 (adjmaxload  (* maxnormload (max 1 numcpus))) ;; possible bug
+						   ;; where numcpus
+						   ;; (or could be
+						   ;; maxload) is
+						   ;; zero, crude
+						   ;; fallback is to
+						   ;; at least use 1
+	 ;; effective load accounts for load jumps, this should eliminate all the first-next-avg, adjwait, load-jump-limit
+	 ;; etc.
+	 (effective-load    (common:get-intercept first next))
+	 (recommended-delay (common:get-delay effective-load numcpus))
+	 (effective-host    (or remote-host "localhost"))
+	 (normalized-effective-load (/ effective-load numcpus))
+	 (will-wait                 (> normalized-effective-load maxnormload)))
+    (if (> recommended-delay 0)
+	(let* ((actual-delay (min recommended-delay 30)))
+	  (debug:print-info 0 *default-log-port* "Load is high, delaying " actual-delay " seconds.")
+	  (thread-sleep! actual-delay)))
+    
     (cond
+     ;; bad data, try again to get the data
      ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
 	   (> num-tries 0))
-      (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.")
+      (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load "
+		   first ", we'll sleep 10s and try " num-tries " more times.")
       (thread-sleep! 10)
-      (common:wait-for-cpuload maxload-in numcpus-in waitdelay
-			       count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1)))
-     ((and (> first adjmaxload)
+      (common:wait-for-cpuload maxnormload numcpus-in
+			       count: count remote-host: remote-host num-tries: (- num-tries 1)))
+     ((not will-wait) ;; load is within the limit; checked after the bad-data clause because negative load also yields will-wait = #f
+      (if (common:low-noise-print 30 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))
+	  (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing.")))
+     ;; need to wait for load to drop
+     ((and will-wait ;; (> first adjmaxload)
 	   (> count 0))
       (debug:print-info 0 *default-log-port*
-			"Delaying " adjwait
-			" seconds due to load " first
+			"Delaying 15" ;; adjwait
+			" seconds due to normalized effective load " normalized-effective-load ;; first
 			" exceeding max of " adjmaxload
 			" on server " (or remote-host (get-host-name))
-			" (normalized load-limit: " maxload ") " (if msg msg ""))
-      (thread-sleep! adjwait)
-      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
-     ((and (> loadjmp (cond
-		       (load-jump-limit load-jump-limit)
-		       ((> numcpus 8)(/ numcpus 2))
-		       ((> numcpus 4)(/ numcpus 1.2))
-		       (else 0.5)))
-	   (> count 0))
-      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". "
-			(if msg msg ""))
-      (thread-sleep! adjwait)
-      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
+			" (normalized load-limit: " maxnormload ") " (if msg msg ""))
+      (thread-sleep! 15) ;; adjwait)
+      (common:wait-for-cpuload maxnormload numcpus count: (- count 1) msg: msg remote-host: remote-host)
+      ;; put the message here to indicate came out of waiting
+      (debug:print-info 1 *default-log-port*
+			"On host: " effective-host
+			", effective load: " effective-load
+			", numcpus: " numcpus
+			", normalized effective load: " normalized-effective-load
+			))
+     ;; overloaded and count expired (i.e. went to zero)
      (else
-      (if (> num-tries 0)
-	  (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost")))
-	      (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing."))
-	  (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing."))))))
+      (if (> num-tries 0) ;; should be "num-tries-left". 
+	  (if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host))
+	      (debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of "
+			   normalized-effective-load "  continuing."))
+	  (debug:print 0 *default-log-port* "Load on " effective-host ", "
+		       first" could not be retrieved. Giving up and continuing."))))))
 
+;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
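+;; (commented-out earlier variant of common:wait-for-cpuload; it still takes the waitdelay and force-maxload arguments)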
+;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5))
+;;   (let* ((loadavg (common:get-cpu-load remote-host))
+;; 	 (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
+;; 		      (common:get-num-cpus remote-host)
+;; 		      numcpus-in))
+;; 	 (maxload (if force-maxload
+;; 		      maxload-in
+;; 		      (if (number? maxload-in)
+;; 			  (max maxload-in 0.5)
+;; 			  0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
+;; 	 (first   (car loadavg))
+;; 	 (next    (cadr loadavg))
+;; 	 (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where
+;; 					       ;; numcpus (or could be
+;; 					       ;; maxload) is zero,
+;; 					       ;; crude fallback is to
+;; 					       ;; at least use 1
+;; 	 (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next?
+;; 			       0
+;; 			       next))) ;; we will force a conservative calculation any time next is large.
+;; 	 (first-next-avg    (/ (+ first next) 2))
+;; 	 ;; add some randomness to the time to break any alignment
+;; 	 ;; where netbatch dumps many jobs to machines simultaneously
+;;          (adjwait           (min (+ 300 (random 10)) (abs (* (+ (random 10)
+;; 								(/ (- 1000 count) 10)
+;; 								waitdelay)
+;; 							     (- first adjmaxload) ))))
+;; 	 (load-jump-limit   (configf:lookup-number *configdat* "setup" "load-jump-limit"))
+;; 	 ;; effective load accounts for load jumps, this should eliminate all the first-next-avg, adjwait, load-jump-limit
+;; 	 ;; etc.
+;; 	 (effective-load    (common:get-intercept first next))
+;; 	 (effective-host    (or remote-host "localhost"))
+;; 	 (normalized-effective-load (/ effective-load numcpus))
+;; 	 (will-wait                 (> normalized-effective-load maxload)))
+;; 	 
+;;     ;; let's let the user know once in a long while that load checking
+;;     ;; is happening but not constantly report it
+;;     #;(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
+;; 	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
+;;     ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
+;; 
+;;     (debug:print-info 1 *default-log-port*
+;; 		      "On host: " effective-host
+;; 		      ", effective load: " effective-load
+;; 		      ", numcpus: " numcpus
+;; 		      ", normalized effective load: " normalized-effective-load
+;; 		      )
+;;     
+;;     (cond
+;;      ;; bad data, try again to get the data
+;;      ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
+;; 	   (> num-tries 0))
+;;       (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.")
+;;       (thread-sleep! 10)
+;;       (common:wait-for-cpuload maxload-in numcpus-in waitdelay
+;; 			       count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1)))
+;;      ;; need to wait for load to drop
+;;      ((and will-wait ;; (> first adjmaxload)
+;; 	   (> count 0))
+;;       (debug:print-info 0 *default-log-port*
+;; 			"Delaying " 15 ;; adjwait
+;; 			" seconds due to normalized effective load " normalized-effective-load ;; first
+;; 			" exceeding max of " adjmaxload
+;; 			" on server " (or remote-host (get-host-name))
+;; 			" (normalized load-limit: " maxload ") " (if msg msg ""))
+;;       (thread-sleep! 15) ;; adjwait)
+;;       (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
+;;      ((and (> loadjmp (cond
+;; 		       (load-jump-limit load-jump-limit)
+;; 		       ((> numcpus 8)(/ numcpus 2))
+;; 		       ((> numcpus 4)(/ numcpus 1.2))
+;; 		       (else 0.5)))
+;; 	   (> count 0))
+;;       (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". "
+;; 			(if msg msg ""))
+;;       (thread-sleep! adjwait)
+;;       (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
+;;      (else
+;;       (if (> num-tries 0)
+;; 	  (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost")))
+;; 	      (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing."))
+;; 	  (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing."))))))
+;; 
 (define (get-uname . params)
   (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
 	 (uname #f))
     (if (null? (car uname-res))
 	"unknown"

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -1221,11 +1221,12 @@
       (if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
           (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
 
       ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
       (if maxhomehostload
-          (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))
+          (common:wait-for-homehost-load maxhomehostload
+					 (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))
       
       (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry)
       (runs:incremental-print-results run-id)
       (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running)
       (runs:shrink-can-run-more-tests-count runsdat)  ;; DELAY TWEAKER (still needed?)
@@ -1672,10 +1673,16 @@
 	  (debug:print-info 4 *default-log-port* "OUTER COND: (not items)")
 	  (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
 		   (not (null? tal)))
 	      (loop (car tal)(cdr tal) reg reruns))
 	  (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
+
+	  ;; This would be a good place to block on homehost load
+
+
+
+	  
 	  (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
 	  (let ((loop-list (runs:process-expanded-tests runsdat testdat)))
             (if loop-list (apply loop loop-list))))
 
 	 ;; items processed into a list but not came in as a list been processed

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -135,11 +135,11 @@
 							   "")
 		      ;; " -log " logfile
 		      " -m testsuite:" testsuite
 		      " " profile-mode
 		      )) ;; (conc " >> " logfile " 2>&1 &")))))
-	 (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread"))
+	 (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
          (load-limit  (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
     ;; we want the remote server to start in *toppath* so push there
     (push-directory areapath)
     (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
     (thread-start! log-rotate)
@@ -155,11 +155,11 @@
 	  (setenv "TARGETHOST" target-host)))
       
     (setenv "TARGETHOST_LOGF" logfile)
     (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
     ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
-    (common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
+    #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
     (system (conc "nbfake " cmdln))
     (unsetenv "TARGETHOST_LOGF")
     (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
     (thread-join! log-rotate)
     (pop-directory)))
@@ -317,14 +317,32 @@
 (define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
   (if *my-client-signature* *my-client-signature*
       (let ((sig (server:mk-signature)))
         (set! *my-client-signature* sig)
         *my-client-signature*)))
+
+;; block until logs/server-start-last under areapath is absent or more than three seconds old, then touch it
+;;
+(define (server:wait-for-server-start-last-flag areapath)
+  (let ((flag-file (conc areapath "/logs/server-start-last")))
+    (let wait-loop ()
+      (if (and (file-exists? flag-file)
+               (let ((mtime (file-modification-time flag-file)))
+                 (<= (- (current-seconds) mtime) 3)))
+          (begin
+            (thread-sleep! 5) ;; touched too recently, look again in five seconds
+            (wait-loop))
+          (system (conc "touch " flag-file)))))) ;; lazy but safe
+	      
 
 ;; kind start up of servers, wait 40 seconds before allowing another server for a given
 ;; run-id to be launched
+;;
 (define (server:kind-run areapath)
+  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
+  ;; and wait for it to be at least 3 seconds old
+  (server:wait-for-server-start-last-flag areapath)
   (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
       (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
 	     (call-num     (car last-run-dat))
 	     (when-run     (cadr last-run-dat))
 	     (run-delay    (+ (case call-num
@@ -340,10 +358,12 @@
 		  (server:run areapath)
 		  (thread-sleep! 2) ;; don't release the lock for at least a few seconds
 		  (common:simple-file-release-lock lock-file)))
 	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))
 
+;; this one seems to be the general entry point
+;;
 (define (server:start-and-wait areapath #!key (timeout 60))
   (let ((give-up-time (+ (current-seconds) timeout)))
     (let loop ((server-url (server:check-if-running areapath))
 	       (try-num    0))
       (if (or server-url