Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -817,14 +817,14 @@
 
 (define *common:ended-states*       ;; states which indicate the test is stopped and will not proceed
   '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE" ))
 
 (define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked
-  '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD"))
+  '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD" "CHECK"))
 
 (define *common:well-ended-states* ;; an item's prereq in this state allows item to proceed
-  '("PASS" "WARN" "CHECK" "WAIVED" "SKIP"))
+  '("PASS" "WARN" "WAIVED" "SKIP"))
 
 ;; BBnote: *common:running-states* used from db:set-state-status-and-roll-up-items
 (define *common:running-states*     ;; test is either running or can be run
   '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED" "STARTED"))
 
@@ -2208,37 +2208,31 @@
 ;;
 (define (common:wait-for-cpuload maxnormload numcpus-in
 				 #!key (count 1000)
 				 (msg #f)(remote-host #f)(num-tries 5))
   (let* ((loadavg     (common:get-cpu-load remote-host))
-	  ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
-	 (numcpus     (if (<= 1 numcpus-in)
-			  (common:get-num-cpus remote-host)
-			  numcpus-in))
-	 (first       (car loadavg))
-	 (next        (cadr loadavg))
-	 (adjmaxload  (* maxnormload (max 1 numcpus))) ;; possible bug
-						   ;; where numcpus
-						   ;; (or could be
-						   ;; maxload) is
-						   ;; zero, crude
-						   ;; fallback is to
-						   ;; at least use 1
-	 ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
-	 ;; etc.
-	 (effective-load    (common:get-intercept first next))
-	 (recommended-delay (common:get-delay effective-load numcpus))
-	 (effective-host    (or remote-host "localhost"))
-	 (normalized-effective-load (/ effective-load numcpus))
-	 (will-wait                 (> normalized-effective-load maxnormload)))
-    (if (> recommended-delay 1) 
-	(let* ((actual-delay (min recommended-delay 30)))
-	  (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
-	      (debug:print-info 0 *default-log-port* "Load control, delaying "
+         ;; numcpus must never be zero: it is the divisor for normalized-effective-load, so clamp it to at least 1.
+         ;; NOTE(review): (<= 1 numcpus-in) re-queries for every numcpus-in >= 1, not only for a possibly stale default of 1 - confirm intent
+         (numcpus     (max 1 (if (<= 1 numcpus-in) (common:get-num-cpus remote-host) numcpus-in)))
+         (first       (car loadavg))
+         (next        (cadr loadavg))
+         (adjmaxload  (* maxnormload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude
+                                                       ;; fallback is to at least use 1
+         ;; effective load accounts for load jumps, this should eliminate all the first-next-avg, adjwait, load-jump-limit
+         ;; etc.
+         (effective-load    (common:get-intercept first next))
+         (recommended-delay (common:get-delay effective-load numcpus))
+         (effective-host    (or remote-host "localhost"))
+         (normalized-effective-load (/ effective-load numcpus))
+         (will-wait                 (> normalized-effective-load maxnormload)))
+    (if (and will-wait (> recommended-delay 1))
+        (let* ((actual-delay (min recommended-delay 30)))
+          (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
+              (debug:print-info 0 *default-log-port* "Load control, delaying "
 				actual-delay " seconds to maintain safe load. current normalized effective load is "
 				normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load))
-	  (thread-sleep! actual-delay)))
+	(thread-sleep! actual-delay)))
     
     (cond
      ;; bad data, try again to get the data
      ((not will-wait)
       (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -4659,11 +4659,11 @@
 
 
 
 ;; the new prereqs calculation, looks also at itempath if specified
 ;; all prereqs must be met
-;;    if prereq test with itempath='' is COMPLETED and PASS, WARN, CHECK, or WAIVED then prereq is met
+;;    if prereq test with itempath='' is in *common:well-ended-states*, then prereq is met
 ;;    if prereq test with itempath=ref-item-path and COMPLETED with PASS, WARN, CHECK, or WAIVED then prereq is met
 ;;
 ;; Note: mode 'normal means that tests must be COMPLETED and ok (i.e. PASS, WARN, CHECK, SKIP or WAIVED)
 ;;       mode 'toplevel means that tests must be COMPLETED only
 ;;       mode 'itemmatch or 'itemwait means that tests items must be COMPLETED and (PASS|WARN|WAIVED|CHECK) [[ NB// NOT IMPLEMENTED YET ]]
@@ -4674,10 +4674,11 @@
 ;;    2. any NOT completed and good? if yes => return those as prereqs not met, if no => return null list
 ;; 
 ;; (define (db:get-prereqs-not-met dbstruct run-id waitons ref-item-path mode)
 (define (db:get-prereqs-not-met dbstruct run-id waitons ref-test-name ref-item-path mode itemmaps) ;; #!key (mode '(normal))(itemmap #f))
   ;; BBnote - rollup of an itemized test's overall state/status done in db:set-state-status-and-roll-up-items
+  (debug:print 4 *default-log-port* "db:get-prereqs-not-met: " waitons)
   (append
    (if (member 'exclusive mode)
        (let ((running-tests (db:get-tests-for-run dbstruct
 						  #f  ;; run-id of #f means for all runs. 
 						  (if (string=? ref-item-path "")   ;; testpatt
@@ -4700,10 +4701,12 @@
 	;;	    (conc (db:test-get-testname testdat)
 	;;		  "/"
 	;;		  (db:test-get-item-path testdat))))
 	 running-tests) ;; calling functions want the entire data
        '())
+
+
 
    ;; collection of: for each waiton -
    ;;   if this ref-test-name is an item in an itemized test and mode is itemwait/itemmatch:
    ;;     if waiton is not itemized - if waiton is not both completed and in ok status, add as unmet prerequisite
    ;;     if waiton is itemized:

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -739,11 +739,11 @@
 			  (waiton-itemized (and waiton-tconfig
 						(or (hash-table-ref/default waiton-tconfig "items" #f)
 						    (hash-table-ref/default waiton-tconfig "itemstable" #f))))
 			  (itemmaps        (tests:get-itemmaps config))  ;; (configf:lookup config "requirements" "itemmap"))
 			  (new-test-patts  (tests:extend-test-patts test-patts hed waiton itemmaps hed-itemized-waiton))) 
-		     (debug:print-info 0 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items")
+		     (debug:print-info 2 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items")
 		     ;; need to account for test-patt here, if I am test "a", selected with a test-patt of "hed/b%"
 		     ;; and we are waiting on "waiton" we need to add "waiton/,waiton/b%" to test-patt
 		     ;; is this satisfied by merely appending "/" to the waiton name added to the list?
 		     ;;
 		     ;; This approach causes all of the items in an upstream test to be run 
@@ -761,11 +761,11 @@
                                    (set! test-patts new-test-patts))
                                  (begin
                                    (debug:print-info 0 *default-log-port* "Waitor(s) not yet on testpatt for " waiton ", setting up to re-process it")
                                    (set! tal (append (cons waiton tal)(list hed)))))
                              (begin
-                               (debug:print-info 0 *default-log-port* "Adding non-itemized test " waiton " to required-tests")
+                               (debug:print-info 2 *default-log-port* "Adding non-itemized test " waiton " to required-tests")
                                (set! required-tests (cons waiton required-tests))
                                (set! test-patts new-test-patts)))
 			 (begin
 			   (debug:print-info 0 *default-log-port* "No testconfig info yet for " waiton ", setting up to re-process it")
 			   (set! tal (append (cons waiton tal)(list hed))))) ;; (cons (conc waiton "/") required-tests))
@@ -898,10 +898,11 @@
 ;;	         (tal         (cdr sorted-test-names))
 ;;	         (reg         '()) ;; registered, put these at the head of tal 
 ;;	         (reruns      '()))
 (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps)
   (let* ((loop-list       (list hed tal reg reruns))
+         (junk (debug:print-info 4 *default-log-port* "expand-items calling rmt:get-prereqs-not-met"))
 	 (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)))
 			    (if (list? res)
 				res
 				(begin
 				  (debug:print 0 *default-log-port*
@@ -1091,11 +1092,11 @@
      ((null? runnables)
       (debug:print-info 4 *default-log-port* "cond branch - "  "ei-7")
       #f) ;; if we get here and non-completed is null then it is all over.
      (else
       (debug:print-info 4 *default-log-port* "cond branch - "  "ei-8")
-      (debug:print 0 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now")
+      (debug:print 2 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now")
       (list (car newtal)(cdr newtal) reg reruns)))))
 
 (define (runs:mixed-list-testname-and-testrec->list-of-strings inlst)
   (if (null? inlst)
       '()
@@ -1286,11 +1287,13 @@
       ;; we are going to reset all the counters for test retries by setting a new hash table
       ;; this means they will increment only when nothing can be run
       (set! *max-tries-hash* (make-hash-table))
       
       (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat)
-      (runs:incremental-print-results run-id)
+      ;; incremental results are printed only when (debug:debug-mode 3) holds
+      (if (debug:debug-mode 3)
+          (runs:incremental-print-results run-id))
       (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running)
       (runs:shrink-can-run-more-tests-count runsdat)  ;; DELAY TWEAKER (still needed?)
       ;; (thread-sleep! *global-delta*)
       (if (or (not (null? tal))(not (null? reg)))
 	  (runs:loop-values tal reg reglen regfull reruns) ;; hed should be dropped at this time
@@ -1668,12 +1671,13 @@
 		  (loop (runs:queue-next-hed tal reg reglen regfull)
 			(runs:queue-next-tal tal reg reglen regfull)
 			(runs:queue-next-reg tal reg reglen regfull)
 			reruns))))
         ;; (loop (car tal)(cdr tal) reg reruns))))
-
-	(runs:incremental-print-results run-id)
+        ;; incremental results are printed only when (debug:debug-mode 3) holds
+        (if (debug:debug-mode 3)
+            (runs:incremental-print-results run-id))
 	(debug:print 4 *default-log-port* "TOP OF LOOP => "
 		     "test-name: " test-name
 		     "\n  hed:         " hed
 		     "\n  tal:         " (runs:pretty-long-list tal)
 		     "\n  reg:         " reg
@@ -1852,16 +1856,18 @@
                       (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)
                       )
                   )
 		;; if can't run more just loop with next possible test
 		(loop (car newtal)(cdr newtal) reg reruns))))
-         
+        
+
 	 ;; this case should not happen, added to help catch any bugs
 	 ((and (list? items) itemdat)
           (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-5")
 	  (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this")
 	  (exit 1))
+
 	 ((not (null? reruns))
           (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-6")
 	  (let* ((newlst (tests:filter-non-runnable run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED,
 		 (junked (lset-difference equal? tal newlst)))
 	    (debug:print-info 4 *default-log-port* "full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal)
@@ -1870,10 +1876,11 @@
 	    (set! num-retries (+ num-retries 1))
 	    ;; (thread-sleep! (+ 1 *global-delta*))
 	    (if (not (null? newlst))
 		;; since reruns have been tacked on to newlst create new reruns from junked
 		(loop (car newlst)(cdr newlst) reg (delete-duplicates junked)))))
+
 	 ((not (null? tal))
           (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-7")
 	  (debug:print-info 4 *default-log-port* "I'm pretty sure I shouldn't get here."))
 	 ((not (null? reg)) ;; could we get here with leftovers?
           (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-8")
@@ -2109,11 +2116,11 @@
 	    (else (set! runflag #f)))
 	   (debug:print 4 *default-log-port* "RUNNING => runflag: " runflag " STATE: " (test:get-state testdat) " STATUS: " (test:get-status testdat))
 	   (if (not runflag)
 	       (if (not parent-test)
 		   (if (runs:lownoise (conc "not starting test" full-test-name) 60)
-		       (debug:print 1 *default-log-port* "NOTE: Not starting test " full-test-name " as it is state \"" (test:get-state testdat) 
+		       (debug:print 3 *default-log-port* "NOTE: Not starting test " full-test-name " as it is state \"" (test:get-state testdat) 
 				    "\" and status \"" (test:get-status testdat) "\", use -rerun \"" (test:get-status testdat)
 				    "\" or -force to override")))
 	       ;; NOTE: No longer be checking prerequisites here! Will never get here unless prereqs are
 	       ;;       already met.
 	       ;; This would be a great place to do the process-fork
@@ -2394,11 +2401,11 @@
 		    (tasks:kill-runner target run-name testpatt)
 		    ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt))
 		    (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
 		   ((set-state-status)
 		    ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
-		    (debug:print 1 *default-log-port* "Modifying state and status for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
+		    (debug:print 2 *default-log-port* "Modifying state and status for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
 		   ((print-run)
 		    (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
 		    action)
 		   ((run-wait)
 		    (debug:print 1 *default-log-port* "Waiting for run " runkey ", run=" runnamepatt " to complete"))
@@ -2535,11 +2542,11 @@
                                     ) ; end let
                                   ); end cond has-subrun
 
                                  (else
                                   ;; BB - TODO - consider backgrounding to threads to delete tests (work below) 
-                                  (debug:print-info 0 *default-log-port* "test: " test-name " itest-state: " test-state)
+                                  (debug:print-info 2 *default-log-port* "test: " test-name " itest-state: " test-state)
                                   (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
                                       (begin
                                         (if (not (hash-table-ref/default test-retry-time test-fulln #f))
                                             (begin
                                               ;; want to set to REMOVING BUT CANNOT do it here?
@@ -2730,11 +2737,11 @@
 	 )
     (case clean-mode
       ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f))
       ((remove-all)      (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f))
       ((archive-remove)  (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f)))
-    (debug:print-info 1 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir)
+    (debug:print-info 2 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir)
     (if (and real-dir 
 	     (> (string-length real-dir) 5)
 	     (common:file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc.
 	(let* ((realpath (resolve-pathname run-dir)))
 	  (debug:print-info 1 *default-log-port* "Recursively removing " realpath)
@@ -3058,17 +3065,17 @@
 		 (files    (if (common:file-exists? runtop)
 			       (append (glob (conc runtop "/.megatest*"))
 				       (glob (conc runtop "/.runconfig*")))
 			       '())))
 	    (if (null? files)
-		(debug:print-info 0 *default-log-port* "No cached megatest or runconfigs files found. None removed.")
+		(debug:print-info 2 *default-log-port* "No cached megatest or runconfigs files found. None removed.")
 		(begin
-		  (debug:print-info 0 *default-log-port* "Removing cached files:\n    " (string-intersperse files "\n    "))
+		  (debug:print-info 2 *default-log-port* "Removing cached files:\n    " (string-intersperse files "\n    "))
 		  (for-each 
 		   (lambda (f)
 		     (handle-exceptions
 			 exn
 			 (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f ", exn=" exn)
 		       (delete-file f)))
 		   files))))
 	  (debug:print-error 0 *default-log-port* "-clean-cache requires -runname."))
       (debug:print-error 0 *default-log-port* "-clean-cache requires -target or -reqtarg")))

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -154,11 +154,11 @@
 	(begin
 	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
 	  (setenv "TARGETHOST" target-host)))
       
     (setenv "TARGETHOST_LOGF" logfile)
-    (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
+    (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
     (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
     (system (conc "nbfake " cmdln))
     (unsetenv "TARGETHOST_LOGF")
     (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
     (thread-join! log-rotate)
@@ -367,13 +367,13 @@
       (let ((sig (server:mk-signature)))
         (set! *my-client-signature* sig)
         *my-client-signature*)))
 
 
-;; if server-start-last exists, and wasn't old enough, wait <idle time>, then call this function recursively until it is old enough.
+;; if server-start-last exists, and wasn't old enough, wait <idle time> + 1, then call this function recursively until it is old enough.
 ;; if it is old enough, overwrite it and wait 0.25 seconds.
-;; if it then has the wrong server key, wait <idle time> and call this function recursively.
+;; if it then has the wrong server key, wait <idle time> + 1 and call this function recursively.
 ;;
 (define (server:wait-for-server-start-last-flag areapath)
   (let* ((start-flag (conc areapath "/logs/server-start-last"))
 	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
 	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
@@ -380,30 +380,31 @@
 	 (server-key (conc (get-host-name) "-" (current-process-id))))
     (if (file-exists? start-flag)
 	(let* ((fmodtime (file-modification-time start-flag))
 	       (delta    (- (current-seconds) fmodtime))
 	       (old-enough   (> delta idletime))
+               (new-server-key "")
               )
 
           ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t.
-          ;; 
-	  (if (and old-enough
+	  ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process.
+           (if (and old-enough
 		   (begin
                      (debug:print-info 0 *default-log-port* "Writing " start-flag)
 		     (with-output-to-file start-flag (lambda () (print server-key)))
 		     (thread-sleep! 0.25)
-		     (let ((res (with-input-from-file start-flag (lambda () (read-line)))))
-		       (equal? server-key res)))
+		     (set! new-server-key (with-input-from-file start-flag (lambda () (read-line))))
+		     (equal? server-key new-server-key))
                 )
 	      #t
 
-           ;; If either of the above conditions is not true, print a "Gating server start" message, wait <idle-time>, then call this function recursively. 
+           ;; If either of the above conditions is not true, print a "Gating server start" message, wait <idle-time> + 1, then call this function recursively. 
 	      (begin
 		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
-				  (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "server key does not match" "too soon to start another server"))
+				  (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))
 
-		(thread-sleep! idletime)
+		(thread-sleep! ( + 1 idletime))
 		(server:wait-for-server-start-last-flag areapath)))))))
 
 
         
 ;; kind start up of server, wait before allowing another server for a given
@@ -419,11 +420,11 @@
 	(let* ((start-flag (conc areapath "/logs/server-start-last")))
 	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
 	  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
 	  (system (conc "touch " start-flag)) ;; lazy but safe
 	  (server:run areapath)
-	  (thread-sleep! 18) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
+	  (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
 	  (common:simple-file-release-lock lock-file)))
 
       (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")
    )
 )

Index: utils/mk_wrapper
==================================================================
--- utils/mk_wrapper
+++ utils/mk_wrapper
@@ -44,10 +44,19 @@
 else
   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$libdir
 fi
 
 export MT_SQLITE3_EXE=$sqlite3_exe
+
+# Drop inherited proxy settings, both lowercase and uppercase spellings.
+# unset succeeds silently for names that are not set, so no per-variable
+# test (and no bash-only indirect expansion) is needed.
+for proxy_var in http_proxy https_proxy HTTP_PROXY HTTPS_PROXY
+do
+  unset "\$proxy_var"
+done
+unset proxy_var
 __EOF
 ) > $cfgfile
   echo 
 else
   echo "INFO: LD_LIBRARY_PATH not set" >&2