Index: commonmod.scm
==================================================================
--- commonmod.scm
+++ commonmod.scm
@@ -184,10 +184,11 @@
 make-and-init-bigdata
 call-with-environment-variables
 common:simple-file-lock
 common:simple-file-lock-and-wait
 common:simple-file-release-lock
+common:with-simple-file-lock
 common:fail-safe
 get-file-descriptor-count
 common:get-this-exe-fullpath
 common:get-sync-lock-filepath
 common:find-local-megatest
@@ -1242,10 +1243,17 @@
 (define (common:simple-file-release-lock fname)
   (handle-exceptions
       exn
       #f ;; I don't really care why this failed (at least for now)
     (delete-file* fname)))
+
+(define (common:with-simple-file-lock fname proc)
+  (let* ((lkfname (conc fname ".lock")))
+    (common:simple-file-lock-and-wait lkfname)
+    (let ((res (proc)))
+      (common:simple-file-release-lock lkfname)
+      res)))
 
 ;;======================================================================
 ;; PUlled below from common.scm
 ;;======================================================================
 

Index: configfmod.scm
==================================================================
--- configfmod.scm
+++ configfmod.scm
@@ -49,10 +49,11 @@
 	 configf:write-alist
 	 configf:write-config
 	 find-config
 	 getenv
 	 mytarget
+	 my-with-lock
 	 nice-path
 	 process:cmd-run->list
 	 runconfig:read
 	 runconfigs-get
 	 safe-setenv
@@ -114,14 +115,19 @@
 ;;======================================================================
 
 ;; while targets are Megatest specific they are a useful concept
 (define mytarget (make-parameter #f))
 
+;; fake locker
+(define (fake-locker fname proc)(proc))
+
 ;; locking is optional, many environments don't care (e.g. running on one machine)
 ;; NOTE: the locker must follow the same syntax as with-dot-lock*
+;;       with-dot-lock* has problems if /tmp and the file being
+;;       locked are not on the same filesystem
 ;;
-(define my-with-lock (make-parameter with-dot-lock*))
+(define my-with-lock (make-parameter fake-locker)) ;; with-dot-lock*))
 
 ;;======================================================================
 ;; move debug stuff to separate module then put these back where they belong
 ;;======================================================================
 ;;======================================================================
@@ -1190,11 +1196,11 @@
 
 ;;======================================================================
 ;; DO THE LOCKING AROUND THE CALL
 ;;======================================================================
 ;;
-(define (configf:write-alist cdat fname)
+(define (configf:write-alist cdat fname #!optional (check-written #f))
   ;; (if (not (common:faux-lock fname))
   ;; (debug:print 0 *default-log-port* "INFO: NEED LOCKING ADDED HERE " fname)
   ((my-with-lock)
    fname
    (lambda ()
@@ -1202,26 +1208,27 @@
             (res
              (begin
                (with-output-to-file fname ;; first write out the file
 		 (lambda ()
                    (pp dat)))
-               ;; I don't like this. It makes write-alist opaque and complicated. -mrw-
-               (if (file-exists? fname)   ;; now verify it is readable
-                   (if (configf:read-alist fname)
-                       #t ;; data is good.
-                       (begin
-			 (handle-exceptions
-			  exn
-			  (begin
-			    (debug:print 0 *default-log-port* "deleting " fname " failed, exn=" exn)
-			    #f)
-			  (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.")
-			  (delete-file fname))
-			 #f))
-                   #f))))
+               ;; I don't like this. It makes write-alist complicated
+	       ;; move to something like write-and-verify-alist. -mrw-
+               (if check-written
+		   (if (file-exists? fname)   ;; now verify it is readable
+		       (if (configf:read-alist fname)
+			   'data-good ;; data is good.
+			   (handle-exceptions
+			       exn
+			       (begin
+				 (debug:print 0 *default-log-port* "deleting " fname " failed, exn=" exn)
+				 'data-bad)
+			     (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.")
+			     (delete-file fname)))
+		       'data-not-there)
+		   'data-not-checked))))
        res))))
   
 (define (common:get-fields cfgdat)
   (let ((fields (hash-table-ref/default cfgdat "fields" '())))
     (map car fields)))
 
 )

Index: dbmod.scm
==================================================================
--- dbmod.scm
+++ dbmod.scm
@@ -3454,12 +3454,14 @@
     (db:with-db
      dbstruct
      run-id
      #f
      (lambda (db)
-       (let* ((stmth (db:get-cache-stmth dbstruct db qry)))
-	 (sqlite3:first-result stmth run-id))))))
+       (let* (#;(stmth (db:get-cache-stmth dbstruct db qry)))
+	 #;(sqlite3:first-result stmth run-id)
+	 (sqlite3:first-result db qry run-id)
+	 )))))
 
 ;; For a given testname how many items are running? Used to determine
 ;; probability for regenerating html
 ;;
 (define (db:get-count-tests-running-for-testname dbstruct run-id testname)

Index: launchmod.scm
==================================================================
--- launchmod.scm
+++ launchmod.scm
@@ -663,11 +663,12 @@
 	  (let* ((tconfig-fname   (conc work-area "/.testconfig"))
 		 (tconfig-tmpfile (conc tconfig-fname ".tmp"))
 		 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs)))
 		 (scripts (configf:get-section tconfig "scripts")))
 	    ;; create .testconfig file
-	    (configf:write-alist tconfig tconfig-tmpfile)
+	    (configf:write-alist tconfig tconfig-tmpfile #t) ;; the #t forces a check of the written data
+	    (assert (file-exists? tconfig-tmpfile) "FATAL: We just wrote the dang file, how can it not exist?")
 	    (move-file tconfig-tmpfile tconfig-fname #t)
 	    (delete-file* ".final-status")
 
 	    ;; extract scripts from testconfig and write them to files in test run dir
 	    (for-each

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -169,10 +169,12 @@
 ;;   (return-method 'direct)
   
   ;; ulex parameters
   ;; (work-method   'mailbox)
   ;; (return-method 'mailbox)
+
+(my-with-lock common:with-simple-file-lock)
   
 ;; fake out readline usage of toplevel-command
 (define (toplevel-command . a) #f)
 (define *didsomething* #f)  
 (define *db* #f) ;; this is only for the repl, do not use in general!!!!

Index: runsmod.scm
==================================================================
--- runsmod.scm
+++ runsmod.scm
@@ -244,10 +244,23 @@
 	(begin
 	  (hash-table-set! *runs:denoise* key currtime)
 	  #t)
 	#f)))
 
+(define *too-soon-delays* (make-hash-table))
+
+;; to-soon delay, when matching event happened in less than dseconds delay wseconds
+;;
+(define (runs:too-soon-delay key dseconds wseconds)
+  (let* ((last-time (hash-table-ref/default *too-soon-delays* key #f)))
+    (if (and last-time
+	     (< (- (current-seconds) last-time) dseconds))
+	(begin
+	  (debug:print-info 0 *default-log-port* "Whoa, slow down there ... "key" has been too recently seen.")
+	  (thread-sleep! wseconds)))
+    (hash-table-set! *too-soon-delays* key (current-seconds))))
+
 (define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
 
   ;; Take advantage of a good place to exit if running the one-pass methodology
   (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
 	   (args:get-arg "-one-pass"))
@@ -1467,11 +1480,13 @@
 			   newtal:      newtal
 			   itemmaps:    itemmaps
 			   ;; prereqs-not-met: prereqs-not-met
 			   )))
 	(runs:dat-regfull-set! runsdat regfull)
-    
+
+	(runs:too-soon-delay (conc "loop delay " hed) 1 1)
+	
 	(if (> num-running 0)
             (set! last-time-some-running (current-seconds)))
 
         (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
             (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
@@ -1494,10 +1509,11 @@
 	      (if (or (not (null? tal))(not (null? reg)))
 		  (loop (runs:queue-next-hed tal reg reglen regfull)
 			(runs:queue-next-tal tal reg reglen regfull)
 			(runs:queue-next-reg tal reg reglen regfull)
 			reruns))))
+	
         ;; (loop (car tal)(cdr tal) reg reruns))))
 
 	(runs:incremental-print-results run-id)
 	(debug:print 4 *default-log-port* "TOP OF LOOP => "
 		     "test-name: " test-name
@@ -1725,10 +1741,11 @@
 		   (equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
 	       (> num-running 0))
 	  (begin
 	    ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
 	    ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0))
+	    (thread-sleep! 5) ;; let's always sleep, prevents abutting calls to rum:get-count-tests-running-for-run-id - didn't help
 	    (if (> (current-seconds)(+ last-time-incomplete 900))
 		(let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id)))
 		  (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id
 				    ". Running as pid " (current-process-id) " on " (get-host-name))
 		  (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
@@ -1735,11 +1752,10 @@
 		  (runs:find-and-mark-incomplete-and-check-end-of-run run-id #f)
 		  (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
 				    " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
 				    (time->string (seconds->local-time (current-seconds))))))
 	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
-	    (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1))
 	    (wait-loop (rmt:get-count-tests-running-for-run-id run-id)
 		       num-running))))
     ;; LET* ((test-record
     ;; we get here on "drop through". All done!
     ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed.