Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -1780,28 +1780,31 @@
 
 ;; check duration against test-run.dat file if it exists and update the value in
 ;; the db if necessary
 ;;
 (define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)
-  (let* ((datf             (conc run-dir ".mt_data/test-run.dat"))
+  (let* ((datf             (conc run-dir "/.mt_data/test-run.dat")) ;; run-dir is joined to .mt_data with an explicit slash
 	 (modt             (if (and (file-exists? datf)
 				    (file-read-access? datf))
 			       (file-modification-time datf)
 			       #f)) ;; (+ event-time run-duration))))
 	 (alt-run-duration (if modt
 			       (- modt event-time)
 			       #f)))
+    (debug:print 0 *default-log-port* "Test " test-id " datf " datf " modt " modt " alt-run-duration " alt-run-duration) ;; trace the values computed above before deciding; modt and alt-run-duration are #f when datf is missing or unreadable
     (if (and alt-run-duration
 	     (> alt-run-duration run-duration))
 	(begin
 	  (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration)
 	  (db:with-db
 	   dbstruct #f #f
 	   (lambda (db)
 	     (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id)
 	     #t)))
-	#f))) ;; #f = we did NOT adjust the time
+	(begin ;; reached when alt-run-duration is #f or is not greater than run-duration
+	  (debug:print 0 *default-log-port* "Test " test-id " run duration correct. No adjustment needed.") ;; NOTE(review): also printed when no test-run.dat was found
+	  #f)))) ;; #f = we did NOT adjust the time
 	      
 (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
   (let* ((incompleted '())
 	 (oldlaunched '())
 	 (toplevels   '())
@@ -4019,24 +4022,28 @@
      (lambda (db)
        (let ((tr-res
               (sqlite3:with-transaction
                db
                (lambda ()
-                 ;; NB// Pass the db so it is part fo the transaction
+                 ;; NB// Pass the db so it is part of the transaction
                  (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status
                  (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
-                     (let* ((state-status-counts  (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test
-			      						  (state-stauses (db:roll-up-rules state-status-counts state status))
-                          (newstate (car state-stauses))
-                          (newstatus (cadr state-stauses)))
-                       (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts)  " state-status-counts: "
-							(apply conc
-                  (map (lambda (x)
-                     (conc
-                     		(with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
-                              state-status-counts))); end debug:print
-   
+		     ;;                           item-path is used to exclude current state/status of THIS test
+                     (let* ((state-status-counts  (db:get-all-state-status-counts-for-test
+						   dbstruct run-id test-name item-path state status))
+			    (state-stauses        (db:roll-up-rules state-status-counts state status))
+                            (newstate             (car state-stauses))
+                            (newstatus            (cadr state-stauses)))
+                       (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name
+				    ":"item-path" newstate="newstate" newstatus="newstatus
+				    " len(sscs)="(length state-status-counts)  " state-status-counts: "
+				    (apply conc
+					   (map (lambda (x)
+						  (conc
+                     				   (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
+						state-status-counts))); end debug:print
+		       
                        (if tl-test-id
 			   (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
 		       ))))))
          (mutex-unlock! *db-transaction-mutex*)
          (if (and test-id state status (equal? status "AUTO")) 
@@ -4107,15 +4114,15 @@
        (let ((tr-res
               (sqlite3:with-transaction
                db
                (lambda ()
                    (let* ((state-status-counts  (db:get-all-state-status-counts-for-run dbstruct run-id))
-													(state-stauses (db:roll-up-rules state-status-counts #f #f ))
-                          (newstate (car state-stauses))
-                          (newstatus (cadr state-stauses))) 
+			  (state-stauses        (db:roll-up-rules state-status-counts #f #f ))
+                          (newstate             (car state-stauses))
+                          (newstatus            (cadr state-stauses))) 
                     (if (or (not (eq? newstate curr-state)) (not (eq?  newstatus curr-status)))
-                   (db:set-run-state-status dbstruct run-id newstate newstatus )))))))
+			(db:set-run-state-status dbstruct run-id newstate newstatus )))))))
          (mutex-unlock! *db-transaction-mutex*)
          tr-res))))
 
 
 (define (db:get-all-state-status-counts-for-run dbstruct run-id)

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -105,11 +105,14 @@
   ;; any of the other stuff that tests:test-set-status! does. Let's just 
   ;; force RUNNING/n/a
 
   ;; (thread-sleep! 0.3)
   ;; (tests:test-force-state-status! run-id test-id "RUNNING" "n/a")
-  (rmt:set-state-status-and-roll-up-items run-id test-name item-path "RUNNING" #f #f) 
+  (common:soft-lock
+   (lambda ()
+     (rmt:set-state-status-and-roll-up-items run-id test-name item-path "RUNNING" #f #f))
+    "set-state-status-roll-up")
   ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here
 
   ;; if there is a runscript do it first
   (if fullrunscript
       (let ((pid (process-run fullrunscript)))
@@ -1353,13 +1356,33 @@
     (when do-scan?
       (debug:print 1 *default-log-port* "INFO: search and mark zombie tests")
       (rmt:set-var key (current-seconds))
       (rmt:find-and-mark-incomplete run-id #f))))
 
-
-
-
+;; Weak (advisory) locking: slow down concurrent activities without entirely
+;; preventing overlap. Waits up to wait-time seconds (default 5) for the holder
+;; of keyname to release it, then runs proc. Returns the value of (proc).
+(define (common:soft-lock proc keyname #!key (wait-time 5))
+  (let* ((start-time (current-seconds))
+	 (my-key     (conc (current-process-id) "-" (get-host-name))))
+    (let loop ((key-val (rmt:no-sync-get/default keyname #f)))
+      (if (and key-val
+	       (not (equal? key-val my-key))
+	       (< (- (current-seconds) start-time) wait-time))
+	  (begin
+	    (thread-sleep! 0.5)
+	    (debug:print-info 0 *default-log-port* "Still trying to lock for " keyname)
+	    (loop (rmt:no-sync-get/default keyname #f))) ;; re-read the holder; passing a timestamp here forced every wait to run to the timeout
+	  (begin ;; the key is free, already ours, or we timed out - try to claim it
+	    (rmt:no-sync-set keyname my-key)
+	    (if (equal? (rmt:no-sync-get/default keyname #f) my-key)
+		(let ((res (begin (debug:print-info 0 *default-log-port* "Got lock for " keyname)
+				  (proc))))
+		  (rmt:no-sync-del! keyname)
+		  (debug:print-info 0 *default-log-port* "Released lock for " keyname)
+		  res)
+		(common:soft-lock proc keyname wait-time: wait-time)))))))
 
 ;; 1. look though disks list for disk with most space
 ;; 2. create run dir on disk, path name is meaningful
 ;; 3. create link from run dir to megatest runs area 
 ;; 4. remotely run the test on allocated host
@@ -1466,11 +1489,14 @@
       
       ;; prevent overlapping actions - set to LAUNCHED as early as possible
       ;;
       ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail
       (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED"))
-      (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f)
+      (common:soft-lock
+       (lambda ()
+	 (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f))
+       "set-state-status-roll-up")
       ;; (pp (hash-table->alist tconfig))
       (set! diskpath (get-best-disk *configdat* tconfig))
       (if diskpath
 	  (let ((dat  (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat)))
 	    (set! work-area (car dat))

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -747,11 +747,12 @@
 					      (for-each (lambda (run-id)
 							  (if keep-going
 							      (handle-exceptions
 							       exn
 							       (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
-							       (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
+							       (if (common:low-noise-print 900 (conc "mark-incomplete-" run-id))
+								   (rmt:find-and-mark-incomplete run-id #f))))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
 							run-ids)))
 					  "runs: mark-incompletes")))
 	    ;; (thread-start! th1)
 	    (thread-start! th2)
 	    ;; (thread-join! th1)
@@ -1846,11 +1847,12 @@
 	    (if (> (current-seconds)(+ last-time-incomplete 900))
 		(let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no
 		  (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id
 				    ". Running as pid " (current-process-id) " on " (get-host-name))
 		  (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
-		  (rmt:find-and-mark-incomplete run-id #f)
+		  (if (common:low-noise-print 900 (conc "mark-incomplete-" run-id))
+		      (rmt:find-and-mark-incomplete run-id #f))
 		  (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
 				    " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
 				    (time->string (seconds->local-time (current-seconds))))))
 	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
 	    (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1))