Index: http-transport.scm
==================================================================
--- http-transport.scm
+++ http-transport.scm
@@ -415,10 +415,12 @@
       (mutex-unlock! *heartbeat-mutex*)
 
       ;; (debug:print 11 "last-access=" last-access ", server-timeout=" server-timeout)
       ;;
       ;; no_traffic, no running tests, if server 0, no running servers
+      ;;
+      ;; (let ((wait-on-running (configf:lookup *configdat* "server" "wait-on-running"))) ;; wait on running tasks (if not true then exit on time out)
       ;;
       (if (and *server-run*
 	       (or (> (+ last-access server-timeout)
 		      (current-seconds))
 		   (and (eq? run-id 0)

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -91,18 +91,18 @@
                                               fulln
                                               runscript))))) ;; assume it is on the path
 	       (rollup-status 0))
 	  (change-directory top-path)
 
-
-
-
-	  ;; ADD here - is test already RUNNING? If so ---- ABORT RUN ATTEMPT
-
-
-
-
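+	  ;; A test already in REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART must not be run again;
+	  ;; otherwise mark the test as REMOTEHOSTSTART *IMMEDIATELY*.
+	  ;; NOTE(review): the blocked branch below only logs an ERROR - execution still continues past it.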
+	  (let ((test-info (rmt:get-testinfo-state-status run-id test-id)))
+	    (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")))
+		(tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
+		(debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed")))
+	  
 	  (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name))
 	  (set! keys       (rmt:get-keys))
 	  ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process
 	  ;; one of these is defunct/redundant ...
 	  (if (not (launch:setup-for-run force: #t))
@@ -173,12 +173,10 @@
 	  ;; open-run-close not needed for test-set-meta-info
 	  ;; (tests:set-full-meta-info #f test-id run-id 0 work-area)
 	  ;; (tests:set-full-meta-info test-id run-id 0 work-area)
 	  (tests:set-full-meta-info #f test-id run-id 0 work-area 10)
 
-	  ;; (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f)
-	  (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
 	  (thread-sleep! 0.3) ;; NFS slowness has caused grief here
 
 	  (if (args:get-arg "-xterm")
 	      (set! fullrunscript "xterm")
 	      (if (and fullrunscript (not (file-execute-access? fullrunscript)))
@@ -524,11 +522,11 @@
 ;; 
 ;; All log file links should be stored relative to the top of link path
 ;;  
 ;; <target> - <testname> [ - <itempath> ] 
 ;;
-(define (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat)
+(define (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat #!key (remtries 2))
   (let* ((item-path (item-list->path itemdat))
 	 (runname  (db:get-value-by-header (db:get-rows run-info)
 					   (db:get-header run-info)
 					   "runname"))
 	 ;; convert back to db: from rdb: - this is always run at server end
@@ -546,13 +544,14 @@
 
 	 ;; ensure this exists first as links to subtests must be created there
 	 (linktree  (let ((rd (config-lookup *configdat* "setup" "linktree")))
 		      (if rd rd (conc *toppath* "/runs"))))
 
-	 (lnkbase  (conc linktree "/" target "/" runname))
-	 (lnkpath  (conc lnkbase "/" testname))
-	 (lnkpathf (conc lnkpath (if not-iterated "" "/") item-path)))
+	 (lnkbase   (conc linktree "/" target "/" runname))
+	 (lnkpath   (conc lnkbase "/" testname))
+	 (lnkpathf  (conc lnkpath (if not-iterated "" "/") item-path))
+	 (lnktarget (conc lnkpath "/" item-path)))
 
     ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical
     ;;                                                 rundir   shortdir
     (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path)
 
@@ -631,11 +630,11 @@
 		(hash-table-set! *toptest-paths* testname toptest-path)))))
 
     ;; The toptest path has been created, the link to the test in the linktree has
     ;; been created. Now, if this is an iterated test the real test dir must be created
     (if (not not-iterated) ;; this is an iterated test
-	(let ((lnktarget (conc lnkpath "/" item-path)))
+	(begin ;; lnktarget is now bound in the enclosing let*, so no local let is needed here
 	  (debug:print 2 "Setting up sub test run area")
 	  (debug:print 2 " - creating run area in " test-path)
 	  (handle-exceptions
 	   exn
 	   (begin
@@ -653,10 +652,13 @@
 	     (debug:print 0 "ERROR:  Failed to re-create link " linktarget ((condition-property-accessor 'exn 'message) exn) ", exiting")
 	     (exit))
 	   (if (symbolic-link? lnktarget)     (delete-file lnktarget))
 	   (if (not (file-exists? lnktarget)) (create-symbolic-link test-path lnktarget)))))
 
+    (if (not (directory? test-path))
+	(create-directory test-path #t)) ;; HACK: test-path is sometimes unexpectedly missing at this point (cause unknown), so create it here
+
     (if (directory? test-path)
 	(begin
 	  (let* ((ovrcmd (let ((cmd (config-lookup *configdat* "setup" "testcopycmd")))
 			   (if cmd
 			       ;; substitute the TEST_SRC_PATH and TEST_TARG_PATH
@@ -669,11 +671,16 @@
 				   " >> " test-path "/mt_launch.log 2>> " test-path "/mt_launch.log")))
 		 (status (system cmd)))
 	    (if (not (eq? status 0))
 		(debug:print 2 "ERROR: problem with running \"" cmd "\"")))
 	  (list lnkpathf lnkpath ))
-	(list #f #f))))
+	(if (> remtries 0)
+	    (begin
+	      (debug:print 0 "ERROR: Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries)
+	      ;; retry the whole work-area setup with one fewer attempt remaining
+	      (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1)))
+	    (list #f #f)))))
 
 ;; 1. look though disks list for disk with most space
 ;; 2. create run dir on disk, path name is meaningful
 ;; 3. create link from run dir to megatest runs area 
 ;; 4. remotely run the test on allocated host
@@ -741,10 +748,15 @@
     (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run
 	     (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir
 	(begin
 	  (debug:print-info 0 "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path)
 	  (runs:remove-test-directory #f testinfo #t))) ;; remove data only, do not perturb the record
+
+    ;; prevent overlapping actions - set to LAUNCHED as early as possible
+    ;;
+    (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED"))
+    (rmt:roll-up-pass-fail-counts run-id test-name item-path "LAUNCHED")
     (set! diskpath (get-best-disk *configdat*))
     (if diskpath
 	(let ((dat  (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat)))
 	  (set! work-area (car dat))
 	  (set! toptest-work-area (cadr dat))
@@ -774,15 +786,14 @@
 				      (list 'runtlim   (if run-time-limit (common:hms-string->seconds run-time-limit) #f))
 				      (list 'env-ovrd  (hash-table-ref/default *configdat* "env-override" '())) 
 				      (list 'set-vars  (if params (hash-table-ref/default params "-setvars" #f)))
 				      (list 'runname   runname)
 				      (list 'mt-bindir-path mt-bindir-path))))))))
+
     ;; clean out step records from previous run if they exist
     ;; (rmt:delete-test-step-records run-id test-id)
     (change-directory work-area) ;; so that log files from the launch process don't clutter the test dir
-    (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED"))
-    (rmt:roll-up-pass-fail-counts run-id test-name item-path "LAUNCHED")
     (cond
      ((and launcher hosts) ;; must be using ssh hostname
       (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms) debug-param)))
      ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms))))
      (launcher

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -1070,11 +1070,11 @@
 	    ;; (debug:print 0 "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0))
 	    (if (> (current-seconds)(+ last-time-incomplete 900))
 		(begin
 		  (debug:print-info 0 "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name))
 		  (set! last-time-incomplete (current-seconds))
-		  (rmt:find-and-mark-incomplete run-id)))
+		  (rmt:find-and-mark-incomplete run-id #f)))
 	    (if (not (eq? num-running prev-num-running))
 		(debug:print-info 0 "run-wait specified, waiting on " num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))
 	    (thread-sleep! 15)
 	    ;; (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running))))
 	    (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running))))
@@ -1505,11 +1505,12 @@
 (define (runs:remove-test-directory db test remove-data-only)
   (let* ((run-dir       (db:test-get-rundir test))    ;; run dir is from the link tree
 	 (real-dir      (if (file-exists? run-dir)
 			    (resolve-pathname run-dir)
 			    #f)))
-    (if (not remove-data-only)
+    (if remove-data-only
+	(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f)
 	(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f))
     (debug:print-info 1 "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir)
     (if (and real-dir 
 	     (> (string-length real-dir) 5)
 	     (file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc.
@@ -1539,11 +1540,12 @@
 		     (not (member run-dir (list "n/a" "/tmp/badname"))))
 		(debug:print 0 "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink")
 		(debug:print 0 "NOTE: the run dir for this test is undefined. Test may have already been deleted."))
 	    ))
     ;; Only delete the records *after* removing the directory. If things fail we have a record 
-    (if (not remove-data-only)
+    (if remove-data-only
+	(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "NOT_STARTED" "n/a" #f)
 	(rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test)))))
 
 ;;======================================================================
 ;; Routines for manipulating runs
 ;;======================================================================