Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -1025,5 +1025,30 @@
       (alist->env-vars testprevvals)
       (alist->env-vars commonprevvals)
       launch-results))
   (change-directory *toppath*))
 
+;; recover a test where the top controlling mtest may have died
+;;
+(define (launch:recover-test run-id test-id)
+  ;; this function is called on the test run host via ssh
+  ;;
+  ;; 1. look at the process from pid
+  ;;    - is it owned by calling user
+  ;;    - it it's run directory correct for the test
+  ;;    - is there a controlling mtest (maybe stuck)
+  ;; 2. if recovery is needed watch pid
+  ;;    - when it exits take the exit code and do the needful
+  ;;
+  (let* ((pid (rmt:test-get-top-process-id run-id test-id))
+	 (psres (with-input-from-pipe
+		 (conc "ps -F -u " (current-user-name) " | grep -E '" pid " ' | grep -v 'grep -E " pid "'")
+		 (lambda ()
+		   (read-line))))
+	 (rundir (if (string? psres) ;; real process owned by user
+		     (read-symbolic-link (conc "/proc/" pid "/cwd"))
+		     #f)))
+    ;; now wait on that process if all is correct
+    ;; periodically update the db with runtime
+    ;; when the process exits look at the db, if still RUNNING after 10 seconds set
+    ;; state/status appropriately
+    (process-wait pid)))

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -251,10 +251,11 @@
 			"-o"
 			"-log"
 			"-archive"
 			"-since"
 			"-fields"
+			"-recover-test" ;; run-id,test-id - used internally to recover a test stuck in RUNNING state
 			) 
 		 (list  "-h" "-help" "--help"
 			"-version"
 		        "-force"
 		        "-xterm"
@@ -1344,10 +1345,28 @@
 
 (if (args:get-arg "-execute")
     (begin
       (launch:execute (args:get-arg "-execute"))
       (set! *didsomething* #t)))
+
+;;======================================================================
+;; recover from a test where the managing mtest was killed but the underlying
+;; process might still be salvageable
+;;======================================================================
+
+(if (args:get-arg "-recover-test")
+    (let* ((params (string-split (args:get-arg "-recover-test") ",")))
+      (if (> (length params) 1) ;; run-id and test-id
+	  (let ((run-id (string->number (car params)))
+		(test-id (string->number (cadr params))))
+	    (if (and run-id test-id)
+		(begin
+		  (launch:recover-test run-id test-id)
+		  (set! *didsomething* #t))
+		(begin
+		  (debug:print 0 "ERROR: bad run-id or test-id, must be integers")
+		  (exit 1)))))))
 
 ;;======================================================================
 ;; Test commands (i.e. for use inside tests)
 ;;======================================================================