Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -1025,5 +1025,30 @@ (alist->env-vars testprevvals) (alist->env-vars commonprevvals) launch-results)) (change-directory *toppath*)) +;; recover a test where the top controlling mtest may have died +;; +(define (launch:recover-test run-id test-id) + ;; this function is called on the test run host via ssh + ;; + ;; 1. look at the process from pid + ;; - is it owned by calling user + ;; - it it's run directory correct for the test + ;; - is there a controlling mtest (maybe stuck) + ;; 2. if recovery is needed watch pid + ;; - when it exits take the exit code and do the needful + ;; + (let* ((pid (rmt:test-get-top-process-id run-id test-id)) + (psres (with-input-from-pipe + (conc "ps -F -u " (current-user-name) " | grep -E '" pid " ' | grep -v 'grep -E " pid "'") + (lambda () + (read-line)))) + (rundir (if (string? psres) ;; real process owned by user + (read-symbolic-link (conc "/proc/" pid "/cwd")) + #f))) + ;; now wait on that process if all is correct + ;; periodically update the db with runtime + ;; when the process exits look at the db, if still RUNNING after 10 seconds set + ;; state/status appropriately + (process-wait pid))) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -251,10 +251,11 @@ "-o" "-log" "-archive" "-since" "-fields" + "-recover-test" ;; run-id,test-id - used internally to recover a test stuck in RUNNING state ) (list "-h" "-help" "--help" "-version" "-force" "-xterm" @@ -1344,10 +1345,28 @@ (if (args:get-arg "-execute") (begin (launch:execute (args:get-arg "-execute")) (set! *didsomething* #t))) + +;;====================================================================== +;; recover from a test where the managing mtest was killed but the underlying +;; process might still be salvageable +;;====================================================================== + +(if (args:get-arg "-recover-test") + (let* ((params (string-split (args:get-arg "-recover-test") ","))) + (if (> (length params) 1) ;; run-id and test-id + (let ((run-id (string->number (car params))) + (test-id (string->number (cadr params)))) + (if (and run-id test-id) + (begin + (launch:recover-test run-id test-id) + (set! *didsomething* #t)) + (begin + (debug:print 0 "ERROR: bad run-id or test-id, must be integers") + (exit 1))))))) ;;====================================================================== ;; Test commands (i.e. for use inside tests) ;;======================================================================