Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -113,11 +113,10 @@ (define *dbkeys* (append *keys* (list "runname"))) (define *header* #f) (define *allruns* '()) (define *allruns-by-id* (make-hash-table)) ;; -(define *runchangerate* (make-hash-table)) (define *buttondat* (make-hash-table)) ;; (define *alltestnamelst* '()) (define *searchpatts* (make-hash-table)) (define *num-runs* 8) @@ -306,14 +305,10 @@ (if (> (length tests) maxtests) (set! maxtests (length tests))) (if (or (not *hide-empty-runs*) ;; this reduces the data burden when set (not (null? tests))) (let ((dstruct (vector run tests key-vals (- (current-seconds) 10)))) - ;; - ;; compare the tests with the tests in *allruns-by-id* same run-id - ;; if different then increment value in *runchangerate* - ;; (hash-table-set! *allruns-by-id* run-id dstruct) (set! result (cons dstruct result)))))) runs) (set! *header* header) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -269,15 +269,21 @@ ;; (set-signal-handler! signal/int (lambda () ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* ;; - (let ((test-info (rmt:get-testinfo-state-status run-id test-id))) + (let* ((test-info (rmt:get-testinfo-state-status run-id test-id)) + (test-host (db:test-get-host test-info)) + (test-pid (db:test-get-process_id test-info))) (cond ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "KILLREQ" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")) ;; prime it for running + ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART")) + (if (process-alive-on-host? test-host test-pid) + (debug:print 0 "ERROR: test state is " (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed") + (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a"))) ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")) (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) (debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed") (exit)))) Index: process.scm ================================================================== --- process.scm +++ process.scm @@ -147,11 +147,27 @@ ;; possibly pid is a process not a child, look in /proc to see if it is running still (file-exists? (conc "/proc/" pid)) (let-values (((rpid exit-type exit-signal)(process-wait pid #t))) (and (number? rpid) (equal? rpid pid))))) - + +(define (process:alive-on-host? host pid) + (let ((cmd (conc "ssh " host " ps -o pid= -p " pid))) + (handle-exceptions + exn + #f ;; anything goes wrong - assume the process in NOT running. + (with-input-from-pipe + cmd + (lambda () + (let loop ((inl (read-line))) + (if (eof-object? inl) + #f + (let* ((clean-str (string-substitute "^[^\\d]*([0-9]+)[^\\d]*$" "\\1" inl)) + (innum (string->number clean-str))) + (and innum + (eq? pid innum)))))))))) + (define (process:get-sub-pids pid) (with-input-from-pipe (conc "pstree -A -p " pid) ;; | tr 'a-z\\-+`()\\.' ' ' " pid) (lambda () (let loop ((inl (read-line))