@@ -1363,10 +1363,11 @@ (let ((sorted-tests (sort tests (lambda (a b)(let ((dira (db:test-get-rundir a)) (dirb (db:test-get-rundir b))) (if (and (string? dira)(string? dirb)) (> (string-length dira)(string-length dirb)) #f))))) + (toplevel-retries (make-hash-table)) ;; try three times to loop through and remove top level tests (test-retry-time (make-hash-table)) (allow-run-time 10)) ;; seconds to allow for killing tests before just brutally killing 'em (let loop ((test (car sorted-tests)) (tal (cdr sorted-tests))) (let* ((test-id (db:test-get-id test)) @@ -1385,11 +1386,18 @@ (> (cdb:remote-run db:test-toplevel-num-items db run-id test-name) 0)))) (case action ((remove-runs) ;; if the test is a toplevel-with-children issue an error and do not remove (if toplevel-with-children - (debug:print 0 "WARNING: skipping removal of " test-fulln " with run-id " run-id " as it has sub tests") + (begin + (debug:print 0 "WARNING: skipping removal of " test-fulln " with run-id " run-id " as it has sub tests") + (hash-table-set! toplevel-retries test-fulln (+ (hash-table-ref/default toplevel-retries test-fulln 0) 1)) + (if (> (hash-table-ref toplevel-retries test-fulln) 3) + (if (not (null? tal)) + (loop (car tal)(cdr tal))) ;; no else clause - drop it if no more in queue and > 3 tries + (let ((newtal (append tal (list test)))) + (loop (car newtal)(cdr newtal))))) ;; loop with test still in queue (begin (debug:print-info 0 "test: " test-name " itest-state: " test-state) (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ")) (begin (if (not (hash-table-ref/default test-retry-time test-fulln #f))