Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -3251,11 +3251,15 @@ dbstruct run-id #f (lambda (db) (let* ((stmth (db:get-cache-stmth dbstruct db qry))) - (sqlite3:first-result stmth run-id)))))) + (sqlite3:fold-row + (lambda (res val) val) + 0 stmth run-id)))))) + +;; (sqlite3:first-result stmth run-id)))))) ;; For a given testname how many items are running? Used to determine ;; probability for regenerating html ;; (define (db:get-count-tests-running-for-testname dbstruct run-id testname) @@ -3264,23 +3268,23 @@ run-id #f (lambda (db) (let* ((stmt "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;") (stmth (db:get-cache-stmth dbstruct db stmt))) - (sqlite3:first-result - stmth run-id testname))))) + (sqlite3:fold-row + (lambda (res val) val) 0 stmth run-id testname))))) (define (db:get-not-completed-cnt dbstruct run-id) -(db:with-db + (db:with-db dbstruct run-id #f (lambda (db) - ;(print "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=" run-id) - (sqlite3:first-result - db - "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=?;" run-id)))) + (let* ((stmt "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=?;")) + (sqlite3:fold-row + (lambda (res val) val) + 0 (db:get-cache-stmth dbstruct db stmt) run-id))))) (define (db:get-count-tests-running-in-jobgroup dbstruct run-id jobgroup) (if (not jobgroup) 0 ;; (let ((testnames '())) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -462,10 +462,12 @@ (debug:print-info 0 *default-log-port* "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run.")))))) ;; return #t when all items in waitors-upon list are represented in test-patt, #f otherwise. (define (runs:testpatts-mention-waitors-upon? test-patt waitors-upon) (null? (tests:filter-test-names-not-matched waitors-upon test-patt))) + +(define *find-and-mark-incomplete-last-run* (make-hash-table)) ;;====================================================================== ;; runs:run-tests is called from megatest.scm and itself ;;====================================================================== ;; @@ -772,13 +774,18 @@ ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions - exn - (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn) - (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) + exn + (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn) + ;; lets run this only if a run has been NOT seen for more than 900 seconds + (if (> (- (current-seconds)(hash-table-ref/default *find-and-mark-incomplete-last-run* run-id 0)) 900) + (begin + (rmt:find-and-mark-incomplete run-id #f) + (hash-table-set! *find-and-mark-incomplete-last-run* run-id (current-seconds))) + )))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) ;; (thread-start! th1) (thread-start! th2) ;; (thread-join! th1) @@ -1870,16 +1877,18 @@ (equal? (configf:lookup *configdat* "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) - (if (> (current-seconds)(+ last-time-incomplete 900)) + (if (> (- (current-seconds)(hash-table-ref/default *find-and-mark-incomplete-last-run* run-id 0)) 900) + ;; (begin(if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id))) (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) - (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! + ;; (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) + (hash-table-set! *find-and-mark-complete-last-run* run-id (current-seconds)) (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1))