Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1780,28 +1780,31 @@ ;; check duration against test-run.dat file if it exists and update the value in ;; the db if necessary ;; (define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration) - (let* ((datf (conc run-dir ".mt_data/test-run.dat")) + (let* ((datf (conc run-dir "/.mt_data/test-run.dat")) (modt (if (and (file-exists? datf) (file-read-access? datf)) (file-modification-time datf) #f)) ;; (+ event-time run-duration)))) (alt-run-duration (if modt (- modt event-time) #f))) + (debug:print 0 *default-log-port* "Test " test-id " datf " datf " modt " modt " alt-run-duration " alt-run-duration) (if (and alt-run-duration (> alt-run-duration run-duration)) (begin (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration) (db:with-db dbstruct #f #f (lambda (db) (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id) #t))) - #f))) ;; #f = we did NOT adjust the time + (begin + (debug:print 0 *default-log-port* "Test " test-id " run duration correct. No adjustment needed.") + #f)))) ;; #f = we did NOT adjust the time (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) @@ -4019,24 +4022,28 @@ (lambda (db) (let ((tr-res (sqlite3:with-transaction db (lambda () - ;; NB// Pass the db so it is part fo the transaction + ;; NB// Pass the db so it is part of the transaction (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item - (let* ((state-status-counts (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test - (state-stauses (db:roll-up-rules state-status-counts state status)) - (newstate (car state-stauses)) - (newstatus (cadr state-stauses))) - (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts) " state-status-counts: " - (apply conc - (map (lambda (x) - (conc - (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | ")) - state-status-counts))); end debug:print - + ;; item-path is used to exclude current state/status of THIS test + (let* ((state-status-counts (db:get-all-state-status-counts-for-test + dbstruct run-id test-name item-path state status)) + (state-stauses (db:roll-up-rules state-status-counts state status)) + (newstate (car state-stauses)) + (newstatus (cadr state-stauses))) + (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name + ":"item-path" newstate="newstate" newstatus="newstatus + " len(sscs)="(length state-status-counts) " state-status-counts: " + (apply conc + (map (lambda (x) + (conc + (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | ")) + state-status-counts))); end debug:print + (if tl-test-id (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct )))))) (mutex-unlock! *db-transaction-mutex*) (if (and test-id state status (equal? status "AUTO")) @@ -4107,15 +4114,15 @@ (let ((tr-res (sqlite3:with-transaction db (lambda () (let* ((state-status-counts (db:get-all-state-status-counts-for-run dbstruct run-id)) - (state-stauses (db:roll-up-rules state-status-counts #f #f )) - (newstate (car state-stauses)) - (newstatus (cadr state-stauses))) + (state-stauses (db:roll-up-rules state-status-counts #f #f )) + (newstate (car state-stauses)) + (newstatus (cadr state-stauses))) (if (or (not (eq? newstate curr-state)) (not (eq? newstatus curr-status))) - (db:set-run-state-status dbstruct run-id newstate newstatus ))))))) + (db:set-run-state-status dbstruct run-id newstate newstatus ))))))) (mutex-unlock! *db-transaction-mutex*) tr-res)))) (define (db:get-all-state-status-counts-for-run dbstruct run-id) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -105,11 +105,14 @@ ;; any of the other stuff that tests:test-set-status! does. Let's just ;; force RUNNING/n/a ;; (thread-sleep! 0.3) ;; (tests:test-force-state-status! run-id test-id "RUNNING" "n/a") - (rmt:set-state-status-and-roll-up-items run-id test-name item-path "RUNNING" #f #f) + (common:soft-lock + (lambda () + (rmt:set-state-status-and-roll-up-items run-id test-name item-path "RUNNING" #f #f)) + "set-state-status-roll-up") ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here ;; if there is a runscript do it first (if fullrunscript (let ((pid (process-run fullrunscript))) @@ -1353,13 +1356,33 @@ (when do-scan? (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") (rmt:set-var key (current-seconds)) (rmt:find-and-mark-incomplete run-id #f)))) - - - +;; weak locking - slow down concurrent activities but don't entirely stop them from +;; overlapping. wait for up to 5 seconds +;; +(define (common:soft-lock proc keyname #!key (wait-time 5)) + (let* ((start-time (current-seconds)) + (my-key (conc (current-process-id) "-" (get-host-name)))) + (let loop ((key-val (rmt:no-sync-get/default keyname #f))) + (if (and key-val + (not (equal? key-val my-key)) + (< (- (current-seconds) start-time) wait-time)) + (begin + (thread-sleep! 0.5) + (debug:print-info 0 *default-log-port* "Still trying to lock for " keyname) + (loop (current-seconds))) ;; try again + (begin ;; either we got the lock or we timed out - proceed as if + (rmt:no-sync-set keyname my-key) + (if (equal? (rmt:no-sync-get/default keyname #f) my-key) + (begin + (debug:print-info 0 *default-log-port* "Got lock for " keyname) + (proc) + (rmt:no-sync-del! keyname) + (debug:print-info 0 *default-log-port* "Released lock for " keyname)) + (common:soft-lock proc keyname wait-time: wait-time))))))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host @@ -1466,11 +1489,14 @@ ;; prevent overlapping actions - set to LAUNCHED as early as possible ;; ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) - (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f) + (common:soft-lock + (lambda () + (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f)) + "set-state-status-roll-up") ;; (pp (hash-table->alist tconfig)) (set! diskpath (get-best-disk *configdat* tconfig)) (if diskpath (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -747,11 +747,12 @@ (for-each (lambda (run-id) (if keep-going (handle-exceptions exn (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn) - (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) + (if (common:low-noise-print 900 (conc "mark-incomplete-" run-id)) + (rmt:find-and-mark-incomplete run-id #f))))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) ;; (thread-start! th1) (thread-start! th2) ;; (thread-join! th1) @@ -1846,11 +1847,12 @@ (if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! - (rmt:find-and-mark-incomplete run-id #f) + (if (common:low-noise-print 900 (conc "mark-incomplete-" run-id)) + (rmt:find-and-mark-incomplete run-id #f)) (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1))