Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -49,10 +49,11 @@ test-get-paths-matching-keynames-target-new get-prereqs-not-met get-count-tests-running-for-run-id get-run-info get-run-status + get-run-state get-run-stats get-run-times get-targets get-target ;; register-run @@ -80,11 +81,11 @@ testmeta-get-record have-incompletes? synchash-get get-changed-record-ids get-run-record-ids - )) + get-not-completed-cnt)) (define api:write-queries '( get-keys-write ;; dummy "write" query to force server start @@ -97,10 +98,11 @@ delete-test-records delete-old-deleted-test-records test-set-state-status test-set-top-process-pid set-state-status-and-roll-up-items + update-pass-fail-counts top-test-set-per-pf-counts ;; (db:top-test-set-per-pf-counts (db:get-db *db* 5) 5 "runfirst") ;; RUNS register-run @@ -107,11 +109,11 @@ set-tests-state-status delete-run lock/unlock-run update-run-event_time mark-incomplete - + set-state-status-and-roll-up-run ;; STEPS teststep-set-status! ;; TEST DATA test-data-rollup @@ -198,10 +200,11 @@ ((delete-test-records) (apply db:delete-test-records dbstruct params)) ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params)) ((test-set-state-status) (apply db:test-set-state-status dbstruct params)) ((test-set-top-process-pid) (apply db:test-set-top-process-pid dbstruct params)) ((set-state-status-and-roll-up-items) (apply db:set-state-status-and-roll-up-items dbstruct params)) + ((set-state-status-and-roll-up-run) (apply db:set-state-status-and-roll-up-run dbstruct params)) ((top-test-set-per-pf-counts) (apply db:top-test-set-per-pf-counts dbstruct params)) ((test-set-archive-block-id) (apply db:test-set-archive-block-id dbstruct params)) ;; RUNS ((register-run) (apply db:register-run dbstruct params)) @@ -276,18 +279,21 @@ ((get-testinfo-state-status) (apply db:get-testinfo-state-status dbstruct params)) ((test-get-top-process-pid) (apply db:test-get-top-process-pid dbstruct params)) ((test-get-paths-matching-keynames-target-new) (apply db:test-get-paths-matching-keynames-target-new dbstruct params)) ((get-prereqs-not-met) (apply db:get-prereqs-not-met dbstruct params)) ((get-count-tests-running-for-run-id) (apply db:get-count-tests-running-for-run-id dbstruct params)) + ((get-not-completed-cnt) (apply db:get-not-completed-cnt dbstruct params)) ((synchash-get) (apply synchash:server-get dbstruct params)) ((get-raw-run-stats) (apply db:get-raw-run-stats dbstruct params)) ((get-test-times) (apply db:get-test-times dbstruct params)) ;; RUNS ((get-run-info) (apply db:get-run-info dbstruct params)) ((get-run-status) (apply db:get-run-status dbstruct params)) + ((get-run-state) (apply db:get-run-state dbstruct params)) ((set-run-status) (apply db:set-run-status dbstruct params)) + ((set-run-state-status) (apply db:set-run-state-status dbstruct params)) ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) ((get-test-id) (apply db:get-test-id dbstruct params)) ((get-tests-for-run-mindata) (apply db:get-tests-for-run-mindata dbstruct params)) ((get-tests-for-runs-mindata) (apply db:get-tests-for-runs-mindata dbstruct params)) ((get-runs) (apply db:get-runs dbstruct params)) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -318,11 +318,17 @@ (common:version-signature)))) (define (common:api-changed?) (not (equal? (substring (->string megatest-version) 0 4) (substring (conc (common:get-last-run-version)) 0 4)))) - + + +(define (common:get-sync-lock-filepath) + (let* ((tmp-area (common:get-db-tmp-area)) + (lockfile (conc tmp-area "/megatest.db.sync-lock"))) + lockfile)) + ;; Move me elsewhere ... ;; RADT => Why do we meed the version check here, this is called only if version misma ;; (define (common:cleanup-db dbstruct #!key (full #f)) (apply db:multi-db-sync @@ -599,13 +605,13 @@ ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) - (handle-exceptions - exn - #f ;; don't really care what went wrong right now. NOTE: I have not seen this one actually fail. +; (handle-exceptions +; exn +; #f ;; don't really care what went wrong right now. NOTE: I have not seen this one actually fail. (if (common:file-exists? fname) (if (> (- (current-seconds)(file-modification-time fname)) expire-time) (begin (delete-file* fname) (common:simple-file-lock fname expire-time: expire-time)) @@ -617,11 +623,13 @@ (thread-sleep! 0.25) (if (common:file-exists? fname) (with-input-from-file fname (lambda () (equal? key-string (read-line)))) - #f))))) + #f))) +; ) + ) (define (common:simple-file-lock-and-wait fname #!key (expire-time 300)) (let ((end-time (+ expire-time (current-seconds)))) (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time))) (if got-lock @@ -937,11 +945,11 @@ 0) (define (std-signal-handler signum) ;; (signal-mask! signum) - (set! *time-to-exit* #t) + (set! *time-to-exit* #t) ;;(debug:print-info 13 *default-log-port* "got signal "signum) (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly") ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway (exit)) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -157,10 +157,11 @@ ;; RA => Might require revert for filters ;; create a watch dog to move changes from lt/.db/*.db to megatest.db ;; ;;;(if (file-write-access? (conc *toppath* "/megatest.db")) ;;(debug:print-info 13 *default-log-port* "Before common:watchdog spawn") + (thread-start! (make-thread common:watchdog "Watchdog thread")) ;;(debug:print-info 13 *default-log-port* "After common:watchdog spawn") ;; (if (not (args:get-arg "-use-db-cache")) ;; (begin ;; (debug:print-info 0 *default-log-port* "Forcing db-cache mode due to read-only access to megatest.db") @@ -2636,14 +2637,16 @@ #:tabchangepos-cb (lambda (obj curr prev) (debug:catch-and-dump (lambda () (let* ((tab-num (dboard:commondat-curr-tab-num commondat)) (tabdat (dboard:common-get-tabdat commondat tab-num: tab-num))) + (dboard:tabdat-layout-update-ok-set! tabdat #f)) (dboard:commondat-curr-tab-num-set! commondat curr) (let* ((tab-num (dboard:commondat-curr-tab-num commondat)) (tabdat (dboard:common-get-tabdat commondat tab-num: tab-num))) + (dboard:commondat-please-update-set! commondat #t) (dboard:tabdat-layout-update-ok-set! tabdat #t))) "tabchangepos")) (dashboard:summary commondat stats-dat tab-num: 0) runs-view Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -313,11 +313,12 @@ ;; (define (db:open-db dbstruct #!key (areapath #f)(do-sync #t)) ;; TODO: actually use areapath (let ((tmpdb-stack (dbr:dbstruct-dbstack dbstruct))) ;; RA => Returns the first reference in dbstruct (if (stack? tmpdb-stack) (db:get-db tmpdb-stack) ;; get previously opened db (will create new db handle if all in the stack are already used - (let* ((dbpath (db:dbfile-path )) ;; path to tmp db area + (let* ((max-stale-tmp (configf:lookup-number *configdat* "server" "filling-db-max-stale-seconds" default: 10)) + (dbpath (db:dbfile-path )) ;; path to tmp db area (dbexists (common:file-exists? dbpath)) (tmpdbfname (conc dbpath "/megatest.db")) (dbfexists (common:file-exists? tmpdbfname)) ;; (conc dbpath "/megatest.db"))) (mtdbexists (common:file-exists? (conc *toppath* "/megatest.db"))) @@ -351,11 +352,11 @@ (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb) ;; olddb is already a (cons db path) (dbr:dbstruct-refndb-set! dbstruct refndb) ;; (mutex-unlock! *rundb-mutex*) (if (and (or (not dbfexists) (and modtimedelta - (> modtimedelta 10))) ;; if db in tmp is over ten seconds older than the file in MTRA then do a sync back + (> modtimedelta max-stale-tmp))) ;; if db in tmp is over ten seconds older than the file in MTRA then do a sync back do-sync) (begin (debug:print 1 *default-log-port* "filling db " (db:dbdat-get-path tmpdb) " with data \n from " (db:dbdat-get-path mtdb) " mod time delta: " modtimedelta) (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb refndb tmpdb) ;touch tmp db to avoid wal mode wierdness @@ -1048,11 +1049,15 @@ (for-each (lambda (server) (match-let (((mod-time host port start-time pid) server)) (if (and host pid) (tasks:kill-server host pid)))) - servers)) + servers) + + ;; /tmp/bjbarcla/megatest_localdb/fullrun/.nfs.pdx.disks.icf_env_disk001.bjbarcla.gwa.issues.mtdev.matt-bisect.megatest.ext-tests.runs.all.v1.65.1.6524.dev.bb-24-justrollup-f8.rollup.fullrun/megatest.db.lock + (delete-file* (common:get-sync-lock-filepath)) + ) ;; clear out junk records ;; ((dejunk) (db:delay-if-busy mtdb) ;; ok to delay on mtdb @@ -2628,13 +2633,21 @@ (define (db:set-run-status dbstruct run-id status msg) (db:with-db dbstruct #f #f (lambda (db) - (if msg + (if msg (sqlite3:execute db "UPDATE runs SET status=?,comment=? WHERE id=?;" status msg run-id) (sqlite3:execute db "UPDATE runs SET status=? WHERE id=?;" status run-id))))) + +(define (db:set-run-state-status dbstruct run-id state status ) + (db:with-db + dbstruct #f #f + (lambda (db) + (sqlite3:execute db "UPDATE runs SET status=?,state=? WHERE id=?;" status state run-id)))) + + (define (db:get-run-status dbstruct run-id) (let ((res "n/a")) (db:with-db dbstruct #f #f @@ -2644,10 +2657,24 @@ (set! res status)) db "SELECT status FROM runs WHERE id=?;" run-id) res)))) + +(define (db:get-run-state dbstruct run-id) + (let ((res "n/a")) + (db:with-db + dbstruct #f #f + (lambda (db) + (sqlite3:for-each-row + (lambda (status) + (set! res status)) + db + "SELECT state FROM runs WHERE id=?;" + run-id) + res)))) + ;;====================================================================== ;; K E Y S ;;====================================================================== @@ -3020,10 +3047,22 @@ (lambda (db) (sqlite3:first-result db "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;" run-id testname)))) + +(define (db:get-not-completed-cnt dbstruct run-id) +(db:with-db + dbstruct + run-id + #f + (lambda (db) + ;(print "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=" run-id) + (sqlite3:first-result + db + "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=?;" run-id)))) + (define (db:get-count-tests-running-in-jobgroup dbstruct run-id jobgroup) (if (not jobgroup) 0 ;; (let ((testnames '())) ;; get the testnames @@ -3694,11 +3733,10 @@ ;; if test-name is an integer work off that instead of test-name test-path ;; (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) ;; establish info on incoming test followed by info on top level test ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met - (let* ((testdat (if (number? test-name) (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id (db:get-test-info dbstruct run-id test-name item-path))) (test-id (db:test-get-id testdat)) (test-name (if (number? test-name) @@ -3721,63 +3759,69 @@ (lambda () ;; NB// Pass the db so it is part fo the transaction (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item (let* ((state-status-counts (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test - (running (length (filter (lambda (x) - (member (dbr:counts-state x) *common:running-states*)) - state-status-counts))) - (bad-not-started (length (filter (lambda (x) - (and (equal? (dbr:counts-state x) "NOT_STARTED") - (not (member (dbr:counts-status x) - *common:not-started-ok-statuses*)))) - state-status-counts))) - ;; (non-completes (filter (lambda (x) - ;; (not (equal? (dbr:counts-state x) "COMPLETED"))) - ;; state-status-counts)) - (all-curr-states (common:special-sort ;; worst -> best (sort of) - (delete-duplicates - (if (not (member state *common:dont-roll-up-states*)) - (cons state (map dbr:counts-state state-status-counts)) - (map dbr:counts-state state-status-counts))) - *common:std-states* >)) - (all-curr-statuses (common:special-sort ;; worst -> best - (delete-duplicates - (if (not (member state *common:dont-roll-up-states*)) - (cons status (map dbr:counts-status state-status-counts)) - (map dbr:counts-status state-status-counts))) + (state-stauses (db:roll-up-rules state-status-counts state status)) + (newstate (car state-stauses)) + (newstatus (cadr state-stauses))) + (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts) " state-status-counts: " + (apply conc + (map (lambda (x) + (conc + (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | ")) + state-status-counts))); end debug:print + + (if tl-test-id + (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct + )))))) + (mutex-unlock! *db-transaction-mutex*) + (if (and test-id state status (equal? status "AUTO")) + (db:test-data-rollup dbstruct run-id test-id status)) + tr-res))))) + +(define (db:roll-up-rules state-status-counts state status) + (let* ((running (length (filter (lambda (x) + (member (dbr:counts-state x) *common:running-states*)) + state-status-counts))) + (bad-not-started (length (filter (lambda (x) + (and (equal? (dbr:counts-state x) "NOT_STARTED") + (not (member (dbr:counts-status x) *common:not-started-ok-statuses*)))) + state-status-counts))) + (all-curr-states (common:special-sort ;; worst -> best (sort of) + (delete-duplicates + (if (and state (not (member state *common:dont-roll-up-states*))) + (cons state (map dbr:counts-state state-status-counts)) + (map dbr:counts-state state-status-counts))) + *common:std-states* >)) + (all-curr-statuses (common:special-sort ;; worst -> best + (delete-duplicates + (if (and state status (not (member state *common:dont-roll-up-states*))) + (cons status (map dbr:counts-status state-status-counts)) + (map dbr:counts-status state-status-counts))) *common:std-statuses* >)) - (non-completes (filter (lambda (x) - (not (member x (cons "COMPLETED" *common:dont-roll-up-states*)))) - all-curr-states)) - (preq-fails (filter (lambda (x) - (equal? x "PREQ_FAIL")) - all-curr-statuses)) - (num-non-completes (length non-completes)) - (newstate (cond - ((> running 0) "RUNNING") ;; anything running, call the situation running - ((> (length preq-fails) 0) - "NOT_STARTED") - ((> bad-not-started 0) "COMPLETED") ;; we have an ugly situation, it is completed in the sense we cannot do more. - ((> num-non-completes 0) (car non-completes)) ;; (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states))) ;; only rollup DELETED if all DELETED - (else (car all-curr-states)))) - ;; (if (> running 0) - ;; "RUNNING" - ;; (if (> bad-not-started 0) - ;; "COMPLETED" - ;; (car all-curr-states)))) - (newstatus (cond - ((> (length preq-fails) 0) - "PREQ_FAIL") - ((or (> bad-not-started 0) - (and (equal? newstate "NOT_STARTED") - (> num-non-completes 0))) - "STARTED") - (else - (car all-curr-statuses))))) - - (debug:print-info 2 *default-log-port* + (non-completes (filter (lambda (x) + (not (member x (cons "COMPLETED" *common:dont-roll-up-states*)))) + all-curr-states)) + (preq-fails (filter (lambda (x) + (equal? x "PREQ_FAIL")) + all-curr-statuses)) + (num-non-completes (length non-completes)) + (newstate (cond + ((> running 0) "RUNNING") ;; anything running, call the situation running + ((> (length preq-fails) 0) "NOT_STARTED") + ((> bad-not-started 0) "COMPLETED") ;; we have an ugly situation, it is completed in the sense we cannot do more. + ((> num-non-completes 0) (car non-completes)) ;; (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states))) ;; only rollup DELETED if all DELETED + (else (car all-curr-states)))) + (newstatus (cond + ((> (length preq-fails) 0) "PREQ_FAIL") + ((or (> bad-not-started 0) + (and (equal? newstate "NOT_STARTED") + (> num-non-completes 0))) + "STARTED") + (else (car all-curr-statuses))))) + (debug:print-info 2 *default-log-port* "\n--> probe db:set-state-status-and-roll-up-items: " "\n--> state-status-counts: "(map dbr:counts->alist state-status-counts) "\n--> running: "running "\n--> bad-not-started: "bad-not-started "\n--> non-non-completes: "num-non-completes @@ -3786,33 +3830,47 @@ "\n--> all-curr-statuses: "all-curr-statuses "\n--> newstate "newstate "\n--> newstatus "newstatus "\n\n") - ;; (print "bad-not-supported: " bad-not-support " all-curr-states: " all-curr-states " all-curr-statuses: " all-curr-states) - ;; " newstate: " newstate " newstatus: " newstatus) - ;; NB// Pass the db so it is part of the transaction - (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path"> bad-not-started="bad-not-started" newstate="newstate" newstatus="newstatus" num-non-completes="num-non-completes" non-completes="non-completes "len(sscs)="(length state-status-counts) " state-status-counts: " - (apply conc - (map (lambda (x) - (conc - (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | ")) - state-status-counts)) - - ); end debug:print - (if tl-test-id - (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct - )))))) - + ;; NB// Pass the db so it is part of the transaction + (list newstate newstatus))) + +(define (db:set-state-status-and-roll-up-run dbstruct run-id curr-state curr-status) + (mutex-lock! *db-transaction-mutex*) + (db:with-db + dbstruct #f #f + (lambda (db) + (let ((tr-res + (sqlite3:with-transaction + db + (lambda () + (let* ((state-status-counts (db:get-all-state-status-counts-for-run dbstruct run-id)) + (state-stauses (db:roll-up-rules state-status-counts #f #f )) + (newstate (car state-stauses)) + (newstatus (cadr state-stauses))) + (if (or (not (eq? newstate curr-state)) (not (eq? newstatus curr-status))) + (db:set-run-state-status dbstruct run-id newstate newstatus ))))))) (mutex-unlock! *db-transaction-mutex*) - (if (and test-id state status (equal? status "AUTO")) - (db:test-data-rollup dbstruct run-id test-id status)) - tr-res))))) + tr-res)))) + + +(define (db:get-all-state-status-counts-for-run dbstruct run-id) + (let* ((test-count-recs (db:with-db + dbstruct #f #f + (lambda (db) + (sqlite3:map-row + (lambda (state status count) + (make-dbr:counts state: state status: status count: count)) + db + "SELECT state,status,count(id) FROM tests WHERE run_id=? GROUP BY state,status;" + run-id ))))) + test-count-recs)) + + ;; BBnote: db:get-all-state-status-counts-for-test returns dbr:counts object aggregating state and status of items of a given test, *not including rollup state/status* (define (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path item-state-in item-status-in) - - (let* ((test-info (db:get-test-info dbstruct run-id test-name item-path)) (item-state (or item-state-in (db:test-get-state test-info))) (item-status (or item-status-in (db:test-get-status test-info))) (other-items-count-recs (db:with-db dbstruct #f #f @@ -3891,11 +3949,11 @@ ;; TESTS '(register-test "INSERT OR IGNORE INTO tests (run_id,testname,event_time,item_path,state,status) VALUES (?,?,strftime('%s','now'),?,'NOT_STARTED','n/a');") ;; Test state and status '(set-test-state "UPDATE tests SET state=? WHERE id=?;") '(set-test-status "UPDATE tests SET state=? WHERE id=?;") - '(state-status "UPDATE tests SET state=?,status=? WHERE id=?;") ;; DONE + '(state-status "UPDATE tests SET state=?,status=? WHERE id=?;") ;; D/ONE '(state-status-msg "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;") ;; DONE ;; Test comment '(set-test-comment "UPDATE tests SET comment=? WHERE id=?;") '(set-test-start-time "UPDATE tests SET event_time=strftime('%s','now') WHERE id=?;") ;; DONE '(pass-fail-counts "UPDATE tests SET pass_count=?,fail_count=? WHERE id=?;") Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -470,11 +470,12 @@ (begin (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2) (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt ))) (mutex-unlock! m) - ;; no point in sticking around. Exit now. + ;; no point in sticking around. Exit now. But run end of run before exiting? + (launch:end-of-run-check run-id) (exit))) (if (hash-table-ref/default misc-flags 'keep-going #f) (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta @@ -807,12 +808,11 @@ ;; only state and status needed - use lazy routine (testinfo (rmt:get-testinfo-state-status run-id test-id))) ;; Am I completed? (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should *not* be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED")) (let ((new-state (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status - ;; "COMPLETED" - ;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test + ;; "COMPLETED" ;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test ) (new-status (cond ((not (launch:einf-exit-status exit-info)) "FAIL") ;; job failed to run ... (vector-ref exit-info 1) ((eq? (launch:einf-rollup-status exit-info) 0) ;; (vector-ref exit-info 3) ;; if the current status is AUTO then defer to the calculated value (i.e. leave this AUTO) @@ -839,16 +839,95 @@ (if (not (equal? item-path "")) (tests:summarize-items run-id test-id test-name #f)) (tests:summarize-test run-id test-id) ;; don't force - just update if no (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) (mutex-unlock! m) + (launch:end-of-run-check run-id ) (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n") (if (not (launch:einf-exit-status exit-info)) (exit 4)))) ))) +;; Spec for End of test +;; At end of each test call, after marking self as COMPLETED do run-state-status-rollup +;; At transition to run COMPLETED/X do hooks +;; Definition: test_dead if event_time + duration + 1 minute? < current_time AND +;; we can prove the process is not alive (ssh host pstree -A pid) +;; if dead safe to mark the test as killed in the db +;; State/status table +;; new +;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup +;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na +;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED +;; 0 RUNNING ==> this is actually the first condition, should not get here + +(define (launch:end-of-run-check run-id ) + (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id)) + (running-cnt (rmt:get-count-tests-running-for-run-id run-id)) + (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id))) + (current-state (rmt:get-run-state run-id)) + (current-status (rmt:get-run-status run-id))) + ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing + (debug:print 0 *default-log-port* "rollup run state/status") + (rmt:set-state-status-and-roll-up-run run-id current-state current-status) + + (cond + ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" )) + (debug:print 0 *default-log-port* "look for post hook.") + (runs:run-post-hook run-id)) + ((> running-cnt 3) + (debug:print 0 *default-log-port* "There are " running-cnt " tests running." )) + ((> running-cnt 0) + (debug:print 0 *default-log-port* "running cnt > 0 but <= 3 kill-running-tests-if-dead" ) + (let ((kill-cnt (launch:kill-tests-if-dead run-id))) + (if (and all-test-launched (equal? all-test-launched "yes") (eq? kill-cnt running-cnt)) + (launch:end-of-run-check run-id)))) ;;todo + (else (debug:print 0 *default-log-port* "Should it get here?? May be everything is not launched yet. Running test cnt:" running-cnt " Not completed test cnt:" not-completed-cnt) + (let* ((not-completed-tests (rmt:get-tests-for-run run-id "%" `("NOT_STARTED" "RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f))) + (if (> (length not-completed-tests) 0) + (let loop ((running-test (car not-completed-tests)) + (tal (cdr not-completed-tests))) + (let* ((test-name (vector-ref running-test 2)) + (item-path (vector-ref running-test 11))) + (debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed") + (if (not (null? tal)) + (loop (car tal) (cdr tal))))))))))) + +(define (launch:is-test-alive host pid) +(if (and host pid (not (equal? host "n/a"))) +(let* ((cmd (conc "ssh " host " pstree -A " pid)) + (output (with-input-from-pipe cmd read-lines))) + (print "cmd: " cmd "\n op: " output ) + (if(eq? (length output) 0) + #f + #t)) +#t)) + +(define (launch:kill-tests-if-dead run-id) + (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f))) + (let loop ((running-test (car running-tests)) + (tal (cdr running-tests)) + (kill-cnt 0)) + (let* ((test-name (vector-ref running-test 2)) + (item-path (vector-ref running-test 11)) + (test-id (vector-ref running-test 0)) + (host (vector-ref running-test 6)) + (pid (rmt:test-get-top-process-pid run-id test-id)) + (event-time (vector-ref running-test 5)) + (duration (vector-ref running-test 12)) + (flag 0) + (curr-time (current-seconds))) + (if (and (< (+ event-time duration 600) curr-time) (not (launch:is-test-alive host pid))) ;;test has not updated duration in last 10 min then likely its not running but confirm before marking it as killed + (begin + (debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed") + (set! flag 1) + (rmt:set-state-status-and-roll-up-items run-id test-name item-path "KILLREQ" "n/a" #f))) + (if (not (null? tal)) + (loop (car tal) (cdr tal) (+ kill-cnt flag)) + (+ kill-cnt flag)))))) + ;; DO NOT USE - caching of configs is handled in launch:setup now. ;; (define (launch:cache-config) ;; if we have a linktree and -runtests and -target and the directory exists dump the config ;; to megatest-(current-seconds).cfg and symlink it to megatest.cfg Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.6525) +(define megatest-version 1.6528) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -109,13 +109,14 @@ Launching and managing runs -run : run all tests or as specified by -testpatt -remove-runs : remove the data for a run, requires -runname and -testpatt Optionally use :state and :status, use -keep-records to remove only - the run data. - -kill-runs : kill existing run(s) (all incomplete tests killed) - -kill-rerun : kill an existing run (all incomplete tests killed and run is rerun) + the run data. Use -kill-wait to override the 10 second + per test wait after kill delay. + -kill-runs : kill existing run(s) (all incomplete tests killed) + -kill-rerun : kill an existing run (all incomplete tests killed and run is rerun) -set-state-status X,Y : set state to X and status to Y, requires controls per -remove-runs -rerun FAIL,WARN... : force re-run for tests with specificed status(s) -rerun-clean : set all tests not COMPLETED+PASS,WARN,WAIVED to NOT_STARTED,n/a and then run the specified testpatt with -preclean -rerun-all : set all tests to NOT_STARTED,n/a and run with -preclean @@ -129,11 +130,11 @@ -no-cache : do not use the cached config files. -one-pass : launch as many tests as you can but do not wait for more to be ready -remove-keep N : remove all but N most recent runs per target; use '-actions, -age, -precmd' -age : 120d,3h,20m to apply only to runs older than the specified age. NB// M=month, m=minute - -actions : print,remove-runs,archive to specify action to take + -actions [,...] : actions to take; print,remove-runs,archive,kill-runs -precmd : insert a wrapper command in front of the commands run Selectors (e.g. use for -runtests, -remove-runs, -set-state-status, -list-runs etc.) -target key1/key2/... : run for key1, key2, etc. -reqtarg key1/key2/... : run for key1, key2, etc. but key1/key2 must be in runconfigs @@ -357,10 +358,11 @@ "-src-target" "-src-runname" "-diff-email" "-sync-to" "-pgsync" + "-kill-wait" ;; wait this long before removing test (default is 10 sec) "-diff-html" ) (list "-h" "-help" "--help" "-manual" "-version" @@ -380,23 +382,25 @@ "-clean-cache" "-no-cache" "-cache-db" "-use-db-cache" "-prepend-contour" + ;; misc "-repl" "-lock" "-unlock" "-list-servers" "-kill-servers" "-run-wait" ;; wait on a run to complete (i.e. no RUNNING) - "-one-pass" ;; + "-one-pass" ;; "-local" ;; run some commands using local db access - "-generate-html" - "-generate-html-structure" + "-generate-html" + "-generate-html-structure" "-list-run-time" "-list-test-time" + ;; misc queries "-list-disks" "-list-targets" "-list-db-targets" "-show-runconfig" @@ -422,10 +426,11 @@ "-convert-to-norm" "-convert-to-old" "-import-megatest.db" "-sync-to-megatest.db" + "-sync-brute-force" "-logging" "-v" ;; verbose 2, more than normal (normal is 1) "-q" ;; quiet 0, errors/warnings only "-diff-rep" @@ -1240,11 +1245,11 @@ (statuses (string-split (or (args:get-arg "-status") "") ",")) (run-id (db:get-value-by-header run header "id")) (runname (db:get-value-by-header run header "runname")) (states (string-split (or (args:get-arg "-state") "") ",")) (tests (if tests-spec - (db:dispatch-query access-mode rmt:get-tests-for-run db:get-tests-for-run run-id testpatt states statuses #f #f #f 'testname 'asc ;; (db:get-tests-for-run dbstruct run-id testpatt '() '() #f #f #f 'testname 'asc + (rmt:get-tests-for-run run-id testpatt states statuses #f #f #f 'testname 'asc ;; (db:get-tests-for-run dbstruct run-id testpatt '() '() #f #f #f 'testname 'asc ;; use qryvals if test-spec provided (if tests-spec (string-intersperse adj-tests-spec ",") ;; db:test-record-fields #f) @@ -1376,11 +1381,11 @@ (let* ((run-id (db:get-value-by-header run header "id")) (runname (db:get-value-by-header run header "runname")) (states (string-split (or (args:get-arg "-state") "") ",")) (statuses (string-split (or (args:get-arg "-status") "") ",")) (tests (if tests-spec - (db:dispatch-query access-mode rmt:get-tests-for-run db:get-tests-for-run run-id testpatt states statuses #f #f #f 'testname 'asc ;; (db:get-tests-for-run dbstruct run-id testpatt '() '() #f #f #f 'testname 'asc + (rmt:get-tests-for-run run-id testpatt states statuses #f #f #f 'testname 'asc ;; (db:get-tests-for-run dbstruct run-id testpatt '() '() #f #f #f 'testname 'asc ;; use qryvals if test-spec provided (if tests-spec (string-intersperse adj-tests-spec ",") ;; db:test-record-fields #f) @@ -1501,11 +1506,11 @@ ;; "\n rundir: " (get-value-by-fieldname test test-field-index "") ;; (sdb:qry 'getstr ;; (filedb:get-path *fdb* ;; (db:test-get-rundir test) ;; ) ) ;; Each test ;; DO NOT remote run - (let ((steps (db:dispatch-query access-mode rmt:get-steps-for-test db:get-steps-for-test run-id (db:test-get-id test)))) ;; (db:get-steps-for-test dbstruct run-id (db:test-get-id test)))) + (let ((steps (rmt:get-steps-for-test run-id (db:test-get-id test)))) ;; (db:get-steps-for-test dbstruct run-id (db:test-get-id test)))) (for-each (lambda (step) (format #t " Step: ~20a State: ~10a Status: ~10a Time ~22a\n" (tdb:step-get-stepname step) @@ -2292,10 +2297,14 @@ 'adj-testids 'old2new ;; 'new2old ) (set! *didsomething* #t))) + +(when (args:get-arg "-sync-brute-force") + ((server:get-bruteforce-syncer (db:setup #t) persist-until-sync: #t)) + (set! *didsomething* #t)) (if (args:get-arg "-sync-to-megatest.db") (let* ((dbstruct (db:setup #f)) (tmpdbpth (cdr (dbr:dbstruct-tmpdb dbstruct))) (lockfile (conc tmpdbpth ".lock")) Index: mtut.scm ================================================================== --- mtut.scm +++ mtut.scm @@ -217,10 +217,12 @@ ("-run-name" . n) ("-mode-patt" . o) ("-test-patt" . p) ;; idea, enhance margs ("-test-patt" "-testpatt") => yields one value in "-test-patt" ("-status" . s) ("-target" . t) + ("-reqtarg" . R) + ("-tag-expr" . x) ;; misc ("-debug" . #f) ;; for *verbosity* > 2 ("-load" . #f) ;; load and exectute a scheme file ("-log" . #f) @@ -241,10 +243,11 @@ ;; misc ("-repl" . #f) ("-immediate" . I) ("-preclean" . r) ("-prepend-contour" . w) + ("-force" . F) ("-list-pkt-keys" . #f) )) ;; alist to map actions to old megatest commands (define *action-keys* @@ -251,10 +254,12 @@ '((run . "-run") (rerun-clean . "-rerun-clean") (rerun-all . "-rerun-all") (kill-run . "-kill-runs") (kill-rerun . "-kill-rerun") + (lock . "-lock") + (unlock . "-unlock") (sync . "") (archive . "-archive") (set-ss . "-set-state-status") (remove . "-remove-runs"))) @@ -823,11 +828,12 @@ (define (contains list x) (cond ((null? list) #f) ((eq? (car list) x) #t) (else (contains (cdr list) x)))) ;; collect all needed data and create run pkts for contours with changed inputs ;; (define (generate-run-pkts mtconf toppath) - (let ((std-runname (conc "sched" (time->string (seconds->local-time (current-seconds)) "%M%H%d")))) + (let ((std-runname (conc "sched" (time->string (seconds->local-time (current-seconds)) "%M%H%d"))) + (packets-generated 0)) (common:with-queue-db mtconf (lambda (pktsdirs pktsdir pdb) (let* ((rgconfdat (find-and-read-config (conc toppath "/runconfigs.config"))) (rgconf (car rgconfdat)) @@ -1004,15 +1010,27 @@ (lambda (cmd) ;;(print "cmd: " cmd) ;;(print "Areas: " all-areas) (for-each (lambda (area) + ;;(print "Area: " area) + ;;(print "Target: " runkey) + ;;(print "OR: " (or (string->number (if (configf:lookup mtconf "setup" "max_packets_per_run") (configf:lookup mtconf "setup" "max_packets_per_run") "10000" )))) + ;;(print "Packets generated: " packets-generated) + ;;(print "Comparison: " (< packets-generated 4)) + ;;(print "Full Comparison: " + ;; (and (< packets-generated (or (string->number (if (configf:lookup mtconf "setup" "max_packets_per_run") (configf:lookup mtconf "setup" "max_packets_per_run") "10000" )) 10000)) + ;; (if (args:get-arg "-target") + ;; (if (string= (args:get-arg "-target") runkey) (area-allowed? area "area-needs-to-be-run" runkey contour #f) #f) + ;; (area-allowed? area "area-needs-to-be-run" runkey contour #f)))) + ;;(print "Area Allowed: " (area-allowed? area "area-needs-to-be-run" runkey contour #f)) ;Add code to check whether area is valid (if - (if (args:get-arg "-target") + ;; This code checks whether the target has been passed in via argument, and only runs the specified target + (and (< packets-generated (or (string->number (if (configf:lookup mtconf "setup" "max_packets_per_run") (configf:lookup mtconf "setup" "max_packets_per_run") "10000" )) 10000)) (if (args:get-arg "-target") (if (string= (args:get-arg "-target") runkey) (area-allowed? area "area-needs-to-be-run" runkey contour #f) #f) - (area-allowed? area "area-needs-to-be-run" runkey contour #f)) + (area-allowed? area "area-needs-to-be-run" runkey contour #f))) (let* ((script (car cmd)) (params (cdr cmd)) (cmd (conc script " " contour " " area " " runkey " " std-runname " " action " " params)) (res (handle-exceptions @@ -1091,13 +1109,16 @@ (aval (or (configf:lookup mtconf "areas" area) "")) (aval-alist (common:val->alist aval)) (targets (map-targets mtconf aval-alist runkey area contour))) (pp targets) - (for-each (lambda (target) (create-run-pkt mtconf action area runkey target new-runname mode-patt + (for-each (lambda (target) + (create-run-pkt mtconf action area runkey target new-runname mode-patt tag-expr pktsdir reason contour sched dbdest append - runtrans)) targets) + runtrans) + (set! packets-generated (+ packets-generated 1)) + ) targets) ;; Add filter for targets ;;(create-run-pkt mtconf action area runkey target runname ;; pktsdir reason contour dbdest append ;; runtrans) @@ -1105,11 +1126,11 @@ ;;(push-run-spec torun contour ;; (if optional ;; we need to be able to differentiate same contour, different behavior. ;; (conc runkey ":" optional) ;; NOTE: NOT COMPLETELY IMPLEMENTED. DO NOT USE ;; runkey) ;; key-msg) - ))))))) (filter (lambda (x) (if (not (args:get-arg "-area")) #t (if (string= x (args:get-arg "-area")) #t #f))) all-areas)) + ))))) (print "Skipping area: " area " and target: " runkey " due to packets-generated: " packets-generated " higher than " (configf:lookup mtconf "setup" "max_packets_per_run") ) ) ) (filter (lambda (x) (if (not (args:get-arg "-area")) #t (if (string= x (args:get-arg "-area")) #t #f))) all-areas)) ) val-alist)) ;; iterate over the param split by ;\s* ;; fossil scm based triggers ;; ((fossil) @@ -1447,11 +1468,12 @@ (set! *default-log-port* oup) ))) (if *action* (case (string->symbol *action*) - ((run remove rerun rerun-clean rerun-all set-ss archive kill list kill-run kill-rerun) + ((run remove rerun rerun-clean rerun-all set-ss archive kill list kill-run kill-rerun lock unlock) + (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir"))) (mtconf (car mtconfdat)) (area (args:get-arg "-area")) ;; look up the area to dispatch to from [areas] section (areasec (if area (configf:lookup mtconf "areas" area) #f)) (areadat (if areasec (common:val->alist areasec) #f)) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -676,10 +676,14 @@ (define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f)) (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps))) (define (rmt:get-count-tests-running-for-run-id run-id) (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id))) + +(define (rmt:get-not-completed-cnt run-id) + (rmt:send-receive 'get-not-completed-cnt run-id (list run-id))) + ;; Statistical queries (define (rmt:get-count-tests-running run-id) (rmt:send-receive 'get-count-tests-running run-id (list run-id))) @@ -692,10 +696,14 @@ ;; state and status are extra hints not usually used in the calculation ;; (define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment) (rmt:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment))) + +(define (rmt:set-state-status-and-roll-up-run run-id state status) + (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id state status))) + (define (rmt:update-pass-fail-counts run-id test-name) (rmt:general-call 'update-pass-fail-counts run-id test-name test-name test-name)) (define (rmt:top-test-set-per-pf-counts run-id test-name) @@ -752,13 +760,21 @@ (rmt:send-receive 'lock/unlock-run #f (list run-id lock unlock user))) ;; set/get status (define (rmt:get-run-status run-id) (rmt:send-receive 'get-run-status #f (list run-id))) + +(define (rmt:get-run-state run-id) + (rmt:send-receive 'get-run-state #f (list run-id))) + (define (rmt:set-run-status run-id run-status #!key (msg #f)) (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) + +(define (rmt:set-run-state-status run-id state status ) + (rmt:send-receive 'set-run-state-status #f (list run-id state status))) + (define (rmt:update-run-event_time run-id) (rmt:send-receive 'update-run-event_time #f (list run-id))) (define (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update #!key (sort-order "asc")) ;; fields of #f uses default Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -520,11 +520,13 @@ (runs:update-all-test_meta #f) ;; run the run prehook if there are no tests yet run for this run: ;; (runs:run-pre-hook run-id) - + ;; mark all test launced flag as false in the meta table + (rmt:set-var (conc "lunch-complete-" run-id) "no") + (rmt:set-run-state-status run-id "new" "n/a") ;; now add non-directly referenced dependencies (i.e. waiton) ;;====================================================================== ;; refactoring this block into tests:get-full-data ;; ;; What happended, this code is now duplicated in tests!? @@ -674,16 +676,16 @@ ;; recursive call to self (runs:run-tests target runname test-patts user flags run-count: (- run-count 1))))) (debug:print-info 0 *default-log-port* "No tests to run"))) (debug:print-info 4 *default-log-port* "All done by here") ;; TODO: try putting post hook call here - (if (eq? run-count 0) - (begin - (debug:print-info 0 *default-log-port* "Calling Post Hook") - (debug:print-info 2 *default-log-port* " run-count " run-count) - (runs:run-post-hook run-id)) - (debug:print-info 2 *default-log-port* "Not calling post hook runcount = " run-count )) + ;(if (eq? run-count 0) + ; (begin + ; (debug:print-info 0 *default-log-port* "Calling Post Hook") + ; (debug:print-info 2 *default-log-port* " run-count " run-count) + ; (runs:run-post-hook run-id)) + ; (debug:print-info 2 *default-log-port* "Not calling post hook runcount = " run-count )) (rmt:tasks-set-state-given-param-key task-key "done") ;; (sqlite3:finalize! tasks-db) )) @@ -1647,11 +1649,13 @@ (loop (car reg)(cdr reg) '() reruns)) (else (debug:print-info 4 *default-log-port* "cond branch - " "rtq-9") (debug:print-info 4 *default-log-port* "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) ))) ;; end loop on sorted test names - + ;; this is the point where everything is launced and now you can mark the run in metadata table as all launced + (rmt:set-var (conc "lunch-complete-" run-id) "yes") + ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) (prev-num-running 0)) @@ -2007,11 +2011,14 @@ ((print) (print " " (simple-run-runname run) " " (time->string (seconds->local-time (simple-run-event_time run)) "WW%V.%u %H:%M:%S") " " (if remove "REMOVE" ""))) ((remove-runs) - (if remove (system (conc precmd " megatest -remove-runs -target " target " -runname " (simple-run-runname run) " -testpatt %")))) + (if remove (system (conc precmd " megatest -remove-runs -target " target " -runname " (simple-run-runname run) " -testpatt %" + (if (member 'kill-runs actions) ;; if kill-runs is specified then set -kill-wait to 0 + " -kill-wait 0" + ""))))) ((archive) (if remove (system (conc precmd " megatest -archive save-remove -target " target " -runname " (simple-run-runname run) " -testpatt %")))) ((kill-runs) (if remove (system (conc precmd " megatest -kill-runs -target " target " -runname " (simple-run-runname run) " -testpatt %")))) )) @@ -2144,11 +2151,11 @@ (toplevel-retries (make-hash-table)) ;; try three times to loop through and remove top level tests (test-retry-time (make-hash-table)) (backgrounded-remove-status (make-hash-table)) (backgrounded-remove-last-visit (make-hash-table)) (backgrounded-remove-result (make-hash-table)) - (allow-run-time 10)) ;; seconds to allow for killing tests before just brutally killing 'em + (allow-run-time (string->number (or (args:get-arg "-kill-wait") "10")))) ;; seconds to allow for killing tests before just brutally killing 'em (let loop ((test (car sorted-tests)) (tal (cdr sorted-tests))) (let* ((test-id (db:test-get-id test)) (new-test-dat (rmt:get-test-info-by-id run-id test-id))) (if (not new-test-dat) @@ -2491,17 +2498,23 @@ (let* ((db #f) (rundat (mt:get-runs-by-patt keys runname target)) (header (vector-ref rundat 0)) (runs (vector-ref rundat 1))) (for-each (lambda (run) - (let ((run-id (db:get-value-by-header run header "id"))) + (let ((run-id (db:get-value-by-header run header "id")) + (str (if lock + "lock" + "unlock"))) (if (or lock (and unlock - (begin + (or (args:get-arg "-force") + (begin (print "Do you really wish to unlock run " run-id "?\n y/n: ") - (equal? "y" (read-line))))) + (equal? "y" (read-line)))))) + (begin (rmt:lock/unlock-run run-id lock unlock user) + (debug:print-info 0 *default-log-port* "Done " str " on run id " run-id)) (debug:print-info 0 *default-log-port* "Skipping lock/unlock on " run-id)))) runs))) ;;====================================================================== ;; Rollup runs ;;====================================================================== Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -328,11 +328,11 @@ (lock-file (conc areapath "/logs/server-start.lock"))) (if (> (- (current-seconds) when-run) run-delay) (begin (common:simple-file-lock-and-wait lock-file expire-time: 15) (server:run areapath) - (thread-sleep! 5) ;; don't release the lock for at least a few seconds + (thread-sleep! 2) ;; don't release the lock for at least a few seconds (common:simple-file-release-lock lock-file))) (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) @@ -500,207 +500,117 @@ ;; (server:have-sync-lock?)) ;; (else #f)))) ;; moving this here as it needs access to db and cannot be in common. ;; -(define (server:writable-watchdog dbstruct) - (thread-sleep! 10) ;; delay for startup - (let* ((legacy-sync (common:run-sync?)) - (sqlite-exe (or (get-environment-variable "MT_SQLITE3_EXE"))) ;; defined in cfg.sh + +(define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f)) + (let* ((sqlite-exe (or (get-environment-variable "MT_SQLITE3_EXE"))) ;; defined in cfg.sh (sync-log (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log"))) (tmp-area (common:get-db-tmp-area)) (tmp-db (conc tmp-area "/megatest.db")) (staging-file (conc *toppath* "/.megatest.db")) (mtdbfile (conc *toppath* "/megatest.db")) - (lockfile (conc tmp-db ".lock")) - (sync-cmd (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log)) - (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 30))) - (if (and (not (args:get-arg "-sync-to-megatest.db")) ;; conditions under which we do not run the sync - (args:get-arg "-server")) - (let loop () - (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for server.minimum-intersync-delay seconds ["min-intersync-delay"]") - (thread-sleep! min-intersync-delay) - (if (common:simple-file-lock lockfile) - (begin - - (if (not (configf:lookup *configdat* "server" "disable-db-snapshot")) - (common:snapshot-file mtdbfile subdir: ".db-snapshot")) - (delete-file* staging-file) - (let* ((start-time (current-milliseconds)) - (res (system sync-cmd))) - (cond - ((eq? 0 res) - (delete-file* (conc mtdbfile ".backup")) - (if (eq? 0 (file-size sync-log)) - (delete-file sync-log)) - (system (conc "/bin/mv " staging-file " " mtdbfile)) - (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "(/ (- (current-milliseconds) start-time) 1000)" sec") - #t) - (else - (system (conc "/bin/cp "sync-log" "sync-log".fail")) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") - (if (file-exists? (conc mtdbfile ".backup")) - (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile))))) - (common:simple-file-release-lock lockfile))) - ;; else - (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.") - ) ;; end if got lockfile - - ;; keep going unless time to exit - ;; - (if (not *time-to-exit*) - (let delay-loop ((count 0)) - ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) - - (if (and (not *time-to-exit*) - (< count 6)) ;; was 11, changing to 4. - (begin - (thread-sleep! 1) - (delay-loop (+ count 1)))) - (if (not *time-to-exit*) (loop)))) - ;; time to exit, close the no-sync db here - ;; (db:no-sync-close-db no-sync-db) - (if (common:low-noise-print 30) - (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) - )))))) - -;; (let ((legacy-sync (common:run-sync?))) -;; (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300)) -;; (debug-mode (debug:debug-mode 1)) -;; (last-time (current-seconds)) -;; (no-sync-db (db:open-no-sync-db)) -;; (sync-duration 0) ;; run time of the sync in milliseconds -;; ;;(this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))) -;; ) -;; (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls -;; (debug:print-info 2 *default-log-port* "Periodic sync thread started.") -;; (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id) );; " this-wd-num="this-wd-num) -;; (if (and legacy-sync (not *time-to-exit*)) -;; (let* (;;(dbstruct (db:setup)) -;; (mtdb (dbr:dbstruct-mtdb dbstruct)) -;; (mtpath (db:dbdat-get-path mtdb)) -;; (tmp-area (common:get-db-tmp-area)) -;; (lockfile (conc tmp-area "/megatest.db.lock")) -;; (start-file (conc tmp-area "/.start-sync")) -;; (end-file (conc tmp-area "/.end-sync"))) -;; (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") -;; (let loop () -;; ;; sync for filesystem local db writes -;; ;; -;; -;; (mutex-lock! *db-multi-sync-mutex*) -;; (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write -;; (sync-in-progress *db-sync-in-progress*) -;; -;; (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 5)) -;; (should-sync (and (not *time-to-exit*) -;; (> (- (current-seconds) *db-last-sync*) min-intersync-delay))) ;; sync every five seconds minimum, deprecated logic, can probably be removed -;; (start-time (current-seconds)) -;; (cpu-load-adj (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f))) -;; (mt-mod-time (file-modification-time mtpath)) -;; (last-sync-start (if (common:file-exists? start-file) -;; (file-modification-time start-file) -;; 0)) -;; (last-sync-end (if (common:file-exists? end-file) -;; (file-modification-time end-file) -;; 10)) -;; (sync-period (+ 3 (* cpu-load-adj 30))) ;; as adjusted load increases increase the sync period -;; (recently-synced (and (< (- start-time mt-mod-time) sync-period) ;; not useful if sync didn't modify megatest.db! -;; (< mt-mod-time last-sync-start))) -;; (sync-done (<= last-sync-start last-sync-end)) -;; (sync-stale (> start-time (+ last-sync-start sync-stale-seconds))) -;; (will-sync-pre (and (not *time-to-exit*) ;; do not start a sync if we are in the process of exiting -;; have-lock? -;; (or need-sync should-sync) -;; (or sync-done sync-stale) -;; (not sync-in-progress) -;; (not recently-synced))) -;; (will-sync (if will-sync-pre -;; ;; delay get lock until we decide to sync -;; #t ;; (server:have-sync-lock?) -;; #f))) -;; ;; if another server is syncing, postpone sync -;; (if (and will-sync-pre (not will-sync)) -;; (set! *db-last-sync* start-time)) -;; (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop. need-sync="need-sync" sync-in-progress=" sync-in-progress -;; " should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync -;; " sync-done=" sync-done " sync-period=" sync-period) -;; (if (and (> sync-period 5) -;; (common:low-noise-print 30 "sync-period")) -;; (debug:print-info 0 *default-log-port* "Increased sync period due to long sync times, sync took: " sync-period " seconds.")) -;; ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) -;; ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) -;; (if will-sync (set! *db-sync-in-progress* #t)) -;; (mutex-unlock! *db-multi-sync-mutex*) -;; (if will-sync -;; (let (;; (max-sync-duration (configf:lookup-number *configdat* "server" "max-sync-duration")) ;; KEEPING THIS AVAILABLE BUT SHOULD NOT USE, I'M PRETTY SURE IT DOES NOT WORK! -;; (sync-start (current-milliseconds))) -;; (with-output-to-file start-file (lambda ()(print (current-process-id)))) -;; -;; ;; put lock here -;; -;; ;; (if (or (not max-sync-duration) -;; ;; (< sync-duration max-sync-duration)) ;; NOTE: db:sync-to-megatest.db keeps track of time of last sync and syncs incrementally -;; (let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive -;; (set! sync-duration (- (current-milliseconds) sync-start)) -;; (if (> res 0) ;; some records were transferred, keep the db alive -;; (begin -;; (mutex-lock! *heartbeat-mutex*) -;; (set! *db-last-access* (current-seconds)) -;; (mutex-unlock! *heartbeat-mutex*) -;; (debug:print-info 0 *default-log-port* "sync called, " res " records transferred.")) -;; (debug:print-info 2 *default-log-port* "sync called but zero records transferred"))))) -;; ;; ;; TODO: factor this next routine out into a function -;; ;; (with-input-from-pipe ;; this should not block other threads but need to verify this -;; ;; (conc "megatest -sync-to-megatest.db -m testsuite:" (common:get-area-name) ":" *toppath*) -;; ;; (lambda () -;; ;; (let loop ((inl (read-line)) -;; ;; (res #f)) -;; ;; (if (eof-object? inl) -;; ;; (begin -;; ;; (set! sync-duration (- (current-milliseconds) sync-start)) -;; ;; (cond -;; ;; ((not res) -;; ;; (debug:print 0 *default-log-port* "ERROR: sync from /tmp db to megatest.db appears to have failed. Recommended that you stop your runs and run \"megatest -cleanup-db\"")) -;; ;; ((> res 0) -;; ;; (mutex-lock! *heartbeat-mutex*) -;; ;; (set! *db-last-access* (current-seconds)) -;; ;; (mutex-unlock! *heartbeat-mutex*)))) -;; ;; (let ((num-synced (let ((matches (string-match "^Synced (\\d+).*$" inl))) -;; ;; (if matches -;; ;; (string->number (cadr matches)) -;; ;; #f)))) -;; ;; (loop (read-line) -;; ;; (or num-synced res)))))))))) -;; (if will-sync -;; (begin -;; (mutex-lock! *db-multi-sync-mutex*) -;; (set! *db-sync-in-progress* #f) -;; (set! *db-last-sync* start-time) -;; (with-output-to-file end-file (lambda ()(print (current-process-id)))) -;; -;; ;; release lock here -;; ;; (server:release-sync-lock) -;; (mutex-unlock! *db-multi-sync-mutex*))) -;; (if (and debug-mode -;; (> (- start-time last-time) 60)) -;; (begin -;; (set! last-time start-time) -;; (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*)))))) -;; -;; ;; keep going unless time to exit -;; ;; -;; (if (not *time-to-exit*) -;; (let delay-loop ((count 0)) -;; ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) -;; -;; (if (and (not *time-to-exit*) -;; (< count 6)) ;; was 11, changing to 4. -;; (begin -;; (thread-sleep! 1) -;; (delay-loop (+ count 1)))) -;; (if (not *time-to-exit*) (loop)))) -;; ;; time to exit, close the no-sync db here -;; (db:no-sync-close-db no-sync-db) -;; (if (common:low-noise-print 30) -;; (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))))) ;;" this-wd-num="this-wd-num))))))) + (lockfile (common:get-sync-lock-filepath)) + (sync-cmd-core (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log)) + (sync-cmd (if fork-to-background + (conc "/usr/bin/env NBFAKE_LOG="*toppath*"/logs/last-server-sync-"(current-process-id)".log nbfake \""sync-cmd-core" && /bin/mv -f " staging-file " " mtdbfile" \"") + sync-cmd-core)) + (default-min-intersync-delay 2) + (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: default-min-intersync-delay)) + (default-duty-cycle 0.1) + (duty-cycle (configf:lookup-number *configdat* "server" "sync-duty-cycle" default: default-duty-cycle)) + (last-sync-seconds 10) ;; we will adjust this to a measurement and delay last-sync-seconds * (1 - duty-cycle) + (calculate-off-time (lambda (work-duration duty-cycle) + (* (/ (- 1 duty-cycle) duty-cycle) last-sync-seconds))) + (off-time min-intersync-delay) ;; adjusted in closure below. + (do-a-sync + (lambda () + (BB> "Start do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync) + (let* ((finalres + (let retry-loop ((num-tries 0)) + (if (common:simple-file-lock lockfile) + (begin + (cond + ((not (or fork-to-background persist-until-sync)) + (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for max of (server.minimum-intersync-delay="min-intersync-delay + " , off-time="off-time" seconds ]") + (thread-sleep! (max off-time min-intersync-delay))) + (else + (debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit..."))) + + (if (not (configf:lookup *configdat* "server" "disable-db-snapshot")) + (common:snapshot-file mtdbfile subdir: ".db-snapshot")) + (delete-file* staging-file) + (let* ((start-time (current-milliseconds)) + (res (system sync-cmd)) + (res2 + (cond + ((eq? 0 res) + (delete-file* (conc mtdbfile ".backup")) + (if (eq? 0 (file-size sync-log)) + (delete-file sync-log)) + (system (conc "/bin/mv " staging-file " " mtdbfile)) + + (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) + (set! off-time (calculate-off-time + last-sync-seconds + (cond + ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1)) + duty-cycle) + (else + (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid. Should be a number between 0 and 1, but "duty-cycle" was specified. Using default value: "default-duty-cycle) + default-duty-cycle)))) + + (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec") + (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time) + 'sync-completed) + (else + (system (conc "/bin/cp "sync-log" "sync-log".fail")) + (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") + (if (file-exists? (conc mtdbfile ".backup")) + (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile))) + #f)))) + (common:simple-file-release-lock lockfile) + (BB> "released lockfile: " lockfile) + (when (common:file-exists? lockfile) + (BB> "DID NOT ACTUALLY RELEASE LOCKFILE")) + res2) ;; end let + );; end begin + ;; else + (cond + (persist-until-sync + (thread-sleep! 1) + (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; we're in a fork-to-background so we need to succeed. Let's wait a jiffy and and try again. num-tries="num-tries" (waiting for lockfile="lockfile" to disappear)") + (retry-loop (add1 num-tries))) + (else + (thread-sleep! (max off-time (+ last-sync-seconds min-intersync-delay))) + (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.") + 'parallel-sync-in-progress)) + ) ;; end if got lockfile + ) + )) + (BB> "End do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync" and result="finalres) + finalres) + ) ;; end lambda + )) + do-a-sync)) + +(define (server:writable-watchdog dbstruct) + (thread-sleep! 1) ;; delay for startup + (let* ((do-a-sync (server:get-bruteforce-syncer dbstruct)) + (final-sync (server:get-bruteforce-syncer dbstruct fork-to-background: #t persist-until-sync: #t))) + (when (and (not (args:get-arg "-sync-to-megatest.db")) ;; conditions under which we do not run the sync + (args:get-arg "-server")) + + (let loop () + (do-a-sync) + (if (not *time-to-exit*) (loop))) ;; keep going unless time to exit + + ;; time to exit, close the no-sync db here + (final-sync) + + (if (common:low-noise-print 30) + (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) + ))))) + Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -993,10 +993,11 @@ ;; get runs changed since last sync ;; (define (tasks:sync-test-data dbh cached-info area-info) ;; (let* (( (define (tasks:sync-to-postgres configdat dest) + (print "In sync") (let* ((dbh (pgdb:open configdat dbname: dest)) (area-info (pgdb:get-area-by-path dbh *toppath*)) (cached-info (make-hash-table)) (start (current-seconds)) (test-patt (if (args:get-arg "-testpatt") @@ -1014,15 +1015,17 @@ (exit 1))) (if (and (not target) run-name) (begin (print "Error: Provide target") (exit 1))) - + ;(print "123") + ;(exit 1) (for-each (lambda (dtype) (hash-table-set! cached-info dtype (make-hash-table))) '(runs targets tests steps data)) (hash-table-set! cached-info 'start start) ;; when done we'll set sync times to this + ;(print "here") (if area-info (let* ((last-sync-time (vector-ref area-info 3)) (smallest-last-update-time (make-hash-table)) (changed (if (and target run-name) (rmt:get-run-record-ids target run-name (rmt:get-keys) test-patt) @@ -1035,10 +1038,11 @@ (area-tag (if (args:get-arg "-area-tag") (args:get-arg "-area-tag") (if (args:get-arg "-area") (args:get-arg "-area") "")))) + ;(print "here2") (if (and (equal? area-tag "") (not (pgdb:is-area-taged dbh (vector-ref area-info 0)))) (set! area-tag *default-area-tag*)) (if (not (equal? area-tag "")) (task:add-area-tag dbh area-info area-tag)) (if (or (not (null? test-ids)) (not (null? run-ids))) @@ -1051,15 +1055,15 @@ (tasks:sync-test-steps dbh cached-info test-step-ids smallest-last-update-time) (debug:print-info 0 *default-log-port* "syncing test data") (tasks:sync-test-gen-data dbh cached-info test-data-ids smallest-last-update-time) (print "----------done---------------"))) (let* ((smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f))) - (print "smallest-time :" smallest-time " last-sync-time " last-sync-time) + (debug:print-info 0 "smallest-time :" smallest-time " last-sync-time " last-sync-time) (if (not (and target run-name)) (if (or (and smallest-time (> smallest-time last-sync-time)) (and smallest-time (eq? last-sync-time 0))) (pgdb:write-sync-time dbh area-info smallest-time))))) ;;this needs to be changed (if (tasks:set-area dbh configdat) (tasks:sync-to-postgres configdat dest) (begin (debug:print 0 *default-log-port* "ERROR: unable to create an area record") #f))))) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -485,11 +485,14 @@ dcomment ",," ;; extra comma for status type ))) ;; This was run remote, don't think that makes sense. Perhaps not, but that is the easiest path for the moment. (rmt:csv->test-data run-id test-id dat) - (thread-sleep! 10) ;; add 10 second delay before quit incase rmt needs time to start a server. + ;; This was added in check-in a5adfa3f9a. Message was: "...added delay in set-values to allow for delayed write on server start" + ;; I'm inserting an arbitrary rmt: call to force/ensure that the server is available to (hopefully) prevent a communication issue. + (rmt:get-var "MEGATEST_VERSION") ;; this does NOTHING but ensure the server is reachable. This is almost certainly NOT needed :) + ;; BB - commentiong out arbitrary 10 second wait (thread-sleep! 10) ;; add 10 second delay before quit incase rmt needs time to start a server. ))) ;; need to update the top test record if PASS or FAIL and this is a subtest ;;;;;; (if (not (equal? item-path "")) ;;;;;; (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status #f) ;;;;;)