Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -39,10 +39,11 @@ get-var get-keys get-key-vals test-toplevel-num-items get-test-info-by-id + get-test-state-status-by-id get-steps-info-by-id get-data-info-by-id test-get-rundir-from-test-id get-count-tests-running-for-testname get-count-tests-running @@ -328,10 +329,11 @@ ((test-get-archive-block-info) (apply db:test-get-archive-block-info dbstruct params)) ;; TESTS ((test-toplevel-num-items) (apply db:test-toplevel-num-items dbstruct params)) ((get-test-info-by-id) (apply db:get-test-info-by-id dbstruct params)) + ((get-test-state-status-by-id) (apply db:get-test-state-status-by-id dbstruct params)) ((test-get-rundir-from-test-id) (apply db:test-get-rundir-from-test-id dbstruct params)) ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params)) ((get-count-tests-running) (apply db:get-count-tests-running dbstruct params)) ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params)) ;; ((delete-test-step-records) (apply db:delete-test-step-records dbstruct params)) @@ -351,10 +353,11 @@ ;; RUNS ((get-run-info) (apply db:get-run-info dbstruct params)) ((get-run-status) (apply db:get-run-status dbstruct params)) ((get-run-state) (apply db:get-run-state dbstruct params)) + ((get-run-state-status) (apply db:get-run-state-status dbstruct params)) ((set-run-status) (apply db:set-run-status dbstruct params)) ((set-run-state-status) (apply db:set-run-state-status dbstruct params)) ((update-tesdata-on-repilcate-db) (apply db:update-tesdata-on-repilcate-db dbstruct params)) ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) ((get-tests-for-run-state-status) (apply db:get-tests-for-run-state-status dbstruct params)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -468,11 +468,13 @@ (hash-table-set! sync-durations (conc fname".db") (- (current-milliseconds) start-time))) (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date") ))) dbfiles) - (if dbdat (dbfile:add-dbdat dbstruct #f dbdat))) + ;; WHY does the dbdat need to be added back? + (if dbdat (dbfile:add-dbdat dbstruct #f dbdat)) + ) #t) ;; options: ;; ;; 'killservers - kills all servers @@ -598,10 +600,11 @@ (mtdb (dbr:subdb-mtdb subdb)) (tmpdb (db:get-subdb dbstruct run-id)) (refndb (dbr:subdb-refndb subdb)) (newres (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) last-update tmpdb refndb mtdb))) ;; (stack-push! (dbr:subdb-dbstack subdb) tmpdb) + ;; BUG: verify this is really needed (dbfile:add-dbdat dbstruct run-id tmpdb) (set! res (cons newres res)))) subdbs) res)) @@ -904,10 +907,11 @@ "SELECT d.id,d.archive_area_name,disk_path,last_df,last_df_time FROM archive_disks AS d INNER JOIN archive_blocks AS b ON d.id=b.archive_disk_id WHERE b.id IN (" (string-intersperse (map conc res) ",") ") AND last_df > ?;") dneeded)) + ;; BUG: Verfify this is really needed (dbfile:add-dbdat dbstruct #f dbdat) blocks)) ;; returns id of the record, register a disk allocated to archiving and record it's last known ;; available space @@ -2066,12 +2070,13 @@ dbstruct #f #f (lambda (dbdat db) (sqlite3:for-each-row (lambda (status) (set! res status)) - db - "SELECT status FROM runs WHERE id=?;" + (db:get-cache-stmth + dbdat db + "SELECT status FROM runs WHERE id=?;" ) run-id) res)))) (define (db:get-run-state dbstruct run-id) (let ((res "n/a")) @@ -2079,12 +2084,27 @@ dbstruct #f #f (lambda (dbdat db) (sqlite3:for-each-row (lambda (status) (set! res status)) - db - "SELECT state FROM runs WHERE id=?;" + (db:get-cache-stmth + dbdat db + "SELECT state FROM runs WHERE id=?;" ) + run-id) + res)))) + +(define (db:get-run-state-status dbstruct run-id) + (let ((res (cons "n/a" "n/a"))) + (db:with-db + dbstruct #f #f + (lambda (dbdat db) + (sqlite3:for-each-row + (lambda (state status) + (set! res (cons state status))) + (db:get-cache-stmth + dbdat db + "SELECT state,status FROM runs WHERE id=?;" ) run-id) res)))) ;;====================================================================== @@ -2696,11 +2716,11 @@ (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (dbr:dbdat-dbh mtdb) run-id testrecs))) run-ids))) -;; Get test data using test_id, run-id is not used +;; Get test data using test_id ;; (define (db:get-test-info-by-id dbstruct run-id test-id) (db:with-db dbstruct run-id @@ -2713,10 +2733,26 @@ (set! res (vector id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update))) (db:get-cache-stmth dbdat db (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id=?;")) test-id) res)))) + +;; Get test state, status using test_id +;; +(define (db:get-test-state-status-by-id dbstruct run-id test-id) + (db:with-db + dbstruct + run-id + #f + (lambda (dbdat db) + (let ((res (cons #f #f))) + (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test + (lambda (state status) + (cons state status)) + (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=?;") + test-id) + res)))) ;; Use db:test-get* to access ;; Get test data using test_ids. NB// Only works within a single run!! ;; (define (db:get-test-info-by-ids dbstruct run-id test-ids) Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -37,13 +37,15 @@ stack files ports commonmod + ;; debugprint ) -;; (import debugprint) +(define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic +(define num-run-dbs (make-parameter 10)) ;; number of db's in .megatest ;;====================================================================== ;; R E C O R D S ;;====================================================================== @@ -191,17 +193,21 @@ (define (dbfile:run-id->path apath run-id) (conc apath"/"(dbfile:run-id->dbname run-id))) (define (db:dbname->path apath dbname) (conc apath"/"dbname)) + +(define (dbfile:run-id->dbnum run-id) + (cond + ((number? run-id) + (modulo run-id (num-run-dbs))) + ((not run-id) "main") ;; 0 or main? + (else run-id))) ;; POTENTIAL BUG: this implementation could produce a db file if run-id is neither #f or a number (define (dbfile:run-id->dbname run-id) - (cond - ((number? run-id) (conc ".megatest/" (modulo run-id 100) ".db")) - ((not run-id) (conc ".megatest/main.db")) - (else run-id))) + (conc ".megatest/"(dbfile:run-id->dbnum run-id)".db")) ;; Make the dbstruct, setup up auxillary db's and call for main db at least once ;; ;; called in http-transport and replicated in rmt.scm for *local* access. ;; @@ -241,12 +247,16 @@ (begin (stack-pop! (dbr:subdb-dbstack subdb)))))) ;; return a previously opened db handle to the stack of available handles (define (dbfile:add-dbdat dbstruct run-id dbdat) - (let* ((subdb (dbfile:get-subdb dbstruct run-id))) - (stack-push! (dbr:subdb-dbstack subdb) dbdat) + (let* ((subdb (dbfile:get-subdb dbstruct run-id)) + (dbstk (dbr:subdb-dbstack subdb)) + (count (stack-count dbstk))) + (if (> count 15) + (dbfile:print-err "WARNING: stack for "run-id".db is "count".")) + (stack-push! dbstk dbdat) dbdat)) ;; set up a subdb ;; (define (dbfile:init-subdb dbstruct run-id init-proc) @@ -886,11 +896,12 @@ ) ) tbls) (let* ((runtime (- (current-milliseconds) start-time)) (should-print (or ;; (debug:debug-mode 12) - (common:low-noise-print 120 "db sync" (> runtime 500))))) ;; low and high sync times treated as separate. + (common:low-noise-print 120 "db sync") + (> runtime 500)))) ;; low and high sync times treated as separate. (for-each (lambda (dat) (let ((tblname (car dat)) (count (cdr dat))) (set! tot-count (+ tot-count count)) @@ -1004,53 +1015,53 @@ ;; (mutex-unlock! *db-open-mutex*) dbdat)) (define dbfile:db-init-proc (make-parameter #f)) -(define keep-age-param (make-parameter 10)) -(define qif-slope (make-parameter 100)) +;; in xmaxima this gives a curve close to what I want: +;; plot2d ((exp(x/1.2)-1)/300, [x, 0, 10])$ +;; plot2d ((exp(x/1.5)-1)/40, [x, 0, 10])$ +;; plot2d ((exp(x/5)-1)/40, [x, 0, 20])$ +(define (dbfile:droop x) + (/ (- (exp (/ x 5)) 1) 40)) + ;; (* numqrys (/ 1 (qif-slope)))) ;; create a dropping near the db file in a qif dir ;; use count of such files to gate queries (queries in flight) ;; (define (dbfile:wait-for-qif fname run-id params) (let* ((thedir (pathname-directory fname)) - (destdir (conc thedir"/qif-"run-id)) - (uniqn (get-area-path-signature (conc (or run-id "main") params))) + (dbnum (dbfile:run-id->dbnum run-id)) + (destdir (conc thedir"/qif-"dbnum)) + (uniqn (get-area-path-signature (conc dbnum params))) (crumbn (conc destdir"/"(current-seconds)"-"uniqn"."(current-process-id)))) (if (not (file-exists? destdir))(create-directory (conc destdir"/attic") #t)) (let loop ((count 0)) (let* ((currlks (glob (conc destdir"/*"))) (numqrys (length currlks)) (delayval (cond ;; do a droopish curve - ((> numqrys 50) - (if (> numqrys 50) - (for-each - (lambda (f) - (if (> (- (current-seconds) - (handle-exceptions - exn - (current-seconds) ;; file is likely gone, just fake out - (file-modification-time f))) - (keep-age-param)) - (let* ((basedir (pathname-directory f)) - (filen (pathname-file f)) - (destf (conc basedir"/attic/"filen))) - (dbfile:print-err "Moving qif file "f" older than 10 seconds to "destf) - ;; (delete-file* f) - (handle-exceptions - exn - #t - (file-move f destf #t))))) - currlks)) - 1) ;; 50 and above => 1 - ((> numqrys 10) (* numqrys (/ 1 (qif-slope)))) ;; slope of 1/100 - ;; ((> numqrys 30) 0.50) - ;; ((> numqrys 25) 0.20) - ;; ((> numqrys 20) 0.10) - ;; ((> numqrys 15) 0.05) - ;; ((> numqrys 10) 0.01) + ((> numqrys 25) + (for-each + (lambda (f) + (if (> (- (current-seconds) + (handle-exceptions + exn + (current-seconds) ;; file is likely gone, just fake out + (file-modification-time f))) + (keep-age-param)) + (let* ((basedir (pathname-directory f)) + (filen (pathname-file f)) + (destf (conc basedir"/attic/"filen))) + (dbfile:print-err "Moving qif file "f" older than 10 seconds to "destf) + ;; (delete-file* f) + (handle-exceptions + exn + #t + (file-move f destf #t))))) + currlks) + 4) + ((> numqrys 0) (dbfile:droop numqrys)) ;; slope of 1/100 (else #f)))) (if (and delayval (< count 5)) (begin (thread-sleep! delayval) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -29,18 +29,21 @@ (import (prefix sqlite3 sqlite3:)) (declare (unit launch)) (declare (uses subrun)) (declare (uses common)) +(declare (uses commonmod)) (declare (uses configf)) (declare (uses db)) (declare (uses ezsteps)) (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") (include "megatest-fossil-hash.scm") + +(import commonmod) ;;====================================================================== ;; ezsteps ;;====================================================================== @@ -205,11 +208,11 @@ ) )))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) - (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30"))) + (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "60"))) (start-seconds (current-seconds)) (calc-minutes (lambda () (inexact->exact (round (- @@ -239,13 +242,13 @@ (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) (do-sync (or new-cpu-load new-disk-free over-time)) - (test-info (rmt:get-test-info-by-id run-id test-id)) - (state (db:test-get-state test-info)) - (status (db:test-get-status test-info)) + (test-info (rmt:get-test-state-status-by-id run-id test-id)) + (state (car test-info));; (db:test-get-state test-info)) + (status (cdr test-info));; (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) @@ -259,11 +262,12 @@ (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) - (launch:handle-zombie-tests run-id) + (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty + (launch:handle-zombie-tests run-id)) (when do-sync ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) @@ -765,20 +769,28 @@ ;; new ;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup ;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED ;; 0 RUNNING ==> this is actually the first condition, should not get here - +(define *last-rollup* 0) (define (launch:end-of-run-check run-id ) (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id)) - (running-cnt (rmt:get-count-tests-running-for-run-id run-id)) + (running-cnt (rmt:get-count-tests-running-for-run-id run-id)) (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id))) - (current-state (rmt:get-run-state run-id)) - (current-status (rmt:get-run-status run-id))) - ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing - (debug:print 0 *default-log-port* "Running test cnt :" running-cnt) - (rmt:set-state-status-and-roll-up-run run-id current-state current-status) + (current-state-status (rmt:get-run-state-status run-id)) + (current-state (car current-state-status)) ;; (rmt:get-run-state run-id)) + (current-status (cdr current-state-status))) ;; (rmt:get-run-status run-id))) + ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing + (debug:print 0 *default-log-port* "Running test cnt :" running-cnt) + ;; + ;; TODO: add a final rollup when run is done (if there isn't one already) + ;; + (if (or (< running-cnt 3) ;; have only few running + (> (- (current-seconds) *last-rollup*) 10)) ;; or haven't rolled up in past ten seconds + (begin + (rmt:set-state-status-and-roll-up-run run-id current-state current-status) + (set! *last-rollup* (current-seconds)))) (runs:update-junit-test-reporter-xml run-id) (cond ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" )) (if (and (equal? (rmt:get-var (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id))) (begin Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -537,20 +537,21 @@ (define (rmt:get-test-id run-id testname item-path) (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) -;; run-id is NOT used -;; (define (rmt:get-test-info-by-id run-id test-id) (if (number? test-id) (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)) (begin (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) (print-call-chain (current-error-port)) #f))) +(define (rmt:get-test-state-status-by-id run-id test-id) + (rmt:send-receive 'get-test-state-status-by-id run-id (list run-id test-id))) + (define (rmt:test-get-rundir-from-test-id run-id test-id) (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id))) (define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f)) (assert (number? run-id) "FATAL: Run id required.") @@ -799,10 +800,13 @@ (define (rmt:get-run-state run-id) (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'get-run-state #f (list run-id))) +(define (rmt:get-run-state-status run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-run-state-status #f (list run-id))) (define (rmt:set-run-status run-id run-status #!key (msg #f)) (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1863,11 +1863,12 @@ (newtestname (db:test-make-full-name hed my-item-path))) ;; test names are unique on testname/item-path (tests:testqueue-set-items! new-test-record #f) (tests:testqueue-set-itemdat! new-test-record my-itemdat) (tests:testqueue-set-item_path! new-test-record my-item-path) (hash-table-set! test-records newtestname new-test-record) - (set! tal (append tal (list newtestname))))) ;; since these are itemized create new test names testname/itempath + ;; BUG: This next line sucks up a lot of horsepower + (set! tal (append tal (list newtestname))))) ;; since these are itemized create new test names testname/itempath items-in-testpatt))) ;; At this point we have possibly added items to tal but all must be handed off to Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1966,13 +1966,13 @@ ;;====================================================================== ;; teststep-set-status! used to be here (define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat) - (let* ((testdat (rmt:get-test-info-by-id run-id test-id))) + (let* ((testdat (rmt:get-test-state-status-by-id run-id test-id))) (and testdat - (equal? (test:get-state testdat) "KILLREQ")))) + (equal? (car testdat) "KILLREQ")))) (define (test:tdb-get-rundat-count tdb) (if tdb (let ((res 0)) (sqlite3:for-each-row