Index: TODO ================================================================== --- TODO +++ TODO @@ -17,14 +17,15 @@ TODO ==== WW38 -. Add test_rundat to no-sync +. Add test_rundat to no-sync ==> correction, put in /.meta/test-run.dat +. Add STATE/STATUS transitions to .meta/test-run.dat or similar . Swizzle update-test-rundat to operate on no-sync . Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync -. On test completion copy some of the data from no-sync to test_rundat +. On state/status change update tests table with duration WW15 . fill newview matrix with data, filter pipeline gui elements . improve [script], especially indent handling Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -3467,11 +3467,11 @@ (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs))) run-ids))) -;; Get test data using test_id, run-id is not used +;; Get test data using test_id, run-id is not used - but it will be! ;; (define (db:get-test-info-by-id dbstruct run-id test-id) (db:with-db dbstruct #f ;; run-id Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -205,17 +205,19 @@ (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) - (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) + (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) - (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) + ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - + ;; top of loop encountered at "(current-seconds)" with + ;; last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load @@ -233,33 +235,28 @@ (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) - (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) + #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) - (when do-sync - ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) - ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) - (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) - (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))) - + (if do-sync ;; save meta data about the running of this test + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this @@ -312,11 +309,11 @@ (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) - (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional + (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) @@ -465,10 +462,13 @@ (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. + ;; ((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun + ;; (and (equal? (db:test-get-state test-info) "COMPLETED") ;; completed/abort => rerun if asked + ;; (member (db:test-get-status test-info) '("ABORT")))) ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -22,10 +22,12 @@ (declare (unit rmt)) (declare (uses api)) (declare (uses http-transport)) (include "common_records.scm") +(include "db_records.scm") + ;; (declare (uses rmtmod)) ;; (import rmtmod) ;; @@ -525,15 +527,24 @@ (rmt:general-call 'register-test run-id run-id test-name item-path)) (define (rmt:get-test-id run-id testname item-path) (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) -;; run-id is NOT used +;; run-id is NOT used - but it will be! ;; (define (rmt:get-test-info-by-id run-id test-id) (if (number? test-id) - (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)) + (let* ((testdat (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))) + (trundatf (conc (db:test-get-rundir testdat) "/.mt_data/test-run.dat"))) + ;; now we can update a couple fields from the filesystem + (if (and (db:test-get-rundir testdat) + (file-exists? trundatf)) + (let* ((duration (db:test-get-run_duration testdat)) + (event-time (db:test-get-event_time testdat)) + (last-touch (file-modification-time trundatf))) + (db:test-set-run_duration! testdat (max duration (- last-touch event-time))))) + testdat) (begin (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) (print-call-chain (current-error-port)) #f))) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1944,28 +1944,44 @@ tdb "SELECT count(id) FROM test_rundat;") res)) 0) -(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname) - (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1)) - (if (and cpuload diskfree) - (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id)) - (if minutes - (rmt:general-call 'update-run-duration run-id minutes test-id)) - (if (and uname hostname) - (rmt:general-call 'update-uname-host run-id uname hostname test-id))) +;; +(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)) + (if (get-environment-variable "MT_TEST_RUN_DIR") + (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data")) + (or-dash (lambda (instr)(if instr instr "-")))) + (if (not (directory-exists? dest-dir))(create-directory dest-dir #t)) + (let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append))) + (with-output-to-port outp + (lambda () + (print (current-seconds) " " (or-dash run-id) " " (or-dash test-id) " " + (or-dash cpuload) " " (or-dash diskfree) " " + (or-dash minutes) " " (or-dash hostname) " " + (or-dash uname)))) ;; put uname last as it has spaces in it + (close-output-port outp))) + (begin + (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1)))) + (if update-db + (begin + (if (and cpuload diskfree) + (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id)) + (if minutes + (rmt:general-call 'update-run-duration run-id minutes test-id)) + (if (and uname hostname) + (rmt:general-call 'update-uname-host run-id uname hostname test-id))))) ;; This one is for running with no db access (i.e. via rmt: internally) -(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries) +(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f)) ;; (define (tests:set-full-meta-info test-id run-id minutes work-area) ;; (let ((remtries 10)) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) (uname (get-uname "-srvpio")) (hostname (get-host-name))) - (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname))) + (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db))) ;; (define (tests:set-partial-meta-info test-id run-id minutes work-area) #;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) Index: ulex/ulex.scm ================================================================== --- ulex/ulex.scm +++ ulex/ulex.scm @@ -336,11 +336,12 @@ (+ (current-seconds) lease) dbfname) #t) #f)) (#f (sqlite3:execute db "INSERT INTO dbs (dbname,dbfile,dbtype,host_port,lease_thru) VALUES (?,?,?,?,?)" - "captain" dbfname "captain" host-port (+ (current-seconds) lease))) + "captain" dbfname "captain" host-port (+ (current-seconds) lease)) + #t) (else (print "ERROR: Unrecognised result from fold-row") (exit 1))))))) ;;====================================================================== ;; network utilities