Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -758,22 +758,23 @@ (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num))))))) ;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage (define (common:watchdog) (debug:print-info 13 *default-log-port* "common:watchdog entered.") - (if (common:on-homehost?) - (let ((dbstruct (db:setup #t))) - (debug:print-info 13 *default-log-port* "after db:setup with dbstruct="dbstruct) - (cond - ((dbr:dbstruct-read-only dbstruct) - (debug:print-info 13 *default-log-port* "loading read-only watchdog") - (common:readonly-watchdog dbstruct)) - (else - (debug:print-info 13 *default-log-port* "loading writable-watchdog.") - (common:writable-watchdog dbstruct))) - (debug:print-info 13 *default-log-port* "watchdog done.")) - (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))) + (if (launch:setup) + (if (common:on-homehost?) + (let ((dbstruct (db:setup #t))) + (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct) + (cond + ((dbr:dbstruct-read-only dbstruct) + (debug:print-info 13 *default-log-port* "loading read-only watchdog") + (common:readonly-watchdog dbstruct)) + (else + (debug:print-info 13 *default-log-port* "loading writable-watchdog.") + (common:writable-watchdog dbstruct))) + (debug:print-info 13 *default-log-port* "watchdog done.")) + (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost")))) (define (std-exit-procedure) (on-exit (lambda () 0)) ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) @@ -1756,11 +1757,12 @@ (let* ((key (car keyval)) (val (cdr keyval)) (delim (if (string-search whitesp val) "\"" ""))) - (print (if (member key ignorevars) + (print (if (or (member key ignorevars) + (string-search whitesp key)) "# setenv " "setenv ") key " " delim (mungeval val) delim))) envvars))) (with-output-to-file (conc fname ".sh") @@ -1770,10 +1772,11 @@ (val (cdr keyval)) (delim (if (string-search whitesp val) "\"" ""))) (print (if (or (member key ignorevars) + (string-search whitesp key) (string-search ":" key)) ;; internal only values to be skipped. "# export " "export ") key "=" delim (mungeval val) delim))) envvars))))) @@ -1787,11 +1790,11 @@ (let* ((var (car p)) (val (cadr p)) (prv (get-environment-variable var))) (set! res (cons (list var prv) res)) (if val - (setenv var (->string val)) + (safe-setenv var (->string val)) (unsetenv var)))) lst) res) '())) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -937,71 +937,71 @@ ;; 'closeall - close all opened dbs ;; 'schema - attempt to apply schema changes ;; run-ids: '(1 2 3 ...) or #f (for all) ;; (define (db:multi-db-sync dbstruct . options) - (if (not (launch:setup)) - (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.") - (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) - (tmpdb (db:get-db dbstruct)) - (refndb (dbr:dbstruct-refndb dbstruct)) - (allow-cleanup #t) ;; (if run-ids #f #t)) - (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat))) - (data-synced 0)) ;; count of changed records (I hope) - - (for-each - (lambda (option) - - (case option - ;; kill servers - ((killservers) - (for-each - (lambda (server) - (match-let (((mod-time host port start-time pid) server)) - (if (and host pid) - (tasks:kill-server host pid)))) - servers)) - - ;; clear out junk records - ;; - ((dejunk) - (db:delay-if-busy mtdb) ;; ok to delay on mtdb - (db:clean-up mtdb) - (db:clean-up tmpdb) - (db:clean-up refndb)) - - ;; sync runs, test_meta etc. - ;; - ((old2new) - (set! data-synced - (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb tmpdb refndb) - data-synced))) - - ;; now ensure all newdb data are synced to megatest.db - ;; do not use the run-ids list passed in to the function - ;; - ((new2old) - (set! data-synced - (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb) - data-synced))) - - ((adj-target) - (db:adj-target (db:dbdat-get-db mtdb)) - (db:adj-target (db:dbdat-get-db tmpdb)) - (db:adj-target (db:dbdat-get-db refndb))) - - ((schema) - (db:patch-schema-maindb (db:dbdat-get-db mtdb)) - (db:patch-schema-maindb (db:dbdat-get-db tmpdb)) - (db:patch-schema-maindb (db:dbdat-get-db refndb)) - (db:patch-schema-rundb (db:dbdat-get-db mtdb)) - (db:patch-schema-rundb (db:dbdat-get-db tmpdb)) - (db:patch-schema-rundb (db:dbdat-get-db refndb)))) - - (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb)) - options) - data-synced))) + ;; (if (not (launch:setup)) + ;; (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.") + (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) + (tmpdb (db:get-db dbstruct)) + (refndb (dbr:dbstruct-refndb dbstruct)) + (allow-cleanup #t) ;; (if run-ids #f #t)) + (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat))) + (data-synced 0)) ;; count of changed records (I hope) + + (for-each + (lambda (option) + + (case option + ;; kill servers + ((killservers) + (for-each + (lambda (server) + (match-let (((mod-time host port start-time pid) server)) + (if (and host pid) + (tasks:kill-server host pid)))) + servers)) + + ;; clear out junk records + ;; + ((dejunk) + (db:delay-if-busy mtdb) ;; ok to delay on mtdb + (db:clean-up mtdb) + (db:clean-up tmpdb) + (db:clean-up refndb)) + + ;; sync runs, test_meta etc. + ;; + ((old2new) + (set! data-synced + (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb tmpdb refndb) + data-synced))) + + ;; now ensure all newdb data are synced to megatest.db + ;; do not use the run-ids list passed in to the function + ;; + ((new2old) + (set! data-synced + (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb) + data-synced))) + + ((adj-target) + (db:adj-target (db:dbdat-get-db mtdb)) + (db:adj-target (db:dbdat-get-db tmpdb)) + (db:adj-target (db:dbdat-get-db refndb))) + + ((schema) + (db:patch-schema-maindb (db:dbdat-get-db mtdb)) + (db:patch-schema-maindb (db:dbdat-get-db tmpdb)) + (db:patch-schema-maindb (db:dbdat-get-db refndb)) + (db:patch-schema-rundb (db:dbdat-get-db mtdb)) + (db:patch-schema-rundb (db:dbdat-get-db tmpdb)) + (db:patch-schema-rundb (db:dbdat-get-db refndb)))) + + (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb)) + options) + data-synced)) ;; keeping it around for debugging purposes only (define (open-run-close-no-exception-handling proc idb . params) (debug:print-info 11 *default-log-port* "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params) (print "I don't work anymore. open-run-close-no-exception-handling needs fixing or removing...") @@ -3259,11 +3259,13 @@ (test-name (if (number? test-name) (db:test-get-testname testdat) test-name)) (item-path (db:test-get-item-path testdat)) (tl-testdat (db:get-test-info dbstruct run-id test-name "")) - (tl-test-id (db:test-get-id tl-testdat))) + (tl-test-id (if tl-testdat + (db:test-get-id tl-testdat) + #f))) (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) (db:general-call dbstruct 'set-test-start-time (list test-id))) (mutex-lock! *db-transaction-mutex*) (db:with-db dbstruct #f #f @@ -3325,11 +3327,12 @@ "STARTED" (car all-curr-statuses)))) ;; (print "bad-not-supported: " bad-not-support " all-curr-states: " all-curr-states " all-curr-statuses: " all-curr-states) ;; " newstate: " newstate " newstatus: " newstatus) ;; NB// Pass the db so it is part of the transaction - (db:test-set-state-status db run-id tl-test-id newstate newstatus #f))))))) + (if tl-test-id + (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)))))))) (mutex-unlock! *db-transaction-mutex*) (if (and test-id state status (equal? status "AUTO")) (db:test-data-rollup dbstruct run-id test-id status)) tr-res))))) ;; BBnote: db:get-all-state-status-counts-for-test returns dbr:counts object aggregating state and status of items of a given test, *not including rollup state/status* Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -314,11 +314,12 @@ (loop (car tal) (cdr tal) stepname)) (debug:print 4 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping"))) (debug:print 4 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep))))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) - (let* ((start-seconds (current-seconds)) + (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30"))) + (start-seconds (current-seconds)) (calc-minutes (lambda () (inexact->exact (round (- (current-seconds) @@ -327,30 +328,38 @@ ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) - (disk-free (get-df (current-directory)))) - (let ((new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) - (delta (abs (- load cpu-load)))) - (if (> delta 0.1) ;; don't bother updating with small changes - load - #f))) - (new-disk-free (let* ((df (get-df (current-directory))) - (delta (abs (- df disk-free)))) - (if (> delta 200) ;; ignore changes under 200 Meg - df - #f)))) + (disk-free (get-df (current-directory))) + (last-sync (current-seconds))) + (let* ((over-time (> (current-seconds) (+ last-sync update-period))) + (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) + (delta (abs (- load cpu-load)))) + (if (> delta 0.1) ;; don't bother updating with small changes + load + #f))) + (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds + (get-df (current-directory)) + disk-free)) + (delta (abs (- df disk-free)))) + (if (and (> df 0) + (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg + df + #f))) + (do-sync (or new-cpu-load new-disk-free over-time))) + (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)) (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) (time-exceeded (> run-seconds runtlim))) (if time-exceeded (begin (debug:print-info 0 *default-log-port* "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim) #t) #f))))) - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) + (if do-sync + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second @@ -394,11 +403,14 @@ (exit))) (if (hash-table-ref/default misc-flags 'keep-going #f) (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta - (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free))))))) + (loop (calc-minutes) + (or new-cpu-load cpu-load) + (or new-disk-free disk-free) + (if do-sync (current-seconds) last-sync))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) @@ -842,13 +854,18 @@ (toppath (or *toppath* areapath (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath (target (common:args-get-target)) (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config (cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) - (mtcachef (car cachefiles)) ;; (and cachedir (conc cachedir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) - (rccachef (cdr cachefiles)) ;; (and cachedir (conc cachedir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) - ) ;; (cancreate (and cachedir (common:file-exists? cachedir)(file-write-access? cachedir) (not (common:in-running-test?))))) + ;; checking for null cachefiles should not be necessary, I was seeing error car of '(), might be a chicken bug or a red herring ... + (mtcachef (if (null? cachefiles) + #f + (car cachefiles))) ;; (and cachedir (conc cachedir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) + (rccachef (if (null? cachefiles) + #f + (cdr cachefiles)))) ;; (and cachedir (conc cachedir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) + ;; (cancreate (and cachedir (common:file-exists? cachedir)(file-write-access? cachedir) (not (common:in-running-test?))))) (set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource ;;(BB> "launch:setup-body -- cachefiles="cachefiles) (cond ;; if mtcachef exists just read it, however we need to assume toppath is available in $MT_RUN_AREA_HOME ((and (not force-reread) mtcachef (common:file-exists? mtcachef) (get-environment-variable "MT_RUN_AREA_HOME") use-cache) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -387,11 +387,19 @@ ;; The watchdog is to keep an eye on things like db sync etc. ;; ;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage -(define *watchdog* (make-thread common:watchdog "Watchdog thread")) +(define *watchdog* (make-thread + (lambda () + (handle-exceptions + exn + (begin + (print-call-chain) + (print " message: " ((condition-property-accessor 'exn 'message) exn))) + (common:watchdog))) + "Watchdog thread")) ;;(if (not (args:get-arg "-server")) ;; (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog (let* ((no-watchdog-args '("-list-runs" Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -513,11 +513,17 @@ (let ((reglen (configf:lookup *configdat* "setup" "runqueue"))) (if (> (length (hash-table-keys test-records)) 0) (let* ((keep-going #t) (run-queue-retries 5) (th1 (make-thread (lambda () - (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry)) + (handle-exceptions + exn + (begin + (print-call-chain) + (print " message: " ((condition-property-accessor 'exn 'message) exn))) + (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests + (any->number reglen) all-tests-registry))) ;; (handle-exceptions ;; exn ;; (begin ;; (print-call-chain (current-error-port)) ;; (debug:print-error 0 *default-log-port* "failure in runs:run-tests-queue thread, error: " ((condition-property-accessor 'exn 'message) exn)) Index: tests/tests.scm ================================================================== --- tests/tests.scm +++ tests/tests.scm @@ -17,10 +17,26 @@ (import srfi-18) ;; (require-extension zmq) ;; (import zmq) (define test-work-dir (current-directory)) + +;; given list of lists +;; ( ( msg expected param1 param2 ...) +;; ( ... ) ) +;; apply test to all +;; +(define (test-batch proc pname inlst #!key (post-proc #f)) + (for-each + (lambda (spec) + (let ((msg (conc pname " " (car spec))) + (result (cadr spec)) + (params (cddr spec))) + (if post-proc + (test msg result (post-proc (apply proc params))) + (test msg result (apply proc params))))) + inlst)) ;; read in all the _record files (let ((files (glob "*_records.scm"))) (for-each (lambda (file) Index: tests/unittests/all-rmt.scm ================================================================== --- tests/unittests/all-rmt.scm +++ tests/unittests/all-rmt.scm @@ -28,13 +28,25 @@ ;; DEF (rmt:kill-server run-id) ;; DEF (rmt:start-server run-id) (test #f '(#t "successful login")(rmt:login #f)) ;; DEF (rmt:login-no-auto-client-setup connection-info) (test #f #t (pair? (rmt:get-latest-host-load (get-host-name)))) + +;; get-latest-host-load does a lookup in the db, it won't return a useful value unless +;; a test ran recently on host +(test-batch rmt:get-latest-host-load + "rmt:get-latest-host-load" + (list (list "localhost" #t (get-host-name)) + (list "not-a-host" #t "not-a-host" )) + post-proc: pair?) + (test #f #t (list? (rmt:get-changed-record-ids 0))) + (test #f #f (begin (runs:update-all-test_meta #f) #f)) + (test #f '("test1" "test2")(sort (alist-ref "tagtwo" (hash-table->alist (rmt:get-tests-tags)) equal?) string<=)) + (test #f '() (rmt:get-key-val-pairs 0)) (test #f '("SYSTEM" "RELEASE") (rmt:get-keys)) (test #f '("SYSTEM" "RELEASE") (rmt:get-keys-write)) ;; dummy query to force server start (test #f '() (rmt:get-key-vals 1)) (test #f (vector '("SYSTEM" "RELEASE") '()) (rmt:get-targets)) @@ -82,11 +94,27 @@ (test #f '()(rmt:get-prev-run-ids 1)) (test #f #t (begin (rmt:lock/unlock-run 1 #t #f "mikey") #t)) (test #f "JUSTFINE" (rmt:get-run-status 1)) (test #f #t (begin (rmt:set-run-status 1 "NOTFINE" msg: "A message") #t)) (test #f #t (begin (rmt:update-run-event_time 1) #t)) + ;; (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update) ;; fields of #f uses default +;; +(let ((keys (rmt:get-keys)) + (rnp "%") ;; run name patt + (tpt "%/%")) ;; target patt + (test-batch rmt:get-runs-by-patt + "rmt:get-runs-by-patt" + (list (list "t=0" #t keys rnp tpt #f #f #f 0) + (list "t=current" #f keys rnp tpt #f #f #f (+ 100 (current-seconds))) ;; should be no records from the future + ) + post-proc: (lambda (res) + ;; (print "rmt:get-runs-by-patt returned: " res) + (and (vector? res) + (let ((rows (vector-ref res 1))) + (> (length rows) 0)))))) + ;; (rmt:find-and-mark-incomplete run-id ovr-deadtime) ;; (rmt:get-main-run-stats run-id) ;; (rmt:get-var varname) ;; (rmt:set-var varname value) ;; (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f))