Index: mtmod.scm ================================================================== --- mtmod.scm +++ mtmod.scm @@ -56,4242 +56,9 @@ (prefix mtargs args:)) (include "run_records.scm") (include "db_records.scm") (include "test_records.scm") - -;; This is the Megatest API. All generally "useful" routines will be wrapped or extended -;; here. - -;; 0 1 2 3 -(defstruct launch:einf (pid #t)(exit-status #t)(exit-code #t)(rollup-status 0)) - -;;====================================================================== -;; R U N S -;;====================================================================== - -;; runs:get-runs-by-patt -;; get runs by list of criteria -;; register a test run with the db -;; -;; Use: (db-get-value-by-header (db:get-header runinfo)(db:get-rows runinfo)) -;; to extract info from the structure returned -;; -(define (mt:get-runs-by-patt keys runnamepatt targpatt) - (let loop ((runsdat (rmt:get-runs-by-patt keys runnamepatt targpatt 0 500 #f 0)) - (res '()) - (offset 0) - (limit 500)) - ;; (print "runsdat: " runsdat) - (let* ((header (vector-ref runsdat 0)) - (runslst (vector-ref runsdat 1)) - (full-list (append res runslst)) - (have-more (eq? (length runslst) limit))) - ;; (debug:print 0 *default-log-port* "header: " header " runslst: " runslst " have-more: " have-more) - (if have-more - (let ((new-offset (+ offset limit)) - (next-batch (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit #f 0))) - (debug:print-info 4 *default-log-port* "More than " limit " runs, have " (length full-list) " runs so far.") - (debug:print-info 0 *default-log-port* "next-batch: " next-batch) - (loop next-batch - full-list - new-offset - limit)) - (vector header full-list))))) - -;;====================================================================== -;; T E S T S -;;====================================================================== - -(define (mt:get-tests-for-run run-id testpatt states status #!key (not-in #t) (sort-by 'event_time) (sort-order "ASC") (qryvals #f)(last-update #f)) - (let loop ((testsdat (rmt:get-tests-for-run run-id testpatt states status 0 500 not-in sort-by sort-order qryvals last-update 'normal)) - (res '()) - (offset 0) - (limit 500)) - (let* ((full-list (append res testsdat)) - (have-more (eq? (length testsdat) limit))) - (if have-more - (let ((new-offset (+ offset limit))) - (debug:print-info 4 *default-log-port* "More than " limit " tests, have " (length full-list) " tests so far.") - (loop (rmt:get-tests-for-run run-id testpatt states status new-offset limit not-in sort-by sort-order qryvals last-update 'normal) - full-list - new-offset - limit)) - full-list)))) - -(define (mt:lazy-get-prereqs-not-met run-id waitons ref-item-path #!key (mode '(normal))(itemmaps #f) ) - (let* ((key (list run-id waitons ref-item-path mode)) - (res (hash-table-ref/default *pre-reqs-met-cache* key #f)) - (useres (let ((last-time (if (vector? res) (vector-ref res 0) #f))) - (if last-time - (< (current-seconds)(+ last-time 5)) - #f)))) - (if useres - (let ((result (vector-ref res 1))) - (debug:print 4 *default-log-port* "Using lazy value res: " result) - result) - (let ((newres (rmt:get-prereqs-not-met run-id waitons ref-item-path mode: mode itemmaps: itemmaps))) - (hash-table-set! *pre-reqs-met-cache* key (vector (current-seconds) newres)) - newres)))) - -(define (mt:get-run-stats dbstruct run-id) -;; Get run stats from local access, move this ... but where? - (db:get-run-stats dbstruct run-id)) - -(define (mt:discard-blocked-tests run-id failed-test tests test-records) - (if (null? tests) - tests - (begin - (debug:print-info 1 *default-log-port* "Discarding tests from " tests " that are waiting on " failed-test) - (let loop ((testn (car tests)) - (remt (cdr tests)) - (res '())) - (let* ((test-dat (hash-table-ref/default test-records testn (vector #f #f '()))) - (waitons (vector-ref test-dat 2))) - ;; (print "mt:discard-blocked-tests run-id: " run-id " failed-test: " failed-test " testn: " testn " with waitons: " waitons) - (if (null? remt) - (let ((new-res (reverse res))) - ;; (print " new-res: " new-res) - new-res) - (loop (car remt) - (cdr remt) - (if (member failed-test waitons) - (begin - (debug:print 0 *default-log-port* "Discarding test " testn "(" test-dat ") due to " failed-test) - res) - (cons testn res))))))))) - -;;====================================================================== -;; S T A T E A N D S T A T U S F O R T E S T S -;;====================================================================== - -;; speed up for common cases with a little logic -(define (mt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment) - (if (not (and run-id test-id)) - (begin - (debug:print-error 0 *default-log-port* "bad data handed to mt:test-set-state-status-by-id, run-id=" run-id ", test-id=" test-id ", newstate=" newstate) - (print-call-chain (current-error-port)) - #f) - (begin - ;; cond - ;; ((and newstate newstatus newcomment) - ;; (rmt:general-call 'state-status-msg run-id newstate newstatus newcomment test-id)) - ;; ((and newstate newstatus) - ;; (rmt:general-call 'state-status run-id newstate newstatus test-id)) - ;; (else - ;; (if newstate (rmt:general-call 'set-test-state run-id newstate test-id)) - ;; (if newstatus (rmt:general-call 'set-test-status run-id newstatus test-id)) - ;; (if newcomment (rmt:general-call 'set-test-comment run-id newcomment test-id)))) - (rmt:set-state-status-and-roll-up-items run-id test-id #f newstate newstatus newcomment) - ;; (mt:process-triggers run-id test-id newstate newstatus) - #t))) - - -(define (mt:test-set-state-status-by-id-unless-completed run-id test-id newstate newstatus newcomment) - (let* ((test-vec (rmt:get-testinfo-state-status run-id test-id)) - (state (vector-ref test-vec 3))) - (if (equal? state "COMPLETED") - #t - (rmt:set-state-status-and-roll-up-items run-id test-id #f newstate newstatus newcomment)))) - - -(define (mt:test-set-state-status-by-testname run-id test-name item-path new-state new-status new-comment) - ;(let ((test-id (rmt:get-test-id run-id test-name item-path))) - (rmt:set-state-status-and-roll-up-items run-id test-name item-path new-state new-status new-comment) - ;; (mt:process-triggers run-id test-id new-state new-status) - #t);) - ;;(mt:test-set-state-status-by-id run-id test-id new-state new-status new-comment))) - -(define (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path new-state new-status new-comment) - (let ((test-id (rmt:get-test-id run-id test-name item-path))) - (mt:test-set-state-status-by-id-unless-completed run-id test-id new-state new-status new-comment))) - -;; kill any runner processes (i.e. processes handling -runtests) that match target/runname -;; -;; do a remote call to get the task queue info but do the killing as self here. -;; -(define (tasks:kill-runner target run-name testpatt) - (let ((records (rmt:tasks-find-task-queue-records target run-name testpatt "running" "run-tests")) - (hostpid-rx (regexp "\\s+(\\w+)\\s+(\\d+)$"))) ;; host pid is at end of param string - (if (null? records) - (debug:print 0 *default-log-port* "No run launching processes found for " target " / " run-name " with testpatt " (or testpatt "* no testpatt specified! *")) - (debug:print 0 *default-log-port* "Found " (length records) " run(s) to kill.")) - (for-each - (lambda (record) - (let* ((param-key (list-ref record 8)) - (match-dat (string-search hostpid-rx param-key))) - (if match-dat - (let ((hostname (cadr match-dat)) - (pid (string->number (caddr match-dat)))) - (debug:print 0 *default-log-port* "Sending SIGINT to process " pid " on host " hostname) - (if (equal? (get-host-name) hostname) - (if (process:alive? pid) - (begin - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "Kill of process " pid " on host " hostname " failed.") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - #t) - (process-signal pid signal/int) - (thread-sleep! 5) - (if (process:alive? pid) - (process-signal pid signal/kill))))) - ;; (call-with-environment-variables - (let ((old-targethost (getenv "TARGETHOST"))) - (setenv "TARGETHOST" hostname) - (setenv "TARGETHOST_LOGF" "server-kills.log") - (system (conc "nbfake kill " pid)) - (if old-targethost (setenv "TARGETHOST" old-targethost)) - (unsetenv "TARGETHOST") - (unsetenv "TARGETHOST_LOGF")))) - (debug:print-error 0 *default-log-port* "no record or improper record for " target "/" run-name " in tasks_queue in main.db")))) - records))) - -(define (task:get-run-times) - (let* ( - (run-patt (if (args:get-arg "-run-patt") - (args:get-arg "-run-patt") - "%")) - (target-patt (if (args:get-arg "-target-patt") - (args:get-arg "-target-patt") - "%")) - - (run-times (rmt:get-run-times run-patt target-patt ))) - (if (eq? (length run-times) 0) - (begin - (print "Data not found!!") - (exit))) - (if (equal? (args:get-arg "-dumpmode") "json") - (task:print-runtime-as-json run-times) - (if (equal? (args:get-arg "-dumpmode") "csv") - (task:print-runtime run-times ",") - (task:print-runtime run-times " "))))) - - (define (task:get-test-times) - (let* ((runname (if (args:get-arg "-runname") - (args:get-arg "-runname") - #f)) - (target (if (args:get-arg "-target") - (args:get-arg "-target") - #f)) - - (test-times (rmt:get-test-times runname target ))) - (if (not runname) - (begin - (print "Error: Missing argument -runname") - (exit))) - (if (string-contains runname "%") - (begin - (print "Error: Invalid runname, '%' not allowed (" runname ") ") - (exit))) - (if (not target) - (begin - (print "Error: Missing argument -target") - (exit))) - (if (string-contains target "%") - (begin - (print "Error: Invalid target, '%' not allowed (" target ") ") - (exit))) - - (if (eq? (length test-times) 0) - (begin - (print "Data not found!!") - (exit))) - (if (equal? (args:get-arg "-dumpmode") "json") - (task:print-testtime-as-json test-times) - (if (equal? (args:get-arg "-dumpmode") "csv") - (task:print-testtime test-times ",") - (task:print-testtime test-times " "))))) - - - -;; gets mtpg-run-id and syncs the record if different -;; -(define (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time) - (let* ((runs-ht (hash-table-ref cached-info 'runs)) - (runinf (hash-table-ref/default runs-ht run-id #f)) - (area-id (vector-ref area-info 0))) - (if runinf - runinf ;; already cached - (let* ((run-dat (rmt:get-run-info run-id)) ;; NOTE: get-run-info returns a vector < row header > - (run-name (rmt:get-run-name-from-id run-id)) - (row (db:get-rows run-dat)) ;; yes, this returns a single row - (header (db:get-header run-dat)) - (state (db:get-value-by-header row header "state")) - (status (db:get-value-by-header row header "status")) - (owner (db:get-value-by-header row header "owner")) - (event-time (db:get-value-by-header row header "event_time")) - (comment (db:get-value-by-header row header "comment")) - (fail-count (db:get-value-by-header row header "fail_count")) - (pass-count (db:get-value-by-header row header "pass_count")) - (db-contour (db:get-value-by-header row header "contour")) - (contour (if (args:get-arg "-prepend-contour") - (if (and db-contour (not (equal? db-contour "")) (string? db-contour )) - (begin - (debug:print-info 1 *default-log-port* "db-contour") - db-contour) - (args:get-arg "-contour")))) - (run-tag (if (args:get-arg "-run-tag") - (args:get-arg "-run-tag") - "")) - (last-update (db:get-value-by-header row header "last_update")) - (keytarg (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target")) - (conc "MT_CONTOUR/MT_AREA/" (string-intersperse (rmt:get-keys) "/")) (string-intersperse (rmt:get-keys) "/"))) ;; e.g. version/iteration/platform - (target (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target")) - (conc (or (args:get-arg "-prefix-target") (conc contour "/" (common:get-area-name) "/")) (rmt:get-target run-id)) (rmt:get-target run-id))) ;; e.g. v1.63/a3e1/ubuntu - (spec-id (pgdb:get-ttype dbh keytarg)) - (publish-time (if (args:get-arg "-cp-eventtime-to-publishtime") - event-time - (current-seconds))) - (new-run-id (pgdb:get-run-id dbh spec-id target run-name area-id))) - (if new-run-id - (begin ;; let ((run-record (pgdb:get-run-info dbh new-run-id)) - (hash-table-set! runs-ht run-id new-run-id) - ;; ensure key fields are up to date - ;; if last_update == pgdb_last_update do not update smallest-last-update-time - (let* ((pgdb-last-update (pgdb:get-run-last-update dbh new-run-id)) - (smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f))) - (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) - (hash-table-set! smallest-last-update-time "smallest-time" last-update))) - (pgdb:refresh-run-info - dbh - new-run-id - state status owner event-time comment fail-count pass-count area-id last-update publish-time) - (debug:print-info 0 *default-log-port* "Working on run-id " run-id " pgdb-id " new-run-id ) - (if (not (equal? run-tag "")) - (task:add-run-tag dbh new-run-id run-tag)) - new-run-id) - - (if (equal? state "deleted") - (begin - (debug:print-info 1 *default-log-port* "Warning: Run with id " run-id " was created after previous sync and deleted before the sync") #f) - (if (handle-exceptions - exn - (begin (print-call-chain) - (print ((condition-property-accessor 'exn 'message) exn)) - #f) - - (pgdb:insert-run - dbh - spec-id target run-name state status owner event-time comment fail-count pass-count area-id last-update publish-time)) - (let* ((smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f))) - (if (or (not smallest-time) (< last-update smallest-time)) - (hash-table-set! smallest-last-update-time "smallest-time" last-update)) - (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time)) - #f))))))) - -(define (task:add-run-tag dbh run-id tag) - (let* ((tag-info (pgdb:get-tag-info-by-name dbh tag))) - (if (not tag-info) - (begin - (if (handle-exceptions - exn - (begin - (debug:print-info 1 *default-log-port* ((condition-property-accessor 'exn 'message) exn)) - #f) - (pgdb:insert-tag dbh tag)) - (set! tag-info (pgdb:get-tag-info-by-name dbh tag)) - #f))) - ;;add to area_tags - (handle-exceptions - exn - (begin - (debug:print-info 1 *default-log-port* ((condition-property-accessor 'exn 'message) exn)) - #f) - (if (not (pgdb:is-run-taged-with-a-tag dbh (vector-ref tag-info 0) run-id)) - (pgdb:insert-run-tag dbh (vector-ref tag-info 0) run-id))))) - - -(define (tasks:sync-test-steps dbh cached-info test-step-ids smallest-last-update-time) - ; (print "Sync Steps " test-step-ids ) - (let ((test-ht (hash-table-ref cached-info 'tests)) - (step-ht (hash-table-ref cached-info 'steps))) - (for-each - (lambda (test-step-id) - (let* ((test-step-info (rmt:get-steps-info-by-id test-step-id)) - (step-id (tdb:step-get-id test-step-info)) - (test-id (tdb:step-get-test_id test-step-info)) - (stepname (tdb:step-get-stepname test-step-info)) - (state (tdb:step-get-state test-step-info)) - (status (tdb:step-get-status test-step-info)) - (event_time (tdb:step-get-event_time test-step-info)) - (comment (tdb:step-get-comment test-step-info)) - (logfile (tdb:step-get-logfile test-step-info)) - (last-update (tdb:step-get-last_update test-step-info)) - (pgdb-test-id (hash-table-ref/default test-ht test-id #f)) - (smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f)) - (pgdb-step-id (if pgdb-test-id - (pgdb:get-test-step-id dbh pgdb-test-id stepname state) - #f))) - (if step-id - (begin - (if pgdb-test-id - (begin - (if pgdb-step-id - (begin - (debug:print-info 1 *default-log-port* "Updating existing test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id " pgdb step id " pgdb-step-id ) - (let* ((pgdb-last-update (pgdb:get-test-step-last-update dbh pgdb-step-id))) - (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) - (hash-table-set! smallest-last-update-time "smallest-time" last-update))) - (pgdb:update-test-step dbh pgdb-step-id pgdb-test-id stepname state status event_time comment logfile last-update)) - (begin - (debug:print-info 1 *default-log-port* "Inserting test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id) - (if (or (not smallest-time) (< last-update smallest-time)) - (hash-table-set! smallest-last-update-time "smallest-time" last-update)) - (pgdb:insert-test-step dbh pgdb-test-id stepname state status event_time comment logfile last-update ) - (set! pgdb-step-id (pgdb:get-test-step-id dbh pgdb-test-id stepname state)))) - (hash-table-set! step-ht step-id pgdb-step-id )) - (debug:print-info 1 *default-log-port* "Error: Test not cashed"))) - (debug:print-info 1 *default-log-port* "Error: Could not get test step info for step id " test-step-id )))) ;; this is a wierd senario need to debug - test-step-ids))) - -(define (tasks:sync-test-gen-data dbh cached-info test-data-ids smallest-last-update-time) - (let ((test-ht (hash-table-ref cached-info 'tests)) - (data-ht (hash-table-ref cached-info 'data))) - (for-each - (lambda (test-data-id) - (let* ((test-data-info (rmt:get-data-info-by-id test-data-id)) - (data-id (db:test-data-get-id test-data-info)) - (test-id (db:test-data-get-test_id test-data-info)) - (category (db:test-data-get-category test-data-info)) - (variable (db:test-data-get-variable test-data-info)) - (value (db:test-data-get-value test-data-info)) - (expected (db:test-data-get-expected test-data-info)) - (tol (db:test-data-get-tol test-data-info)) - (units (db:test-data-get-units test-data-info)) - (comment (db:test-data-get-comment test-data-info)) - (status (db:test-data-get-status test-data-info)) - (type (db:test-data-get-type test-data-info)) - (last-update (db:test-data-get-last_update test-data-info)) - (smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f)) - - (pgdb-test-id (hash-table-ref/default test-ht test-id #f)) - (pgdb-data-id (if pgdb-test-id - (pgdb:get-test-data-id dbh pgdb-test-id category variable) - #f))) - (if data-id - (begin - (if pgdb-test-id - (begin - (if pgdb-data-id - (begin - (debug:print-info 1 *default-log-port* "Updating existing test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id " pgdb data id " pgdb-data-id) - (let* ((pgdb-last-update (pgdb:get-test-data-last-update dbh pgdb-data-id))) - (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) - (hash-table-set! smallest-last-update-time "smallest-time" last-update))) - (pgdb:update-test-data dbh pgdb-data-id pgdb-test-id category variable value expected tol units comment status type last-update)) - (begin - (debug:print-info 1 *default-log-port* "Inserting test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id) - (if (handle-exceptions - exn - (begin (print-call-chain) - (print ((condition-property-accessor 'exn 'message) exn)) - #f) - - (pgdb:insert-test-data dbh pgdb-test-id category variable value expected tol units comment status type last-update)) - ;(tasks:run-id->mtpg-run-id dbh cached-info run-id area-info) - (begin - ;(pgdb:insert-test-data dbh pgdb-test-id category variable value expected tol units comment status type ) - (if (or (not smallest-time) (< last-update smallest-time)) - (hash-table-set! smallest-last-update-time "smallest-time" last-update)) - (set! pgdb-data-id (pgdb:get-test-data-id dbh pgdb-test-id category variable))) - #f))) - (hash-table-set! data-ht data-id pgdb-data-id )) - (begin - (debug:print-info 1 *default-log-port* "Error: Test not in pgdb")))) - - (debug:print-info 1 *default-log-port* "Error: Could not get test data info for data id " test-data-id )))) ;; this is a wierd senario need to debug - test-data-ids))) - - - -(define (tasks:sync-tests-data dbh cached-info test-ids area-info smallest-last-update-time) - (let ((test-ht (hash-table-ref cached-info 'tests))) - (for-each - (lambda (test-id) - ; (print test-id) - (let* ((test-info (rmt:get-test-info-by-id #f test-id)) - (run-id (db:test-get-run_id test-info)) ;; look these up in db_records.scm - (test-id (db:test-get-id test-info)) - (test-name (db:test-get-testname test-info)) - (item-path (db:test-get-item-path test-info)) - (state (db:test-get-state test-info)) - (status (db:test-get-status test-info)) - (host (db:test-get-host test-info)) - (pid (db:test-get-process_id test-info)) - (cpuload (db:test-get-cpuload test-info)) - (diskfree (db:test-get-diskfree test-info)) - (uname (db:test-get-uname test-info)) - (run-dir (db:test-get-rundir test-info)) - (log-file (db:test-get-final_logf test-info)) - (run-duration (db:test-get-run_duration test-info)) - (comment (db:test-get-comment test-info)) - (event-time (db:test-get-event_time test-info)) - (archived (db:test-get-archived test-info)) - (last-update (db:test-get-last_update test-info)) - (pgdb-run-id (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time)) - (smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f)) - (pgdb-test-id (if pgdb-run-id - (begin - ;(print pgdb-run-id) - (pgdb:get-test-id dbh pgdb-run-id test-name item-path)) - #f))) - ;; "id" "run_id" "testname" "state" "status" "event_time" - ;; "host" "cpuload" "diskfree" "uname" "rundir" "item_path" - ;; "run_duration" "final_logf" "comment" "shortdir" "attemptnum" "archived" - (if pgdb-run-id - (begin - (if pgdb-test-id ;; have a record - (begin ;; let ((key-name (conc run-id "/" test-name "/" item-path))) - (debug:print-info 0 *default-log-port* "Updating existing test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id " pgdb-test-id " pgdb-test-id) - (let* ((pgdb-last-update (pgdb:get-test-last-update dbh pgdb-test-id))) - (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) ;;if last-update is same as pgdb-last-update then it is safe to assume the records are identical and we can use a larger last update time. - (hash-table-set! smallest-last-update-time "smallest-time" last-update))) - (pgdb:update-test dbh pgdb-test-id pgdb-run-id test-name item-path state status host cpuload diskfree uname run-dir log-file run-duration comment event-time archived last-update pid)) - (begin - (debug:print-info 0 *default-log-port* "Inserting test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id) - (pgdb:insert-test dbh pgdb-run-id test-name item-path state status host cpuload diskfree uname run-dir log-file run-duration comment event-time archived last-update pid) - (if (or (not smallest-time) (< last-update smallest-time)) - (hash-table-set! smallest-last-update-time "smallest-time" last-update)) - (set! pgdb-test-id (pgdb:get-test-id dbh pgdb-run-id test-name item-path)))) - (hash-table-set! test-ht test-id pgdb-test-id)) - (debug:print-info 1 *default-log-port* "WARNING: Skipping run with run-id:" run-id ". This run was created after privious sync and removed before this sync.")))) - test-ids))) - -(define (task:add-area-tag dbh area-info tag) - (let* ((tag-info (pgdb:get-tag-info-by-name dbh tag))) - (if (not tag-info) - (begin - (if (handle-exceptions - exn - (begin - (debug:print-info 1 *default-log-port* ((condition-property-accessor 'exn 'message) exn)) - #f) - (pgdb:insert-tag dbh tag)) - (set! tag-info (pgdb:get-tag-info-by-name dbh tag)) - #f))) - ;;add to area_tags - (handle-exceptions - exn - (begin - (debug:print-info 1 *default-log-port* ((condition-property-accessor 'exn 'message) exn)) - #f) - (if (not (pgdb:is-area-taged-with-a-tag dbh (vector-ref tag-info 0) (vector-ref area-info 0))) - (pgdb:insert-area-tag dbh (vector-ref tag-info 0) (vector-ref area-info 0)))))) - -(define (tasks:sync-run-data dbh cached-info run-ids area-info smallest-last-update-time) - (for-each - (lambda (run-id) - (debug:print-info 1 *default-log-port* "Check if run with " run-id " needs to be synced" ) - (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time)) -run-ids)) - - -;; get runs changed since last sync -;; (define (tasks:sync-test-data dbh cached-info area-info) -;; (let* (( - -(define (tasks:sync-to-postgres configdat dest) - (print "In sync") - (let* ((dbh (pgdb:open configdat dbname: dest)) - (area-info (pgdb:get-area-by-path dbh *toppath*)) - (cached-info (make-hash-table)) - (start (current-seconds)) - (test-patt (if (args:get-arg "-testpatt") - (args:get-arg "-testpatt") - "%")) - (target (if (args:get-arg "-target") - (args:get-arg "-target") - #f)) - (run-name (if (args:get-arg "-runname") - (args:get-arg "-runname") - #f))) - (if (and target (not run-name)) - (begin - (print "Error: Provide runname") - (exit 1))) - (if (and (not target) run-name) - (begin - (print "Error: Provide target") - (exit 1))) - ;(print "123") - ;(exit 1) - (for-each (lambda (dtype) - (hash-table-set! cached-info dtype (make-hash-table))) - '(runs targets tests steps data)) - (hash-table-set! cached-info 'start start) ;; when done we'll set sync times to this - (if area-info - (let* ((last-sync-time (vector-ref area-info 3)) - (smallest-last-update-time (make-hash-table)) - (changed (if (and target run-name) - (rmt:get-run-record-ids target run-name (rmt:get-keys) test-patt) - (rmt:get-changed-record-ids last-sync-time))) - (run-ids (alist-ref 'runs changed)) - (test-ids (alist-ref 'tests changed)) - (test-step-ids (alist-ref 'test_steps changed)) - (test-data-ids (alist-ref 'test_data changed)) - (run-stat-ids (alist-ref 'run_stats changed)) - (area-tag (if (args:get-arg "-area-tag") - (args:get-arg "-area-tag") - (if (args:get-arg "-area") - (args:get-arg "-area") - "")))) - (if (and (equal? area-tag "") (not (pgdb:is-area-taged dbh (vector-ref area-info 0)))) - (set! area-tag *default-area-tag*)) - (if (not (equal? area-tag "")) - (task:add-area-tag dbh area-info area-tag)) - (if (or (not (null? test-ids)) (not (null? run-ids))) - (begin - (debug:print-info 0 *default-log-port* "syncing runs") - (tasks:sync-run-data dbh cached-info run-ids area-info smallest-last-update-time) - (debug:print-info 0 *default-log-port* "syncing tests") - (tasks:sync-tests-data dbh cached-info test-ids area-info smallest-last-update-time) - (debug:print-info 0 *default-log-port* "syncing test steps") - (tasks:sync-test-steps dbh cached-info test-step-ids smallest-last-update-time) - (debug:print-info 0 *default-log-port* "syncing test data") - (tasks:sync-test-gen-data dbh cached-info test-data-ids smallest-last-update-time) - (print "----------done---------------"))) - (let* ((smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f))) - (debug:print-info 0 "smallest-time :" smallest-time " last-sync-time " last-sync-time) - (if (not (and target run-name)) - (if (or (and smallest-time (> smallest-time last-sync-time)) (and smallest-time (eq? last-sync-time 0))) - (pgdb:write-sync-time dbh area-info smallest-time))))) ;;this needs to be changed - (if (tasks:set-area dbh configdat) - (tasks:sync-to-postgres configdat dest) - (begin - (debug:print 0 *default-log-port* "ERROR: unable to create an area record") - #f))))) - -;;====================================================================== -;; L O C K I N G M E C H A N I S M S -;;====================================================================== - -;; faux-lock is deprecated. Please use simple-lock below -;; -(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t)) - (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count - (if (> wait-time 0) - (begin - (thread-sleep! 1) - (if (eq? wait-time 1) ;; only one second left, steal the lock - (begin - (debug:print-info 0 *default-log-port* "stealing lock for " keyname) - (common:faux-unlock keyname force: #t))) - (common:faux-lock keyname wait-time: (- wait-time 1))) - #f) - (begin - (rmt:no-sync-set keyname (conc (current-process-id))) - (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))))) - -(define (common:faux-unlock keyname #!key (force #f)) - (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))) - (begin - (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname)) - #t) - #f)) - -;; simple lock. improve and converge on this one. -;; -(define (common:simple-lock keyname) - (rmt:no-sync-get-lock keyname)) - -(define (common:simple-unlock keyname #!key (force #f)) - (rmt:no-sync-del! keyname)) - -;;====================================================================== -;; db based host calls -;;====================================================================== - -;;====================================================================== -;; T E S T L A U N C H I N G P E R I T E M W I T H H O S T T Y P E S -;;====================================================================== -;; -;; [hosts] -;; arm cubie01 cubie02 -;; x86_64 zeus xena myth01 -;; allhosts #{g hosts arm} #{g hosts x86_64} -;; -;; [host-types] -;; general #MTLOWESTLOAD #{g hosts allhosts} -;; arm #MTLOWESTLOAD #{g hosts arm} -;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo -;; -;; [host-rules] -;; # maxnload => max normalized load -;; # maxnjobs => max jobs per cpu -;; # maxjobrate => max jobs per second -;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 -;; -;; [launchers] -;; envsetup general -;; xor/%/n 4C16G -;; % nbgeneral -;; -;; [jobtools] -;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. -;; flexi-launcher yes -;; launcher nbfake -;; -(define (common:get-launcher configdat testname itempath) - (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) - (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher - (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no"))) - (let* ((launchers (hash-table-ref/default configdat "launchers" '()))) - (if (null? launchers) - fallback-launcher - (let loop ((hed (car launchers)) - (tal (cdr launchers))) - (let ((patt (car hed)) - (host-type (cadr hed))) - (if (tests:match patt testname itempath) - (begin - (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) - (let ((launcher (configf:lookup configdat "host-types" host-type))) - (if launcher - (let* ((launcher-parts (string-split launcher)) - (launcher-exe (car launcher-parts))) - (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline - (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) - (count 100)) - (if targ-host - (conc "remrun " targ-host) - (if (> count 0) - (begin - (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) - (thread-sleep! (- 101 count)) - (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) - (- count 1))) - (begin - (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) - (exit))))) - launcher)) - (begin - (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) - (if (null? tal) - fallback-launcher - (loop (car tal)(cdr tal))))))) - ;; no match, try again - (if (null? tal) - fallback-launcher - (loop (car tal)(cdr tal)))))))) - fallback-launcher))) - -;; ideally put all this info into the db, no need to preserve it across moving homehost -;; -;; return list of -;; ( reachable? cpuload update-time ) -(define (common:get-host-info hostname) - (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data - (load (car loadinfo)) - (load-sample-time (cdr loadinfo)) - (load-sample-age (- (current-seconds) load-sample-time)) - (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds - (host-last-update-timeout-seconds 4) - (host-rec (hash-table-ref/default *host-loads* hostname #f)) - ) - (cond - ((< load-sample-age loadinfo-timeout-seconds) - (list #t - load-sample-time - load)) - ((and host-rec - (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds))) - (list #t - (host-last-update host-rec) - (host-last-cpuload host-rec ))) - ((common:unix-ping hostname) - (list #t - (current-seconds) - (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds - (else - (list #f 0 -1) ;; bad host, don't use! - )))) - -;; see defstruct host at top of file. -;; host: reachable last-update last-used last-cpuload -;; -(define (common:update-host-loads-table hosts-raw) - (let* ((hosts (filter (lambda (x) - (string-match (regexp "^\\S+$") x)) - hosts-raw))) - (for-each - (lambda (hostname) - (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) - (if h - h - (let ((h (make-host))) - (hash-table-set! *host-loads* hostname h) - h)))) - (host-info (common:get-host-info hostname)) - (is-reachable (car host-info)) - (last-reached-time (cadr host-info)) - (load (caddr host-info))) - (host-reachable-set! rec is-reachable) - (host-last-update-set! rec last-reached-time) - (host-last-cpuload-set! rec load))) - hosts))) - -;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the -;; [host-rules] section. -;; -(define (common:get-least-loaded-host hosts-raw host-type configdat) - (let* ((rdat (configf:lookup configdat "host-rules" host-type)) - (rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate - (maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load - (maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs - (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second - (hosts (filter (lambda (x) - (string-match (regexp "^\\S+$") x)) - hosts-raw)) - ;; (best-host #f) - (get-rec (lambda (hostname) - ;; (print "get-rec hostname=" hostname) - (let ((h (hash-table-ref/default *host-loads* hostname #f))) - (if h - h - (let ((h (make-host))) - (hash-table-set! *host-loads* hostname h) - h))))) - (best-load 99999) - (curr-time (current-seconds)) - (get-hosts-sorted (lambda (hosts) - (sort hosts (lambda (a b) - (let ((a-rec (get-rec a)) - (b-rec (get-rec b))) - ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec)) - ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec)) - (< (host-last-used a-rec) - (host-last-used b-rec)))))))) - (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts)) - (if (null? hosts) - #f ;; no hosts to select from. All done and giving up now. - (let ((hosts-sorted (get-hosts-sorted hosts))) - (common:update-host-loads-table hosts) - (let loop ((hostname (car hosts-sorted)) - (tal (cdr hosts-sorted)) - (best-host #f)) - (let* ((rec (get-rec hostname)) - (reachable (host-reachable rec)) - (load (host-last-cpuload rec)) - (last-used (host-last-used rec)) - (delta (- curr-time last-used)) - (job-rate (if (> delta 0) - (/ 1 delta) - 999)) ;; jobs per second - (new-best - (cond - ((not reachable) - (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.") - best-host) - ((and (< load maxnload) ;; load is acceptable - (< job-rate maxjobrate)) ;; job rate is acceptable - (set! best-load load) - hostname) - (else best-host)))) - (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." ) - (if new-best - (begin ;; found a host, return it - (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) - (host-last-used-set! rec curr-time) - new-best) - (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) - -(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)) - (let* ((loadavg (common:get-cpu-load remote-host)) - (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again - (common:get-num-cpus remote-host) - numcpus-in)) - (maxload (if force-maxload - maxload-in - (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? - (first (car loadavg)) - (next (cadr loadavg)) - (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 - (loadjmp (- first next)) - (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously - (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload - ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp) - (cond - ((and (> first adjload) - (> count 0)) - (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) - (thread-sleep! adjwait) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) - ((and (> loadjmp numcpus) - (> count 0)) - (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) - (thread-sleep! adjwait) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) - -(define (common:wait-for-homehost-load maxload msg) - (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. - #f - (common:get-homehost))) - (hh (if hh-dat (car hh-dat) #f)) - (numcpus (common:get-num-cpus hh))) - (common:wait-for-normalized-load maxload msg hh))) - -(define (common:get-num-cpus remote-host) - (let* ((actual-host (or remote-host (get-host-name)))) - (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often! - (let* ((proc (lambda () - (let loop ((numcpu 0) - (inl (read-line))) - (if (eof-object? inl) - (begin - (common:write-cached-info remote-host "num-cpus" numcpu) - numcpu) - (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl) - (+ numcpu 1) - numcpu) - (read-line)))))) - (result (if remote-host - (with-input-from-pipe - (conc "ssh " remote-host " cat /proc/cpuinfo") - proc) - (with-input-from-file "/proc/cpuinfo" proc)))) - (common:write-cached-info actual-host "num-cpus" result) - result)))) - -;; wait for normalized cpu load to drop below maxload -;; -(define (common:wait-for-normalized-load maxload msg remote-host) - (let ((num-cpus (common:get-num-cpus remote-host))) - (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host))) - -;;====================================================================== -;; D E B U G G I N G S T U F F -;;====================================================================== - -;; (define *verbosity* 1) -;; (define *logging* #f) - -(define (common:set-last-run-version) - (rmt:set-var "MEGATEST_VERSION" (common:version-signature))) - -;; postive number if megatest version > db version -;; negative number if megatest version < db version -(define (common:version-db-delta) - (- megatest-version (common:get-last-run-version-number))) - -(define (common:version-changed?) - (not (equal? (common:get-last-run-version) - (common:version-signature)))) - -;; from metadat lookup MEGATEST_VERSION -;; -(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB - (rmt:get-var "MEGATEST_VERSION")) - -(define (common:get-last-run-version-number) - (string->number - (substring (common:get-last-run-version) 0 6))) - -(define (common:api-changed?) - (not (equal? (substring (->string megatest-version) 0 4) - (substring (conc (common:get-last-run-version)) 0 4)))) - -;; '(print (string-intersperse (map cadr (hash-table-ref/default (read-config "megatest.config" \#f \#t) "disks" '"'"'("none" ""))) "\n"))' -(define (common:get-disks #!key (configf #f)) - (hash-table-ref/default - (or configf (configf:read-config "megatest.config" #f #t)) - "disks" '("none" ""))) - -;;====================================================================== -;; watchdog and exit procedures -;;====================================================================== - -;;====================================================================== -;; E X I T H A N D L I N G -;;====================================================================== - -;; (let ((ohh (common:on-homehost?)) -;; (srv (args:get-arg "-server"))) -;; (and ohh srv))) - ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv) - -(define *watchdog* (make-thread - (lambda () - (handle-exceptions - exn - (begin - (print-call-chain) - (print " message: " ((condition-property-accessor 'exn 'message) exn))) - (common:watchdog))) - "Watchdog thread")) - -;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp -;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) -;; -(define (common:readonly-watchdog dbstruct) - (thread-sleep! 0.05) ;; delay for startup - (debug:print-info 13 *default-log-port* "common:readonly-watchdog entered.") - ;; sync megatest.db to /tmp/.../megatst.db - (let* ((sync-cool-off-duration 3) - (golden-mtdb (dbr:dbstruct-mtdb dbstruct)) - (golden-mtpath (db:dbdat-get-path golden-mtdb)) - (tmp-mtdb (dbr:dbstruct-tmpdb dbstruct)) - (tmp-mtpath (db:dbdat-get-path tmp-mtdb))) - (debug:print-info 0 *default-log-port* "Read-only periodic sync thread started.") - (let loop ((last-sync-time 0)) - (debug:print-info 13 *default-log-port* "loop top tmp-mtpath="tmp-mtpath" golden-mtpath="golden-mtpath) - (let* ((duration-since-last-sync (- (current-seconds) last-sync-time))) - (debug:print-info 13 *default-log-port* "duration-since-last-sync="duration-since-last-sync) - (if (and (not *time-to-exit*) - (< duration-since-last-sync sync-cool-off-duration)) - (thread-sleep! (- sync-cool-off-duration duration-since-last-sync))) - (if (not *time-to-exit*) - (let ((golden-mtdb-mtime (file-modification-time golden-mtpath)) - (tmp-mtdb-mtime (file-modification-time tmp-mtpath))) - (if (> golden-mtdb-mtime tmp-mtdb-mtime) - (if (< golden-mtdb-mtime (- (current-seconds) 3)) ;; file has NOT been touched in past three seconds, this way multiple servers won't fight to sync back - (let ((res (db:multi-db-sync dbstruct 'old2new))) - (debug:print-info 13 *default-log-port* "rosync called, " res " records transferred.")))) - (loop (current-seconds))) - #t))) - (debug:print-info 0 *default-log-port* "Exiting readonly-watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" mtpath="golden-mtpath))) - - -;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage -(define (common:watchdog) - (debug:print-info 13 *default-log-port* "common:watchdog entered.") - (if (launch:setup) - (if (common:on-homehost?) - (let ((dbstruct (db:setup #t))) - (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct) - (cond - ((dbr:dbstruct-read-only dbstruct) - (debug:print-info 13 *default-log-port* "loading read-only watchdog") - (common:readonly-watchdog dbstruct)) - (else - (debug:print-info 13 *default-log-port* "loading writable-watchdog.") - (let* ((syncer (or (configf:lookup *configdat* "server" "sync-method") "brute-force-sync"))) - (cond - ((equal? syncer "brute-force-sync") - (server:writable-watchdog-bruteforce dbstruct)) - ((equal? syncer "delta-sync") - (server:writable-watchdog-deltasync dbstruct)) - (else - (debug:print-error 0 *default-log-port* "Unknown server/sync-method specified ("syncer") - valid values are brute-force-sync and delta-sync.") - (exit 1))) - ;;(debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] Syncer started (method="syncer")") - ))) - (debug:print-info 13 *default-log-port* "watchdog done.")) - (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost")))) - - -(define (std-exit-procedure) - ;;(common:telemetry-log-close) - (on-exit (lambda () 0)) - ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) - (let ((no-hurry (if *time-to-exit* ;; hurry up - #f - (begin - (set! *time-to-exit* #t) - #t)))) - (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.") - (if (and no-hurry (debug:debug-mode 18)) - (rmt:print-db-stats)) - (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds - (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated - (if *task-db* - (let ((db (cdr *task-db*))) - (if (sqlite3:database? db) - (begin - (sqlite3:interrupt! db) - (sqlite3:finalize! db #t) - ;; (vector-set! *task-db* 0 #f) - (set! *task-db* #f))))) - #;(http-client#close-all-connections!) - ;; (if (and *runremote* - ;; (remote-conndat *runremote*)) - ;; (begin - ;; (http-client#close-all-connections!))) ;; for http-client - (if (not (eq? *default-log-port* (current-error-port))) - (close-output-port *default-log-port*)) - (set! *default-log-port* (current-error-port))) "Cleanup db exit thread")) - (th2 (make-thread (lambda () - (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...") - (if no-hurry - (begin - (thread-sleep! 5)) ;; give the clean up few seconds to do it's stuff - (begin - (thread-sleep! 2))) - (debug:print 4 *default-log-port* " ... done") - ) - "clean exit"))) - (thread-start! th1) - (thread-start! th2) - (thread-join! th1) - ) - ) - - 0) - -(define (std-signal-handler signum) - ;; (signal-mask! signum) - (set! *time-to-exit* #t) - ;;(debug:print-info 13 *default-log-port* "got signal "signum) - (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly") - ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway - (exit)) - -(define (special-signal-handler signum) - ;; (signal-mask! signum) - (set! *time-to-exit* #t) - ;;(debug:print-info 13 *default-log-port* "got signal "signum) - (debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting!!") - ;;TODO send email to notify admin contact listed in the config that the lisner got killed - ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway - (exit)) - - -(set-signal-handler! signal/int std-signal-handler) ;; ^C -(set-signal-handler! signal/term std-signal-handler) - -;; (set-signal-handler! signal/stop std-signal-handler) ;; ^Z NO, do NOT handle ^Z! - -;; Force a megatest cleanup-db if version is changed and skip-version-check not specified -;; Do NOT check if not on homehost! -;; -(define (common:exit-on-version-changed) - (if (common:on-homehost?) - (if (common:api-changed?) - (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) - (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.db")) - (read-only (not (file-write-access? dbfile))) - (dbstruct (db:setup #t))) - (debug:print 0 *default-log-port* - "WARNING: Version mismatch!\n" - " expected: " (common:version-signature) "\n" - " got: " (common:get-last-run-version)) - (cond - ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t) - ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only) - (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db - (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "Failed to switch versions.") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (print-call-chain (current-error-port)) - (exit 1)) - (common:cleanup-db dbstruct))) - ((not (common:file-exists? mtconf)) - (debug:print 0 *default-log-port* " megatest.config does not exist in this area. Cannot proceed with megatest version migration.") - (exit 1)) - ((not (common:file-exists? dbfile)) - (debug:print 0 *default-log-port* " megatest.db does not exist in this area. Cannot proceed with megatest version migration.") - (exit 1)) - ((not (eq? (current-user-id)(file-owner mtconf))) - (debug:print 0 *default-log-port* " You do not own megatest.db in this area. Cannot proceed with megatest version migration.") - (exit 1)) - (read-only - (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") - (exit 1)) - (else - (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"") - (exit 1))))))) -;; (begin -;; (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") -;; (exit 1)))) - -;; Move me elsewhere ... -;; RADT => Why do we meed the version check here, this is called only if version misma -;; -(define (common:cleanup-db dbstruct #!key (full #f)) - (apply db:multi-db-sync - dbstruct - 'schema - ;; 'new2old - 'killservers - 'adj-target - ;; 'old2new - 'new2old - ;; (if full - '(dejunk) - ;; '()) - ) - (if (common:api-changed?) - (common:set-last-run-version))) - -;; called in megatest.scm, host-port is string hostname:port -;; -;; NOTE: This is NOT called directly from clients as not all transports support a client running -;; in the same process as the server. -;; -#;(define (server:ping host-port-in #!key (do-exit #f)) - (let ((host:port (if (not host-port-in) ;; use read-dotserver to find - #f ;; (server:check-if-running *toppath*) - ;; (if (number? host-port-in) ;; we were handed a server-id - ;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) - ;; ;; (print "srec: " srec " host-port-in: " host-port-in) - ;; (if srec - ;; (conc (vector-ref srec 3) ":" (vector-ref srec 4)) - ;; (conc "no such server-id " host-port-in))) - host-port-in))) ;; ) - (let* ((host-port (if host:port - (let ((slst (string-split host:port ":"))) - (if (eq? (length slst) 2) - (list (car slst)(string->number (cadr slst))) - #f)) - #f))) -;; (toppath (launch:setup))) - ;; (print "host-port=" host-port) - (if (not host-port) - (begin - (if host-port-in - (debug:print 0 *default-log-port* "ERROR: bad host:port")) - (if do-exit (exit 1)) - #f) - (let* ((iface (car host-port)) - (port (cadr host-port)) - #;(server-dat (http-transport:client-connect iface port)) - (login-res (rmt:login-no-auto-client-setup server-dat))) - (if (and (list? login-res) - (car login-res)) - (begin - ;; (print "LOGIN_OK") - (if do-exit (exit 0)) - #t) - (begin - ;; (print "LOGIN_FAILED") - (if do-exit (exit 1)) - #f))))))) - -;; run ping in separate process, safest way in some cases -;; -(define (server:ping-server ifaceport) - (with-input-from-pipe - (conc (common:get-megatest-exe) " -ping " ifaceport) - (lambda () - (let loop ((inl (read-line)) - (res "NOREPLY")) - (if (eof-object? inl) - (case (string->symbol res) - ((NOREPLY) #f) - ((LOGIN_OK) #t) - (else #f)) - (loop (read-line) inl)))))) - -;; ping the given server -;; -#;(define (server:check-server server-record) - (let* ((server-url (server:record->url server-record)) - (res (case *transport-type* - ((http)(server:ping server-url)) - ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) - ))) - (if res - server-url - #f))) - -;; no longer care if multiple servers are started by accident. older servers will drop off in time. -;; -#;(define (server:check-if-running areapath) ;; #!key (numservers "2")) - (let* ((ns (server:get-num-servers)) - (servers (server:get-best (server:get-list areapath)))) - ;; (print "servers: " servers " ns: " ns) - (if (or (and servers - (null? servers)) - (not servers) - (and (list? servers) - (< (length servers) (random ns)))) ;; somewhere between 0 and numservers - #f - (let loop ((hed (car servers)) - (tal (cdr servers))) - (let ((res (server:check-server hed))) - (if res - res - (if (null? tal) - #f - (loop (car tal)(cdr tal))))))))) - - -;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). -;; -(define (server:login toppath) - (lambda (toppath) - (set! *db-last-access* (current-seconds)) ;; might not be needed. - (if (equal? *toppath* toppath) - #t - #f))) -;; Given a run id start a server process ### NOTE ### > file 2>&1 -;; if the run-id is zero and the target-host is set -;; try running on that host -;; incidental: rotate logs in logs/ dir. -;; -#;(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area - (let* ((curr-host (get-host-name)) - ;; (attempt-in-progress (server:start-attempted? areapath)) - ;; (dot-server-url (server:check-if-running areapath)) - (curr-ip (server:get-best-guess-address curr-host)) - (curr-pid (current-process-id)) - (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) - (target-host (car homehost)) - (testsuite (common:get-testsuite-name)) - (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) - (cmdln (conc (common:get-megatest-exe) - " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") - " -daemonize " - "") - ;; " -log " logfile - " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) - (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) - (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) - ;; we want the remote server to start in *toppath* so push there - (push-directory areapath) - (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") - (thread-start! log-rotate) - - ;; host.domain.tld match host? - (if (and target-host - ;; look at target host, is it host.domain.tld or ip address and does it - ;; match current ip or hostname - (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) - (not (equal? curr-ip target-host))) - (begin - (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) - (setenv "TARGETHOST" target-host))) - - (setenv "TARGETHOST_LOGF" logfile) - (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time - (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever - (system (conc "nbfake " cmdln)) - (unsetenv "TARGETHOST_LOGF") - (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) - (thread-join! log-rotate) - (pop-directory))) - -;; kind start up of servers, wait 40 seconds before allowing another server for a given -;; run-id to be launched -#;(define (server:kind-run areapath) - (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? - (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun - (call-num (car last-run-dat)) - (when-run (cadr last-run-dat)) - (run-delay (+ (case call-num - ((0) 0) - ((1) 20) - ((2) 300) - (else 600)) - (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously - (lock-file (conc areapath "/logs/server-start.lock"))) - (if (> (- (current-seconds) when-run) run-delay) - (begin - (common:simple-file-lock-and-wait lock-file expire-time: 15) - (server:run areapath) - (thread-sleep! 2) ;; don't release the lock for at least a few seconds - (common:simple-file-release-lock lock-file))) - (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) - -#;(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. - -#;(define (server:start-and-wait areapath #!key (timeout 60)) - (let ((give-up-time (+ (current-seconds) timeout))) - (let loop ((server-url (server:check-if-running areapath)) - (try-num 0)) - (if (or server-url - (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. - server-url - (let ((num-ok (length (server:get-best (server:get-list areapath))))) - (if (and (> try-num 0) ;; first time through simply wait a little while then try again - (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one - (server:kind-run areapath)) - (thread-sleep! 5) - (loop (server:check-if-running areapath) - (+ try-num 1))))))) - -;;====================================================================== -;; make html output -;;====================================================================== - -(define (tests:test-set-toplog! run-id test-name logf) - (rmt:general-call 'tests:test-set-toplog run-id logf run-id test-name)) - -(define (tests:summarize-items run-id test-id test-name force) - ;; if not force then only update the record if one of these is true: - ;; 1. logf is "log/final.log - ;; 2. logf is same as outputfilename - (let* ((outputfilename (conc "megatest-rollup-" test-name ".html")) - (orig-dir (current-directory)) - (logf-info (rmt:test-get-logfile-info run-id test-name)) - (logf (if logf-info (cadr logf-info) #f)) - (path (if logf-info (car logf-info) #f))) - ;; This query finds the path and changes the directory to it for the test - (if (and (string? path) - (directory? path)) ;; can get #f here under some wierd conditions. why, unknown ... - (begin - (debug:print 4 *default-log-port* "Found path: " path) - (change-directory path)) - ;; (set! outputfilename (conc path "/" outputfilename))) - (debug:print-error 0 *default-log-port* "summarize-items for run-id=" run-id ", test-name=" test-name ", no such path: " path)) - (debug:print 4 *default-log-port* "summarize-items with logf " logf ", outputfilename " outputfilename " and force " force) - (if (or (equal? logf "logs/final.log") - (equal? logf outputfilename) - force) - (let ((my-start-time (current-seconds)) - (lockf (conc outputfilename ".lock"))) - (let loop ((have-lock (common:simple-file-lock lockf))) - (if have-lock - (let ((script (configf:lookup *configdat* "testrollup" test-name))) - (print "Obtained lock for " outputfilename) - (rmt:set-state-status-and-roll-up-items run-id test-name "" #f #f #f) - (if script - (system (conc script " > " outputfilename " & ")) - (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename)) - (common:simple-file-release-lock lockf) - (change-directory orig-dir) - ;; NB// tests:test-set-toplog! is remote internal... - (tests:test-set-toplog! run-id test-name outputfilename)) - ;; didn't get the lock, check to see if current update started later than this - ;; update, if so we can exit without doing any work - (if (> my-start-time (handle-exceptions - exn - 0 - (file-modification-time lockf))) - ;; we started since current re-gen in flight, delay a little and try again - (begin - (debug:print-info 1 *default-log-port* "Waiting to update " outputfilename ", another test currently updating it") - (thread-sleep! (+ 5 (random 5))) ;; delay between 5 and 10 seconds - (loop (common:simple-file-lock lockf)))))))))) - -(define (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename) - (let ((counts (make-hash-table)) - (statecounts (make-hash-table)) - (outtxt "") - (tot 0) - (testdat (rmt:test-get-records-for-index-file run-id test-name))) - (with-output-to-file outputfilename - (lambda () - (set! outtxt (conc outtxt "Summary: " test-name - "

Summary for " test-name "

")) - (for-each - (lambda (testrecord) - (let ((id (vector-ref testrecord 0)) - (itempath (vector-ref testrecord 1)) - (state (vector-ref testrecord 2)) - (status (vector-ref testrecord 3)) - (run_duration (vector-ref testrecord 4)) - (logf (vector-ref testrecord 5)) - (comment (vector-ref testrecord 6))) - (hash-table-set! counts status (+ 1 (hash-table-ref/default counts status 0))) - (hash-table-set! statecounts state (+ 1 (hash-table-ref/default statecounts state 0))) - (set! outtxt (conc outtxt "" - ;; " " itempath "" - " " itempath "" - "" state "" - "" status "" - "" (if (equal? comment "") - " " - comment) "" - "")))) - (if (list? testdat) - testdat - (begin - (print "ERROR: failed to get records with rmt:test-get-records-for-index-file run-id=" run-id "test-name=" test-name) - '()))) - - (print "
") - ;; Print out stats for status - (set! tot 0) - (print "") - (for-each (lambda (state) - (set! tot (+ tot (hash-table-ref statecounts state))) - (print "")) - (hash-table-keys statecounts)) - (print "

State stats

" state "" (hash-table-ref statecounts state) "
Total" tot "
") - (print "
") - ;; Print out stats for state - (set! tot 0) - (print "") - (for-each (lambda (status) - (set! tot (+ tot (hash-table-ref counts status))) - (print "")) - (hash-table-keys counts)) - (print "

Status stats

" status - "" (hash-table-ref counts status) "
Total" tot "
") - (print "
") - - (print "" - "" - outtxt "
ItemStateStatusComment
") - ;; (release-dot-lock outputfilename) - ;;(rmt:update-run-stats - ;; run-id - ;; (hash-table-map - ;; state-status-counts - ;; (lambda (key val) - ;; (append key (list val))))) - )))) - -(define tests:css-jscript-block -#< -ul.LinkedList { display: block; } -/* ul.LinkedList ul { display: none; } */ -.HandCursorStyle { cursor: pointer; cursor: hand; } /* For IE */ -th {background-color: #8c8c8c;} -td.test {background-color: #d9dbdd;} -td.PASS {background-color: #347533;} -td.FAIL {background-color: #cc2812;} -td.SKIP{background-color: #FFD733;} -td.WARN {background-color: #EA8724;} -td.WAIVED {background-color: #838A12;} -td.ABORT{background-color: #EA24B7;} -.PASS .link, .SKIP .link, .WARN .link,.WAIVED .link,.ABORT .link, .FAIL .link{color: #FFFFFF;} - - - - - - -EOF -) - -(define tests:css-jscript-block-dynamic -#< -EOF -) - -(define (test:js-block javascript-lib) - (conc "" )) - - -(define tests:css-jscript-block-static (test:js-block *java-script-lib*)) - -(define (tests:css-jscript-block-cond dynamic) - (if (equal? dynamic #t) - tests:css-jscript-block-dynamic - tests:css-jscript-block-static)) - - -(define (tests:run-record->test-path run numkeys) - (append (take (vector->list run) numkeys) - (list (vector-ref run (+ 1 numkeys))))) - - -(define (tests:get-rest-data runs header numkeys) - (let ((resh (make-hash-table))) - (for-each - (lambda (run) - (let* ((run-id (db:get-value-by-header run header "id")) - (run-dir (tests:run-record->test-path run numkeys)) - (test-data (rmt:get-tests-for-run - run-id - "%" ;; testnamepatt - '() ;; states - '() ;; statuses - #f ;; offset - #f ;; num-to-get - #f ;; hide/not-hide - #f ;; sort-by - #f ;; sort-order - #f ;; 'shortlist ;; qrytype - 0 ;; last update - #f))) - - (map (lambda (test) - (let* ((test-name (vector-ref test 2)) - (test-html-path (conc (vector-ref test 10) "/" (vector-ref test 13))) - (test-item (conc test-name ":" (vector-ref test 11))) - (test-status (vector-ref test 4))) - - (if (not (hash-table-ref/default resh test-name #f)) - (hash-table-set! resh test-name (make-hash-table))) - (if (not (hash-table-ref/default (hash-table-ref/default resh test-name #f) test-item #f)) - (hash-table-set! (hash-table-ref/default resh test-name #f) test-item (make-hash-table))) - (hash-table-set! (hash-table-ref/default (hash-table-ref/default resh test-name #f) test-item #f) run-id (list test-status test-html-path)))) - test-data))) - runs) - resh)) - - -;; hash-table tree to html list tree -;; -;; tipfunc takes two parameters: y the tip value and path the path to that point -;; -(define (common:htree->html ht path tipfunc) - (let ((datlist (sort (hash-table->alist ht) - (lambda (a b) - (string< (car a)(car b)))))) - (if (null? datlist) - (tipfunc #f path) ;; really shouldn't get here - (s:ul - (map (lambda (x) - (let* ((levelname (car x)) - (y (cdr x)) - (newpath (append path (list levelname))) - (leaf (or (not (hash-table? y)) - (null? (hash-table-keys y))))) - (if leaf - (s:li (tipfunc y newpath)) - (s:li - (list - levelname - (common:htree->html y newpath tipfunc)))))) - datlist))))) - - -;; tests:genrate dashboard body -;; - -(define (tests:dashboard-body page pg-size keys numkeys total-runs linktree area-name get-prev-links get-next-links flag run-patt target-patt) - (let* ((start (* page pg-size)) - ;(runsdat (rmt:get-runs "%" pg-size start (map (lambda (x)(list x "%")) keys))) - (runsdat (rmt:get-runs-by-patt keys run-patt target-patt start pg-size #f 0 sort-order: "desc")) - ; db:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-update - (header (vector-ref runsdat 0)) - (runs (vector-ref runsdat 1)) - (ctr 0) - (test-runs-hash (tests:get-rest-data runs header numkeys)) - (test-list (hash-table-keys test-runs-hash))) - - (s:html tests:css-jscript-block (tests:css-jscript-block-cond flag) - (s:title "Summary for " area-name) - (s:body 'onload "addEvents();" - (get-prev-links page linktree) - (get-next-links page linktree total-runs) - - (s:h1 "Summary for " area-name) - (s:h3 "Filter" ) - (s:input 'type "text" 'name "testname" 'id "testname" 'length "30" 'onkeyup "filtersome()") - ;; top list - - (s:table 'id "LinkedList1" 'border "1" 'cellspacing 0 - (map (lambda (key) - (let* ((res (s:tr 'class "something" - (s:th key ) - (map (lambda (run) - (s:th (vector-ref run ctr))) - runs)))) - (set! ctr (+ ctr 1)) - res)) - keys) - (s:tr - (s:th "Run Name") - (map (lambda (run) - (s:th (db:get-value-by-header run header "runname"))) - runs)) - - (map (lambda (test-name) - (let* ((item-hash (hash-table-ref/default test-runs-hash test-name #f)) - (item-keys (sort (hash-table-keys item-hash) string<=?))) - (map (lambda (item-name) - (let* ((res (s:tr 'class item-name - (s:td item-name 'class "test" ) - (map (lambda (run) - (let* ((run-test (hash-table-ref/default item-hash item-name #f)) - (run-id (db:get-value-by-header run header "id")) - (result (hash-table-ref/default run-test run-id "n/a")) - ;(relative-path (get-relative-path)) - (status (if (string? result) - result - (car result))) - (link (if (string? result) - result - (if (equal? flag #t) - (s:a (car result) 'href (conc "./test_log?runid=" run-id "&testname=" item-name )) - (s:a (car result) 'href (string-substitute (conc linktree "/") "" (cadr result) "-")))))) - (s:td link 'class status))) - runs)))) - res)) - item-keys))) - test-list)))))) - -;; (tests:create-html-tree "test-index.html") -;; -(define (tests:create-html-tree outf) - (let* ((lockfile (conc outf ".lock")) - (runs-to-process '()) - (linktree (common:get-linktree)) - (area-name (common:get-testsuite-name)) - (keys (rmt:get-keys)) - (numkeys (length keys)) - (run-patt (or (args:get-arg "-run-patt") - (args:get-arg "-runname") - "%")) - (target (or (args:get-arg "-target-patt") - (args:get-arg "-target") - "%")) - (targlist (string-split target "/")) - (numtarg (length targlist)) - (targtweaked (if (> numkeys numtarg) - (append targlist (make-list (- numkeys numtarg) "%")) - targlist)) - (target-patt (string-join targtweaked "/")) - ;(total-runs (rmt:get-num-runs "%")) ;;this needs to be changed to filter by target - (total-runs (rmt:get-runs-cnt-by-patt run-patt target-patt keys )) - (pg-size 10)) - (if (common:simple-file-lock lockfile) - (begin - ;(print total-runs) - (let loop ((page 0)) - (let* ((oup (open-output-file (or outf (conc linktree "/page" page ".html")))) - (get-prev-links (lambda (page linktree ) - (let* ((link (if (not (eq? page 0)) - (s:a "<<prev" 'href (conc "page" (- page 1) ".html")) - (s:a "" 'href (conc "page" page ".html"))))) - link))) - (get-next-links (lambda (page linktree total-runs) - (let* ((link (if (> total-runs (+ 10 (* page pg-size))) - (s:a "next>>" 'href (conc "page" (+ page 1) ".html")) - (s:a "" 'href (conc "page" page ".html"))))) - link))) ) - (print "total runs: " total-runs) - (s:output-new - oup - (tests:dashboard-body page pg-size keys numkeys total-runs linktree area-name get-prev-links get-next-links #f run-patt target-patt)) ;; update this function - (close-output-port oup) - ; (set! page (+ 1 page)) - (if (> total-runs (* (+ 1 page) pg-size)) - (loop (+ 1 page))))) - (common:simple-file-release-lock lockfile)) - - #f))) - - -(define (tests:readlines filename) - (call-with-input-file filename - (lambda (p) - (let loop ((line (read-line p)) - (result '())) - (if (eof-object? line) - (reverse result) - (loop (read-line p) (cons line result))))))) - -(define (tests:get-test-log run-id test-name item-name) - (let* ((test-data (rmt:get-tests-for-run - (string->number run-id) - test-name ;; testnamepatt - '() ;; states - '() ;; statuses - #f ;; offset - #f ;; num-to-get - #f ;; hide/not-hide - #f ;; sort-by - #f ;; sort-order - #f ;; 'shortlist ;; qrytype - 0 ;; last update - #f)) - (path "") - (found 0)) - (debug:print-info 0 *default-log-port* "found: " found ) - - (let loop ((hed (car test-data)) - (tal (cdr test-data))) - (debug:print-info 0 *default-log-port* "item: " (vector-ref hed 11) (vector-ref hed 10) "/" (vector-ref hed 13)) - - (if (equal? (vector-ref hed 11) item-name) - (begin - (set! found 1) - (set! path (conc (vector-ref hed 10) "/" (vector-ref hed 13))))) - (if (and (not (null? tal)) (equal? found 0)) - (loop (car tal)(cdr tal)))) - (if (equal? path "") - "

Data not found

" - (string-join (tests:readlines path) "\n")))) - - -(define (tests:dynamic-dboard page) -;(define (tests:create-html-tree o) - (let* ( -;(page "1") - (linktree (common:get-linktree)) - (area-name (common:get-testsuite-name)) - (keys (rmt:get-keys)) - (numkeys (length keys)) - (targtweaked (make-list numkeys "%")) - (target-patt (string-join targtweaked "/")) - (total-runs (rmt:get-num-runs "%")) - (pg-size 10) - (pg (if (equal? page #f) - 0 - (- (string->number page) 1))) - (get-prev-links (lambda (pg linktree) - (debug:print-info 0 *default-log-port* "val: " (- 1 pg)) - (let* ((link (if (not (eq? pg 0)) - (s:a "<<prev " 'href (conc "dashboard?page=" pg )) - (s:a "" 'href (conc "dashboard?page=" pg))))) - link))) - (get-next-links (lambda (pg linktree total-runs) - (debug:print-info 0 *default-log-port* "val: " pg) - (debug:print-info 0 *default-log-port* "val: " total-runs " size" pg-size) - - (let* ((link (if (> total-runs (+ 10 (* pg pg-size))) - (s:a "next>> " 'href (conc "dashboard?page=" (+ pg 2) )) - (s:a "" 'href (conc "dashboard?page=" pg ))))) - link))) - (html-body (tests:dashboard-body pg pg-size keys numkeys total-runs linktree area-name get-prev-links get-next-links #t "%" target-patt))) ;; update tis function - html-body)) - -(define (tests:create-html-summary outf) - (let* ((lockfile (conc outf ".lock")) - (linktree (common:get-linktree)) - (keys (rmt:get-keys)) - (area-name (common:get-testsuite-name)) - (run-patt (or (args:get-arg "-run-patt") - (args:get-arg "-runname") - "%")) - (target (or (args:get-arg "-target-patt") - (args:get-arg "-target") - "%")) - (targlist (string-split target "/")) - (numkeys (length keys)) - (numtarg (length targlist)) - (targtweaked (if (> numkeys numtarg) - (append targlist (make-list (- numkeys numtarg) "%")) - targlist)) - (target-patt (string-join targtweaked "/"))) - (if (common:simple-file-lock lockfile) - (begin - (let* (;(runsdat1 (rmt:get-runs run-patt #f #f (map (lambda (x)(list x "%")) keys))) - (runsdat (rmt:get-runs-by-patt keys run-patt target-patt #f #f #f 0)) - (runs (vector-ref runsdat 1)) - (header (vector-ref runsdat 0)) - (oup (open-output-file (or outf (conc linktree "/targets.html")))) - (target-hash (test:create-target-hash runs header (length keys)))) - (test:create-target-html target-hash oup area-name linktree) - (test:create-run-html runs area-name linktree (length keys) header)) - (common:simple-file-release-lock lockfile)) - #f))) - -(define (test:get-test-hash test-data) - (let ((resh (make-hash-table))) - (map (lambda (test) - (let* ((test-name (vector-ref test 2)) - (test-html-path (if (file-exists? (conc (vector-ref test 10) "/test-summary.html")) - (conc (vector-ref test 10) "/test-summary.html" ) - (conc (vector-ref test 10) "/" (vector-ref test 13)))) - (test-item (vector-ref test 11)) - (test-status (vector-ref test 4))) - (if (not (hash-table-ref/default resh test-item #f)) - (hash-table-set! resh test-item (make-hash-table))) - (hash-table-set! (hash-table-ref/default resh test-item #f) test-name (list test-status test-html-path)))) - test-data) -resh)) - -(define (test:get-data->b-keys ordered-data a-keys) - (delete-duplicates - (sort (apply - append - (map (lambda (sub-key) - (let ((subdat (hash-table-ref ordered-data sub-key))) - (hash-table-keys subdat))) - a-keys)) - string>=?))) - - -(define (test:create-run-html runs area-name linktree numkeys header) - (map (lambda (run) - (let* ((target (string-join (take (vector->list run) numkeys) "/")) - (run-name (db:get-value-by-header run header "runname")) - (run-time (seconds->work-week/day-time (db:get-value-by-header run header "event_time"))) - (oup (if (file-exists? (conc linktree "/" target "/" run-name)) - (open-output-file (conc linktree "/" target "/" run-name "/run.html")) - #f)) - (run-id (db:get-value-by-header run header "id")) - (test-data (rmt:get-tests-for-run - run-id - "%" ;; testnamepatt - '() ;; states - '() ;; statuses - #f ;; offset - #f ;; num-to-get - #f ;; hide/not-hide - #f ;; sort-by - #f ;; sort-order - #f ;; 'shortlist ;; qrytype - 0 ;; last update - #f)) - (item-test-hash (test:get-test-hash test-data)) - (items (hash-table-keys item-test-hash)) - (test-names (test:get-data->b-keys item-test-hash items))) - (if oup - (begin - (s:output-new - oup - (s:html tests:css-jscript-block (tests:css-jscript-block-cond #f) - (s:title "Runs View " run-name) - (s:body - (s:h1 "Runs View " ) - (s:h3 "Target" target) - (s:p - (s:b "Run name" ) run-name) - (s:p - (s:b "Run Date" ) run-time) - (s:table 'border 1 'cellspacing 0 - (s:tr - (s:th "Items") - (map (lambda (test) - (s:th test)) - test-names)) - (map (lambda (item) - (let* ((test-hash (hash-table-ref/default item-test-hash item #f))) - (if test-hash - (begin - (s:tr - (s:td 'class "test" item) - (map (lambda (test) - (let* ((test-details (hash-table-ref/default test-hash test #f)) - (status (if test-details - (car test-details))) - (link (if test-details - (string-substitute (conc linktree "/" target "/" run-name "/") "" (cadr test-details) "-")))) - (if test-details - (s:td 'class status - (s:a 'class "link" 'href link status )) - (s:td "")))) - test-names)))))) - (sort items string<=?)))))) - (close-output-port oup)) - (debug:print-info 0 "Skip: Dirctory structure " linktree "/" target "/" run-name " does not exist. Megatest will not create run.html")))) -runs)) - -(define (test:create-target-hash runs header numkeys) - (let ((resh (make-hash-table))) - (for-each - (lambda (run) - (let* ((run-name (db:get-value-by-header run header "runname")) - (target (string-join (take (vector->list run) numkeys) "/")) - (run-list (hash-table-ref/default resh target #f))) - - (if (not run-list) - (hash-table-set! resh target (list run-name)) - (hash-table-set! resh target (cons run-name run-list))))) - runs) - resh)) - -(define (test:get-max-run-cnt target-hash targets) - (let* ((cnt 0 )) - (map (lambda (target) - (let* ((runs (hash-table-ref/default target-hash target #f)) - (run-length (if runs - (length runs) - 0))) - - (if (< cnt run-length) - (set! cnt run-length)))) - targets) -cnt)) - -(define (test:pad-runs target-hash targets max-row-length) - (map (lambda (target) - (let loop ((run-list (hash-table-ref/default target-hash target #f))) - (if (< (length run-list) max-row-length) - (begin - (hash-table-set! target-hash target (cons "" run-list)) - (loop (hash-table-ref/default target-hash target #f) ))))) - targets) - target-hash) - -(define (test:create-target-html target-hash oup area-name linktree) - (let* ((targets (hash-table-keys target-hash)) - (max-row-length (test:get-max-run-cnt target-hash targets)) - (pad-runs-hash (test:pad-runs target-hash targets max-row-length))) - (s:output-new - oup - (s:html tests:css-jscript-block (tests:css-jscript-block-cond #f) - - (s:title "Target View " area-name) - (s:body - (s:h1 "Target View " area-name) - (s:table 'id "LinkedList1" 'border "1" 'cellspacing 0 - (s:tr 'class "something" - (s:th "Target") - (s:th 'colspan max-row-length "Runs")) - (let* ((tbl (map (lambda (target) - (s:tr - (s:td 'class "test" target) - (let* ((runs (hash-table-ref/default target-hash target #f)) - (rest-row (map (lambda (run) - (if (equal? run "") - (s:td run) - (if (file-exists?(conc linktree "/" target "/" run )) - (begin - (s:td - (s:a 'href (conc target "/" run "/run.html") run)))))) - (reverse runs)))) - rest-row))) - targets))) - tbl))))) - (close-output-port oup))) - - -(define (tests:create-html-tree-old outf) - (let* ((lockfile (conc outf ".lock")) - (runs-to-process '())) - (if (common:simple-file-lock lockfile) - (let* ((linktree (common:get-linktree)) - (oup (open-output-file (or outf (conc linktree "/runs-index.html")))) - (area-name (common:get-testsuite-name)) - (keys (rmt:get-keys)) - (numkeys (length keys)) - (runsdat (rmt:get-runs "%" #f #f (map (lambda (x)(list x "%")) keys))) - (header (vector-ref runsdat 0)) - (runs (vector-ref runsdat 1)) - (runtreedat (map (lambda (x) - (tests:run-record->test-path x numkeys)) - runs)) - (runs-htree (common:list->htree runtreedat))) - (set! runs-to-process runs) - (s:output-new - oup - (s:html tests:css-jscript-block - (s:title "Summary for " area-name) - (s:body 'onload "addEvents();" - (s:h1 "Summary for " area-name) - ;; top list - (s:ul 'id "LinkedList1" 'class "LinkedList" - (s:li - "Runs" - (common:htree->html runs-htree - '() - (lambda (x p) - (let* ((targ-path (string-intersperse p "/")) - (full-path (conc linktree "/" targ-path)) - (run-name (car (reverse p)))) - (if (and (common:file-exists? full-path) - (directory? full-path) - (file-write-access? full-path)) - (s:a run-name 'href (conc targ-path "/run-summary.html")) - (begin - (debug:print 0 *default-log-port* "INFO: Can't create " targ-path "/run-summary.html") - (conc run-name " (Not able to create summary at " targ-path ")"))))))))))) - (close-output-port oup) - (common:simple-file-release-lock lockfile) - - (for-each - (lambda (run) - (let* ((test-subpath (tests:run-record->test-path run numkeys)) - (run-id (db:get-value-by-header run header "id")) - (run-dir (tests:run-record->test-path run numkeys)) - (test-dats (rmt:get-tests-for-run - run-id - "%/" ;; testnamepatt - '() ;; states - '() ;; statuses - #f ;; offset - #f ;; num-to-get - #f ;; hide/not-hide - #f ;; sort-by - #f ;; sort-order - #f ;; 'shortlist ;; qrytype - 0 ;; last update - #f)) - (tests-tree-dat (map (lambda (test-dat) - ;; (tests:run-record->test-path x numkeys)) - (let* ((test-name (db:test-get-testname test-dat)) - (item-path (db:test-get-item-path test-dat)) - (full-name (db:test-make-full-name test-name item-path)) - (path-parts (string-split full-name))) - path-parts)) - test-dats)) - (tests-htree (common:list->htree tests-tree-dat)) - (html-dir (conc linktree "/" (string-intersperse run-dir "/"))) - (html-path (conc html-dir "/run-summary.html")) - (oup (if (and (common:file-exists? html-dir) - (directory? html-dir) - (file-write-access? html-dir)) - (open-output-file html-path) - #f))) - ;; (print "run-dir: " run-dir ", tests-tree-dat: " tests-tree-dat) - (if oup - (begin - (s:output-new - oup - (s:html tests:css-jscript-block - (s:title "Summary for " area-name) - (s:body 'onload "addEvents();" - (s:h1 "Summary for " (string-intersperse run-dir "/")) - ;; top list - (s:ul 'id "LinkedList1" 'class "LinkedList" - (s:li - "Tests" - (common:htree->html tests-htree - '() - (lambda (x p) - (let* ((targ-path (string-intersperse p "/")) - (test-name (car p)) - (item-path ;; (if (> (length p) 2) ;; test-name + run-name - (string-intersperse p "/")) - (full-targ (conc html-dir "/" targ-path)) - (std-file (conc full-targ "/test-summary.html")) - (alt-file (conc full-targ "/megatest-rollup-" test-name ".html")) - (html-file (if (common:file-exists? alt-file) - alt-file - std-file)) - (run-name (car (reverse p)))) - (if (and (not (common:file-exists? full-targ)) - (directory? full-targ) - (file-write-access? full-targ)) - (tests:summarize-test - run-id - (rmt:get-test-id run-id test-name item-path))) - (if (common:file-exists? full-targ) - (s:a run-name 'href html-file) - (begin - (debug:print 0 *default-log-port* "ERROR: can't access " full-targ) - (conc "No summary for " run-name))))) - )))))) - (close-output-port oup))))) - runs) - #t) - #f))) - - - - -(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname) - (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1)) - (if (and cpuload diskfree) - (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id)) - (if minutes - (rmt:general-call 'update-run-duration run-id minutes test-id)) - (if (and uname hostname) - (rmt:general-call 'update-uname-host run-id uname hostname test-id))) - -;; This one is for running with no db access (i.e. via rmt: internally) -(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries) -;; (define (tests:set-full-meta-info test-id run-id minutes work-area) -;; (let ((remtries 10)) - (let* ((cpuload (get-cpu-load)) - (diskfree (get-df (current-directory))) - (uname (get-uname "-srvpio")) - (hostname (get-host-name))) - (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname))) - -;; -;; -(define (tests:get-compressed-steps run-id test-id) - (let* ((steps-data (rmt:get-steps-for-test run-id test-id)) ;; 0 1 2 3 4 5 6 7 - (comprsteps (tests:process-steps-table steps-data))) ;; # - (map (lambda (x) - ;; take advantage of the \n on time->string - (vector ;; we are constructing basically the original vector but collapsing start end records - (vector-ref x 0) ;; id 0 - (let ((s (vector-ref x 1))) - (if (number? s)(seconds->time-string s) s)) ;; starttime 1 - (let ((s (vector-ref x 2))) - (if (number? s)(seconds->time-string s) s)) ;; endtime 2 - (vector-ref x 3) ;; status 3 - (vector-ref x 4) ;; duration 4 - (vector-ref x 5) ;; logfile 5 - (vector-ref x 6) ;; comment 6 - (vector-ref x 7))) ;; id 7 - (sort (hash-table-values comprsteps) - (lambda (a b) - (let ((time-a (vector-ref a 1)) - (time-b (vector-ref b 1)) - (id-a (vector-ref a 7)) - (id-b (vector-ref b 7))) - (if (and (number? time-a)(number? time-b)) - (if (< time-a time-b) - #t - (if (eq? time-a time-b) - (< id-a id-b) - ;; (stringwork-week/day-time - (db:test-get-event_time test-dat))) - (s:td "Duration") (s:td (seconds->hr-min-sec (db:test-get-run_duration test-dat))))) - (s:h3 "Log files") - (s:table - 'cellspacing "0" 'border "1" - (s:tr (s:td "Final log")(s:td (s:a 'href logf logf)))) - (s:table - 'cellspacing "0" 'border "1" - (s:tr (s:td "Step Name")(s:td "Start")(s:td "End")(s:td "Status")(s:td "Duration")(s:td "Log File")) - (map (lambda (step-dat) - (s:tr (s:td (tdb:steps-table-get-stepname step-dat)) - (s:td (tdb:steps-table-get-start step-dat)) - (s:td (tdb:steps-table-get-end step-dat)) - (s:td (tdb:steps-table-get-status step-dat)) - (s:td (tdb:steps-table-get-runtime step-dat)) - (s:td (let ((step-log (tdb:steps-table-get-log-file step-dat))) - (s:a 'href step-log step-log))))) - steps-dat)) - ))) - (close-output-port oup))))) - - -;; MUST BE CALLED local! -;; -(define (tests:test-get-paths-matching keynames target fnamepatt #!key (res '())) - ;; BUG: Move the values derived from args to parameters and push to megatest.scm - (let* ((testpatt (or (args:get-arg "-testpatt")(args:get-arg "-testpatt") "%")) - (statepatt (or (args:get-arg "-state") (args:get-arg ":state") "%")) - (statuspatt (or (args:get-arg "-status") (args:get-arg ":status") "%")) - (runname (or (args:get-arg "-runname") (args:get-arg ":runname") "%")) - (paths-from-db (rmt:test-get-paths-matching-keynames-target-new keynames target res - testpatt - statepatt - statuspatt - runname))) - (if fnamepatt - (apply append - (map (lambda (p) - (if (directory-exists? p) - (let ((glob-query (conc p "/" fnamepatt))) - (handle-exceptions - exn - (with-input-from-pipe - (conc "echo " glob-query) - read-lines) ;; we aren't going to try too hard. If glob breaks it is likely because someone tried to do */*/*.log or similar - (glob glob-query))) - '())) - paths-from-db)) - paths-from-db))) - - -;; for each test: -;; -(define (tests:filter-non-runnable run-id testkeynames testrecordshash) - (let ((runnables '())) - (for-each - (lambda (testkeyname) - (let* ((test-record (hash-table-ref testrecordshash testkeyname)) - (test-name (tests:testqueue-get-testname test-record)) - (itemdat (tests:testqueue-get-itemdat test-record)) - (item-path (tests:testqueue-get-item_path test-record)) - (waitons (tests:testqueue-get-waitons test-record)) - (keep-test #t) - (test-id (rmt:get-test-id run-id test-name item-path)) - (tdat (rmt:get-testinfo-state-status run-id test-id))) ;; (cdb:get-test-info-by-id *runremote* test-id))) - (if tdat - (begin - ;; Look at the test state and status - (if (or (and (member (db:test-get-status tdat) - '("PASS" "WARN" "WAIVED" "CHECK" "SKIP")) - (equal? (db:test-get-state tdat) "COMPLETED")) - (member (db:test-get-state tdat) - '("INCOMPLETE" "KILLED"))) - (set! keep-test #f)) - - ;; examine waitons for any fails. If it is FAIL or INCOMPLETE then eliminate this test - ;; from the runnable list - (if keep-test - (for-each (lambda (waiton) - ;; for now we are waiting only on the parent test - (let* ((parent-test-id (rmt:get-test-id run-id waiton "")) - (wtdat (rmt:get-testinfo-state-status run-id test-id))) ;; (cdb:get-test-info-by-id *runremote* test-id))) - (if (or (and (equal? (db:test-get-state wtdat) "COMPLETED") - (member (db:test-get-status wtdat) '("FAIL" "ABORT"))) - (member (db:test-get-status wtdat) '("KILLED")) - (member (db:test-get-state wtdat) '("INCOMPETE"))) - ;; (if (or (member (db:test-get-status wtdat) - ;; '("FAIL" "KILLED")) - ;; (member (db:test-get-state wtdat) - ;; '("INCOMPETE"))) - (set! keep-test #f)))) ;; no point in running this one again - waitons)))) - (if keep-test (set! runnables (cons testkeyname runnables))))) - testkeynames) - runnables)) - -;;====================================================================== -;; test steps -;;====================================================================== - -;; teststep-set-status! used to be here - -(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat) - (let* ((testdat (rmt:get-test-info-by-id run-id test-id))) - (and testdat - (equal? (test:get-state testdat) "KILLREQ")))) - -(define (test:tdb-get-rundat-count tdb) - (if tdb - (let ((res 0)) - (sqlite3:for-each-row - (lambda (count) - (set! res count)) - tdb - "SELECT count(id) FROM test_rundat;") - res)) - 0) - -;; (define (tests:set-partial-meta-info test-id run-id minutes work-area) -#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries) - (let* ((cpuload (get-cpu-load)) - (diskfree (get-df (current-directory))) - (remtries 10)) - (handle-exceptions - exn - (if (> remtries 0) - (begin - (print-call-chain (current-error-port)) - (debug:print-info 0 *default-log-port* "WARNING: failed to set meta info. Will try " remtries " more times") - (set! remtries (- remtries 1)) - (thread-sleep! 10) - (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1))) - (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) - (debug:print-error 0 *default-log-port* "tried for over a minute to update meta info and failed. Giving up") - (debug:print 0 *default-log-port* "EXCEPTION: database probably overloaded or unreadable.") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 5 *default-log-port* "exn=" (condition->list exn)) - (debug:print 0 *default-log-port* " status: " ((condition-property-accessor 'sqlite3 'status) exn)) - (print-call-chain (current-error-port)))) - (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes) - ))) - - -;;====================================================================== -;; launch a task - this runs on the originating host, tests themselves -;; -;;====================================================================== - - -;;====================================================================== -;; ezsteps -;;====================================================================== - -;; ezsteps were going to be coded as -;; stepname[,predstep1,predstep2 ...] [{VAR1=first,second,third}] command to execute -;; BUT -;; now are -;; stepname {VAR=first,second,third ...} command ... -;; where the {VAR=first,second,third ...} is optional. - -;; given an exit code and whether or not logpro was used calculate OK/BAD -;; return #t if we are ok, #f otherwise -(define (steprun-good? logpro exitcode) - (or (eq? exitcode 0) - (and logpro (eq? exitcode 2)))) - -;; if handed a string, process it, else look for MT_CMDINFO -(define (launch:get-cmdinfo-assoc-list #!key (encoded-cmd #f)) - (let ((enccmd (if encoded-cmd encoded-cmd (getenv "MT_CMDINFO")))) - (if enccmd - (common:read-encoded-string enccmd) - '()))) - -;; return (conc status ": " comment) from the final section so that -;; the comment can be set in the step record in launch.scm -;; -(define (launch:load-logpro-dat run-id test-id stepname) - (let ((cname (conc stepname ".dat"))) - (if (common:file-exists? cname) - (let* ((dat (configf:read-config cname #f #f)) - (csvr (db:logpro-dat->csv dat stepname)) - (csvt (let-values (((fmt-cell fmt-record fmt-csv) (make-format ","))) - (fmt-csv (map list->csv-record csvr)))) - (status (configf:lookup dat "final" "exit-status")) - (msg (configf:lookup dat "final" "message"))) - (if csvt ;; this if blocked stack dump caused by .dat file from logpro being 0-byte. fixed by upgrading logpro - (rmt:csv->test-data run-id test-id csvt) - (debug:print 0 *default-log-port* "ERROR: no csvdat exists for run-id: " run-id " test-id: " test-id " stepname: " stepname ", check that logpro version is 1.15 or newer")) - ;; (debug:print-info 13 *default-log-port* "Error: run-id/test-id/stepname="run-id"/"test-id"/"stepname" => bad csvr="csvr) - ;; ) - (cond - ((equal? status "PASS") "PASS") ;; skip the message part if status is pass - (status (conc (configf:lookup dat "final" "exit-status") ": " (if msg msg "no message"))) - (else #f))) - #f))) - -(define (launch:runstep ezstep run-id test-id exit-info m tal testconfig) ;;; TODO: deprecate me in favor of ezsteps.scm - (let* ((stepname (car ezstep)) ;; do stuff to run the step - (stepinfo (cadr ezstep)) - ;; (let ((info (cadr ezstep))) - ;; (if (proc? info) "" info))) - ;; (stepproc (let ((info (cadr ezstep))) - ;; (if (proc? info) info #f))) - (stepparts (string-match (regexp "^(\\{([^\\}\\{]*)\\}\\s*|)(.*)$") stepinfo)) - (stepparams (list-ref stepparts 2)) ;; for future use, {VAR=1,2,3}, run step for each - (paramparts (if (string? stepparams) - (map (lambda (x)(string-split x "=")) (string-split-fields "[^;]*=[^;]*" stepparams)) - '())) - (subrun (alist-ref "subrun" paramparts equal?)) - (stepcmd (list-ref stepparts 3)) - (script "") ; "#!/bin/bash\n") ;; yep, we depend on bin/bash FIXME!!!\ - (logpro-file (conc stepname ".logpro")) - (html-file (conc stepname ".html")) - (dat-file (conc stepname ".dat")) - (tconfig-logpro (configf:lookup testconfig "logpro" stepname)) - (logpro-used (common:file-exists? logpro-file))) - - (debug:print 0 *default-log-port* "stepparts: " stepparts ", stepparams: " stepparams - ", paramparts: " paramparts ", subrun: " subrun ", stepcmd: " stepcmd) - - (if (and tconfig-logpro - (not logpro-used)) ;; no logpro file found but have a defn in the testconfig - (begin - (with-output-to-file logpro-file - (lambda () - (print ";; logpro file extracted from testconfig\n" - ";;") - (print tconfig-logpro))) - (set! logpro-used #t))) - - ;; NB// can safely assume we are in test-area directory - (debug:print 4 *default-log-port* "ezsteps:\n stepname: " stepname " stepinfo: " stepinfo " stepparts: " stepparts - " stepparams: " stepparams " stepcmd: " stepcmd) - - ;; ;; first source the previous environment - ;; (let ((prev-env (conc ".ezsteps/" prevstep (if (string-search (regexp "csh") - ;; (get-environment-variable "SHELL")) ".csh" ".sh")))) - ;; (if (and prevstep (common:file-exists? prev-env)) - ;; (set! script (conc script "source " prev-env)))) - - ;; call the command using mt_ezstep - ;; (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "x") " " stepcmd)) - - (debug:print 4 *default-log-port* "script: " script) - (rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f) - ;; now launch the actual process - (call-with-environment-variables - (list (cons "PATH" (conc (get-environment-variable "PATH") ":."))) - (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1") - (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 - (pid #f)) - (let ((proc (lambda () - (set! pid (process-run "/bin/bash" (list "-c" cmd)))))) - (if subrun - (begin - (debug:print-info 0 *default-log-port* "Running without MT_.* environment variables.") - (common:without-vars proc "^MT_.*")) - (proc))) - - (with-output-to-file "Makefile.ezsteps" - (lambda () - (print stepname ".log :") - (print "\t" cmd) - (if (common:file-exists? (conc stepname ".logpro")) - (print "\tlogpro " stepname ".logpro " stepname ".html < " stepname ".log")) - (print) - (print stepname " : " stepname ".log") - (print)) - #:append) - - (rmt:test-set-top-process-pid run-id test-id pid) - (let processloop ((i 0)) - (let-values (((pid-val exit-status exit-code)(process-wait pid #t))) - (mutex-lock! m) - (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) - (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) - (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code) - (mutex-unlock! m) - (if (eq? pid-val 0) - (begin - (thread-sleep! 2) - (processloop (+ i 1)))) - ))))) - (debug:print-info 0 *default-log-port* "step " stepname " completed with exit code " (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2)) - ;; now run logpro if needed - (if logpro-used - (let* ((logpro-exe (or (getenv "LOGPRO_EXE") "logpro")) - (pid (process-run (conc "/bin/sh -c '"logpro-exe" "logpro-file " " (conc stepname ".html") " < " stepname ".log > /dev/null'")))) - (let processloop ((i 0)) - (let-values (((pid-val exit-status exit-code)(process-wait pid #t))) - (mutex-lock! m) - ;; (make-launch:einf pid: pid exit-status: exit-status exit-code: exit-code) - (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) - (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) - (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code) - (mutex-unlock! m) - (if (eq? pid-val 0) - (begin - (thread-sleep! 2) - (processloop (+ i 1))))) - (debug:print-info 0 *default-log-port* "logpro for step " stepname " exited with code " (launch:einf-exit-code exit-info))))) ;; (vector-ref exit-info 2))))) - - (let ((exinfo (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2)) - (logfna (if logpro-used (conc stepname ".html") "")) - (comment #f)) - (if logpro-used - (let ((datfile (conc stepname ".dat"))) - ;; load the .dat file into the test_data table if it exists - (if (common:file-exists? datfile) - (set! comment (launch:load-logpro-dat run-id test-id stepname))) - (rmt:test-set-log! run-id test-id (conc stepname ".html")))) - (rmt:teststep-set-status! run-id test-id stepname "end" exinfo comment logfna)) - ;; set the test final status - (let* ((process-exit-status (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2)) - (this-step-status (cond - ((and (eq? process-exit-status 2) logpro-used) 'warn) ;; logpro 2 = warnings - ((and (eq? process-exit-status 3) logpro-used) 'check) ;; logpro 3 = check - ((and (eq? process-exit-status 4) logpro-used) 'waived) ;; logpro 4 = waived - ((and (eq? process-exit-status 5) logpro-used) 'abort) ;; logpro 5 = abort - ((and (eq? process-exit-status 6) logpro-used) 'skip) ;; logpro 6 = skip - ((eq? process-exit-status 0) 'pass) ;; logpro 0 = pass - (else 'fail))) - (overall-status (cond - ((eq? (launch:einf-rollup-status exit-info) 2) 'warn) ;; rollup-status (vector-ref exit-info 3) - ((eq? (launch:einf-rollup-status exit-info) 0) 'pass) ;; (vector-ref exit-info 3) - (else 'fail))) - (next-status (cond - ((eq? overall-status 'pass) this-step-status) - ((eq? overall-status 'warn) - (if (eq? this-step-status 'fail) 'fail 'warn)) - ((eq? overall-status 'abort) 'abort) - (else 'fail))) - (next-state ;; "RUNNING") ;; WHY WAS THIS CHANGED TO NOT USE (null? tal) ?? - (cond - ((null? tal) ;; more to run? - "COMPLETED") - (else "RUNNING")))) - (debug:print 4 *default-log-port* "Exit value received: " (launch:einf-exit-code exit-info) " logpro-used: " logpro-used - " this-step-status: " this-step-status " overall-status: " overall-status - " next-status: " next-status " rollup-status: " (launch:einf-rollup-status exit-info)) ;; (vector-ref exit-info 3)) - (case next-status - ((warn) - (launch:einf-rollup-status-set! exit-info 2) ;; (vector-set! exit-info 3 2) ;; rollup-status - ;; NB// test-set-status! does rdb calls under the hood - (tests:test-set-status! run-id test-id next-state "WARN" - (if (eq? this-step-status 'warn) "Logpro warning found" #f) - #f)) - ((check) - (launch:einf-rollup-status-set! exit-info 3) ;; (vector-set! exit-info 3 3) ;; rollup-status - ;; NB// test-set-status! does rdb calls under the hood - (tests:test-set-status! run-id test-id next-state "CHECK" - (if (eq? this-step-status 'check) "Logpro check found" #f) - #f)) - ((waived) - (launch:einf-rollup-status-set! exit-info 4) ;; (vector-set! exit-info 3 3) ;; rollup-status - ;; NB// test-set-status! does rdb calls under the hood - (tests:test-set-status! run-id test-id next-state "WAIVED" - (if (eq? this-step-status 'check) "Logpro waived found" #f) - #f)) - ((abort) - (launch:einf-rollup-status-set! exit-info 5) ;; (vector-set! exit-info 3 4) ;; rollup-status - ;; NB// test-set-status! does rdb calls under the hood - (tests:test-set-status! run-id test-id next-state "ABORT" - (if (eq? this-step-status 'abort) "Logpro abort found" #f) - #f)) - ((skip) - (launch:einf-rollup-status-set! exit-info 6) ;; (vector-set! exit-info 3 4) ;; rollup-status - ;; NB// test-set-status! does rdb calls under the hood - (tests:test-set-status! run-id test-id next-state "SKIP" - (if (eq? this-step-status 'skip) "Logpro skip found" #f) - #f)) - ((pass) - (tests:test-set-status! run-id test-id next-state "PASS" #f #f)) - (else ;; 'fail - (launch:einf-rollup-status-set! exit-info 1) ;; (vector-set! exit-info 3 1) ;; force fail, this used to be next-state but that doesn't make sense. should always be "COMPLETED" - (tests:test-set-status! run-id test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f) - ))) - logpro-used)) - -(define (launch:manage-steps run-id test-id item-path fullrunscript ezsteps subrun test-name tconfigreg exit-info m) - ;; (let-values - ;; (((pid exit-status exit-code) - ;; (run-n-wait fullrunscript))) - ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) - ;; Since we should have a clean slate at this time there is no need to do - ;; any of the other stuff that tests:test-set-status! does. Let's just - ;; force RUNNING/n/a - - ;; (thread-sleep! 0.3) - ;; (tests:test-force-state-status! run-id test-id "RUNNING" "n/a") - (rmt:set-state-status-and-roll-up-items run-id test-name item-path "RUNNING" #f #f) - ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here - - ;; if there is a runscript do it first - (if fullrunscript - (let ((pid (process-run fullrunscript))) - (rmt:test-set-top-process-pid run-id test-id pid) - (let loop ((i 0)) - (let-values - (((pid-val exit-status exit-code) (process-wait pid #t))) - (mutex-lock! m) - (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) - (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) - (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code) - (launch:einf-rollup-status-set! exit-info exit-code) ;; (vector-set! exit-info 3 exit-code) ;; rollup status - (mutex-unlock! m) - (if (eq? pid-val 0) - (begin - (thread-sleep! 2) - (loop (+ i 1))) - ))))) - ;; then, if runscript ran ok (or did not get called) - ;; do all the ezsteps (if any) - (if (or ezsteps subrun) - (let* ((test-run-dir (tests:get-test-path-from-environment)) - (testconfig ;; (read-config (conc work-area "/testconfig") #f #t environ-patt: "pre-launch-env-vars")) ;; FIXME??? is allow-system ok here? - ;; NOTE: it is tempting to turn off force-create of testconfig but dynamic - ;; ezstep names need a full re-eval here. - (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) - (ezstepslst (if (hash-table? testconfig) - (hash-table-ref/default testconfig "ezsteps" '()) - #f))) - (if testconfig - (hash-table-set! *testconfigs* test-name testconfig) ;; cached for lazy reads later ... - (begin - (launch:setup) - (debug:print 0 *default-log-port* "WARNING: no testconfig found for " test-name " in search path:\n " - (string-intersperse (tests:get-tests-search-path *configdat*) "\n ")))) - ;; after all that, still no testconfig? Time to abort - (if (not testconfig) - (begin - (debug:print-error 0 *default-log-port* "Failed to resolve megatest.config, runconfigs.config and testconfig issues. Giving up now") - (exit 1))) - - ;; create a proc for the subrun if requested, save that proc in the ezsteps table as the last entry - ;; 1. get section [runarun] - ;; 2. unset MT_* vars - ;; 3. fix target - ;; 4. fix runname - ;; 5. fix testpatt or calculate it from contour - ;; 6. launch the run - ;; 7. roll up the run result and or roll up the logpro processed result - (when (configf:lookup testconfig "subrun" "runwait") ;; we use runwait as the flag that a subrun is requested - (subrun:initialize-toprun-test testconfig test-run-dir) - (let* ((mt-cmd (subrun:launch-cmd test-run-dir))) - (debug:print-info 0 *default-log-port* "Subrun command is \"" mt-cmd "\"") - (set! ezsteps #t) ;; set the needed flag - (set! ezstepslst - (append (or ezstepslst '()) - (list (list "subrun" (conc "{subrun=true} " mt-cmd))))))) - - ;; process the ezsteps - (if ezsteps - (begin - (if (not (common:file-exists? ".ezsteps"))(create-directory ".ezsteps")) - ;; if ezsteps was defined then we are sure to have at least one step but check anyway - (if (not (> (length ezstepslst) 0)) - (debug:print-error 0 *default-log-port* "ezsteps defined but ezstepslst is zero length") - (let loop ((ezstep (car ezstepslst)) - (tal (cdr ezstepslst)) - (prevstep #f)) - (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"") - ;; check exit-info (vector-ref exit-info 1) - (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) - (let ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig)) - (stepname (car ezstep))) - ;; if logpro-used read in the stepname.dat file - (if (and logpro-used (common:file-exists? (conc stepname ".dat"))) - (launch:load-logpro-dat run-id test-id stepname)) - (if (steprun-good? logpro-used (launch:einf-exit-code exit-info)) - (if (not (null? tal)) - (loop (car tal) (cdr tal) stepname)) - (debug:print 0 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping"))) - (debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep))))))))) - -(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) - (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30"))) - (start-seconds (current-seconds)) - (calc-minutes (lambda () - (inexact->exact - (round - (- - (current-seconds) - start-seconds))))) - (kill-tries 0)) - ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) - ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) - (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) - - (let loop ((minutes (calc-minutes)) - (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) - (disk-free (get-df (current-directory))) - (last-sync (current-seconds))) - #;(common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) - (let* ((over-time (> (current-seconds) (+ last-sync update-period))) - (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) - (delta (abs (- load cpu-load)))) - (if (> delta 0.1) ;; don't bother updating with small changes - load - #f))) - (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds - (get-df (current-directory)) - disk-free)) - (delta (abs (- df disk-free)))) - (if (and (> df 0) - (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg - df - #f))) - (do-sync (or new-cpu-load new-disk-free over-time)) - - (test-info (rmt:get-test-info-by-id run-id test-id)) - (state (db:test-get-state test-info)) - (status (db:test-get-status test-info)) - (kill-reason "no kill reason specified") - (kill-job? #f)) - #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) - (cond - ((test-get-kill-request run-id test-id) - (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") - (set! kill-job? #t)) - ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) - (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) - (set! kill-job? #t)) - ((equal? status "DEAD") - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) - (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") - ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING - (set! kill-job? #f))) - - (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) - (launch:handle-zombie-tests run-id) - (when do-sync - ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) - ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) - #;(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) - #;(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))) - - (if kill-job? - (begin - (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) - (mutex-lock! m) - ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this - ;; section and the runit section? Or add a loop that tries three times with a 1/4 second - ;; between tries? - (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0)) - (pid2 (rmt:test-get-top-process-pid run-id test-id)) - (pids (delete-duplicates (filter number? (list pid1 pid2))))) - (if (not (null? pids)) - (begin - (for-each - (lambda (pid) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to kill process with pid " pid ", possibly already killed.") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) - (debug:print 0 *default-log-port* "WARNING: Request received to kill job " pid) ;; " (attempt # " kill-tries ")") - (debug:print-info 0 *default-log-port* "Signal mask=" (signal-mask)) - ;; (if (process:alive? pid) - ;; (begin - (map (lambda (pid-num) - (process-signal pid-num signal/term)) - (process:get-sub-pids pid)) - (thread-sleep! 5) - ;; (if (process:process-alive? pid) - (map (lambda (pid-num) - (handle-exceptions - exn - #f - (process-signal pid-num signal/kill))) - (process:get-sub-pids pid)))) - ;; (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive")))) - pids) - ;; BB: question to Matt -- does the tests:test-state-status! encompass rollup to toplevel? If not, should it? - (tests:test-set-status! run-id test-id "KILLED" "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt - (begin - (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2) - (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt - ))) - (mutex-unlock! m) - ;; no point in sticking around. Exit now. But run end of run before exiting? - (launch:end-of-run-check run-id) - (exit))) - (if (hash-table-ref/default misc-flags 'keep-going #f) - (begin - (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses - (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta - (loop (calc-minutes) - (or new-cpu-load cpu-load) - (or new-disk-free disk-free) - (if do-sync (current-seconds) last-sync))))))) - (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional - -;; set up needed environment variables given a run-id and optionally a target, itempath etc. -;; -(define (runs:set-megatest-env-vars run-id #!key (inkeys #f)(inrunname #f)(inkeyvals #f)(intarget #f)(testname #f)(itempath #f)) - ;;(bb-check-path msg: "runs:set-megatest-env-vars entry") - (let* ((target (or intarget - (common:args-get-target) - (get-environment-variable "MT_TARGET"))) - (keys (if inkeys inkeys (rmt:get-keys))) - (keyvals (if inkeyvals inkeyvals (keys:target->keyval keys target))) - (vals (hash-table-ref/default *env-vars-by-run-id* run-id #f)) - (link-tree (common:get-linktree))) ;; (configf:lookup *configdat* "setup" "linktree"))) - (if testname (setenv "MT_TEST_NAME" testname)) - (if itempath (setenv "MT_ITEMPATH" itempath)) - - ;; get the info from the db and put it in the cache - (if link-tree - (setenv "MT_LINKTREE" link-tree) - (debug:print-error 0 *default-log-port* "linktree not set, should be set in megatest.config in [setup] section.")) - (if (not vals) - (let ((ht (make-hash-table))) - (hash-table-set! *env-vars-by-run-id* run-id ht) - (set! vals ht) - (for-each - (lambda (key) - (hash-table-set! vals (car key) (cadr key))) - keyvals))) - ;; from the cached data set the vars - - (hash-table-for-each - vals - (lambda (key val) - (debug:print 2 *default-log-port* "setenv " key " " val) - (safe-setenv key val))) - ;;(bb-check-path msg: "runs:set-megatest-env-vars block 1") - ;;(BB> "*env-vars-by-run-id*/runid("run-id" vals="(hash-table->alist vals)) - - (if (not (get-environment-variable "MT_TARGET"))(setenv "MT_TARGET" target)) - ;; we had a case where there was an exception generated by the hash-table-ref - ;; due to *configdat* being #f Adding a handle and exit - (let fatal-loop ((count 0)) - (handle-exceptions - exn - (let ((call-chain (get-call-chain)) - (msg ((condition-property-accessor 'exn 'message) exn))) - (if (< count 5) - (begin ;; this call is colliding, do some crude stuff to fix it. - (debug:print 0 *default-log-port* "ERROR: *configdat* was inaccessible! This should never happen. Retry #" count) - (launch:setup force-reread: #t) - (fatal-loop (+ count 1))) - (begin - (debug:print 0 *default-log-port* "FATAL: *configdat* was inaccessible! This should never happen. Retried " count " times. Message: " msg) - (debug:print 0 *default-log-port* "Call chain:") - (with-output-to-port *default-log-port* - - (lambda () - (print "*configdat* is >>"*configdat*"<<") - (pp *configdat*) - (pp call-chain))) - - (exit 1)))) - ;;(bb-check-path msg: "runs:set-megatest-env-vars block 1.5") - (when (or (not *configdat*) (not (hash-table? *configdat*))) - (debug:print 0 *default-log-port* "WARNING: *configdat* was inaccessible! This should never happen. Brute force reread.") - ;;(BB> "ERROR: *configdat* was inaccessible! This should never happen. Brute force reread.") - (thread-sleep! 2) ;; assuming nfs lag. - (launch:setup force-reread: #t)) - (alist->env-vars (hash-table-ref/default *configdat* "env-override" '())))) ;;;; environment is tainted HERE in this let block. - ;;(bb-check-path msg: "runs:set-megatest-env-vars block 2") - ;; Lets use this as an opportunity to put MT_RUNNAME in the environment - (let ((runname (if inrunname inrunname (rmt:get-run-name-from-id run-id)))) - (if runname - (setenv "MT_RUNNAME" runname) - (debug:print-error 0 *default-log-port* "no value for runname for id " run-id))) - (setenv "MT_RUN_AREA_HOME" *toppath*) - ;; if a testname and itempath are available set the remaining appropriate variables - (if testname (setenv "MT_TEST_NAME" testname)) - (if itempath (setenv "MT_ITEMPATH" itempath)) - ;;(bb-check-path msg: "runs:set-megatest-env-vars block 3") - (if (and testname link-tree) - (setenv "MT_TEST_RUN_DIR" (conc (getenv "MT_LINKTREE") "/" - (getenv "MT_TARGET") "/" - (getenv "MT_RUNNAME") "/" - (getenv "MT_TEST_NAME") - (if (and itempath - (not (equal? itempath ""))) - (conc "/" itempath) - "")))))) - -(define (launch:execute encoded-cmd) - (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) - (tconfigreg #f)) - (setenv "MT_CMDINFO" encoded-cmd) - ;;(bb-check-path msg: "launch:execute incoming") - (if (list? cmdinfo) ;; ((testpath /tmp/mrwellan/jazzmind/src/example_run/tests/sqlitespeed) - ;; (test-name sqlitespeed) (runscript runscript.rb) (db-host localhost) (run-id 1)) - (let* ((testpath (assoc/default 'testpath cmdinfo)) ;; testpath is the test spec area - (top-path (assoc/default 'toppath cmdinfo)) - (work-area (assoc/default 'work-area cmdinfo)) ;; work-area is the test run area - (test-name (assoc/default 'test-name cmdinfo)) - (runscript (assoc/default 'runscript cmdinfo)) - (ezsteps (assoc/default 'ezsteps cmdinfo)) - (subrun (assoc/default 'subrun cmdinfo)) - ;; (runremote (assoc/default 'runremote cmdinfo)) - ;; (transport (assoc/default 'transport cmdinfo)) ;; not used - ;; (serverinf (assoc/default 'serverinf cmdinfo)) - ;; (port (assoc/default 'port cmdinfo)) - (serverurl (assoc/default 'serverurl cmdinfo)) - (homehost (assoc/default 'homehost cmdinfo)) - (run-id (assoc/default 'run-id cmdinfo)) - (test-id (assoc/default 'test-id cmdinfo)) - (target (assoc/default 'target cmdinfo)) - (areaname (assoc/default 'areaname cmdinfo)) - (itemdat (assoc/default 'itemdat cmdinfo)) - (env-ovrd (assoc/default 'env-ovrd cmdinfo)) - (set-vars (assoc/default 'set-vars cmdinfo)) ;; pre-overrides from -setvar - (runname (assoc/default 'runname cmdinfo)) - (megatest (assoc/default 'megatest cmdinfo)) - (runtlim (assoc/default 'runtlim cmdinfo)) - (contour (assoc/default 'contour cmdinfo)) - (item-path (item-list->path itemdat)) - (mt-bindir-path (assoc/default 'mt-bindir-path cmdinfo)) - (keys #f) - (keyvals #f) - (fullrunscript (if (not runscript) - #f - (if (substring-index "/" runscript) - runscript ;; use unadultered if contains slashes - (let ((fulln (conc work-area "/" runscript))) - (if (and (common:file-exists? fulln) - (file-execute-access? fulln)) - fulln - runscript))))) ;; assume it is on the path - (check-work-area (lambda () - ;; NFS might not have propagated the directory meta data to the run host - give it time if needed - (let loop ((count 0)) - (if (or (common:directory-exists? work-area) - (> count 10)) - (change-directory work-area) - (begin - (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " work-area " not found") - (thread-sleep! 10) - (loop (+ count 1))))) - - (if (not (string=? (common:real-path work-area)(common:real-path (current-directory)))) - (begin - (debug:print 0 *default-log-port* - "INFO: we are expecting to be in directory " work-area "\n" - " but we are actually in the directory " (current-directory) "\n" - " doing another change dir.") - (change-directory work-area))) - - ;; spot check that the files in testpath are available. Too often NFS delays cause problems here. - (let ((files (glob (conc testpath "/*"))) - (bad-files '())) - (for-each - (lambda (fullname) - (let* ((fname (pathname-strip-directory fullname)) - (targn (conc work-area "/" fname))) - (if (not (file-exists? targn)) - (set! bad-files (cons fname bad-files))))) - files) - (if (not (null? bad-files)) - (begin - (debug:print 0 *default-log-port* "INFO: test data from " testpath " not copied properly or filesystem problems causing data to not be found. Re-running the copy command.") - (debug:print 0 *default-log-port* "INFO: missing files from " work-area ": " (string-intersperse bad-files ", ")) - (launch:test-copy testpath work-area)))) - ;; one more time, change to the work-area directory - (change-directory work-area))) - ) ;; let* - - (if contour (setenv "MT_CONTOUR" contour)) - - ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ... - ;; - (setenv "MT_TESTSUITENAME" areaname) - (setenv "MT_RUN_AREA_HOME" top-path) - (set! *toppath* top-path) - (change-directory *toppath*) ;; temporarily switch to the run area home - (setenv "MT_TEST_RUN_DIR" work-area) - - (launch:setup) ;; should be properly in the run area home now - - (if contour (setenv "MT_CONTOUR" contour)) - - ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ... - ;; - (setenv "MT_TESTSUITENAME" areaname) - (setenv "MT_RUN_AREA_HOME" top-path) - (set! *toppath* top-path) - (change-directory *toppath*) ;; temporarily switch to the run area home - (setenv "MT_TEST_RUN_DIR" work-area) - - (launch:setup) ;; should be properly in the run area home now - - (set! tconfigreg (tests:get-all)) ;; mapping of testname => test source path - (let ((sighand (lambda (signum) - ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting - (if (eq? signum signal/stop) - (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting.")) - (set! *time-to-exit* #t) - (print "Received signal " signum ", cleaning up before exit (set this test to COMPLETED/ABORT) . Please wait...") - (let ((th1 (make-thread (lambda () - (print "set test to COMPLETED/ABORT begin.") - (rmt:test-set-state-status run-id test-id "COMPLETED" "ABORT" "received kill signal") - (print "set test to COMPLETED/ABORT complete.") - (print "Killed by signal " signum ". Exiting") - (exit 1)))) - (th2 (make-thread (lambda () - (thread-sleep! 20) - (debug:print 0 *default-log-port* "Done") - (exit 4))))) - (thread-start! th2) - (thread-start! th1) - (thread-join! th2))))) - (set-signal-handler! signal/int sighand) - (set-signal-handler! signal/term sighand) - ) ;; (set-signal-handler! signal/stop sighand) - - ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, - ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* - ;; - (let* ((test-info (rmt:get-test-info-by-id run-id test-id)) - (test-host (if test-info - (db:test-get-host test-info) - (begin - (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") - (exit)))) - (test-pid (db:test-get-process_id test-info))) - (cond - ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. - ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun - (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") - ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") - - (rmt:general-call 'set-test-start-time #f test-id) - (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) - ) ;; prime it for running - ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART")) - (if (process:alive-on-host? test-host test-pid) - (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed") - (exit))) - ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) - ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") - (rmt:general-call 'set-test-start-time #f test-id) - (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) - ) - (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) - (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") - (exit)))) - - ;; cleanup prior execution's steps - (rmt:delete-steps-for-test! run-id test-id) - - (debug:print 2 *default-log-port* "Executing " test-name " (id: " test-id ") on " (get-host-name)) - (set! keys (rmt:get-keys)) - ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process - ;; one of these is defunct/redundant ... - (if (not (launch:setup force-reread: #t)) - (begin - (debug:print 0 *default-log-port* "Failed to setup, exiting") - ;; (sqlite3:finalize! db) - ;; (sqlite3:finalize! tdb) - (exit 1))) - ;; validate that the test run area is available - (check-work-area) - - ;; still need to go back to run area home for next couple steps - (change-directory *toppath*) - - ;; NOTE: Current order is to process runconfigs *before* setting the MT_ vars. This - ;; seems non-ideal but could well break stuff - ;; BUG? BUG? BUG? - - (let ((rconfig (full-runconfigs-read)) ;; (read-config (conc *toppath* "/runconfigs.config") #f #t sections: (list "default" target)))) - (wconfig (configf:read-config "waivers.config" #f #t sections: `( "default" ,target )))) ;; read the waivers config if it exists - ;; (setup-env-defaults (conc *toppath* "/runconfigs.config") run-id (make-hash-table) keyvals target) - ;; (set-run-config-vars run-id keyvals target) ;; (db:get-target db run-id)) - ;; Now have runconfigs data loaded, set environment vars - (for-each - (lambda (section) - (for-each - (lambda (varval) - (let ((var (car varval)) - (val (cadr varval))) - (if (and (string? var)(string? val)) - (begin - (safe-setenv var (configf:eval-string-in-environment val))) ;; val) - (debug:print-error 0 *default-log-port* "bad variable spec, " var "=" val)))) - (configf:get-section rconfig section))) - (list "default" target))) - ;;(bb-check-path msg: "launch:execute post block 1") - - ;; NFS might not have propagated the directory meta data to the run host - give it time if needed - (let loop ((count 0)) - (if (or (common:file-exists? work-area) - (> count 10)) - (change-directory work-area) - (begin - (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " work-area " not found") - (thread-sleep! 10) - (loop (+ count 1))))) - - ;; now we can switch to the work-area? - (change-directory work-area) - ;;(bb-check-path msg: "launch:execute post block 1.5") - ;; (change-directory work-area) - (set! keyvals (keys:target->keyval keys target)) - ;; apply pre-overrides before other variables. The pre-override vars must not - ;; clobbers things from the official sources such as megatest.config and runconfigs.config - (if (string? set-vars) - (let ((varpairs (string-split set-vars ","))) - (debug:print 4 *default-log-port* "varpairs: " varpairs) - (map (lambda (varpair) - (let ((varval (string-split varpair "="))) - (if (eq? (length varval) 2) - (let ((var (car varval)) - (val (cadr varval))) - (debug:print 1 *default-log-port* "Adding pre-var/val " var " = " val " to the environment") - (setenv var val))))) - varpairs))) - ;;(bb-check-path msg: "launch:execute post block 2") - (for-each - (lambda (varval) - (let ((var (car varval)) - (val (cadr varval))) - (if val - (setenv var val) - (begin - (debug:print-error 0 *default-log-port* "required variable " var " does not have a valid value. Exiting") - (exit))))) - (list - (list "MT_TEST_RUN_DIR" work-area) - (list "MT_TEST_NAME" test-name) - (list "MT_ITEM_INFO" (conc itemdat)) - (list "MT_ITEMPATH" item-path) - (list "MT_RUNNAME" runname) - (list "MT_MEGATEST" megatest) - (list "MT_TARGET" target) - (list "MT_LINKTREE" (common:get-linktree)) ;; (configf:lookup *configdat* "setup" "linktree")) - (list "MT_TESTSUITENAME" (common:get-testsuite-name)))) - ;;(bb-check-path msg: "launch:execute post block 3") - - (if mt-bindir-path (setenv "PATH" (conc (getenv "PATH") ":" mt-bindir-path))) - ;;(bb-check-path msg: "launch:execute post block 4") - ;; (change-directory top-path) - ;; Can setup as client for server mode now - ;; (client:setup) - - - ;; environment overrides are done *before* the remaining critical envars. - (alist->env-vars env-ovrd) - ;;(bb-check-path msg: "launch:execute post block 41") - (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) - ;;(bb-check-path msg: "launch:execute post block 42") - (set-item-env-vars itemdat) - ;;(bb-check-path msg: "launch:execute post block 43") - (let ((blacklist (configf:lookup *configdat* "setup" "blacklistvars"))) - (if blacklist - (let ((vars (string-split blacklist))) - (save-environment-as-files "megatest" ignorevars: vars) - (for-each (lambda (var) - (unsetenv var)) - vars)) - (save-environment-as-files "megatest"))) - ;;(bb-check-path msg: "launch:execute post block 44") - ;; open-run-close not needed for test-set-meta-info - ;; (tests:set-full-meta-info #f test-id run-id 0 work-area) - ;; (tests:set-full-meta-info test-id run-id 0 work-area) - (tests:set-full-meta-info #f test-id run-id 0 work-area 10) - - ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here - - (if (args:get-arg "-xterm") - (set! fullrunscript "xterm") - (if (and fullrunscript - (common:file-exists? fullrunscript) - (not (file-execute-access? fullrunscript))) - (system (conc "chmod ug+x " fullrunscript)))) - - ;; We are about to actually kick off the test - ;; so this is a good place to remove the records for - ;; any previous runs - ;; (db:test-remove-steps db run-id testname itemdat) - ;; now is also a good time to write the .testconfig file - (let* ((tconfig-fname (conc work-area "/.testconfig")) - (tconfig-tmpfile (conc tconfig-fname ".tmp")) - (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t))) ;; 'return-procs))) - (configf:write-alist tconfig tconfig-tmpfile) - (file-move tconfig-tmpfile tconfig-fname #t)) - ;; - (let* ((m (make-mutex)) - (kill-job? #f) - (exit-info (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status - (job-thread #f) - ;; (keep-going #t) - (misc-flags (let ((ht (make-hash-table))) - (hash-table-set! ht 'keep-going #t) - ht)) - (runit (lambda () - (launch:manage-steps run-id test-id item-path fullrunscript ezsteps subrun test-name tconfigreg exit-info m))) - (monitorjob (lambda () - (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags))) - (th1 (make-thread monitorjob "monitor job")) - (th2 (make-thread runit "run job"))) - (set! job-thread th2) - (thread-start! th1) - (thread-start! th2) - (thread-join! th2) - (debug:print-info 0 *default-log-port* "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...") - (hash-table-set! misc-flags 'keep-going #f) - (thread-join! th1) - (thread-sleep! 1) ;; givbe thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. - (mutex-lock! m) - (let* ((item-path (item-list->path itemdat)) - ;; only state and status needed - use lazy routine - (testinfo (rmt:get-testinfo-state-status run-id test-id))) - ;; Am I completed? - (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should *not* be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED")) - (let ((new-state (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status - ;; "COMPLETED" ;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test - ) - (new-status (cond - ((not (launch:einf-exit-status exit-info)) "FAIL") ;; job failed to run ... (vector-ref exit-info 1) - ((eq? (launch:einf-rollup-status exit-info) 0) ;; (vector-ref exit-info 3) - ;; if the current status is AUTO then defer to the calculated value (i.e. leave this AUTO) - (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO" "PASS")) - ((eq? (launch:einf-rollup-status exit-info) 1) "FAIL") ;; (vector-ref exit-info 3) - ((eq? (launch:einf-rollup-status exit-info) 2) ;; (vector-ref exit-info 3) - ;; if the current status is AUTO the defer to the calculated value but qualify (i.e. make this AUTO-WARN) - (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN")) - ((eq? (launch:einf-rollup-status exit-info) 3) "CHECK") - ((eq? (launch:einf-rollup-status exit-info) 4) "WAIVED") - ((eq? (launch:einf-rollup-status exit-info) 5) "ABORT") - ((eq? (launch:einf-rollup-status exit-info) 6) "SKIP") - (else "FAIL")))) ;; (db:test-get-status testinfo))) - (debug:print-info 1 *default-log-port* "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info)) - (tests:test-set-status! run-id - test-id - new-state - new-status - (args:get-arg "-m") #f) - ;; need to update the top test record if PASS or FAIL and this is a subtest - ;; NO NEED TO CALL set-state-status-and-roll-up-items HERE, THIS IS DONE IN set-state-status-and-roll-up-items called by tests:test-set-status! - )) - ;; for automated creation of the rollup html file this is a good place... - (if (not (equal? item-path "")) - (tests:summarize-items run-id test-id test-name #f)) - (tests:summarize-test run-id test-id) ;; don't force - just update if no - (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) - (mutex-unlock! m) - (launch:end-of-run-check run-id ) - (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " - work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n") - (if (not (launch:einf-exit-status exit-info)) - (exit 4)))) - ))) - -(define (launch:is-test-alive host pid) -(if (and host pid (not (equal? host "n/a"))) -(let* ((cmd (conc "ssh " host " pstree -A " pid)) - (output (with-input-from-pipe cmd read-lines))) - (print "cmd: " cmd "\n op: " output ) - (if(eq? (length output) 0) - #f - #t)) -#t)) - -(define (launch:kill-tests-if-dead run-id) - (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f))) - (let loop ((running-test (car running-tests)) - (tal (cdr running-tests)) - (kill-cnt 0)) - (let* ((test-name (vector-ref running-test 2)) - (item-path (vector-ref running-test 11)) - (test-id (vector-ref running-test 0)) - (host (vector-ref running-test 6)) - (pid (rmt:test-get-top-process-pid run-id test-id)) - (event-time (vector-ref running-test 5)) - (duration (vector-ref running-test 12)) - (flag 0) - (curr-time (current-seconds))) - (if (and (< (+ event-time duration 600) curr-time) (not (launch:is-test-alive host pid))) ;;test has not updated duration in last 10 min then likely its not running but confirm before marking it as killed - (begin - (debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed") - (set! flag 1) - (rmt:set-state-status-and-roll-up-items run-id test-name item-path "KILLREQ" "n/a" #f))) - (if (not (null? tal)) - (loop (car tal) (cdr tal) (+ kill-cnt flag)) - (+ kill-cnt flag)))))) - -;; DO NOT USE - caching of configs is handled in launch:setup now. -;; -(define (launch:cache-config) - ;; if we have a linktree and -runtests and -target and the directory exists dump the config - ;; to megatest-(current-seconds).cfg and symlink it to megatest.cfg - (if (and *configdat* - (or (args:get-arg "-run") - (args:get-arg "-runtests") - (args:get-arg "-execute"))) - (let* ((linktree (common:get-linktree)) ;; (get-environment-variable "MT_LINKTREE")) - (target (common:args-get-target exit-if-bad: #t)) - (runname (or (args:get-arg "-runname") - (args:get-arg ":runname") - (getenv "MT_RUNNAME"))) - (fulldir (conc linktree "/" - target "/" - runname))) - (if (and linktree (common:file-exists? linktree)) ;; can't proceed without linktree - (begin - (debug:print-info 0 *default-log-port* "Have -run with target=" target ", runname=" runname ", fulldir=" fulldir ", testpatt=" (or (args:get-arg "-testpatt") "%")) - (if (not (common:file-exists? fulldir)) - (create-directory fulldir #t)) ;; need to protect with exception handler - (if (and target - runname - (common:file-exists? fulldir)) - (let ((tmpfile (conc fulldir "/.megatest.cfg." (current-seconds))) - (targfile (conc fulldir "/.megatest.cfg-" megatest-version "-" megatest-fossil-hash)) - (rconfig (conc fulldir "/.runconfig." megatest-version "-" megatest-fossil-hash))) - (if (common:file-exists? rconfig) ;; only cache megatest.config AFTER runconfigs has been cached - (begin - (debug:print-info 0 *default-log-port* "Caching megatest.config in " tmpfile) - (if (not (common:in-running-test?)) - (configf:write-alist *configdat* tmpfile)) - (system (conc "ln -sf " tmpfile " " targfile)))) - ))) - (debug:print-info 1 *default-log-port* "No linktree yet, no caching configs."))))) - - -;; gather available information, if legit read configs in this order: -;; -;; if have cache; -;; read it a return it -;; else -;; megatest.config (do not cache) -;; runconfigs.config (cache if all vars avail) -;; megatest.config (cache if all vars avail) -;; returns: -;; *toppath* -;; side effects: -;; sets; *configdat* (megatest.config info) -;; *runconfigdat* (runconfigs.config info) -;; *configstatus* (status of the read data) -;; -(define (launch:setup #!key (force-reread #f) (areapath #f)) - (mutex-lock! *launch-setup-mutex*) - (if (and *toppath* - (eq? *configstatus* 'fulldata) (not force-reread)) ;; got it all - (begin - (debug:print 2 *default-log-port* "NOTE: skipping launch:setup-body call since we have fulldata") - (mutex-unlock! *launch-setup-mutex*) - *toppath*) - (let ((res (launch:setup-body force-reread: force-reread areapath: areapath))) - (mutex-unlock! *launch-setup-mutex*) - res))) - -;; return paths depending on what info is available. -;; -(define (launch:get-cache-file-paths areapath toppath target mtconfig) - (let* ((use-cache (common:use-cache?)) - (runname (common:args-get-runname)) - (linktree (common:get-linktree)) - (testname (common:get-full-test-name)) - (rundir (if (and runname target linktree) - (common:directory-writable? (conc linktree "/" target "/" runname)) - #f)) - (testdir (if (and rundir testname) - (common:directory-writable? (conc rundir "/" testname)) - #f)) - (cachedir (or testdir rundir)) - (mtcachef (and cachedir (conc cachedir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) - (rccachef (and cachedir (conc cachedir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash)))) - (debug:print-info 6 *default-log-port* - "runname=" runname - "\n linktree=" linktree - "\n testname=" testname - "\n rundir=" rundir - "\n testdir=" testdir - "\n cachedir=" cachedir - "\n mtcachef=" mtcachef - "\n rccachef=" rccachef) - (cons mtcachef rccachef))) - -(define (launch:setup-body #!key (force-reread #f) (areapath #f)) - (if (and (eq? *configstatus* 'fulldata) - *toppath* - (not force-reread)) ;; no need to reprocess - *toppath* ;; return toppath - (let* ((use-cache (common:use-cache?)) ;; BB- use-cache checks *configdat* for use-cache setting. We do not have *configdat*. Bootstrapping problem here. - (toppath (or *toppath* areapath (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath - (target (common:args-get-target)) - (sections (if target (list "default" target) #f)) ;; for runconfigs - (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config - (cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) - ;; checking for null cachefiles should not be necessary, I was seeing error car of '(), might be a chicken bug or a red herring ... - (mtcachef (if (null? cachefiles) - #f - (car cachefiles))) ;; (and cachedir (conc cachedir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) - (rccachef (if (null? cachefiles) - #f - (cdr cachefiles)))) ;; (and cachedir (conc cachedir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) - ;; (cancreate (and cachedir (common:file-exists? cachedir)(file-write-access? cachedir) (not (common:in-running-test?))))) - (set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource - ;;(BB> "launch:setup-body -- cachefiles="cachefiles) - (cond - ;; if mtcachef exists just read it, however we need to assume toppath is available in $MT_RUN_AREA_HOME - ((and (not force-reread) - mtcachef rccachef - use-cache - (get-environment-variable "MT_RUN_AREA_HOME") - (common:file-exists? mtcachef) - (common:file-exists? rccachef)) - ;;(BB> "launch:setup-body -- cond branch 1 - use-cache") - (set! *configdat* (configf:read-alist mtcachef)) - ;;(BB> "launch:setup-body -- 1 set! *configdat*="*configdat*) - (set! *runconfigdat* (configf:read-alist rccachef)) - (set! *configinfo* (list *configdat* (get-environment-variable "MT_RUN_AREA_HOME"))) - (set! *configstatus* 'fulldata) - (set! *toppath* (get-environment-variable "MT_RUN_AREA_HOME")) - *toppath*) - ;; there are no existing cached configs, do full reads of the configs and cache them - ;; we have all the info needed to fully process runconfigs and megatest.config - ((and ;; (not force-reread) ;; force-reread is irrelevant in the AND, could however OR it? - mtcachef - rccachef) ;; BB- why are we doing this without asking if caching is desired? - ;;(BB> "launch:setup-body -- cond branch 2") - (let* ((first-pass (configf:find-and-read-config ;; NB// sets MT_RUN_AREA_HOME as side effect - mtconfig - environ-patt: "env-override" - given-toppath: toppath - pathenvvar: "MT_RUN_AREA_HOME")) - (first-rundat (let ((toppath (if toppath - toppath - (car first-pass)))) - (configf:read-config ;; (conc toppath "/runconfigs.config") ;; this should be converted to runconfig:read but it is non-trivial, leaving it for now. - (conc (if (string? toppath) - toppath - (get-environment-variable "MT_RUN_AREA_HOME")) - "/runconfigs.config") - *runconfigdat* #t - sections: sections)))) - (set! *runconfigdat* first-rundat) - (if first-pass ;; - (begin - ;;(BB> "launch:setup-body -- \"first-pass\"=first-pass") - (set! *configdat* (car first-pass)) - ;;(BB> "launch:setup-body -- 2 set! *configdat*="*configdat*) - (set! *configinfo* first-pass) - (set! *toppath* (or toppath (cadr first-pass))) ;; use the gathered data unless already have it - (set! toppath *toppath*) - (if (not *toppath*) - (begin - (debug:print-error 0 *default-log-port* "you are not in a megatest area!") - (exit 1))) - (setenv "MT_RUN_AREA_HOME" *toppath*) - ;; the seed read is done, now read runconfigs, cache it then read megatest.config one more time and cache it - (let* ((keys (rmt:get-keys)) - (key-vals (keys:target->keyval keys target)) - (linktree (common:get-linktree)) ;; (or (getenv "MT_LINKTREE")(if *configdat* (configf:lookup *configdat* "setup" "linktree") #f))) - ; (if *configdat* - ; (configf:lookup *configdat* "setup" "linktree") - ; (conc *toppath* "/lt")))) - (second-pass (configf:find-and-read-config - mtconfig - environ-patt: "env-override" - given-toppath: toppath - pathenvvar: "MT_RUN_AREA_HOME")) - (runconfigdat (begin ;; this read of the runconfigs will see any adjustments made by re-reading megatest.config - (for-each (lambda (kt) - (setenv (car kt) (cadr kt))) - key-vals) - (configf:read-config (conc toppath "/runconfigs.config") *runconfigdat* #t ;; consider using runconfig:read some day ... - sections: sections))) - (cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) - (mtcachef (car cachefiles)) - (rccachef (cdr cachefiles))) - ;; trap exception due to stale NFS handle -- Error: (open-output-file) cannot open file - Stale NFS file handle: "/p/fdk/gwa/lefkowit/mtTesting/qa/primbeqa/links/p1222/11/PDK_r1.1.1/prim/clean/pcell_testgen/.runconfigs.cfg-1.6427-7d1e789cb3f62f9cde719a4865bb51b3c17ea853" - ticket 220546342 - ;; TODO - consider 1) using simple-lock to bracket cache write - ;; 2) cache in hash on server, since need to do rmt: anyway to lock. - - (if rccachef - (common:fail-safe - (lambda () - (configf:write-alist runconfigdat rccachef)) - (conc "Could not write cache file - "rccachef))) - (if mtcachef - (common:fail-safe - (lambda () - (configf:write-alist *configdat* mtcachef)) - (conc "Could not write cache file - "mtcachef))) - (set! *runconfigdat* runconfigdat) - (if (and rccachef mtcachef) (set! *configstatus* 'fulldata)))) - ;; no configs found? should not happen but let's try to recover gracefully, return an empty hash-table - (set! *configdat* (make-hash-table)) - ))) - - ;; else read what you can and set the flag accordingly - ;; here we don't have either mtconfig or rccachef - (else - ;;(BB> "launch:setup-body -- cond branch 3 - else") - (let* ((cfgdat (configf:find-and-read-config - (or (args:get-arg "-config") "megatest.config") - environ-patt: "env-override" - given-toppath: (get-environment-variable "MT_RUN_AREA_HOME") - pathenvvar: "MT_RUN_AREA_HOME"))) - - (if (and cfgdat (list? cfgdat) (> (length cfgdat) 0) (hash-table? (car cfgdat))) - (let* ((toppath (or (get-environment-variable "MT_RUN_AREA_HOME")(cadr cfgdat))) - (rdat (configf:read-config (conc toppath ;; convert this to use runconfig:read! - "/runconfigs.config") *runconfigdat* #t sections: sections))) - (set! *configinfo* cfgdat) - (set! *configdat* (car cfgdat)) - (set! *runconfigdat* rdat) - (set! *toppath* toppath) - (set! *configstatus* 'partial)) - (begin - (debug:print-error 0 *default-log-port* "No " mtconfig " file found. Giving up.") - (exit 2)))))) - ;; COND ends here. - - ;; additional house keeping - (let* ((linktree (or (common:get-linktree) - (conc *toppath* "/lt")))) - (if linktree - (begin - (if (not (common:file-exists? linktree)) - (begin - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* "Something went wrong when trying to create linktree dir at " linktree) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (exit 1)) - (create-directory linktree #t)))) - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* "Something went wrong when trying to create link to linktree at " *toppath*) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) - (let ((tlink (conc *toppath* "/lt"))) - (if (not (common:file-exists? tlink)) - (create-symbolic-link linktree tlink))))) - (begin - (debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config") - ))) - (if (and *toppath* - (directory-exists? *toppath*)) - (begin - (setenv "MT_RUN_AREA_HOME" *toppath*) - (setenv "MT_TESTSUITENAME" (common:get-testsuite-name))) - (begin - (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.") - (set! *toppath* #f) ;; force it to be false so we return #f - #f)) - - ;; one more attempt to cache the configs for future reading - (let* ((cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) - (mtcachef (car cachefiles)) - (rccachef (cdr cachefiles))) - - ;; trap exception due to stale NFS handle -- Error: (open-output-file) cannot open file - Stale NFS file handle: "...somepath.../.runconfigs.cfg-1.6427-7d1e789cb3f62f9cde719a4865bb51b3c17ea853" - ticket 220546342 - ;; TODO - consider 1) using simple-lock to bracket cache write - ;; 2) cache in hash on server, since need to do rmt: anyway to lock. - (if (and rccachef *runconfigdat* (not (common:file-exists? rccachef))) - (common:fail-safe - (lambda () - (configf:write-alist *runconfigdat* rccachef)) - (conc "Could not write cache file - "rccachef)) - ) - (if (and mtcachef *configdat* (not (common:file-exists? mtcachef))) - (common:fail-safe - (lambda () - (configf:write-alist *configdat* mtcachef)) - (conc "Could not write cache file - "mtcachef)) - ) - (if (and rccachef mtcachef *runconfigdat* *configdat*) - (set! *configstatus* 'fulldata))) - - ;; if have -append-config then read and append here - (let ((cfname (args:get-arg "-append-config"))) - (if (and cfname - (file-read-access? cfname)) - (configf:read-config cfname *configdat* #t))) ;; values are added to the hash, no need to do anything special. - *toppath*))) - -(define (get-best-disk confdat testconfig) - (let* ((disks (or (and testconfig (hash-table-ref/default testconfig "disks" #f)) - (hash-table-ref/default confdat "disks" #f))) - (minspace (let ((m (configf:lookup confdat "setup" "minspace"))) - (string->number (or m "10000"))))) - (if disks - (let ((res (common:get-disk-with-most-free-space disks minspace))) ;; min size of 1000, seems tad dumb - (if res - (cdr res) - (begin -;; (if (common:low-noise-print 20 "No valid disks or no disk with enough space") -;; (debug:print-error 0 *default-log-port* "No valid disks found in megatest.config. Please add some to your [disks] section and ensure the directory exists and has enough space!\n You can change minspace in the [setup] section of megatest.config. Current setting is: " minspace)) - ;;(exit 1) - (if (null? disks) - (cons 1 (conc *toppath* "/runs")) - (let ((paths (sort disks (lambda (x y) (> (string-length (cadr x)) (string-length (cadr y))))))) - (let loop ((head (car paths)) (tail (cdr paths))) - (let ((result (handle-exceptions exn #f (create-directory (cadr head) #t)))) - (if result - result - (if (null? tail) - (cons 1 (conc *toppath* "/runs")) - (loop (car tail) (cdr tail)))))))))))))) ;; the code creates the necessary directories if it does not exist and returns the path. - - -(define (launch:test-copy test-src-path test-path) - (let* ((ovrcmd (let ((cmd (configf:lookup *configdat* "setup" "testcopycmd"))) - (if cmd - ;; substitute the TEST_SRC_PATH and TEST_TARG_PATH - (string-substitute "TEST_TARG_PATH" test-path - (string-substitute "TEST_SRC_PATH" test-src-path cmd #t) #t) - #f))) - (cmd (if ovrcmd - ovrcmd - (conc "rsync -av" (if (debug:debug-mode 1) "" "q") " " test-src-path "/ " test-path "/" - " >> " test-path "/mt_launch.log 2>> " test-path "/mt_launch.log"))) - (status (system cmd))) - (if (not (eq? status 0)) - (debug:print 2 *default-log-port* "ERROR: problem with running \"" cmd "\"")))) - - -;; Desired directory structure: -;; -;; - - -. -;; | -;; v -;; - - -|- -;; -;; dir stored in test is: -;; -;; - - [ - ] -;; -;; All log file links should be stored relative to the top of link path -;; -;; - [ - ] -;; -(define (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat #!key (remtries 2)) - (let* ((item-path (if (string? itemdat) itemdat (item-list->path itemdat))) ;; if pass in string - just use it - (runname (if (string? run-info) ;; if we pass in a string as run-info use it as run-name. - run-info - (db:get-value-by-header (db:get-rows run-info) - (db:get-header run-info) - "runname"))) - (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour")) - ;; convert back to db: from rdb: - this is always run at server end - (target (string-intersperse (map cadr keyvals) "/")) - - (not-iterated (equal? "" item-path)) - - ;; all tests are found at /test-base or /test-base - (testtop-base (conc target "/" runname "/" testname)) - (test-base (conc testtop-base (if not-iterated "" "/") item-path)) - - ;; nb// if itempath is not "" then it is prefixed with "/" - (toptest-path (conc disk-path (if contour (conc "/" contour) "") "/" testtop-base)) - (test-path (conc disk-path (if contour (conc "/" contour) "") "/" test-base)) - - ;; ensure this exists first as links to subtests must be created there - (linktree (common:get-linktree)) - ;; WAS: (let ((rd (configf:lookup *configdat* "setup" "linktree"))) - ;; (if rd rd (conc *toppath* "/runs")))) - ;; which seems wrong ... - - (lnkbase (conc linktree (if contour (conc "/" contour) "") "/" target "/" runname)) - (lnkpath (conc lnkbase "/" testname)) - (lnkpathf (conc lnkpath (if not-iterated "" "/") item-path)) - (lnktarget (conc lnkpath "/" item-path))) - - ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical - ;; rundir shortdir - (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path run-id) - - (debug:print 2 *default-log-port* "INFO:\n lnkbase=" lnkbase "\n lnkpath=" lnkpath "\n toptest-path=" toptest-path "\n test-path=" test-path) - (if (not (common:file-exists? linktree)) - (begin - (debug:print 0 *default-log-port* "WARNING: linktree did not exist! Creating it now at " linktree) - (create-directory linktree #t))) ;; (system (conc "mkdir -p " linktree)))) - ;; create the directory for the tests dir links, this is needed no matter what... try up to three times - (let loop ((done 3)) - (let ((success (if (and (not (common:directory-exists? lnkbase)) - (not (common:file-exists? lnkbase))) - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* "Problem creating linktree base at " lnkbase) - (print-error-message exn (current-error-port)) - #t) - (create-directory lnkbase #t) - #f)))) - (if (and (not success)(> done 0)) - (loop (- done 1))))) - - ;; update the toptest record with its location rundir, cache the path - ;; This wass highly inefficient, one db write for every subtest, potentially - ;; thousands of unnecessary updates, cache the fact it was set and don't set it - ;; again. - - ;; Now create the link from the test path to the link tree, however - ;; if the test is iterated it is necessary to create the parent path - ;; to the iteration. use pathname-directory to trim the path by one - ;; level - (if (not not-iterated) ;; i.e. iterated - (let ((iterated-parent (pathname-directory (conc lnkpath "/" item-path)))) - (debug:print-info 2 *default-log-port* "Creating iterated parent " iterated-parent) - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* " Failed to create directory " iterated-parent ((condition-property-accessor 'exn 'message) exn) ", exiting") - (exit 1)) - (create-directory iterated-parent #t)))) - - (if (symbolic-link? lnkpath) - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* " Failed to remove symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", exiting") - (exit 1)) - (delete-file lnkpath))) - - (if (not (or (common:file-exists? lnkpath) - (symbolic-link? lnkpath))) - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* " Failed to create symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", exiting") - (exit 1)) - (create-symbolic-link toptest-path lnkpath))) - - ;; NB - This was not working right - some top tests are not getting the path set!!! - ;; - ;; Do the setting of this record after the paths are created so that the shortdir can - ;; be set to the real directory location. This is safer for future clean up if the link - ;; tree is damaged or lost. - ;; - (if (not (hash-table-ref/default *toptest-paths* testname #f)) - (let* ((testinfo (rmt:get-test-info-by-id run-id test-id)) ;; run-id testname item-path)) - (curr-test-path (if testinfo ;; (filedb:get-path *fdb* - ;; (db:get-path dbstruct - ;; (rmt:sdb-qry 'getstr - (db:test-get-rundir testinfo) ;; ) ;; ) - #f))) - (hash-table-set! *toptest-paths* testname curr-test-path) - ;; NB// Was this for the test or for the parent in an iterated test? - (rmt:general-call 'test-set-rundir-shortdir run-id lnkpath - (if (common:file-exists? lnkpath) - ;; (resolve-pathname lnkpath) - (common:nice-path lnkpath) - lnkpath) - testname "" run-id) - ;; (rmt:general-call 'test-set-rundir run-id lnkpath testname "") ;; toptest-path) - (if (or (not curr-test-path) - (not (directory-exists? toptest-path))) - (begin - (debug:print-info 2 *default-log-port* "Creating " toptest-path " and link " lnkpath) - (handle-exceptions - exn - #f ;; don't care to catch and deal with errors here for now. - (create-directory toptest-path #t)) - (hash-table-set! *toptest-paths* testname toptest-path))))) - - ;; The toptest path has been created, the link to the test in the linktree has - ;; been created. Now, if this is an iterated test the real test dir must be created - (if (not not-iterated) ;; this is an iterated test - (begin ;; (let ((lnktarget (conc lnkpath "/" item-path))) - (debug:print 2 *default-log-port* "Setting up sub test run area") - (debug:print 2 *default-log-port* " - creating run area in " test-path) - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ", exiting") - (exit 1)) - (create-directory test-path #t)) - (debug:print 2 *default-log-port* - " - creating link from: " test-path "\n" - " to: " lnktarget) - - ;; If there is already a symlink delete it and recreate it. - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* " Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting") - (exit)) - (if (symbolic-link? lnktarget) (delete-file lnktarget)) - (if (not (common:file-exists? lnktarget)) (create-symbolic-link test-path lnktarget))))) - - (if (not (directory? test-path)) - (create-directory test-path #t)) ;; this is a hack, I don't know why out of the blue this path does not exist sometimes - - (if (and test-src-path (directory? test-path)) - (begin - (launch:test-copy test-src-path test-path) - (list lnkpathf lnkpath )) - (if (and test-src-path (> remtries 0)) - (begin - (debug:print-error 0 *default-log-port* "Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) - ;; - (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1))) - (list #f #f))))) - - -(define (launch:handle-zombie-tests run-id) - (let* ((key (conc "zombiescan-runid-"run-id)) - (now (current-seconds)) - (threshold (- (current-seconds) (* 2 (or (configf:lookup-number *configdat* "setup" "deadtime") 120)))) - (val (rmt:get-var key)) - (do-scan? - (cond - ((not val) - #t) - ((< val threshold) - #t) - (else #f)))) - (when do-scan? - (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") - (rmt:set-var key (current-seconds)) - (rmt:find-and-mark-incomplete run-id #f)))) - - - - - -;; 1. look though disks list for disk with most space -;; 2. create run dir on disk, path name is meaningful -;; 3. create link from run dir to megatest runs area -;; 4. remotely run the test on allocated host -;; - could be ssh to host from hosts table (update regularly with load) -;; - could be netbatch -;; (launch-test db (cadr status) test-conf)) -(define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) - (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex - (let* ( ;; (lock-key (conc "test-" test-id)) - ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) - ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds - ;; (if (car lock) - ;; #t - ;; (if (> (current-seconds) expire-time) - ;; (begin - ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path) - ;; (rmt:no-sync-del! lock-key) ;; destroy the lock - ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; - ;; (begin - ;; (thread-sleep! 1) - ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) - (item-path (item-list->path itemdat)) - (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) - (let loop ((delta (- (current-seconds) *last-launch*)) - (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1))) - (if (> launch-delay delta) - (begin - (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. - (debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds")) - (thread-sleep! (- launch-delay delta)) - (loop (- (current-seconds) *last-launch*) launch-delay)))) - (change-directory *toppath*) - (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars) - (append - (list - (list "MT_RUN_AREA_HOME" *toppath*) - (list "MT_TEST_NAME" test-name) - (list "MT_RUNNAME" runname) - (list "MT_ITEMPATH" item-path) - (list "MT_CONTOUR" contour) - ) - itemdat)) - (let* ((tregistry (tests:get-all)) ;; third param (below) is system-allowed - ;; for tconfig, why do we allow fallback to test-conf? - (tconfig (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t) - (begin - (debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. This is likely not desired.") - test-conf))) ;; force re-read now that all vars are set - (useshell (let ((ush (configf:lookup *configdat* "jobtools" "useshell"))) - (if ush - (if (equal? ush "no") ;; must use "no" to NOT use shell - #f - ush) - #t))) ;; default is yes - (runscript (configf:lookup tconfig "setup" "runscript")) - (ezsteps (> (length (hash-table-ref/default tconfig "ezsteps" '())) 0)) ;; don't send all the steps, could be big, just send a flag - (subrun (> (length (hash-table-ref/default tconfig "subrun" '())) 0)) ;; send a flag to process a subrun - ;; (diskspace (configf:lookup tconfig "requirements" "diskspace")) - ;; (memory (configf:lookup tconfig "requirements" "memory")) - ;; (hosts (configf:lookup *configdat* "jobtools" "workhosts")) ;; I'm pretty sure this was never completed - (remote-megatest (configf:lookup *configdat* "setup" "executable")) - (run-time-limit (or (configf:lookup tconfig "requirements" "runtimelim") - (configf:lookup *configdat* "setup" "runtimelim"))) - ;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to - ;; allow running from dashboard. Extract the path - ;; from the called megatest and convert dashboard - ;; or dboard to megatest - (local-megatest (let* ((lm (car (argv))) - (dir (pathname-directory lm)) - (exe (pathname-strip-directory lm))) - (conc (if dir (conc dir "/") "") - (case (string->symbol exe) - ((dboard) "../megatest") - ((mtest) "../megatest") - ((dashboard) "megatest") - (else exe))))) - (launcher (common:get-launcher *configdat* test-name item-path)) ;; (configf:lookup *configdat* "jobtools" "launcher")) - (test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path - (work-area #f) - (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all - (diskpath #f) - (cmdparms #f) - (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) - (mt-bindir-path #f) - (testinfo (rmt:get-test-info-by-id run-id test-id)) - (mt_target (string-intersperse (map cadr keyvals) "/")) - (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) - (if (args:get-arg "-logging")(list "-logging") '())))) - ;; (if hosts (set! hosts (string-split hosts))) - ;; set the megatest to be called on the remote host - (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest")) - (set! mt-bindir-path (pathname-directory remote-megatest)) - (if launcher (set! launcher (string-split launcher))) - ;; set up the run work area for this test - (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run - (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir - (begin - (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) - (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record - - ;; prevent overlapping actions - set to LAUNCHED as early as possible - ;; - ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail - (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) - (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f) - ;; (pp (hash-table->alist tconfig)) - (set! diskpath (get-best-disk *configdat* tconfig)) - (if diskpath - (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) - (set! work-area (car dat)) - (set! toptest-work-area (cadr dat)) - (debug:print-info 2 *default-log-port* "Using work area " work-area)) - (begin - (set! work-area (conc test-path "/tmp_run")) - (create-directory work-area #t) - (debug:print 0 *default-log-port* "WARNING: No disk work area specified - running in the test directory under tmp_run"))) - (set! cmdparms (base64:base64-encode - (z3:encode-buffer - (with-output-to-string - (lambda () ;; (list 'hosts hosts) - (write (list (list 'testpath test-path) - ;; (list 'transport (conc *transport-type*)) - ;; (list 'serverinf *server-info*) - (list 'homehost (let* ((hhdat (common:get-homehost))) - (if hhdat - (car hhdat) - #f))) - (list 'serverurl (if *runremote* - (remote-server-url *runremote*) - #f)) ;; - (list 'areaname (common:get-testsuite-name)) - (list 'toppath *toppath*) - (list 'work-area work-area) - (list 'test-name test-name) - (list 'runscript runscript) - (list 'run-id run-id ) - (list 'test-id test-id ) - ;; (list 'item-path item-path ) - (list 'itemdat itemdat ) - (list 'megatest remote-megatest) - (list 'ezsteps ezsteps) - (list 'subrun subrun) - (list 'target mt_target) - (list 'contour contour) - (list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) - (list 'env-ovrd (hash-table-ref/default *configdat* "env-override" '())) - (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) - (list 'runname runname) - (list 'mt-bindir-path mt-bindir-path)))))))) - - ;; clean out step records from previous run if they exist - ;; (rmt:delete-test-step-records run-id test-id) - ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway - (if (common:file-exists? work-area) - (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir - (cond - ;; ((and launcher hosts) ;; must be using ssh hostname - ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) - ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms)))) - (launcher - (set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) - ;; (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms)))) - (else - (if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section")) - (set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" "")))))) - ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" ""))))) - (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm")))) - (debug:print 1 *default-log-port* "Launching " work-area) - ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done - (debug:print 4 *default-log-port* "fullcmd: " fullcmd) - (set! *last-launch* (current-seconds)) ;; all that junk above takes time, set this as late as possible. - (let* ((commonprevvals (alist->env-vars - (hash-table-ref/default *configdat* "env-override" '()))) - (miscprevvals (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute" - (append (list (list "MT_TEST_RUN_DIR" work-area) - (list "MT_TEST_NAME" test-name) - (list "MT_ITEM_INFO" (conc itemdat)) - (list "MT_RUNNAME" runname) - (list "MT_TARGET" mt_target) - (list "MT_ITEMPATH" item-path) - ) - itemdat))) - (testprevvals (alist->env-vars - (hash-table-ref/default tconfig "pre-launch-env-overrides" '()))) - ;; Launchwait defaults to true, must override it to turn off wait - (launchwait (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t)) - (launch-results-prev (apply (if launchwait ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed. - process:cmd-run-with-stderr-and-exitcode->list - process-run) - (if useshell - (let ((cmdstr (string-intersperse fullcmd " "))) - (if launchwait - cmdstr - (conc cmdstr " >> mt_launch.log 2>&1 &"))) - (car fullcmd)) - (if useshell - '() - (cdr fullcmd)))) - (success (if launchwait (equal? 0 (cadr launch-results-prev)) #t)) - (launch-results (if launchwait (car launch-results-prev) launch-results-prev))) - (if (not success) - (tests:test-set-status! run-id test-id "COMPLETED" "DEAD" "launcher failed; exited non-zero; check mt_launch.log" #f)) ;; (if launch-results launch-results "FAILED")) - (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. - ;; (rmt:no-sync-del! lock-key) ;; release the lock for starting this test - (if (not launchwait) ;; give the OS a little time to allow the process to start - (thread-sleep! 0.01)) - (with-output-to-file "mt_launch.log" - (lambda () - (print "LAUNCHCMD: " (string-intersperse fullcmd " ")) - (if (list? launch-results) - (apply print launch-results) - (print "NOTE: launched \"" fullcmd "\"\n but did not wait for it to proceed. Add the following to megatest.config \n[setup]\nlaunchwait yes\n if you have problems with this")) - #:append)) - (debug:print 2 *default-log-port* "Launching completed, updating db") - (debug:print 2 *default-log-port* "Launch results: " launch-results) - (if (not launch-results) - (begin - (print "ERROR: Failed to run " (string-intersperse fullcmd " ") ", exiting now") - ;; (sqlite3:finalize! db) - ;; good ole "exit" seems not to work - ;; (_exit 9) - ;; but this hack will work! Thanks go to Alan Post of the Chicken email list - ;; NB// Is this still needed? Should be safe to go back to "exit" now? - (process-signal (current-process-id) signal/kill) - )) - (alist->env-vars miscprevvals) - (alist->env-vars testprevvals) - (alist->env-vars commonprevvals) - launch-results)) - (change-directory *toppath*))) - -;; recover a test where the top controlling mtest may have died -;; -(define (launch:recover-test run-id test-id) - ;; this function is called on the test run host via ssh - ;; - ;; 1. look at the process from pid - ;; - is it owned by calling user - ;; - it it's run directory correct for the test - ;; - is there a controlling mtest (maybe stuck) - ;; 2. if recovery is needed watch pid - ;; - when it exits take the exit code and do the needful - ;; - (let* ((pid (rmt:test-get-top-process-pid run-id test-id)) - (psres (with-input-from-pipe - (conc "ps -F -u " (current-user-name) " | grep -E '" pid " ' | grep -v 'grep -E " pid "'") - (lambda () - (read-line)))) - (rundir (if (string? psres) ;; real process owned by user - (read-symbolic-link (conc "/proc/" pid "/cwd")) - #f))) - ;; now wait on that process if all is correct - ;; periodically update the db with runtime - ;; when the process exits look at the db, if still RUNNING after 10 seconds set - ;; state/status appropriately - (process-wait pid))) - - -;; Do not rpc this one, do the underlying calls!!! -(define (tests:test-set-status! run-id test-id state status comment dat #!key (work-area #f)) - (let* ((real-status status) - (otherdat (if dat dat (make-hash-table))) - (testdat (rmt:get-test-info-by-id run-id test-id)) - (test-name (db:test-get-testname testdat)) - (item-path (db:test-get-item-path testdat)) - ;; before proceeding we must find out if the previous test (where all keys matched except runname) - ;; was WAIVED if this test is FAIL - - ;; NOTES: - ;; 1. Is the call to test:get-previous-run-record remotified? - ;; 2. Add test for testconfig waiver propagation control here - ;; - (prev-test (if (equal? status "FAIL") - (rmt:get-previous-test-run-record run-id test-name item-path) - #f)) - (waived (if prev-test - (if prev-test ;; true if we found a previous test in this run series - (let ((prev-status (db:test-get-status prev-test)) - (prev-state (db:test-get-state prev-test)) - (prev-comment (db:test-get-comment prev-test))) - (debug:print 4 *default-log-port* "prev-status " prev-status ", prev-state " prev-state ", prev-comment " prev-comment) - (if (and (equal? prev-state "COMPLETED") - (equal? prev-status "WAIVED")) - (if comment - comment - prev-comment) ;; waived is either the comment or #f - #f)) - #f) - #f))) - (if (and waived - (tests:check-waiver-eligibility testdat prev-test)) - (set! real-status "WAIVED")) - - (debug:print 4 *default-log-port* "real-status " real-status ", waived " waived ", status " status) - - ;; update the primary record IF state AND status are defined - (if (and state status) - (begin - (rmt:set-state-status-and-roll-up-items run-id test-id item-path state real-status (if waived waived comment)) - ;; (mt:process-triggers run-id test-id state real-status) ;; triggers are called in test-set-state-status - )) - - ;; if status is "AUTO" then call rollup (note, this one modifies data in test - ;; run area, it does remote calls under the hood. - ;; (if (and test-id state status (equal? status "AUTO")) - ;; (rmt:test-data-rollup run-id test-id status)) - - ;; add metadata (need to do this way to avoid SQL injection issues) - - ;; :first_err - ;; (let ((val (hash-table-ref/default otherdat ":first_err" #f))) - ;; (if val - ;; (sqlite3:execute db "UPDATE tests SET first_err=? WHERE run_id=? AND testname=? AND item_path=?;" val run-id test-name item-path))) - ;; - ;; ;; :first_warn - ;; (let ((val (hash-table-ref/default otherdat ":first_warn" #f))) - ;; (if val - ;; (sqlite3:execute db "UPDATE tests SET first_warn=? WHERE run_id=? AND testname=? AND item_path=?;" val run-id test-name item-path))) - - (let ((category (hash-table-ref/default otherdat ":category" "")) - (variable (hash-table-ref/default otherdat ":variable" "")) - (value (hash-table-ref/default otherdat ":value" #f)) - (expected (hash-table-ref/default otherdat ":expected" "n/a")) - (tol (hash-table-ref/default otherdat ":tol" "n/a")) - (units (hash-table-ref/default otherdat ":units" "")) - (type (hash-table-ref/default otherdat ":type" "")) - (dcomment (hash-table-ref/default otherdat ":comment" ""))) - (debug:print 4 *default-log-port* - "category: " category ", variable: " variable ", value: " value - ", expected: " expected ", tol: " tol ", units: " units) - (if (and value) ;; require only value; BB was- all three required - (let ((dat (conc category "," - variable "," - value "," - expected "," - tol "," - units "," - dcomment ",," ;; extra comma for status - type ))) - ;; This was run remote, don't think that makes sense. Perhaps not, but that is the easiest path for the moment. - (rmt:csv->test-data run-id test-id - dat) - ;; This was added in check-in a5adfa3f9a. Message was: "...added delay in set-values to allow for delayed write on server start" - ;; I'm inserting an arbitrary rmt: call to force/ensure that the server is available to (hopefully) prevent a communication issue. - (rmt:get-var "MEGATEST_VERSION") ;; this does NOTHING but ensure the server is reachable. This is almost certainly NOT needed :) - ;; BB - commentiong out arbitrary 10 second wait (thread-sleep! 10) ;; add 10 second delay before quit incase rmt needs time to start a server. - ))) - - ;; need to update the top test record if PASS or FAIL and this is a subtest - ;;;;;; (if (not (equal? item-path "")) - ;;;;;; (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status #f) ;;;;;) - - (if (or (and (string? comment) - (string-match (regexp "\\S+") comment)) - waived) - (let ((cmt (if waived waived comment))) - (rmt:general-call 'set-test-comment run-id cmt test-id))))) - ) Index: runsmod.scm ================================================================== --- runsmod.scm +++ runsmod.scm @@ -2648,7 +2648,4240 @@ (item-path (vector-ref running-test 11))) (debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed") (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) +;; This is the Megatest API. All generally "useful" routines will be wrapped or extended +;; here. + +;; 0 1 2 3 +(defstruct launch:einf (pid #t)(exit-status #t)(exit-code #t)(rollup-status 0)) + +;;====================================================================== +;; R U N S +;;====================================================================== + +;; runs:get-runs-by-patt +;; get runs by list of criteria +;; register a test run with the db +;; +;; Use: (db-get-value-by-header (db:get-header runinfo)(db:get-rows runinfo)) +;; to extract info from the structure returned +;; +(define (mt:get-runs-by-patt keys runnamepatt targpatt) + (let loop ((runsdat (rmt:get-runs-by-patt keys runnamepatt targpatt 0 500 #f 0)) + (res '()) + (offset 0) + (limit 500)) + ;; (print "runsdat: " runsdat) + (let* ((header (vector-ref runsdat 0)) + (runslst (vector-ref runsdat 1)) + (full-list (append res runslst)) + (have-more (eq? (length runslst) limit))) + ;; (debug:print 0 *default-log-port* "header: " header " runslst: " runslst " have-more: " have-more) + (if have-more + (let ((new-offset (+ offset limit)) + (next-batch (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit #f 0))) + (debug:print-info 4 *default-log-port* "More than " limit " runs, have " (length full-list) " runs so far.") + (debug:print-info 0 *default-log-port* "next-batch: " next-batch) + (loop next-batch + full-list + new-offset + limit)) + (vector header full-list))))) + +;;====================================================================== +;; T E S T S +;;====================================================================== + +(define (mt:get-tests-for-run run-id testpatt states status #!key (not-in #t) (sort-by 'event_time) (sort-order "ASC") (qryvals #f)(last-update #f)) + (let loop ((testsdat (rmt:get-tests-for-run run-id testpatt states status 0 500 not-in sort-by sort-order qryvals last-update 'normal)) + (res '()) + (offset 0) + (limit 500)) + (let* ((full-list (append res testsdat)) + (have-more (eq? (length testsdat) limit))) + (if have-more + (let ((new-offset (+ offset limit))) + (debug:print-info 4 *default-log-port* "More than " limit " tests, have " (length full-list) " tests so far.") + (loop (rmt:get-tests-for-run run-id testpatt states status new-offset limit not-in sort-by sort-order qryvals last-update 'normal) + full-list + new-offset + limit)) + full-list)))) + +(define (mt:lazy-get-prereqs-not-met run-id waitons ref-item-path #!key (mode '(normal))(itemmaps #f) ) + (let* ((key (list run-id waitons ref-item-path mode)) + (res (hash-table-ref/default *pre-reqs-met-cache* key #f)) + (useres (let ((last-time (if (vector? res) (vector-ref res 0) #f))) + (if last-time + (< (current-seconds)(+ last-time 5)) + #f)))) + (if useres + (let ((result (vector-ref res 1))) + (debug:print 4 *default-log-port* "Using lazy value res: " result) + result) + (let ((newres (rmt:get-prereqs-not-met run-id waitons ref-item-path mode: mode itemmaps: itemmaps))) + (hash-table-set! *pre-reqs-met-cache* key (vector (current-seconds) newres)) + newres)))) + +(define (mt:get-run-stats dbstruct run-id) +;; Get run stats from local access, move this ... but where? + (db:get-run-stats dbstruct run-id)) + +(define (mt:discard-blocked-tests run-id failed-test tests test-records) + (if (null? tests) + tests + (begin + (debug:print-info 1 *default-log-port* "Discarding tests from " tests " that are waiting on " failed-test) + (let loop ((testn (car tests)) + (remt (cdr tests)) + (res '())) + (let* ((test-dat (hash-table-ref/default test-records testn (vector #f #f '()))) + (waitons (vector-ref test-dat 2))) + ;; (print "mt:discard-blocked-tests run-id: " run-id " failed-test: " failed-test " testn: " testn " with waitons: " waitons) + (if (null? remt) + (let ((new-res (reverse res))) + ;; (print " new-res: " new-res) + new-res) + (loop (car remt) + (cdr remt) + (if (member failed-test waitons) + (begin + (debug:print 0 *default-log-port* "Discarding test " testn "(" test-dat ") due to " failed-test) + res) + (cons testn res))))))))) + +;;====================================================================== +;; S T A T E A N D S T A T U S F O R T E S T S +;;====================================================================== + +;; speed up for common cases with a little logic +(define (mt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment) + (if (not (and run-id test-id)) + (begin + (debug:print-error 0 *default-log-port* "bad data handed to mt:test-set-state-status-by-id, run-id=" run-id ", test-id=" test-id ", newstate=" newstate) + (print-call-chain (current-error-port)) + #f) + (begin + ;; cond + ;; ((and newstate newstatus newcomment) + ;; (rmt:general-call 'state-status-msg run-id newstate newstatus newcomment test-id)) + ;; ((and newstate newstatus) + ;; (rmt:general-call 'state-status run-id newstate newstatus test-id)) + ;; (else + ;; (if newstate (rmt:general-call 'set-test-state run-id newstate test-id)) + ;; (if newstatus (rmt:general-call 'set-test-status run-id newstatus test-id)) + ;; (if newcomment (rmt:general-call 'set-test-comment run-id newcomment test-id)))) + (rmt:set-state-status-and-roll-up-items run-id test-id #f newstate newstatus newcomment) + ;; (mt:process-triggers run-id test-id newstate newstatus) + #t))) + + +(define (mt:test-set-state-status-by-id-unless-completed run-id test-id newstate newstatus newcomment) + (let* ((test-vec (rmt:get-testinfo-state-status run-id test-id)) + (state (vector-ref test-vec 3))) + (if (equal? state "COMPLETED") + #t + (rmt:set-state-status-and-roll-up-items run-id test-id #f newstate newstatus newcomment)))) + + +(define (mt:test-set-state-status-by-testname run-id test-name item-path new-state new-status new-comment) + ;(let ((test-id (rmt:get-test-id run-id test-name item-path))) + (rmt:set-state-status-and-roll-up-items run-id test-name item-path new-state new-status new-comment) + ;; (mt:process-triggers run-id test-id new-state new-status) + #t);) + ;;(mt:test-set-state-status-by-id run-id test-id new-state new-status new-comment))) + +(define (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path new-state new-status new-comment) + (let ((test-id (rmt:get-test-id run-id test-name item-path))) + (mt:test-set-state-status-by-id-unless-completed run-id test-id new-state new-status new-comment))) + +;; kill any runner processes (i.e. processes handling -runtests) that match target/runname +;; +;; do a remote call to get the task queue info but do the killing as self here. +;; +(define (tasks:kill-runner target run-name testpatt) + (let ((records (rmt:tasks-find-task-queue-records target run-name testpatt "running" "run-tests")) + (hostpid-rx (regexp "\\s+(\\w+)\\s+(\\d+)$"))) ;; host pid is at end of param string + (if (null? records) + (debug:print 0 *default-log-port* "No run launching processes found for " target " / " run-name " with testpatt " (or testpatt "* no testpatt specified! *")) + (debug:print 0 *default-log-port* "Found " (length records) " run(s) to kill.")) + (for-each + (lambda (record) + (let* ((param-key (list-ref record 8)) + (match-dat (string-search hostpid-rx param-key))) + (if match-dat + (let ((hostname (cadr match-dat)) + (pid (string->number (caddr match-dat)))) + (debug:print 0 *default-log-port* "Sending SIGINT to process " pid " on host " hostname) + (if (equal? (get-host-name) hostname) + (if (process:alive? pid) + (begin + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "Kill of process " pid " on host " hostname " failed.") + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + #t) + (process-signal pid signal/int) + (thread-sleep! 5) + (if (process:alive? pid) + (process-signal pid signal/kill))))) + ;; (call-with-environment-variables + (let ((old-targethost (getenv "TARGETHOST"))) + (setenv "TARGETHOST" hostname) + (setenv "TARGETHOST_LOGF" "server-kills.log") + (system (conc "nbfake kill " pid)) + (if old-targethost (setenv "TARGETHOST" old-targethost)) + (unsetenv "TARGETHOST") + (unsetenv "TARGETHOST_LOGF")))) + (debug:print-error 0 *default-log-port* "no record or improper record for " target "/" run-name " in tasks_queue in main.db")))) + records))) + +(define (task:get-run-times) + (let* ( + (run-patt (if (args:get-arg "-run-patt") + (args:get-arg "-run-patt") + "%")) + (target-patt (if (args:get-arg "-target-patt") + (args:get-arg "-target-patt") + "%")) + + (run-times (rmt:get-run-times run-patt target-patt ))) + (if (eq? (length run-times) 0) + (begin + (print "Data not found!!") + (exit))) + (if (equal? (args:get-arg "-dumpmode") "json") + (task:print-runtime-as-json run-times) + (if (equal? (args:get-arg "-dumpmode") "csv") + (task:print-runtime run-times ",") + (task:print-runtime run-times " "))))) + + (define (task:get-test-times) + (let* ((runname (if (args:get-arg "-runname") + (args:get-arg "-runname") + #f)) + (target (if (args:get-arg "-target") + (args:get-arg "-target") + #f)) + + (test-times (rmt:get-test-times runname target ))) + (if (not runname) + (begin + (print "Error: Missing argument -runname") + (exit))) + (if (string-contains runname "%") + (begin + (print "Error: Invalid runname, '%' not allowed (" runname ") ") + (exit))) + (if (not target) + (begin + (print "Error: Missing argument -target") + (exit))) + (if (string-contains target "%") + (begin + (print "Error: Invalid target, '%' not allowed (" target ") ") + (exit))) + + (if (eq? (length test-times) 0) + (begin + (print "Data not found!!") + (exit))) + (if (equal? (args:get-arg "-dumpmode") "json") + (task:print-testtime-as-json test-times) + (if (equal? (args:get-arg "-dumpmode") "csv") + (task:print-testtime test-times ",") + (task:print-testtime test-times " "))))) + + + +;; gets mtpg-run-id and syncs the record if different +;; +(define (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time) + (let* ((runs-ht (hash-table-ref cached-info 'runs)) + (runinf (hash-table-ref/default runs-ht run-id #f)) + (area-id (vector-ref area-info 0))) + (if runinf + runinf ;; already cached + (let* ((run-dat (rmt:get-run-info run-id)) ;; NOTE: get-run-info returns a vector < row header > + (run-name (rmt:get-run-name-from-id run-id)) + (row (db:get-rows run-dat)) ;; yes, this returns a single row + (header (db:get-header run-dat)) + (state (db:get-value-by-header row header "state")) + (status (db:get-value-by-header row header "status")) + (owner (db:get-value-by-header row header "owner")) + (event-time (db:get-value-by-header row header "event_time")) + (comment (db:get-value-by-header row header "comment")) + (fail-count (db:get-value-by-header row header "fail_count")) + (pass-count (db:get-value-by-header row header "pass_count")) + (db-contour (db:get-value-by-header row header "contour")) + (contour (if (args:get-arg "-prepend-contour") + (if (and db-contour (not (equal? db-contour "")) (string? db-contour )) + (begin + (debug:print-info 1 *default-log-port* "db-contour") + db-contour) + (args:get-arg "-contour")))) + (run-tag (if (args:get-arg "-run-tag") + (args:get-arg "-run-tag") + "")) + (last-update (db:get-value-by-header row header "last_update")) + (keytarg (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target")) + (conc "MT_CONTOUR/MT_AREA/" (string-intersperse (rmt:get-keys) "/")) (string-intersperse (rmt:get-keys) "/"))) ;; e.g. version/iteration/platform + (target (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target")) + (conc (or (args:get-arg "-prefix-target") (conc contour "/" (common:get-area-name) "/")) (rmt:get-target run-id)) (rmt:get-target run-id))) ;; e.g. v1.63/a3e1/ubuntu + (spec-id (pgdb:get-ttype dbh keytarg)) + (publish-time (if (args:get-arg "-cp-eventtime-to-publishtime") + event-time + (current-seconds))) + (new-run-id (pgdb:get-run-id dbh spec-id target run-name area-id))) + (if new-run-id + (begin ;; let ((run-record (pgdb:get-run-info dbh new-run-id)) + (hash-table-set! runs-ht run-id new-run-id) + ;; ensure key fields are up to date + ;; if last_update == pgdb_last_update do not update smallest-last-update-time + (let* ((pgdb-last-update (pgdb:get-run-last-update dbh new-run-id)) + (smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f))) + (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) + (hash-table-set! smallest-last-update-time "smallest-time" last-update))) + (pgdb:refresh-run-info + dbh + new-run-id + state status owner event-time comment fail-count pass-count area-id last-update publish-time) + (debug:print-info 0 *default-log-port* "Working on run-id " run-id " pgdb-id " new-run-id ) + (if (not (equal? run-tag "")) + (task:add-run-tag dbh new-run-id run-tag)) + new-run-id) + + (if (equal? state "deleted") + (begin + (debug:print-info 1 *default-log-port* "Warning: Run with id " run-id " was created after previous sync and deleted before the sync") #f) + (if (handle-exceptions + exn + (begin (print-call-chain) + (print ((condition-property-accessor 'exn 'message) exn)) + #f) + + (pgdb:insert-run + dbh + spec-id target run-name state status owner event-time comment fail-count pass-count area-id last-update publish-time)) + (let* ((smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f))) + (if (or (not smallest-time) (< last-update smallest-time)) + (hash-table-set! smallest-last-update-time "smallest-time" last-update)) + (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time)) + #f))))))) + +(define (task:add-run-tag dbh run-id tag) + (let* ((tag-info (pgdb:get-tag-info-by-name dbh tag))) + (if (not tag-info) + (begin + (if (handle-exceptions + exn + (begin + (debug:print-info 1 *default-log-port* ((condition-property-accessor 'exn 'message) exn)) + #f) + (pgdb:insert-tag dbh tag)) + (set! tag-info (pgdb:get-tag-info-by-name dbh tag)) + #f))) + ;;add to area_tags + (handle-exceptions + exn + (begin + (debug:print-info 1 *default-log-port* ((condition-property-accessor 'exn 'message) exn)) + #f) + (if (not (pgdb:is-run-taged-with-a-tag dbh (vector-ref tag-info 0) run-id)) + (pgdb:insert-run-tag dbh (vector-ref tag-info 0) run-id))))) + + +(define (tasks:sync-test-steps dbh cached-info test-step-ids smallest-last-update-time) + ; (print "Sync Steps " test-step-ids ) + (let ((test-ht (hash-table-ref cached-info 'tests)) + (step-ht (hash-table-ref cached-info 'steps))) + (for-each + (lambda (test-step-id) + (let* ((test-step-info (rmt:get-steps-info-by-id test-step-id)) + (step-id (tdb:step-get-id test-step-info)) + (test-id (tdb:step-get-test_id test-step-info)) + (stepname (tdb:step-get-stepname test-step-info)) + (state (tdb:step-get-state test-step-info)) + (status (tdb:step-get-status test-step-info)) + (event_time (tdb:step-get-event_time test-step-info)) + (comment (tdb:step-get-comment test-step-info)) + (logfile (tdb:step-get-logfile test-step-info)) + (last-update (tdb:step-get-last_update test-step-info)) + (pgdb-test-id (hash-table-ref/default test-ht test-id #f)) + (smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f)) + (pgdb-step-id (if pgdb-test-id + (pgdb:get-test-step-id dbh pgdb-test-id stepname state) + #f))) + (if step-id + (begin + (if pgdb-test-id + (begin + (if pgdb-step-id + (begin + (debug:print-info 1 *default-log-port* "Updating existing test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id " pgdb step id " pgdb-step-id ) + (let* ((pgdb-last-update (pgdb:get-test-step-last-update dbh pgdb-step-id))) + (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) + (hash-table-set! smallest-last-update-time "smallest-time" last-update))) + (pgdb:update-test-step dbh pgdb-step-id pgdb-test-id stepname state status event_time comment logfile last-update)) + (begin + (debug:print-info 1 *default-log-port* "Inserting test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id) + (if (or (not smallest-time) (< last-update smallest-time)) + (hash-table-set! smallest-last-update-time "smallest-time" last-update)) + (pgdb:insert-test-step dbh pgdb-test-id stepname state status event_time comment logfile last-update ) + (set! pgdb-step-id (pgdb:get-test-step-id dbh pgdb-test-id stepname state)))) + (hash-table-set! step-ht step-id pgdb-step-id )) + (debug:print-info 1 *default-log-port* "Error: Test not cashed"))) + (debug:print-info 1 *default-log-port* "Error: Could not get test step info for step id " test-step-id )))) ;; this is a wierd senario need to debug + test-step-ids))) + +(define (tasks:sync-test-gen-data dbh cached-info test-data-ids smallest-last-update-time) + (let ((test-ht (hash-table-ref cached-info 'tests)) + (data-ht (hash-table-ref cached-info 'data))) + (for-each + (lambda (test-data-id) + (let* ((test-data-info (rmt:get-data-info-by-id test-data-id)) + (data-id (db:test-data-get-id test-data-info)) + (test-id (db:test-data-get-test_id test-data-info)) + (category (db:test-data-get-category test-data-info)) + (variable (db:test-data-get-variable test-data-info)) + (value (db:test-data-get-value test-data-info)) + (expected (db:test-data-get-expected test-data-info)) + (tol (db:test-data-get-tol test-data-info)) + (units (db:test-data-get-units test-data-info)) + (comment (db:test-data-get-comment test-data-info)) + (status (db:test-data-get-status test-data-info)) + (type (db:test-data-get-type test-data-info)) + (last-update (db:test-data-get-last_update test-data-info)) + (smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f)) + + (pgdb-test-id (hash-table-ref/default test-ht test-id #f)) + (pgdb-data-id (if pgdb-test-id + (pgdb:get-test-data-id dbh pgdb-test-id category variable) + #f))) + (if data-id + (begin + (if pgdb-test-id + (begin + (if pgdb-data-id + (begin + (debug:print-info 1 *default-log-port* "Updating existing test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id " pgdb data id " pgdb-data-id) + (let* ((pgdb-last-update (pgdb:get-test-data-last-update dbh pgdb-data-id))) + (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) + (hash-table-set! smallest-last-update-time "smallest-time" last-update))) + (pgdb:update-test-data dbh pgdb-data-id pgdb-test-id category variable value expected tol units comment status type last-update)) + (begin + (debug:print-info 1 *default-log-port* "Inserting test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id) + (if (handle-exceptions + exn + (begin (print-call-chain) + (print ((condition-property-accessor 'exn 'message) exn)) + #f) + + (pgdb:insert-test-data dbh pgdb-test-id category variable value expected tol units comment status type last-update)) + ;(tasks:run-id->mtpg-run-id dbh cached-info run-id area-info) + (begin + ;(pgdb:insert-test-data dbh pgdb-test-id category variable value expected tol units comment status type ) + (if (or (not smallest-time) (< last-update smallest-time)) + (hash-table-set! smallest-last-update-time "smallest-time" last-update)) + (set! pgdb-data-id (pgdb:get-test-data-id dbh pgdb-test-id category variable))) + #f))) + (hash-table-set! data-ht data-id pgdb-data-id )) + (begin + (debug:print-info 1 *default-log-port* "Error: Test not in pgdb")))) + + (debug:print-info 1 *default-log-port* "Error: Could not get test data info for data id " test-data-id )))) ;; this is a wierd senario need to debug + test-data-ids))) + + + +(define (tasks:sync-tests-data dbh cached-info test-ids area-info smallest-last-update-time) + (let ((test-ht (hash-table-ref cached-info 'tests))) + (for-each + (lambda (test-id) + ; (print test-id) + (let* ((test-info (rmt:get-test-info-by-id #f test-id)) + (run-id (db:test-get-run_id test-info)) ;; look these up in db_records.scm + (test-id (db:test-get-id test-info)) + (test-name (db:test-get-testname test-info)) + (item-path (db:test-get-item-path test-info)) + (state (db:test-get-state test-info)) + (status (db:test-get-status test-info)) + (host (db:test-get-host test-info)) + (pid (db:test-get-process_id test-info)) + (cpuload (db:test-get-cpuload test-info)) + (diskfree (db:test-get-diskfree test-info)) + (uname (db:test-get-uname test-info)) + (run-dir (db:test-get-rundir test-info)) + (log-file (db:test-get-final_logf test-info)) + (run-duration (db:test-get-run_duration test-info)) + (comment (db:test-get-comment test-info)) + (event-time (db:test-get-event_time test-info)) + (archived (db:test-get-archived test-info)) + (last-update (db:test-get-last_update test-info)) + (pgdb-run-id (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time)) + (smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f)) + (pgdb-test-id (if pgdb-run-id + (begin + ;(print pgdb-run-id) + (pgdb:get-test-id dbh pgdb-run-id test-name item-path)) + #f))) + ;; "id" "run_id" "testname" "state" "status" "event_time" + ;; "host" "cpuload" "diskfree" "uname" "rundir" "item_path" + ;; "run_duration" "final_logf" "comment" "shortdir" "attemptnum" "archived" + (if pgdb-run-id + (begin + (if pgdb-test-id ;; have a record + (begin ;; let ((key-name (conc run-id "/" test-name "/" item-path))) + (debug:print-info 0 *default-log-port* "Updating existing test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id " pgdb-test-id " pgdb-test-id) + (let* ((pgdb-last-update (pgdb:get-test-last-update dbh pgdb-test-id))) + (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) ;;if last-update is same as pgdb-last-update then it is safe to assume the records are identical and we can use a larger last update time. + (hash-table-set! smallest-last-update-time "smallest-time" last-update))) + (pgdb:update-test dbh pgdb-test-id pgdb-run-id test-name item-path state status host cpuload diskfree uname run-dir log-file run-duration comment event-time archived last-update pid)) + (begin + (debug:print-info 0 *default-log-port* "Inserting test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id) + (pgdb:insert-test dbh pgdb-run-id test-name item-path state status host cpuload diskfree uname run-dir log-file run-duration comment event-time archived last-update pid) + (if (or (not smallest-time) (< last-update smallest-time)) + (hash-table-set! smallest-last-update-time "smallest-time" last-update)) + (set! pgdb-test-id (pgdb:get-test-id dbh pgdb-run-id test-name item-path)))) + (hash-table-set! test-ht test-id pgdb-test-id)) + (debug:print-info 1 *default-log-port* "WARNING: Skipping run with run-id:" run-id ". This run was created after privious sync and removed before this sync.")))) + test-ids))) + +(define (task:add-area-tag dbh area-info tag) + (let* ((tag-info (pgdb:get-tag-info-by-name dbh tag))) + (if (not tag-info) + (begin + (if (handle-exceptions + exn + (begin + (debug:print-info 1 *default-log-port* ((condition-property-accessor 'exn 'message) exn)) + #f) + (pgdb:insert-tag dbh tag)) + (set! tag-info (pgdb:get-tag-info-by-name dbh tag)) + #f))) + ;;add to area_tags + (handle-exceptions + exn + (begin + (debug:print-info 1 *default-log-port* ((condition-property-accessor 'exn 'message) exn)) + #f) + (if (not (pgdb:is-area-taged-with-a-tag dbh (vector-ref tag-info 0) (vector-ref area-info 0))) + (pgdb:insert-area-tag dbh (vector-ref tag-info 0) (vector-ref area-info 0)))))) + +(define (tasks:sync-run-data dbh cached-info run-ids area-info smallest-last-update-time) + (for-each + (lambda (run-id) + (debug:print-info 1 *default-log-port* "Check if run with " run-id " needs to be synced" ) + (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time)) +run-ids)) + + +;; get runs changed since last sync +;; (define (tasks:sync-test-data dbh cached-info area-info) +;; (let* (( + +(define (tasks:sync-to-postgres configdat dest) + (print "In sync") + (let* ((dbh (pgdb:open configdat dbname: dest)) + (area-info (pgdb:get-area-by-path dbh *toppath*)) + (cached-info (make-hash-table)) + (start (current-seconds)) + (test-patt (if (args:get-arg "-testpatt") + (args:get-arg "-testpatt") + "%")) + (target (if (args:get-arg "-target") + (args:get-arg "-target") + #f)) + (run-name (if (args:get-arg "-runname") + (args:get-arg "-runname") + #f))) + (if (and target (not run-name)) + (begin + (print "Error: Provide runname") + (exit 1))) + (if (and (not target) run-name) + (begin + (print "Error: Provide target") + (exit 1))) + ;(print "123") + ;(exit 1) + (for-each (lambda (dtype) + (hash-table-set! cached-info dtype (make-hash-table))) + '(runs targets tests steps data)) + (hash-table-set! cached-info 'start start) ;; when done we'll set sync times to this + (if area-info + (let* ((last-sync-time (vector-ref area-info 3)) + (smallest-last-update-time (make-hash-table)) + (changed (if (and target run-name) + (rmt:get-run-record-ids target run-name (rmt:get-keys) test-patt) + (rmt:get-changed-record-ids last-sync-time))) + (run-ids (alist-ref 'runs changed)) + (test-ids (alist-ref 'tests changed)) + (test-step-ids (alist-ref 'test_steps changed)) + (test-data-ids (alist-ref 'test_data changed)) + (run-stat-ids (alist-ref 'run_stats changed)) + (area-tag (if (args:get-arg "-area-tag") + (args:get-arg "-area-tag") + (if (args:get-arg "-area") + (args:get-arg "-area") + "")))) + (if (and (equal? area-tag "") (not (pgdb:is-area-taged dbh (vector-ref area-info 0)))) + (set! area-tag *default-area-tag*)) + (if (not (equal? area-tag "")) + (task:add-area-tag dbh area-info area-tag)) + (if (or (not (null? test-ids)) (not (null? run-ids))) + (begin + (debug:print-info 0 *default-log-port* "syncing runs") + (tasks:sync-run-data dbh cached-info run-ids area-info smallest-last-update-time) + (debug:print-info 0 *default-log-port* "syncing tests") + (tasks:sync-tests-data dbh cached-info test-ids area-info smallest-last-update-time) + (debug:print-info 0 *default-log-port* "syncing test steps") + (tasks:sync-test-steps dbh cached-info test-step-ids smallest-last-update-time) + (debug:print-info 0 *default-log-port* "syncing test data") + (tasks:sync-test-gen-data dbh cached-info test-data-ids smallest-last-update-time) + (print "----------done---------------"))) + (let* ((smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f))) + (debug:print-info 0 "smallest-time :" smallest-time " last-sync-time " last-sync-time) + (if (not (and target run-name)) + (if (or (and smallest-time (> smallest-time last-sync-time)) (and smallest-time (eq? last-sync-time 0))) + (pgdb:write-sync-time dbh area-info smallest-time))))) ;;this needs to be changed + (if (tasks:set-area dbh configdat) + (tasks:sync-to-postgres configdat dest) + (begin + (debug:print 0 *default-log-port* "ERROR: unable to create an area record") + #f))))) + +;;====================================================================== +;; L O C K I N G M E C H A N I S M S +;;====================================================================== + +;; faux-lock is deprecated. Please use simple-lock below +;; +(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t)) + (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count + (if (> wait-time 0) + (begin + (thread-sleep! 1) + (if (eq? wait-time 1) ;; only one second left, steal the lock + (begin + (debug:print-info 0 *default-log-port* "stealing lock for " keyname) + (common:faux-unlock keyname force: #t))) + (common:faux-lock keyname wait-time: (- wait-time 1))) + #f) + (begin + (rmt:no-sync-set keyname (conc (current-process-id))) + (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))))) + +(define (common:faux-unlock keyname #!key (force #f)) + (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))) + (begin + (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname)) + #t) + #f)) + +;; simple lock. improve and converge on this one. +;; +(define (common:simple-lock keyname) + (rmt:no-sync-get-lock keyname)) + +(define (common:simple-unlock keyname #!key (force #f)) + (rmt:no-sync-del! keyname)) + +;;====================================================================== +;; db based host calls +;;====================================================================== + +;;====================================================================== +;; T E S T L A U N C H I N G P E R I T E M W I T H H O S T T Y P E S +;;====================================================================== +;; +;; [hosts] +;; arm cubie01 cubie02 +;; x86_64 zeus xena myth01 +;; allhosts #{g hosts arm} #{g hosts x86_64} +;; +;; [host-types] +;; general #MTLOWESTLOAD #{g hosts allhosts} +;; arm #MTLOWESTLOAD #{g hosts arm} +;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo +;; +;; [host-rules] +;; # maxnload => max normalized load +;; # maxnjobs => max jobs per cpu +;; # maxjobrate => max jobs per second +;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 +;; +;; [launchers] +;; envsetup general +;; xor/%/n 4C16G +;; % nbgeneral +;; +;; [jobtools] +;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. +;; flexi-launcher yes +;; launcher nbfake +;; +(define (common:get-launcher configdat testname itempath) + (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) + (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher + (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no"))) + (let* ((launchers (hash-table-ref/default configdat "launchers" '()))) + (if (null? launchers) + fallback-launcher + (let loop ((hed (car launchers)) + (tal (cdr launchers))) + (let ((patt (car hed)) + (host-type (cadr hed))) + (if (tests:match patt testname itempath) + (begin + (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) + (let ((launcher (configf:lookup configdat "host-types" host-type))) + (if launcher + (let* ((launcher-parts (string-split launcher)) + (launcher-exe (car launcher-parts))) + (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline + (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) + (count 100)) + (if targ-host + (conc "remrun " targ-host) + (if (> count 0) + (begin + (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) + (thread-sleep! (- 101 count)) + (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) + (- count 1))) + (begin + (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) + (exit))))) + launcher)) + (begin + (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) + (if (null? tal) + fallback-launcher + (loop (car tal)(cdr tal))))))) + ;; no match, try again + (if (null? tal) + fallback-launcher + (loop (car tal)(cdr tal)))))))) + fallback-launcher))) + +;; ideally put all this info into the db, no need to preserve it across moving homehost +;; +;; return list of +;; ( reachable? cpuload update-time ) +(define (common:get-host-info hostname) + (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data + (load (car loadinfo)) + (load-sample-time (cdr loadinfo)) + (load-sample-age (- (current-seconds) load-sample-time)) + (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds + (host-last-update-timeout-seconds 4) + (host-rec (hash-table-ref/default *host-loads* hostname #f)) + ) + (cond + ((< load-sample-age loadinfo-timeout-seconds) + (list #t + load-sample-time + load)) + ((and host-rec + (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds))) + (list #t + (host-last-update host-rec) + (host-last-cpuload host-rec ))) + ((common:unix-ping hostname) + (list #t + (current-seconds) + (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds + (else + (list #f 0 -1) ;; bad host, don't use! + )))) + +;; see defstruct host at top of file. +;; host: reachable last-update last-used last-cpuload +;; +(define (common:update-host-loads-table hosts-raw) + (let* ((hosts (filter (lambda (x) + (string-match (regexp "^\\S+$") x)) + hosts-raw))) + (for-each + (lambda (hostname) + (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) + (if h + h + (let ((h (make-host))) + (hash-table-set! *host-loads* hostname h) + h)))) + (host-info (common:get-host-info hostname)) + (is-reachable (car host-info)) + (last-reached-time (cadr host-info)) + (load (caddr host-info))) + (host-reachable-set! rec is-reachable) + (host-last-update-set! rec last-reached-time) + (host-last-cpuload-set! rec load))) + hosts))) + +;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the +;; [host-rules] section. +;; +(define (common:get-least-loaded-host hosts-raw host-type configdat) + (let* ((rdat (configf:lookup configdat "host-rules" host-type)) + (rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate + (maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load + (maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs + (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second + (hosts (filter (lambda (x) + (string-match (regexp "^\\S+$") x)) + hosts-raw)) + ;; (best-host #f) + (get-rec (lambda (hostname) + ;; (print "get-rec hostname=" hostname) + (let ((h (hash-table-ref/default *host-loads* hostname #f))) + (if h + h + (let ((h (make-host))) + (hash-table-set! *host-loads* hostname h) + h))))) + (best-load 99999) + (curr-time (current-seconds)) + (get-hosts-sorted (lambda (hosts) + (sort hosts (lambda (a b) + (let ((a-rec (get-rec a)) + (b-rec (get-rec b))) + ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec)) + ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec)) + (< (host-last-used a-rec) + (host-last-used b-rec)))))))) + (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts)) + (if (null? hosts) + #f ;; no hosts to select from. All done and giving up now. + (let ((hosts-sorted (get-hosts-sorted hosts))) + (common:update-host-loads-table hosts) + (let loop ((hostname (car hosts-sorted)) + (tal (cdr hosts-sorted)) + (best-host #f)) + (let* ((rec (get-rec hostname)) + (reachable (host-reachable rec)) + (load (host-last-cpuload rec)) + (last-used (host-last-used rec)) + (delta (- curr-time last-used)) + (job-rate (if (> delta 0) + (/ 1 delta) + 999)) ;; jobs per second + (new-best + (cond + ((not reachable) + (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.") + best-host) + ((and (< load maxnload) ;; load is acceptable + (< job-rate maxjobrate)) ;; job rate is acceptable + (set! best-load load) + hostname) + (else best-host)))) + (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." ) + (if new-best + (begin ;; found a host, return it + (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) + (host-last-used-set! rec curr-time) + new-best) + (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) + +(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)) + (let* ((loadavg (common:get-cpu-load remote-host)) + (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again + (common:get-num-cpus remote-host) + numcpus-in)) + (maxload (if force-maxload + maxload-in + (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? + (first (car loadavg)) + (next (cadr loadavg)) + (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 + (loadjmp (- first next)) + (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously + (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload + ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp) + (cond + ((and (> first adjload) + (> count 0)) + (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) + (thread-sleep! adjwait) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) + ((and (> loadjmp numcpus) + (> count 0)) + (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) + (thread-sleep! adjwait) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) + +(define (common:wait-for-homehost-load maxload msg) + (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. + #f + (common:get-homehost))) + (hh (if hh-dat (car hh-dat) #f)) + (numcpus (common:get-num-cpus hh))) + (common:wait-for-normalized-load maxload msg hh))) + +(define (common:get-num-cpus remote-host) + (let* ((actual-host (or remote-host (get-host-name)))) + (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often! + (let* ((proc (lambda () + (let loop ((numcpu 0) + (inl (read-line))) + (if (eof-object? inl) + (begin + (common:write-cached-info remote-host "num-cpus" numcpu) + numcpu) + (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl) + (+ numcpu 1) + numcpu) + (read-line)))))) + (result (if remote-host + (with-input-from-pipe + (conc "ssh " remote-host " cat /proc/cpuinfo") + proc) + (with-input-from-file "/proc/cpuinfo" proc)))) + (common:write-cached-info actual-host "num-cpus" result) + result)))) + +;; wait for normalized cpu load to drop below maxload +;; +(define (common:wait-for-normalized-load maxload msg remote-host) + (let ((num-cpus (common:get-num-cpus remote-host))) + (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host))) + +;;====================================================================== +;; D E B U G G I N G S T U F F +;;====================================================================== + +;; (define *verbosity* 1) +;; (define *logging* #f) + +(define (common:set-last-run-version) + (rmt:set-var "MEGATEST_VERSION" (common:version-signature))) + +;; postive number if megatest version > db version +;; negative number if megatest version < db version +(define (common:version-db-delta) + (- megatest-version (common:get-last-run-version-number))) + +(define (common:version-changed?) + (not (equal? (common:get-last-run-version) + (common:version-signature)))) + +;; from metadat lookup MEGATEST_VERSION +;; +(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB + (rmt:get-var "MEGATEST_VERSION")) + +(define (common:get-last-run-version-number) + (string->number + (substring (common:get-last-run-version) 0 6))) + +(define (common:api-changed?) + (not (equal? (substring (->string megatest-version) 0 4) + (substring (conc (common:get-last-run-version)) 0 4)))) + +;; '(print (string-intersperse (map cadr (hash-table-ref/default (read-config "megatest.config" \#f \#t) "disks" '"'"'("none" ""))) "\n"))' +(define (common:get-disks #!key (configf #f)) + (hash-table-ref/default + (or configf (configf:read-config "megatest.config" #f #t)) + "disks" '("none" ""))) + +;;====================================================================== +;; watchdog and exit procedures +;;====================================================================== + +;;====================================================================== +;; E X I T H A N D L I N G +;;====================================================================== + +;; (let ((ohh (common:on-homehost?)) +;; (srv (args:get-arg "-server"))) +;; (and ohh srv))) + ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv) + +(define *watchdog* (make-thread + (lambda () + (handle-exceptions + exn + (begin + (print-call-chain) + (print " message: " ((condition-property-accessor 'exn 'message) exn))) + (common:watchdog))) + "Watchdog thread")) + +;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp +;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) +;; +(define (common:readonly-watchdog dbstruct) + (thread-sleep! 0.05) ;; delay for startup + (debug:print-info 13 *default-log-port* "common:readonly-watchdog entered.") + ;; sync megatest.db to /tmp/.../megatst.db + (let* ((sync-cool-off-duration 3) + (golden-mtdb (dbr:dbstruct-mtdb dbstruct)) + (golden-mtpath (db:dbdat-get-path golden-mtdb)) + (tmp-mtdb (dbr:dbstruct-tmpdb dbstruct)) + (tmp-mtpath (db:dbdat-get-path tmp-mtdb))) + (debug:print-info 0 *default-log-port* "Read-only periodic sync thread started.") + (let loop ((last-sync-time 0)) + (debug:print-info 13 *default-log-port* "loop top tmp-mtpath="tmp-mtpath" golden-mtpath="golden-mtpath) + (let* ((duration-since-last-sync (- (current-seconds) last-sync-time))) + (debug:print-info 13 *default-log-port* "duration-since-last-sync="duration-since-last-sync) + (if (and (not *time-to-exit*) + (< duration-since-last-sync sync-cool-off-duration)) + (thread-sleep! (- sync-cool-off-duration duration-since-last-sync))) + (if (not *time-to-exit*) + (let ((golden-mtdb-mtime (file-modification-time golden-mtpath)) + (tmp-mtdb-mtime (file-modification-time tmp-mtpath))) + (if (> golden-mtdb-mtime tmp-mtdb-mtime) + (if (< golden-mtdb-mtime (- (current-seconds) 3)) ;; file has NOT been touched in past three seconds, this way multiple servers won't fight to sync back + (let ((res (db:multi-db-sync dbstruct 'old2new))) + (debug:print-info 13 *default-log-port* "rosync called, " res " records transferred.")))) + (loop (current-seconds))) + #t))) + (debug:print-info 0 *default-log-port* "Exiting readonly-watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" mtpath="golden-mtpath))) + + +;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage +(define (common:watchdog) + (debug:print-info 13 *default-log-port* "common:watchdog entered.") + (if (launch:setup) + (if (common:on-homehost?) + (let ((dbstruct (db:setup #t))) + (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct) + (cond + ((dbr:dbstruct-read-only dbstruct) + (debug:print-info 13 *default-log-port* "loading read-only watchdog") + (common:readonly-watchdog dbstruct)) + (else + (debug:print-info 13 *default-log-port* "loading writable-watchdog.") + (let* ((syncer (or (configf:lookup *configdat* "server" "sync-method") "brute-force-sync"))) + (cond + ((equal? syncer "brute-force-sync") + (server:writable-watchdog-bruteforce dbstruct)) + ((equal? syncer "delta-sync") + (server:writable-watchdog-deltasync dbstruct)) + (else + (debug:print-error 0 *default-log-port* "Unknown server/sync-method specified ("syncer") - valid values are brute-force-sync and delta-sync.") + (exit 1))) + ;;(debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] Syncer started (method="syncer")") + ))) + (debug:print-info 13 *default-log-port* "watchdog done.")) + (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost")))) + + +(define (std-exit-procedure) + ;;(common:telemetry-log-close) + (on-exit (lambda () 0)) + ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) + (let ((no-hurry (if *time-to-exit* ;; hurry up + #f + (begin + (set! *time-to-exit* #t) + #t)))) + (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.") + (if (and no-hurry (debug:debug-mode 18)) + (rmt:print-db-stats)) + (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds + (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated + (if *task-db* + (let ((db (cdr *task-db*))) + (if (sqlite3:database? db) + (begin + (sqlite3:interrupt! db) + (sqlite3:finalize! db #t) + ;; (vector-set! *task-db* 0 #f) + (set! *task-db* #f))))) + #;(http-client#close-all-connections!) + ;; (if (and *runremote* + ;; (remote-conndat *runremote*)) + ;; (begin + ;; (http-client#close-all-connections!))) ;; for http-client + (if (not (eq? *default-log-port* (current-error-port))) + (close-output-port *default-log-port*)) + (set! *default-log-port* (current-error-port))) "Cleanup db exit thread")) + (th2 (make-thread (lambda () + (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...") + (if no-hurry + (begin + (thread-sleep! 5)) ;; give the clean up few seconds to do it's stuff + (begin + (thread-sleep! 2))) + (debug:print 4 *default-log-port* " ... done") + ) + "clean exit"))) + (thread-start! th1) + (thread-start! th2) + (thread-join! th1) + ) + ) + + 0) + +(define (std-signal-handler signum) + ;; (signal-mask! signum) + (set! *time-to-exit* #t) + ;;(debug:print-info 13 *default-log-port* "got signal "signum) + (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly") + ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway + (exit)) + +(define (special-signal-handler signum) + ;; (signal-mask! signum) + (set! *time-to-exit* #t) + ;;(debug:print-info 13 *default-log-port* "got signal "signum) + (debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting!!") + ;;TODO send email to notify admin contact listed in the config that the lisner got killed + ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway + (exit)) + + +(set-signal-handler! signal/int std-signal-handler) ;; ^C +(set-signal-handler! signal/term std-signal-handler) + +;; (set-signal-handler! signal/stop std-signal-handler) ;; ^Z NO, do NOT handle ^Z! + +;; Force a megatest cleanup-db if version is changed and skip-version-check not specified +;; Do NOT check if not on homehost! +;; +(define (common:exit-on-version-changed) + (if (common:on-homehost?) + (if (common:api-changed?) + (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) + (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.db")) + (read-only (not (file-write-access? dbfile))) + (dbstruct (db:setup #t))) + (debug:print 0 *default-log-port* + "WARNING: Version mismatch!\n" + " expected: " (common:version-signature) "\n" + " got: " (common:get-last-run-version)) + (cond + ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t) + ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only) + (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db + (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "Failed to switch versions.") + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (print-call-chain (current-error-port)) + (exit 1)) + (common:cleanup-db dbstruct))) + ((not (common:file-exists? mtconf)) + (debug:print 0 *default-log-port* " megatest.config does not exist in this area. Cannot proceed with megatest version migration.") + (exit 1)) + ((not (common:file-exists? dbfile)) + (debug:print 0 *default-log-port* " megatest.db does not exist in this area. Cannot proceed with megatest version migration.") + (exit 1)) + ((not (eq? (current-user-id)(file-owner mtconf))) + (debug:print 0 *default-log-port* " You do not own megatest.db in this area. Cannot proceed with megatest version migration.") + (exit 1)) + (read-only + (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") + (exit 1)) + (else + (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"") + (exit 1))))))) +;; (begin +;; (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") +;; (exit 1)))) + +;; Move me elsewhere ... +;; RADT => Why do we meed the version check here, this is called only if version misma +;; +(define (common:cleanup-db dbstruct #!key (full #f)) + (apply db:multi-db-sync + dbstruct + 'schema + ;; 'new2old + 'killservers + 'adj-target + ;; 'old2new + 'new2old + ;; (if full + '(dejunk) + ;; '()) + ) + (if (common:api-changed?) + (common:set-last-run-version))) + +;; called in megatest.scm, host-port is string hostname:port +;; +;; NOTE: This is NOT called directly from clients as not all transports support a client running +;; in the same process as the server. +;; +#;(define (server:ping host-port-in #!key (do-exit #f)) + (let ((host:port (if (not host-port-in) ;; use read-dotserver to find + #f ;; (server:check-if-running *toppath*) + ;; (if (number? host-port-in) ;; we were handed a server-id + ;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) + ;; ;; (print "srec: " srec " host-port-in: " host-port-in) + ;; (if srec + ;; (conc (vector-ref srec 3) ":" (vector-ref srec 4)) + ;; (conc "no such server-id " host-port-in))) + host-port-in))) ;; ) + (let* ((host-port (if host:port + (let ((slst (string-split host:port ":"))) + (if (eq? (length slst) 2) + (list (car slst)(string->number (cadr slst))) + #f)) + #f))) +;; (toppath (launch:setup))) + ;; (print "host-port=" host-port) + (if (not host-port) + (begin + (if host-port-in + (debug:print 0 *default-log-port* "ERROR: bad host:port")) + (if do-exit (exit 1)) + #f) + (let* ((iface (car host-port)) + (port (cadr host-port)) + #;(server-dat (http-transport:client-connect iface port)) + (login-res (rmt:login-no-auto-client-setup server-dat))) + (if (and (list? login-res) + (car login-res)) + (begin + ;; (print "LOGIN_OK") + (if do-exit (exit 0)) + #t) + (begin + ;; (print "LOGIN_FAILED") + (if do-exit (exit 1)) + #f))))))) + +;; run ping in separate process, safest way in some cases +;; +(define (server:ping-server ifaceport) + (with-input-from-pipe + (conc (common:get-megatest-exe) " -ping " ifaceport) + (lambda () + (let loop ((inl (read-line)) + (res "NOREPLY")) + (if (eof-object? inl) + (case (string->symbol res) + ((NOREPLY) #f) + ((LOGIN_OK) #t) + (else #f)) + (loop (read-line) inl)))))) + +;; ping the given server +;; +#;(define (server:check-server server-record) + (let* ((server-url (server:record->url server-record)) + (res (case *transport-type* + ((http)(server:ping server-url)) + ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) + ))) + (if res + server-url + #f))) + +;; no longer care if multiple servers are started by accident. older servers will drop off in time. +;; +#;(define (server:check-if-running areapath) ;; #!key (numservers "2")) + (let* ((ns (server:get-num-servers)) + (servers (server:get-best (server:get-list areapath)))) + ;; (print "servers: " servers " ns: " ns) + (if (or (and servers + (null? servers)) + (not servers) + (and (list? servers) + (< (length servers) (random ns)))) ;; somewhere between 0 and numservers + #f + (let loop ((hed (car servers)) + (tal (cdr servers))) + (let ((res (server:check-server hed))) + (if res + res + (if (null? tal) + #f + (loop (car tal)(cdr tal))))))))) + + +;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). +;; +(define (server:login toppath) + (lambda (toppath) + (set! *db-last-access* (current-seconds)) ;; might not be needed. + (if (equal? *toppath* toppath) + #t + #f))) +;; Given a run id start a server process ### NOTE ### > file 2>&1 +;; if the run-id is zero and the target-host is set +;; try running on that host +;; incidental: rotate logs in logs/ dir. +;; +#;(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area + (let* ((curr-host (get-host-name)) + ;; (attempt-in-progress (server:start-attempted? areapath)) + ;; (dot-server-url (server:check-if-running areapath)) + (curr-ip (server:get-best-guess-address curr-host)) + (curr-pid (current-process-id)) + (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) + (target-host (car homehost)) + (testsuite (common:get-testsuite-name)) + (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) + (cmdln (conc (common:get-megatest-exe) + " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") + " -daemonize " + "") + ;; " -log " logfile + " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) + (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) + (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) + ;; we want the remote server to start in *toppath* so push there + (push-directory areapath) + (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") + (thread-start! log-rotate) + + ;; host.domain.tld match host? + (if (and target-host + ;; look at target host, is it host.domain.tld or ip address and does it + ;; match current ip or hostname + (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) + (not (equal? curr-ip target-host))) + (begin + (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) + (setenv "TARGETHOST" target-host))) + + (setenv "TARGETHOST_LOGF" logfile) + (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time + (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever + (system (conc "nbfake " cmdln)) + (unsetenv "TARGETHOST_LOGF") + (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) + (thread-join! log-rotate) + (pop-directory))) + +;; kind start up of servers, wait 40 seconds before allowing another server for a given +;; run-id to be launched +#;(define (server:kind-run areapath) + (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? + (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun + (call-num (car last-run-dat)) + (when-run (cadr last-run-dat)) + (run-delay (+ (case call-num + ((0) 0) + ((1) 20) + ((2) 300) + (else 600)) + (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously + (lock-file (conc areapath "/logs/server-start.lock"))) + (if (> (- (current-seconds) when-run) run-delay) + (begin + (common:simple-file-lock-and-wait lock-file expire-time: 15) + (server:run areapath) + (thread-sleep! 2) ;; don't release the lock for at least a few seconds + (common:simple-file-release-lock lock-file))) + (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) + +#;(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. + +#;(define (server:start-and-wait areapath #!key (timeout 60)) + (let ((give-up-time (+ (current-seconds) timeout))) + (let loop ((server-url (server:check-if-running areapath)) + (try-num 0)) + (if (or server-url + (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. + server-url + (let ((num-ok (length (server:get-best (server:get-list areapath))))) + (if (and (> try-num 0) ;; first time through simply wait a little while then try again + (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one + (server:kind-run areapath)) + (thread-sleep! 5) + (loop (server:check-if-running areapath) + (+ try-num 1))))))) + +;;====================================================================== +;; make html output +;;====================================================================== + +(define (tests:test-set-toplog! run-id test-name logf) + (rmt:general-call 'tests:test-set-toplog run-id logf run-id test-name)) + +(define (tests:summarize-items run-id test-id test-name force) + ;; if not force then only update the record if one of these is true: + ;; 1. logf is "log/final.log + ;; 2. logf is same as outputfilename + (let* ((outputfilename (conc "megatest-rollup-" test-name ".html")) + (orig-dir (current-directory)) + (logf-info (rmt:test-get-logfile-info run-id test-name)) + (logf (if logf-info (cadr logf-info) #f)) + (path (if logf-info (car logf-info) #f))) + ;; This query finds the path and changes the directory to it for the test + (if (and (string? path) + (directory? path)) ;; can get #f here under some wierd conditions. why, unknown ... + (begin + (debug:print 4 *default-log-port* "Found path: " path) + (change-directory path)) + ;; (set! outputfilename (conc path "/" outputfilename))) + (debug:print-error 0 *default-log-port* "summarize-items for run-id=" run-id ", test-name=" test-name ", no such path: " path)) + (debug:print 4 *default-log-port* "summarize-items with logf " logf ", outputfilename " outputfilename " and force " force) + (if (or (equal? logf "logs/final.log") + (equal? logf outputfilename) + force) + (let ((my-start-time (current-seconds)) + (lockf (conc outputfilename ".lock"))) + (let loop ((have-lock (common:simple-file-lock lockf))) + (if have-lock + (let ((script (configf:lookup *configdat* "testrollup" test-name))) + (print "Obtained lock for " outputfilename) + (rmt:set-state-status-and-roll-up-items run-id test-name "" #f #f #f) + (if script + (system (conc script " > " outputfilename " & ")) + (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename)) + (common:simple-file-release-lock lockf) + (change-directory orig-dir) + ;; NB// tests:test-set-toplog! is remote internal... + (tests:test-set-toplog! run-id test-name outputfilename)) + ;; didn't get the lock, check to see if current update started later than this + ;; update, if so we can exit without doing any work + (if (> my-start-time (handle-exceptions + exn + 0 + (file-modification-time lockf))) + ;; we started since current re-gen in flight, delay a little and try again + (begin + (debug:print-info 1 *default-log-port* "Waiting to update " outputfilename ", another test currently updating it") + (thread-sleep! (+ 5 (random 5))) ;; delay between 5 and 10 seconds + (loop (common:simple-file-lock lockf)))))))))) + +(define (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename) + (let ((counts (make-hash-table)) + (statecounts (make-hash-table)) + (outtxt "") + (tot 0) + (testdat (rmt:test-get-records-for-index-file run-id test-name))) + (with-output-to-file outputfilename + (lambda () + (set! outtxt (conc outtxt "Summary: " test-name + "

Summary for " test-name "

")) + (for-each + (lambda (testrecord) + (let ((id (vector-ref testrecord 0)) + (itempath (vector-ref testrecord 1)) + (state (vector-ref testrecord 2)) + (status (vector-ref testrecord 3)) + (run_duration (vector-ref testrecord 4)) + (logf (vector-ref testrecord 5)) + (comment (vector-ref testrecord 6))) + (hash-table-set! counts status (+ 1 (hash-table-ref/default counts status 0))) + (hash-table-set! statecounts state (+ 1 (hash-table-ref/default statecounts state 0))) + (set! outtxt (conc outtxt "" + ;; " " itempath "" + " " itempath "" + "" state "" + "" status "" + "" (if (equal? comment "") + " " + comment) "" + "")))) + (if (list? testdat) + testdat + (begin + (print "ERROR: failed to get records with rmt:test-get-records-for-index-file run-id=" run-id "test-name=" test-name) + '()))) + + (print "
") + ;; Print out stats for status + (set! tot 0) + (print "") + (for-each (lambda (state) + (set! tot (+ tot (hash-table-ref statecounts state))) + (print "")) + (hash-table-keys statecounts)) + (print "

State stats

" state "" (hash-table-ref statecounts state) "
Total" tot "
") + (print "
") + ;; Print out stats for state + (set! tot 0) + (print "") + (for-each (lambda (status) + (set! tot (+ tot (hash-table-ref counts status))) + (print "")) + (hash-table-keys counts)) + (print "

Status stats

" status + "" (hash-table-ref counts status) "
Total" tot "
") + (print "
") + + (print "" + "" + outtxt "
ItemStateStatusComment
") + ;; (release-dot-lock outputfilename) + ;;(rmt:update-run-stats + ;; run-id + ;; (hash-table-map + ;; state-status-counts + ;; (lambda (key val) + ;; (append key (list val))))) + )))) + +(define tests:css-jscript-block +#< +ul.LinkedList { display: block; } +/* ul.LinkedList ul { display: none; } */ +.HandCursorStyle { cursor: pointer; cursor: hand; } /* For IE */ +th {background-color: #8c8c8c;} +td.test {background-color: #d9dbdd;} +td.PASS {background-color: #347533;} +td.FAIL {background-color: #cc2812;} +td.SKIP{background-color: #FFD733;} +td.WARN {background-color: #EA8724;} +td.WAIVED {background-color: #838A12;} +td.ABORT{background-color: #EA24B7;} +.PASS .link, .SKIP .link, .WARN .link,.WAIVED .link,.ABORT .link, .FAIL .link{color: #FFFFFF;} + + + + + + +EOF +) + +(define tests:css-jscript-block-dynamic +#< +EOF +) + +(define (test:js-block javascript-lib) + (conc "" )) + + +(define tests:css-jscript-block-static (test:js-block *java-script-lib*)) + +(define (tests:css-jscript-block-cond dynamic) + (if (equal? dynamic #t) + tests:css-jscript-block-dynamic + tests:css-jscript-block-static)) + + +(define (tests:run-record->test-path run numkeys) + (append (take (vector->list run) numkeys) + (list (vector-ref run (+ 1 numkeys))))) + + +(define (tests:get-rest-data runs header numkeys) + (let ((resh (make-hash-table))) + (for-each + (lambda (run) + (let* ((run-id (db:get-value-by-header run header "id")) + (run-dir (tests:run-record->test-path run numkeys)) + (test-data (rmt:get-tests-for-run + run-id + "%" ;; testnamepatt + '() ;; states + '() ;; statuses + #f ;; offset + #f ;; num-to-get + #f ;; hide/not-hide + #f ;; sort-by + #f ;; sort-order + #f ;; 'shortlist ;; qrytype + 0 ;; last update + #f))) + + (map (lambda (test) + (let* ((test-name (vector-ref test 2)) + (test-html-path (conc (vector-ref test 10) "/" (vector-ref test 13))) + (test-item (conc test-name ":" (vector-ref test 11))) + (test-status (vector-ref test 4))) + + (if (not (hash-table-ref/default resh test-name #f)) + (hash-table-set! resh test-name (make-hash-table))) + (if (not (hash-table-ref/default (hash-table-ref/default resh test-name #f) test-item #f)) + (hash-table-set! (hash-table-ref/default resh test-name #f) test-item (make-hash-table))) + (hash-table-set! (hash-table-ref/default (hash-table-ref/default resh test-name #f) test-item #f) run-id (list test-status test-html-path)))) + test-data))) + runs) + resh)) + + +;; hash-table tree to html list tree +;; +;; tipfunc takes two parameters: y the tip value and path the path to that point +;; +(define (common:htree->html ht path tipfunc) + (let ((datlist (sort (hash-table->alist ht) + (lambda (a b) + (string< (car a)(car b)))))) + (if (null? datlist) + (tipfunc #f path) ;; really shouldn't get here + (s:ul + (map (lambda (x) + (let* ((levelname (car x)) + (y (cdr x)) + (newpath (append path (list levelname))) + (leaf (or (not (hash-table? y)) + (null? (hash-table-keys y))))) + (if leaf + (s:li (tipfunc y newpath)) + (s:li + (list + levelname + (common:htree->html y newpath tipfunc)))))) + datlist))))) + + +;; tests:genrate dashboard body +;; + +(define (tests:dashboard-body page pg-size keys numkeys total-runs linktree area-name get-prev-links get-next-links flag run-patt target-patt) + (let* ((start (* page pg-size)) + ;(runsdat (rmt:get-runs "%" pg-size start (map (lambda (x)(list x "%")) keys))) + (runsdat (rmt:get-runs-by-patt keys run-patt target-patt start pg-size #f 0 sort-order: "desc")) + ; db:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-update + (header (vector-ref runsdat 0)) + (runs (vector-ref runsdat 1)) + (ctr 0) + (test-runs-hash (tests:get-rest-data runs header numkeys)) + (test-list (hash-table-keys test-runs-hash))) + + (s:html tests:css-jscript-block (tests:css-jscript-block-cond flag) + (s:title "Summary for " area-name) + (s:body 'onload "addEvents();" + (get-prev-links page linktree) + (get-next-links page linktree total-runs) + + (s:h1 "Summary for " area-name) + (s:h3 "Filter" ) + (s:input 'type "text" 'name "testname" 'id "testname" 'length "30" 'onkeyup "filtersome()") + ;; top list + + (s:table 'id "LinkedList1" 'border "1" 'cellspacing 0 + (map (lambda (key) + (let* ((res (s:tr 'class "something" + (s:th key ) + (map (lambda (run) + (s:th (vector-ref run ctr))) + runs)))) + (set! ctr (+ ctr 1)) + res)) + keys) + (s:tr + (s:th "Run Name") + (map (lambda (run) + (s:th (db:get-value-by-header run header "runname"))) + runs)) + + (map (lambda (test-name) + (let* ((item-hash (hash-table-ref/default test-runs-hash test-name #f)) + (item-keys (sort (hash-table-keys item-hash) string<=?))) + (map (lambda (item-name) + (let* ((res (s:tr 'class item-name + (s:td item-name 'class "test" ) + (map (lambda (run) + (let* ((run-test (hash-table-ref/default item-hash item-name #f)) + (run-id (db:get-value-by-header run header "id")) + (result (hash-table-ref/default run-test run-id "n/a")) + ;(relative-path (get-relative-path)) + (status (if (string? result) + result + (car result))) + (link (if (string? result) + result + (if (equal? flag #t) + (s:a (car result) 'href (conc "./test_log?runid=" run-id "&testname=" item-name )) + (s:a (car result) 'href (string-substitute (conc linktree "/") "" (cadr result) "-")))))) + (s:td link 'class status))) + runs)))) + res)) + item-keys))) + test-list)))))) + +;; (tests:create-html-tree "test-index.html") +;; +(define (tests:create-html-tree outf) + (let* ((lockfile (conc outf ".lock")) + (runs-to-process '()) + (linktree (common:get-linktree)) + (area-name (common:get-testsuite-name)) + (keys (rmt:get-keys)) + (numkeys (length keys)) + (run-patt (or (args:get-arg "-run-patt") + (args:get-arg "-runname") + "%")) + (target (or (args:get-arg "-target-patt") + (args:get-arg "-target") + "%")) + (targlist (string-split target "/")) + (numtarg (length targlist)) + (targtweaked (if (> numkeys numtarg) + (append targlist (make-list (- numkeys numtarg) "%")) + targlist)) + (target-patt (string-join targtweaked "/")) + ;(total-runs (rmt:get-num-runs "%")) ;;this needs to be changed to filter by target + (total-runs (rmt:get-runs-cnt-by-patt run-patt target-patt keys )) + (pg-size 10)) + (if (common:simple-file-lock lockfile) + (begin + ;(print total-runs) + (let loop ((page 0)) + (let* ((oup (open-output-file (or outf (conc linktree "/page" page ".html")))) + (get-prev-links (lambda (page linktree ) + (let* ((link (if (not (eq? page 0)) + (s:a "<<prev" 'href (conc "page" (- page 1) ".html")) + (s:a "" 'href (conc "page" page ".html"))))) + link))) + (get-next-links (lambda (page linktree total-runs) + (let* ((link (if (> total-runs (+ 10 (* page pg-size))) + (s:a "next>>" 'href (conc "page" (+ page 1) ".html")) + (s:a "" 'href (conc "page" page ".html"))))) + link))) ) + (print "total runs: " total-runs) + (s:output-new + oup + (tests:dashboard-body page pg-size keys numkeys total-runs linktree area-name get-prev-links get-next-links #f run-patt target-patt)) ;; update this function + (close-output-port oup) + ; (set! page (+ 1 page)) + (if (> total-runs (* (+ 1 page) pg-size)) + (loop (+ 1 page))))) + (common:simple-file-release-lock lockfile)) + + #f))) + + +(define (tests:readlines filename) + (call-with-input-file filename + (lambda (p) + (let loop ((line (read-line p)) + (result '())) + (if (eof-object? line) + (reverse result) + (loop (read-line p) (cons line result))))))) + +(define (tests:get-test-log run-id test-name item-name) + (let* ((test-data (rmt:get-tests-for-run + (string->number run-id) + test-name ;; testnamepatt + '() ;; states + '() ;; statuses + #f ;; offset + #f ;; num-to-get + #f ;; hide/not-hide + #f ;; sort-by + #f ;; sort-order + #f ;; 'shortlist ;; qrytype + 0 ;; last update + #f)) + (path "") + (found 0)) + (debug:print-info 0 *default-log-port* "found: " found ) + + (let loop ((hed (car test-data)) + (tal (cdr test-data))) + (debug:print-info 0 *default-log-port* "item: " (vector-ref hed 11) (vector-ref hed 10) "/" (vector-ref hed 13)) + + (if (equal? (vector-ref hed 11) item-name) + (begin + (set! found 1) + (set! path (conc (vector-ref hed 10) "/" (vector-ref hed 13))))) + (if (and (not (null? tal)) (equal? found 0)) + (loop (car tal)(cdr tal)))) + (if (equal? path "") + "

Data not found

" + (string-join (tests:readlines path) "\n")))) + + +(define (tests:dynamic-dboard page) +;(define (tests:create-html-tree o) + (let* ( +;(page "1") + (linktree (common:get-linktree)) + (area-name (common:get-testsuite-name)) + (keys (rmt:get-keys)) + (numkeys (length keys)) + (targtweaked (make-list numkeys "%")) + (target-patt (string-join targtweaked "/")) + (total-runs (rmt:get-num-runs "%")) + (pg-size 10) + (pg (if (equal? page #f) + 0 + (- (string->number page) 1))) + (get-prev-links (lambda (pg linktree) + (debug:print-info 0 *default-log-port* "val: " (- 1 pg)) + (let* ((link (if (not (eq? pg 0)) + (s:a "<<prev " 'href (conc "dashboard?page=" pg )) + (s:a "" 'href (conc "dashboard?page=" pg))))) + link))) + (get-next-links (lambda (pg linktree total-runs) + (debug:print-info 0 *default-log-port* "val: " pg) + (debug:print-info 0 *default-log-port* "val: " total-runs " size" pg-size) + + (let* ((link (if (> total-runs (+ 10 (* pg pg-size))) + (s:a "next>> " 'href (conc "dashboard?page=" (+ pg 2) )) + (s:a "" 'href (conc "dashboard?page=" pg ))))) + link))) + (html-body (tests:dashboard-body pg pg-size keys numkeys total-runs linktree area-name get-prev-links get-next-links #t "%" target-patt))) ;; update tis function + html-body)) + +(define (tests:create-html-summary outf) + (let* ((lockfile (conc outf ".lock")) + (linktree (common:get-linktree)) + (keys (rmt:get-keys)) + (area-name (common:get-testsuite-name)) + (run-patt (or (args:get-arg "-run-patt") + (args:get-arg "-runname") + "%")) + (target (or (args:get-arg "-target-patt") + (args:get-arg "-target") + "%")) + (targlist (string-split target "/")) + (numkeys (length keys)) + (numtarg (length targlist)) + (targtweaked (if (> numkeys numtarg) + (append targlist (make-list (- numkeys numtarg) "%")) + targlist)) + (target-patt (string-join targtweaked "/"))) + (if (common:simple-file-lock lockfile) + (begin + (let* (;(runsdat1 (rmt:get-runs run-patt #f #f (map (lambda (x)(list x "%")) keys))) + (runsdat (rmt:get-runs-by-patt keys run-patt target-patt #f #f #f 0)) + (runs (vector-ref runsdat 1)) + (header (vector-ref runsdat 0)) + (oup (open-output-file (or outf (conc linktree "/targets.html")))) + (target-hash (test:create-target-hash runs header (length keys)))) + (test:create-target-html target-hash oup area-name linktree) + (test:create-run-html runs area-name linktree (length keys) header)) + (common:simple-file-release-lock lockfile)) + #f))) + +(define (test:get-test-hash test-data) + (let ((resh (make-hash-table))) + (map (lambda (test) + (let* ((test-name (vector-ref test 2)) + (test-html-path (if (file-exists? (conc (vector-ref test 10) "/test-summary.html")) + (conc (vector-ref test 10) "/test-summary.html" ) + (conc (vector-ref test 10) "/" (vector-ref test 13)))) + (test-item (vector-ref test 11)) + (test-status (vector-ref test 4))) + (if (not (hash-table-ref/default resh test-item #f)) + (hash-table-set! resh test-item (make-hash-table))) + (hash-table-set! (hash-table-ref/default resh test-item #f) test-name (list test-status test-html-path)))) + test-data) +resh)) + +(define (test:get-data->b-keys ordered-data a-keys) + (delete-duplicates + (sort (apply + append + (map (lambda (sub-key) + (let ((subdat (hash-table-ref ordered-data sub-key))) + (hash-table-keys subdat))) + a-keys)) + string>=?))) + + +(define (test:create-run-html runs area-name linktree numkeys header) + (map (lambda (run) + (let* ((target (string-join (take (vector->list run) numkeys) "/")) + (run-name (db:get-value-by-header run header "runname")) + (run-time (seconds->work-week/day-time (db:get-value-by-header run header "event_time"))) + (oup (if (file-exists? (conc linktree "/" target "/" run-name)) + (open-output-file (conc linktree "/" target "/" run-name "/run.html")) + #f)) + (run-id (db:get-value-by-header run header "id")) + (test-data (rmt:get-tests-for-run + run-id + "%" ;; testnamepatt + '() ;; states + '() ;; statuses + #f ;; offset + #f ;; num-to-get + #f ;; hide/not-hide + #f ;; sort-by + #f ;; sort-order + #f ;; 'shortlist ;; qrytype + 0 ;; last update + #f)) + (item-test-hash (test:get-test-hash test-data)) + (items (hash-table-keys item-test-hash)) + (test-names (test:get-data->b-keys item-test-hash items))) + (if oup + (begin + (s:output-new + oup + (s:html tests:css-jscript-block (tests:css-jscript-block-cond #f) + (s:title "Runs View " run-name) + (s:body + (s:h1 "Runs View " ) + (s:h3 "Target" target) + (s:p + (s:b "Run name" ) run-name) + (s:p + (s:b "Run Date" ) run-time) + (s:table 'border 1 'cellspacing 0 + (s:tr + (s:th "Items") + (map (lambda (test) + (s:th test)) + test-names)) + (map (lambda (item) + (let* ((test-hash (hash-table-ref/default item-test-hash item #f))) + (if test-hash + (begin + (s:tr + (s:td 'class "test" item) + (map (lambda (test) + (let* ((test-details (hash-table-ref/default test-hash test #f)) + (status (if test-details + (car test-details))) + (link (if test-details + (string-substitute (conc linktree "/" target "/" run-name "/") "" (cadr test-details) "-")))) + (if test-details + (s:td 'class status + (s:a 'class "link" 'href link status )) + (s:td "")))) + test-names)))))) + (sort items string<=?)))))) + (close-output-port oup)) + (debug:print-info 0 "Skip: Dirctory structure " linktree "/" target "/" run-name " does not exist. Megatest will not create run.html")))) +runs)) + +(define (test:create-target-hash runs header numkeys) + (let ((resh (make-hash-table))) + (for-each + (lambda (run) + (let* ((run-name (db:get-value-by-header run header "runname")) + (target (string-join (take (vector->list run) numkeys) "/")) + (run-list (hash-table-ref/default resh target #f))) + + (if (not run-list) + (hash-table-set! resh target (list run-name)) + (hash-table-set! resh target (cons run-name run-list))))) + runs) + resh)) + +(define (test:get-max-run-cnt target-hash targets) + (let* ((cnt 0 )) + (map (lambda (target) + (let* ((runs (hash-table-ref/default target-hash target #f)) + (run-length (if runs + (length runs) + 0))) + + (if (< cnt run-length) + (set! cnt run-length)))) + targets) +cnt)) + +(define (test:pad-runs target-hash targets max-row-length) + (map (lambda (target) + (let loop ((run-list (hash-table-ref/default target-hash target #f))) + (if (< (length run-list) max-row-length) + (begin + (hash-table-set! target-hash target (cons "" run-list)) + (loop (hash-table-ref/default target-hash target #f) ))))) + targets) + target-hash) + +(define (test:create-target-html target-hash oup area-name linktree) + (let* ((targets (hash-table-keys target-hash)) + (max-row-length (test:get-max-run-cnt target-hash targets)) + (pad-runs-hash (test:pad-runs target-hash targets max-row-length))) + (s:output-new + oup + (s:html tests:css-jscript-block (tests:css-jscript-block-cond #f) + + (s:title "Target View " area-name) + (s:body + (s:h1 "Target View " area-name) + (s:table 'id "LinkedList1" 'border "1" 'cellspacing 0 + (s:tr 'class "something" + (s:th "Target") + (s:th 'colspan max-row-length "Runs")) + (let* ((tbl (map (lambda (target) + (s:tr + (s:td 'class "test" target) + (let* ((runs (hash-table-ref/default target-hash target #f)) + (rest-row (map (lambda (run) + (if (equal? run "") + (s:td run) + (if (file-exists?(conc linktree "/" target "/" run )) + (begin + (s:td + (s:a 'href (conc target "/" run "/run.html") run)))))) + (reverse runs)))) + rest-row))) + targets))) + tbl))))) + (close-output-port oup))) + + +(define (tests:create-html-tree-old outf) + (let* ((lockfile (conc outf ".lock")) + (runs-to-process '())) + (if (common:simple-file-lock lockfile) + (let* ((linktree (common:get-linktree)) + (oup (open-output-file (or outf (conc linktree "/runs-index.html")))) + (area-name (common:get-testsuite-name)) + (keys (rmt:get-keys)) + (numkeys (length keys)) + (runsdat (rmt:get-runs "%" #f #f (map (lambda (x)(list x "%")) keys))) + (header (vector-ref runsdat 0)) + (runs (vector-ref runsdat 1)) + (runtreedat (map (lambda (x) + (tests:run-record->test-path x numkeys)) + runs)) + (runs-htree (common:list->htree runtreedat))) + (set! runs-to-process runs) + (s:output-new + oup + (s:html tests:css-jscript-block + (s:title "Summary for " area-name) + (s:body 'onload "addEvents();" + (s:h1 "Summary for " area-name) + ;; top list + (s:ul 'id "LinkedList1" 'class "LinkedList" + (s:li + "Runs" + (common:htree->html runs-htree + '() + (lambda (x p) + (let* ((targ-path (string-intersperse p "/")) + (full-path (conc linktree "/" targ-path)) + (run-name (car (reverse p)))) + (if (and (common:file-exists? full-path) + (directory? full-path) + (file-write-access? full-path)) + (s:a run-name 'href (conc targ-path "/run-summary.html")) + (begin + (debug:print 0 *default-log-port* "INFO: Can't create " targ-path "/run-summary.html") + (conc run-name " (Not able to create summary at " targ-path ")"))))))))))) + (close-output-port oup) + (common:simple-file-release-lock lockfile) + + (for-each + (lambda (run) + (let* ((test-subpath (tests:run-record->test-path run numkeys)) + (run-id (db:get-value-by-header run header "id")) + (run-dir (tests:run-record->test-path run numkeys)) + (test-dats (rmt:get-tests-for-run + run-id + "%/" ;; testnamepatt + '() ;; states + '() ;; statuses + #f ;; offset + #f ;; num-to-get + #f ;; hide/not-hide + #f ;; sort-by + #f ;; sort-order + #f ;; 'shortlist ;; qrytype + 0 ;; last update + #f)) + (tests-tree-dat (map (lambda (test-dat) + ;; (tests:run-record->test-path x numkeys)) + (let* ((test-name (db:test-get-testname test-dat)) + (item-path (db:test-get-item-path test-dat)) + (full-name (db:test-make-full-name test-name item-path)) + (path-parts (string-split full-name))) + path-parts)) + test-dats)) + (tests-htree (common:list->htree tests-tree-dat)) + (html-dir (conc linktree "/" (string-intersperse run-dir "/"))) + (html-path (conc html-dir "/run-summary.html")) + (oup (if (and (common:file-exists? html-dir) + (directory? html-dir) + (file-write-access? html-dir)) + (open-output-file html-path) + #f))) + ;; (print "run-dir: " run-dir ", tests-tree-dat: " tests-tree-dat) + (if oup + (begin + (s:output-new + oup + (s:html tests:css-jscript-block + (s:title "Summary for " area-name) + (s:body 'onload "addEvents();" + (s:h1 "Summary for " (string-intersperse run-dir "/")) + ;; top list + (s:ul 'id "LinkedList1" 'class "LinkedList" + (s:li + "Tests" + (common:htree->html tests-htree + '() + (lambda (x p) + (let* ((targ-path (string-intersperse p "/")) + (test-name (car p)) + (item-path ;; (if (> (length p) 2) ;; test-name + run-name + (string-intersperse p "/")) + (full-targ (conc html-dir "/" targ-path)) + (std-file (conc full-targ "/test-summary.html")) + (alt-file (conc full-targ "/megatest-rollup-" test-name ".html")) + (html-file (if (common:file-exists? alt-file) + alt-file + std-file)) + (run-name (car (reverse p)))) + (if (and (not (common:file-exists? full-targ)) + (directory? full-targ) + (file-write-access? full-targ)) + (tests:summarize-test + run-id + (rmt:get-test-id run-id test-name item-path))) + (if (common:file-exists? full-targ) + (s:a run-name 'href html-file) + (begin + (debug:print 0 *default-log-port* "ERROR: can't access " full-targ) + (conc "No summary for " run-name))))) + )))))) + (close-output-port oup))))) + runs) + #t) + #f))) + + + + +(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname) + (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1)) + (if (and cpuload diskfree) + (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id)) + (if minutes + (rmt:general-call 'update-run-duration run-id minutes test-id)) + (if (and uname hostname) + (rmt:general-call 'update-uname-host run-id uname hostname test-id))) + +;; This one is for running with no db access (i.e. via rmt: internally) +(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries) +;; (define (tests:set-full-meta-info test-id run-id minutes work-area) +;; (let ((remtries 10)) + (let* ((cpuload (get-cpu-load)) + (diskfree (get-df (current-directory))) + (uname (get-uname "-srvpio")) + (hostname (get-host-name))) + (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname))) + +;; +;; +(define (tests:get-compressed-steps run-id test-id) + (let* ((steps-data (rmt:get-steps-for-test run-id test-id)) ;; 0 1 2 3 4 5 6 7 + (comprsteps (tests:process-steps-table steps-data))) ;; # + (map (lambda (x) + ;; take advantage of the \n on time->string + (vector ;; we are constructing basically the original vector but collapsing start end records + (vector-ref x 0) ;; id 0 + (let ((s (vector-ref x 1))) + (if (number? s)(seconds->time-string s) s)) ;; starttime 1 + (let ((s (vector-ref x 2))) + (if (number? s)(seconds->time-string s) s)) ;; endtime 2 + (vector-ref x 3) ;; status 3 + (vector-ref x 4) ;; duration 4 + (vector-ref x 5) ;; logfile 5 + (vector-ref x 6) ;; comment 6 + (vector-ref x 7))) ;; id 7 + (sort (hash-table-values comprsteps) + (lambda (a b) + (let ((time-a (vector-ref a 1)) + (time-b (vector-ref b 1)) + (id-a (vector-ref a 7)) + (id-b (vector-ref b 7))) + (if (and (number? time-a)(number? time-b)) + (if (< time-a time-b) + #t + (if (eq? time-a time-b) + (< id-a id-b) + ;; (stringwork-week/day-time + (db:test-get-event_time test-dat))) + (s:td "Duration") (s:td (seconds->hr-min-sec (db:test-get-run_duration test-dat))))) + (s:h3 "Log files") + (s:table + 'cellspacing "0" 'border "1" + (s:tr (s:td "Final log")(s:td (s:a 'href logf logf)))) + (s:table + 'cellspacing "0" 'border "1" + (s:tr (s:td "Step Name")(s:td "Start")(s:td "End")(s:td "Status")(s:td "Duration")(s:td "Log File")) + (map (lambda (step-dat) + (s:tr (s:td (tdb:steps-table-get-stepname step-dat)) + (s:td (tdb:steps-table-get-start step-dat)) + (s:td (tdb:steps-table-get-end step-dat)) + (s:td (tdb:steps-table-get-status step-dat)) + (s:td (tdb:steps-table-get-runtime step-dat)) + (s:td (let ((step-log (tdb:steps-table-get-log-file step-dat))) + (s:a 'href step-log step-log))))) + steps-dat)) + ))) + (close-output-port oup))))) + + +;; MUST BE CALLED local! +;; +(define (tests:test-get-paths-matching keynames target fnamepatt #!key (res '())) + ;; BUG: Move the values derived from args to parameters and push to megatest.scm + (let* ((testpatt (or (args:get-arg "-testpatt")(args:get-arg "-testpatt") "%")) + (statepatt (or (args:get-arg "-state") (args:get-arg ":state") "%")) + (statuspatt (or (args:get-arg "-status") (args:get-arg ":status") "%")) + (runname (or (args:get-arg "-runname") (args:get-arg ":runname") "%")) + (paths-from-db (rmt:test-get-paths-matching-keynames-target-new keynames target res + testpatt + statepatt + statuspatt + runname))) + (if fnamepatt + (apply append + (map (lambda (p) + (if (directory-exists? p) + (let ((glob-query (conc p "/" fnamepatt))) + (handle-exceptions + exn + (with-input-from-pipe + (conc "echo " glob-query) + read-lines) ;; we aren't going to try too hard. If glob breaks it is likely because someone tried to do */*/*.log or similar + (glob glob-query))) + '())) + paths-from-db)) + paths-from-db))) + + +;; for each test: +;; +(define (tests:filter-non-runnable run-id testkeynames testrecordshash) + (let ((runnables '())) + (for-each + (lambda (testkeyname) + (let* ((test-record (hash-table-ref testrecordshash testkeyname)) + (test-name (tests:testqueue-get-testname test-record)) + (itemdat (tests:testqueue-get-itemdat test-record)) + (item-path (tests:testqueue-get-item_path test-record)) + (waitons (tests:testqueue-get-waitons test-record)) + (keep-test #t) + (test-id (rmt:get-test-id run-id test-name item-path)) + (tdat (rmt:get-testinfo-state-status run-id test-id))) ;; (cdb:get-test-info-by-id *runremote* test-id))) + (if tdat + (begin + ;; Look at the test state and status + (if (or (and (member (db:test-get-status tdat) + '("PASS" "WARN" "WAIVED" "CHECK" "SKIP")) + (equal? (db:test-get-state tdat) "COMPLETED")) + (member (db:test-get-state tdat) + '("INCOMPLETE" "KILLED"))) + (set! keep-test #f)) + + ;; examine waitons for any fails. If it is FAIL or INCOMPLETE then eliminate this test + ;; from the runnable list + (if keep-test + (for-each (lambda (waiton) + ;; for now we are waiting only on the parent test + (let* ((parent-test-id (rmt:get-test-id run-id waiton "")) + (wtdat (rmt:get-testinfo-state-status run-id test-id))) ;; (cdb:get-test-info-by-id *runremote* test-id))) + (if (or (and (equal? (db:test-get-state wtdat) "COMPLETED") + (member (db:test-get-status wtdat) '("FAIL" "ABORT"))) + (member (db:test-get-status wtdat) '("KILLED")) + (member (db:test-get-state wtdat) '("INCOMPETE"))) + ;; (if (or (member (db:test-get-status wtdat) + ;; '("FAIL" "KILLED")) + ;; (member (db:test-get-state wtdat) + ;; '("INCOMPETE"))) + (set! keep-test #f)))) ;; no point in running this one again + waitons)))) + (if keep-test (set! runnables (cons testkeyname runnables))))) + testkeynames) + runnables)) + +;;====================================================================== +;; test steps +;;====================================================================== + +;; teststep-set-status! used to be here + +(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat) + (let* ((testdat (rmt:get-test-info-by-id run-id test-id))) + (and testdat + (equal? (test:get-state testdat) "KILLREQ")))) + +(define (test:tdb-get-rundat-count tdb) + (if tdb + (let ((res 0)) + (sqlite3:for-each-row + (lambda (count) + (set! res count)) + tdb + "SELECT count(id) FROM test_rundat;") + res)) + 0) + +;; (define (tests:set-partial-meta-info test-id run-id minutes work-area) +#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries) + (let* ((cpuload (get-cpu-load)) + (diskfree (get-df (current-directory))) + (remtries 10)) + (handle-exceptions + exn + (if (> remtries 0) + (begin + (print-call-chain (current-error-port)) + (debug:print-info 0 *default-log-port* "WARNING: failed to set meta info. Will try " remtries " more times") + (set! remtries (- remtries 1)) + (thread-sleep! 10) + (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1))) + (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) + (debug:print-error 0 *default-log-port* "tried for over a minute to update meta info and failed. Giving up") + (debug:print 0 *default-log-port* "EXCEPTION: database probably overloaded or unreadable.") + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 5 *default-log-port* "exn=" (condition->list exn)) + (debug:print 0 *default-log-port* " status: " ((condition-property-accessor 'sqlite3 'status) exn)) + (print-call-chain (current-error-port)))) + (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes) + ))) + + +;;====================================================================== +;; launch a task - this runs on the originating host, tests themselves +;; +;;====================================================================== + + +;;====================================================================== +;; ezsteps +;;====================================================================== + +;; ezsteps were going to be coded as +;; stepname[,predstep1,predstep2 ...] [{VAR1=first,second,third}] command to execute +;; BUT +;; now are +;; stepname {VAR=first,second,third ...} command ... +;; where the {VAR=first,second,third ...} is optional. + +;; given an exit code and whether or not logpro was used calculate OK/BAD +;; return #t if we are ok, #f otherwise +(define (steprun-good? logpro exitcode) + (or (eq? exitcode 0) + (and logpro (eq? exitcode 2)))) + +;; if handed a string, process it, else look for MT_CMDINFO +(define (launch:get-cmdinfo-assoc-list #!key (encoded-cmd #f)) + (let ((enccmd (if encoded-cmd encoded-cmd (getenv "MT_CMDINFO")))) + (if enccmd + (common:read-encoded-string enccmd) + '()))) + +;; return (conc status ": " comment) from the final section so that +;; the comment can be set in the step record in launch.scm +;; +(define (launch:load-logpro-dat run-id test-id stepname) + (let ((cname (conc stepname ".dat"))) + (if (common:file-exists? cname) + (let* ((dat (configf:read-config cname #f #f)) + (csvr (db:logpro-dat->csv dat stepname)) + (csvt (let-values (((fmt-cell fmt-record fmt-csv) (make-format ","))) + (fmt-csv (map list->csv-record csvr)))) + (status (configf:lookup dat "final" "exit-status")) + (msg (configf:lookup dat "final" "message"))) + (if csvt ;; this if blocked stack dump caused by .dat file from logpro being 0-byte. fixed by upgrading logpro + (rmt:csv->test-data run-id test-id csvt) + (debug:print 0 *default-log-port* "ERROR: no csvdat exists for run-id: " run-id " test-id: " test-id " stepname: " stepname ", check that logpro version is 1.15 or newer")) + ;; (debug:print-info 13 *default-log-port* "Error: run-id/test-id/stepname="run-id"/"test-id"/"stepname" => bad csvr="csvr) + ;; ) + (cond + ((equal? status "PASS") "PASS") ;; skip the message part if status is pass + (status (conc (configf:lookup dat "final" "exit-status") ": " (if msg msg "no message"))) + (else #f))) + #f))) + +(define (launch:runstep ezstep run-id test-id exit-info m tal testconfig) ;;; TODO: deprecate me in favor of ezsteps.scm + (let* ((stepname (car ezstep)) ;; do stuff to run the step + (stepinfo (cadr ezstep)) + ;; (let ((info (cadr ezstep))) + ;; (if (proc? info) "" info))) + ;; (stepproc (let ((info (cadr ezstep))) + ;; (if (proc? info) info #f))) + (stepparts (string-match (regexp "^(\\{([^\\}\\{]*)\\}\\s*|)(.*)$") stepinfo)) + (stepparams (list-ref stepparts 2)) ;; for future use, {VAR=1,2,3}, run step for each + (paramparts (if (string? stepparams) + (map (lambda (x)(string-split x "=")) (string-split-fields "[^;]*=[^;]*" stepparams)) + '())) + (subrun (alist-ref "subrun" paramparts equal?)) + (stepcmd (list-ref stepparts 3)) + (script "") ; "#!/bin/bash\n") ;; yep, we depend on bin/bash FIXME!!!\ + (logpro-file (conc stepname ".logpro")) + (html-file (conc stepname ".html")) + (dat-file (conc stepname ".dat")) + (tconfig-logpro (configf:lookup testconfig "logpro" stepname)) + (logpro-used (common:file-exists? logpro-file))) + + (debug:print 0 *default-log-port* "stepparts: " stepparts ", stepparams: " stepparams + ", paramparts: " paramparts ", subrun: " subrun ", stepcmd: " stepcmd) + + (if (and tconfig-logpro + (not logpro-used)) ;; no logpro file found but have a defn in the testconfig + (begin + (with-output-to-file logpro-file + (lambda () + (print ";; logpro file extracted from testconfig\n" + ";;") + (print tconfig-logpro))) + (set! logpro-used #t))) + + ;; NB// can safely assume we are in test-area directory + (debug:print 4 *default-log-port* "ezsteps:\n stepname: " stepname " stepinfo: " stepinfo " stepparts: " stepparts + " stepparams: " stepparams " stepcmd: " stepcmd) + + ;; ;; first source the previous environment + ;; (let ((prev-env (conc ".ezsteps/" prevstep (if (string-search (regexp "csh") + ;; (get-environment-variable "SHELL")) ".csh" ".sh")))) + ;; (if (and prevstep (common:file-exists? prev-env)) + ;; (set! script (conc script "source " prev-env)))) + + ;; call the command using mt_ezstep + ;; (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "x") " " stepcmd)) + + (debug:print 4 *default-log-port* "script: " script) + (rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f) + ;; now launch the actual process + (call-with-environment-variables + (list (cons "PATH" (conc (get-environment-variable "PATH") ":."))) + (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1") + (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 + (pid #f)) + (let ((proc (lambda () + (set! pid (process-run "/bin/bash" (list "-c" cmd)))))) + (if subrun + (begin + (debug:print-info 0 *default-log-port* "Running without MT_.* environment variables.") + (common:without-vars proc "^MT_.*")) + (proc))) + + (with-output-to-file "Makefile.ezsteps" + (lambda () + (print stepname ".log :") + (print "\t" cmd) + (if (common:file-exists? (conc stepname ".logpro")) + (print "\tlogpro " stepname ".logpro " stepname ".html < " stepname ".log")) + (print) + (print stepname " : " stepname ".log") + (print)) + #:append) + + (rmt:test-set-top-process-pid run-id test-id pid) + (let processloop ((i 0)) + (let-values (((pid-val exit-status exit-code)(process-wait pid #t))) + (mutex-lock! m) + (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) + (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) + (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code) + (mutex-unlock! m) + (if (eq? pid-val 0) + (begin + (thread-sleep! 2) + (processloop (+ i 1)))) + ))))) + (debug:print-info 0 *default-log-port* "step " stepname " completed with exit code " (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2)) + ;; now run logpro if needed + (if logpro-used + (let* ((logpro-exe (or (getenv "LOGPRO_EXE") "logpro")) + (pid (process-run (conc "/bin/sh -c '"logpro-exe" "logpro-file " " (conc stepname ".html") " < " stepname ".log > /dev/null'")))) + (let processloop ((i 0)) + (let-values (((pid-val exit-status exit-code)(process-wait pid #t))) + (mutex-lock! m) + ;; (make-launch:einf pid: pid exit-status: exit-status exit-code: exit-code) + (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) + (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) + (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code) + (mutex-unlock! m) + (if (eq? pid-val 0) + (begin + (thread-sleep! 2) + (processloop (+ i 1))))) + (debug:print-info 0 *default-log-port* "logpro for step " stepname " exited with code " (launch:einf-exit-code exit-info))))) ;; (vector-ref exit-info 2))))) + + (let ((exinfo (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2)) + (logfna (if logpro-used (conc stepname ".html") "")) + (comment #f)) + (if logpro-used + (let ((datfile (conc stepname ".dat"))) + ;; load the .dat file into the test_data table if it exists + (if (common:file-exists? datfile) + (set! comment (launch:load-logpro-dat run-id test-id stepname))) + (rmt:test-set-log! run-id test-id (conc stepname ".html")))) + (rmt:teststep-set-status! run-id test-id stepname "end" exinfo comment logfna)) + ;; set the test final status + (let* ((process-exit-status (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2)) + (this-step-status (cond + ((and (eq? process-exit-status 2) logpro-used) 'warn) ;; logpro 2 = warnings + ((and (eq? process-exit-status 3) logpro-used) 'check) ;; logpro 3 = check + ((and (eq? process-exit-status 4) logpro-used) 'waived) ;; logpro 4 = waived + ((and (eq? process-exit-status 5) logpro-used) 'abort) ;; logpro 5 = abort + ((and (eq? process-exit-status 6) logpro-used) 'skip) ;; logpro 6 = skip + ((eq? process-exit-status 0) 'pass) ;; logpro 0 = pass + (else 'fail))) + (overall-status (cond + ((eq? (launch:einf-rollup-status exit-info) 2) 'warn) ;; rollup-status (vector-ref exit-info 3) + ((eq? (launch:einf-rollup-status exit-info) 0) 'pass) ;; (vector-ref exit-info 3) + (else 'fail))) + (next-status (cond + ((eq? overall-status 'pass) this-step-status) + ((eq? overall-status 'warn) + (if (eq? this-step-status 'fail) 'fail 'warn)) + ((eq? overall-status 'abort) 'abort) + (else 'fail))) + (next-state ;; "RUNNING") ;; WHY WAS THIS CHANGED TO NOT USE (null? tal) ?? + (cond + ((null? tal) ;; more to run? + "COMPLETED") + (else "RUNNING")))) + (debug:print 4 *default-log-port* "Exit value received: " (launch:einf-exit-code exit-info) " logpro-used: " logpro-used + " this-step-status: " this-step-status " overall-status: " overall-status + " next-status: " next-status " rollup-status: " (launch:einf-rollup-status exit-info)) ;; (vector-ref exit-info 3)) + (case next-status + ((warn) + (launch:einf-rollup-status-set! exit-info 2) ;; (vector-set! exit-info 3 2) ;; rollup-status + ;; NB// test-set-status! does rdb calls under the hood + (tests:test-set-status! run-id test-id next-state "WARN" + (if (eq? this-step-status 'warn) "Logpro warning found" #f) + #f)) + ((check) + (launch:einf-rollup-status-set! exit-info 3) ;; (vector-set! exit-info 3 3) ;; rollup-status + ;; NB// test-set-status! does rdb calls under the hood + (tests:test-set-status! run-id test-id next-state "CHECK" + (if (eq? this-step-status 'check) "Logpro check found" #f) + #f)) + ((waived) + (launch:einf-rollup-status-set! exit-info 4) ;; (vector-set! exit-info 3 3) ;; rollup-status + ;; NB// test-set-status! does rdb calls under the hood + (tests:test-set-status! run-id test-id next-state "WAIVED" + (if (eq? this-step-status 'check) "Logpro waived found" #f) + #f)) + ((abort) + (launch:einf-rollup-status-set! exit-info 5) ;; (vector-set! exit-info 3 4) ;; rollup-status + ;; NB// test-set-status! does rdb calls under the hood + (tests:test-set-status! run-id test-id next-state "ABORT" + (if (eq? this-step-status 'abort) "Logpro abort found" #f) + #f)) + ((skip) + (launch:einf-rollup-status-set! exit-info 6) ;; (vector-set! exit-info 3 4) ;; rollup-status + ;; NB// test-set-status! does rdb calls under the hood + (tests:test-set-status! run-id test-id next-state "SKIP" + (if (eq? this-step-status 'skip) "Logpro skip found" #f) + #f)) + ((pass) + (tests:test-set-status! run-id test-id next-state "PASS" #f #f)) + (else ;; 'fail + (launch:einf-rollup-status-set! exit-info 1) ;; (vector-set! exit-info 3 1) ;; force fail, this used to be next-state but that doesn't make sense. should always be "COMPLETED" + (tests:test-set-status! run-id test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f) + ))) + logpro-used)) + +(define (launch:manage-steps run-id test-id item-path fullrunscript ezsteps subrun test-name tconfigreg exit-info m) + ;; (let-values + ;; (((pid exit-status exit-code) + ;; (run-n-wait fullrunscript))) + ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) + ;; Since we should have a clean slate at this time there is no need to do + ;; any of the other stuff that tests:test-set-status! does. Let's just + ;; force RUNNING/n/a + + ;; (thread-sleep! 0.3) + ;; (tests:test-force-state-status! run-id test-id "RUNNING" "n/a") + (rmt:set-state-status-and-roll-up-items run-id test-name item-path "RUNNING" #f #f) + ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here + + ;; if there is a runscript do it first + (if fullrunscript + (let ((pid (process-run fullrunscript))) + (rmt:test-set-top-process-pid run-id test-id pid) + (let loop ((i 0)) + (let-values + (((pid-val exit-status exit-code) (process-wait pid #t))) + (mutex-lock! m) + (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) + (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) + (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code) + (launch:einf-rollup-status-set! exit-info exit-code) ;; (vector-set! exit-info 3 exit-code) ;; rollup status + (mutex-unlock! m) + (if (eq? pid-val 0) + (begin + (thread-sleep! 2) + (loop (+ i 1))) + ))))) + ;; then, if runscript ran ok (or did not get called) + ;; do all the ezsteps (if any) + (if (or ezsteps subrun) + (let* ((test-run-dir (tests:get-test-path-from-environment)) + (testconfig ;; (read-config (conc work-area "/testconfig") #f #t environ-patt: "pre-launch-env-vars")) ;; FIXME??? is allow-system ok here? + ;; NOTE: it is tempting to turn off force-create of testconfig but dynamic + ;; ezstep names need a full re-eval here. + (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) + (ezstepslst (if (hash-table? testconfig) + (hash-table-ref/default testconfig "ezsteps" '()) + #f))) + (if testconfig + (hash-table-set! *testconfigs* test-name testconfig) ;; cached for lazy reads later ... + (begin + (launch:setup) + (debug:print 0 *default-log-port* "WARNING: no testconfig found for " test-name " in search path:\n " + (string-intersperse (tests:get-tests-search-path *configdat*) "\n ")))) + ;; after all that, still no testconfig? Time to abort + (if (not testconfig) + (begin + (debug:print-error 0 *default-log-port* "Failed to resolve megatest.config, runconfigs.config and testconfig issues. Giving up now") + (exit 1))) + + ;; create a proc for the subrun if requested, save that proc in the ezsteps table as the last entry + ;; 1. get section [runarun] + ;; 2. unset MT_* vars + ;; 3. fix target + ;; 4. fix runname + ;; 5. fix testpatt or calculate it from contour + ;; 6. launch the run + ;; 7. roll up the run result and or roll up the logpro processed result + (when (configf:lookup testconfig "subrun" "runwait") ;; we use runwait as the flag that a subrun is requested + (subrun:initialize-toprun-test testconfig test-run-dir) + (let* ((mt-cmd (subrun:launch-cmd test-run-dir))) + (debug:print-info 0 *default-log-port* "Subrun command is \"" mt-cmd "\"") + (set! ezsteps #t) ;; set the needed flag + (set! ezstepslst + (append (or ezstepslst '()) + (list (list "subrun" (conc "{subrun=true} " mt-cmd))))))) + + ;; process the ezsteps + (if ezsteps + (begin + (if (not (common:file-exists? ".ezsteps"))(create-directory ".ezsteps")) + ;; if ezsteps was defined then we are sure to have at least one step but check anyway + (if (not (> (length ezstepslst) 0)) + (debug:print-error 0 *default-log-port* "ezsteps defined but ezstepslst is zero length") + (let loop ((ezstep (car ezstepslst)) + (tal (cdr ezstepslst)) + (prevstep #f)) + (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"") + ;; check exit-info (vector-ref exit-info 1) + (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) + (let ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig)) + (stepname (car ezstep))) + ;; if logpro-used read in the stepname.dat file + (if (and logpro-used (common:file-exists? (conc stepname ".dat"))) + (launch:load-logpro-dat run-id test-id stepname)) + (if (steprun-good? logpro-used (launch:einf-exit-code exit-info)) + (if (not (null? tal)) + (loop (car tal) (cdr tal) stepname)) + (debug:print 0 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping"))) + (debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep))))))))) + +(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) + (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30"))) + (start-seconds (current-seconds)) + (calc-minutes (lambda () + (inexact->exact + (round + (- + (current-seconds) + start-seconds))))) + (kill-tries 0)) + ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) + ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) + (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) + + (let loop ((minutes (calc-minutes)) + (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) + (disk-free (get-df (current-directory))) + (last-sync (current-seconds))) + #;(common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) + (let* ((over-time (> (current-seconds) (+ last-sync update-period))) + (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) + (delta (abs (- load cpu-load)))) + (if (> delta 0.1) ;; don't bother updating with small changes + load + #f))) + (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds + (get-df (current-directory)) + disk-free)) + (delta (abs (- df disk-free)))) + (if (and (> df 0) + (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg + df + #f))) + (do-sync (or new-cpu-load new-disk-free over-time)) + + (test-info (rmt:get-test-info-by-id run-id test-id)) + (state (db:test-get-state test-info)) + (status (db:test-get-status test-info)) + (kill-reason "no kill reason specified") + (kill-job? #f)) + #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) + (cond + ((test-get-kill-request run-id test-id) + (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") + (set! kill-job? #t)) + ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) + (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) + (set! kill-job? #t)) + ((equal? status "DEAD") + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) + (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") + ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING + (set! kill-job? #f))) + + (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) + (launch:handle-zombie-tests run-id) + (when do-sync + ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) + ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) + #;(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) + #;(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))) + + (if kill-job? + (begin + (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) + (mutex-lock! m) + ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this + ;; section and the runit section? Or add a loop that tries three times with a 1/4 second + ;; between tries? + (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0)) + (pid2 (rmt:test-get-top-process-pid run-id test-id)) + (pids (delete-duplicates (filter number? (list pid1 pid2))))) + (if (not (null? pids)) + (begin + (for-each + (lambda (pid) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Unable to kill process with pid " pid ", possibly already killed.") + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) + (debug:print 0 *default-log-port* "WARNING: Request received to kill job " pid) ;; " (attempt # " kill-tries ")") + (debug:print-info 0 *default-log-port* "Signal mask=" (signal-mask)) + ;; (if (process:alive? pid) + ;; (begin + (map (lambda (pid-num) + (process-signal pid-num signal/term)) + (process:get-sub-pids pid)) + (thread-sleep! 5) + ;; (if (process:process-alive? pid) + (map (lambda (pid-num) + (handle-exceptions + exn + #f + (process-signal pid-num signal/kill))) + (process:get-sub-pids pid)))) + ;; (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive")))) + pids) + ;; BB: question to Matt -- does the tests:test-state-status! encompass rollup to toplevel? If not, should it? + (tests:test-set-status! run-id test-id "KILLED" "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt + (begin + (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2) + (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt + ))) + (mutex-unlock! m) + ;; no point in sticking around. Exit now. But run end of run before exiting? + (launch:end-of-run-check run-id) + (exit))) + (if (hash-table-ref/default misc-flags 'keep-going #f) + (begin + (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses + (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta + (loop (calc-minutes) + (or new-cpu-load cpu-load) + (or new-disk-free disk-free) + (if do-sync (current-seconds) last-sync))))))) + (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional + +;; set up needed environment variables given a run-id and optionally a target, itempath etc. +;; +(define (runs:set-megatest-env-vars run-id #!key (inkeys #f)(inrunname #f)(inkeyvals #f)(intarget #f)(testname #f)(itempath #f)) + ;;(bb-check-path msg: "runs:set-megatest-env-vars entry") + (let* ((target (or intarget + (common:args-get-target) + (get-environment-variable "MT_TARGET"))) + (keys (if inkeys inkeys (rmt:get-keys))) + (keyvals (if inkeyvals inkeyvals (keys:target->keyval keys target))) + (vals (hash-table-ref/default *env-vars-by-run-id* run-id #f)) + (link-tree (common:get-linktree))) ;; (configf:lookup *configdat* "setup" "linktree"))) + (if testname (setenv "MT_TEST_NAME" testname)) + (if itempath (setenv "MT_ITEMPATH" itempath)) + + ;; get the info from the db and put it in the cache + (if link-tree + (setenv "MT_LINKTREE" link-tree) + (debug:print-error 0 *default-log-port* "linktree not set, should be set in megatest.config in [setup] section.")) + (if (not vals) + (let ((ht (make-hash-table))) + (hash-table-set! *env-vars-by-run-id* run-id ht) + (set! vals ht) + (for-each + (lambda (key) + (hash-table-set! vals (car key) (cadr key))) + keyvals))) + ;; from the cached data set the vars + + (hash-table-for-each + vals + (lambda (key val) + (debug:print 2 *default-log-port* "setenv " key " " val) + (safe-setenv key val))) + ;;(bb-check-path msg: "runs:set-megatest-env-vars block 1") + ;;(BB> "*env-vars-by-run-id*/runid("run-id" vals="(hash-table->alist vals)) + + (if (not (get-environment-variable "MT_TARGET"))(setenv "MT_TARGET" target)) + ;; we had a case where there was an exception generated by the hash-table-ref + ;; due to *configdat* being #f Adding a handle and exit + (let fatal-loop ((count 0)) + (handle-exceptions + exn + (let ((call-chain (get-call-chain)) + (msg ((condition-property-accessor 'exn 'message) exn))) + (if (< count 5) + (begin ;; this call is colliding, do some crude stuff to fix it. + (debug:print 0 *default-log-port* "ERROR: *configdat* was inaccessible! This should never happen. Retry #" count) + (launch:setup force-reread: #t) + (fatal-loop (+ count 1))) + (begin + (debug:print 0 *default-log-port* "FATAL: *configdat* was inaccessible! This should never happen. Retried " count " times. Message: " msg) + (debug:print 0 *default-log-port* "Call chain:") + (with-output-to-port *default-log-port* + + (lambda () + (print "*configdat* is >>"*configdat*"<<") + (pp *configdat*) + (pp call-chain))) + + (exit 1)))) + ;;(bb-check-path msg: "runs:set-megatest-env-vars block 1.5") + (when (or (not *configdat*) (not (hash-table? *configdat*))) + (debug:print 0 *default-log-port* "WARNING: *configdat* was inaccessible! This should never happen. Brute force reread.") + ;;(BB> "ERROR: *configdat* was inaccessible! This should never happen. Brute force reread.") + (thread-sleep! 2) ;; assuming nfs lag. + (launch:setup force-reread: #t)) + (alist->env-vars (hash-table-ref/default *configdat* "env-override" '())))) ;;;; environment is tainted HERE in this let block. + ;;(bb-check-path msg: "runs:set-megatest-env-vars block 2") + ;; Lets use this as an opportunity to put MT_RUNNAME in the environment + (let ((runname (if inrunname inrunname (rmt:get-run-name-from-id run-id)))) + (if runname + (setenv "MT_RUNNAME" runname) + (debug:print-error 0 *default-log-port* "no value for runname for id " run-id))) + (setenv "MT_RUN_AREA_HOME" *toppath*) + ;; if a testname and itempath are available set the remaining appropriate variables + (if testname (setenv "MT_TEST_NAME" testname)) + (if itempath (setenv "MT_ITEMPATH" itempath)) + ;;(bb-check-path msg: "runs:set-megatest-env-vars block 3") + (if (and testname link-tree) + (setenv "MT_TEST_RUN_DIR" (conc (getenv "MT_LINKTREE") "/" + (getenv "MT_TARGET") "/" + (getenv "MT_RUNNAME") "/" + (getenv "MT_TEST_NAME") + (if (and itempath + (not (equal? itempath ""))) + (conc "/" itempath) + "")))))) + +(define (launch:execute encoded-cmd) + (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) + (tconfigreg #f)) + (setenv "MT_CMDINFO" encoded-cmd) + ;;(bb-check-path msg: "launch:execute incoming") + (if (list? cmdinfo) ;; ((testpath /tmp/mrwellan/jazzmind/src/example_run/tests/sqlitespeed) + ;; (test-name sqlitespeed) (runscript runscript.rb) (db-host localhost) (run-id 1)) + (let* ((testpath (assoc/default 'testpath cmdinfo)) ;; testpath is the test spec area + (top-path (assoc/default 'toppath cmdinfo)) + (work-area (assoc/default 'work-area cmdinfo)) ;; work-area is the test run area + (test-name (assoc/default 'test-name cmdinfo)) + (runscript (assoc/default 'runscript cmdinfo)) + (ezsteps (assoc/default 'ezsteps cmdinfo)) + (subrun (assoc/default 'subrun cmdinfo)) + ;; (runremote (assoc/default 'runremote cmdinfo)) + ;; (transport (assoc/default 'transport cmdinfo)) ;; not used + ;; (serverinf (assoc/default 'serverinf cmdinfo)) + ;; (port (assoc/default 'port cmdinfo)) + (serverurl (assoc/default 'serverurl cmdinfo)) + (homehost (assoc/default 'homehost cmdinfo)) + (run-id (assoc/default 'run-id cmdinfo)) + (test-id (assoc/default 'test-id cmdinfo)) + (target (assoc/default 'target cmdinfo)) + (areaname (assoc/default 'areaname cmdinfo)) + (itemdat (assoc/default 'itemdat cmdinfo)) + (env-ovrd (assoc/default 'env-ovrd cmdinfo)) + (set-vars (assoc/default 'set-vars cmdinfo)) ;; pre-overrides from -setvar + (runname (assoc/default 'runname cmdinfo)) + (megatest (assoc/default 'megatest cmdinfo)) + (runtlim (assoc/default 'runtlim cmdinfo)) + (contour (assoc/default 'contour cmdinfo)) + (item-path (item-list->path itemdat)) + (mt-bindir-path (assoc/default 'mt-bindir-path cmdinfo)) + (keys #f) + (keyvals #f) + (fullrunscript (if (not runscript) + #f + (if (substring-index "/" runscript) + runscript ;; use unadultered if contains slashes + (let ((fulln (conc work-area "/" runscript))) + (if (and (common:file-exists? fulln) + (file-execute-access? fulln)) + fulln + runscript))))) ;; assume it is on the path + (check-work-area (lambda () + ;; NFS might not have propagated the directory meta data to the run host - give it time if needed + (let loop ((count 0)) + (if (or (common:directory-exists? work-area) + (> count 10)) + (change-directory work-area) + (begin + (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " work-area " not found") + (thread-sleep! 10) + (loop (+ count 1))))) + + (if (not (string=? (common:real-path work-area)(common:real-path (current-directory)))) + (begin + (debug:print 0 *default-log-port* + "INFO: we are expecting to be in directory " work-area "\n" + " but we are actually in the directory " (current-directory) "\n" + " doing another change dir.") + (change-directory work-area))) + + ;; spot check that the files in testpath are available. Too often NFS delays cause problems here. + (let ((files (glob (conc testpath "/*"))) + (bad-files '())) + (for-each + (lambda (fullname) + (let* ((fname (pathname-strip-directory fullname)) + (targn (conc work-area "/" fname))) + (if (not (file-exists? targn)) + (set! bad-files (cons fname bad-files))))) + files) + (if (not (null? bad-files)) + (begin + (debug:print 0 *default-log-port* "INFO: test data from " testpath " not copied properly or filesystem problems causing data to not be found. Re-running the copy command.") + (debug:print 0 *default-log-port* "INFO: missing files from " work-area ": " (string-intersperse bad-files ", ")) + (launch:test-copy testpath work-area)))) + ;; one more time, change to the work-area directory + (change-directory work-area))) + ) ;; let* + + (if contour (setenv "MT_CONTOUR" contour)) + + ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ... + ;; + (setenv "MT_TESTSUITENAME" areaname) + (setenv "MT_RUN_AREA_HOME" top-path) + (set! *toppath* top-path) + (change-directory *toppath*) ;; temporarily switch to the run area home + (setenv "MT_TEST_RUN_DIR" work-area) + + (launch:setup) ;; should be properly in the run area home now + + (if contour (setenv "MT_CONTOUR" contour)) + + ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ... + ;; + (setenv "MT_TESTSUITENAME" areaname) + (setenv "MT_RUN_AREA_HOME" top-path) + (set! *toppath* top-path) + (change-directory *toppath*) ;; temporarily switch to the run area home + (setenv "MT_TEST_RUN_DIR" work-area) + + (launch:setup) ;; should be properly in the run area home now + + (set! tconfigreg (tests:get-all)) ;; mapping of testname => test source path + (let ((sighand (lambda (signum) + ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting + (if (eq? signum signal/stop) + (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting.")) + (set! *time-to-exit* #t) + (print "Received signal " signum ", cleaning up before exit (set this test to COMPLETED/ABORT) . Please wait...") + (let ((th1 (make-thread (lambda () + (print "set test to COMPLETED/ABORT begin.") + (rmt:test-set-state-status run-id test-id "COMPLETED" "ABORT" "received kill signal") + (print "set test to COMPLETED/ABORT complete.") + (print "Killed by signal " signum ". Exiting") + (exit 1)))) + (th2 (make-thread (lambda () + (thread-sleep! 20) + (debug:print 0 *default-log-port* "Done") + (exit 4))))) + (thread-start! th2) + (thread-start! th1) + (thread-join! th2))))) + (set-signal-handler! signal/int sighand) + (set-signal-handler! signal/term sighand) + ) ;; (set-signal-handler! signal/stop sighand) + + ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, + ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* + ;; + (let* ((test-info (rmt:get-test-info-by-id run-id test-id)) + (test-host (if test-info + (db:test-get-host test-info) + (begin + (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") + (exit)))) + (test-pid (db:test-get-process_id test-info))) + (cond + ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. + ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun + (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") + ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") + + (rmt:general-call 'set-test-start-time #f test-id) + (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) + ) ;; prime it for running + ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART")) + (if (process:alive-on-host? test-host test-pid) + (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed") + (exit))) + ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) + ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") + (rmt:general-call 'set-test-start-time #f test-id) + (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) + ) + (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) + (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") + (exit)))) + + ;; cleanup prior execution's steps + (rmt:delete-steps-for-test! run-id test-id) + + (debug:print 2 *default-log-port* "Executing " test-name " (id: " test-id ") on " (get-host-name)) + (set! keys (rmt:get-keys)) + ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process + ;; one of these is defunct/redundant ... + (if (not (launch:setup force-reread: #t)) + (begin + (debug:print 0 *default-log-port* "Failed to setup, exiting") + ;; (sqlite3:finalize! db) + ;; (sqlite3:finalize! tdb) + (exit 1))) + ;; validate that the test run area is available + (check-work-area) + + ;; still need to go back to run area home for next couple steps + (change-directory *toppath*) + + ;; NOTE: Current order is to process runconfigs *before* setting the MT_ vars. This + ;; seems non-ideal but could well break stuff + ;; BUG? BUG? BUG? + + (let ((rconfig (full-runconfigs-read)) ;; (read-config (conc *toppath* "/runconfigs.config") #f #t sections: (list "default" target)))) + (wconfig (configf:read-config "waivers.config" #f #t sections: `( "default" ,target )))) ;; read the waivers config if it exists + ;; (setup-env-defaults (conc *toppath* "/runconfigs.config") run-id (make-hash-table) keyvals target) + ;; (set-run-config-vars run-id keyvals target) ;; (db:get-target db run-id)) + ;; Now have runconfigs data loaded, set environment vars + (for-each + (lambda (section) + (for-each + (lambda (varval) + (let ((var (car varval)) + (val (cadr varval))) + (if (and (string? var)(string? val)) + (begin + (safe-setenv var (configf:eval-string-in-environment val))) ;; val) + (debug:print-error 0 *default-log-port* "bad variable spec, " var "=" val)))) + (configf:get-section rconfig section))) + (list "default" target))) + ;;(bb-check-path msg: "launch:execute post block 1") + + ;; NFS might not have propagated the directory meta data to the run host - give it time if needed + (let loop ((count 0)) + (if (or (common:file-exists? work-area) + (> count 10)) + (change-directory work-area) + (begin + (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " work-area " not found") + (thread-sleep! 10) + (loop (+ count 1))))) + + ;; now we can switch to the work-area? + (change-directory work-area) + ;;(bb-check-path msg: "launch:execute post block 1.5") + ;; (change-directory work-area) + (set! keyvals (keys:target->keyval keys target)) + ;; apply pre-overrides before other variables. The pre-override vars must not + ;; clobbers things from the official sources such as megatest.config and runconfigs.config + (if (string? set-vars) + (let ((varpairs (string-split set-vars ","))) + (debug:print 4 *default-log-port* "varpairs: " varpairs) + (map (lambda (varpair) + (let ((varval (string-split varpair "="))) + (if (eq? (length varval) 2) + (let ((var (car varval)) + (val (cadr varval))) + (debug:print 1 *default-log-port* "Adding pre-var/val " var " = " val " to the environment") + (setenv var val))))) + varpairs))) + ;;(bb-check-path msg: "launch:execute post block 2") + (for-each + (lambda (varval) + (let ((var (car varval)) + (val (cadr varval))) + (if val + (setenv var val) + (begin + (debug:print-error 0 *default-log-port* "required variable " var " does not have a valid value. Exiting") + (exit))))) + (list + (list "MT_TEST_RUN_DIR" work-area) + (list "MT_TEST_NAME" test-name) + (list "MT_ITEM_INFO" (conc itemdat)) + (list "MT_ITEMPATH" item-path) + (list "MT_RUNNAME" runname) + (list "MT_MEGATEST" megatest) + (list "MT_TARGET" target) + (list "MT_LINKTREE" (common:get-linktree)) ;; (configf:lookup *configdat* "setup" "linktree")) + (list "MT_TESTSUITENAME" (common:get-testsuite-name)))) + ;;(bb-check-path msg: "launch:execute post block 3") + + (if mt-bindir-path (setenv "PATH" (conc (getenv "PATH") ":" mt-bindir-path))) + ;;(bb-check-path msg: "launch:execute post block 4") + ;; (change-directory top-path) + ;; Can setup as client for server mode now + ;; (client:setup) + + + ;; environment overrides are done *before* the remaining critical envars. + (alist->env-vars env-ovrd) + ;;(bb-check-path msg: "launch:execute post block 41") + (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) + ;;(bb-check-path msg: "launch:execute post block 42") + (set-item-env-vars itemdat) + ;;(bb-check-path msg: "launch:execute post block 43") + (let ((blacklist (configf:lookup *configdat* "setup" "blacklistvars"))) + (if blacklist + (let ((vars (string-split blacklist))) + (save-environment-as-files "megatest" ignorevars: vars) + (for-each (lambda (var) + (unsetenv var)) + vars)) + (save-environment-as-files "megatest"))) + ;;(bb-check-path msg: "launch:execute post block 44") + ;; open-run-close not needed for test-set-meta-info + ;; (tests:set-full-meta-info #f test-id run-id 0 work-area) + ;; (tests:set-full-meta-info test-id run-id 0 work-area) + (tests:set-full-meta-info #f test-id run-id 0 work-area 10) + + ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here + + (if (args:get-arg "-xterm") + (set! fullrunscript "xterm") + (if (and fullrunscript + (common:file-exists? fullrunscript) + (not (file-execute-access? fullrunscript))) + (system (conc "chmod ug+x " fullrunscript)))) + + ;; We are about to actually kick off the test + ;; so this is a good place to remove the records for + ;; any previous runs + ;; (db:test-remove-steps db run-id testname itemdat) + ;; now is also a good time to write the .testconfig file + (let* ((tconfig-fname (conc work-area "/.testconfig")) + (tconfig-tmpfile (conc tconfig-fname ".tmp")) + (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t))) ;; 'return-procs))) + (configf:write-alist tconfig tconfig-tmpfile) + (file-move tconfig-tmpfile tconfig-fname #t)) + ;; + (let* ((m (make-mutex)) + (kill-job? #f) + (exit-info (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status + (job-thread #f) + ;; (keep-going #t) + (misc-flags (let ((ht (make-hash-table))) + (hash-table-set! ht 'keep-going #t) + ht)) + (runit (lambda () + (launch:manage-steps run-id test-id item-path fullrunscript ezsteps subrun test-name tconfigreg exit-info m))) + (monitorjob (lambda () + (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags))) + (th1 (make-thread monitorjob "monitor job")) + (th2 (make-thread runit "run job"))) + (set! job-thread th2) + (thread-start! th1) + (thread-start! th2) + (thread-join! th2) + (debug:print-info 0 *default-log-port* "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...") + (hash-table-set! misc-flags 'keep-going #f) + (thread-join! th1) + (thread-sleep! 1) ;; givbe thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. + (mutex-lock! m) + (let* ((item-path (item-list->path itemdat)) + ;; only state and status needed - use lazy routine + (testinfo (rmt:get-testinfo-state-status run-id test-id))) + ;; Am I completed? + (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should *not* be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED")) + (let ((new-state (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status + ;; "COMPLETED" ;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test + ) + (new-status (cond + ((not (launch:einf-exit-status exit-info)) "FAIL") ;; job failed to run ... (vector-ref exit-info 1) + ((eq? (launch:einf-rollup-status exit-info) 0) ;; (vector-ref exit-info 3) + ;; if the current status is AUTO then defer to the calculated value (i.e. leave this AUTO) + (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO" "PASS")) + ((eq? (launch:einf-rollup-status exit-info) 1) "FAIL") ;; (vector-ref exit-info 3) + ((eq? (launch:einf-rollup-status exit-info) 2) ;; (vector-ref exit-info 3) + ;; if the current status is AUTO the defer to the calculated value but qualify (i.e. make this AUTO-WARN) + (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN")) + ((eq? (launch:einf-rollup-status exit-info) 3) "CHECK") + ((eq? (launch:einf-rollup-status exit-info) 4) "WAIVED") + ((eq? (launch:einf-rollup-status exit-info) 5) "ABORT") + ((eq? (launch:einf-rollup-status exit-info) 6) "SKIP") + (else "FAIL")))) ;; (db:test-get-status testinfo))) + (debug:print-info 1 *default-log-port* "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info)) + (tests:test-set-status! run-id + test-id + new-state + new-status + (args:get-arg "-m") #f) + ;; need to update the top test record if PASS or FAIL and this is a subtest + ;; NO NEED TO CALL set-state-status-and-roll-up-items HERE, THIS IS DONE IN set-state-status-and-roll-up-items called by tests:test-set-status! + )) + ;; for automated creation of the rollup html file this is a good place... + (if (not (equal? item-path "")) + (tests:summarize-items run-id test-id test-name #f)) + (tests:summarize-test run-id test-id) ;; don't force - just update if no + (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) + (mutex-unlock! m) + (launch:end-of-run-check run-id ) + (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " + work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n") + (if (not (launch:einf-exit-status exit-info)) + (exit 4)))) + ))) + +(define (launch:is-test-alive host pid) +(if (and host pid (not (equal? host "n/a"))) +(let* ((cmd (conc "ssh " host " pstree -A " pid)) + (output (with-input-from-pipe cmd read-lines))) + (print "cmd: " cmd "\n op: " output ) + (if(eq? (length output) 0) + #f + #t)) +#t)) + +(define (launch:kill-tests-if-dead run-id) + (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f))) + (let loop ((running-test (car running-tests)) + (tal (cdr running-tests)) + (kill-cnt 0)) + (let* ((test-name (vector-ref running-test 2)) + (item-path (vector-ref running-test 11)) + (test-id (vector-ref running-test 0)) + (host (vector-ref running-test 6)) + (pid (rmt:test-get-top-process-pid run-id test-id)) + (event-time (vector-ref running-test 5)) + (duration (vector-ref running-test 12)) + (flag 0) + (curr-time (current-seconds))) + (if (and (< (+ event-time duration 600) curr-time) (not (launch:is-test-alive host pid))) ;;test has not updated duration in last 10 min then likely its not running but confirm before marking it as killed + (begin + (debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed") + (set! flag 1) + (rmt:set-state-status-and-roll-up-items run-id test-name item-path "KILLREQ" "n/a" #f))) + (if (not (null? tal)) + (loop (car tal) (cdr tal) (+ kill-cnt flag)) + (+ kill-cnt flag)))))) + +;; DO NOT USE - caching of configs is handled in launch:setup now. +;; +(define (launch:cache-config) + ;; if we have a linktree and -runtests and -target and the directory exists dump the config + ;; to megatest-(current-seconds).cfg and symlink it to megatest.cfg + (if (and *configdat* + (or (args:get-arg "-run") + (args:get-arg "-runtests") + (args:get-arg "-execute"))) + (let* ((linktree (common:get-linktree)) ;; (get-environment-variable "MT_LINKTREE")) + (target (common:args-get-target exit-if-bad: #t)) + (runname (or (args:get-arg "-runname") + (args:get-arg ":runname") + (getenv "MT_RUNNAME"))) + (fulldir (conc linktree "/" + target "/" + runname))) + (if (and linktree (common:file-exists? linktree)) ;; can't proceed without linktree + (begin + (debug:print-info 0 *default-log-port* "Have -run with target=" target ", runname=" runname ", fulldir=" fulldir ", testpatt=" (or (args:get-arg "-testpatt") "%")) + (if (not (common:file-exists? fulldir)) + (create-directory fulldir #t)) ;; need to protect with exception handler + (if (and target + runname + (common:file-exists? fulldir)) + (let ((tmpfile (conc fulldir "/.megatest.cfg." (current-seconds))) + (targfile (conc fulldir "/.megatest.cfg-" megatest-version "-" megatest-fossil-hash)) + (rconfig (conc fulldir "/.runconfig." megatest-version "-" megatest-fossil-hash))) + (if (common:file-exists? rconfig) ;; only cache megatest.config AFTER runconfigs has been cached + (begin + (debug:print-info 0 *default-log-port* "Caching megatest.config in " tmpfile) + (if (not (common:in-running-test?)) + (configf:write-alist *configdat* tmpfile)) + (system (conc "ln -sf " tmpfile " " targfile)))) + ))) + (debug:print-info 1 *default-log-port* "No linktree yet, no caching configs."))))) + + +;; gather available information, if legit read configs in this order: +;; +;; if have cache; +;; read it a return it +;; else +;; megatest.config (do not cache) +;; runconfigs.config (cache if all vars avail) +;; megatest.config (cache if all vars avail) +;; returns: +;; *toppath* +;; side effects: +;; sets; *configdat* (megatest.config info) +;; *runconfigdat* (runconfigs.config info) +;; *configstatus* (status of the read data) +;; +(define (launch:setup #!key (force-reread #f) (areapath #f)) + (mutex-lock! *launch-setup-mutex*) + (if (and *toppath* + (eq? *configstatus* 'fulldata) (not force-reread)) ;; got it all + (begin + (debug:print 2 *default-log-port* "NOTE: skipping launch:setup-body call since we have fulldata") + (mutex-unlock! *launch-setup-mutex*) + *toppath*) + (let ((res (launch:setup-body force-reread: force-reread areapath: areapath))) + (mutex-unlock! *launch-setup-mutex*) + res))) + +;; return paths depending on what info is available. +;; +(define (launch:get-cache-file-paths areapath toppath target mtconfig) + (let* ((use-cache (common:use-cache?)) + (runname (common:args-get-runname)) + (linktree (common:get-linktree)) + (testname (common:get-full-test-name)) + (rundir (if (and runname target linktree) + (common:directory-writable? (conc linktree "/" target "/" runname)) + #f)) + (testdir (if (and rundir testname) + (common:directory-writable? (conc rundir "/" testname)) + #f)) + (cachedir (or testdir rundir)) + (mtcachef (and cachedir (conc cachedir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) + (rccachef (and cachedir (conc cachedir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash)))) + (debug:print-info 6 *default-log-port* + "runname=" runname + "\n linktree=" linktree + "\n testname=" testname + "\n rundir=" rundir + "\n testdir=" testdir + "\n cachedir=" cachedir + "\n mtcachef=" mtcachef + "\n rccachef=" rccachef) + (cons mtcachef rccachef))) + +(define (launch:setup-body #!key (force-reread #f) (areapath #f)) + (if (and (eq? *configstatus* 'fulldata) + *toppath* + (not force-reread)) ;; no need to reprocess + *toppath* ;; return toppath + (let* ((use-cache (common:use-cache?)) ;; BB- use-cache checks *configdat* for use-cache setting. We do not have *configdat*. Bootstrapping problem here. + (toppath (or *toppath* areapath (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath + (target (common:args-get-target)) + (sections (if target (list "default" target) #f)) ;; for runconfigs + (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config + (cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) + ;; checking for null cachefiles should not be necessary, I was seeing error car of '(), might be a chicken bug or a red herring ... + (mtcachef (if (null? cachefiles) + #f + (car cachefiles))) ;; (and cachedir (conc cachedir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) + (rccachef (if (null? cachefiles) + #f + (cdr cachefiles)))) ;; (and cachedir (conc cachedir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) + ;; (cancreate (and cachedir (common:file-exists? cachedir)(file-write-access? cachedir) (not (common:in-running-test?))))) + (set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource + ;;(BB> "launch:setup-body -- cachefiles="cachefiles) + (cond + ;; if mtcachef exists just read it, however we need to assume toppath is available in $MT_RUN_AREA_HOME + ((and (not force-reread) + mtcachef rccachef + use-cache + (get-environment-variable "MT_RUN_AREA_HOME") + (common:file-exists? mtcachef) + (common:file-exists? rccachef)) + ;;(BB> "launch:setup-body -- cond branch 1 - use-cache") + (set! *configdat* (configf:read-alist mtcachef)) + ;;(BB> "launch:setup-body -- 1 set! *configdat*="*configdat*) + (set! *runconfigdat* (configf:read-alist rccachef)) + (set! *configinfo* (list *configdat* (get-environment-variable "MT_RUN_AREA_HOME"))) + (set! *configstatus* 'fulldata) + (set! *toppath* (get-environment-variable "MT_RUN_AREA_HOME")) + *toppath*) + ;; there are no existing cached configs, do full reads of the configs and cache them + ;; we have all the info needed to fully process runconfigs and megatest.config + ((and ;; (not force-reread) ;; force-reread is irrelevant in the AND, could however OR it? + mtcachef + rccachef) ;; BB- why are we doing this without asking if caching is desired? + ;;(BB> "launch:setup-body -- cond branch 2") + (let* ((first-pass (configf:find-and-read-config ;; NB// sets MT_RUN_AREA_HOME as side effect + mtconfig + environ-patt: "env-override" + given-toppath: toppath + pathenvvar: "MT_RUN_AREA_HOME")) + (first-rundat (let ((toppath (if toppath + toppath + (car first-pass)))) + (configf:read-config ;; (conc toppath "/runconfigs.config") ;; this should be converted to runconfig:read but it is non-trivial, leaving it for now. + (conc (if (string? toppath) + toppath + (get-environment-variable "MT_RUN_AREA_HOME")) + "/runconfigs.config") + *runconfigdat* #t + sections: sections)))) + (set! *runconfigdat* first-rundat) + (if first-pass ;; + (begin + ;;(BB> "launch:setup-body -- \"first-pass\"=first-pass") + (set! *configdat* (car first-pass)) + ;;(BB> "launch:setup-body -- 2 set! *configdat*="*configdat*) + (set! *configinfo* first-pass) + (set! *toppath* (or toppath (cadr first-pass))) ;; use the gathered data unless already have it + (set! toppath *toppath*) + (if (not *toppath*) + (begin + (debug:print-error 0 *default-log-port* "you are not in a megatest area!") + (exit 1))) + (setenv "MT_RUN_AREA_HOME" *toppath*) + ;; the seed read is done, now read runconfigs, cache it then read megatest.config one more time and cache it + (let* ((keys (rmt:get-keys)) + (key-vals (keys:target->keyval keys target)) + (linktree (common:get-linktree)) ;; (or (getenv "MT_LINKTREE")(if *configdat* (configf:lookup *configdat* "setup" "linktree") #f))) + ; (if *configdat* + ; (configf:lookup *configdat* "setup" "linktree") + ; (conc *toppath* "/lt")))) + (second-pass (configf:find-and-read-config + mtconfig + environ-patt: "env-override" + given-toppath: toppath + pathenvvar: "MT_RUN_AREA_HOME")) + (runconfigdat (begin ;; this read of the runconfigs will see any adjustments made by re-reading megatest.config + (for-each (lambda (kt) + (setenv (car kt) (cadr kt))) + key-vals) + (configf:read-config (conc toppath "/runconfigs.config") *runconfigdat* #t ;; consider using runconfig:read some day ... + sections: sections))) + (cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) + (mtcachef (car cachefiles)) + (rccachef (cdr cachefiles))) + ;; trap exception due to stale NFS handle -- Error: (open-output-file) cannot open file - Stale NFS file handle: "/p/fdk/gwa/lefkowit/mtTesting/qa/primbeqa/links/p1222/11/PDK_r1.1.1/prim/clean/pcell_testgen/.runconfigs.cfg-1.6427-7d1e789cb3f62f9cde719a4865bb51b3c17ea853" - ticket 220546342 + ;; TODO - consider 1) using simple-lock to bracket cache write + ;; 2) cache in hash on server, since need to do rmt: anyway to lock. + + (if rccachef + (common:fail-safe + (lambda () + (configf:write-alist runconfigdat rccachef)) + (conc "Could not write cache file - "rccachef))) + (if mtcachef + (common:fail-safe + (lambda () + (configf:write-alist *configdat* mtcachef)) + (conc "Could not write cache file - "mtcachef))) + (set! *runconfigdat* runconfigdat) + (if (and rccachef mtcachef) (set! *configstatus* 'fulldata)))) + ;; no configs found? should not happen but let's try to recover gracefully, return an empty hash-table + (set! *configdat* (make-hash-table)) + ))) + + ;; else read what you can and set the flag accordingly + ;; here we don't have either mtconfig or rccachef + (else + ;;(BB> "launch:setup-body -- cond branch 3 - else") + (let* ((cfgdat (configf:find-and-read-config + (or (args:get-arg "-config") "megatest.config") + environ-patt: "env-override" + given-toppath: (get-environment-variable "MT_RUN_AREA_HOME") + pathenvvar: "MT_RUN_AREA_HOME"))) + + (if (and cfgdat (list? cfgdat) (> (length cfgdat) 0) (hash-table? (car cfgdat))) + (let* ((toppath (or (get-environment-variable "MT_RUN_AREA_HOME")(cadr cfgdat))) + (rdat (configf:read-config (conc toppath ;; convert this to use runconfig:read! + "/runconfigs.config") *runconfigdat* #t sections: sections))) + (set! *configinfo* cfgdat) + (set! *configdat* (car cfgdat)) + (set! *runconfigdat* rdat) + (set! *toppath* toppath) + (set! *configstatus* 'partial)) + (begin + (debug:print-error 0 *default-log-port* "No " mtconfig " file found. Giving up.") + (exit 2)))))) + ;; COND ends here. + + ;; additional house keeping + (let* ((linktree (or (common:get-linktree) + (conc *toppath* "/lt")))) + (if linktree + (begin + (if (not (common:file-exists? linktree)) + (begin + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* "Something went wrong when trying to create linktree dir at " linktree) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (exit 1)) + (create-directory linktree #t)))) + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* "Something went wrong when trying to create link to linktree at " *toppath*) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) + (let ((tlink (conc *toppath* "/lt"))) + (if (not (common:file-exists? tlink)) + (create-symbolic-link linktree tlink))))) + (begin + (debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config") + ))) + (if (and *toppath* + (directory-exists? *toppath*)) + (begin + (setenv "MT_RUN_AREA_HOME" *toppath*) + (setenv "MT_TESTSUITENAME" (common:get-testsuite-name))) + (begin + (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.") + (set! *toppath* #f) ;; force it to be false so we return #f + #f)) + + ;; one more attempt to cache the configs for future reading + (let* ((cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) + (mtcachef (car cachefiles)) + (rccachef (cdr cachefiles))) + + ;; trap exception due to stale NFS handle -- Error: (open-output-file) cannot open file - Stale NFS file handle: "...somepath.../.runconfigs.cfg-1.6427-7d1e789cb3f62f9cde719a4865bb51b3c17ea853" - ticket 220546342 + ;; TODO - consider 1) using simple-lock to bracket cache write + ;; 2) cache in hash on server, since need to do rmt: anyway to lock. + (if (and rccachef *runconfigdat* (not (common:file-exists? rccachef))) + (common:fail-safe + (lambda () + (configf:write-alist *runconfigdat* rccachef)) + (conc "Could not write cache file - "rccachef)) + ) + (if (and mtcachef *configdat* (not (common:file-exists? mtcachef))) + (common:fail-safe + (lambda () + (configf:write-alist *configdat* mtcachef)) + (conc "Could not write cache file - "mtcachef)) + ) + (if (and rccachef mtcachef *runconfigdat* *configdat*) + (set! *configstatus* 'fulldata))) + + ;; if have -append-config then read and append here + (let ((cfname (args:get-arg "-append-config"))) + (if (and cfname + (file-read-access? cfname)) + (configf:read-config cfname *configdat* #t))) ;; values are added to the hash, no need to do anything special. + *toppath*))) + +(define (get-best-disk confdat testconfig) + (let* ((disks (or (and testconfig (hash-table-ref/default testconfig "disks" #f)) + (hash-table-ref/default confdat "disks" #f))) + (minspace (let ((m (configf:lookup confdat "setup" "minspace"))) + (string->number (or m "10000"))))) + (if disks + (let ((res (common:get-disk-with-most-free-space disks minspace))) ;; min size of 1000, seems tad dumb + (if res + (cdr res) + (begin +;; (if (common:low-noise-print 20 "No valid disks or no disk with enough space") +;; (debug:print-error 0 *default-log-port* "No valid disks found in megatest.config. Please add some to your [disks] section and ensure the directory exists and has enough space!\n You can change minspace in the [setup] section of megatest.config. Current setting is: " minspace)) + ;;(exit 1) + (if (null? disks) + (cons 1 (conc *toppath* "/runs")) + (let ((paths (sort disks (lambda (x y) (> (string-length (cadr x)) (string-length (cadr y))))))) + (let loop ((head (car paths)) (tail (cdr paths))) + (let ((result (handle-exceptions exn #f (create-directory (cadr head) #t)))) + (if result + result + (if (null? tail) + (cons 1 (conc *toppath* "/runs")) + (loop (car tail) (cdr tail)))))))))))))) ;; the code creates the necessary directories if it does not exist and returns the path. + + +(define (launch:test-copy test-src-path test-path) + (let* ((ovrcmd (let ((cmd (configf:lookup *configdat* "setup" "testcopycmd"))) + (if cmd + ;; substitute the TEST_SRC_PATH and TEST_TARG_PATH + (string-substitute "TEST_TARG_PATH" test-path + (string-substitute "TEST_SRC_PATH" test-src-path cmd #t) #t) + #f))) + (cmd (if ovrcmd + ovrcmd + (conc "rsync -av" (if (debug:debug-mode 1) "" "q") " " test-src-path "/ " test-path "/" + " >> " test-path "/mt_launch.log 2>> " test-path "/mt_launch.log"))) + (status (system cmd))) + (if (not (eq? status 0)) + (debug:print 2 *default-log-port* "ERROR: problem with running \"" cmd "\"")))) + + +;; Desired directory structure: +;; +;; - - -. +;; | +;; v +;; - - -|- +;; +;; dir stored in test is: +;; +;; - - [ - ] +;; +;; All log file links should be stored relative to the top of link path +;; +;; - [ - ] +;; +(define (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat #!key (remtries 2)) + (let* ((item-path (if (string? itemdat) itemdat (item-list->path itemdat))) ;; if pass in string - just use it + (runname (if (string? run-info) ;; if we pass in a string as run-info use it as run-name. + run-info + (db:get-value-by-header (db:get-rows run-info) + (db:get-header run-info) + "runname"))) + (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour")) + ;; convert back to db: from rdb: - this is always run at server end + (target (string-intersperse (map cadr keyvals) "/")) + + (not-iterated (equal? "" item-path)) + + ;; all tests are found at /test-base or /test-base + (testtop-base (conc target "/" runname "/" testname)) + (test-base (conc testtop-base (if not-iterated "" "/") item-path)) + + ;; nb// if itempath is not "" then it is prefixed with "/" + (toptest-path (conc disk-path (if contour (conc "/" contour) "") "/" testtop-base)) + (test-path (conc disk-path (if contour (conc "/" contour) "") "/" test-base)) + + ;; ensure this exists first as links to subtests must be created there + (linktree (common:get-linktree)) + ;; WAS: (let ((rd (configf:lookup *configdat* "setup" "linktree"))) + ;; (if rd rd (conc *toppath* "/runs")))) + ;; which seems wrong ... + + (lnkbase (conc linktree (if contour (conc "/" contour) "") "/" target "/" runname)) + (lnkpath (conc lnkbase "/" testname)) + (lnkpathf (conc lnkpath (if not-iterated "" "/") item-path)) + (lnktarget (conc lnkpath "/" item-path))) + + ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical + ;; rundir shortdir + (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path run-id) + + (debug:print 2 *default-log-port* "INFO:\n lnkbase=" lnkbase "\n lnkpath=" lnkpath "\n toptest-path=" toptest-path "\n test-path=" test-path) + (if (not (common:file-exists? linktree)) + (begin + (debug:print 0 *default-log-port* "WARNING: linktree did not exist! Creating it now at " linktree) + (create-directory linktree #t))) ;; (system (conc "mkdir -p " linktree)))) + ;; create the directory for the tests dir links, this is needed no matter what... try up to three times + (let loop ((done 3)) + (let ((success (if (and (not (common:directory-exists? lnkbase)) + (not (common:file-exists? lnkbase))) + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* "Problem creating linktree base at " lnkbase) + (print-error-message exn (current-error-port)) + #t) + (create-directory lnkbase #t) + #f)))) + (if (and (not success)(> done 0)) + (loop (- done 1))))) + + ;; update the toptest record with its location rundir, cache the path + ;; This wass highly inefficient, one db write for every subtest, potentially + ;; thousands of unnecessary updates, cache the fact it was set and don't set it + ;; again. + + ;; Now create the link from the test path to the link tree, however + ;; if the test is iterated it is necessary to create the parent path + ;; to the iteration. use pathname-directory to trim the path by one + ;; level + (if (not not-iterated) ;; i.e. iterated + (let ((iterated-parent (pathname-directory (conc lnkpath "/" item-path)))) + (debug:print-info 2 *default-log-port* "Creating iterated parent " iterated-parent) + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* " Failed to create directory " iterated-parent ((condition-property-accessor 'exn 'message) exn) ", exiting") + (exit 1)) + (create-directory iterated-parent #t)))) + + (if (symbolic-link? lnkpath) + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* " Failed to remove symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", exiting") + (exit 1)) + (delete-file lnkpath))) + + (if (not (or (common:file-exists? lnkpath) + (symbolic-link? lnkpath))) + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* " Failed to create symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", exiting") + (exit 1)) + (create-symbolic-link toptest-path lnkpath))) + + ;; NB - This was not working right - some top tests are not getting the path set!!! + ;; + ;; Do the setting of this record after the paths are created so that the shortdir can + ;; be set to the real directory location. This is safer for future clean up if the link + ;; tree is damaged or lost. + ;; + (if (not (hash-table-ref/default *toptest-paths* testname #f)) + (let* ((testinfo (rmt:get-test-info-by-id run-id test-id)) ;; run-id testname item-path)) + (curr-test-path (if testinfo ;; (filedb:get-path *fdb* + ;; (db:get-path dbstruct + ;; (rmt:sdb-qry 'getstr + (db:test-get-rundir testinfo) ;; ) ;; ) + #f))) + (hash-table-set! *toptest-paths* testname curr-test-path) + ;; NB// Was this for the test or for the parent in an iterated test? + (rmt:general-call 'test-set-rundir-shortdir run-id lnkpath + (if (common:file-exists? lnkpath) + ;; (resolve-pathname lnkpath) + (common:nice-path lnkpath) + lnkpath) + testname "" run-id) + ;; (rmt:general-call 'test-set-rundir run-id lnkpath testname "") ;; toptest-path) + (if (or (not curr-test-path) + (not (directory-exists? toptest-path))) + (begin + (debug:print-info 2 *default-log-port* "Creating " toptest-path " and link " lnkpath) + (handle-exceptions + exn + #f ;; don't care to catch and deal with errors here for now. + (create-directory toptest-path #t)) + (hash-table-set! *toptest-paths* testname toptest-path))))) + + ;; The toptest path has been created, the link to the test in the linktree has + ;; been created. Now, if this is an iterated test the real test dir must be created + (if (not not-iterated) ;; this is an iterated test + (begin ;; (let ((lnktarget (conc lnkpath "/" item-path))) + (debug:print 2 *default-log-port* "Setting up sub test run area") + (debug:print 2 *default-log-port* " - creating run area in " test-path) + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ", exiting") + (exit 1)) + (create-directory test-path #t)) + (debug:print 2 *default-log-port* + " - creating link from: " test-path "\n" + " to: " lnktarget) + + ;; If there is already a symlink delete it and recreate it. + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* " Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting") + (exit)) + (if (symbolic-link? lnktarget) (delete-file lnktarget)) + (if (not (common:file-exists? lnktarget)) (create-symbolic-link test-path lnktarget))))) + + (if (not (directory? test-path)) + (create-directory test-path #t)) ;; this is a hack, I don't know why out of the blue this path does not exist sometimes + + (if (and test-src-path (directory? test-path)) + (begin + (launch:test-copy test-src-path test-path) + (list lnkpathf lnkpath )) + (if (and test-src-path (> remtries 0)) + (begin + (debug:print-error 0 *default-log-port* "Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) + ;; + (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1))) + (list #f #f))))) + + +(define (launch:handle-zombie-tests run-id) + (let* ((key (conc "zombiescan-runid-"run-id)) + (now (current-seconds)) + (threshold (- (current-seconds) (* 2 (or (configf:lookup-number *configdat* "setup" "deadtime") 120)))) + (val (rmt:get-var key)) + (do-scan? + (cond + ((not val) + #t) + ((< val threshold) + #t) + (else #f)))) + (when do-scan? + (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") + (rmt:set-var key (current-seconds)) + (rmt:find-and-mark-incomplete run-id #f)))) + + + + + +;; 1. look though disks list for disk with most space +;; 2. create run dir on disk, path name is meaningful +;; 3. create link from run dir to megatest runs area +;; 4. remotely run the test on allocated host +;; - could be ssh to host from hosts table (update regularly with load) +;; - could be netbatch +;; (launch-test db (cadr status) test-conf)) +(define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) + (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex + (let* ( ;; (lock-key (conc "test-" test-id)) + ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) + ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds + ;; (if (car lock) + ;; #t + ;; (if (> (current-seconds) expire-time) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path) + ;; (rmt:no-sync-del! lock-key) ;; destroy the lock + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; + ;; (begin + ;; (thread-sleep! 1) + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) + (item-path (item-list->path itemdat)) + (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) + (let loop ((delta (- (current-seconds) *last-launch*)) + (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1))) + (if (> launch-delay delta) + (begin + (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. + (debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds")) + (thread-sleep! (- launch-delay delta)) + (loop (- (current-seconds) *last-launch*) launch-delay)))) + (change-directory *toppath*) + (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars) + (append + (list + (list "MT_RUN_AREA_HOME" *toppath*) + (list "MT_TEST_NAME" test-name) + (list "MT_RUNNAME" runname) + (list "MT_ITEMPATH" item-path) + (list "MT_CONTOUR" contour) + ) + itemdat)) + (let* ((tregistry (tests:get-all)) ;; third param (below) is system-allowed + ;; for tconfig, why do we allow fallback to test-conf? + (tconfig (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t) + (begin + (debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. This is likely not desired.") + test-conf))) ;; force re-read now that all vars are set + (useshell (let ((ush (configf:lookup *configdat* "jobtools" "useshell"))) + (if ush + (if (equal? ush "no") ;; must use "no" to NOT use shell + #f + ush) + #t))) ;; default is yes + (runscript (configf:lookup tconfig "setup" "runscript")) + (ezsteps (> (length (hash-table-ref/default tconfig "ezsteps" '())) 0)) ;; don't send all the steps, could be big, just send a flag + (subrun (> (length (hash-table-ref/default tconfig "subrun" '())) 0)) ;; send a flag to process a subrun + ;; (diskspace (configf:lookup tconfig "requirements" "diskspace")) + ;; (memory (configf:lookup tconfig "requirements" "memory")) + ;; (hosts (configf:lookup *configdat* "jobtools" "workhosts")) ;; I'm pretty sure this was never completed + (remote-megatest (configf:lookup *configdat* "setup" "executable")) + (run-time-limit (or (configf:lookup tconfig "requirements" "runtimelim") + (configf:lookup *configdat* "setup" "runtimelim"))) + ;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to + ;; allow running from dashboard. Extract the path + ;; from the called megatest and convert dashboard + ;; or dboard to megatest + (local-megatest (let* ((lm (car (argv))) + (dir (pathname-directory lm)) + (exe (pathname-strip-directory lm))) + (conc (if dir (conc dir "/") "") + (case (string->symbol exe) + ((dboard) "../megatest") + ((mtest) "../megatest") + ((dashboard) "megatest") + (else exe))))) + (launcher (common:get-launcher *configdat* test-name item-path)) ;; (configf:lookup *configdat* "jobtools" "launcher")) + (test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path + (work-area #f) + (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all + (diskpath #f) + (cmdparms #f) + (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) + (mt-bindir-path #f) + (testinfo (rmt:get-test-info-by-id run-id test-id)) + (mt_target (string-intersperse (map cadr keyvals) "/")) + (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) + (if (args:get-arg "-logging")(list "-logging") '())))) + ;; (if hosts (set! hosts (string-split hosts))) + ;; set the megatest to be called on the remote host + (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest")) + (set! mt-bindir-path (pathname-directory remote-megatest)) + (if launcher (set! launcher (string-split launcher))) + ;; set up the run work area for this test + (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run + (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir + (begin + (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) + (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record + + ;; prevent overlapping actions - set to LAUNCHED as early as possible + ;; + ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail + (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) + (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f) + ;; (pp (hash-table->alist tconfig)) + (set! diskpath (get-best-disk *configdat* tconfig)) + (if diskpath + (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) + (set! work-area (car dat)) + (set! toptest-work-area (cadr dat)) + (debug:print-info 2 *default-log-port* "Using work area " work-area)) + (begin + (set! work-area (conc test-path "/tmp_run")) + (create-directory work-area #t) + (debug:print 0 *default-log-port* "WARNING: No disk work area specified - running in the test directory under tmp_run"))) + (set! cmdparms (base64:base64-encode + (z3:encode-buffer + (with-output-to-string + (lambda () ;; (list 'hosts hosts) + (write (list (list 'testpath test-path) + ;; (list 'transport (conc *transport-type*)) + ;; (list 'serverinf *server-info*) + (list 'homehost (let* ((hhdat (common:get-homehost))) + (if hhdat + (car hhdat) + #f))) + (list 'serverurl (if *runremote* + (remote-server-url *runremote*) + #f)) ;; + (list 'areaname (common:get-testsuite-name)) + (list 'toppath *toppath*) + (list 'work-area work-area) + (list 'test-name test-name) + (list 'runscript runscript) + (list 'run-id run-id ) + (list 'test-id test-id ) + ;; (list 'item-path item-path ) + (list 'itemdat itemdat ) + (list 'megatest remote-megatest) + (list 'ezsteps ezsteps) + (list 'subrun subrun) + (list 'target mt_target) + (list 'contour contour) + (list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) + (list 'env-ovrd (hash-table-ref/default *configdat* "env-override" '())) + (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) + (list 'runname runname) + (list 'mt-bindir-path mt-bindir-path)))))))) + + ;; clean out step records from previous run if they exist + ;; (rmt:delete-test-step-records run-id test-id) + ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway + (if (common:file-exists? work-area) + (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir + (cond + ;; ((and launcher hosts) ;; must be using ssh hostname + ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) + ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms)))) + (launcher + (set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) + ;; (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms)))) + (else + (if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section")) + (set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" "")))))) + ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" ""))))) + (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm")))) + (debug:print 1 *default-log-port* "Launching " work-area) + ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done + (debug:print 4 *default-log-port* "fullcmd: " fullcmd) + (set! *last-launch* (current-seconds)) ;; all that junk above takes time, set this as late as possible. + (let* ((commonprevvals (alist->env-vars + (hash-table-ref/default *configdat* "env-override" '()))) + (miscprevvals (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute" + (append (list (list "MT_TEST_RUN_DIR" work-area) + (list "MT_TEST_NAME" test-name) + (list "MT_ITEM_INFO" (conc itemdat)) + (list "MT_RUNNAME" runname) + (list "MT_TARGET" mt_target) + (list "MT_ITEMPATH" item-path) + ) + itemdat))) + (testprevvals (alist->env-vars + (hash-table-ref/default tconfig "pre-launch-env-overrides" '()))) + ;; Launchwait defaults to true, must override it to turn off wait + (launchwait (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t)) + (launch-results-prev (apply (if launchwait ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed. + process:cmd-run-with-stderr-and-exitcode->list + process-run) + (if useshell + (let ((cmdstr (string-intersperse fullcmd " "))) + (if launchwait + cmdstr + (conc cmdstr " >> mt_launch.log 2>&1 &"))) + (car fullcmd)) + (if useshell + '() + (cdr fullcmd)))) + (success (if launchwait (equal? 0 (cadr launch-results-prev)) #t)) + (launch-results (if launchwait (car launch-results-prev) launch-results-prev))) + (if (not success) + (tests:test-set-status! run-id test-id "COMPLETED" "DEAD" "launcher failed; exited non-zero; check mt_launch.log" #f)) ;; (if launch-results launch-results "FAILED")) + (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. + ;; (rmt:no-sync-del! lock-key) ;; release the lock for starting this test + (if (not launchwait) ;; give the OS a little time to allow the process to start + (thread-sleep! 0.01)) + (with-output-to-file "mt_launch.log" + (lambda () + (print "LAUNCHCMD: " (string-intersperse fullcmd " ")) + (if (list? launch-results) + (apply print launch-results) + (print "NOTE: launched \"" fullcmd "\"\n but did not wait for it to proceed. Add the following to megatest.config \n[setup]\nlaunchwait yes\n if you have problems with this")) + #:append)) + (debug:print 2 *default-log-port* "Launching completed, updating db") + (debug:print 2 *default-log-port* "Launch results: " launch-results) + (if (not launch-results) + (begin + (print "ERROR: Failed to run " (string-intersperse fullcmd " ") ", exiting now") + ;; (sqlite3:finalize! db) + ;; good ole "exit" seems not to work + ;; (_exit 9) + ;; but this hack will work! Thanks go to Alan Post of the Chicken email list + ;; NB// Is this still needed? Should be safe to go back to "exit" now? + (process-signal (current-process-id) signal/kill) + )) + (alist->env-vars miscprevvals) + (alist->env-vars testprevvals) + (alist->env-vars commonprevvals) + launch-results)) + (change-directory *toppath*))) + +;; recover a test where the top controlling mtest may have died +;; +(define (launch:recover-test run-id test-id) + ;; this function is called on the test run host via ssh + ;; + ;; 1. look at the process from pid + ;; - is it owned by calling user + ;; - it it's run directory correct for the test + ;; - is there a controlling mtest (maybe stuck) + ;; 2. if recovery is needed watch pid + ;; - when it exits take the exit code and do the needful + ;; + (let* ((pid (rmt:test-get-top-process-pid run-id test-id)) + (psres (with-input-from-pipe + (conc "ps -F -u " (current-user-name) " | grep -E '" pid " ' | grep -v 'grep -E " pid "'") + (lambda () + (read-line)))) + (rundir (if (string? psres) ;; real process owned by user + (read-symbolic-link (conc "/proc/" pid "/cwd")) + #f))) + ;; now wait on that process if all is correct + ;; periodically update the db with runtime + ;; when the process exits look at the db, if still RUNNING after 10 seconds set + ;; state/status appropriately + (process-wait pid))) + + +;; Do not rpc this one, do the underlying calls!!! +(define (tests:test-set-status! run-id test-id state status comment dat #!key (work-area #f)) + (let* ((real-status status) + (otherdat (if dat dat (make-hash-table))) + (testdat (rmt:get-test-info-by-id run-id test-id)) + (test-name (db:test-get-testname testdat)) + (item-path (db:test-get-item-path testdat)) + ;; before proceeding we must find out if the previous test (where all keys matched except runname) + ;; was WAIVED if this test is FAIL + + ;; NOTES: + ;; 1. Is the call to test:get-previous-run-record remotified? + ;; 2. Add test for testconfig waiver propagation control here + ;; + (prev-test (if (equal? status "FAIL") + (rmt:get-previous-test-run-record run-id test-name item-path) + #f)) + (waived (if prev-test + (if prev-test ;; true if we found a previous test in this run series + (let ((prev-status (db:test-get-status prev-test)) + (prev-state (db:test-get-state prev-test)) + (prev-comment (db:test-get-comment prev-test))) + (debug:print 4 *default-log-port* "prev-status " prev-status ", prev-state " prev-state ", prev-comment " prev-comment) + (if (and (equal? prev-state "COMPLETED") + (equal? prev-status "WAIVED")) + (if comment + comment + prev-comment) ;; waived is either the comment or #f + #f)) + #f) + #f))) + (if (and waived + (tests:check-waiver-eligibility testdat prev-test)) + (set! real-status "WAIVED")) + + (debug:print 4 *default-log-port* "real-status " real-status ", waived " waived ", status " status) + + ;; update the primary record IF state AND status are defined + (if (and state status) + (begin + (rmt:set-state-status-and-roll-up-items run-id test-id item-path state real-status (if waived waived comment)) + ;; (mt:process-triggers run-id test-id state real-status) ;; triggers are called in test-set-state-status + )) + + ;; if status is "AUTO" then call rollup (note, this one modifies data in test + ;; run area, it does remote calls under the hood. + ;; (if (and test-id state status (equal? status "AUTO")) + ;; (rmt:test-data-rollup run-id test-id status)) + + ;; add metadata (need to do this way to avoid SQL injection issues) + + ;; :first_err + ;; (let ((val (hash-table-ref/default otherdat ":first_err" #f))) + ;; (if val + ;; (sqlite3:execute db "UPDATE tests SET first_err=? WHERE run_id=? AND testname=? AND item_path=?;" val run-id test-name item-path))) + ;; + ;; ;; :first_warn + ;; (let ((val (hash-table-ref/default otherdat ":first_warn" #f))) + ;; (if val + ;; (sqlite3:execute db "UPDATE tests SET first_warn=? WHERE run_id=? AND testname=? AND item_path=?;" val run-id test-name item-path))) + + (let ((category (hash-table-ref/default otherdat ":category" "")) + (variable (hash-table-ref/default otherdat ":variable" "")) + (value (hash-table-ref/default otherdat ":value" #f)) + (expected (hash-table-ref/default otherdat ":expected" "n/a")) + (tol (hash-table-ref/default otherdat ":tol" "n/a")) + (units (hash-table-ref/default otherdat ":units" "")) + (type (hash-table-ref/default otherdat ":type" "")) + (dcomment (hash-table-ref/default otherdat ":comment" ""))) + (debug:print 4 *default-log-port* + "category: " category ", variable: " variable ", value: " value + ", expected: " expected ", tol: " tol ", units: " units) + (if (and value) ;; require only value; BB was- all three required + (let ((dat (conc category "," + variable "," + value "," + expected "," + tol "," + units "," + dcomment ",," ;; extra comma for status + type ))) + ;; This was run remote, don't think that makes sense. Perhaps not, but that is the easiest path for the moment. + (rmt:csv->test-data run-id test-id + dat) + ;; This was added in check-in a5adfa3f9a. Message was: "...added delay in set-values to allow for delayed write on server start" + ;; I'm inserting an arbitrary rmt: call to force/ensure that the server is available to (hopefully) prevent a communication issue. + (rmt:get-var "MEGATEST_VERSION") ;; this does NOTHING but ensure the server is reachable. This is almost certainly NOT needed :) + ;; BB - commentiong out arbitrary 10 second wait (thread-sleep! 10) ;; add 10 second delay before quit incase rmt needs time to start a server. + ))) + + ;; need to update the top test record if PASS or FAIL and this is a subtest + ;;;;;; (if (not (equal? item-path "")) + ;;;;;; (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status #f) ;;;;;) + + (if (or (and (string? comment) + (string-match (regexp "\\S+") comment)) + waived) + (let ((cmt (if waived waived comment))) + (rmt:general-call 'set-test-comment run-id cmt test-id))))) + + )