@@ -168,34 +168,41 @@ (append (or ezstepslst '()) (list (list "subrun" (conc "{subrun=true} " mt-cmd))))))) ;; process the ezsteps (if ezsteps - (let* ((all-steps-dat (make-hash-table))) ;; keep all the info around as stepname ==> alist; where 'params is the params list (add other stuff as needed) + (let* ((envdbf (conc "/tmp/."(current-user-name)"-"(current-process-id)"-"run-id"-"test-id".db")) + (all-steps-dat (make-hash-table))) ;; keep all the info around as stepname ==> alist; + ;;; where 'params is the params list (add other + ;;; stuff as needed) (if (not (common:file-exists? ".ezsteps"))(create-directory ".ezsteps")) ;; if ezsteps was defined then we are sure to have at least one step but check anyway (if (not (> (length ezstepslst) 0)) (debug:print-error 0 *default-log-port* "ezsteps defined but ezstepslst is zero length") - (let loop ((ezstep (car ezstepslst)) - (tal (cdr ezstepslst)) - (prevstep #f)) - (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"") - ;; check exit-info (vector-ref exit-info 1) - (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) - (let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat)) - (stepname (car ezstep)) - (stepparms (hash-table-ref all-steps-dat stepname))) - (setenv "MT_STEP_NAME" stepname) - (pp (hash-table->alist all-steps-dat)) - ;; if logpro-used read in the stepname.dat file - (if (and logpro-used (common:file-exists? (conc stepname ".dat"))) - (launch:load-logpro-dat run-id test-id stepname)) - (if (steprun-good? logpro-used (launch:einf-exit-code exit-info) stepparms) - (if (not (null? tal)) - (loop (car tal) (cdr tal) stepname)) - (debug:print 0 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping"))) - (debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep))))))))) + (let ((all-step-names (map car ezstepslst))) + (setenv "MT_STEP_NAMES" (string-intersperse all-step-names " ")) + (let loop ((ezstep (car ezstepslst)) + (tal (cdr ezstepslst)) + (prevstep #f)) + (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"") + ;; check exit-info (vector-ref exit-info 1) + (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) + (let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m + tal testconfig all-steps-dat prevstep envdbf)) + (stepname (car ezstep)) + (stepparms (hash-table-ref all-steps-dat stepname))) + (setenv "MT_STEP_NAME" stepname) + (pp (hash-table->alist all-steps-dat)) + ;; if logpro-used read in the stepname.dat file + (if (and logpro-used (common:file-exists? (conc stepname ".dat"))) + (launch:load-logpro-dat run-id test-id stepname)) + (if (steprun-good? logpro-used (launch:einf-exit-code exit-info) stepparms) + (if (not (null? tal)) + (loop (car tal) (cdr tal) stepname)) + (debug:print 0 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping"))) + (debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)) + )))))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30"))) (start-seconds (current-seconds)) (calc-minutes (lambda () @@ -205,17 +212,19 @@ (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) - (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) + (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) - (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) + ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - + ;; top of loop encountered at "(current-seconds)" with + ;; last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load @@ -233,33 +242,28 @@ (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) - (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) + #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) - (when do-sync - ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) - ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) - (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) - (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))) - + (if do-sync ;; save meta data about the running of this test + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this @@ -312,11 +316,11 @@ (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) - (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional + (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) @@ -398,11 +402,10 @@ ;; one more time, change to the work-area directory (change-directory work-area))) ) ;; let* (if contour (setenv "MT_CONTOUR" contour)) - ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ... ;; (setenv "MT_TESTSUITENAME" areaname) (setenv "MT_RUN_AREA_HOME" top-path) (set! *toppath* top-path) @@ -465,10 +468,13 @@ (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. + ;;((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun + ;; (and (equal? (db:test-get-state test-info) "COMPLETED") ;; completed/abort => rerun if asked + ;; (member (db:test-get-status test-info) '("ABORT")))) ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) @@ -731,11 +737,11 @@ ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED ;; 0 RUNNING ==> this is actually the first condition, should not get here (define (launch:end-of-run-check run-id ) (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id)) - (running-cnt (rmt:get-count-tests-running-for-run-id run-id #f)) ;; fastmode=no + (running-cnt (rmt:get-count-tests-running-for-run-id run-id)) (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id))) (current-state (rmt:get-run-state run-id)) (current-status (rmt:get-run-status run-id))) ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)