Megatest: Diff

Differences From Artifact [9881087e2c]:

File launch.scm — part of check-in [87b6d8cf0e] at 2022-11-13 04:53:25 on branch v1.70-nohomehost — wip (user: matt, size: 88945) [annotate] [blame] [check-ins using] [more...]

To Artifact [9d838959e9]:

File launch.scm — part of check-in [8ff6166610] at 2023-06-27 09:08:53 on branch v1.80 — Fixed quote in path issue (user: mrwellan, size: 90134) [annotate] [blame] [check-ins using]

︙			︙
16 17 18 19 20 21 22 23 24 25 26 27 ~~28 29~~ ~~30 31 32 33 34 35 36~~ 37 38 39 40 41 42 43 44 45 46 47 48	;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;;====================================================================== ;; launch a task - this runs on the originating host, tests themselves ;; ;;====================================================================== (use regex regex-case base64 sqlite3 srfi-18 directory-utils posix-extras z3 call-with-environment-variables csv) (use typed-records pathname-expand matchable) ~~(import (prefix base64 base64:)) ~~(import~~ (prefix sqlite3 sqlite3:))~~ ~~(declare (unit launch))~~ ~~(declare (uses subrun))~~ ~~(declare (uses common))~~ ~~(declare (uses configf))~~ ~~(declare (uses db))~~ ~~(declare (uses ezsteps))~~ (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") (include "megatest-fossil-hash.scm") ;;====================================================================== ;; ezsteps ;;====================================================================== ;; ezsteps were going to be coded as ;; stepname[,predstep1,predstep2 ...] [{VAR1=first,second,third}] command to execute	> > > > > > > > > > > > > \| \| > \| < < < < < < > > > > > >	16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62	;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;;====================================================================== ;; launch a task - this runs on the originating host, tests themselves ;; ;;====================================================================== (declare (unit launch)) (declare (uses subrun)) (declare (uses common)) (declare (uses debugprint)) (declare (uses commonmod)) (declare (uses configf)) (declare (uses db)) (declare (uses rmtmod)) (declare (uses ezsteps)) ;; (declare (uses dbmod)) (declare (uses dbfile)) (declare (uses mtargs)) (use regex regex-case base64 sqlite3 srfi-18 directory-utils posix-extras z3 call-with-environment-variables csv) (use typed-records pathname-expand matchable) (import (prefix base64 base64:) (prefix sqlite3 sqlite3:) (prefix mtargs args:) ) (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") (include "megatest-fossil-hash.scm") (import commonmod rmtmod debugprint ;; dbmod dbfile) ;;====================================================================== ;; ezsteps ;;====================================================================== ;; ezsteps were going to be coded as ;; stepname[,predstep1,predstep2 ...] [{VAR1=first,second,third}] command to execute
︙			︙
181 182 183 184 185 186 187 ~~188~~ 189 190 191 192 193 194 195	(setenv "MT_STEP_NAMES" (string-intersperse all-step-names " ")) (let loop ((ezstep (car ezstepslst)) (tal (cdr ezstepslst)) (prevstep #f)) (debug:print-info 0 default-log-port "Processing ezstep \"" (string-intersperse ezstep " ") "\"") ;; check exit-info (vector-ref exit-info 1) (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) ~~(let* ((logpro-used (~~launch~~:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat))~~ (stepname (car ezstep)) (stepparms (hash-table-ref all-steps-dat stepname))) (setenv "MT_STEP_NAME" stepname) (pp (hash-table->alist all-steps-dat)) ;; if logpro-used read in the stepname.dat file (if (and logpro-used (common:file-exists? (conc stepname ".dat"))) (launch:load-logpro-dat run-id test-id stepname))	\|	195 196 197 198 199 200 201 202 203 204 205 206 207 208 209	(setenv "MT_STEP_NAMES" (string-intersperse all-step-names " ")) (let loop ((ezstep (car ezstepslst)) (tal (cdr ezstepslst)) (prevstep #f)) (debug:print-info 0 default-log-port "Processing ezstep \"" (string-intersperse ezstep " ") "\"") ;; check exit-info (vector-ref exit-info 1) (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) (let* ((logpro-used (ezsteps:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat)) (stepname (car ezstep)) (stepparms (hash-table-ref all-steps-dat stepname))) (setenv "MT_STEP_NAME" stepname) (pp (hash-table->alist all-steps-dat)) ;; if logpro-used read in the stepname.dat file (if (and logpro-used (common:file-exists? (conc stepname ".dat"))) (launch:load-logpro-dat run-id test-id stepname))
︙			︙
203 204 205 206 207 208 209 ~~210~~ 211 212 213 214 215 216 217	(debug:print 0 default-log-port "WARNING: a prior step failed, stopping at " ezstep))) (file-close status-file) ) )))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) ~~(let* ((update-period (string->number (or (configf:lookup configdat "setup" "test-stats-update-period") "30")))~~ (start-seconds (current-seconds)) (calc-minutes (lambda () (inexact->exact (round (- (current-seconds) start-seconds)))))	\|	217 218 219 220 221 222 223 224 225 226 227 228 229 230 231	(debug:print 0 default-log-port "WARNING: a prior step failed, stopping at " ezstep))) (file-close status-file) ) )))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) (let* ((update-period (string->number (or (configf:lookup configdat "setup" "test-stats-update-period") "60"))) (start-seconds (current-seconds)) (calc-minutes (lambda () (inexact->exact (round (- (current-seconds) start-seconds)))))
︙			︙
237 238 239 240 241 242 243 ~~244 245 246~~ 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 ~~264~~ 265 266 267 268 269 270 271	(delta (abs (- df disk-free)))) (if (and (> df 0) (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) (do-sync (or new-cpu-load new-disk-free over-time)) ~~(test-info (rmt:get-test-~~info~~-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info))~~ (kill-reason "no kill reason specified") (kill-job? #f)) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 default-log-port "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) ~~(launch:handle-zombie-tests run-id)~~ (when do-sync ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))) )	\| \| \| > \|	251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286	(delta (abs (- df disk-free)))) (if (and (> df 0) (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-state-status-by-id run-id test-id)) (state (car test-info));; (db:test-get-state test-info)) (status (cdr test-info));; (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 default-log-port "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty (launch:handle-zombie-tests run-id)) (when do-sync ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))) )
︙			︙
312 313 314 315 316 317 318 ~~319~~ 320 321 322 323 324 325 326	(tests:test-set-status! run-id test-id "KILLED" "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt (begin (debug:print-error 0 default-log-port "Nothing to kill, pid1=" pid1 ", pid2=" pid2) (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. But run end of run before exiting? ~~(launch:end-of-run-check run-id)~~ (exit))) (if (hash-table-ref/default misc-flags 'keep-going #f) (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load)	\|	327 328 329 330 331 332 333 334 335 336 337 338 339 340 341	(tests:test-set-status! run-id test-id "KILLED" "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt (begin (debug:print-error 0 default-log-port "Nothing to kill, pid1=" pid1 ", pid2=" pid2) (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. But run end of run before exiting? (launch:end-of-run-check run-id) (exit))) (if (hash-table-ref/default misc-flags 'keep-going #f) (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load)
︙			︙
591 592 593 594 595 596 597 ~~598~~ 599 600 601 602 603 604 605	(list "MT_RUNNAME" runname) (list "MT_MEGATEST" megatest) (list "MT_TARGET" target) (list "MT_LINKTREE" (common:get-linktree)) ;; (configf:lookup configdat "setup" "linktree")) (list "MT_TESTSUITENAME" (common:get-testsuite-name)))) ;;(bb-check-path msg: "launch:execute post block 3") ~~(if mt-bindir-path (setenv "PATH" (conc ~~"\""(ge~~t~~env "PATH")~~":"mt-bindir-path~~"\""~~)))~~ ;;(bb-check-path msg: "launch:execute post block 4") ;; (change-directory top-path) ;; Can setup as client for server mode now ;; (client:setup) ;; environment overrides are done before the remaining critical envars.	> > > \|	606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623	(list "MT_RUNNAME" runname) (list "MT_MEGATEST" megatest) (list "MT_TARGET" target) (list "MT_LINKTREE" (common:get-linktree)) ;; (configf:lookup configdat "setup" "linktree")) (list "MT_TESTSUITENAME" (common:get-testsuite-name)))) ;;(bb-check-path msg: "launch:execute post block 3") (let ((tmppath (getenv "PATH"))) (if (string-search tmppath " ") (debug:print 0 default-log-port "WARNING: spaces in PATH are not supported.")) (if mt-bindir-path (setenv "PATH" (conc tmppath":"mt-bindir-path)))) ;;(bb-check-path msg: "launch:execute post block 4") ;; (change-directory top-path) ;; Can setup as client for server mode now ;; (client:setup) ;; environment overrides are done before the remaining critical envars.
︙			︙
724 725 726 727 728 729 730 ~~731~~ 732 733 734 735 736 737 738	;; NO NEED TO CALL set-state-status-and-roll-up-items HERE, THIS IS DONE IN set-state-status-and-roll-up-items called by tests:test-set-status! ) ) ;; for automated creation of the rollup html file this is a good place... (if (not (equal? item-path "")) ~~(tests:summarize-items run-id test-id test-name #f))~~ (tests:summarize-test run-id test-id) ;; don't force - just update if no ;; Leave a .final-status file for the top level test (tests:save-final-status run-id test-id) (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) ;; end of let* (mutex-unlock! m) (launch:end-of-run-check run-id )	\| > >	742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758	;; NO NEED TO CALL set-state-status-and-roll-up-items HERE, THIS IS DONE IN set-state-status-and-roll-up-items called by tests:test-set-status! ) ) ;; for automated creation of the rollup html file this is a good place... (if (not (equal? item-path "")) (tests:summarize-items run-id test-id test-name #f)) ;; BUG was this meant to be the antecnt of the if above? ;; BUG was this meant to be the antecnt of the if above? (tests:summarize-test run-id test-id) ;; don't force - just update if no ;; Leave a .final-status file for the top level test (tests:save-final-status run-id test-id) (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) ;; end of let* (mutex-unlock! m) (launch:end-of-run-check run-id )
︙			︙
763 764 765 766 767 768 769 ~~770~~ 771 772 ~~773~~ 774 ~~775 776 777 778~~ ~~779~~ 780 781 782 783 784 785 786	;; if dead safe to mark the test as killed in the db ;; State/status table ;; new ;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup ;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED ;; 0 RUNNING ==> this is actually the first condition, should not get here (define (launch:end-of-run-check run-id ) (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id)) ~~(running-cnt (rmt:get-count-tests-running-for-run-id run-id))~~ (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id))) (current-state (rmt:get-run-state run-id)) (current-status (rmt:get-run-status run-id))) ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing (debug:print 0 default-log-port "Running test cnt :" running-cnt) ~~(rmt:set-state-status-and-roll-up-run run-id current-state current-status)~~ (runs:update-junit-test-reporter-xml run-id) (cond ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" )) (if (and (equal? (rmt:get-var (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id))) (begin (debug:print 4 default-log-port "look for post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var (conc "end-of-run-" run-id))) (debug:print 0 default-log-port "End of Run Detected.")	\| \| > \| \| \| \| > > > > > > \| >	783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814	;; if dead safe to mark the test as killed in the db ;; State/status table ;; new ;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup ;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED ;; 0 RUNNING ==> this is actually the first condition, should not get here (define last-rollup 0) (define (launch:end-of-run-check run-id ) (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id)) (running-cnt (rmt:get-count-tests-running-for-run-id run-id)) (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id))) (current-state-status (rmt:get-run-state-status run-id)) (current-state (car current-state-status)) ;; (rmt:get-run-state run-id)) (current-status (cdr current-state-status))) ;; (rmt:get-run-status run-id))) ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing (debug:print 0 default-log-port "Running test cnt :" running-cnt) ;; ;; TODO: add a final rollup when run is done (if there isn't one already) ;; (if (or (< running-cnt 3) ;; have only few running (> (- (current-seconds) last-rollup) 10)) ;; or haven't rolled up in past ten seconds (begin (rmt:set-state-status-and-roll-up-run run-id current-state current-status) (set! last-rollup (current-seconds)))) (runs:update-junit-test-reporter-xml run-id) (cond ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" )) (if (and (equal? (rmt:get-var (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id))) (begin (debug:print 4 default-log-port "look for post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var (conc "end-of-run-" run-id))) (debug:print 0 default-log-port "End of Run Detected.")
︙			︙
804 805 806 807 808 809 810 ~~811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842~~ 843 844 845 846 847 848 849 850 851 852 853 854 855 856 ~~857~~ 858 859 860 861 862 863 864	(tal (cdr not-completed-tests))) (let* ((test-name (vector-ref running-test 2)) (item-path (vector-ref running-test 11))) (debug:print 0 default-log-port "test " test-name "/" item-path " not completed") (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) ~~;; replaced below with version that does not ssh if checking on localhost~~ ~~#;(define (launch:is-test-alive host pid)~~ ~~(if (and host pid (not (equal? host "n/a")))~~ ~~(let* ((cmd (conc "ssh " host " pstree -A " pid))~~ ~~(output (with-input-from-pipe cmd read-lines)))~~ ~~(debug:print 2 default-log-port "Running " cmd " received " output)~~ ~~(if (eq? (length output) 0)~~ #f ~~#t))~~ ~~#t))~~ ~~;; this is a close duplicate of:~~ ~~;; process:alist-on-host?~~ ~~;; process:alive~~ ;; ~~(define (launch:is-test-alive host pid)~~ ~~(let* ((same-host (equal? host (get-host-name)))~~ ~~(cmd (conc~~ ~~(if same-host "" (conc "ssh "host" "))~~ ~~"pstree -A "pid)))~~ ~~(if (and host pid~~ ~~(not (equal? host "n/a")))~~ ~~(let* ((output (if same-host~~ ~~(with-input-from-pipe cmd read-lines)~~ ~~(common:generic-ssh cmd read-lines '())))) ;; (with-input-from-pipe cmd read-lines)))~~ ~~(debug:print 2 default-log-port "Running " cmd " received " output)~~ ~~(if (eq? (length output) 0)~~ #f ~~#t))~~ ~~#t))) ;; assuming bad query is about a live test is likely not the right thing to do?~~ (define (launch:kill-tests-if-dead run-id) (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f))) (let loop ((running-test (car running-tests)) (tal (cdr running-tests)) (kill-cnt 0)) (let* ((test-name (vector-ref running-test 2)) (item-path (vector-ref running-test 11)) (test-id (vector-ref running-test 0)) (host (vector-ref running-test 6)) (pid (rmt:test-get-top-process-pid run-id test-id)) (event-time (vector-ref running-test 5)) (duration (vector-ref running-test 12)) (flag 0) (curr-time (current-seconds))) (if (and (< (+ event-time duration 600) curr-time) (not (~~lau~~nch:is-test-alive host pid))) ;;test has not updated duration in last 10 min then likely its not running but confirm before marking it as killed (begin (debug:print 0 default-log-port "test " test-name "/" item-path " needs to be killed") (set! flag 1) (rmt:set-state-status-and-roll-up-items run-id test-name item-path "KILLREQ" "n/a" #f))) (if (not (null? tal)) (loop (car tal) (cdr tal) (+ kill-cnt flag)) (+ kill-cnt flag))))))	< < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < \|	832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860	(tal (cdr not-completed-tests))) (let* ((test-name (vector-ref running-test 2)) (item-path (vector-ref running-test 11))) (debug:print 0 default-log-port "test " test-name "/" item-path " not completed") (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) (define (launch:kill-tests-if-dead run-id) (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f))) (let loop ((running-test (car running-tests)) (tal (cdr running-tests)) (kill-cnt 0)) (let* ((test-name (vector-ref running-test 2)) (item-path (vector-ref running-test 11)) (test-id (vector-ref running-test 0)) (host (vector-ref running-test 6)) (pid (rmt:test-get-top-process-pid run-id test-id)) (event-time (vector-ref running-test 5)) (duration (vector-ref running-test 12)) (flag 0) (curr-time (current-seconds))) (if (and (< (+ event-time duration 600) curr-time) (not (commonmod:is-test-alive host pid))) ;;test has not updated duration in last 10 min then likely its not running but confirm before marking it as killed (begin (debug:print 0 default-log-port "test " test-name "/" item-path " needs to be killed") (set! flag 1) (rmt:set-state-status-and-roll-up-items run-id test-name item-path "KILLREQ" "n/a" #f))) (if (not (null? tal)) (loop (car tal) (cdr tal) (+ kill-cnt flag)) (+ kill-cnt flag))))))
︙			︙
1129 1130 1131 1132 1133 1134 1135 ~~1136~~ 1137 1138 1139 1140 1141 1142 1143	(begin (setenv "MT_RUN_AREA_HOME" toppath) (setenv "MT_TESTSUITENAME" (common:get-testsuite-name))) (begin (debug:print-error 0 default-log-port "failed to find the top path to your Megatest area.") (set! toppath #f) ;; force it to be false so we return #f #f)) ;; one more attempt to cache the configs for future reading (let* ((cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) (mtcachef (car cachefiles)) (rccachef (cdr cachefiles))) ;; trap exception due to stale NFS handle -- Error: (open-output-file) cannot open file - Stale NFS file handle: "...somepath.../.runconfigs.cfg-1.6427-7d1e789cb3f62f9cde719a4865bb51b3c17ea853" - ticket 220546342 ;; TODO - consider 1) using simple-lock to bracket cache write	\| > > >	1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142	(begin (setenv "MT_RUN_AREA_HOME" toppath) (setenv "MT_TESTSUITENAME" (common:get-testsuite-name))) (begin (debug:print-error 0 default-log-port "failed to find the top path to your Megatest area.") (set! toppath #f) ;; force it to be false so we return #f #f)) ;; needed by various transport and db modules (dbfile:testsuite-name (common:get-testsuite-name)) ;; (get-testsuite-name toppath configdat)) ;; one more attempt to cache the configs for future reading (let* ((cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) (mtcachef (car cachefiles)) (rccachef (cdr cachefiles))) ;; trap exception due to stale NFS handle -- Error: (open-output-file) cannot open file - Stale NFS file handle: "...somepath.../.runconfigs.cfg-1.6427-7d1e789cb3f62f9cde719a4865bb51b3c17ea853" - ticket 220546342 ;; TODO - consider 1) using simple-lock to bracket cache write
︙			︙
1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171	(set! configstatus 'fulldata))) ;; if have -append-config then read and append here (let ((cfname (args:get-arg "-append-config"))) (if (and cfname (file-read-access? cfname)) (read-config cfname configdat #t))) ;; values are added to the hash, no need to do anything special. toppath))) (define (get-best-disk confdat testconfig) (let* ((disks (or (and testconfig (hash-table-ref/default testconfig "disks" #f)) (hash-table-ref/default confdat "disks" #f))) (minspace (let ((m (configf:lookup confdat "setup" "minspace")))	> > > > > > > > > > > > > > > > > > >	1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189	(set! configstatus 'fulldata))) ;; if have -append-config then read and append here (let ((cfname (args:get-arg "-append-config"))) (if (and cfname (file-read-access? cfname)) (read-config cfname configdat #t))) ;; values are added to the hash, no need to do anything special. ;; have config at this time, this is a good place to set params based on config file settings (let* ((dbmode (configf:lookup configdat "setup" "dbcache-mode")) (syncmode (configf:lookup configdat "setup" "sync-mode")) (srvdebug (configf:lookup configdat "server" "debug-parameter"))) (if dbmode (begin (debug:print-info 0 default-log-port "Overriding dbmode to "dbmode) (dbcache-mode (string->symbol dbmode)))) (if syncmode (begin (debug:print-info 0 default-log-port "Overriding syncmode to "syncmode) (dbfile:sync-method (string->symbol syncmode)))) (if srvdebug (begin (debug:print-info 0 default-log-port "Overriding server debug parameter to "srvdebug) (tt-server-profile-string srvdebug))) ) toppath))) (define (get-best-disk confdat testconfig) (let* ((disks (or (and testconfig (hash-table-ref/default testconfig "disks" #f)) (hash-table-ref/default confdat "disks" #f))) (minspace (let ((m (configf:lookup confdat "setup" "minspace")))
︙			︙
1364 1365 1366 1367 1368 1369 1370 ~~1371 1372 1373~~ 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 ~~1385 1386 1387 1388~~ 1389 1390 1391 1392 1393 1394 1395	;; (rmt:general-call 'test-set-rundir run-id lnkpath testname "") ;; toptest-path) (if (or (not curr-test-path) (not (directory-exists? toptest-path))) (begin (debug:print-info 2 default-log-port "Creating " toptest-path " and link " lnkpath) (handle-exceptions exn ~~(begin (debug:print 0 default-log-port "failed to create directory " toptest-path ", exn=" exn) #f)~~ (create-directory toptest-path #t)) (hash-table-set! toptest-paths testname toptest-path))))) ;; The toptest path has been created, the link to the test in the linktree has ;; been created. Now, if this is an iterated test the real test dir must be created (if (not not-iterated) ;; this is an iterated test (begin ;; (let ((lnktarget (conc lnkpath "/" item-path))) (debug:print 2 default-log-port "Setting up sub test run area") (debug:print 2 default-log-port " - creating run area in " test-path) (handle-exceptions exn ~~(begin (debug:print-error 0 default-log-port " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ", exiting, exn=" exn) (exit 1))~~ (create-directory test-path #t)) (debug:print 2 default-log-port " - creating link from: " test-path "\n" " to: " lnktarget) ;; If there is already a symlink delete it and recreate it. (handle-exceptions	> > \| \| \| > > \| \| \| \|	1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417	;; (rmt:general-call 'test-set-rundir run-id lnkpath testname "") ;; toptest-path) (if (or (not curr-test-path) (not (directory-exists? toptest-path))) (begin (debug:print-info 2 default-log-port "Creating " toptest-path " and link " lnkpath) (handle-exceptions exn (if (directory-exists? toptest-path) ;; it was likely created in parallel #t (begin (debug:print-info 0 default-log-port "failed to create directory " toptest-path ", exn=" exn) #f)) (create-directory toptest-path #t)) (hash-table-set! toptest-paths testname toptest-path))))) ;; The toptest path has been created, the link to the test in the linktree has ;; been created. Now, if this is an iterated test the real test dir must be created (if (not not-iterated) ;; this is an iterated test (begin ;; (let ((lnktarget (conc lnkpath "/" item-path))) (debug:print 2 default-log-port "Setting up sub test run area") (debug:print 2 default-log-port " - creating run area in " test-path) (handle-exceptions exn (if (directory-exists? test-path) #t (begin (debug:print-error 0 default-log-port " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ", exiting, exn=" exn) (exit 1))) (create-directory test-path #t)) (debug:print 2 default-log-port " - creating link from: " test-path "\n" " to: " lnktarget) ;; If there is already a symlink delete it and recreate it. (handle-exceptions
︙			︙
1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453	;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (mutex-lock! launch-setup-mutex) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* ( ;; (lock-key (conc "test-" test-id)) ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds ;; (if (car lock) ;; #t ;; (if (> (current-seconds) expire-time)	>	1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476	;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (assert runname "FATAL: launch-test called with no runname") (mutex-lock! launch-setup-mutex) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex (let* ( ;; (lock-key (conc "test-" test-id)) ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds ;; (if (car lock) ;; #t ;; (if (> (current-seconds) expire-time)
︙			︙
1546 1547 1548 1549 1550 1551 1552 ~~1553~~ 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 ~~1572~~ 1573 1574 1575 1576 1577 1578 1579	;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f) ;; (pp (hash-table->alist tconfig)) (set! diskpath (get-best-disk configdat tconfig)) (debug:print 2 default-log-port "best disk path = " diskpath) (if diskpath ~~(let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat)))~~ (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) (debug:print-info 2 default-log-port "Using work area " work-area)) (begin (set! work-area (conc test-path "/tmp_run")) (create-directory work-area #t) (debug:print 0 default-log-port "WARNING: No disk work area specified - running in the test directory under tmp_run"))) (set! cmdparms (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) ;; (list 'transport (conc transport-type)) ;; (list 'serverinf server-info) #;(list 'homehost (let* ((hhdat (server:get-homehost))) (if hhdat (car hhdat) #f))) (list 'serverurl (if runremote (remote-server-url runremote) #f)) ;; (list 'areaname (common:get-testsuite-name)) (list 'toppath toppath) (list 'work-area work-area) (list 'test-name test-name) (list 'runscript runscript)	\| \|	1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602	;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f) ;; (pp (hash-table->alist tconfig)) (set! diskpath (get-best-disk configdat tconfig)) (debug:print 2 default-log-port "best disk path = " diskpath) (if diskpath (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) (debug:print-info 2 default-log-port "Using work area " work-area)) (begin (set! work-area (conc test-path "/tmp_run")) (create-directory work-area #t) (debug:print 0 default-log-port "WARNING: No disk work area specified - running in the test directory under tmp_run"))) (set! cmdparms (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) ;; (list 'transport (conc transport-type)) ;; (list 'serverinf server-info) #;(list 'homehost (let* ((hhdat (server:get-homehost))) (if hhdat (car hhdat) #f))) #;(list 'serverurl (if runremote ;; would like to add this back ... WORK NEEDED (remote-server-url runremote) #f)) ;; (list 'areaname (common:get-testsuite-name)) (list 'toppath toppath) (list 'work-area work-area) (list 'test-name test-name) (list 'runscript runscript)
︙			︙