Megatest: Diff

Differences From Artifact [13e6c119c2]:

File launch.scm — part of check-in [647ef89272] at 2017-01-08 23:21:48 on branch v1.63-stackdumpfix — Fixed REMOTE instead of RUNNING status (user: matt, size: 65035) [annotate] [blame] [check-ins using] [more...]

To Artifact [1ca55ce1cc]:

File launch.scm — part of check-in [fe6415e075] at 2017-02-05 23:30:48 on branch area-specific — Attempt to make rmt: calls area specific (user: matt, size: 65414) [annotate] [blame] [check-ins using]

︙			︙
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91	;; 0 1 2 3 (defstruct launch:einf (pid #t)(exit-status #t)(exit-code #t)(rollup-status 0)) ;; return (conc status ": " comment) from the final section so that ;; the comment can be set in the step record in launch.scm ;; ~~(define (launch:load-logpro-dat run-id test-id stepname)~~ (let ((cname (conc stepname ".dat"))) (if (file-exists? cname) (let* ((dat (read-config cname #f #f)) (csvr (db:logpro-dat->csv dat stepname)) (csvt (let-values (((fmt-cell fmt-record fmt-csv) (make-format ","))) (fmt-csv (map list->csv-record csvr)))) (status (configf:lookup dat "final" "exit-status")) (msg (configf:lookup dat "final" "message"))) (if csvt ;; this if blocked stack dump caused by .dat file from logpro being 0-byte. fixed by upgrading logpro ~~(rmt:csv->test-data run-id test-id csvt)~~ (debug:print 0 default-log-port "ERROR: no csvdat exists for run-id: " run-id " test-id: " test-id " stepname: " stepname ", check that logpro version is 1.15 or newer")) ;; (BB> "Error: run-id/test-id/stepname="run-id"/"test-id"/"stepname" => bad csvr="csvr) ;; ) (cond ((equal? status "PASS") "PASS") ;; skip the message part if status is pass (status (conc (configf:lookup dat "final" "exit-status") ": " (if msg msg "no message"))) (else #f))) #f))) ~~(define (launch:runstep ezstep run-id test-id exit-info m tal testconfig)~~ (let* ((stepname (car ezstep)) ;; do stuff to run the step (stepinfo (cadr ezstep)) (stepparts (string-match (regexp "^(\\{([^\\}])\\}\\s\|)(.*)$") stepinfo)) (stepparms (list-ref stepparts 2)) ;; for future use, {VAR=1,2,3}, run step for each (stepcmd (list-ref stepparts 3)) (script "") ; "#!/bin/bash\n") ;; yep, we depend on bin/bash FIXME!!!\ (logpro-file (conc stepname ".logpro"))	\| \| \|	57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91	;; 0 1 2 3 (defstruct launch:einf (pid #t)(exit-status #t)(exit-code #t)(rollup-status 0)) ;; return (conc status ": " comment) from the final section so that ;; the comment can be set in the step record in launch.scm ;; (define (launch:load-logpro-dat area-dat run-id test-id stepname) (let ((cname (conc stepname ".dat"))) (if (file-exists? cname) (let* ((dat (read-config cname #f #f)) (csvr (db:logpro-dat->csv dat stepname)) (csvt (let-values (((fmt-cell fmt-record fmt-csv) (make-format ","))) (fmt-csv (map list->csv-record csvr)))) (status (configf:lookup dat "final" "exit-status")) (msg (configf:lookup dat "final" "message"))) (if csvt ;; this if blocked stack dump caused by .dat file from logpro being 0-byte. fixed by upgrading logpro (rmt:csv->test-data area-dat run-id test-id csvt) (debug:print 0 default-log-port "ERROR: no csvdat exists for run-id: " run-id " test-id: " test-id " stepname: " stepname ", check that logpro version is 1.15 or newer")) ;; (BB> "Error: run-id/test-id/stepname="run-id"/"test-id"/"stepname" => bad csvr="csvr) ;; ) (cond ((equal? status "PASS") "PASS") ;; skip the message part if status is pass (status (conc (configf:lookup dat "final" "exit-status") ": " (if msg msg "no message"))) (else #f))) #f))) (define (launch:runstep area-dat ezstep run-id test-id exit-info m tal testconfig) (let* ((stepname (car ezstep)) ;; do stuff to run the step (stepinfo (cadr ezstep)) (stepparts (string-match (regexp "^(\\{([^\\}])\\}\\s\|)(.*)$") stepinfo)) (stepparms (list-ref stepparts 2)) ;; for future use, {VAR=1,2,3}, run step for each (stepcmd (list-ref stepparts 3)) (script "") ; "#!/bin/bash\n") ;; yep, we depend on bin/bash FIXME!!!\ (logpro-file (conc stepname ".logpro"))
︙			︙
114 115 116 117 118 119 120 ~~121~~ 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 ~~140~~ 141 142 143 144 145 146 147	;; (if (and prevstep (file-exists? prev-env)) ;; (set! script (conc script "source " prev-env)))) ;; call the command using mt_ezstep ;; (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "x") " " stepcmd)) (debug:print 4 default-log-port "script: " script) ~~(rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f)~~ ;; now launch the actual process (call-with-environment-variables (list (cons "PATH" (conc (get-environment-variable "PATH") ":."))) (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1") (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 (pid (process-run "/bin/bash" (list "-c" cmd)))) (with-output-to-file "Makefile.ezsteps" (lambda () (print stepname ".log :") (print "\t" cmd) (if (file-exists? (conc stepname ".logpro")) (print "\tlogpro " stepname ".logpro " stepname ".html < " stepname ".log")) (print) (print stepname " : " stepname ".log") (print)) #:append) ~~(rmt:test-set-top-process-pid run-id test-id pid)~~ (let processloop ((i 0)) (let-values (((pid-val exit-status exit-code)(process-wait pid #t))) (mutex-lock! m) (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code) (mutex-unlock! m)	\| \|	114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147	;; (if (and prevstep (file-exists? prev-env)) ;; (set! script (conc script "source " prev-env)))) ;; call the command using mt_ezstep ;; (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "x") " " stepcmd)) (debug:print 4 default-log-port "script: " script) (rmt:teststep-set-status! area-dat run-id test-id stepname "start" "-" #f #f) ;; now launch the actual process (call-with-environment-variables (list (cons "PATH" (conc (get-environment-variable "PATH") ":."))) (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1") (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 (pid (process-run "/bin/bash" (list "-c" cmd)))) (with-output-to-file "Makefile.ezsteps" (lambda () (print stepname ".log :") (print "\t" cmd) (if (file-exists? (conc stepname ".logpro")) (print "\tlogpro " stepname ".logpro " stepname ".html < " stepname ".log")) (print) (print stepname " : " stepname ".log") (print)) #:append) (rmt:test-set-top-process-pid area-dat run-id test-id pid) (let processloop ((i 0)) (let-values (((pid-val exit-status exit-code)(process-wait pid #t))) (mutex-lock! m) (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code) (mutex-unlock! m)
︙			︙
172 173 174 175 176 177 178 ~~179 180~~ 181 182 183 184 185 186 187	(logfna (if logpro-used (conc stepname ".html") "")) (comment #f)) (if logpro-used (let ((datfile (conc stepname ".dat"))) ;; load the .dat file into the test_data table if it exists (if (file-exists? datfile) (set! comment (launch:load-logpro-dat run-id test-id stepname))) ~~(rmt:test-set-log! run-id test-id (conc stepname ".html")))) (rmt:teststep-set-status! run-id test-id stepname "end" exinfo comment logfna))~~ ;; set the test final status (let* ((process-exit-status (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2)) (this-step-status (cond ((and (eq? process-exit-status 2) logpro-used) 'warn) ;; logpro 2 = warnings ((and (eq? process-exit-status 3) logpro-used) 'check) ;; logpro 3 = check ((and (eq? process-exit-status 4) logpro-used) 'waived) ;; logpro 4 = waived ((and (eq? process-exit-status 5) logpro-used) 'abort) ;; logpro 5 = abort	\| \|	172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187	(logfna (if logpro-used (conc stepname ".html") "")) (comment #f)) (if logpro-used (let ((datfile (conc stepname ".dat"))) ;; load the .dat file into the test_data table if it exists (if (file-exists? datfile) (set! comment (launch:load-logpro-dat run-id test-id stepname))) (rmt:test-set-log! area-dat run-id test-id (conc stepname ".html")))) (rmt:teststep-set-status! area-dat run-id test-id stepname "end" exinfo comment logfna)) ;; set the test final status (let* ((process-exit-status (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2)) (this-step-status (cond ((and (eq? process-exit-status 2) logpro-used) 'warn) ;; logpro 2 = warnings ((and (eq? process-exit-status 3) logpro-used) 'check) ;; logpro 3 = check ((and (eq? process-exit-status 4) logpro-used) 'waived) ;; logpro 4 = waived ((and (eq? process-exit-status 5) logpro-used) 'abort) ;; logpro 5 = abort
︙			︙
206 207 208 209 210 211 212 ~~213~~ 214 215 216 217 218 ~~219~~ 220 221 222 223 224 ~~225~~ 226 227 228 229 230 ~~231~~ 232 233 234 235 236 ~~237~~ 238 239 240 ~~241~~ 242 243 ~~244~~ 245 246 247 ~~248~~ 249 250 251 252 253 254 255 256 257 258 ~~259~~ 260 261 262 263 264 ~~265~~ 266 267 268 269 270 271 272	(debug:print 4 default-log-port "Exit value received: " (launch:einf-exit-code exit-info) " logpro-used: " logpro-used " this-step-status: " this-step-status " overall-status: " overall-status " next-status: " next-status " rollup-status: " (launch:einf-rollup-status exit-info)) ;; (vector-ref exit-info 3)) (case next-status ((warn) (launch:einf-rollup-status-set! exit-info 2) ;; (vector-set! exit-info 3 2) ;; rollup-status ;; NB// test-set-status! does rdb calls under the hood ~~(tests:test-set-status! run-id test-id next-state "WARN"~~ (if (eq? this-step-status 'warn) "Logpro warning found" #f) #f)) ((check) (launch:einf-rollup-status-set! exit-info 3) ;; (vector-set! exit-info 3 3) ;; rollup-status ;; NB// test-set-status! does rdb calls under the hood ~~(tests:test-set-status! run-id test-id next-state "CHECK"~~ (if (eq? this-step-status 'check) "Logpro check found" #f) #f)) ((waived) (launch:einf-rollup-status-set! exit-info 4) ;; (vector-set! exit-info 3 3) ;; rollup-status ;; NB// test-set-status! does rdb calls under the hood ~~(tests:test-set-status! run-id test-id next-state "WAIVED"~~ (if (eq? this-step-status 'check) "Logpro waived found" #f) #f)) ((abort) (launch:einf-rollup-status-set! exit-info 5) ;; (vector-set! exit-info 3 4) ;; rollup-status ;; NB// test-set-status! does rdb calls under the hood ~~(tests:test-set-status! run-id test-id next-state "ABORT"~~ (if (eq? this-step-status 'abort) "Logpro abort found" #f) #f)) ((skip) (launch:einf-rollup-status-set! exit-info 6) ;; (vector-set! exit-info 3 4) ;; rollup-status ;; NB// test-set-status! does rdb calls under the hood ~~(tests:test-set-status! run-id test-id next-state "SKIP"~~ (if (eq? this-step-status 'skip) "Logpro skip found" #f) #f)) ((pass) ~~(tests:test-set-status! run-id test-id next-state "PASS" #f #f))~~ (else ;; 'fail (launch:einf-rollup-status-set! exit-info 1) ;; (vector-set! exit-info 3 1) ;; force fail, this used to be next-state but that doesn't make sense. should always be "COMPLETED" ~~(tests:test-set-status! run-id test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f)~~ ))) logpro-used)) ~~(define (launch:manage-steps run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m)~~ ;; (let-values ;; (((pid exit-status exit-code) ;; (run-n-wait fullrunscript))) ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) ;; Since we should have a clean slate at this time there is no need to do ;; any of the other stuff that tests:test-set-status! does. Let's just ;; force RUNNING/n/a ;; (thread-sleep! 0.3) ;; (tests:test-force-state-status! run-id test-id "RUNNING" "n/a") ~~(rmt:set-state-status-and-roll-up-items run-id test-name item-path "RUNNING" #f #f)~~ ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here ;; if there is a runscript do it first (if fullrunscript (let ((pid (process-run fullrunscript))) ~~(rmt:test-set-top-process-pid run-id test-id pid)~~ (let loop ((i 0)) (let-values (((pid-val exit-status exit-code) (process-wait pid #t))) (mutex-lock! m) (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code)	\| \| \| \| \| \| \| \| \| \|	206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272	(debug:print 4 default-log-port "Exit value received: " (launch:einf-exit-code exit-info) " logpro-used: " logpro-used " this-step-status: " this-step-status " overall-status: " overall-status " next-status: " next-status " rollup-status: " (launch:einf-rollup-status exit-info)) ;; (vector-ref exit-info 3)) (case next-status ((warn) (launch:einf-rollup-status-set! exit-info 2) ;; (vector-set! exit-info 3 2) ;; rollup-status ;; NB// test-set-status! does rdb calls under the hood (tests:test-set-status! area-dat run-id test-id next-state "WARN" (if (eq? this-step-status 'warn) "Logpro warning found" #f) #f)) ((check) (launch:einf-rollup-status-set! exit-info 3) ;; (vector-set! exit-info 3 3) ;; rollup-status ;; NB// test-set-status! does rdb calls under the hood (tests:test-set-status! area-dat run-id test-id next-state "CHECK" (if (eq? this-step-status 'check) "Logpro check found" #f) #f)) ((waived) (launch:einf-rollup-status-set! exit-info 4) ;; (vector-set! exit-info 3 3) ;; rollup-status ;; NB// test-set-status! does rdb calls under the hood (tests:test-set-status! area-dat run-id test-id next-state "WAIVED" (if (eq? this-step-status 'check) "Logpro waived found" #f) #f)) ((abort) (launch:einf-rollup-status-set! exit-info 5) ;; (vector-set! exit-info 3 4) ;; rollup-status ;; NB// test-set-status! does rdb calls under the hood (tests:test-set-status! area-dat run-id test-id next-state "ABORT" (if (eq? this-step-status 'abort) "Logpro abort found" #f) #f)) ((skip) (launch:einf-rollup-status-set! exit-info 6) ;; (vector-set! exit-info 3 4) ;; rollup-status ;; NB// test-set-status! does rdb calls under the hood (tests:test-set-status! area-dat run-id test-id next-state "SKIP" (if (eq? this-step-status 'skip) "Logpro skip found" #f) #f)) ((pass) (tests:test-set-status! area-dat run-id test-id next-state "PASS" #f #f)) (else ;; 'fail (launch:einf-rollup-status-set! exit-info 1) ;; (vector-set! exit-info 3 1) ;; force fail, this used to be next-state but that doesn't make sense. should always be "COMPLETED" (tests:test-set-status! area-dat run-id test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f) ))) logpro-used)) (define (launch:manage-steps area-dat run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m) ;; (let-values ;; (((pid exit-status exit-code) ;; (run-n-wait fullrunscript))) ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) ;; Since we should have a clean slate at this time there is no need to do ;; any of the other stuff that tests:test-set-status! does. Let's just ;; force RUNNING/n/a ;; (thread-sleep! 0.3) ;; (tests:test-force-state-status! run-id test-id "RUNNING" "n/a") (rmt:set-state-status-and-roll-up-items area-dat run-id test-name item-path "RUNNING" #f #f) ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here ;; if there is a runscript do it first (if fullrunscript (let ((pid (process-run fullrunscript))) (rmt:test-set-top-process-pid area-dat run-id test-id pid) (let loop ((i 0)) (let-values (((pid-val exit-status exit-code) (process-wait pid #t))) (mutex-lock! m) (launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid) (launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status) (launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code)
︙			︙
314 315 316 317 318 319 320 ~~321~~ 322 323 324 325 326 327 328 329 330 331 ~~332~~ 333 334 335 336 337 338 339 340 341 342 343 344 345 ~~346~~ 347 348 349 350 351 352 353 ~~354~~ 355 356 357 358 359 360 361 ~~362~~ 363 364 365 366 367 368 369	(launch:load-logpro-dat run-id test-id stepname)) (if (steprun-good? logpro-used (launch:einf-exit-code exit-info)) (if (not (null? tal)) (loop (car tal) (cdr tal) stepname)) (debug:print 4 default-log-port "WARNING: step " (car ezstep) " failed. Stopping"))) (debug:print 4 default-log-port "WARNING: a prior step failed, stopping at " ezstep))))))) ~~(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)~~ (let* ((start-seconds (current-seconds)) (calc-minutes (lambda () (inexact->exact (round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) ~~(tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)~~ (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory)))) (let ((new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (get-df (current-directory))) (delta (abs (- df disk-free)))) (if (> delta 200) ;; ignore changes under 200 Meg df #f)))) ~~(set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat))~~ (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) (time-exceeded (> run-seconds runtlim))) (if time-exceeded (begin (debug:print-info 0 default-log-port "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim) #t) #f))))) ~~(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)~~ (if kill-job? (begin (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries? (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0)) ~~(pid2 (rmt:test-get-top-process-pid run-id test-id))~~ (pids (delete-duplicates (filter number? (list pid1 pid2))))) (if (not (null? pids)) (begin (for-each (lambda (pid) (handle-exceptions exn	\| \| \| \| \|	314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369	(launch:load-logpro-dat run-id test-id stepname)) (if (steprun-good? logpro-used (launch:einf-exit-code exit-info)) (if (not (null? tal)) (loop (car tal) (cdr tal) stepname)) (debug:print 4 default-log-port "WARNING: step " (car ezstep) " failed. Stopping"))) (debug:print 4 default-log-port "WARNING: a prior step failed, stopping at " ezstep))))))) (define (launch:monitor-job area-dat run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) (let* ((start-seconds (current-seconds)) (calc-minutes (lambda () (inexact->exact (round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f area-dat test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory)))) (let ((new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (get-df (current-directory))) (delta (abs (- df disk-free)))) (if (> delta 200) ;; ignore changes under 200 Meg df #f)))) (set! kill-job? (or (test-get-kill-request area-dat run-id test-id) ;; run-id test-name itemdat)) (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) (time-exceeded (> run-seconds runtlim))) (if time-exceeded (begin (debug:print-info 0 default-log-port "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim) #t) #f))))) (tests:update-central-meta-info area-dat run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) (if kill-job? (begin (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries? (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0)) (pid2 (rmt:test-get-top-process-pid area-dat run-id test-id)) (pids (delete-duplicates (filter number? (list pid1 pid2))))) (if (not (null? pids)) (begin (for-each (lambda (pid) (handle-exceptions exn
︙			︙
383 384 385 386 387 388 389 ~~390~~ 391 392 ~~393~~ 394 395 396 397 398 399 400 401 402 ~~403~~ 404 405 406 407 408 409 410	(handle-exceptions exn #f (process-signal pid-num signal/kill))) (process:get-sub-pids pid)))) ;; (debug:print-info 0 default-log-port "not killing process " pid " as it is not alive")))) pids) ~~(tests:test-set-status! run-id test-id "KILLED" "KILLED" (args:get-arg "-m") #f))~~ (begin (debug:print-error 0 default-log-port "Nothing to kill, pid1=" pid1 ", pid2=" pid2) ~~(tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (args:get-arg "-m") #f)~~ ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. (exit))) (if (hash-table-ref/default misc-flags 'keep-going #f) (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free))))))) ~~(tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional~~ (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) (if (list? cmdinfo) ;; ((testpath /tmp/mrwellan/jazzmind/src/example_run/tests/sqlitespeed) ;; (test-name sqlitespeed) (runscript runscript.rb) (db-host localhost) (run-id 1))	\| \| \|	383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410	(handle-exceptions exn #f (process-signal pid-num signal/kill))) (process:get-sub-pids pid)))) ;; (debug:print-info 0 default-log-port "not killing process " pid " as it is not alive")))) pids) (tests:test-set-status! area-dat run-id test-id "KILLED" "KILLED" (args:get-arg "-m") #f)) (begin (debug:print-error 0 default-log-port "Nothing to kill, pid1=" pid1 ", pid2=" pid2) (tests:test-set-status! area-dat run-id test-id "KILLED" "FAILED TO KILL" (args:get-arg "-m") #f) ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. (exit))) (if (hash-table-ref/default misc-flags 'keep-going #f) (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free))))))) (tests:update-central-meta-info area-dat run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) (if (list? cmdinfo) ;; ((testpath /tmp/mrwellan/jazzmind/src/example_run/tests/sqlitespeed) ;; (test-name sqlitespeed) (runscript runscript.rb) (db-host localhost) (run-id 1))
︙			︙
436 437 438 439 440 441 442 ~~443~~ 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 ~~463~~ 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 ~~481~~ 482 483 484 485 486 487 ~~488~~ 489 490 491 492 493 ~~494~~ 495 496 497 ~~498~~ 499 500 501 502 503 504 ~~505 506~~ 507 508 509 510 511 512 513	(if (substring-index "/" runscript) runscript ;; use unadultered if contains slashes (let ((fulln (conc testpath "/" runscript))) (if (and (file-exists? fulln) (file-execute-access? fulln)) fulln runscript))))) ;; assume it is on the path ~~) ;; (rollup-status 0)~~ ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) (if (or (file-exists? top-path) (> count 10)) (change-directory top-path) (begin (debug:print 0 default-log-port "INFO: Not starting job yet - directory " top-path " not found") (thread-sleep! 10) (loop (+ count 1))))) (launch:setup) ;; should be properly in the top-path now (set! tconfigreg (tests:get-all)) (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (if (eq? signum signal/stop) (debug:print-error 0 default-log-port "attempt to STOP process. Exiting.")) (set! time-to-exit #t) (print "Received signal " signum ", cleaning up before exit. Please wait...") (let ((th1 (make-thread (lambda () ~~(rmt:test-set-state-status run-id test-id "INCOMPLETE" "KILLED" #f)~~ (print "Killed by signal " signum ". Exiting") (thread-sleep! 1) (exit 1)))) (th2 (make-thread (lambda () (thread-sleep! 2) (debug:print 0 default-log-port "Done") (exit 4))))) (thread-start! th2) (thread-start! th1) (thread-join! th2))))) (set-signal-handler! signal/int sighand) (set-signal-handler! signal/term sighand) ) ;; (set-signal-handler! signal/stop sighand) ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART IMMEDIATELY ;; ~~(let* ((test-info (rmt:get-test-info-by-id run-id test-id))~~ (test-host (db:test-get-host test-info)) (test-pid (db:test-get-process_id test-info))) (cond ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "KILLREQ" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 default-log-port "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") ~~(rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)~~ ) ;; prime it for running ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART")) (if (process:alive-on-host? test-host test-pid) (debug:print-error 0 default-log-port "test state is " (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") ~~(rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)~~ )) ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") ~~(rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)~~ ) (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) (debug:print-error 0 default-log-port "test state is " (db:test-get-state test-info) ", cannot proceed") (exit)))) (debug:print 2 default-log-port "Exectuing " test-name " (id: " test-id ") on " (get-host-name)) ~~(set! keys (rmt:get-keys)) ~~;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process~~~~ ;; one of these is defunct/redundant ... (if (not (launch:setup force: #t)) (begin (debug:print 0 default-log-port "Failed to setup, exiting") ;; (sqlite3:finalize! db) ;; (sqlite3:finalize! tdb) (exit 1)))	\| \| \| \| \| \| \| <	436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512	(if (substring-index "/" runscript) runscript ;; use unadultered if contains slashes (let ((fulln (conc testpath "/" runscript))) (if (and (file-exists? fulln) (file-execute-access? fulln)) fulln runscript))))) ;; assume it is on the path (area-dat (make-remote))) ;; (rollup-status 0) ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) (if (or (file-exists? top-path) (> count 10)) (change-directory top-path) (begin (debug:print 0 default-log-port "INFO: Not starting job yet - directory " top-path " not found") (thread-sleep! 10) (loop (+ count 1))))) (launch:setup) ;; should be properly in the top-path now (set! tconfigreg (tests:get-all)) (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (if (eq? signum signal/stop) (debug:print-error 0 default-log-port "attempt to STOP process. Exiting.")) (set! time-to-exit #t) (print "Received signal " signum ", cleaning up before exit. Please wait...") (let ((th1 (make-thread (lambda () (rmt:test-set-state-status area-dat run-id test-id "INCOMPLETE" "KILLED" #f) (print "Killed by signal " signum ". Exiting") (thread-sleep! 1) (exit 1)))) (th2 (make-thread (lambda () (thread-sleep! 2) (debug:print 0 default-log-port "Done") (exit 4))))) (thread-start! th2) (thread-start! th1) (thread-join! th2))))) (set-signal-handler! signal/int sighand) (set-signal-handler! signal/term sighand) ) ;; (set-signal-handler! signal/stop sighand) ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART IMMEDIATELY ;; (let* ((test-info (rmt:get-test-info-by-id area-dat run-id test-id)) (test-host (db:test-get-host test-info)) (test-pid (db:test-get-process_id test-info))) (cond ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "KILLREQ" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 default-log-port "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:test-set-state-status area-dat run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) ;; prime it for running ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART")) (if (process:alive-on-host? test-host test-pid) (debug:print-error 0 default-log-port "test state is " (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:test-set-state-status area-dat run-id test-id "REMOTEHOSTSTART" "n/a" #f) )) ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:test-set-state-status area-dat run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) (debug:print-error 0 default-log-port "test state is " (db:test-get-state test-info) ", cannot proceed") (exit)))) (debug:print 2 default-log-port "Exectuing " test-name " (id: " test-id ") on " (get-host-name)) (set! keys (rmt:get-keys area-dat)) ;; one of these is defunct/redundant ... (if (not (launch:setup force: #t)) (begin (debug:print 0 default-log-port "Failed to setup, exiting") ;; (sqlite3:finalize! db) ;; (sqlite3:finalize! tdb) (exit 1)))
︙			︙
581 582 583 584 585 586 587 ~~588~~ 589 590 591 592 593 ~~594~~ 595 596 597 598 599 600 601	;; (change-directory top-path) ;; Can setup as client for server mode now ;; (client:setup) ;; environment overrides are done before the remaining critical envars. (alist->env-vars env-ovrd) ~~(runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals)~~ (set-item-env-vars itemdat) (save-environment-as-files "megatest") ;; open-run-close not needed for test-set-meta-info ;; (tests:set-full-meta-info #f test-id run-id 0 work-area) ;; (tests:set-full-meta-info test-id run-id 0 work-area) ~~(tests:set-full-meta-info #f test-id run-id 0 work-area 10)~~ ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here (if (args:get-arg "-xterm") (set! fullrunscript "xterm") (if (and fullrunscript (file-exists? fullrunscript)	\| \|	580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600	;; (change-directory top-path) ;; Can setup as client for server mode now ;; (client:setup) ;; environment overrides are done before the remaining critical envars. (alist->env-vars env-ovrd) (runs:set-megatest-env-vars area-dat run-id inkeys: keys inkeyvals: keyvals) (set-item-env-vars itemdat) (save-environment-as-files "megatest") ;; open-run-close not needed for test-set-meta-info ;; (tests:set-full-meta-info #f test-id run-id 0 work-area) ;; (tests:set-full-meta-info test-id run-id 0 work-area) (tests:set-full-meta-info #f area-dat test-id run-id 0 work-area 10) ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here (if (args:get-arg "-xterm") (set! fullrunscript "xterm") (if (and fullrunscript (file-exists? fullrunscript)
︙			︙
628 629 630 631 632 633 634 ~~635~~ 636 637 638 639 640 641 642	(debug:print-info 0 default-log-port "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...") (hash-table-set! misc-flags 'keep-going #f) (thread-join! th1) (thread-sleep! 1) ;; givbe thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. (mutex-lock! m) (let* ((item-path (item-list->path itemdat)) ;; only state and status needed - use lazy routine ~~(testinfo (rmt:get-testinfo-state-status run-id test-id)))~~ ;; Am I completed? (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should not be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED")) (let ((new-state (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status ;; "COMPLETED" ;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test ) (new-status (cond	\|	627 628 629 630 631 632 633 634 635 636 637 638 639 640 641	(debug:print-info 0 default-log-port "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...") (hash-table-set! misc-flags 'keep-going #f) (thread-join! th1) (thread-sleep! 1) ;; givbe thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. (mutex-lock! m) (let* ((item-path (item-list->path itemdat)) ;; only state and status needed - use lazy routine (testinfo (rmt:get-testinfo-state-status area-dat run-id test-id))) ;; Am I completed? (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should not be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED")) (let ((new-state (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status ;; "COMPLETED" ;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test ) (new-status (cond
︙			︙
650 651 652 653 654 655 656 ~~657~~ 658 659 660 661 662 663 664 665 666 ~~667 668 669~~ 670 671 672 673 674 675 676	(if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN")) ((eq? (launch:einf-rollup-status exit-info) 3) "CHECK") ((eq? (launch:einf-rollup-status exit-info) 4) "WAIVED") ((eq? (launch:einf-rollup-status exit-info) 5) "ABORT") ((eq? (launch:einf-rollup-status exit-info) 6) "SKIP") (else "FAIL")))) ;; (db:test-get-status testinfo))) (debug:print-info 1 default-log-port "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info)) ~~(tests:test-set-status! run-id~~ test-id new-state new-status (args:get-arg "-m") #f) ;; need to update the top test record if PASS or FAIL and this is a subtest ;; NO NEED TO CALL set-state-status-and-roll-up-items HERE, THIS IS DONE IN set-state-status-and-roll-up-items called by tests:test-set-status! )) ;; for automated creation of the rollup html file this is a good place... (if (not (equal? item-path "")) (tests:summarize-items run-id test-id test-name #f)) (tests:summarize-test run-id test-id) ;; don't force - just update if no (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) (mutex-unlock! m) (debug:print 2 default-log-port "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n") (if (not (launch:einf-exit-status exit-info)) (exit 4))))))) (define (launch:cache-config)	\| \| \| \|	649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675	(if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN")) ((eq? (launch:einf-rollup-status exit-info) 3) "CHECK") ((eq? (launch:einf-rollup-status exit-info) 4) "WAIVED") ((eq? (launch:einf-rollup-status exit-info) 5) "ABORT") ((eq? (launch:einf-rollup-status exit-info) 6) "SKIP") (else "FAIL")))) ;; (db:test-get-status testinfo))) (debug:print-info 1 default-log-port "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info)) (tests:test-set-status! area-dat run-id test-id new-state new-status (args:get-arg "-m") #f) ;; need to update the top test record if PASS or FAIL and this is a subtest ;; NO NEED TO CALL set-state-status-and-roll-up-items HERE, THIS IS DONE IN set-state-status-and-roll-up-items called by tests:test-set-status! )) ;; for automated creation of the rollup html file this is a good place... (if (not (equal? item-path "")) (tests:summarize-items area-dat run-id test-id test-name #f)) (tests:summarize-test area-dat run-id test-id) ;; don't force - just update if no (rmt:update-run-stats area-dat run-id (rmt:get-raw-run-stats area-dat run-id))) (mutex-unlock! m) (debug:print 2 default-log-port "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n") (if (not (launch:einf-exit-status exit-info)) (exit 4))))))) (define (launch:cache-config)
︙			︙
731 732 733 734 735 736 737 ~~738~~ 739 740 741 742 743 744 745	(debug:print 0 default-log-port "NOTE: skipping launch:setup-body call since we have fulldata") (mutex-unlock! launch-setup-mutex) toppath) (let ((res (launch:setup-body force: force))) (mutex-unlock! launch-setup-mutex) res))) ~~(define (launch:setup-body #!key (force #f))~~ (let* ((toppath (or toppath (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath (runname (common:args-get-runname)) (target (common:args-get-target)) (linktree (common:get-linktree)) (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config (rundir (if (and runname target linktree)(conc linktree "/" target "/" runname) #f))	\|	730 731 732 733 734 735 736 737 738 739 740 741 742 743 744	(debug:print 0 default-log-port "NOTE: skipping launch:setup-body call since we have fulldata") (mutex-unlock! launch-setup-mutex) toppath) (let ((res (launch:setup-body force: force))) (mutex-unlock! launch-setup-mutex) res))) (define (launch:setup-body area-dat #!key (force #f)) (let* ((toppath (or toppath (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath (runname (common:args-get-runname)) (target (common:args-get-target)) (linktree (common:get-linktree)) (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config (rundir (if (and runname target linktree)(conc linktree "/" target "/" runname) #f))
︙			︙
791 792 793 794 795 796 797 ~~798~~ 799 800 801 802 803 804 805	(set! toppath toppath) (if (not toppath) (begin (debug:print-error 0 default-log-port "you are not in a megatest area!") (exit 1))) (setenv "MT_RUN_AREA_HOME" toppath) ;; the seed read is done, now read runconfigs, cache it then read megatest.config one more time and cache it ~~(let* ((keys (rmt:get-keys))~~ (key-vals (keys:target->keyval keys target)) (linktree (or (getenv "MT_LINKTREE") (if configdat (configf:lookup configdat "setup" "linktree") #f))) (second-pass (find-and-read-config mtconfig environ-patt: "env-override" given-toppath: toppath	\|	790 791 792 793 794 795 796 797 798 799 800 801 802 803 804	(set! toppath toppath) (if (not toppath) (begin (debug:print-error 0 default-log-port "you are not in a megatest area!") (exit 1))) (setenv "MT_RUN_AREA_HOME" toppath) ;; the seed read is done, now read runconfigs, cache it then read megatest.config one more time and cache it (let* ((keys (rmt:get-keys area-dat)) (key-vals (keys:target->keyval keys target)) (linktree (or (getenv "MT_LINKTREE") (if configdat (configf:lookup configdat "setup" "linktree") #f))) (second-pass (find-and-read-config mtconfig environ-patt: "env-override" given-toppath: toppath
︙			︙
929 930 931 932 933 934 935 ~~936~~ 937 938 939 940 941 942 943	(lnkbase (conc linktree "/" target "/" runname)) (lnkpath (conc lnkbase "/" testname)) (lnkpathf (conc lnkpath (if not-iterated "" "/") item-path)) (lnktarget (conc lnkpath "/" item-path))) ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical ;; rundir shortdir ~~(rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path run-id)~~ (debug:print 2 default-log-port "INFO:\n lnkbase=" lnkbase "\n lnkpath=" lnkpath "\n toptest-path=" toptest-path "\n test-path=" test-path) (if (not (file-exists? linktree)) (begin (debug:print 0 default-log-port "WARNING: linktree did not exist! Creating it now at " linktree) (create-directory linktree #t))) ;; (system (conc "mkdir -p " linktree)))) ;; create the directory for the tests dir links, this is needed no matter what...	\|	928 929 930 931 932 933 934 935 936 937 938 939 940 941 942	(lnkbase (conc linktree "/" target "/" runname)) (lnkpath (conc lnkbase "/" testname)) (lnkpathf (conc lnkpath (if not-iterated "" "/") item-path)) (lnktarget (conc lnkpath "/" item-path))) ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical ;; rundir shortdir (rmt:general-call area-dat 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path run-id) (debug:print 2 default-log-port "INFO:\n lnkbase=" lnkbase "\n lnkpath=" lnkpath "\n toptest-path=" toptest-path "\n test-path=" test-path) (if (not (file-exists? linktree)) (begin (debug:print 0 default-log-port "WARNING: linktree did not exist! Creating it now at " linktree) (create-directory linktree #t))) ;; (system (conc "mkdir -p " linktree)))) ;; create the directory for the tests dir links, this is needed no matter what...
︙			︙
989 990 991 992 993 994 995 ~~996~~ 997 998 ~~999~~ 1000 1001 1002 1003 ~~1004~~ 1005 1006 1007 1008 1009 ~~1010~~ 1011 1012 1013 1014 1015 1016 1017	;; NB - This was not working right - some top tests are not getting the path set!!! ;; ;; Do the setting of this record after the paths are created so that the shortdir can ;; be set to the real directory location. This is safer for future clean up if the link ;; tree is damaged or lost. ;; (if (not (hash-table-ref/default toptest-paths testname #f)) ~~(let* ((testinfo (rmt:get-test-info-by-id run-id test-id)) ;; run-id testname item-path))~~ (curr-test-path (if testinfo ;; (filedb:get-path fdb ;; (db:get-path dbstruct ~~;; (rmt:sdb-qry 'getstr~~ (db:test-get-rundir testinfo) ;; ) ;; ) #f))) (hash-table-set! toptest-paths testname curr-test-path) ;; NB// Was this for the test or for the parent in an iterated test? ~~(rmt:general-call 'test-set-rundir-shortdir run-id lnkpath~~ (if (file-exists? lnkpath) ;; (resolve-pathname lnkpath) (common:nice-path lnkpath) lnkpath) testname "" run-id) ~~;; (rmt:general-call 'test-set-rundir run-id lnkpath testname "") ;; toptest-path)~~ (if (or (not curr-test-path) (not (directory-exists? toptest-path))) (begin (debug:print-info 2 default-log-port "Creating " toptest-path " and link " lnkpath) (handle-exceptions exn #f ;; don't care to catch and deal with errors here for now.	\| \| \| \|	988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016	;; NB - This was not working right - some top tests are not getting the path set!!! ;; ;; Do the setting of this record after the paths are created so that the shortdir can ;; be set to the real directory location. This is safer for future clean up if the link ;; tree is damaged or lost. ;; (if (not (hash-table-ref/default toptest-paths testname #f)) (let* ((testinfo (rmt:get-test-info-by-id area-dat run-id test-id)) ;; run-id testname item-path)) (curr-test-path (if testinfo ;; (filedb:get-path fdb ;; (db:get-path dbstruct ;; (rmt:sdb-qry area-dat 'getstr (db:test-get-rundir testinfo) ;; ) ;; ) #f))) (hash-table-set! toptest-paths testname curr-test-path) ;; NB// Was this for the test or for the parent in an iterated test? (rmt:general-call area-dat 'test-set-rundir-shortdir run-id lnkpath (if (file-exists? lnkpath) ;; (resolve-pathname lnkpath) (common:nice-path lnkpath) lnkpath) testname "" run-id) ;; (rmt:general-call area-dat 'test-set-rundir run-id lnkpath testname "") ;; toptest-path) (if (or (not curr-test-path) (not (directory-exists? toptest-path))) (begin (debug:print-info 2 default-log-port "Creating " toptest-path " and link " lnkpath) (handle-exceptions exn #f ;; don't care to catch and deal with errors here for now.
︙			︙
1133 1134 1135 1136 1137 1138 1139 ~~1140~~ 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 ~~1154~~ 1155 1156 1157 1158 ~~1159 1160~~ 1161 1162 1163 1164 1165 1166 1167	(test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path (work-area #f) (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all (diskpath #f) (cmdparms #f) (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) (mt-bindir-path #f) ~~(testinfo (rmt:get-test-info-by-id run-id test-id))~~ (mt_target (string-intersperse (map cadr keyvals) "/")) (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) (if (args:get-arg "-logging")(list "-logging") '())))) ;; (if hosts (set! hosts (string-split hosts))) ;; set the megatest to be called on the remote host (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest")) (set! mt-bindir-path (pathname-directory remote-megatest)) (if launcher (set! launcher (string-split launcher))) ;; set up the run work area for this test (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin (debug:print-info 0 default-log-port "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) ~~(runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record~~ ;; prevent overlapping actions - set to LAUNCHED as early as possible ;; ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f) ;; (pp (hash-table->alist tconfig)) (set! diskpath (get-best-disk configdat tconfig)) (if diskpath (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) (debug:print-info 2 default-log-port "Using work area " work-area))	\| \| \| \|	1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166	(test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path (work-area #f) (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all (diskpath #f) (cmdparms #f) (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) (mt-bindir-path #f) (testinfo (rmt:get-test-info-by-id area-dat run-id test-id)) (mt_target (string-intersperse (map cadr keyvals) "/")) (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) (if (args:get-arg "-logging")(list "-logging") '())))) ;; (if hosts (set! hosts (string-split hosts))) ;; set the megatest to be called on the remote host (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest")) (set! mt-bindir-path (pathname-directory remote-megatest)) (if launcher (set! launcher (string-split launcher))) ;; set up the run work area for this test (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin (debug:print-info 0 default-log-port "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) (runs:remove-test-directory area-dat testinfo 'remove-data-only))) ;; remove data only, do not perturb the record ;; prevent overlapping actions - set to LAUNCHED as early as possible ;; ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail (tests:test-set-status! area-dat run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) (rmt:set-state-status-and-roll-up-items area-dat run-id test-name item-path #f "LAUNCHED" #f) ;; (pp (hash-table->alist tconfig)) (set! diskpath (get-best-disk configdat tconfig)) (if diskpath (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat))) (set! work-area (car dat)) (set! toptest-work-area (cadr dat)) (debug:print-info 2 default-log-port "Using work area " work-area))
︙			︙
1190 1191 1192 1193 1194 1195 1196 ~~1197~~ 1198 1199 1200 1201 1202 1203 1204	(list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) (list 'env-ovrd (hash-table-ref/default configdat "env-override" '())) (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) (list 'runname runname) (list 'mt-bindir-path mt-bindir-path)))))))) ;; clean out step records from previous run if they exist ~~;; (rmt:delete-test-step-records run-id test-id)~~ ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway (if (file-exists? work-area) (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir (cond ;; ((and launcher hosts) ;; must be using ssh hostname ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms))))	\|	1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203	(list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) (list 'env-ovrd (hash-table-ref/default configdat "env-override" '())) (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) (list 'runname runname) (list 'mt-bindir-path mt-bindir-path)))))))) ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records area-dat run-id test-id) ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway (if (file-exists? work-area) (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir (cond ;; ((and launcher hosts) ;; must be using ssh hostname ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms))))
︙			︙
1267 1268 1269 1270 1271 1272 1273 ~~1274~~ 1275 1276 1277 1278 1279 1280 1281 1282 1283 ~~1284~~ 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296	(alist->env-vars testprevvals) (alist->env-vars commonprevvals) launch-results)) (change-directory toppath))) ;; recover a test where the top controlling mtest may have died ;; ~~(define (launch:recover-test run-id test-id)~~ ;; this function is called on the test run host via ssh ;; ;; 1. look at the process from pid ;; - is it owned by calling user ;; - it it's run directory correct for the test ;; - is there a controlling mtest (maybe stuck) ;; 2. if recovery is needed watch pid ;; - when it exits take the exit code and do the needful ;; ~~(let* ((pid (rmt:test-get-top-process-id run-id test-id))~~ (psres (with-input-from-pipe (conc "ps -F -u " (current-user-name) " \| grep -E '" pid " ' \| grep -v 'grep -E " pid "'") (lambda () (read-line)))) (rundir (if (string? psres) ;; real process owned by user (read-symbolic-link (conc "/proc/" pid "/cwd")) #f))) ;; now wait on that process if all is correct ;; periodically update the db with runtime ;; when the process exits look at the db, if still RUNNING after 10 seconds set ;; state/status appropriately (process-wait pid)))	\| \|	1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295	(alist->env-vars testprevvals) (alist->env-vars commonprevvals) launch-results)) (change-directory toppath))) ;; recover a test where the top controlling mtest may have died ;; (define (launch:recover-test area-dat run-id test-id) ;; this function is called on the test run host via ssh ;; ;; 1. look at the process from pid ;; - is it owned by calling user ;; - it it's run directory correct for the test ;; - is there a controlling mtest (maybe stuck) ;; 2. if recovery is needed watch pid ;; - when it exits take the exit code and do the needful ;; (let* ((pid (rmt:test-get-top-process-id area-dat run-id test-id)) (psres (with-input-from-pipe (conc "ps -F -u " (current-user-name) " \| grep -E '" pid " ' \| grep -v 'grep -E " pid "'") (lambda () (read-line)))) (rundir (if (string? psres) ;; real process owned by user (read-symbolic-link (conc "/proc/" pid "/cwd")) #f))) ;; now wait on that process if all is correct ;; periodically update the db with runtime ;; when the process exits look at the db, if still RUNNING after 10 seconds set ;; state/status appropriately (process-wait pid)))