Megatest: Diff

Differences From Artifact [924f51a8e1]:

File launch.scm — part of check-in [53cfda8951] at 2020-10-02 16:03:57 on branch v1.70-defunct — If local check pstree locally ==/3.05/0.8/PASS/1202/mars/== (user: matt, size: 94581) [annotate] [blame] [check-ins using] [more...]

To Artifact [9905b8fdbe]:

File launch.scm — part of check-in [b50ac24a01] at 2020-10-03 21:25:20 on branch v1.70-defunct — Pull in all v1.65 before patching with prereq fix. (user: matt, size: 86277) [annotate] [blame] [check-ins using] [more...]

︙
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63	29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71	+ + - + - + + + + + + +	(import (prefix sqlite3 sqlite3:)) (declare (unit launch)) (declare (uses subrun)) (declare (uses common)) (declare (uses configf)) (declare (uses db)) (declare (uses ezsteps)) (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") (include "megatest-fossil-hash.scm") ;;====================================================================== ;; ezsteps ;;====================================================================== ;; ezsteps were going to be coded as ;; stepname[,predstep1,predstep2 ...] [{VAR1=first,second,third}] command to execute ;; BUT ;; now are ;; stepname {VAR=first,second,third ...} command ... ;; where the {VAR=first,second,third ...} is optional. ;; given an exit code and whether or not logpro was used calculate OK/BAD ;; return #t if we are ok, #f otherwise ~~(define (steprun-good? logpro exitcode)~~ (define (steprun-good? logpro exitcode stepparms) (or (eq? exitcode 0) ~~(and logpro (eq? exitcode 2))))~~ (and logpro (eq? exitcode 2)) ;; shouldn't this be (member exitcode 2 ...) with the other ok codes? (let* ((params (alist-ref 'params stepparms)) ;; get the params section (keep-going (if params (alist-ref "keep-going" params equal?) #f))) (debug:print 0 default-log-port "keep-going=" keep-going) (and keep-going (equal? (car keep-going) "yes"))))) ;; if handed a string, process it, else look for MT_CMDINFO (define (launch:get-cmdinfo-assoc-list #!key (encoded-cmd #f)) (let ((enccmd (if encoded-cmd encoded-cmd (getenv "MT_CMDINFO")))) (if enccmd (common:read-encoded-string enccmd) '())))
︙
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286	92 93 94 95 96 97 98 99 100 101 102 103 104 105	- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -	;; ) (cond ((equal? status "PASS") "PASS") ;; skip the message part if status is pass (status (conc (configf:lookup dat "final" "exit-status") ": " (if msg msg "no message"))) (else #f))) #f))) ~~(define (launch:runstep ezstep run-id test-id exit-info m tal testconfig) ;;; TODO: deprecate me in favor of ezsteps.scm~~ ~~(let* ((stepname (car ezstep)) ;; do stuff to run the step~~ ~~(stepinfo (cadr ezstep))~~ ~~;; (let ((info (cadr ezstep)))~~ ~~;; (if (proc? info) "" info)))~~ ~~;; (stepproc (let ((info (cadr ezstep)))~~ ~~;; (if (proc? info) info #f)))~~ ~~(stepparts (string-match (regexp "^(\\{([^\\}\\{])\\}\\s\|)(.)$") stepinfo))~~ ~~(stepparams (if (and (list? stepparts)~~ ~~(> (length stepparts) 1))~~ ~~(list-ref stepparts 2)~~ ~~#f)) ;; for future use, {VAR=1,2,3}, run step for each~~ ~~(paramparts (if (string? stepparams)~~ ~~(map (lambda (x)(string-split x "=")) (string-split-fields "[^;]=[^;]" stepparams))~~ ~~'()))~~ ~~(subrun (alist-ref "subrun" paramparts equal?))~~ ~~(stepcmd (if (and (list? stepparts)~~ ~~(> (length stepparts) 2))~~ ~~(list-ref stepparts 3)~~ ~~(conc "# error, no command for step "stepname)))~~ ~~(script "") ; "#!/bin/bash\n") ;; yep, we depend on bin/bash FIXME!!!\~~ ~~(logpro-file (conc stepname ".logpro"))~~ ~~(html-file (conc stepname ".html"))~~ ~~(dat-file (conc stepname ".dat"))~~ ~~(tconfig-logpro (configf:lookup testconfig "logpro" stepname))~~ ~~(logpro-used (common:file-exists? logpro-file)))~~ ~~(debug:print 0 default-log-port* "stepparts: " stepparts ", stepparams: " stepparams~~ ~~", paramparts: " paramparts ", subrun: " subrun ", stepcmd: " stepcmd)~~ ~~(if (and tconfig-logpro~~ ~~(not logpro-used)) ;; no logpro file found but have a defn in the testconfig~~ ~~(begin~~ ~~(with-output-to-file logpro-file~~ ~~(lambda ()~~ ~~(print ";; logpro file extracted from testconfig\n"~~ ~~";;")~~ ~~(print tconfig-logpro)))~~ ~~(set! logpro-used #t)))~~ ~~;; NB// can safely assume we are in test-area directory~~ ~~(debug:print 4 default-log-port "ezsteps:\n stepname: " stepname " stepinfo: " stepinfo " stepparts: " stepparts~~ ~~" stepparams: " stepparams " stepcmd: " stepcmd)~~ ~~;; ;; first source the previous environment~~ ~~;; (let ((prev-env (conc ".ezsteps/" prevstep (if (string-search (regexp "csh")~~ ~~;; (get-environment-variable "SHELL")) ".csh" ".sh"))))~~ ~~;; (if (and prevstep (common:file-exists? prev-env))~~ ~~;; (set! script (conc script "source " prev-env))))~~ ~~;; call the command using mt_ezstep~~ ~~;; (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "x") " " stepcmd))~~ ~~(debug:print 4 default-log-port "script: " script)~~ ~~(rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f)~~ ~~;; now launch the actual process~~ ~~(call-with-environment-variables~~ ~~(list (cons "PATH" (conc (get-environment-variable "PATH") ":.")))~~ ~~(lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1")~~ ~~(let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1~~ ~~(pid #f))~~ ~~(let ((proc (lambda ()~~ ~~(set! pid (process-run "/bin/bash" (list "-c" cmd))))))~~ ~~(if subrun~~ ~~(begin~~ ~~(debug:print-info 0 default-log-port "Running without MT_.* environment variables.")~~ ~~(common:without-vars proc "^MT_."))~~ ~~(proc)))~~ ~~(with-output-to-file "Makefile.ezsteps"~~ ~~(lambda ()~~ ~~(print stepname ".log :")~~ ~~(print "\t" cmd)~~ ~~(if (common:file-exists? (conc stepname ".logpro"))~~ ~~(print "\tlogpro " stepname ".logpro " stepname ".html < " stepname ".log"))~~ ~~(print)~~ ~~(print stepname " : " stepname ".log")~~ ~~(print))~~ ~~#:append)~~ ~~(rmt:test-set-top-process-pid run-id test-id pid)~~ ~~(let processloop ((i 0))~~ ~~(let-values (((pid-val exit-status exit-code)(process-wait pid #t)))~~ ~~(mutex-lock! m)~~ ~~(launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid)~~ ~~(launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status)~~ ~~(launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code)~~ ~~(mutex-unlock! m)~~ ~~(if (eq? pid-val 0)~~ ~~(begin~~ ~~(thread-sleep! 2)~~ ~~(processloop (+ i 1))))~~ ~~)))))~~ ~~(debug:print-info 0 default-log-port* "step " stepname " completed with exit code " (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2))~~ ~~;; now run logpro if needed~~ ~~(if logpro-used~~ ~~(let* ((logpro-exe (or (getenv "LOGPRO_EXE") "logpro"))~~ ~~(pid (process-run (conc "/bin/sh -c '"logpro-exe" "logpro-file " " (conc stepname ".html") " < " stepname ".log > /dev/null'"))))~~ ~~(let processloop ((i 0))~~ ~~(let-values (((pid-val exit-status exit-code)(process-wait pid #t)))~~ ~~(mutex-lock! m)~~ ~~;; (make-launch:einf pid: pid exit-status: exit-status exit-code: exit-code)~~ ~~(launch:einf-pid-set! exit-info pid) ;; (vector-set! exit-info 0 pid)~~ ~~(launch:einf-exit-status-set! exit-info exit-status) ;; (vector-set! exit-info 1 exit-status)~~ ~~(launch:einf-exit-code-set! exit-info exit-code) ;; (vector-set! exit-info 2 exit-code)~~ ~~(mutex-unlock! m)~~ ~~(if (eq? pid-val 0)~~ ~~(begin~~ ~~(thread-sleep! 2)~~ ~~(processloop (+ i 1)))))~~ ~~(debug:print-info 0 default-log-port "logpro for step " stepname " exited with code " (launch:einf-exit-code exit-info))))) ;; (vector-ref exit-info 2)))))~~ ~~(let ((exinfo (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2))~~ ~~(logfna (if logpro-used (conc stepname ".html") ""))~~ ~~(comment #f))~~ ~~(if logpro-used~~ ~~(let ((datfile (conc stepname ".dat")))~~ ~~;; load the .dat file into the test_data table if it exists~~ ~~(if (common:file-exists? datfile)~~ ~~(set! comment (launch:load-logpro-dat run-id test-id stepname)))~~ ~~(rmt:test-set-log! run-id test-id (conc stepname ".html"))))~~ ~~(rmt:teststep-set-status! run-id test-id stepname "end" exinfo comment logfna))~~ ~~;; set the test final status~~ ~~(let* ((process-exit-status (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2))~~ ~~(this-step-status (cond~~ ~~((and (eq? process-exit-status 2) logpro-used) 'warn) ;; logpro 2 = warnings~~ ~~((and (eq? process-exit-status 3) logpro-used) 'check) ;; logpro 3 = check~~ ~~((and (eq? process-exit-status 4) logpro-used) 'waived) ;; logpro 4 = waived~~ ~~((and (eq? process-exit-status 5) logpro-used) 'abort) ;; logpro 5 = abort~~ ~~((and (eq? process-exit-status 6) logpro-used) 'skip) ;; logpro 6 = skip~~ ~~((eq? process-exit-status 0) 'pass) ;; logpro 0 = pass~~ ~~(else 'fail)))~~ ~~(overall-status (cond~~ ~~((eq? (launch:einf-rollup-status exit-info) 2) 'warn) ;; rollup-status (vector-ref exit-info 3)~~ ~~((eq? (launch:einf-rollup-status exit-info) 0) 'pass) ;; (vector-ref exit-info 3)~~ ~~(else 'fail)))~~ ~~(next-status (cond~~ ~~((eq? overall-status 'pass) this-step-status)~~ ~~((eq? overall-status 'warn)~~ ~~(if (eq? this-step-status 'fail) 'fail 'warn))~~ ~~((eq? overall-status 'abort) 'abort)~~ ~~(else 'fail)))~~ ~~(next-state ;; "RUNNING") ;; WHY WAS THIS CHANGED TO NOT USE (null? tal) ??~~ ~~(cond~~ ~~((null? tal) ;; more to run?~~ ~~"COMPLETED")~~ ~~(else "RUNNING"))))~~ ~~(debug:print 4 default-log-port "Exit value received: " (launch:einf-exit-code exit-info) " logpro-used: " logpro-used~~ ~~" this-step-status: " this-step-status " overall-status: " overall-status~~ ~~" next-status: " next-status " rollup-status: " (launch:einf-rollup-status exit-info)) ;; (vector-ref exit-info 3))~~ ~~(case next-status~~ ~~((warn)~~ ~~(launch:einf-rollup-status-set! exit-info 2) ;; (vector-set! exit-info 3 2) ;; rollup-status~~ ~~;; NB// test-set-status! does rdb calls under the hood~~ ~~(tests:test-set-status! run-id test-id next-state "WARN"~~ ~~(if (eq? this-step-status 'warn) "Logpro warning found" #f)~~ ~~#f))~~ ~~((check)~~ ~~(launch:einf-rollup-status-set! exit-info 3) ;; (vector-set! exit-info 3 3) ;; rollup-status~~ ~~;; NB// test-set-status! does rdb calls under the hood~~ ~~(tests:test-set-status! run-id test-id next-state "CHECK"~~ ~~(if (eq? this-step-status 'check) "Logpro check found" #f)~~ ~~#f))~~ ~~((waived)~~ ~~(launch:einf-rollup-status-set! exit-info 4) ;; (vector-set! exit-info 3 3) ;; rollup-status~~ ~~;; NB// test-set-status! does rdb calls under the hood~~ ~~(tests:test-set-status! run-id test-id next-state "WAIVED"~~ ~~(if (eq? this-step-status 'check) "Logpro waived found" #f)~~ ~~#f))~~ ~~((abort)~~ ~~(launch:einf-rollup-status-set! exit-info 5) ;; (vector-set! exit-info 3 4) ;; rollup-status~~ ~~;; NB// test-set-status! does rdb calls under the hood~~ ~~(tests:test-set-status! run-id test-id next-state "ABORT"~~ ~~(if (eq? this-step-status 'abort) "Logpro abort found" #f)~~ ~~#f))~~ ~~((skip)~~ ~~(launch:einf-rollup-status-set! exit-info 6) ;; (vector-set! exit-info 3 4) ;; rollup-status~~ ~~;; NB// test-set-status! does rdb calls under the hood~~ ~~(tests:test-set-status! run-id test-id next-state "SKIP"~~ ~~(if (eq? this-step-status 'skip) "Logpro skip found" #f)~~ ~~#f))~~ ~~((pass)~~ ~~(tests:test-set-status! run-id test-id next-state "PASS" #f #f))~~ ~~(else ;; 'fail~~ ~~(launch:einf-rollup-status-set! exit-info 1) ;; (vector-set! exit-info 3 1) ;; force fail, this used to be next-state but that doesn't make sense. should always be "COMPLETED"~~ ~~(tests:test-set-status! run-id test-id "COMPLETED" "FAIL" (conc "Failed at step " stepname) #f)~~ ~~)))~~ ~~logpro-used))~~ (define (launch:manage-steps run-id test-id item-path fullrunscript ezsteps subrun test-name tconfigreg exit-info m) ;; (let-values ;; (((pid exit-status exit-code) ;; (run-n-wait fullrunscript))) ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) ;; Since we should have a clean slate at this time there is no need to do ;; any of the other stuff that tests:test-set-status! does. Let's just
︙
347 348 349 350 351 352 353 ~~354~~ 355 356 357 358 359 360 361 362 363 364 ~~365 366~~ 367 368 369 370 ~~371~~ 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 ~~389~~ 390 391 392 393 394 ~~395~~ 396 397 398 399 400 401 402	166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225	- + - - + + + + - + - + - + + +	(set! ezsteps #t) ;; set the needed flag (set! ezstepslst (append (or ezstepslst '()) (list (list "subrun" (conc "{subrun=true} " mt-cmd))))))) ;; process the ezsteps (if ezsteps ~~(~~begin~~~~ (let* ((all-steps-dat (make-hash-table))) ;; keep all the info around as stepname ==> alist; where 'params is the params list (add other stuff as needed) (if (not (common:file-exists? ".ezsteps"))(create-directory ".ezsteps")) ;; if ezsteps was defined then we are sure to have at least one step but check anyway (if (not (> (length ezstepslst) 0)) (debug:print-error 0 default-log-port "ezsteps defined but ezstepslst is zero length") (let loop ((ezstep (car ezstepslst)) (tal (cdr ezstepslst)) (prevstep #f)) (debug:print-info 0 default-log-port "Processing ezstep \"" (string-intersperse ezstep " ") "\"") ;; check exit-info (vector-ref exit-info 1) (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) ~~(let ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig)) (stepname (car ezstep)))~~ (let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat)) (stepname (car ezstep)) (stepparms (hash-table-ref all-steps-dat stepname))) (setenv "MT_STEP_NAME" stepname) (pp (hash-table->alist all-steps-dat)) ;; if logpro-used read in the stepname.dat file (if (and logpro-used (common:file-exists? (conc stepname ".dat"))) (launch:load-logpro-dat run-id test-id stepname)) ~~(if (steprun-good? logpro-used (launch:einf-exit-code exit-info))~~ (if (steprun-good? logpro-used (launch:einf-exit-code exit-info) stepparms) (if (not (null? tal)) (loop (car tal) (cdr tal) stepname)) (debug:print 0 default-log-port "WARNING: step " (car ezstep) " failed. Stopping"))) (debug:print 0 default-log-port "WARNING: a prior step failed, stopping at " ezstep))))))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) (let* ((update-period (string->number (or (configf:lookup configdat "setup" "test-stats-update-period") "30"))) (start-seconds (current-seconds)) (calc-minutes (lambda () (inexact->exact (round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) ~~(tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)~~ (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) ~~(common:telemetry-log "zombie" (conc "launch:monitor-job - ~~top of loop encountered at "(current-seconds)" with last-sync="last-sync))~~~~ ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - ;; top of loop encountered at "(current-seconds)" with ;; last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds
︙
410 411 412 413 414 415 416 ~~417~~ 418 419 420 421 422 423 424 425 ~~426~~ 427 428 429 430 431 432 ~~433~~ ~~434 435 436 437~~ ~~438 439~~ 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 ~~458~~ 459 460 461 462 463 464 465 466 467 468 469 ~~470 471 472~~ 473 474 475 476 477 478 479	233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299	- + - + - + - - - - + - - - + - - - + + + + +	(do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") ~~(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)~~ (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 default-log-port "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) ~~(~~when~~ do-sync~~ (if do-sync ;; save meta data about the running of this test ~~;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)~~ ~~;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))~~ ~~(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))~~ (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) ~~(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))~~ (if kill-job? (begin (debug:print-info 0 default-log-port "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries? (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0)) (pid2 (rmt:test-get-top-process-pid run-id test-id)) (pids (delete-duplicates (filter number? (list pid1 pid2))))) (if (not (null? pids)) (begin (for-each (lambda (pid) (handle-exceptions exn (begin (debug:print-info 0 default-log-port "Unable to kill process with pid " pid ", possibly already killed.") ~~(debug:print 0 default-log-port " message: " ((condition-property-accessor 'exn 'message) exn)))~~ (debug:print 0 default-log-port " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)) (debug:print 0 default-log-port "WARNING: Request received to kill job " pid) ;; " (attempt # " kill-tries ")") (debug:print-info 0 default-log-port "Signal mask=" (signal-mask)) ;; (if (process:alive? pid) ;; (begin (map (lambda (pid-num) (process-signal pid-num signal/term)) (process:get-sub-pids pid)) (thread-sleep! 5) ;; (if (process:process-alive? pid) (map (lambda (pid-num) (handle-exceptions ~~exn #f (process-signal pid-num signal/kill)))~~ exn (begin (debug:print 0 default-log-port " .... had trouble sending kill to " pid-num ", exn=" exn) #f) (process-signal pid-num signal/kill))) (process:get-sub-pids pid)))) ;; (debug:print-info 0 default-log-port "not killing process " pid " as it is not alive")))) pids) ;; BB: question to Matt -- does the tests:test-state-status! encompass rollup to toplevel? If not, should it? (tests:test-set-status! run-id test-id "KILLED" "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt (begin (debug:print-error 0 default-log-port "Nothing to kill, pid1=" pid1 ", pid2=" pid2)
︙
487 488 489 490 491 492 493 ~~494~~ 495 496 497 498 499 500 501	307 308 309 310 311 312 313 314 315 316 317 318 319 320 321	- +	(begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) ~~(tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional~~ (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) ;;(bb-check-path msg: "launch:execute incoming")
︙
640 641 642 643 644 645 646 ~~647~~ 648 649 650 651 652 653 654 655 656 657 658 659 660 ~~661~~ ~~662~~ 663 664 665 666 667 668 669	460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493	- + + + + + + - + -	(db:test-get-host test-info) (begin (debug:print 0 default-log-port "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. ~~((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun~~ ((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (and (equal? (db:test-get-state test-info) "COMPLETED") ;; completed/abort => rerun if asked (member (db:test-get-status test-info) '("ABORT")))) (debug:print 0 default-log-port "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) ;; prime it for running ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART")) (if (process:alive-on-host? test-host test-pid) (debug:print-error 0 default-log-port "test state is " (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed") (exit))) ((member (db:test-get-state test-info) '("COMPLETED")) ;; we do NOT want to re-run COMPLETED jobs. Mark as NOT_STARTED to run! (debug:print-error 0 default-log-port "test state is " (db:test-get-state test-info) ", cannot proceed") (exit)) ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) ~~(rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)~~ (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)) ) (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) (debug:print-error 0 default-log-port "test state is " (db:test-get-state test-info) ", cannot proceed") (exit)))) ;; cleanup prior execution's steps (rmt:delete-steps-for-test! run-id test-id)
︙
1218 1219 1220 1221 1222 1223 1224 ~~1225~~ 1226 1227 1228 1229 1230 1231 ~~1232~~ 1233 1234 1235 1236 1237 1238 1239	1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063	- + - +	(begin (if (not (common:file-exists? linktree)) (begin (handle-exceptions exn (begin (debug:print-error 0 default-log-port "Something went wrong when trying to create linktree dir at " linktree) ~~(debug:print 0 default-log-port " message: " ((condition-property-accessor 'exn 'message) exn))~~ (debug:print 0 default-log-port " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (exit 1)) (create-directory linktree #t)))) (handle-exceptions exn (begin (debug:print-error 0 default-log-port "Something went wrong when trying to create link to linktree at " toppath) ~~(debug:print 0 default-log-port " message: " ((condition-property-accessor 'exn 'message) exn)))~~ (debug:print 0 default-log-port " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)) (let ((tlink (conc toppath "/lt"))) (if (not (common:file-exists? tlink)) (create-symbolic-link linktree tlink))))) (begin (debug:print-error 0 default-log-port "linktree not defined in [setup] section of megatest.config") ))) (if (and toppath
︙
1290 1291 1292 1293 1294 1295 1296 ~~1297~~ 1298 1299 1300 1301 1302 1303 1304	1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132	- + + + + +	;; (if (common:low-noise-print 20 "No valid disks or no disk with enough space") ;; (debug:print-error 0 default-log-port "No valid disks found in megatest.config. Please add some to your [disks] section and ensure the directory exists and has enough space!\n You can change minspace in the [setup] section of megatest.config. Current setting is: " minspace)) ;;(exit 1) (if (null? disks) (cons 1 (conc toppath "/runs")) (let ((paths (sort disks (lambda (x y) (> (string-length (cadr x)) (string-length (cadr y))))))) (let loop ((head (car paths)) (tail (cdr paths))) ~~(let ((result (handle-exceptions exn ~~#f (create-directory (cadr head) #t))))~~~~ (let ((result (handle-exceptions exn (begin (debug:print 0 default-log-port "failed to create dir " (cadr head) ", exn=" exn) #f) (create-directory (cadr head) #t)))) (if result result (if (null? tail) (cons 1 (conc toppath "/runs")) (loop (car tail) (cdr tail))))))))))) ;; no disks definition - use mtrah/runs, fall back to currdir/runs (let* ((toppath (or toppath
︙
1386 1387 1388 1389 1390 1391 1392 ~~1393~~ 1394 1395 1396 1397 1398 1399 1400	1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228	- +	;; create the directory for the tests dir links, this is needed no matter what... try up to three times (let loop ((done 3)) (let ((success (if (and (not (common:directory-exists? lnkbase)) (not (common:file-exists? lnkbase))) (handle-exceptions exn (begin ~~(debug:print-error 0 default-log-port "Problem creating linktree base at " lnkbase)~~ (debug:print-error 0 default-log-port "Problem creating linktree base at " lnkbase ", exn=" exn) (print-error-message exn (current-error-port)) #t) (create-directory lnkbase #t) #f)))) (if (and (not success)(> done 0)) (loop (- done 1)))))
︙
1409 1410 1411 1412 1413 1414 1415 ~~1416~~ 1417 1418 1419 1420 1421 1422 1423 ~~1424~~ 1425 1426 1427 1428 1429 1430 1431 1432 ~~1433~~ 1434 1435 1436 1437 1438 1439 1440	1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271	- + + - + + - + +	;; level (if (not not-iterated) ;; i.e. iterated (let ((iterated-parent (pathname-directory (conc lnkpath "/" item-path)))) (debug:print-info 2 default-log-port "Creating iterated parent " iterated-parent) (handle-exceptions exn (begin ~~(debug:print-error 0 default-log-port " Failed to create directory " iterated-parent ((condition-property-accessor 'exn 'message) exn) ~~", continuing but link tree may be corrupted")~~~~ (debug:print-error 0 default-log-port " Failed to create directory " iterated-parent ((condition-property-accessor 'exn 'message) exn) ", continuing but link tree may be corrupted, exn=" exn) #;(exit 1)) (create-directory iterated-parent #t)))) (if (symbolic-link? lnkpath) (handle-exceptions exn (begin ~~(debug:print-error 0 default-log-port " Failed to remove symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ~~", continuing but link tree may be corrupted.")~~~~ (debug:print-error 0 default-log-port " Failed to remove symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", continuing but link tree may be corrupted. exn=" exn) #;(exit 1)) (delete-file lnkpath))) (if (not (or (common:file-exists? lnkpath) (symbolic-link? lnkpath))) (handle-exceptions exn (begin ~~(debug:print-error 0 default-log-port " Failed to create symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ~~", continuing but link tree may be corrupted.")~~~~ (debug:print-error 0 default-log-port " Failed to create symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", continuing but link tree may be corrupted. exn=" exn) #;(exit 1)) (create-symbolic-link toptest-path lnkpath))) ;; NB - This was not working right - some top tests are not getting the path set!!! ;; ;; Do the setting of this record after the paths are created so that the shortdir can ;; be set to the real directory location. This is safer for future clean up if the link
︙
1457 1458 1459 1460 1461 1462 1463 ~~1464 1465~~ 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 ~~1478 1479~~ 1480 1481 1482 1483 1484 1485 1486 1487 1488 ~~1489~~ 1490 1491 1492 1493 1494 1495 1496	1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330	- - + + + + - - + + + - +	testname "" run-id) ;; (rmt:general-call 'test-set-rundir run-id lnkpath testname "") ;; toptest-path) (if (or (not curr-test-path) (not (directory-exists? toptest-path))) (begin (debug:print-info 2 default-log-port "Creating " toptest-path " and link " lnkpath) (handle-exceptions ~~exn #f ~~;; do~~n~~'t care to catch and deal with errors here for now.~~~~ exn (begin (debug:print 0 default-log-port "failed to create directory " toptest-path ", exn=" exn) #f) (create-directory toptest-path #t)) (hash-table-set! toptest-paths testname toptest-path))))) ;; The toptest path has been created, the link to the test in the linktree has ;; been created. Now, if this is an iterated test the real test dir must be created (if (not not-iterated) ;; this is an iterated test (begin ;; (let ((lnktarget (conc lnkpath "/" item-path))) (debug:print 2 default-log-port "Setting up sub test run area") (debug:print 2 default-log-port " - creating run area in " test-path) (handle-exceptions exn (begin ~~(debug:print-error 0 default-log-port " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ~~", exiting")~~ (ex~~it 1)~~)~~ (debug:print-error 0 default-log-port " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ", continuing (might cause downstream issues?), exn=" exn) #f) (create-directory test-path #t)) (debug:print 2 default-log-port " - creating link from: " test-path "\n" " to: " lnktarget) ;; If there is already a symlink delete it and recreate it. (handle-exceptions exn (begin ~~(debug:print-error 0 default-log-port " Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting")~~ (debug:print-error 0 default-log-port " Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting, exn=" exn) (exit)) (if (symbolic-link? lnktarget) (delete-file lnktarget)) (if (not (common:file-exists? lnktarget)) (create-symbolic-link test-path lnktarget))))) (if (not (directory? test-path)) (create-directory test-path #t)) ;; this is a hack, I don't know why out of the blue this path does not exist sometimes
︙
1548 1549 1550 1551 1552 1553 1554 ~~1555~~ 1556 1557 1558 1559 1560 1561 1562	1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396	- +	;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; ;; (begin ;; (thread-sleep! 1) ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) (item-path (item-list->path itemdat)) (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) (let loop ((delta (- (current-seconds) last-launch)) ~~(launch-delay (configf:lookup-number configdat "setup" "launch-delay" default: 1)))~~ (launch-delay (configf:lookup-number configdat "setup" "launch-delay" default: 0))) (if (> launch-delay delta) (begin (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. (debug:print-info 0 default-log-port "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds")) (thread-sleep! (- launch-delay delta)) (loop (- (current-seconds) last-launch) launch-delay)))) (change-directory toppath)
︙
1591 1592 1593 1594 1595 1596 1597 ~~1598~~ 1599 1600 1601 1602 1603 1604 1605	1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440	+ - +	(remote-megatest (configf:lookup configdat "setup" "executable")) (run-time-limit (or (configf:lookup tconfig "requirements" "runtimelim") (configf:lookup configdat "setup" "runtimelim"))) ;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to ;; allow running from dashboard. Extract the path ;; from the called megatest and convert dashboard ;; or dboard to megatest (local-megatest (common:find-local-megatest)) ~~(local-megatest (let* ((lm (car (argv)))~~ #;(local-megatest (let* ((lm (car (argv))) (dir (pathname-directory lm)) (exe (pathname-strip-directory lm))) (conc (if dir (conc dir "/") "") (case (string->symbol exe) ((dboard) "../megatest") ((mtest) "../megatest") ((dashboard) "megatest")
︙
1714 1715 1716 1717 1718 1719 1720 ~~1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731~~ 1732 1733 1734 1735 1736 1737 1738	1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573	- - - - - - - - - - - + + + + + + + + + + +	) itemdat))) (testprevvals (alist->env-vars (hash-table-ref/default tconfig "pre-launch-env-overrides" '()))) ;; Launchwait defaults to true, must override it to turn off wait (launchwait (if (equal? (configf:lookup configdat "setup" "launchwait") "no") #f #t)) (launch-results-prev (apply (if launchwait ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed. process:cmd-run-with-stderr-and-exitcode->list process-run) (if useshell (let ((cmdstr (string-intersperse fullcmd " "))) (if launchwait cmdstr (conc cmdstr " >> mt_launch.log 2>&1 &"))) (car fullcmd)) (if useshell '() (cdr fullcmd)))) process:cmd-run-with-stderr-and-exitcode->list process-run) (if useshell (let ((cmdstr (string-intersperse fullcmd " "))) (if launchwait cmdstr (conc cmdstr " >> mt_launch.log 2>&1 &"))) (car fullcmd)) (if useshell '() (cdr fullcmd)))) (success (if launchwait (equal? 0 (cadr launch-results-prev)) #t)) (launch-results (if launchwait (car launch-results-prev) launch-results-prev))) (if (not success) (tests:test-set-status! run-id test-id "COMPLETED" "DEAD" "launcher failed; exited non-zero; check mt_launch.log" #f)) ;; (if launch-results launch-results "FAILED")) (mutex-unlock! launch-setup-mutex) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. ;; (rmt:no-sync-del! lock-key) ;; release the lock for starting this test (if (not launchwait) ;; give the OS a little time to allow the process to start
︙