Megatest: Diff

Differences From Artifact [31523ae98e]:

File runs.scm — part of check-in [39d1713c91] at 2014-11-02 09:24:19 on branch streamline-exception-handling — Stopped calls to set state and status where the test-id was not known (test not actually registered yet). Added a safety net to some calls to print the call chain if called with bad data; moved a couple of mutexes around to prevent contention. (user: matt, size: 85533) [annotate] [blame] [check-ins using] [more...]

To Artifact [396462afab]:

File runs.scm — part of check-in [ec50f4ac00] at 2014-11-25 21:10:17 on branch v1.60 — Add a big delay and take a break when the system is clearly overloaded. (user: matt, size: 86163) [annotate] [blame] [check-ins using] [more...]

︙
64 65 66 67 68 69 70 71 72 73 74 75 76 77	64 65 66 67 68 69 70 71 72 73 74 75 76 77 78	+	(for-each (lambda (keyval) (setenv (car keyval)(cadr keyval))) keyvals) ;; Set up various and sundry known vars here (setenv "MT_RUN_AREA_HOME" toppath) (setenv "MT_RUNNAME" runname) (setenv "MT_TARGET" target) (setenv "MT_TESTSUITENAME" (common:get-testsuite-name)) (set! envdat (append envdat (list (list "MT_RUN_AREA_HOME" toppath) (list "MT_RUNNAME" runname) (list "MT_TARGET" target)))) ;; Now can read the runconfigs file ;;
︙
157 158 159 160 161 162 163 ~~164 165 166~~ 167 168 169 170 171 172 173	158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174	- - - + + +	(if (> (- currtime lasttime) waitval) (begin (hash-table-set! runs:denoise key currtime) #t) #f))) (define (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) ~~(thread-sleep! (cond ((> runs:can-run-more-tests-count 20) 2);; obviously haven't had any work to do for a while (else 0)))~~ ;;(thread-sleep! (cond ;; ((> runs:can-run-more-tests-count 20) 2);; obviously haven't had any work to do for a while ;; (else 0))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (config-lookup configdat "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0)
︙
211 212 213 214 215 216 217 ~~218~~ 219 220 221 222 ~~223 224~~ ~~225 226~~ 227 228 229 230 ~~231 232~~ 233 234 235 236 237 238 239	212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241	- + + + + - - + + - - - - + +	(test-records (make-hash-table)) ;; need to process runconfigs before generating these lists (all-tests-registry #f) ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names (all-test-names #f) ;; (hash-table-keys all-tests-registry)) (test-names #f) ;; (tests:filter-test-names all-test-names test-patts)) (required-tests #f) ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work (task-key (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id))) ~~(t~~asks-~~db (tasks:open-db)))~~ (tdbdat (tasks:open-db))) (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) (set-signal-handler! signal/int (lambda (signum) (signal-mask! signum) (print "Received signal " signum ", cleaning up before exit. Please wait...") ~~(let ((tdb (tasks:open-db))) (tasks:set-state-given-param-key ~~tdb~~ task-key "killed")~~ (let ((tdbdat (tasks:open-db))) (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "killed")) ~~;; (sqlite3:interrupt! tdb) ;; seems silly?~~ ~~(sqlite3:finalize! tdb))~~ (print "Killed by signal " signum ". 
Exiting") (exit))) ;; register this run in monitor.db ~~(tasks:add ~~tasks-~~db "run-tests" user target runname test-patts task-key) ;; params) (tasks:set-state-given-param-key ~~tasks-~~db task-key "running")~~ (tasks:add (db:delay-if-busy tdbdat) "run-tests" user target runname test-patts task-key) ;; params) (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "running") (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process (if (file-exists? runconfigf) (setup-env-defaults runconfigf run-id already-seen-runconfig-info keyvals target) (debug:print 0 "WARNING: You do not have a run config file: " runconfigf)) ;; Now generate all the tests lists (set! all-tests-registry (tests:get-all))
︙
390 391 392 393 394 395 396 ~~397 398~~ 399 400 401 402 403 404 405	392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408	- - + + +	(if (not (hash-table-ref/default flags "-preclean" #f)) (hash-table-set! flags "-preclean" #t)) (if (not (hash-table-ref/default flags "-rerun" #f)) (hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS")) (runs:run-tests target runname test-patts user flags run-count: (- run-count 1))))) (debug:print-info 0 "No tests to run"))) (debug:print-info 4 "All done by here") ~~(tasks:set-state-given-param-key ~~tasks-~~db task-key "done") (sqlite3:finalize! tasks-db)))~~ (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "done") ;; (sqlite3:finalize! tasks-db) )) ;; loop logic. These are used in runs:run-tests-queue to make it a bit more readable. ;; ;; If reg not full and have items in tal then loop with (car tal)(cdr tal) reg reruns ;; If reg is full (i.e. length >= n ;; loop with (car reg) tal (cdr reg) reruns
︙
549 550 551 552 553 554 555 ~~556~~ 557 558 559 560 561 562 563 564 565 566	552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571	- + + +	(runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull) reruns))) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) ((and (null? fails) (null? prereq-fails) (null? non-completed)) ~~(if (runs:can-keep-running? hed 5)~~ (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) (debug:print-info 1 "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default seen-cant-run-tests hed 0)) ;; getting here likely means the system is way overloaded, kill a full minute before continuing (thread-sleep! 60) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))
︙
667 668 669 670 671 672 673 ~~674 675 676 677~~ ~~678 679 680~~ ~~681 682 683 684 685~~ ~~686 687~~ ~~688 689 690~~ 691 692 693 694 695 696 697	672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699	- - - - + + + + + + - - - + + + - - - - - + + + + - - + - - -	reruns) #f)) ;; Register tests ;; ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) ~~(if #t~~ ;; always do firm registration now in v1.60 and greater ;; (eq? transport-type 'fs) ;; no point in parallel registration if use fs (~~begin~~ (rmt:general-call 'register-test run-id run-id test-name item-path) (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)) ;; always do firm registration now in v1.60 and greater ;; (eq? transport-type 'fs) ;; no point in parallel registration if use fs (let register-loop ((numtries 15)) (rmt:general-call 'register-test run-id run-id test-name item-path) (thread-sleep! 0.5) (if (rmt:get-test-id run-id test-name item-path) (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done) ~~~~(let ((th (make-thread (lambda ()~~ ~~(mutex-lock!~~ registry-~~mutex~~) ~~(hash-tab~~le~~-set! test-~~registry (runs:make-full-test-name test-name item-path) ~~'start~~)~~ (if (> numtries 0) (register-loop (- numtries 1)) (debug:print 0 "ERROR: failed to register test " (runs:make-full-test-name test-name item-path))))) ~~(mutex-unlock! registry-mutex)~~ ~~;; If haven't done it before register a top level test if this is an itemized test~~ (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done)) (rmt:general-call 'register-test run-id run-id test-name "")) (rmt:ge~~neral-call 'register~~-test ~~run~~-id run-id test-name ~~item-path~~) (if (not (eq? 
(hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done)) (begin (rmt:general-call 'register-test run-id run-id test-name "") (if (rmt:get-test-id run-id test-name "") ~~~~(mutex-lock! registry-mutex)~~ (hash-table-set! test-registry (runs:make-full-test-name test-name ~~item-path~~) 'done)~~ (hash-table-set! test-registry (runs:make-full-test-name test-name "") 'done)))) ~~(mutex-unlock! registry-mutex))~~ ~~(conc test-name "/" item-path))))~~ ~~(thread-start! th)))~~ (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) (if (and (null? tal)(null? reg)) (list hed tal (append reg (list hed)) reruns) (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) ;; NB// Here we are building reg as we register tests ;; if regfull we must pop the front item off reg
︙
888 889 890 891 892 893 894 ~~895~~ 896 897 898 899 900 901 902	890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905	- + +	(max-retries (config-lookup configdat "setup" "maxretries")) (max-concurrent-jobs (let ((mcj (config-lookup configdat "setup" "max_concurrent_jobs"))) (if (and mcj (string->number mcj)) (string->number mcj) 1))) ;; length of the register queue ahead (reglen (if (number? reglen-in) reglen-in 1)) (last-time-incomplete (- (current-seconds) 900)) ;; force at least one clean up cycle ~~(last-time-some-running (current-seconds)))~~ (last-time-some-running (current-seconds)) (tdbdat (tasks:open-db))) ;; Initialize the test-registery hash with tests that already have a record ;; convert state to symbol and use that as the hash value (for-each (lambda (trec) (let ((id (db:test-get-id trec)) (tn (db:test-get-testname trec)) (ip (db:test-get-item-path trec))
︙
936 937 938 939 940 941 942 ~~943~~ 944 945 946 947 948 949 950	939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958	+ + + + + - +	(items (tests:testqueue-get-items test-record)) (item-path (item-list->path itemdat)) (tfullname (runs:make-full-test-name test-name item-path)) (newtal (append tal (list hed))) (regfull (>= (length reg) reglen)) (num-running (rmt:get-count-tests-running-for-run-id run-id))) ;; every couple minutes verify the server is there for this run (if (and (common:low-noise-print 60 "try start server" run-id) (tasks:need-server run-id)) (tasks:start-and-wait-for-server tdbdat run-id 10)) ~~(if (> num-running 0)~~ (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running 240)) (hash-table-set! max-tries-hash tfullname (+ (hash-table-ref/default max-tries-hash tfullname 0) 1))) ;; (debug:print 0 "max-tries-hash: " (hash-table->alist max-tries-hash)) ;; Ensure all top level tests get registered. This way they show up as "NOT_STARTED" on the dashboard
︙
1114 1115 1116 1117 1118 1119 1120 ~~1121~~ 1122 1123 1124 1125 1126 1127 1128	1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136	- +	(if (> (current-seconds)(+ last-time-incomplete 900)) (begin (debug:print-info 0 "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) (rmt:find-and-mark-incomplete run-id #f))) (if (not (eq? num-running prev-num-running)) (debug:print-info 0 "run-wait specified, waiting on " num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds))))) ~~(thread-sleep! 15)~~ (thread-sleep! 5) ;; (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! (debug:print-info 1 "All tests launched"))) (define (runs:calc-fails prereqs-not-met)
︙
1394 1395 1396 1397 1398 1399 1400 ~~1401~~ 1402 1403 1404 1405 1406 1407 1408	1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416	- +	;; 'set-state-status ;; ;; NB// should pass in keys? ;; (define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(remove-data-only #f)) (common:clear-caches) ;; clear all caches (let* ((db #f) ~~(t~~asks-~~db (tasks:open-db))~~ (tdbdat (tasks:open-db)) (keys (rmt:get-keys)) (rundat (mt:get-runs-by-patt keys runnamepatt target)) (header (vector-ref rundat 0)) (runs (vector-ref rundat 1)) (states (if state (string-split state ",") '())) (statuses (if status (string-split status ",") '())) (state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f))))
︙
1431 1432 1433 1434 1435 1436 1437 1438 1439 ~~1440~~ 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450	1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460	+ - + +	'())) (lasttpath "/does/not/exist/I/hope")) (debug:print-info 4 "runs:operate-on run=" run ", header=" header) (if (not (null? tests)) (begin (case action ((remove-runs) (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) ;; seek and kill in flight -runtests with % as testpatt here (if (equal? testpatt "%") ~~(tasks:kill-runner ~~tasks-~~db target run-name)~~ (tasks:kill-runner (db:delay-if-busy tdbdat) target run-name) (debug:print 0 "not attempting to kill any run launcher processes as testpatt is " testpatt)) (debug:print 1 "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((set-state-status) (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) (debug:print 1 "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((print-run) (debug:print 1 "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header) action) ((run-wait) (debug:print 1 "Waiting for run " runkey ", run=" runnamepatt " to complete")) (else
︙
1546 1547 1548 1549 1550 1551 1552 ~~1553~~ 1554 1555 1556 1557 1558 1559 1560	1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571	- + +	;; (if (null? (glob (conc runpath "/"))) ;; (begin ;; (debug:print 1 "Removing run dir " runpath) ;; (system (conc "rmdir -p " runpath)))) ))))) )) runs) ~~(sqlite3:finalize! ~~tasks-~~db))~~ ;; (sqlite3:finalize! (db:delay-if-busy tdbdat)) ) #t) (define (runs:remove-test-directory db test remove-data-only) (let ((run-dir (db:test-get-rundir test)) ;; run dir is from the link tree (real-dir (if (file-exists? run-dir) (resolve-pathname run-dir) #f)))
︙