Megatest: Diff

Differences From Artifact [2bddd948f9]:

File runs.scm — part of check-in [d5867f23a9] at 2013-11-24 22:41:28 on branch inmem-per-run-db — Progressing (user: matt, size: 73958) [annotate] [blame] [check-ins using]

To Artifact [8f583e8498]:

File runs.scm — part of check-in [84d0a58461] at 2013-11-26 21:53:39 on branch inmem-per-run-db — Inching along ... (user: matt, size: 73924) [annotate] [blame] [check-ins using]

︙
157 158 159 160 161 162 163 ~~164~~ 165 166 167 ~~168 169~~ 170 171 172 173 174 175 176	157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176	- + - - + +	(currtime (current-seconds))) (if (> (- currtime lasttime) waitval) (begin (hash-table-set! runs:denoise key currtime) #t) #f))) ~~(define (runs:can-run-more-tests jobgroup max-concurrent-jobs)~~ (define (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) (thread-sleep! (cond ((> runs:can-run-more-tests-count 20) 2);; obviously haven't had any work to do for a while (else 0))) ~~(let* ((num-running (rmt:get-count-tests-running)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup jobgroup))~~ (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (config-lookup configdat "jobgroups" jobgroup))) (if (> (+ num-running num-running-in-jobgroup) 0) (set! runs:can-run-more-tests-count (+ runs:can-run-more-tests-count 1))) (if (not (eq? last-num-running-tests num-running)) (begin (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running) (set! last-num-running-tests num-running)))
︙
375 376 377 378 379 380 381 ~~382~~ 383 384 385 386 387 388 389	375 376 377 378 379 380 381 382 383 384 385 386 387 388 389	- +	'() reg))) (define runs:nothing-left-in-queue-count 0) (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records) (let* ((loop-list (list hed tal reg reruns)) ~~(prereqs-not-met (rmt:get-prereqs-not-met run-id waitons item-path ~~mode: testmode)) ;; (mt:lazy-get-prereqs-not-met run-id waitons item-path mode:~~ testmode))~~ (prereqs-not-met (rmt:get-prereqs-not-met run-id waitons item-path testmode)) (fails (runs:calc-fails prereqs-not-met)) (non-completed (runs:calc-not-completed prereqs-not-met))) (debug:print-info 4 "START OF INNER COND #2 " "\n can-run-more: " can-run-more "\n testname: " hed "\n prereqs-not-met: " (runs:pretty-string prereqs-not-met) "\n non-completed: " (runs:pretty-string non-completed)
︙
574 575 576 577 578 579 580 ~~581~~ 582 583 584 585 586 ~~587~~ 588 589 590 591 592 593 594	574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594	- + - +	((string? t) t) (else (conc t)))) inlst)) (define (runs:process-expanded-tests hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode test-patts required-tests test-registry registry-mutex flags keyvals run-info newtal all-tests-registry) ~~(let* ((run-limits-info (runs:can-run-more-tests jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running~~ (let* ((run-limits-info (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (have-resources (car run-limits-info)) (num-running (list-ref run-limits-info 1)) (num-running-in-jobgroup (list-ref run-limits-info 2)) (max-concurrent-jobs (list-ref run-limits-info 3)) (job-group-limit (list-ref run-limits-info 4)) ~~(prereqs-not-met (rmt:get-prereqs-not-met run-id waitons item-path ~~mode: testmode)) ;; (mt:lazy-get-prereqs-not-met run-id waitons item-path mode:~~ testmode))~~ (prereqs-not-met (rmt:get-prereqs-not-met run-id waitons item-path testmode)) (fails (runs:calc-fails prereqs-not-met)) (non-completed (runs:calc-not-completed prereqs-not-met)) (loop-list (list hed tal reg reruns))) (debug:print-info 4 "have-resources: " have-resources " prereqs-not-met: (" (string-intersperse (map (lambda (t) (if (vector? t)
︙
620 621 622 623 624 625 626 ~~627~~ 628 629 630 631 632 633 634 ~~635 636~~ 637 638 639 640 641 642 643	620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643	- + - - + +	;; Register tests ;; ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) (if (eq? transport-type 'fs) ;; no point in parallel registration if use fs (begin ~~(rmt:general-call 'register-test run-id test-name item-path)~~ (rmt:general-call 'register-test run-id run-id test-name item-path) (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)) (let ((th (make-thread (lambda () (mutex-lock! registry-mutex) (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'start) (mutex-unlock! registry-mutex) ;; If haven't done it before register a top level test if this is an itemized test (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done)) ~~(rmt:general-call 'register-test run-id test-name "")) (rmt:general-call 'register-test run-id test-name item-path)~~ (rmt:general-call 'register-test run-id run-id test-name "")) (rmt:general-call 'register-test run-id run-id test-name item-path) (mutex-lock! registry-mutex) (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done) (mutex-unlock! registry-mutex)) (conc test-name "/" item-path)))) (thread-start! th))) (runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?) (if (and (null? tal)(null? reg))
︙
812 813 814 815 816 817 818 ~~819~~ 820 821 822 823 824 825 826	812 813 814 815 816 817 818 819 820 821 822 823 824 825 826	- +	(hash-table-set! max-tries-hash tfullname (+ (hash-table-ref/default max-tries-hash tfullname 0) 1))) ;; (debug:print 0 "max-tries-hash: " (hash-table->alist max-tries-hash)) ;; Ensure all top level tests get registered. This way they show up as "NOT_STARTED" on the dashboard ;; and it is clear they should have run but did not. (if (not (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f)) (begin ~~(rmt:general-call 'register-test run-id test-name "")~~ (rmt:general-call 'register-test run-id run-id test-name "") (hash-table-set! test-registry (runs:make-full-test-name test-name "") 'done))) ;; Fast skip of tests that are already "COMPLETED" - NO! Cannot do that as the items may not have been expanded yet :( ;; (if (member (hash-table-ref/default test-registry tfullname #f) '(DONOTRUN removed)) ;; common:cant-run-states-sym) ;; '(COMPLETED KILLED WAIVED UNKNOWN INCOMPLETE)) (begin
︙
928 929 930 931 932 933 934 ~~935~~ 936 937 938 939 940 941 942	928 929 930 931 932 933 934 935 936 937 938 939 940 941 942	- +	#f (loop (car tal)(cdr tal) reg reruns))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS ((or (procedure? items)(eq? items 'have-procedure)) ~~(let ((can-run-more (runs:can-run-more-tests jobgroup max-concurrent-jobs)))~~ (let ((can-run-more (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records))) (if loop-list (apply loop loop-list))) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns))))
︙
1035 1036 1037 1038 1039 1040 1041 ~~1042~~ 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 ~~1057~~ 1058 1059 ~~1060~~ 1061 1062 1063 1064 1065 1066 1067	1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067	- + - + - +	(begin (hash-table-set! test-meta-updated test-name #t) (runs:update-test_meta test-name test-conf))) ;; itemdat => ((ripeness "overripe") (temperature "cool") (season "summer")) (let* ((new-test-path (string-intersperse (cons test-path (map cadr itemdat)) "/")) (test-id (rmt:get-test-id run-id test-name item-path)) ~~(testdat (if test-id (rmt:get-test-info-by-id test-id) #f)))~~ (testdat (if test-id (rmt:get-test-info-by-id run-id test-id) #f))) (if (not testdat) (let loop () ;; ensure that the path exists before registering the test ;; NOPE: Cannot! Don't know yet which disk area will be assigned.... ;; (system (conc "mkdir -p " new-test-path)) ;; ;; (open-run-close tests:register-test db run-id test-name item-path) ;; ;; NB// for the above line. I want the test to be registered long before this routine gets called! ;; (if (not test-id)(set! test-id (rmt:get-test-id-cached run-id test-name item-path))) (if (not test-id) (begin (debug:print 2 "WARN: Test not pre-created? test-name=" test-name ", item-path=" item-path ", run-id=" run-id) ~~(rmt:general-call 'register-test run-id test-name item-path)~~ (rmt:general-call 'register-test run-id run-id test-name item-path) (set! test-id (rmt:get-test-id run-id test-name item-path)))) (debug:print-info 4 "test-id=" test-id ", run-id=" run-id ", test-name=" test-name ", item-path=\"" item-path "\"") ~~(set! testdat (rmt:get-test-info-by-id test-id))~~ (set! testdat (rmt:get-test-info-by-id run-id test-id)) (if (not testdat) (begin (debug:print-info 0 "WARNING: server is overloaded, trying again in one second") (thread-sleep! 1) (loop))))) (if (not testdat) ;; should NOT happen (debug:print 0 "ERROR: failed to get test record for test-id " test-id))
︙
1132 1133 1134 1135 1136 1137 1138 ~~1139~~ 1140 1141 1142 1143 1144 1145 1146	1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146	- +	(set! skip-test "Skipping due to previous tests running")))) ((and skip-check (configf:lookup test-conf "skip" "fileexists")) (if (file-exists? (configf:lookup test-conf "skip" "fileexists")) (set! skip-test (conc "Skipping due to existance of file " (configf:lookup test-conf "skip" "fileexists")))))) (if skip-test (begin ~~(mt:test-set-state-status-by-id test-id "COMPLETED" "SKIP" skip-test)~~ (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test) (debug:print-info 1 "SKIPPING Test " full-test-name " due to " skip-test)) (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) (begin (print "ERROR: Failed to launch the test. Exiting as soon as possible") (set! globalexitstatus 1) ;; (process-signal (current-process-id) signal/kill)))))))) ((KILLED)
︙
1258 1259 1260 1261 1262 1263 1264 ~~1265~~ 1266 1267 1268 1269 1270 1271 1272	1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272	- +	(> (string-length dira)(string-length dirb)) #f))))) (test-retry-time (make-hash-table)) (allow-run-time 10)) ;; seconds to allow for killing tests before just brutally killing 'em (let loop ((test (car sorted-tests)) (tal (cdr sorted-tests))) (let* ((test-id (db:test-get-id test)) ~~(new-test-dat (rmt:get-test-info-by-id test-id)))~~ (new-test-dat (rmt:get-test-info-by-id run-id test-id))) (if (not new-test-dat) (begin (debug:print 0 "ERROR: We have a test-id of " test-id " but no record was found. NOTE: No locking of records is done between processes, do not simultaneously remove the same run from two processes!") (if (not (null? tal)) (loop (car tal)(cdr tal)))) (let* ((item-path (db:test-get-item-path new-test-dat)) (test-name (db:test-get-testname new-test-dat))
︙
1287 1288 1289 1290 1291 1292 1293 ~~1294~~ 1295 1296 ~~1297~~ 1298 1299 1300 1301 1302 1303 ~~1304~~ 1305 1306 1307 1308 1309 1310 1311	1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311	- + - + - +	(hash-table-set! test-retry-time test-fulln (current-seconds)))) (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time) ;; This test is not in a correct state for cleaning up. Let's try some graceful shutdown steps first ;; Set the test to "KILLREQ" and wait five seconds then try again. Repeat up to five times then give ;; up and blow it away. (begin (debug:print 0 "WARNING: could not gracefully remove test " test-fulln ", tried to kill it to no avail. Forcing state to FAILEDKILL and continuing") ~~(mt:test-set-state-status-by-id (db:test-get-id test) "FAILEDKILL" "n/a" #f)~~ (mt:test-set-state-status-by-id run-id (db:test-get-id test) "FAILEDKILL" "n/a" #f) (thread-sleep! 1)) (begin ~~(mt:test-set-state-status-by-id (db:test-get-id test) "KILLREQ" "n/a" #f)~~ (mt:test-set-state-status-by-id run-id (db:test-get-id test) "KILLREQ" "n/a" #f) (thread-sleep! 1))) ;; NOTE: This is suboptimal as the testdata will be used later and the state/status may have changed ... (if (null? tal) (loop new-test-dat tal) (loop (car tal)(append tal (list new-test-dat))))) (begin ~~(mt:test-set-state-status-by-id (db:test-get-id test) "REMOVING" "LOCKED" #f)~~ (mt:test-set-state-status-by-id run-id (db:test-get-id test) "REMOVING" "LOCKED" #f) (debug:print-info 1 "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) (if (and real-dir (> (string-length real-dir) 5) (file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc. (begin ;; let* ((realpath (resolve-pathname run-dir))) (debug:print-info 1 "Recursively removing " real-dir) (if (file-exists? real-dir)
︙
1334 1335 1336 1337 1338 1339 1340 ~~1341~~ 1342 1343 1344 1345 1346 1347 1348	1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348	- +	)) ;; Only delete the records after removing the directory. If things fail we have a record (rmt:delete-test-records (db:test-get-id test)) (if (not (null? tal)) (loop (car tal)(cdr tal)))))) ((set-state-status) (debug:print-info 2 "new state " (car state-status) ", new status " (cadr state-status)) ~~(mt:test-set-state-status-by-id (db:test-get-id test) (car state-status)(cadr state-status) #f)~~ (mt:test-set-state-status-by-id run-id (db:test-get-id test) (car state-status)(cadr state-status) #f) (if (not (null? tal)) (loop (car tal)(cdr tal)))) ((run-wait) (debug:print-info 2 "still waiting, " (length tests) " tests still running") (thread-sleep! 10) (let ((new-tests (proc-get-tests run-id))) (if (null? new-tests)
︙