Megatest: Diff

Differences From Artifact [fad1b4b96b]:

File runs.scm — part of check-in [dde2e2b857] at 2020-08-19 23:53:00 on branch v1.65 — Can't use stmt caching where string compiled changes with every query. Fixed wrong presevation of last-update for clean printing of pass/fail etc. (user: mrwellan, size: 157168) [annotate] [blame] [check-ins using]

To Artifact [951f97a94b]:

File runs.scm — part of check-in [54d01a66a4] at 2020-10-04 02:13:51 on branch v1.65-defunct — effed (user: mrwellan, size: 160747) [annotate] [blame] [check-ins using]

︙			︙
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28	;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;; strftime('%m/%d/%Y %H:%M:%S','now','localtime') (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) ~~posix-extras directory-utils pathname-expand typed-records format sxml-serializer ~~sxml-modifications)~~~~ (declare (unit runs)) (declare (uses db)) (declare (uses common)) (declare (uses items)) (declare (uses runconfig)) (declare (uses tests))	\| >	14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29	;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;; strftime('%m/%d/%Y %H:%M:%S','now','localtime') (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) posix-extras directory-utils pathname-expand typed-records format sxml-serializer sxml-modifications matchable) (declare (unit runs)) (declare (uses db)) (declare (uses common)) (declare (uses items)) (declare (uses runconfig)) (declare (uses tests))
︙			︙
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66	;; (defstruct runs:dat reglen regfull runname max-concurrent-jobs run-id test-patts required-tests test-registry registry-mutex flags keyvals run-info all-tests-registry ~~can-run-more-tests~~ ((can-run-more-tests-count 0) : fixnum) (last-fuel-check 0) ;; time when we last checked fuel (beginning-of-time (current-seconds)) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup ~~waitons testmode newtal ~~itemmaps prereqs-not-met)~~~~ ;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files ;; - remove any that are over 3600 seconds old ;; - if there are any that are younger than 10 seconds ;; * sleep 10 seconds ;; * touch my key-host-pid.softlock file ;; * return	\| > > > > > \| > > > >	44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76	;; (defstruct runs:dat reglen regfull runname max-concurrent-jobs run-id test-patts required-tests test-registry registry-mutex flags keyvals run-info all-tests-registry ;; stores results from last runs:can-run-more-tests (can-run-more-tests #f) ;; (list can-run-more-flag num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit) ((can-run-more-tests-count 0) : fixnum) (last-fuel-check 0) ;; time when we last checked fuel (beginning-of-time (current-seconds)) (load-mgmt-function #f) (wait-for-jobs-function #f) (last-load-check-time 0) (last-jobs-check-time 0) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup waitons testmode newtal itemmaps (prereqs-not-met #f) (last-update 0) ;; ) ;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files ;; - remove any that are over 3600 seconds old ;; - if there are any that are younger than 10 seconds ;; * sleep 10 seconds ;; * touch my key-host-pid.softlock file ;; * return
︙			︙
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 ~~107 108~~ 109 110 111 112 113 114 115	((< age 10) #t) (else #f)))) lock-files))) (if fresh-locks (begin (if (runs:lownoise "runners-softlock-wait" 360) (debug:print-info 0 default-log-port "Other runners in flight, giving up some time...")) ~~(thread-sleep! 10))~~ (begin (if (runs:lownoise "runners-softlock-nowait" 360) (debug:print-info 0 default-log-port "No runners in flight, updating softlock")) (let* ((ouf (open-output-file my-lock-file))) (with-output-to-port ouf (lambda ()(print (current-seconds)))) (close-output-port ouf)))) (runs:dat-last-fuel-check-set! rdat (current-seconds)))))) ;; Fourth try, do accounting through time ;; (define (runs:parallel-runners-mgmt rdat) ~~(let ((time-to-check 10) ;; 28 (time-to-wait 30)~~ (now-time (current-seconds))) (if (> (- now-time (runs:dat-last-fuel-check rdat)) time-to-check) ;; time to check (runs:wait-on-softlock rdat "runners")))) ;; To test parallel-runners management start a repl: ;; megatest -repl ;; then run:	\| \| \|	97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125	((< age 10) #t) (else #f)))) lock-files))) (if fresh-locks (begin (if (runs:lownoise "runners-softlock-wait" 360) (debug:print-info 0 default-log-port "Other runners in flight, giving up some time...")) (thread-sleep! 2)) (begin (if (runs:lownoise "runners-softlock-nowait" 360) (debug:print-info 0 default-log-port "No runners in flight, updating softlock")) (let* ((ouf (open-output-file my-lock-file))) (with-output-to-port ouf (lambda ()(print (current-seconds)))) (close-output-port ouf)))) (runs:dat-last-fuel-check-set! rdat (current-seconds)))))) ;; Fourth try, do accounting through time ;; (define (runs:parallel-runners-mgmt rdat) (let ((time-to-check (configf:lookup-number configdat "runners" "time-to-check" default: 10)) ;; 28 (time-to-wait (configf:lookup-number configdat "runners" "time-to-wait" default: 30)) (now-time (current-seconds))) (if (> (- now-time (runs:dat-last-fuel-check rdat)) time-to-check) ;; time to check (runs:wait-on-softlock rdat "runners")))) ;; To test parallel-runners management start a repl: ;; megatest -repl ;; then run:
︙			︙
211 212 213 214 215 216 217 ~~218~~ 219 220 221 ~~222~~ 223 224 225 226 227 228 229	(let fatal-loop ((count 0)) (handle-exceptions exn (let ((call-chain (get-call-chain)) (msg ((condition-property-accessor 'exn 'message) exn))) (if (< count 5) (begin ;; this call is colliding, do some crude stuff to fix it. ~~(debug:print 0 default-log-port "ERROR: configdat was inaccessible! This should never happen. Retry #" count)~~ (launch:setup force-reread: #t) (fatal-loop (+ count 1))) (begin ~~(debug:print 0 default-log-port "FATAL: configdat was inaccessible! This should never happen. Retried " count ~~" times. Message: " msg)~~~~ (debug:print 0 default-log-port "Call chain:") (with-output-to-port default-log-port (lambda () (print "configdat is >>"configdat"<<") (pp configdat) (pp call-chain)))	\| > \| >	221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241	(let fatal-loop ((count 0)) (handle-exceptions exn (let ((call-chain (get-call-chain)) (msg ((condition-property-accessor 'exn 'message) exn))) (if (< count 5) (begin ;; this call is colliding, do some crude stuff to fix it. (debug:print 0 default-log-port "ERROR: configdat was inaccessible! This should never happen. Retry #" count ", exn=" exn) (launch:setup force-reread: #t) (fatal-loop (+ count 1))) (begin (debug:print 0 default-log-port "FATAL: configdat was inaccessible! This should never happen. Retried " count " times. Message: " msg) (debug:print 0 default-log-port "Call chain:") (with-output-to-port default-log-port (lambda () (print "configdat is >>"configdat"<<") (pp configdat) (pp call-chain)))
︙			︙
309 310 311 312 313 314 315 ~~316 317 318 319 320 321 322 323 324 325 326 327 328~~ 329 330 331 332 333 334 335	(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) ~~(thread-sleep! (cond ;; BB: check with Matt. Should this sleep move~~ ~~;; to cond clauses below where we determine we~~ ~~;; have too many jobs running rather than each~~ ~~;; time the and condition above is true (which~~ ~~;; seems like always)?~~ ~~((>~~ (runs:dat-~~can-r~~un~~-more-tests-count~~ runsdat) ~~20) ;; o~~r~~igi~~n~~al intent was - save cycles, wait a~~ lo~~ng time~~ ~~(if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 default-log-port "waiting for tasks to complete, sleeping briefly ..."))~~ ~~10) ;; obviously haven't had any work to do for a while~~ ~~(else 0)))~~ ~~;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?~~ ~~;; (configf:lookup-number configdat "setup" "inter-test-delay" default: 0.01)~~ ~~;; )))~~ (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup configdat "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0)	< < < < < \| < < < < < < \|	321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336	(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup configdat "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0)
︙			︙
376 377 378 379 380 381 382 ~~383~~ 384 385 386 387 388 389 390 391 392 393 ~~394~~ 395 396 397 398 399 400 401	(full-log-fname (conc log-dir "/" log-file))) (if run-pre-hook (if (null? existing-tests) (let* ((use-log-dir (if (not (directory-exists? log-dir)) (handle-exceptions exn (begin ~~(debug:print 0 default-log-port "WARNING: Failed to create " log-dir)~~ #f) (create-directory log-dir #t) #t) #t)) (start-time (current-seconds)) (actual-logf (if use-log-dir full-log-fname log-file))) (handle-exceptions exn (begin (print-call-chain default-log-port) ~~(debug:print 0 default-log-port "Message: " ((condition-property-accessor 'exn 'message) exn))~~ (debug:print 0 default-log-port "ERROR: failed to run pre-hook " run-pre-hook ", check the log " log-file)) (debug:print-info 0 default-log-port "running run-pre-hook: \"" run-pre-hook "\", log is " actual-logf) (system (conc run-pre-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 default-log-port "pre-hook \"" run-pre-hook "\" took " (- (current-seconds) start-time) " seconds to run."))) (debug:print 0 default-log-port "Skipping pre-hook call \"" run-pre-hook "\" as there are existing tests for this run."))))) (define (runs:run-post-hook run-id)	\| \|	377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402	(full-log-fname (conc log-dir "/" log-file))) (if run-pre-hook (if (null? existing-tests) (let* ((use-log-dir (if (not (directory-exists? log-dir)) (handle-exceptions exn (begin (debug:print 0 default-log-port "WARNING: Failed to create " log-dir ", exn=" exn) #f) (create-directory log-dir #t) #t) #t)) (start-time (current-seconds)) (actual-logf (if use-log-dir full-log-fname log-file))) (handle-exceptions exn (begin (print-call-chain default-log-port) (debug:print 0 default-log-port "Message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (debug:print 0 default-log-port "ERROR: failed to run pre-hook " run-pre-hook ", check the log " log-file)) (debug:print-info 0 default-log-port "running run-pre-hook: \"" run-pre-hook "\", log is " actual-logf) (system (conc run-pre-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 default-log-port "pre-hook \"" run-pre-hook "\" took " (- (current-seconds) start-time) " seconds to run."))) (debug:print 0 default-log-port "Skipping pre-hook call \"" run-pre-hook "\" as there are existing tests for this run."))))) (define (runs:run-post-hook run-id)
︙			︙
416 417 418 419 420 421 422 ~~423~~ 424 425 426 427 428 429 430 431 432 433 ~~434~~ 435 436 437 438 439 440 441	(if run-post-hook ;; (if (null? existing-tests) ;; (debug:print 0 default-log-port "Skipping post-hook call \"" run-post-hook "\" as there are existing tests for this run."))))) (let* ((use-log-dir (if (not (directory-exists? log-dir)) (handle-exceptions exn (begin ~~(debug:print 0 default-log-port "WARNING: Failed to create " log-dir)~~ #f) (create-directory log-dir #t) #t) #t)) (start-time (current-seconds)) (actual-logf (if use-log-dir full-log-fname log-file))) (handle-exceptions exn (begin (print-call-chain default-log-port) ~~(debug:print 0 default-log-port "Message: " ((condition-property-accessor 'exn 'message) exn))~~ (debug:print 0 default-log-port "ERROR: failed to run post-hook " run-post-hook ", check the log " log-file)) (debug:print-info 0 default-log-port "running run-post-hook: \"" run-post-hook "\", log is " actual-logf) (system (conc run-post-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 default-log-port "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run.")))))) ;; return #t when all items in waitors-upon list are represented in test-patt, #f otherwise. (define (runs:testpatts-mention-waitors-upon? test-patt waitors-upon)	\| \|	417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442	(if run-post-hook ;; (if (null? existing-tests) ;; (debug:print 0 default-log-port "Skipping post-hook call \"" run-post-hook "\" as there are existing tests for this run."))))) (let* ((use-log-dir (if (not (directory-exists? log-dir)) (handle-exceptions exn (begin (debug:print 0 default-log-port "WARNING: Failed to create " log-dir ", exn=" exn) #f) (create-directory log-dir #t) #t) #t)) (start-time (current-seconds)) (actual-logf (if use-log-dir full-log-fname log-file))) (handle-exceptions exn (begin (print-call-chain default-log-port) (debug:print 0 default-log-port "Message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (debug:print 0 default-log-port "ERROR: failed to run post-hook " run-post-hook ", check the log " log-file)) (debug:print-info 0 default-log-port "running run-post-hook: \"" run-post-hook "\", log is " actual-logf) (system (conc run-post-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 default-log-port "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run.")))))) ;; return #t when all items in waitors-upon list are represented in test-patt, #f otherwise. (define (runs:testpatts-mention-waitors-upon? test-patt waitors-upon)
︙			︙
533 534 535 536 537 538 539 ~~540~~ 541 542 543 544 545 546 547	(debug:print-info 0 default-log-port "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests) ));; tests will be ANDed with this list ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") ~~(common:telemetry-log "run-tests"~~ payload: `( (target . ,target) (run-name . ,runname) (test-patts . ,test-patts) ) ) ;; Now generate all the tests lists	\|	534 535 536 537 538 539 540 541 542 543 544 545 546 547 548	(debug:print-info 0 default-log-port "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests) ));; tests will be ANDed with this list ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") #;(common:telemetry-log "run-tests" payload: `( (target . ,target) (run-name . ,runname) (test-patts . ,test-patts) ) ) ;; Now generate all the tests lists
︙			︙
747 748 749 750 751 752 753 ~~754~~ 755 756 757 758 759 760 761	(th2 (make-thread (lambda () ;; BBQ: why are we visiting ALL runs here? ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions exn ~~(debug:print 0 default-log-port "error in calling find-and-mark-incomplete for run-id " run-id)~~ (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) ;; (thread-start! th1) (thread-start! th2) ;; (thread-join! th1) ;; just do the main stuff in the main thread	\|	748 749 750 751 752 753 754 755 756 757 758 759 760 761 762	(th2 (make-thread (lambda () ;; BBQ: why are we visiting ALL runs here? ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions exn (debug:print 0 default-log-port "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn) (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) ;; (thread-start! th1) (thread-start! th2) ;; (thread-join! th1) ;; just do the main stuff in the main thread
︙			︙
829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 ~~846~~ 847 ~~848 849 850 851 852 853 854 855 856 857~~ 858 859 860 861 862 863 864	;; => sometimes need to squeeze things in (added to reg) ;; => review of a previously seen test is higher priority of never visited test ;; reg - list of previously visited tests ;; tal - list of never visited tests ;; prefer next hed to be from reg than tal. (define runs:nothing-left-in-queue-count 0) ;;====================================================================== ;; runs:expand-items is called by runs:run-tests-queue ;;====================================================================== ;; ;; return value of runs:expand-items is passed back to runs-tests-queue and is fed to named loop with this signature: ;; (let loop ((hed (car sorted-test-names)) ;; (tal (cdr sorted-test-names)) ;; (reg '()) ;; registered, put these at the head of tal ;; (reruns '())) (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs ~~run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps)~~ (let* ((loop-list (list hed tal reg reruns)) ~~(prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path~~ mode: testmode itemmaps: itemmaps))) ~~(if (list? res)~~ ~~res~~ ~~(begin~~ (debug:print 0 default-log-port ~~"ERROR: rmt:get-prereqs-not-met returned non-list!\n"~~ ~~" res=" res " run-id=" run-id " waitons=" waitons " hed=" hed " item-path=" item-path " testmode=" testmode " itemmaps=" itemmaps)~~ ~~'()))))~~ (have-itemized (not (null? (lset-intersection eq? testmode '(itemmatch itemwait))))) ~~;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))~~ (fails (runs:calc-fails prereqs-not-met)) (prereq-fails (runs:calc-prereq-fail prereqs-not-met)) (non-completed (runs:calc-not-completed prereqs-not-met)) (runnables (runs:calc-runnable prereqs-not-met)) (unexpanded-prereqs (filter (lambda (testname) (let* ((test-rec (hash-table-ref test-records testname))	> > > > > > > > > > > > > > \| > > > > \| < < < < < < < \| <	830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875	;; => sometimes need to squeeze things in (added to reg) ;; => review of a previously seen test is higher priority of never visited test ;; reg - list of previously visited tests ;; tal - list of never visited tests ;; prefer next hed to be from reg than tal. (define runs:nothing-left-in-queue-count 0) ;; cache the result of get-prereqs-not-met and don't call it if called in past 10 seconds ;; NOTE: This is assuming that testdat is highly specific to this test ;; (define (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path #!key (mode '(normal))(itemmaps #f)) ;; mode: testmode itemmaps: itemmaps) (if (and (runs:testdat-prereqs-not-met testdat) (< (- (current-seconds) (runs:testdat-last-update testdat)) 10)) ;;; only refresh for this test if ;;; it has been at least 10 seconds (runs:testdat-prereqs-not-met testdat) ;; return the cached result (let* ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: mode itemmaps: itemmaps))) (runs:testdat-prereqs-not-met-set! testdat res) (runs:testdat-last-update-set! testdat (current-seconds)) res))) ;;====================================================================== ;; runs:expand-items is called by runs:run-tests-queue ;;====================================================================== ;; ;; return value of runs:expand-items is passed back to runs-tests-queue and is fed to named loop with this signature: ;; (let loop ((hed (car sorted-test-names)) ;; (tal (cdr sorted-test-names)) ;; (reg '()) ;; registered, put these at the head of tal ;; (reruns '())) (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps testdat) (let* ((loop-list (list hed tal reg reruns)) (prereqs-not-met (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) (have-itemized (not (null? (lset-intersection eq? testmode '(itemmatch itemwait))))) (fails (runs:calc-fails prereqs-not-met)) (prereq-fails (runs:calc-prereq-fail prereqs-not-met)) (non-completed (runs:calc-not-completed prereqs-not-met)) (runnables (runs:calc-runnable prereqs-not-met)) (unexpanded-prereqs (filter (lambda (testname) (let* ((test-rec (hash-table-ref test-records testname))
︙			︙
985 986 987 988 989 990 991 ~~992~~ 993 ~~994~~ 995 996 997 998 999 1000 1001	((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider (null? prereq-fails) (null? non-completed)) (debug:print-info 4 default-log-port "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) ~~(debug:print-info 0 default-log-port "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default seen-cant-run-tests hed 0)) ;;~~ ;; getting here likely means the system is way overloaded, kill a full minute before continuing ~~(thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)~~ ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 default-log-port "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))	\| \| > > > >	996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016	((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider (null? prereq-fails) (null? non-completed)) (debug:print-info 4 default-log-port "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) (debug:print-info 0 default-log-port "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default seen-cant-run-tests hed 0) ", going to wait 60 sec.") ;; ;; getting here likely means the system is way overloaded, kill a full minute before continuing ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!! ;; No runsdat, can't do this yet ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) ;; (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 default-log-port "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))
︙			︙
1117 1118 1119 1120 1121 1122 1123 ~~1124~~ 1125 1126 1127 1128 1129 1130 1131	(if (vector? t) (conc (db:test-get-state t) "/" (db:test-get-status t)) (conc " WARNING: t is not a vector=" t ))) prereqs-not-met) ", ") ") fails: " fails "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) (if (and (not (null? prereqs-not-met)) (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 2 default-log-port "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) ;; Don't know at this time if the test have been launched at some time in the past ;; i.e. is this a re-launch?	> > > > > > > > > > > > \| > > > > > >	1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164	(if (vector? t) (conc (db:test-get-state t) "/" (db:test-get-status t)) (conc " WARNING: t is not a vector=" t ))) prereqs-not-met) ", ") ") fails: " fails "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) ;; well, first lets see if cpu load throttling is enabled. If so wait around until the ;; average cpu load is under the threshold before continuing ;; (if (runs:dat-load-mgmt-function runsdat) ((runs:dat-load-mgmt-function runsdat)) (runs:dat-load-mgmt-function-set! runsdat (lambda () ;; jobtools maxload is useful for where the full Megatest run is done on one machine (if (and (not (common:on-homehost?)) maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues (if maxhomehostload (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))))) (if (and (not (null? prereqs-not-met)) (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 2 default-log-port "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) ;; Don't know at this time if the test have been launched at some time in the past ;; i.e. is this a re-launch?
︙			︙
1190 1191 1192 1193 1194 1195 1196 ~~1197~~ 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 ~~1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224~~ 1225 1226 1227 1228 1229 ~~1230~~ 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 ~~1251~~ 1252 1253 1254 1255 ~~1256 1257 1258 1259~~ 1260 1261 1262 1263 1264 1265 1266	;; If no resources are available just kill time and loop again ;; ((not have-resources) ;; simply try again after waiting a second (if (runs:lownoise "no resources" 60) (debug:print-info 1 default-log-port "no resources to run new tests, waiting ...")) ;; Have gone back and forth on this but db starvation is an issue. ;; wait one second before looking again to run jobs. ~~(thread-sleep! 1)~~ ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (list (car newtal)(cdr newtal) reg reruns)) ;; This is the final stage, everything is in place so launch the test ;; ((and have-resources (or (null? prereqs-not-met) (and (member 'toplevel testmode) ;; 'toplevel) (null? non-completed) (not (member 'exclusive testmode))))) ;; (hash-table-delete! max-tries-hash (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! max-tries-hash (make-hash-table)) ~~;; well, first lets see if cpu load throttling is enabled. If so wait around until the~~ ~~;; average cpu load is under the threshold before continuing~~ ~~;; jobtools maxload is useful for where the full Megatest run is done on one machine~~ ~~(if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized~~ ~~(common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))~~ ~~;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues~~ ~~(if maxhomehostload~~ ~~(common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))~~ (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! global-delta) (if (or (not (null? tal))(not (null? reg))) ~~(runs:loop-values tal reg reglen regfull reruns)~~ #f)) ;; must be we have unmet prerequisites ;; (else (debug:print 4 default-log-port "FAILS: " fails) ;; If one or more of the prereqs-not-met are FAIL then we can issue ;; a message and drop hed from the items to be processed. ;; (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) (if (and (not (null? prereqs-not-met)) (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 1 default-log-port "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) (if (or (null? fails) (member 'toplevel testmode)) (begin ;; couldn't run, take a breather (if (runs:lownoise "Waiting for more work to do..." 60) (debug:print-info 0 default-log-port "Waiting for more work to do...")) ~~(thread-sleep! 1)~~ (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (begin (let ((my-test-id (rmt:get-test-id run-id test-name item-path))) ~~(mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2"))~~ (if (or (not (null? reg))(not (null? tal))) (if (vector? hed) (begin (debug:print 1 default-log-port "WARNING: Dropping test " test-name "/" item-path " from the launch list as it has prerequistes that are FAIL") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))	\| < < < < < < < < < < < \| \| > > \| \| < < <	1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287	;; If no resources are available just kill time and loop again ;; ((not have-resources) ;; simply try again after waiting a second (if (runs:lownoise "no resources" 60) (debug:print-info 1 default-log-port "no resources to run new tests, waiting ...")) ;; Have gone back and forth on this but db starvation is an issue. ;; wait one second before looking again to run jobs. (thread-sleep! 0.25) ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (list (car newtal)(cdr newtal) reg reruns)) ;; This is the final stage, everything is in place so launch the test ;; ((and have-resources (or (null? prereqs-not-met) (and (member 'toplevel testmode) ;; 'toplevel) (null? non-completed) (not (member 'exclusive testmode))))) ;; (hash-table-delete! max-tries-hash (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! max-tries-hash (make-hash-table)) (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! global-delta) (if (or (not (null? tal))(not (null? reg))) (runs:loop-values tal reg reglen regfull reruns) ;; hed should be dropped at this time #f)) ;; must be we have unmet prerequisites ;; (else (debug:print 4 default-log-port "FAILS: " fails) ;; If one or more of the prereqs-not-met are FAIL then we can issue ;; a message and drop hed from the items to be processed. ;; (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) (if (and (not (null? prereqs-not-met)) (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 1 default-log-port "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) (if (or (null? fails) (member 'toplevel testmode)) (begin ;; couldn't run, take a breather (if (runs:lownoise "Waiting for more work to do..." 60) (debug:print-info 0 default-log-port "Waiting for more work to do...")) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (begin (let ((my-test-id (rmt:get-test-id run-id test-name item-path))) (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2")) (if (or (not (null? reg))(not (null? tal))) (if (vector? hed) (begin (debug:print 1 default-log-port "WARNING: Dropping test " test-name "/" item-path " from the launch list as it has prerequistes that are FAIL") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))
︙			︙
1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445	)) ;; every time though the loop increment the test/itempatt val. ;; when the min is > max-allowed and none running then force exit ;; (define max-tries-hash (make-hash-table)) ;;====================================================================== ;; runs:run-tests-queue is called by runs:run-tests ;;====================================================================== ;; ;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... > (define (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests reglen-in all-tests-registry) ;; At this point the list of parent tests is expanded	> > >	1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469	)) ;; every time though the loop increment the test/itempatt val. ;; when the min is > max-allowed and none running then force exit ;; (define max-tries-hash (make-hash-table)) (define (runs:pretty-long-list lst) (if (> (length lst) 8)(append (take lst 3)(list "...")) lst)) ;;====================================================================== ;; runs:run-tests-queue is called by runs:run-tests ;;====================================================================== ;; ;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... > (define (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests reglen-in all-tests-registry) ;; At this point the list of parent tests is expanded
︙			︙
1571 1572 1573 1574 1575 1576 1577 ~~1578 1579 1580 1581 1582 1583 1584 1585 1586~~ 1587 1588 1589 1590 1591 1592 1593	testmode: testmode newtal: newtal itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) ~~;; -- removed BB 17ww28 - no longer needed.~~ ~~;; every 15 minutes verify the server is there for this run~~ ~~;; (if (and (common:low-noise-print 240 "try start server" run-id)~~ ;; (not (or (and runremote ~~;; (remote-server-url runremote)~~ ~~;; (server:ping (remote-server-url runremote)))~~ ~~;; (server:check-if-running toppath))))~~ ~~;; (server:kind-run toppath))~~ (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup configdat "setup" "give-up-waiting") 36000))) (hash-table-set! max-tries-hash tfullname (+ (hash-table-ref/default max-tries-hash tfullname 0) 1))) ;; (debug:print 0 default-log-port "max-tries-hash: " (hash-table->alist max-tries-hash))	< < < < < < < < <	1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608	testmode: testmode newtal: newtal itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup configdat "setup" "give-up-waiting") 36000))) (hash-table-set! max-tries-hash tfullname (+ (hash-table-ref/default max-tries-hash tfullname 0) 1))) ;; (debug:print 0 default-log-port "max-tries-hash: " (hash-table->alist max-tries-hash))
︙			︙
1612 1613 1614 1615 1616 1617 1618 ~~1619~~ 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 ~~1633~~ 1634 1635 1636 1637 1638 1639 1640	reruns)))) ;; (loop (car tal)(cdr tal) reg reruns)))) (runs:incremental-print-results run-id) (debug:print 4 default-log-port "TOP OF LOOP => " "test-name: " test-name "\n hed: " hed ~~"\n tal: " ~~tal~~~~ "\n reg: " reg "\n test-record " test-record "\n itemdat: " itemdat "\n items: " items "\n item-path: " item-path "\n waitons: " waitons "\n num-retries: " num-retries "\n reruns: " reruns "\n regfull: " regfull "\n reglen: " reglen "\n length reg: " (length reg) ) ~~(runs:parallel-runners-mgmt runsdat)~~ ;; check for hed in waitons => this would be circular, remove it and issue an ;; error (if (member test-name waitons) (begin (debug:print-error 0 default-log-port "test " test-name " has listed itself as a waiton, please correct this!") (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))	\| \|	1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655	reruns)))) ;; (loop (car tal)(cdr tal) reg reruns)))) (runs:incremental-print-results run-id) (debug:print 4 default-log-port "TOP OF LOOP => " "test-name: " test-name "\n hed: " hed "\n tal: " (runs:pretty-long-list tal) "\n reg: " reg "\n test-record " test-record "\n itemdat: " itemdat "\n items: " items "\n item-path: " item-path "\n waitons: " waitons "\n num-retries: " num-retries "\n reruns: " reruns "\n regfull: " regfull "\n reglen: " reglen "\n length reg: " (length reg) ) ;; (runs:parallel-runners-mgmt runsdat) ;; check for hed in waitons => this would be circular, remove it and issue an ;; error (if (member test-name waitons) (begin (debug:print-error 0 default-log-port "test " test-name " has listed itself as a waiton, please correct this!") (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))
︙			︙
1662 1663 1664 1665 1666 1667 1668 ~~1669~~ ~~1670~~ ~~1671~~ 1672 1673 1674 1675 1676 1677 1678	;; ((not items) (debug:print-info 4 default-log-port "cond branch - " "rtq-2") (debug:print-info 4 default-log-port "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) ~~(runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))~~ ~~(runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ~~;; look at the test jobgroup and tot jobs running~~~~ ~~(let ((loop-list (runs:process-expanded-tests runsdat testdat)))~~ (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated (not itemdat)) ;; and not yet expanded into the list of things to be done (debug:print-info 4 default-log-port "cond branch - " "rtq-3")	\| > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > \| > \|	1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745	;; ((not items) (debug:print-info 4 default-log-port "cond branch - " "rtq-2") (debug:print-info 4 default-log-port "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) ;; gonna try a strategy change here. ;; ;; check if can run more tests. if yes, continue, if no, rest until can run more ;; look at the test jobgroup and tot jobs running ;; ;; NOTE: This does NOT actually gate here, only captures the proc to be called later ;; (if (not (runs:dat-wait-for-jobs-function runsdat)) (runs:dat-wait-for-jobs-function-set! runsdat (lambda (testdat-in) (let* ((jobgroup (runs:testdat-jobgroup testdat-in)) (can-run-more-tests (runs:dat-can-run-more-tests runsdat)) (last-jobs-check-time (runs:dat-last-jobs-check-time runsdat)) (should-check-jobs (match can-run-more-tests ((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params) (if (< (- max-concurrent-jobs num-running) 25) (begin (debug:print-info 0 default-log-port "less than 20 jobs headroom, ("max-concurrent-jobs "-"num-running")>20. Forcing prelaunch check.") #t) #f)) (else #f)))) ;; no record yet (if should-check-jobs (let loop-can-run-more ((res (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) (remtries 1440)) ;; we can wait for up to two hours for jobs to get done (match res ((run-more num-running . rem) (if (or run-more (< remtries 1)) (begin (if (runs:lownoise "num-running" 30) (debug:print-info 0 default-log-port "Have "num-running" tests of max " max-concurrent-jobs)) (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through (begin (if (runs:lownoise "num-running" 10) (debug:print-info 0 default-log-port "Can't run more tests, have "num-running" tests of " max-concurrent-jobs " allowed.")) (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable ;; wait for load here (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) (- remtries 1))))))) ))))) ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated (not itemdat)) ;; and not yet expanded into the list of things to be done (debug:print-info 4 default-log-port "cond branch - " "rtq-3")
︙			︙
1727 1728 1729 1730 1731 1732 1733 ~~1734~~ 1735 ~~1736 1737~~ 1738 1739 ~~1740 1741 1742~~ 1743 1744 1745 1746 1747 1748 1749	(loop (car tal)(cdr tal) reg reruns))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS ((or (procedure? items)(eq? items 'have-procedure)) (debug:print-info 4 default-log-port "cond branch - " "rtq-4") ~~(let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))~~ (if (and (list? can-run-more) (car can-run-more)) (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup ~~max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here~~ (if loop-list (apply loop loop-list) ~~(debug:print-info 4 default-log-port " -- Can't expand hed="hed) ) )~~ ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) (debug:print-info 4 default-log-port "cond branch - " "rtq-5") (debug:print-error 0 default-log-port "Should not have a list of items in a test and the itemspath set - please report this")	\| \| \| > > > \| < <	1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817	(loop (car tal)(cdr tal) reg reruns))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS ((or (procedure? items)(eq? items 'have-procedure)) (debug:print-info 4 default-log-port "cond branch - " "rtq-4") (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) ;; itemized test expanded here (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) (if loop-list (apply loop loop-list) (debug:print-info 4 default-log-port " -- Can't expand hed="hed))) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) (debug:print-info 4 default-log-port "cond branch - " "rtq-5") (debug:print-error 0 default-log-port "Should not have a list of items in a test and the itemspath set - please report this")
︙			︙
1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 ~~1798~~ 1799 1800 1801 1802 1803 1804 1805	(debug:print-info 4 default-log-port "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) ))) ;; end loop on sorted test names ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched (rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now if -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes (prev-num-running 0)) ;; (debug:print-info 13 default-log-port "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup configdat "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 default-log-port "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no (debug:print-info 0 default-log-port "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 default-log-port "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ~~(thread-sleep! 5)~~ (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 default-log-port "Calling Post Hook") ;; (runs:run-post-hook run-id)	> > > \|	1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876	(debug:print-info 4 default-log-port "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) ))) ;; end loop on sorted test names ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched (rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now if -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes (prev-num-running 0)) ;; (debug:print-info 13 default-log-port "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup configdat "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 default-log-port "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no (debug:print-info 0 default-log-port "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 default-log-port "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 default-log-port "Calling Post Hook") ;; (runs:run-post-hook run-id)
︙			︙
1850 1851 1852 1853 1854 1855 1856 ~~1857~~ 1858 1859 1860 1861 1862 1863 1864	(if (not (vector? t)) (conc t) (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)) ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step ;; ~~(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry)~~ ;; All these vars might be referenced by the testconfig file reader ;; ;; NEED to reprocess testconfig here, ensuring that item variables are available. ;; This is for Tal's issue with item-specific env vars not being set for use in skip. ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273 ;; (let* ((test-name (tests:testqueue-get-testname test-record))	\|	1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935	(if (not (vector? t)) (conc t) (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)) ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step ;; (define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry runsdat testdat-rec) ;; All these vars might be referenced by the testconfig file reader ;; ;; NEED to reprocess testconfig here, ensuring that item variables are available. ;; This is for Tal's issue with item-specific env vars not being set for use in skip. ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273 ;; (let* ((test-name (tests:testqueue-get-testname test-record))