Megatest

Check-in [290d5d298f]
Login
Overview
Comment:Added filter to skip COMPLETED tests on startup. Move code to query db for checking if ok to run more tests to server side
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | dev
Files: files | file ages | folders
SHA1: 290d5d298fff3ca3a29016a0df86f0bdf61e93de
User & Date: mrwellan on 2013-06-07 22:03:08
Other Links: branch diff | manifest | tags
Context
2013-06-18
00:45
Tweaks for debugging stuck launch queue issues check-in: 9243ce728a user: matt tags: dev
2013-06-07
22:07
Merged development into v1.5501 as it diverges more than a patch release should diverge. check-in: fd2b134daa user: mrwellan tags: v1.55
22:03
Added filter to skip COMPLETED tests on startup. Move code to query db for checking if ok to run more tests to server side check-in: 290d5d298f user: mrwellan tags: dev
13:55
This version successfully ran stdcell tests (200 parallel jobs, 6000 total jobs) but it has problems with queue management check-in: a5d4098b27 user: mrwellan tags: dev
Changes

Modified megatest-version.scm from [19b1c1747c] to [252b48ae3a].

1
2
3
4
5
6
7
;; Always use two digit decimal
;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00..

(declare (unit megatest-version))

(define megatest-version 1.5429)






|

1
2
3
4
5
6
7
;; Always use two digit decimal
;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00..

(declare (unit megatest-version))

(define megatest-version 1.5431)

Modified run-tests-queue-classic.scm from [4c9cbe32ec] to [78c0dba7f7].

20
21
22
23
24
25
26

27
28
29
30
31
32
33
		   (tal         (cdr sorted-test-names))
		   (reruns      '()))
	  (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns))
	  ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns)
	  (let* ((test-record (hash-table-ref test-records hed))
		 (test-name   (tests:testqueue-get-testname test-record))
		 (tconfig     (tests:testqueue-get-testconfig test-record))

		 (testmode    (let ((m (config-lookup tconfig "requirements" "mode")))
				(if m (string->symbol m) 'normal)))
		 (waitons     (tests:testqueue-get-waitons    test-record))
		 (priority    (tests:testqueue-get-priority   test-record))
		 (itemdat     (tests:testqueue-get-itemdat    test-record)) ;; itemdat can be a string, list or #f
		 (items       (tests:testqueue-get-items      test-record))
		 (item-path   (item-list->path itemdat))







>







20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
		   (tal         (cdr sorted-test-names))
		   (reruns      '()))
	  (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns))
	  ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns)
	  (let* ((test-record (hash-table-ref test-records hed))
		 (test-name   (tests:testqueue-get-testname test-record))
		 (tconfig     (tests:testqueue-get-testconfig test-record))
		 (jobgroup    (config-lookup tconfig "requirements" "jobgroup"))
		 (testmode    (let ((m (config-lookup tconfig "requirements" "mode")))
				(if m (string->symbol m) 'normal)))
		 (waitons     (tests:testqueue-get-waitons    test-record))
		 (priority    (tests:testqueue-get-priority   test-record))
		 (itemdat     (tests:testqueue-get-itemdat    test-record)) ;; itemdat can be a string, list or #f
		 (items       (tests:testqueue-get-items      test-record))
		 (item-path   (item-list->path itemdat))
52
53
54
55
56
57
58

59
60
61
62
63
64
65
66
		  (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))

	    (cond ;; OUTER COND
	     ((not items) ;; when false the test is ok to be handed off to launch (but not before)
	      (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
	               (not (null? tal)))
	          (loop (car newtal)(cdr newtal) reruns))

	      (let* ((run-limits-info         (runs:can-run-more-tests test-record max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
		     (have-resources          (car run-limits-info))
		     (num-running             (list-ref run-limits-info 1))
		     (num-running-in-jobgroup (list-ref run-limits-info 2))
		     (max-concurrent-jobs     (list-ref run-limits-info 3))
		     (job-group-limit         (list-ref run-limits-info 4))
		     (prereqs-not-met         (db:get-prereqs-not-met run-id waitons item-path mode: testmode))
		     (fails                   (runs:calc-fails prereqs-not-met))







>
|







53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
		  (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))

	    (cond ;; OUTER COND
	     ((not items) ;; when false the test is ok to be handed off to launch (but not before)
	      (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
	               (not (null? tal)))
	          (loop (car newtal)(cdr newtal) reruns))
	      (let* ((run-limits-info         ;; (cdb:remote-run runs:can-run-more-tests #f jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
		      (open-run-close runs:can-run-more-tests #f jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
		     (have-resources          (car run-limits-info))
		     (num-running             (list-ref run-limits-info 1))
		     (num-running-in-jobgroup (list-ref run-limits-info 2))
		     (max-concurrent-jobs     (list-ref run-limits-info 3))
		     (job-group-limit         (list-ref run-limits-info 4))
		     (prereqs-not-met         (db:get-prereqs-not-met run-id waitons item-path mode: testmode))
		     (fails                   (runs:calc-fails prereqs-not-met))
189
190
191
192
193
194
195
196

197
198
199
200
201
202
203
		    (debug:print-info 4 "End of items list, looping with next after short delay")
                    ;; (thread-sleep! (+ 0.01 *global-delta*))
		    (loop (car tal)(cdr tal) reruns))))

	     ;; if items is a proc then need to run items:get-items-from-config, get the list and loop 
	     ;;    - but only do that if resources exist to kick off the job
	     ((or (procedure? items)(eq? items 'have-procedure))
	      (let ((can-run-more    (runs:can-run-more-tests test-record max-concurrent-jobs)))

		(if (and (list? can-run-more)
			 (car can-run-more))
		    (let* ((prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode))
			   (fails           (runs:calc-fails prereqs-not-met))
			   (non-completed   (runs:calc-not-completed prereqs-not-met)))
		      (debug:print-info 8 "can-run-more: " can-run-more
				   "\n testname:        " hed







|
>







191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
		    (debug:print-info 4 "End of items list, looping with next after short delay")
                    ;; (thread-sleep! (+ 0.01 *global-delta*))
		    (loop (car tal)(cdr tal) reruns))))

	     ;; if items is a proc then need to run items:get-items-from-config, get the list and loop 
	     ;;    - but only do that if resources exist to kick off the job
	     ((or (procedure? items)(eq? items 'have-procedure))
	      (let ((can-run-more    ;; (cdb:remote-run runs:can-run-more-tests #f jobgroup max-concurrent-jobs)))
		     (open-run-close runs:can-run-more-tests #f jobgroup max-concurrent-jobs)))
		(if (and (list? can-run-more)
			 (car can-run-more))
		    (let* ((prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode))
			   (fails           (runs:calc-fails prereqs-not-met))
			   (non-completed   (runs:calc-not-completed prereqs-not-met)))
		      (debug:print-info 8 "can-run-more: " can-run-more
				   "\n testname:        " hed

Modified run-tests-queue-new.scm from [bdb0e5926d] to [b23b3c860e].

1
2
3
4
5
6
7

8
9
10
11
12
13
14
15
16








17
18
19
20
21
22
23
24
25
26
27

28
29
30
31
32
33
34

35
36







37
38
39
40
41
42
43

;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... >
(define (runs:run-tests-queue-new run-id runname test-records keyvals flags test-patts required-tests reglen)
  ;; At this point the list of parent tests is expanded 
  ;; NB// Should expand items here and then insert into the run queue.
  (debug:print 5 "test-records: " test-records ", flags: " (hash-table->alist flags))
  (let ((run-info              (cdb:remote-run db:get-run-info #f run-id))

	(sorted-test-names     (tests:sort-by-priority-and-waiton test-records))
	(test-registry         (make-hash-table))
	(registry-mutex        (make-mutex))
	(num-retries           0)
	(max-retries           (config-lookup *configdat* "setup" "maxretries"))
	(max-concurrent-jobs   (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				 (if (and mcj (string->number mcj))
				     (string->number mcj)
				     1)))) ;; length of the register queue ahead








    (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100))
    (if (not (null? sorted-test-names))
	(let loop ((hed         (car sorted-test-names))
		   (tal         (cdr sorted-test-names))
		   (reg         '()) ;; registered, put these at the head of tal 
		   (reruns      '()))
	  (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns))
	  ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns)
	  (let* ((test-record (hash-table-ref test-records hed))
		 (test-name   (tests:testqueue-get-testname test-record))
		 (tconfig     (tests:testqueue-get-testconfig test-record))

		 (testmode    (let ((m (config-lookup tconfig "requirements" "mode")))
				(if m (string->symbol m) 'normal)))
		 (waitons     (tests:testqueue-get-waitons    test-record))
		 (priority    (tests:testqueue-get-priority   test-record))
		 (itemdat     (tests:testqueue-get-itemdat    test-record)) ;; itemdat can be a string, list or #f
		 (items       (tests:testqueue-get-items      test-record))
		 (item-path   (item-list->path itemdat))

		 (newtal      (append tal (list hed)))
		 (regfull     (> (length reg) reglen)))







	    ;; (if (> (length reg) 10)
	    ;;     (begin
	    ;;       (set! tal (cons hed tal))
	    ;;       (set! hed (car reg))
	    ;;       (set! reg (cdr reg))
	    ;;       (set! newtal tal)))
	    (debug:print 6







>









>
>
>
>
>
>
>
>











>







>


>
>
>
>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61

;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... >
(define (runs:run-tests-queue-new run-id runname test-records keyvals flags test-patts required-tests reglen)
  ;; At this point the list of parent tests is expanded 
  ;; NB// Should expand items here and then insert into the run queue.
  (debug:print 5 "test-records: " test-records ", flags: " (hash-table->alist flags))
  (let ((run-info              (cdb:remote-run db:get-run-info #f run-id))
	(tests-info            (cdb:remote-run db:get-tests-for-run #f run-id #f '() '())) ;;  qryvals: "id,testname,item_path"))
	(sorted-test-names     (tests:sort-by-priority-and-waiton test-records))
	(test-registry         (make-hash-table))
	(registry-mutex        (make-mutex))
	(num-retries           0)
	(max-retries           (config-lookup *configdat* "setup" "maxretries"))
	(max-concurrent-jobs   (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				 (if (and mcj (string->number mcj))
				     (string->number mcj)
				     1)))) ;; length of the register queue ahead
    ;; Initialize the test-registery hash with tests that already have a record
    (for-each (lambda (trec)
		(let ((id (db:test-get-id        trec))
		      (tn (db:test-get-testname  trec))
		      (ip (db:test-get-item-path trec))
		      (st (db:test-get-state     trec)))
		  (hash-table-set! test-registry (runs:make-full-test-name tn ip) (string->symbol st))))
	      tests-info)
    (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100))
    (if (not (null? sorted-test-names))
	(let loop ((hed         (car sorted-test-names))
		   (tal         (cdr sorted-test-names))
		   (reg         '()) ;; registered, put these at the head of tal 
		   (reruns      '()))
	  (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns))
	  ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns)
	  (let* ((test-record (hash-table-ref test-records hed))
		 (test-name   (tests:testqueue-get-testname test-record))
		 (tconfig     (tests:testqueue-get-testconfig test-record))
		 (jobgroup    (config-lookup tconfig "requirements" "jobgroup"))
		 (testmode    (let ((m (config-lookup tconfig "requirements" "mode")))
				(if m (string->symbol m) 'normal)))
		 (waitons     (tests:testqueue-get-waitons    test-record))
		 (priority    (tests:testqueue-get-priority   test-record))
		 (itemdat     (tests:testqueue-get-itemdat    test-record)) ;; itemdat can be a string, list or #f
		 (items       (tests:testqueue-get-items      test-record))
		 (item-path   (item-list->path itemdat))
		 (tfullname   (runs:make-full-test-name test-name item-path))
		 (newtal      (append tal (list hed)))
		 (regfull     (> (length reg) reglen)))

	    ;; Fast skip of tests that are already "COMPLETED"
	    (if (equal? (hash-table-ref/default test-registry tfullname #f) 'COMPLETED)
		(begin
		  (debug:print-info 0 "Skipping COMPLETED test " tfullname)
		  (if (not (null? tal))
		      (loop (car tal)(cdr tal) reg reruns))))
	    ;; (if (> (length reg) 10)
	    ;;     (begin
	    ;;       (set! tal (cons hed tal))
	    ;;       (set! hed (car reg))
	    ;;       (set! reg (cdr reg))
	    ;;       (set! newtal tal)))
	    (debug:print 6
59
60
61
62
63
64
65

66
67
68
69
70
71
72
73
		  (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))

	    (cond ;; OUTER COND
	     ((not items) ;; when false the test is ok to be handed off to launch (but not before)
	      (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
	               (not (null? tal)))
	          (loop (car tal)(cdr tal) reg reruns))

	      (let* ((run-limits-info         (runs:can-run-more-tests test-record max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
		     (have-resources          (car run-limits-info))
		     (num-running             (list-ref run-limits-info 1))
		     (num-running-in-jobgroup (list-ref run-limits-info 2))
		     (max-concurrent-jobs     (list-ref run-limits-info 3))
		     (job-group-limit         (list-ref run-limits-info 4))
		     (prereqs-not-met         (db:get-prereqs-not-met run-id waitons item-path mode: testmode))
		     (fails                   (runs:calc-fails prereqs-not-met))







>
|







77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
		  (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))

	    (cond ;; OUTER COND
	     ((not items) ;; when false the test is ok to be handed off to launch (but not before)
	      (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
	               (not (null? tal)))
	          (loop (car tal)(cdr tal) reg reruns))
	      (let* ((run-limits-info         (cdb:remote-run runs:can-run-more-tests #f jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
		      ;; (open-run-close runs:can-run-more-tests #f jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
		     (have-resources          (car run-limits-info))
		     (num-running             (list-ref run-limits-info 1))
		     (num-running-in-jobgroup (list-ref run-limits-info 2))
		     (max-concurrent-jobs     (list-ref run-limits-info 3))
		     (job-group-limit         (list-ref run-limits-info 4))
		     (prereqs-not-met         (db:get-prereqs-not-met run-id waitons item-path mode: testmode))
		     (fails                   (runs:calc-fails prereqs-not-met))
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
					       (cdb:tests-register-test *runremote* run-id test-name ""))
					   (cdb:tests-register-test *runremote* run-id test-name item-path)
		        		   (mutex-lock! registry-mutex)
					   (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)
		        		   (mutex-unlock! registry-mutex))
		        		 (conc test-name "/" item-path))))
		    (thread-start! th))
		  (runs:shrink-can-run-more-tests-count)   ;; DELAY TWEAKER (still needed?)
		  (if (and (null? tal)(null? reg))
		      (loop hed tal reg reruns)
		      (loop (runs:queue-next-hed tal reg reglen regfull)
			    (runs:queue-next-tal tal reg reglen regfull)
			    (let ((newl (append reg (list hed))))
			      (if regfull 
				  (cdr newl)







|







133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
					       (cdb:tests-register-test *runremote* run-id test-name ""))
					   (cdb:tests-register-test *runremote* run-id test-name item-path)
		        		   (mutex-lock! registry-mutex)
					   (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)
		        		   (mutex-unlock! registry-mutex))
		        		 (conc test-name "/" item-path))))
		    (thread-start! th))
		  (cdb:remote-run runs:shrink-can-run-more-tests-count #f)   ;; DELAY TWEAKER (still needed?)
		  (if (and (null? tal)(null? reg))
		      (loop hed tal reg reruns)
		      (loop (runs:queue-next-hed tal reg reglen regfull)
			    (runs:queue-next-tal tal reg reglen regfull)
			    (let ((newl (append reg (list hed))))
			      (if regfull 
				  (cdr newl)
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
		  (loop (car newtal)(cdr newtal) reg reruns))
		 ((and have-resources
		       (or (null? prereqs-not-met)
			   (and (eq? testmode 'toplevel)
				(null? non-completed))))
		  (run:test run-id run-info keyvals runname test-record flags #f)
		  (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'running)
		  (runs:shrink-can-run-more-tests-count)  ;; DELAY TWEAKER (still needed?)
		  ;; (thread-sleep! *global-delta*)
		  (if (not (null? tal))
		      (loop (runs:queue-next-hed tal reg reglen regfull)
			    (runs:queue-next-tal tal reg reglen regfull)
			    (runs:queue-next-reg tal reg reglen regfull)
			    reruns)))
		 (else ;; must be we have unmet prerequisites







|







166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
		  (loop (car newtal)(cdr newtal) reg reruns))
		 ((and have-resources
		       (or (null? prereqs-not-met)
			   (and (eq? testmode 'toplevel)
				(null? non-completed))))
		  (run:test run-id run-info keyvals runname test-record flags #f)
		  (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'running)
		  (cdb:remote-run runs:shrink-can-run-more-tests-count #f)  ;; DELAY TWEAKER (still needed?)
		  ;; (thread-sleep! *global-delta*)
		  (if (not (null? tal))
		      (loop (runs:queue-next-hed tal reg reglen regfull)
			    (runs:queue-next-tal tal reg reglen regfull)
			    (runs:queue-next-reg tal reg reglen regfull)
			    reruns)))
		 (else ;; must be we have unmet prerequisites
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
			(loop (car newtal)(cdr newtal) reg reruns))
		      ;; the waiton is FAIL so no point in trying to run hed ever again
		      (if (not (null? tal))
			  (if (vector? hed)
			      (begin 
				(debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed)
					     " from the launch list as it has prerequistes that are FAIL")
				(runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?)
				;; (thread-sleep! *global-delta*)
				(hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'removed)
				(loop (runs:queue-next-hed tal reg reglen regfull)
				      (runs:queue-next-tal tal reg reglen regfull)
				      (runs:queue-next-reg tal reg reglen regfull)
				      (cons hed reruns)))
			      (begin
				(debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;;  " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)")
				(runs:shrink-can-run-more-tests-count) ;; DELAY TWEAKER (still needed?)
				;; (thread-sleep! (+ 0.01 *global-delta*))
				(loop hed tal reg reruns))))))))) ;; END OF INNER COND
	     
	     ;; case where an items came in as a list been processed
	     ((and (list? items)     ;; thus we know our items are already calculated
		   (not   itemdat)) ;; and not yet expanded into the list of things to be done
	      (if (and (debug:debug-mode 1) ;; (>= *verbosity* 1)







|








|







190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
			(loop (car newtal)(cdr newtal) reg reruns))
		      ;; the waiton is FAIL so no point in trying to run hed ever again
		      (if (not (null? tal))
			  (if (vector? hed)
			      (begin 
				(debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed)
					     " from the launch list as it has prerequistes that are FAIL")
				(cdb:remote-run runs:shrink-can-run-more-tests-count #f) ;; DELAY TWEAKER (still needed?)
				;; (thread-sleep! *global-delta*)
				(hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'removed)
				(loop (runs:queue-next-hed tal reg reglen regfull)
				      (runs:queue-next-tal tal reg reglen regfull)
				      (runs:queue-next-reg tal reg reglen regfull)
				      (cons hed reruns)))
			      (begin
				(debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;;  " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)")
				(cdb:remote-run runs:shrink-can-run-more-tests-count #f) ;; DELAY TWEAKER (still needed?)
				;; (thread-sleep! (+ 0.01 *global-delta*))
				(loop hed tal reg reruns))))))))) ;; END OF INNER COND
	     
	     ;; case where an items came in as a list been processed
	     ((and (list? items)     ;; thus we know our items are already calculated
		   (not   itemdat)) ;; and not yet expanded into the list of things to be done
	      (if (and (debug:debug-mode 1) ;; (>= *verbosity* 1)
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
			  (runs:queue-next-tal tal reg reglen regfull)
			  (runs:queue-next-reg tal reg reglen regfull)
			  reruns))))

	     ;; if items is a proc then need to run items:get-items-from-config, get the list and loop 
	     ;;    - but only do that if resources exist to kick off the job
	     ((or (procedure? items)(eq? items 'have-procedure))
	      (let ((can-run-more    (runs:can-run-more-tests test-record max-concurrent-jobs)))
		(if (and (list? can-run-more)
			 (car can-run-more))
		    (let* ((prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode))
			   (fails           (runs:calc-fails prereqs-not-met))
			   (non-completed   (runs:calc-not-completed prereqs-not-met)))
		      (debug:print-info 8 "can-run-more: " can-run-more
					"\n testname:        " hed







|







236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
			  (runs:queue-next-tal tal reg reglen regfull)
			  (runs:queue-next-reg tal reg reglen regfull)
			  reruns))))

	     ;; if items is a proc then need to run items:get-items-from-config, get the list and loop 
	     ;;    - but only do that if resources exist to kick off the job
	     ((or (procedure? items)(eq? items 'have-procedure))
	      (let ((can-run-more    (cdb:remote-run runs:can-run-more-tests #f jobgroup max-concurrent-jobs)))
		(if (and (list? can-run-more)
			 (car can-run-more))
		    (let* ((prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode))
			   (fails           (runs:calc-fails prereqs-not-met))
			   (non-completed   (runs:calc-not-completed prereqs-not-met)))
		      (debug:print-info 8 "can-run-more: " can-run-more
					"\n testname:        " hed

Modified runs.scm from [513fa82301] to [d409eb1269].

161
162
163
164
165
166
167
168
169
170
171



172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190

(define (set-item-env-vars itemdat)
  (for-each (lambda (item)
	      (debug:print 2 "setenv " (car item) " " (cadr item))
	      (setenv (car item) (cadr item)))
	    itemdat))

(define *last-num-running-tests* 0)

;; Every time can-run-more-tests is called increment the delay
;; if the cou



(define *runs:can-run-more-tests-count* 0)
(define (runs:shrink-can-run-more-tests-count)
  (set! *runs:can-run-more-tests-count* 0)) ;; (/ *runs:can-run-more-tests-count* 2)))

(define (runs:can-run-more-tests test-record max-concurrent-jobs)
  (thread-sleep! (cond
		  ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while
		  (else 0)))
  (let* ((tconfig                 (tests:testqueue-get-testconfig test-record))
	 (jobgroup                (config-lookup tconfig "requirements" "jobgroup"))
	 (num-running             (cdb:remote-run db:get-count-tests-running #f))
	 (num-running-in-jobgroup (cdb:remote-run db:get-count-tests-running-in-jobgroup #f jobgroup))
	 (job-group-limit         (config-lookup *configdat* "jobgroups" jobgroup)))
    (if (> (+ num-running num-running-in-jobgroup) 0)
	(set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1)))
    (if (not (eq? *last-num-running-tests* num-running))
	(begin
	  (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
	  (set! *last-num-running-tests* num-running)))







<
<

|
>
>
>

|


|



<
<
|
|







161
162
163
164
165
166
167


168
169
170
171
172
173
174
175
176
177
178
179
180


181
182
183
184
185
186
187
188
189

(define (set-item-env-vars itemdat)
  (for-each (lambda (item)
	      (debug:print 2 "setenv " (car item) " " (cadr item))
	      (setenv (car item) (cadr item)))
	    itemdat))



;; Every time can-run-more-tests is called increment the delay
;;
;; NOTE: We run this server-side!! Do not use this global except in the runs:can-run-more-tests routine
;;
(define *last-num-running-tests* 0)
(define *runs:can-run-more-tests-count* 0)
(define (runs:shrink-can-run-more-tests-count db) ;; the db is just so we can use cdb:remote-run
  (set! *runs:can-run-more-tests-count* 0)) ;; (/ *runs:can-run-more-tests-count* 2)))

(define (runs:can-run-more-tests db jobgroup max-concurrent-jobs)
  (thread-sleep! (cond
		  ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while
		  (else 0)))


  (let* ((num-running             (db:get-count-tests-running db))
	 (num-running-in-jobgroup (db:get-count-tests-running-in-jobgroup db jobgroup))
	 (job-group-limit         (config-lookup *configdat* "jobgroups" jobgroup)))
    (if (> (+ num-running num-running-in-jobgroup) 0)
	(set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1)))
    (if (not (eq? *last-num-running-tests* num-running))
	(begin
	  (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
	  (set! *last-num-running-tests* num-running)))
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
			    (real-dir      (if (file-exists? run-dir)
					       (resolve-pathname run-dir)
					       #f))
			    (test-state    (db:test-get-state new-test-dat))
			    (test-fulln    (db:test-get-fullname new-test-dat)))
			   (case action
			     ((remove-runs)
			      (debug:print-info 0 "test: " test-name " item path: " item-path " test-state: " test-state)
			      (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
				  (begin
				    (if (not (hash-table-ref/default test-retry-time test-fulln #f))
					(hash-table-set! test-retry-time test-fulln (current-seconds)))
				    (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time)
				      ;; This test is not in a correct state for cleaning up. Let's try some graceful shutdown steps first
				      ;; Set the test to "KILLREQ" and wait five seconds then try again. Repeat up to five times then give







|







611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
			    (real-dir      (if (file-exists? run-dir)
					       (resolve-pathname run-dir)
					       #f))
			    (test-state    (db:test-get-state new-test-dat))
			    (test-fulln    (db:test-get-fullname new-test-dat)))
			   (case action
			     ((remove-runs)
			      (debug:print-info 0 "test: " test-name " itest-state: " test-state)
			      (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
				  (begin
				    (if (not (hash-table-ref/default test-retry-time test-fulln #f))
					(hash-table-set! test-retry-time test-fulln (current-seconds)))
				    (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time)
				      ;; This test is not in a correct state for cleaning up. Let's try some graceful shutdown steps first
				      ;; Set the test to "KILLREQ" and wait five seconds then try again. Repeat up to five times then give