Megatest

Check-in [a5d4098b27]
Login
Overview
Comment:This version successfully ran stdcell tests (200 parallel jobs, 6000 total jobs) but it has problems with queue management
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | dev
Files: files | file ages | folders
SHA1: a5d4098b27b21a6a937fff6ddca048978ef8d69c
User & Date: mrwellan on 2013-06-07 13:55:46
Other Links: branch diff | manifest | tags
Context
2013-06-07
22:03
Added filter to skip COMPLETED tests on startup. Move code to query db for checking if ok to run more tests to server side check-in: 290d5d298f user: mrwellan tags: dev
13:55
This version successfully ran stdcell tests (200 parallel jobs, 6000 total jobs) but it has problems with queue management check-in: a5d4098b27 user: mrwellan tags: dev
09:32
Changed state for deleted runs to lowercase to differentiate test state from run state check-in: 81d027b1d3 user: matt tags: dev
Changes

Modified db.scm from [9d124c0ee8] to [9515a748b1].

788
789
790
791
792
793
794
795

796
797
798
799
800
801
802
788
789
790
791
792
793
794

795
796
797
798
799
800
801
802







-
+







			      (conc " status "
				    (if not-in "NOT" "") 
				    " IN ('" 
				    (string-intersperse statuses "','")
				    "')")))
	 (tests-match-qry (tests:match->sqlqry testpatt))
	 (qry             (conc "SELECT " qryvals
				" FROM tests WHERE run_id=? "
				" FROM tests WHERE run_id=? AND state != 'DELETED' "
				(if states-qry   (conc " AND " states-qry)   "")
				(if statuses-qry (conc " AND " statuses-qry) "")
				(if tests-match-qry (conc " AND (" tests-match-qry ") ") "")
				(case sort-by
				  ((rundir)     " ORDER BY length(rundir) DESC;")
				  ((event_time) " ORDER BY event_time ASC;")
				  (else         ";"))
843
844
845
846
847
848
849
850

851
852
853
854
855
856
857
843
844
845
846
847
848
849

850
851
852
853
854
855
856
857







-
+







			      (conc " status "
				    (if not-in "NOT" "") 
				    " IN ('" 
				    (string-intersperse statuses "','")
				    "')")))
	 (tests-match-qry (tests:match->sqlqry testpatt))
	 (qry             (conc "SELECT " qryvals 
				" FROM tests WHERE " 
				" FROM tests WHERE state != 'DELETED' AND " 
				(if run-ids
				    (if (list? run-ids)
					(conc " run_id in (" (string-intersperse (map conc run-ids) ",") ") ")
					(conc "run_id=" run-ids " "))
				    " ") ;; #f => run-ids don't filter on run-ids
				(if states-qry   (conc " AND " states-qry)   "")
				(if statuses-qry (conc " AND " statuses-qry) "")

Modified http-transport.scm from [d202a69f70] to [7cb86699d1].

211
212
213
214
215
216
217
218

219
220
221
222
223
224
225
211
212
213
214
215
216
217

218
219
220
221
222
223
224
225







-
+







			      (set! res (with-input-from-request 
					 fullurl 
					 (list (cons 'dat msg)) 
					 read-string))
			      (close-all-connections!) 
			      (mutex-unlock! *http-mutex*)))
	      (time-out     (lambda ()
			      (thread-sleep! 5)
			      (thread-sleep! 45)
			      (if (not res)
				  (begin
				    (debug:print 0 "WARNING: communication with the server timed out.")
				    (mutex-unlock! *http-mutex*)
				    (http-transport:client-send-receive serverdat msg numretries: (- numretries 1))
				    (if (< numretries 3) ;; on last try just exit
					(begin

Modified run-tests-queue-classic.scm from [e6bbd11139] to [4c9cbe32ec].

89
90
91
92
93
94
95
96
97

98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
89
90
91
92
93
94
95


96
97

98
99
100
101
102
103
104
105
106
107
108
109
110

111
112
113
114
115
116
117







-
-
+

-













-







		 ;; Registry has been started for this test but has not yet completed
		 ;; this should be rare, the case where there are only a couple of tests and the db is slow
		 ;; delay a short while and continue
		 ;; ((eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f) 'start)
		 ;;  (thread-sleep! 0.01)
		 ;;  (loop (car newtal)(cdr newtal) reruns))
		 ;; count number of 'done, if more than 100 then skip on through.
		 (;; (and (< (length (filter (lambda (x)(eq? x 'done))(hash-table-values test-registry))) 100) ;; why get more than 200 ahead?
		  (not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) ;; ) ;; too many changes required. Implement later.
		 ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) ;; ) ;; too many changes required. Implement later.
		  (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" )
		  ;; NEED TO THREADIFY THIS
		  (let ((th (make-thread (lambda ()
		        		   (mutex-lock! registry-mutex)
		        		   (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'start)
		        		   (mutex-unlock! registry-mutex)
					   ;; If haven't done it before register a top level test if this is an itemized test
					   (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done))
					       (cdb:tests-register-test *runremote* run-id test-name ""))
					   (cdb:tests-register-test *runremote* run-id test-name item-path)
		        		   (mutex-lock! registry-mutex)
					   (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)
		        		   (mutex-unlock! registry-mutex))
		        		 (conc test-name "/" item-path))))
		    (thread-start! th))
		  ;; TRY (thread-sleep! *global-delta*)
		  (runs:shrink-can-run-more-tests-count)   ;; DELAY TWEAKER (still needed?)
		  (loop (car newtal)(cdr newtal) reruns))
		 ;; At this point *all* test registrations must be completed.
		 ((not (null? (filter (lambda (x)(eq? 'start x))(hash-table-values test-registry))))
		  (debug:print-info 0 "Waiting on test registrations: " (string-intersperse 
									 (filter (lambda (x)
										   (eq? (hash-table-ref/default test-registry x #f) 'start))

Modified run-tests-queue-new.scm from [cca55ba979] to [bdb0e5926d].

1
2
3

4
5
6
7
8
9
10
11
12
13
14
15
1
2

3
4
5
6
7

8
9
10
11
12
13
14


-
+




-








;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... >
(define (runs:run-tests-queue-new run-id runname test-records keyvallst flags test-patts required-tests reglen)
(define (runs:run-tests-queue-new run-id runname test-records keyvals flags test-patts required-tests reglen)
  ;; At this point the list of parent tests is expanded 
  ;; NB// Should expand items here and then insert into the run queue.
  (debug:print 5 "test-records: " test-records ", flags: " (hash-table->alist flags))
  (let ((run-info              (cdb:remote-run db:get-run-info #f run-id))
	(key-vals              (cdb:remote-run db:get-key-vals #f run-id))
	(sorted-test-names     (tests:sort-by-priority-and-waiton test-records))
	(test-registry         (make-hash-table))
	(registry-mutex        (make-mutex))
	(num-retries           0)
	(max-retries           (config-lookup *configdat* "setup" "maxretries"))
	(max-concurrent-jobs   (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				 (if (and mcj (string->number mcj))
57
58
59
60
61
62
63
64

65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82






83
84
85
86
87
88
89
90

91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112



113
114
115
116
117

118
119
120


121
122
123
124
125
126
127
56
57
58
59
60
61
62

63
64
65
66
67
68
69
70
71
72
73
74
75






76
77
78
79
80
81
82
83
84
85
86
87
88

89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108



109
110
111
112
113
114
115

116
117


118
119
120
121
122
123
124
125
126







-
+












-
-
-
-
-
-
+
+
+
+
+
+







-
+



















-
-
-
+
+
+




-
+

-
-
+
+







	    (if (member test-name waitons)
		(begin
		  (debug:print 0 "ERROR: test " test-name " has listed itself as a waiton, please correct this!")
		  (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))

	    (cond ;; OUTER COND
	     ((not items) ;; when false the test is ok to be handed off to launch (but not before)
	      (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path))
	      (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
	               (not (null? tal)))
	          (loop (car tal)(cdr tal) reg reruns))
	      (let* ((run-limits-info         (runs:can-run-more-tests test-record max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
		     (have-resources          (car run-limits-info))
		     (num-running             (list-ref run-limits-info 1))
		     (num-running-in-jobgroup (list-ref run-limits-info 2))
		     (max-concurrent-jobs     (list-ref run-limits-info 3))
		     (job-group-limit         (list-ref run-limits-info 4))
		     (prereqs-not-met         (db:get-prereqs-not-met run-id waitons item-path mode: testmode))
		     (fails                   (runs:calc-fails prereqs-not-met))
		     (non-completed           (runs:calc-not-completed prereqs-not-met)))
		(debug:print-info 8 "have-resources: " have-resources " prereqs-not-met: " 
				  (string-intersperse 
				   (map (lambda (t)
					  (if (vector? t)
					      (conc (db:test-get-state t) "/" (db:test-get-status t))
					      (conc " WARNING: t is not a vector=" t )))
					prereqs-not-met) ", ") " fails: " fails)
			     (string-intersperse 
			      (map (lambda (t)
				     (if (vector? t)
					 (conc (db:test-get-state t) "/" (db:test-get-status t))
					 (conc " WARNING: t is not a vector=" t )))
				   prereqs-not-met) ", ") " fails: " fails)
		(debug:print-info 4 "hed=" hed "\n  test-record=" test-record "\n  test-name: " test-name "\n  item-path: " item-path "\n  test-patts: " test-patts)

		;; Don't know at this time if the test have been launched at some time in the past
		;; i.e. is this a re-launch?
		(debug:print-info 4 "run-limits-info = " run-limits-info)
		(cond ;; INNER COND #1 for a launchable test
		 ;; Check item path against item-patts
		 ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path)) ;; This test/itempath is not to be run
		 ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) ;; This test/itempath is not to be run
		  ;; else the run is stuck, temporarily or permanently
		  ;; but should check if it is due to lack of resources vs. prerequisites
		  (debug:print-info 1 "Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " test-patts)
		  ;; (thread-sleep! *global-delta*)
		  (if (not (null? tal))
		      (loop (runs:queue-next-hed tal reg reglen regfull)
			    (runs:queue-next-tal tal reg reglen regfull)
			    (runs:queue-next-reg tal reg reglen regfull)
			    reruns)))
		 ;; Registry has been started for this test but has not yet completed
		 ;; this should be rare, the case where there are only a couple of tests and the db is slow
		 ;; delay a short while and continue
		 ;; ((eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f) 'start)
		 ;;  (thread-sleep! 0.01)
		 ;;  (loop (car newtal)(cdr newtal) reruns))
		 ;; count number of 'done, if more than 100 then skip on through.
		 ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) ;; ) ;; too many changes required. Implement later.
		  (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" )
		  (let ((th (make-thread (lambda ()
					   (mutex-lock! registry-mutex)
					   (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'start)
					   (mutex-unlock! registry-mutex)
		        		   (mutex-lock! registry-mutex)
		        		   (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'start)
		        		   (mutex-unlock! registry-mutex)
					   ;; If haven't done it before register a top level test if this is an itemized test
					   (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done))
					       (cdb:tests-register-test *runremote* run-id test-name ""))
					   (cdb:tests-register-test *runremote* run-id test-name item-path)
					   (mutex-lock! registry-mutex)
		        		   (mutex-lock! registry-mutex)
					   (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)
					   (mutex-unlock! registry-mutex))
					 (conc test-name "/" item-path))))
		        		   (mutex-unlock! registry-mutex))
		        		 (conc test-name "/" item-path))))
		    (thread-start! th))
		  (runs:shrink-can-run-more-tests-count)   ;; DELAY TWEAKER (still needed?)
		  (if (and (null? tal)(null? reg))
		      (loop hed tal reg reruns)
		      (loop (runs:queue-next-hed tal reg reglen regfull)
			    (runs:queue-next-tal tal reg reglen regfull)
			    (let ((newl (append reg (list hed))))
146
147
148
149
150
151
152
153

154
155
156
157
158
159
160
145
146
147
148
149
150
151

152
153
154
155
156
157
158
159







-
+







		  (thread-sleep! 1) ;; (+ 2 *global-delta*))
		  ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
		  (loop (car newtal)(cdr newtal) reg reruns))
		 ((and have-resources
		       (or (null? prereqs-not-met)
			   (and (eq? testmode 'toplevel)
				(null? non-completed))))
		  (run:test run-id run-info key-vals runname test-record flags #f)
		  (run:test run-id run-info keyvals runname test-record flags #f)
		  (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'running)
		  (runs:shrink-can-run-more-tests-count)  ;; DELAY TWEAKER (still needed?)
		  ;; (thread-sleep! *global-delta*)
		  (if (not (null? tal))
		      (loop (runs:queue-next-hed tal reg reglen regfull)
			    (runs:queue-next-tal tal reg reglen regfull)
			    (runs:queue-next-reg tal reg reglen regfull)
198
199
200
201
202
203
204
205

206
207
208
209
210
211
212
197
198
199
200
201
202
203

204
205
206
207
208
209
210
211







-
+







		  (pp items))
	      (for-each
	       (lambda (my-itemdat)
		 (let* ((new-test-record (let ((newrec (make-tests:testqueue)))
					   (vector-copy! test-record newrec)
					   newrec))
			(my-item-path (item-list->path my-itemdat)))
		   (if (tests:match test-patts hed my-item-path) ;; (patt-list-match my-item-path item-patts)           ;; yes, we want to process this item, NOTE: Should not need this check here!
		   (if (tests:match test-patts hed my-item-path required: required-tests) ;; (patt-list-match my-item-path item-patts)           ;; yes, we want to process this item, NOTE: Should not need this check here!
		       (let ((newtestname (runs:make-full-test-name hed my-item-path)))    ;; test names are unique on testname/item-path
			 (tests:testqueue-set-items!     new-test-record #f)
			 (tests:testqueue-set-itemdat!   new-test-record my-itemdat)
			 (tests:testqueue-set-item_path! new-test-record my-item-path)
			 (hash-table-set! test-records newtestname new-test-record)
			 (set! tal (cons newtestname tal)))))) ;; since these are itemized create new test names testname/itempath
	       items)
245
246
247
248
249
250
251
252

253
254
255
256
257
258
259
244
245
246
247
248
249
250

251
252
253
254
255
256
257
258







-
+







		       ((or (null? prereqs-not-met) ;; all prereqs met, fire off the test
			    ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch
			    (and (eq? testmode 'toplevel)
				 (null? non-completed)))
			(let ((test-name (tests:testqueue-get-testname test-record)))
			  (setenv "MT_TEST_NAME" test-name) ;; 
			  (setenv "MT_RUNNAME"   runname)
			  (set-megatest-env-vars run-id) ;; these may be needed by the launching process
			  (set-megatest-env-vars run-id inrunname: runname) ;; these may be needed by the launching process
			  (let ((items-list (items:get-items-from-config tconfig)))
			    (if (list? items-list)
				(begin
				  (tests:testqueue-set-items! test-record items-list)
				  ;; (thread-sleep! *global-delta*)
				  (loop hed tal reg reruns))
				(begin
273
274
275
276
277
278
279
280
281


282
283
284
285
286
287
288
272
273
274
275
276
277
278


279
280
281
282
283
284
285
286
287







-
-
+
+







				(loop (runs:queue-next-hed tal reg reglen regfull)
				      (runs:queue-next-tal tal reg reglen regfull)
				      (runs:queue-next-reg tal reg reglen regfull)
				      reruns))
			    (loop (car newtal)(cdr newtal) reg reruns))) ;; an issue with prereqs not yet met?
		       ((and (not (null? fails))(eq? testmode 'normal))
			(debug:print-info 1 "test "  hed " (mode=" testmode ") has failed prerequisite(s); "
					  (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ")
					  ", removing it from to-do list")
				     (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ")
				     ", removing it from to-do list")
			(if (not (null? tal))
			    (begin
			      ;; (thread-sleep! *global-delta*)
			      (loop (runs:queue-next-hed tal reg reglen regfull)
				    (runs:queue-next-tal tal reg reglen regfull)
				    (runs:queue-next-reg tal reg reglen regfull)
				    (cons hed reruns)))))

Modified runs.scm from [366e8472b0] to [513fa82301].

597
598
599
600
601
602
603
604

605
606
607
608
609
610
611
612
613
614
615
616
617
618
619

620
621
622
623
624
625
626
597
598
599
600
601
602
603

604
605
606
607
608
609
610
611
612
613
614
615
616
617
618

619
620
621
622
623
624
625
626







-
+














-
+







		    (debug:print-info 0 "action not recognised " action)))
		 (let ((sorted-tests     (sort tests (lambda (a b)(let ((dira (db:test-get-rundir a))
									(dirb (db:test-get-rundir b)))
								    (if (and (string? dira)(string? dirb))
									(> (string-length dira)(string-length dirb))
									#f)))))
		       (test-retry-time  (make-hash-table))
		       (allow-run-time   25)) ;; seconds to allow for killing tests before just brutally killing 'em
		       (allow-run-time   10)) ;; seconds to allow for killing tests before just brutally killing 'em
		   (let loop ((test (car sorted-tests))
			      (tal  (cdr sorted-tests)))
		     (let* ((test-id       (db:test-get-id test))
			    (new-test-dat  (cdb:remote-run db:get-test-info-by-id #f test-id))
			    (item-path     (db:test-get-item-path new-test-dat))
			    (test-name     (db:test-get-testname new-test-dat))
			    (run-dir       (db:test-get-rundir new-test-dat))    ;; run dir is from the link tree
			    (real-dir      (if (file-exists? run-dir)
					       (resolve-pathname run-dir)
					       #f))
			    (test-state    (db:test-get-state new-test-dat))
			    (test-fulln    (db:test-get-fullname new-test-dat)))
			   (case action
			     ((remove-runs)
			      (debug:print-info 0 "test-state: " test-state)
			      (debug:print-info 0 "test: " test-name " item path: " item-path " test-state: " test-state)
			      (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
				  (begin
				    (if (not (hash-table-ref/default test-retry-time test-fulln #f))
					(hash-table-set! test-retry-time test-fulln (current-seconds)))
				    (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time)
				      ;; This test is not in a correct state for cleaning up. Let's try some graceful shutdown steps first
				      ;; Set the test to "KILLREQ" and wait five seconds then try again. Repeat up to five times then give

Modified tests/fullrun/megatest.config from [48f6d0e4a8] to [485bc46598].

14
15
16
17
18
19
20
21

22
23
24
25
26
27
28
14
15
16
17
18
19
20

21
22
23
24
25
26
27
28







-
+







# Set launchwait to yes to use the old launch run code that waits for the launch process to return before 
# proceeding.
# launchwait yes

# If defined the runs:run-tests-queue-new queue code is used with the register test depth
# given. Otherwise the old code is used. The old code will be removed in the future and 
# a default of 10 used.
# runqueue 2
runqueue 2

# It is possible (but not recommended) to override the rsync command used
# to populate the test directories. For test development the following 
# example can be useful
#
# testcopycmd cp --remove-destination -rsv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log