Megatest

Check-in [e953469a27]
Login
Overview
Comment:Added checking for exceeding max runs to the run-later queue
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | experimental
Files: files | file ages | folders
SHA1: e953469a27dd90a5a0628a8e0823e56768d274d0
User & Date: mrwellan on 2011-06-22 23:14:51
Other Links: branch diff | manifest | tags
Context
2011-06-22
23:44
Fixed dashboard scrolling induced crash Closed-Leaf check-in: ef4bccf3fa user: mrwellan tags: experimental
23:14
Added checking for exceeding max runs to the run-later queue check-in: e953469a27 user: mrwellan tags: experimental
2011-06-20
14:42
Switched when queued items get run. check-in: 5c521cf269 user: mrwellan tags: experimental
Changes

Modified runs.scm from [a0270270a3] to [75f08f0e3f].

243
244
245
246
247
248
249














250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273


274
275
276
277
278
279
280
	 (res    '()))
    ;; (print "INFO: Looking at tests " (string-intersperse tests ","))
    (for-each (lambda (testpath)
		(if (file-exists? (conc testpath "/testconfig"))
		    (set! res (cons (last (string-split testpath "/")) res))))
	      tests)
    res))















(define (run-tests db test-names)
  (let* ((keys        (db-get-keys db))
	 (keyvallst   (keys->vallist keys #t))
	 (run-id      (register-run db keys))) ;;  test-name)))
    ;; on the first pass or call to run-tests set FAILS to NOT_STARTED if
    ;; -keepgoing is specified
    (if (and (eq? *passnum* 0)
	     (args:get-arg "-keepgoing"))
	(db:set-tests-state-status db run-id test-names #f "FAIL" "NOT_STARTED" "FAIL"))
    (set! *passnum* (+ *passnum* 1))
    (let loop ((numtimes 0))
      (for-each 
       (lambda (test-name)
	 (let ((num-running (db:get-count-tests-running db))
	       (max-concurrent-jobs (config-lookup *configdat* "setup" "max_concurrent_jobs")))
	   ;; (print "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
	   (if (or (not max-concurrent-jobs)
		   (and max-concurrent-jobs
			(string->number max-concurrent-jobs)
			(not (>= num-running (string->number max-concurrent-jobs)))))
		 (run-one-test db run-id test-name keyvallst)
	       (print "WARNING: Max running jobs exceeded, current number running: " num-running 
		      ", max_concurrent_jobs: \"" max-concurrent-jobs "\""))))


       test-names)
      ;; (run-waiting-tests db)
      (if (args:get-arg "-keepgoing")
	  (let ((estrem (db:estimated-tests-remaining db run-id)))
	    (if (> estrem 0)
		(begin
		  (print "Keep going, estimated " estrem " tests remaining to run, will continue in 10 seconds ...")







>
>
>
>
>
>
>
>
>
>
>
>
>
>














<
<
<
|
<
<
<

<
<
>
>







243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277



278



279


280
281
282
283
284
285
286
287
288
	 (res    '()))
    ;; (print "INFO: Looking at tests " (string-intersperse tests ","))
    (for-each (lambda (testpath)
		(if (file-exists? (conc testpath "/testconfig"))
		    (set! res (cons (last (string-split testpath "/")) res))))
	      tests)
    res))

(define (runs:can-run-more-tests db)
  (let ((num-running (db:get-count-tests-running db))
	(max-concurrent-jobs (config-lookup *configdat* "setup" "max_concurrent_jobs")))
    ;; (print "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
    (if (or (not max-concurrent-jobs)
	    (and max-concurrent-jobs
		 (string->number max-concurrent-jobs)
		 (not (>= num-running (string->number max-concurrent-jobs)))))
	#t
	(begin 
	  (print "WARNING: Max running jobs exceeded, current number running: " num-running 
			 ", max_concurrent_jobs: " max-concurrent-jobs)
	  #f))))

(define (run-tests db test-names)
  (let* ((keys        (db-get-keys db))
	 (keyvallst   (keys->vallist keys #t))
	 (run-id      (register-run db keys))) ;;  test-name)))
    ;; on the first pass or call to run-tests set FAILS to NOT_STARTED if
    ;; -keepgoing is specified
    (if (and (eq? *passnum* 0)
	     (args:get-arg "-keepgoing"))
	(db:set-tests-state-status db run-id test-names #f "FAIL" "NOT_STARTED" "FAIL"))
    (set! *passnum* (+ *passnum* 1))
    (let loop ((numtimes 0))
      (for-each 
       (lambda (test-name)



	 (if (runs:can-run-more-tests db)



		 (run-one-test db run-id test-name keyvallst)


	     ;; add some delay 
	     (sleep 2)))
       test-names)
      ;; (run-waiting-tests db)
      (if (args:get-arg "-keepgoing")
	  (let ((estrem (db:estimated-tests-remaining db run-id)))
	    (if (> estrem 0)
		(begin
		  (print "Keep going, estimated " estrem " tests remaining to run, will continue in 10 seconds ...")
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
		   (testdat   #f)
		   (num-running (db:get-count-tests-running db))
		   (max-concurrent-jobs (config-lookup *configdat* "setup" "max_concurrent_jobs"))
		   (parent-test (and (not (null? items))(equal? item-path "")))
		   (single-test (and (null? items) (equal? item-path "")))
		   (item-test   (not (equal? item-path ""))))
	      ;; (print "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
	      (if (not (or (not max-concurrent-jobs)
			   (and max-concurrent-jobs
				(string->number max-concurrent-jobs)
				(not (>= num-running (string->number max-concurrent-jobs))))))
		  (print "WARNING: Max running jobs exceeded, current number running: " num-running 
			 ", max_concurrent_jobs: " max-concurrent-jobs)
		  (begin
		    (let loop2 ((ts (db:get-test-info db run-id test-name item-path)) ;; #f)
				(ct 0))
		      (if (and (not ts)
			       (< ct 10))
			  (begin
			    (register-test db run-id test-name item-path)







|
<
<
<
<
<







330
331
332
333
334
335
336
337





338
339
340
341
342
343
344
		   (testdat   #f)
		   (num-running (db:get-count-tests-running db))
		   (max-concurrent-jobs (config-lookup *configdat* "setup" "max_concurrent_jobs"))
		   (parent-test (and (not (null? items))(equal? item-path "")))
		   (single-test (and (null? items) (equal? item-path "")))
		   (item-test   (not (equal? item-path ""))))
	      ;; (print "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
	      (if (runs:can-run-more-tests db)





		  (begin
		    (let loop2 ((ts (db:get-test-info db run-id test-name item-path)) ;; #f)
				(ct 0))
		      (if (and (not ts)
			       (< ct 10))
			  (begin
			    (register-test db run-id test-name item-path)
422
423
424
425
426
427
428



429
430
431
432
433
434
435

436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
	(last-try-time      (current-seconds))
	(times              (list 1))) ;; minutes to wait before trying again to kick off runs
    ;; BUG this hack of brute force retrying works quite well for many cases but 
    ;;     what is needed is to check the db for tests that have failed less than
    ;;     N times or never been started and kick them off again
    (let loop ((waiting-test-names (hash-table-keys *waiting-queue*)))
      (cond



       ((null? waiting-test-names)
	(print "All tests launched"))
       ((> numtries 4)
	(print "NOTE: Tried launching four times, perhaps run megatest again in a few minutes"))
       (else
	(set! numtries (+ numtries 1))
	(for-each (lambda (testname)

		    (let* ((testdat (hash-table-ref *waiting-queue* testname))
			   (prereqs ((car testdat)))
			   (ldb     (if db db (open-db))))
		      ;; (print "prereqs remaining: " prereqs)
		      (if (null? prereqs)
			  (begin
			    (print "Prerequisites met, launching " testname)
			    ((cadr testdat))
			    (hash-table-delete! *waiting-queue* testname)))
		      (if (not db)
			  (sqlite3:finalize! ldb))))
		  waiting-test-names)
	;; (sleep 10) ;; no point in rushing things at this stage?
	(loop (hash-table-keys *waiting-queue*)))))))

(define (get-dir-up-one dir) 
  (let ((dparts  (string-split dir "/")))
    (conc "/" (string-intersperse 







>
>
>


<
<



>










|







425
426
427
428
429
430
431
432
433
434
435
436


437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
	(last-try-time      (current-seconds))
	(times              (list 1))) ;; minutes to wait before trying again to kick off runs
    ;; BUG this hack of brute force retrying works quite well for many cases but 
    ;;     what is needed is to check the db for tests that have failed less than
    ;;     N times or never been started and kick them off again
    (let loop ((waiting-test-names (hash-table-keys *waiting-queue*)))
      (cond
       ((not (runs:can-run-more-tests db))
	(sleep 2)
	(loop waiting-test-names))
       ((null? waiting-test-names)
	(print "All tests launched"))


       (else
	(set! numtries (+ numtries 1))
	(for-each (lambda (testname)
		    (if (runs:can-run-more-tests db)
		    (let* ((testdat (hash-table-ref *waiting-queue* testname))
			   (prereqs ((car testdat)))
			   (ldb     (if db db (open-db))))
		      ;; (print "prereqs remaining: " prereqs)
		      (if (null? prereqs)
			  (begin
			    (print "Prerequisites met, launching " testname)
			    ((cadr testdat))
			    (hash-table-delete! *waiting-queue* testname)))
		      (if (not db)
			      (sqlite3:finalize! ldb)))))
		  waiting-test-names)
	;; (sleep 10) ;; no point in rushing things at this stage?
	(loop (hash-table-keys *waiting-queue*)))))))

(define (get-dir-up-one dir) 
  (let ((dparts  (string-split dir "/")))
    (conc "/" (string-intersperse 

Modified tests/megatest.config from [7cce1c9833] to [b1ab6001d9].

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
[fields]
sysname TEXT
fsname TEXT
datapath TEXT

[setup]
# exectutable /path/to/megatest
max_concurrent_jobs 8
runsdir /tmp/runs

[jobtools]
# ## launcher launches jobs, the job is managed on the target host
## by megatest, comment out launcher to run local
# workhosts localhost hermes
launcher nbfake







|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
[fields]
sysname TEXT
fsname TEXT
datapath TEXT

[setup]
# exectutable /path/to/megatest
max_concurrent_jobs 4
runsdir /tmp/runs

[jobtools]
# ## launcher launches jobs, the job is managed on the target host
## by megatest, comment out launcher to run local
# workhosts localhost hermes
launcher nbfake