Megatest

Check-in [fdbfca4367]
Login
Overview
Comment:Added adaptive delay to prevent hammering the db when tests are running but none are ready for launch
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: fdbfca436760012a652611eac324264bce14cf34
User & Date: matt on 2013-01-30 22:01:38
Other Links: manifest | tags
Context
2013-01-30
22:04
Added mockup for cached writes check-in: 4309310007 user: mrwellan tags: trunk
22:01
Added adaptive delay to prevent hammering the db when tests are running but none are ready for launch check-in: fdbfca4367 user: matt tags: trunk
14:53
Added help for -list-targets, -list-disks and -list-db-targets check-in: 9fb0a7f622 user: mrwellan tags: trunk
Changes

Modified runs.scm from [fcf8110ccc] to [1837f1ac5c].

116
117
118
119
120
121
122




123

124
125




126
127
128
129
130
131
132





133
134
135
136
137
138
139
(define (set-item-env-vars itemdat)
  (for-each (lambda (item)
	      (debug:print 2 "setenv " (car item) " " (cadr item))
	      (setenv (car item) (cadr item)))
	    itemdat))

(define *last-num-running-tests* 0)




(define (runs:can-run-more-tests test-record)

  (let* ((tconfig                 (tests:testqueue-get-testconfig test-record))
	 (jobgroup                (config-lookup tconfig "requirements" "jobgroup"))




	 (num-running             (cdb:remote-run db:get-count-tests-running #f))
	 (num-running-in-jobgroup (cdb:remote-run db:get-count-tests-running-in-jobgroup #f jobgroup))
	 (max-concurrent-jobs     (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				    (if (and mcj (string->number mcj))
					(string->number mcj)
					1)))
	 (job-group-limit         (config-lookup *configdat* "jobgroups" jobgroup)))





    (if (not (eq? *last-num-running-tests* num-running))
	(begin
	  (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
	  (set! *last-num-running-tests* num-running)))
    (if (not (eq? 0 *globalexitstatus*))
	(list #f num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)
	(let ((can-not-run-more (cond







>
>
>
>

>


>
>
>
>







>
>
>
>
>







116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
(define (set-item-env-vars itemdat)
  (for-each (lambda (item)
	      (debug:print 2 "setenv " (car item) " " (cadr item))
	      (setenv (car item) (cadr item)))
	    itemdat))

(define *last-num-running-tests* 0)
(define *runs:can-run-more-tests-delay* 0)
(define (runs:shrink-can-run-more-tests-delay)
  (set! *runs:can-run-more-tests-delay* (/ *runs:can-run-more-tests-delay* 2)))

(define (runs:can-run-more-tests test-record)
  (thread-sleep! *runs:can-run-more-tests-delay*)
  (let* ((tconfig                 (tests:testqueue-get-testconfig test-record))
	 (jobgroup                (config-lookup tconfig "requirements" "jobgroup"))
	 ;; Heuristic fix. These are getting called too rapidly when jobs are running or stuck
	 ;; so we are going to increment a global delay by 0.1 seconds up to 10 seconds
	 ;; every time runs:can-run-more-tests is called.
	 ;; when a test is launched or other activity occurs divide the delay by 2
	 (num-running             (cdb:remote-run db:get-count-tests-running #f))
	 (num-running-in-jobgroup (cdb:remote-run db:get-count-tests-running-in-jobgroup #f jobgroup))
	 (max-concurrent-jobs     (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				    (if (and mcj (string->number mcj))
					(string->number mcj)
					1)))
	 (job-group-limit         (config-lookup *configdat* "jobgroups" jobgroup)))
    (if (and (> (+ num-running num-running-in-jobgroup) 0)
	     (< *runs:can-run-more-tests-delay* 10))
	(begin
	  (set! *runs:can-run-more-tests-delay* (+ *runs:can-run-more-tests-delay* 0.1))
	  (debug:print-info 14 "can-run-more-tests-delay: " *runs:can-run-more-tests-delay*)))
    (if (not (eq? *last-num-running-tests* num-running))
	(begin
	  (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
	  (set! *last-num-running-tests* num-running)))
    (if (not (eq? 0 *globalexitstatus*))
	(list #f num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)
	(let ((can-not-run-more (cond
414
415
416
417
418
419
420

421
422
423
424
425
426
427
428
429
430
431

432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450

451
452
453
454

455
456
457
458
459
460
461
		 ( ;; (and
		  (not (hash-table-ref/default test-registery (runs:make-full-test-name test-name item-path) #f))
		      ;; (and max-concurrent-jobs (> (- max-concurrent-jobs num-running) 5)))
		  (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" )
		  (open-run-close db:tests-register-test #f run-id test-name item-path)
		  (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) #t)
		  (thread-sleep! *global-delta*)

		  (loop (car newtal)(cdr newtal) reruns))
		 ((not have-resources) ;; simply try again after waiting a second
		  (debug:print-info 1 "no resources to run new tests, waiting ...")
		  (thread-sleep! (+ 2 *global-delta*))
		  ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
		  (loop (car newtal)(cdr newtal) reruns))
		 ((and have-resources
		       (or (null? prereqs-not-met)
			   (and (eq? testmode 'toplevel)
				(null? non-completed))))
		  (run:test run-id runname keyvallst test-record flags #f)

		  (thread-sleep! *global-delta*)
		  (if (not (null? tal))
		      (loop (car tal)(cdr tal) reruns)))
		 (else ;; must be we have unmet prerequisites
		    (debug:print 4 "FAILS: " fails)
		    ;; If one or more of the prereqs-not-met are FAIL then we can issue
		    ;; a message and drop hed from the items to be processed.
		    (if (null? fails)
			(begin
			  ;; couldn't run, take a breather
			  (debug:print-info 4 "Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...")
			  (thread-sleep! (+ 0.01 *global-delta*)) ;; long sleep here - no resources, may as well be patient
			  ;; we made new tal by sticking hed at the back of the list
			  (loop (car newtal)(cdr newtal) reruns))
			;; the waiton is FAIL so no point in trying to run hed ever again
			(if (not (null? tal))
			    (if (vector? hed)
				(begin (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed)
						    " from the launch list as it has prerequistes that are FAIL")

				       (thread-sleep! *global-delta*)
				       (loop (car tal)(cdr tal) (cons hed reruns)))
				(begin
				  (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;;  " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)")

				  (thread-sleep! (+ 0.01 *global-delta*))
				  (loop hed tal reruns))))))))) ;; END OF INNER COND
	     
	     ;; case where an items came in as a list been processed
	     ((and (list? items)     ;; thus we know our items are already calculated
		   (not   itemdat)) ;; and not yet expanded into the list of things to be done
	      (if (and (debug:debug-mode 1) ;; (>= *verbosity* 1)







>











>



















>




>







428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
		 ( ;; (and
		  (not (hash-table-ref/default test-registery (runs:make-full-test-name test-name item-path) #f))
		      ;; (and max-concurrent-jobs (> (- max-concurrent-jobs num-running) 5)))
		  (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" )
		  (open-run-close db:tests-register-test #f run-id test-name item-path)
		  (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) #t)
		  (thread-sleep! *global-delta*)
(runs:shrink-can-run-more-tests-delay)
		  (loop (car newtal)(cdr newtal) reruns))
		 ((not have-resources) ;; simply try again after waiting a second
		  (debug:print-info 1 "no resources to run new tests, waiting ...")
		  (thread-sleep! (+ 2 *global-delta*))
		  ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
		  (loop (car newtal)(cdr newtal) reruns))
		 ((and have-resources
		       (or (null? prereqs-not-met)
			   (and (eq? testmode 'toplevel)
				(null? non-completed))))
		  (run:test run-id runname keyvallst test-record flags #f)
(runs:shrink-can-run-more-tests-delay)
		  (thread-sleep! *global-delta*)
		  (if (not (null? tal))
		      (loop (car tal)(cdr tal) reruns)))
		 (else ;; must be we have unmet prerequisites
		    (debug:print 4 "FAILS: " fails)
		    ;; If one or more of the prereqs-not-met are FAIL then we can issue
		    ;; a message and drop hed from the items to be processed.
		    (if (null? fails)
			(begin
			  ;; couldn't run, take a breather
			  (debug:print-info 4 "Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...")
			  (thread-sleep! (+ 0.01 *global-delta*)) ;; long sleep here - no resources, may as well be patient
			  ;; we made new tal by sticking hed at the back of the list
			  (loop (car newtal)(cdr newtal) reruns))
			;; the waiton is FAIL so no point in trying to run hed ever again
			(if (not (null? tal))
			    (if (vector? hed)
				(begin (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed)
						    " from the launch list as it has prerequistes that are FAIL")
(runs:shrink-can-run-more-tests-delay)
				       (thread-sleep! *global-delta*)
				       (loop (car tal)(cdr tal) (cons hed reruns)))
				(begin
				  (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;;  " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)")
(runs:shrink-can-run-more-tests-delay)
				  (thread-sleep! (+ 0.01 *global-delta*))
				  (loop hed tal reruns))))))))) ;; END OF INNER COND
	     
	     ;; case where an items came in as a list been processed
	     ((and (list? items)     ;; thus we know our items are already calculated
		   (not   itemdat)) ;; and not yet expanded into the list of things to be done
	      (if (and (debug:debug-mode 1) ;; (>= *verbosity* 1)