Megatest

Check-in [da9a8edadb]
Login
Overview
Comment:Tweaked delays to reduce db load. Shrunk maxretries to match
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | v1.507
Files: files | file ages | folders
SHA1: da9a8edadbcab047c17cc513e1e0e381115cdcd1
User & Date: mrwellan on 2012-10-17 20:11:28
Other Links: manifest | tags
Context
2012-10-18
16:39
Cherry picked the test matching (test vs. test/) fix to trunk check-in: 517fde8bdf user: mrwellan tags: trunk, v1.508
14:56
Added fine grained control over debug:print printing and added speciallized printer debug:print-info check-in: b1669bc3d1 user: mrwellan tags: enhanced-debug
2012-10-17
20:11
Tweaked delays to reduce db load. Shrunk maxretries to match check-in: da9a8edadb user: mrwellan tags: trunk, v1.507
15:21
Another tweak to how -itempatt works with new matching. check-in: 273fe5758e user: mrwellan tags: trunk, v1.505
Changes

Modified db.scm from [42a639cdea] to [a4f8dba59f].

56
57
58
59
60
61
62
63
64
65
66
67
68
69
70

(define (open-db) ;;  (conc *toppath* "/megatest.db") (car *configinfo*)))
  (let* ((dbpath    (conc *toppath* "/megatest.db")) ;; fname)
	 (dbexists  (file-exists? dbpath))
	 (db        (sqlite3:open-database dbpath)) ;; (never-give-up-open-db dbpath))
	 (handler   (make-busy-timeout (if (args:get-arg "-override-timeout")
					   (string->number (args:get-arg "-override-timeout"))
					   136000)))) ;; 136000)))
    (debug:print 4 "INFO: dbpath=" dbpath)
    (sqlite3:set-busy-handler! db handler)
    (if (not dbexists)
	(db:initialize db))
    (db:set-sync db)
    db))








|







56
57
58
59
60
61
62
63
64
65
66
67
68
69
70

(define (open-db) ;;  (conc *toppath* "/megatest.db") (car *configinfo*)))
  (let* ((dbpath    (conc *toppath* "/megatest.db")) ;; fname)
	 (dbexists  (file-exists? dbpath))
	 (db        (sqlite3:open-database dbpath)) ;; (never-give-up-open-db dbpath))
	 (handler   (make-busy-timeout (if (args:get-arg "-override-timeout")
					   (string->number (args:get-arg "-override-timeout"))
					   136000)))) ;; 136000))) ;; 136000 = 2.2 minutes
    (debug:print 4 "INFO: dbpath=" dbpath)
    (sqlite3:set-busy-handler! db handler)
    (if (not dbexists)
	(db:initialize db))
    (db:set-sync db)
    db))

Modified runs.scm from [b4bca3f412] to [8169338d90].

407
408
409
410
411
412
413
414
415

416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
		       (and max-concurrent-jobs (> (- max-concurrent-jobs num-running) 5)))
		  (debug:print 4 "INFO: Pre-registering test " test-name "/" item-path " to create placeholder" )
		  (open-run-close db:tests-register-test #f run-id test-name item-path)
		  (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) #t)
		  (thread-sleep! *global-delta*)
		  (loop (car newtal)(cdr newtal) reruns))
		 ((not have-resources) ;; simply try again after waiting a second
		  (thread-sleep! (+ 1 *global-delta*))
		  (debug:print 1 "INFO: no resources to run new tests, waiting ...")

		  ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
		  (loop (car newtal)(cdr newtal) reruns))
		 ((and have-resources
		       (or (null? prereqs-not-met)
			   (and (eq? testmode 'toplevel)
				(null? non-completed))))
		  (run:test run-id runname keyvallst test-record flags #f)
		  (thread-sleep! *global-delta*)
		  (if (not (null? tal))
		      (loop (car tal)(cdr tal) reruns)))
		 (else ;; must be we have unmet prerequisites
		    (debug:print 4 "FAILS: " fails)
		    ;; If one or more of the prereqs-not-met are FAIL then we can issue
		    ;; a message and drop hed from the items to be processed.
		    (if (null? fails)
			(begin
			  ;; couldn't run, take a breather
			  (debug:print 4 "INFO: Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...")
			  (thread-sleep! (+ 1 *global-delta*)) ;; long sleep here - no resources, may as well be patient
			  ;; we made new tal by sticking hed at the back of the list
			  (loop (car newtal)(cdr newtal) reruns))
			;; the waiton is FAIL so no point in trying to run hed ever again
			(if (not (null? tal))
			    (if (vector? hed)
				(begin (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed)
						    " from the launch list as it has prerequistes that are FAIL")
				       (thread-sleep! *global-delta*)
				       (loop (car tal)(cdr tal) (cons hed reruns)))
				(begin
				  (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;;  " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)")
				  (thread-sleep! *global-delta*)
				  (loop hed tal reruns))))))))) ;; END OF INNER COND
	     
	     ;; case where an items came in as a list been processed
	     ((and (list? items)     ;; thus we know our items are already calculated
		   (not   itemdat)) ;; and not yet expanded into the list of things to be done
	      (if (and (>= *verbosity* 1)
		       (> (length items) 0)







<

>


















|











|







407
408
409
410
411
412
413

414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
		       (and max-concurrent-jobs (> (- max-concurrent-jobs num-running) 5)))
		  (debug:print 4 "INFO: Pre-registering test " test-name "/" item-path " to create placeholder" )
		  (open-run-close db:tests-register-test #f run-id test-name item-path)
		  (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) #t)
		  (thread-sleep! *global-delta*)
		  (loop (car newtal)(cdr newtal) reruns))
		 ((not have-resources) ;; simply try again after waiting a second

		  (debug:print 1 "INFO: no resources to run new tests, waiting ...")
		  (thread-sleep! (+ 0.01 *global-delta*))
		  ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
		  (loop (car newtal)(cdr newtal) reruns))
		 ((and have-resources
		       (or (null? prereqs-not-met)
			   (and (eq? testmode 'toplevel)
				(null? non-completed))))
		  (run:test run-id runname keyvallst test-record flags #f)
		  (thread-sleep! *global-delta*)
		  (if (not (null? tal))
		      (loop (car tal)(cdr tal) reruns)))
		 (else ;; must be we have unmet prerequisites
		    (debug:print 4 "FAILS: " fails)
		    ;; If one or more of the prereqs-not-met are FAIL then we can issue
		    ;; a message and drop hed from the items to be processed.
		    (if (null? fails)
			(begin
			  ;; couldn't run, take a breather
			  (debug:print 4 "INFO: Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...")
			  (thread-sleep! (+ 0.01 *global-delta*)) ;; long sleep here - no resources, may as well be patient
			  ;; we made new tal by sticking hed at the back of the list
			  (loop (car newtal)(cdr newtal) reruns))
			;; the waiton is FAIL so no point in trying to run hed ever again
			(if (not (null? tal))
			    (if (vector? hed)
				(begin (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed)
						    " from the launch list as it has prerequistes that are FAIL")
				       (thread-sleep! *global-delta*)
				       (loop (car tal)(cdr tal) (cons hed reruns)))
				(begin
				  (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;;  " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)")
				  (thread-sleep! (+ 0.01 *global-delta*))
				  (loop hed tal reruns))))))))) ;; END OF INNER COND
	     
	     ;; case where an items came in as a list been processed
	     ((and (list? items)     ;; thus we know our items are already calculated
		   (not   itemdat)) ;; and not yet expanded into the list of things to be done
	      (if (and (>= *verbosity* 1)
		       (> (length items) 0)
465
466
467
468
469
470
471
472
473

474
475
476
477
478
479
480
			 (tests:testqueue-set-itemdat!   new-test-record my-itemdat)
			 (tests:testqueue-set-item_path! new-test-record my-item-path)
			 (hash-table-set! test-records newtestname new-test-record)
			 (set! tal (cons newtestname tal)))))) ;; since these are itemized create new test names testname/itempath
	       items)
	      (if (not (null? tal))
		  (begin
		    (thread-sleep! *global-delta*)
		    (debug:print 4 "INFO: End of items list, looping with next")

		    (loop (car tal)(cdr tal) reruns))))

	     ;; if items is a proc then need to run items:get-items-from-config, get the list and loop 
	     ;;    - but only do that if resources exist to kick off the job
	     ((or (procedure? items)(eq? items 'have-procedure))
	      (let ((can-run-more    (open-run-close runs:can-run-more-tests #f test-record)))
		(if can-run-more







<
|
>







465
466
467
468
469
470
471

472
473
474
475
476
477
478
479
480
			 (tests:testqueue-set-itemdat!   new-test-record my-itemdat)
			 (tests:testqueue-set-item_path! new-test-record my-item-path)
			 (hash-table-set! test-records newtestname new-test-record)
			 (set! tal (cons newtestname tal)))))) ;; since these are itemized create new test names testname/itempath
	       items)
	      (if (not (null? tal))
		  (begin

		    (debug:print 4 "INFO: End of items list, looping with next after short delay")
		    (thread-sleep! (+ 0.01 *global-delta*))
		    (loop (car tal)(cdr tal) reruns))))

	     ;; if items is a proc then need to run items:get-items-from-config, get the list and loop 
	     ;;    - but only do that if resources exist to kick off the job
	     ((or (procedure? items)(eq? items 'have-procedure))
	      (let ((can-run-more    (open-run-close runs:can-run-more-tests #f test-record)))
		(if can-run-more
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519




520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
				   "\n testmode:        " testmode
				   "\n num-retries:     " num-retries
				   "\n (eq? testmode 'toplevel): " (eq? testmode 'toplevel)
				   "\n (null? non-completed):    " (null? non-completed)
				   "\n reruns:          " reruns
				   "\n items:           " items
				   "\n can-run-more:    " can-run-more)

		      (cond ;; INNER COND #2
		       ((or (null? prereqs-not-met) ;; all prereqs met, fire off the test
			    ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch
			    (and (eq? testmode 'toplevel)
				 (null? non-completed)))
			(let ((test-name (tests:testqueue-get-testname test-record)))
			  (setenv "MT_TEST_NAME" test-name) ;; 
			  (setenv "MT_RUNNAME"   runname)
			  (open-run-close set-megatest-env-vars #f run-id) ;; these may be needed by the launching process
			  (let ((items-list (items:get-items-from-config tconfig)))
			    (if (list? items-list)
				(begin
				  (tests:testqueue-set-items! test-record items-list)
				  (thread-sleep! *global-delta*)
				  (loop hed tal reruns))
				(begin
				  (debug:print 0 "ERROR: The proc from reading the setup did not yield a list - please report this")
				  (exit 1))))))
		       ((null? fails)
			(debug:print 4 "INFO: fails is null, moving on in the queue but keeping " hed " for now")
			(thread-sleep! *global-delta*)
			;; only increment num-retries when there are no tests runing
			(if (eq? 0 (list-ref can-run-more 1))




			    (set! num-retries (+ num-retries 1)))
			(if (> num-retries  max-retries)
			    (if (not (null? tal))
				(loop (car tal)(cdr tal) reruns))
			    (loop (car newtal)(cdr newtal) reruns))) ;; an issue with prereqs not yet met?
		       ((and (not (null? fails))(eq? testmode 'normal))
			(debug:print 1 "INFO: test "  hed " (mode=" testmode ") has failed prerequisite(s); "
				     (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ")
				     ", removing it from to-do list")
			(if (not (null? tal))
			    (begin
			      (thread-sleep! *global-delta*)
			      (loop (car tal)(cdr tal)(cons hed reruns)))))
		       (else
			(debug:print 8 "ERROR: No handler for this condition.")
			(thread-sleep! *global-delta*)
			(loop (car newtal)(cdr newtal) reruns)))) ;; END OF IF CAN RUN MORE

		    ;; if can't run more just loop with next possible test
		    (begin
		      (debug:print 4 "INFO: processing the case with a lambda for items or 'have-procedure. Moving through the queue without dropping " hed)
		      (thread-sleep! (+ 1 *global-delta*))
		      (loop (car newtal)(cdr newtal) reruns))))) ;; END OF (or (procedure? items)(eq? items 'have-procedure))
	     
	     ;; this case should not happen, added to help catch any bugs
	     ((and (list? items) itemdat)
	      (debug:print 0 "ERROR: Should not have a list of items in a test and the itemspath set - please report this")
	      (exit 1))
	     ((not (null? reruns))
	      (let* ((newlst (open-run-close tests:filter-non-runnable #f run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED,
		     (junked (lset-difference equal? tal newlst)))
		(debug:print 4 "INFO: full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal)
		(if (< num-retries max-retries)
		    (set! newlst (append reruns newlst)))
		(set! num-retries (+ num-retries 1))
		(thread-sleep! *global-delta*)
		(if (not (null? newlst))
		    ;; since reruns have been tacked on to newlst create new reruns from junked
		    (loop (car newlst)(cdr newlst)(delete-duplicates junked)))))
	     ((not (null? tal))
	      (debug:print 4 "INFO: I'm pretty sure I shouldn't get here."))
	     (else
	      (debug:print 4 "INFO: Exiting loop with...\n  hed=" hed "\n  tal=" tal "\n  reruns=" reruns))







|




















<


>
>
>
>
|














|



















|







489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516

517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
				   "\n testmode:        " testmode
				   "\n num-retries:     " num-retries
				   "\n (eq? testmode 'toplevel): " (eq? testmode 'toplevel)
				   "\n (null? non-completed):    " (null? non-completed)
				   "\n reruns:          " reruns
				   "\n items:           " items
				   "\n can-run-more:    " can-run-more)
		      ;; (thread-sleep! (+ 0.01 *global-delta*))
		      (cond ;; INNER COND #2
		       ((or (null? prereqs-not-met) ;; all prereqs met, fire off the test
			    ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch
			    (and (eq? testmode 'toplevel)
				 (null? non-completed)))
			(let ((test-name (tests:testqueue-get-testname test-record)))
			  (setenv "MT_TEST_NAME" test-name) ;; 
			  (setenv "MT_RUNNAME"   runname)
			  (open-run-close set-megatest-env-vars #f run-id) ;; these may be needed by the launching process
			  (let ((items-list (items:get-items-from-config tconfig)))
			    (if (list? items-list)
				(begin
				  (tests:testqueue-set-items! test-record items-list)
				  (thread-sleep! *global-delta*)
				  (loop hed tal reruns))
				(begin
				  (debug:print 0 "ERROR: The proc from reading the setup did not yield a list - please report this")
				  (exit 1))))))
		       ((null? fails)
			(debug:print 4 "INFO: fails is null, moving on in the queue but keeping " hed " for now")

			;; only increment num-retries when there are no tests runing
			(if (eq? 0 (list-ref can-run-more 1))
			    (begin
			      (if (> num-retries 100) ;; first 100 retries are low time cost
				  (thread-sleep! (+ 2 *global-delta*))
				  (thread-sleep! (+ 0.01 *global-delta*)))
			      (set! num-retries (+ num-retries 1))))
			(if (> num-retries  max-retries)
			    (if (not (null? tal))
				(loop (car tal)(cdr tal) reruns))
			    (loop (car newtal)(cdr newtal) reruns))) ;; an issue with prereqs not yet met?
		       ((and (not (null? fails))(eq? testmode 'normal))
			(debug:print 1 "INFO: test "  hed " (mode=" testmode ") has failed prerequisite(s); "
				     (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ")
				     ", removing it from to-do list")
			(if (not (null? tal))
			    (begin
			      (thread-sleep! *global-delta*)
			      (loop (car tal)(cdr tal)(cons hed reruns)))))
		       (else
			(debug:print 8 "ERROR: No handler for this condition.")
			(thread-sleep! (+ 1 *global-delta*))
			(loop (car newtal)(cdr newtal) reruns)))) ;; END OF IF CAN RUN MORE

		    ;; if can't run more just loop with next possible test
		    (begin
		      (debug:print 4 "INFO: processing the case with a lambda for items or 'have-procedure. Moving through the queue without dropping " hed)
		      (thread-sleep! (+ 1 *global-delta*))
		      (loop (car newtal)(cdr newtal) reruns))))) ;; END OF (or (procedure? items)(eq? items 'have-procedure))
	     
	     ;; this case should not happen, added to help catch any bugs
	     ((and (list? items) itemdat)
	      (debug:print 0 "ERROR: Should not have a list of items in a test and the itemspath set - please report this")
	      (exit 1))
	     ((not (null? reruns))
	      (let* ((newlst (open-run-close tests:filter-non-runnable #f run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED,
		     (junked (lset-difference equal? tal newlst)))
		(debug:print 4 "INFO: full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal)
		(if (< num-retries max-retries)
		    (set! newlst (append reruns newlst)))
		(set! num-retries (+ num-retries 1))
		(thread-sleep! (+ 1 *global-delta*))
		(if (not (null? newlst))
		    ;; since reruns have been tacked on to newlst create new reruns from junked
		    (loop (car newlst)(cdr newlst)(delete-duplicates junked)))))
	     ((not (null? tal))
	      (debug:print 4 "INFO: I'm pretty sure I shouldn't get here."))
	     (else
	      (debug:print 4 "INFO: Exiting loop with...\n  hed=" hed "\n  tal=" tal "\n  reruns=" reruns))

Modified tests/fullrun/megatest.config from [359ec4dac2] to [5d94f21968].

13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
[setup]
# FULL or 2, NORMAL or 1, OFF or 0
synchronous OFF
# Throttle roughly scales the db access milliseconds to seconds delay
throttle 0.2
# Max retries allows megatest to re-check that a tests status has changed
# as tests can have transient FAIL status occasionally
maxretries 500

[validvalues]
state start end 
status pass fail n/a 0 1 running

# These are set before all tests, override them 
# in the testconfig [pre-launch-env-overrides] section







|







13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
[setup]
# FULL or 2, NORMAL or 1, OFF or 0
synchronous OFF
# Throttle roughly scales the db access milliseconds to seconds delay
throttle 0.2
# Max retries allows megatest to re-check that a tests status has changed
# as tests can have transient FAIL status occasionally
maxretries 20

[validvalues]
state start end 
status pass fail n/a 0 1 running

# These are set before all tests, override them 
# in the testconfig [pre-launch-env-overrides] section