Megatest

Check-in [0b57bca235]
Login
Overview
Comment:Speculative change to track tests that were waitons of a stuck or non-runnable test
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fix-chained-waiton
Files: files | file ages | folders
SHA1: 0b57bca23545dcaa64ffc9396cbf57d431237f72
User & Date: mrwellan on 2016-05-16 09:05:38
Other Links: branch diff | manifest | tags
Context
2016-06-21
04:45
Merged v1.61 into fix-chained-waiton check-in: 750fe28e23 user: matt tags: fix-chained-waiton
2016-05-16
09:05
Speculative change to track tests that were waitons of a stuck or non-runnable test check-in: 0b57bca235 user: mrwellan tags: fix-chained-waiton
2016-05-13
11:32
Merged latest from v1.61 check-in: 2fe9144186 user: mrwellan tags: fix-chained-waiton
Changes

Modified runs.scm from [29ad163503] to [3a5049e8c7].

159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
				 ((and job-group-limit
				       (>= num-running-in-jobgroup job-group-limit))
				  (if (runs:lownoise (conc "maxjobgroup " jobgroup) 60)
				      (debug:print 1 "WARNING: number of jobs " num-running-in-jobgroup 
						   " in jobgroup \"" jobgroup "\" exceeds limit of " job-group-limit))
				  #t)
				 (else #f))))
	  ;; lets use the debugger eh?
	  (debugger-start start: 15)
	  (debugger-trace-var "runs:can-run-more-tests" "")
	  (debugger-trace-var "can-not-run-more"         can-not-run-more)
	  (debugger-trace-var "num-running"              num-running)
	  (debugger-trace-var "num-running-in-jobgroup"  num-running-in-jobgroup)
	  (debugger-trace-var "job-group-limit"          job-group-limit)
	  (debugger-pauser)
	  (list (not can-not-run-more) num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)))))


;;  test-names: Comma separated patterns same as test-patts but used in selection 
;;              of tests to run. The item portions are not respected.
;;              FIXME: error out if /patt specified
;;            







|
|
|
|
|
|
|
|







159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
				 ((and job-group-limit
				       (>= num-running-in-jobgroup job-group-limit))
				  (if (runs:lownoise (conc "maxjobgroup " jobgroup) 60)
				      (debug:print 1 "WARNING: number of jobs " num-running-in-jobgroup 
						   " in jobgroup \"" jobgroup "\" exceeds limit of " job-group-limit))
				  #t)
				 (else #f))))
;;	  ;; lets use the debugger eh?
;;	  (debugger-start start: 15)
;;	  (debugger-trace-var "runs:can-run-more-tests" "")
;;	  (debugger-trace-var "can-not-run-more"         can-not-run-more)
;;	  (debugger-trace-var "num-running"              num-running)
;;	  (debugger-trace-var "num-running-in-jobgroup"  num-running-in-jobgroup)
;;	  (debugger-trace-var "job-group-limit"          job-group-limit)
;;	  (debugger-pauser)
	  (list (not can-not-run-more) num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)))))


;;  test-names: Comma separated patterns same as test-patts but used in selection 
;;              of tests to run. The item portions are not respected.
;;              FIXME: error out if /patt specified
;;            
399
400
401
402
403
404
405







406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
	     (delete-duplicates (append waitons waitors)))

            ;; remember deps
            (hash-table-set!
             test-deps
             hed
             (delete-duplicates (append waitons waitors (hash-table-ref/default test-deps hed '()))))








	    (let ((remtests (delete-duplicates (append waitons tal))))
	      (if (not (null? remtests))
		  (begin
		    ;; (debug:print-info 0 "Preprocessing continues for " (string-intersperse remtests ", "))
		    (loop (car remtests)(cdr remtests))))))))
        
    (if (not (null? required-tests))
	(debug:print-info 1 "Adding \"" (string-intersperse required-tests " ") "\" to the run queue"))
    ;; NOTE: these are all parent tests, items are not expanded yet.
    (debug:print-info 4 "test-records=" (hash-table->alist test-records))
    (let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
      (if (> (length (hash-table-keys test-records)) 0)
	  (let* ((keep-going        #t)
		 (run-queue-retries 5)
		 (th1        (make-thread (lambda ()
					    (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry))
					    ;; (handle-exceptions
					    ;;  exn
					    ;;  (begin
					    ;;    (print-call-chain (current-error-port))
					    ;;    (debug:print 0 "ERROR: failure in runs:run-tests-queue thread, error: " ((condition-property-accessor 'exn 'message) exn))
					    ;;    (if (> run-queue-retries 0)
					    ;; 	   (begin







>
>
>
>
>
>
>
















|







399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
	     (delete-duplicates (append waitons waitors)))

            ;; remember deps
            (hash-table-set!
             test-deps
             hed
             (delete-duplicates (append waitons waitors (hash-table-ref/default test-deps hed '()))))

	    ;; (print "INFO::: test-deps")
	    ;; (pp (hash-table->alist test-deps))
;; 	    (debugger-start start: 21)
;; 	    (debugger-trace-var "waiton processing" "")
;; 	    (debugger-trace-var "test-deps"     (hash-table->alist test-deps))
;; 	    (debugger-pauser)

	    (let ((remtests (delete-duplicates (append waitons tal))))
	      (if (not (null? remtests))
		  (begin
		    ;; (debug:print-info 0 "Preprocessing continues for " (string-intersperse remtests ", "))
		    (loop (car remtests)(cdr remtests))))))))
        
    (if (not (null? required-tests))
	(debug:print-info 1 "Adding \"" (string-intersperse required-tests " ") "\" to the run queue"))
    ;; NOTE: these are all parent tests, items are not expanded yet.
    (debug:print-info 4 "test-records=" (hash-table->alist test-records))
    (let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
      (if (> (length (hash-table-keys test-records)) 0)
	  (let* ((keep-going        #t)
		 (run-queue-retries 5)
		 (th1        (make-thread (lambda ()
					    (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry test-deps))
					    ;; (handle-exceptions
					    ;;  exn
					    ;;  (begin
					    ;;    (print-call-chain (current-error-port))
					    ;;    (debug:print 0 "ERROR: failure in runs:run-tests-queue thread, error: " ((condition-property-accessor 'exn 'message) exn))
					    ;;    (if (> run-queue-retries 0)
					    ;; 	   (begin
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
		      "\n testmode:        " testmode
		      "\n (member 'toplevel testmode): " (member 'toplevel testmode)
		      "\n (null? non-completed):    " (null? non-completed)
		      "\n reruns:          " reruns
		      "\n items:           " items
		      "\n can-run-more:    " can-run-more)

    ;; lets use the debugger eh?
    (debugger-start start: 2)
    (debugger-trace-var "runs:expand-items" "")
    (debugger-trace-var "can-run-more"     can-run-more)
    (debugger-trace-var "hed"              hed)
    (debugger-trace-var "prereqs-not-met"  (runs:pretty-string prereqs-not-met))
    (debugger-pauser)

    (cond
     ;; all prereqs met, fire off the test
     ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch

     ((and (not (member 'toplevel testmode))
	   (member (hash-table-ref/default test-registry (db:test-make-full-name hed item-path) 'n/a)







|
|
|
|
|
|
|







536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
		      "\n testmode:        " testmode
		      "\n (member 'toplevel testmode): " (member 'toplevel testmode)
		      "\n (null? non-completed):    " (null? non-completed)
		      "\n reruns:          " reruns
		      "\n items:           " items
		      "\n can-run-more:    " can-run-more)

;;     ;; lets use the debugger eh?
;;     (debugger-start start: 2)
;;     (debugger-trace-var "runs:expand-items" "")
;;     (debugger-trace-var "can-run-more"     can-run-more)
;;     (debugger-trace-var "hed"              hed)
;;     (debugger-trace-var "prereqs-not-met"  (runs:pretty-string prereqs-not-met))
;;     (debugger-pauser)

    (cond
     ;; all prereqs met, fire off the test
     ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch

     ((and (not (member 'toplevel testmode))
	   (member (hash-table-ref/default test-registry (db:test-make-full-name hed item-path) 'n/a)
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980

981
982
983
984
985
986
987

;; every time though the loop increment the test/itempatt val.
;; when the min is > max-allowed and none running then force exit
;;
(define *max-tries-hash* (make-hash-table))

;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... >
(define (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests reglen-in all-tests-registry)
  ;; At this point the list of parent tests is expanded 
  ;; NB// Should expand items here and then insert into the run queue.
  (debug:print 5 "test-records: " test-records ", flags: " (hash-table->alist flags))

  ;; Do mark-and-find clean up of db before starting runing of quue
  ;;
  ;; (rmt:find-and-mark-incomplete)

  (let ((run-info              (rmt:get-run-info run-id))
	(tests-info            (mt:get-tests-for-run run-id #f '() '())) ;;  qryvals: "id,testname,item_path"))
	(sorted-test-names     (tests:sort-by-priority-and-waiton test-records))
	(test-registry         (make-hash-table))

	(registry-mutex        (make-mutex))
	(num-retries           0)
	(max-retries           (config-lookup *configdat* "setup" "maxretries"))
	(max-concurrent-jobs   (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				 (if (and mcj (string->number mcj))
				     (string->number mcj)
				     1))) ;; length of the register queue ahead







|












>







968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995

;; every time though the loop increment the test/itempatt val.
;; when the min is > max-allowed and none running then force exit
;;
(define *max-tries-hash* (make-hash-table))

;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... >
(define (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests reglen-in all-tests-registry test-deps)
  ;; At this point the list of parent tests is expanded 
  ;; NB// Should expand items here and then insert into the run queue.
  (debug:print 5 "test-records: " test-records ", flags: " (hash-table->alist flags))

  ;; Do mark-and-find clean up of db before starting runing of quue
  ;;
  ;; (rmt:find-and-mark-incomplete)

  (let ((run-info              (rmt:get-run-info run-id))
	(tests-info            (mt:get-tests-for-run run-id #f '() '())) ;;  qryvals: "id,testname,item_path"))
	(sorted-test-names     (tests:sort-by-priority-and-waiton test-records))
	(test-registry         (make-hash-table))
	(no-can-run            (make-hash-table)) ;; test/test/patt => #t hash of tests that can not run
	(registry-mutex        (make-mutex))
	(num-retries           0)
	(max-retries           (config-lookup *configdat* "setup" "maxretries"))
	(max-concurrent-jobs   (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				 (if (and mcj (string->number mcj))
				     (string->number mcj)
				     1))) ;; length of the register queue ahead
1003
1004
1005
1006
1007
1008
1009


1010
1011
1012
1013
1014
1015
1016
    (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100))

    (let loop ((hed         (car sorted-test-names))
	       (tal         (cdr sorted-test-names))
	       (reg         '()) ;; registered, put these at the head of tal 
	       (reruns      '()))



      (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns))

      ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
      ;; moving this to a parallel thread and just run it once.
      ;;
      (if (> (current-seconds)(+ last-time-incomplete 900))
          (begin







>
>







1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
    (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100))

    (let loop ((hed         (car sorted-test-names))
	       (tal         (cdr sorted-test-names))
	       (reg         '()) ;; registered, put these at the head of tal 
	       (reruns      '()))

      (set! reruns '()) ;; force it to test impact!!
      
      (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns))

      ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
      ;; moving this to a parallel thread and just run it once.
      ;;
      (if (> (current-seconds)(+ last-time-incomplete 900))
          (begin
1086
1087
1088
1089
1090
1091
1092

1093
1094
1095

1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107

1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122





1123
1124
1125
1126
1127
1128
1129
		     "\n  reg:         " reg)

	;; lets use the debugger eh?
	(debugger-start start: 7)
	(debugger-trace-var "runs:run-tests-queue" "")
	(debugger-trace-var "hed"              hed)
	(debugger-trace-var "tal"              tal)

	(debugger-trace-var "items"            items)
	(debugger-trace-var "item-path"        item-path)
	(debugger-trace-var "waitons"          waitons) 

	(debugger-pauser)


	;; check for hed in waitons => this would be circular, remove it and issue an
	;; error
	(if (member test-name waitons)
	    (begin
	      (debug:print 0 "ERROR: test " test-name " has listed itself as a waiton, please correct this!")
	      (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))

	(cond 
	 

	 ;; We want to catch tests that have waitons that are NOT in the queue and discard them IFF 
	 ;; they have been through the wringer 10 or more times
	 ((and (list? waitons)
	       (not (null? waitons))
	       (> (hash-table-ref/default *max-tries-hash* tfullname 0) 10)
	       (not (null? (filter
			    number?
			    (map (lambda (waiton)
				   (if (and (not (member waiton tal))            ;; this waiton is not in the list to be tried to run
					    (not (member waiton reruns)))
				       1
				       #f))
				 waitons))))) ;; could do this more elegantly with a marker....
	  (debug:print 0 "WARNING: Marking test " tfullname " as not runnable. It is waiting on tests that cannot be run. Giving up now.")
	  (hash-table-set! test-registry tfullname 'removed))






	 ;; items is #f then the test is ok to be handed off to launch (but not before)
	 ;; 
	 ((not items)
	  (debug:print-info 4 "OUTER COND: (not items)")
	  (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
		   (not (null? tal)))







>


|
>











|
>














|
>
>
>
>
>







1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
		     "\n  reg:         " reg)

	;; lets use the debugger eh?
	(debugger-start start: 7)
	(debugger-trace-var "runs:run-tests-queue" "")
	(debugger-trace-var "hed"              hed)
	(debugger-trace-var "tal"              tal)
	(debugger-trace-var "reruns"           reruns)
	(debugger-trace-var "items"            items)
	(debugger-trace-var "item-path"        item-path)
	(debugger-trace-var "waitons"          waitons)
	(debugger-trace-var "no-can-run"       (hash-table->alist no-can-run))
	(debugger-pauser)


	;; check for hed in waitons => this would be circular, remove it and issue an
	;; error
	(if (member test-name waitons)
	    (begin
	      (debug:print 0 "ERROR: test " test-name " has listed itself as a waiton, please correct this!")
	      (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))

	(cond 

	 ;; hed, test-deps :: hed -> ( waitons )
	 ;; We want to catch tests that have waitons that are NOT in the queue and discard them IFF 
	 ;; they have been through the wringer 10 or more times
	 ((and (list? waitons)
	       (not (null? waitons))
	       (> (hash-table-ref/default *max-tries-hash* tfullname 0) 10)
	       (not (null? (filter
			    number?
			    (map (lambda (waiton)
				   (if (and (not (member waiton tal))            ;; this waiton is not in the list to be tried to run
					    (not (member waiton reruns)))
				       1
				       #f))
				 waitons))))) ;; could do this more elegantly with a marker....
	  (debug:print 0 "WARNING: Marking test " tfullname " as not runnable. It is waiting on tests that cannot be run. Giving up now.")
	  (hash-table-set! test-registry tfullname 'removed)
	  (hash-table-set! no-can-run tfullname #t)
	  (for-each
	   (lambda (waiton)
	     (hash-table-set! no-can-run waiton #t)) ;; NB// this does not account for itemmap and itemwait
	   waitons))

	 ;; items is #f then the test is ok to be handed off to launch (but not before)
	 ;; 
	 ((not items)
	  (debug:print-info 4 "OUTER COND: (not items)")
	  (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
		   (not (null? tal)))