Megatest

Diff
Login

Differences From Artifact [2889bf8c3c]:

To Artifact [4c772a83cb]:


224
225
226
227
228
229
230
231
232
233
234
235
236
237
238



















239
240
241
242
243
244
245
246
247
248
249
250
251




252

253
254
255
256
257

258
259
260




261
262
263
264
265
266
267
224
225
226
227
228
229
230








231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266

267
268
269
270
271
272
273



274
275
276
277
278
279
280
281
282
283
284







-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+













+
+
+
+
-
+





+
-
-
-
+
+
+
+







	 (test-names         #f)  ;; (tests:filter-test-names all-test-names test-patts))
	 (required-tests     #f)  ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work
	 (task-key           (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
	 (tdbdat             (tasks:open-db area-dat)))

    (if (tasks:need-server run-id area-dat)(tasks:start-and-wait-for-server tdbdat run-id 10))

    (set-signal-handler! signal/int
			 (lambda (signum)
			   (signal-mask! signum)
			   (print "Received signal " signum ", cleaning up before exit. Please wait...")
			   (let ((tdbdat (tasks:open-db area-dat)))
			     (rmt:tasks-set-state-given-param-key task-key "killed"))
			   (print "Killed by signal " signum ". Exiting")
			   (exit)))
    (let ((sighand (lambda (signum)
		     ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting
		     (set! *time-to-exit* #t)
		     (print "Received signal " signum ", cleaning up before exit. Please wait...")
		     ;;   (let ((tdbdat (tasks:open-db area-dat)))
		     (let ((th1 (make-thread (lambda ()
					       (let ((tdbdat (tasks:open-db)))
						 (rmt:tasks-set-state-given-param-key task-key "killed"))
					       (print "Killed by signal " signum ". Exiting")
					       (exit))))
			   (th2 (make-thread (lambda ()
					       (thread-sleep! 3)
					       (debug:print 0 "Done")
					       (exit 4)))))
		       (thread-start! th2)
		       (thread-start! th1)
		       (thread-join! th2)))))
      (set-signal-handler! signal/int sighand)
      (set-signal-handler! signal/term sighand))

    ;; register this run in monitor.db
    (rmt:tasks-add "run-tests" user target runname test-patts task-key area-dat) ;; params)
    (rmt:tasks-set-state-given-param-key task-key "running" area-dat)
    (runs:set-megatest-env-vars run-id area-dat inkeys: keys inrunname: runname) ;; these may be needed by the launching process
    (if (file-exists? runconfigf)
	(setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target)
	(debug:print 0 "WARNING: You do not have a run config file: " runconfigf))

    ;; Now generate all the tests lists
    (set! all-tests-registry (tests:get-all area-dat))
    (set! all-test-names     (hash-table-keys all-tests-registry))
    (set! test-names         (tests:filter-test-names all-test-names test-patts))

    ;; I think seeding required-tests with all test-names makes sense but lack analysis to back that up.
    ;;
    ;; (set! required-tests     (lset-intersection equal? (string-split test-patts ",") all-test-names))
    (set! required-tests     (lset-intersection equal? (string-split test-patts ",") test-names))
    (set! required-tests     (lset-intersection equal? test-names all-test-names))
    
    ;; look up all tests matching the comma separated list of globs in
    ;; test-patts (using % as wildcard)

    ;; (set! test-names (delete-duplicates (tests:get-valid-tests toppath test-patts)))
    (debug:print-info 0 "tests search path: " (string-intersperse (tests:get-tests-search-path configdat area-dat " ")))
    (debug:print-info 0 "tests search path: " (tests:get-tests-search-path configdat area-dat))
    (debug:print-info 0 "all tests:  " (string-intersperse (sort all-test-names string<) " "))
    (debug:print-info 0 "test names: " (string-intersperse (sort test-names string<) " "))
    ;; (debug:print-info 0 "tests search path: " (string-intersperse (tests:get-tests-search-path *configdat*) " "))
    (debug:print-info 0 "all tests:         " (string-intersperse (sort all-test-names string<) " "))
    (debug:print-info 0 "test names:        " (string-intersperse (sort test-names string<) " "))
    (debug:print-info 0 "required tests:    " (string-intersperse (sort required-tests string<) " "))

    ;; on the first pass or call to run-tests set FAILS to NOT_STARTED if
    ;; -keepgoing is specified
    (if (eq? *passnum* 0)
	(begin
	  ;; Is this still necessary? I think not. Unreachable tests are marked as such and 
	  ;; should not cause problems here.
546
547
548
549
550
551
552
553

554
555
556
557
558
559
560
563
564
565
566
567
568
569

570
571
572
573
574
575
576
577







-
+







             ;;    		   (not (member x allinqueue)))
             ;;    		 prereqstrs))
	     (give-up    #f))

	;; We can get here when a prereq has not been run due to *it* having a prereq that failed.
	;; We need to use this to dequeue this item as CANNOTRUN
	;; 
	(if (member testmode '(toplevel))
	(if (member 'toplevel testmode) ;; '(toplevel)) ;; NOTE: this probably should be (member 'toplevel testmode)
	    (for-each (lambda (prereq)
			(if (eq? (hash-table-ref/default test-registry prereq 'justfine) 'CANNOTRUN)
			    (set! give-up #t)))
		      prereqstrs))

	(if (and give-up
		 (not (and (null? tal)(null? reg))))
654
655
656
657
658
659
660


661

662
663
664
665
666
667
668
669
670
671
672
673





674
675
676
677
678
679
680
671
672
673
674
675
676
677
678
679

680
681
682
683
684
685
686
687
688
689
690
691

692
693
694
695
696
697
698
699
700
701
702
703







+
+
-
+











-
+
+
+
+
+







	 (num-running             (list-ref run-limits-info 1))
	 (num-running-in-jobgroup (list-ref run-limits-info 2)) 
	 (max-concurrent-jobs     (list-ref run-limits-info 3))
	 (job-group-limit         (list-ref run-limits-info 4))
	 (prereqs-not-met         (rmt:get-prereqs-not-met run-id waitons item-path testmode area-dat itemmap: itemmap))
	 ;; (prereqs-not-met         (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
	 (fails                   (runs:calc-fails prereqs-not-met))
	 (non-completed           (filter (lambda (x)             ;; remove hed from not completed list, duh, of course it is not completed!
					    (not (equal? x hed)))
	 (non-completed           (runs:calc-not-completed prereqs-not-met))
					  (runs:calc-not-completed prereqs-not-met)))
	 (loop-list               (list hed tal reg reruns))
	 ;; configure the load runner
	 (numcpus                 (common:get-num-cpus))
	 (maxload                 (string->number (or (configf:lookup configdat "jobtools" "maxload") "3")))
	 (waitdelay               (string->number (or (configf:lookup configdat "jobtools" "waitdelay") "60"))))
    (debug:print-info 4 "have-resources: " have-resources " prereqs-not-met: (" 
		      (string-intersperse 
		       (map (lambda (t)
			      (if (vector? t)
				  (conc (db:test-get-state t) "/" (db:test-get-status t))
				  (conc " WARNING: t is not a vector=" t )))
			    prereqs-not-met) ", ") ") fails: " fails)
			    prereqs-not-met)
		       ", ") ") fails: " fails
		       "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f))
			    

    
    (if (and (not (null? prereqs-not-met))
	     (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
	(debug:print-info 2 "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", ")))

    ;; Don't know at this time if the test have been launched at some time in the past
    ;; i.e. is this a re-launch?
750
751
752
753
754
755
756
757

758
759
760
761
762
763
764
773
774
775
776
777
778
779

780
781
782
783
784
785
786
787







-
+







      ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
      (list (car newtal)(cdr newtal) reg reruns))
     
     ;; This is the final stage, everything is in place so launch the test
     ;;
     ((and have-resources
	   (or (null? prereqs-not-met)
	       (and (eq? testmode 'toplevel)
	       (and (member 'toplevel testmode) ;;  'toplevel)
		    (null? non-completed))))
      ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path))
      ;; we are going to reset all the counters for test retries by setting a new hash table
      ;; this means they will increment only when nothing can be run
      (set! *max-tries-hash* (make-hash-table))
      ;; well, first lets see if cpu load throttling is enabled. If so wait around until the
      ;; average cpu load is under the threshold before continuing
783
784
785
786
787
788
789
790


791
792
793
794
795
796
797
806
807
808
809
810
811
812

813
814
815
816
817
818
819
820
821







-
+
+







      ;; a message and drop hed from the items to be processed.
      ;; (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met)
      (if (and (not (null? prereqs-not-met))
	       (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
	  (debug:print-info 1 "waiting on tests; " (string-intersperse 
						    (runs:mixed-list-testname-and-testrec->list-of-strings 
						     prereqs-not-met) ", ")))
      (if (null? fails)
      (if (or (null? fails)
	      (member 'toplevel testmode))
	  (begin
	    ;; couldn't run, take a breather
	    (if  (runs:lownoise "Waiting for more work to do..." 60)
		 (debug:print-info 0 "Waiting for more work to do..."))
	    (thread-sleep! 1)
	    (list (car newtal)(cdr newtal) reg reruns))
	  ;; the waiton is FAIL so no point in trying to run hed ever again
855
856
857
858
859
860
861

862

863
864
865
866
867
868
869
879
880
881
882
883
884
885
886

887
888
889
890
891
892
893
894







+
-
+







				  reruns))))
		     (else
		      (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60)
			  (debug:print 0 "WARNING: test " hed " has FAILED prerequitests and we've tried at least 10 times to run it. Giving up now."))
		      ;; (debug:print 0 "         prereqs: " prereqs-not-met)
		      (hash-table-set! test-registry hed 'removed)
		      (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "TEN_STRIKES" #f)
		      ;; I'm unclear on if this roll up is needed - it may be the root cause of the "all set to FAIL" bug.
		      (mt:roll-up-pass-fail-counts run-id test-name item-path "FAIL") ;; treat as FAIL
		      (rmt:roll-up-pass-fail-counts run-id test-name item-path "FAIL") ;; treat as FAIL
		      (list (if (null? tal)(car newtal)(car tal))
			    tal
			    reg
			    reruns)))))
	      ;; can't drop this - maybe running? Just keep trying
	      (let ((runable-tests (runs:runable-tests prereqs-not-met)))
		(if (null? runable-tests)
973
974
975
976
977
978
979
980

981
982
983
984
985
986
987
998
999
1000
1001
1002
1003
1004

1005
1006
1007
1008
1009
1010
1011
1012







-
+







	(if (and (common:low-noise-print 60 "try start server"  run-id)
		 (tasks:need-server run-id area-dat))
	    (tasks:start-and-wait-for-server tdbdat run-id 10)) ;; NOTE: delay and wait is done under the hood
	
	(if (> num-running 0)
	  (set! last-time-some-running (current-seconds)))

      (if (> (current-seconds)(+ last-time-some-running 240))
      (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
	  (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
	;; (debug:print 0 "max-tries-hash: " (hash-table->alist *max-tries-hash*))

	;; Ensure all top level tests get registered. This way they show up as "NOT_STARTED" on the dashboard
	;; and it is clear they *should* have run but did not.
	(if (not (hash-table-ref/default test-registry (db:test-make-full-name test-name "") #f))
	    (begin
1345
1346
1347
1348
1349
1350
1351
1352














1353
1354
1355
1356
1357
1358
1359
1370
1371
1372
1373
1374
1375
1376

1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397







-
+
+
+
+
+
+
+
+
+
+
+
+
+
+







		   ;; run-ids = #f means *all* runs
		   (let ((running-tests (rmt:get-tests-for-runs-mindata #f full-test-name '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED") '() #f area-dat)))
		     (if (not (null? running-tests)) ;; have to skip 
			 (set! skip-test "Skipping due to previous tests running"))))
		  ((and skip-check
			(configf:lookup test-conf "skip" "fileexists"))
		   (if (file-exists? (configf:lookup test-conf "skip" "fileexists"))
		       (set! skip-test (conc "Skipping due to existance of file " (configf:lookup test-conf "skip" "fileexists"))))))
		       (set! skip-test (conc "Skipping due to existance of file " (configf:lookup test-conf "skip" "fileexists")))))

		  ((and skip-check
			(configf:lookup test-conf "skip" "rundelay"))
		   ;; run-ids = #f means *all* runs
		   (let* ((numseconds      (common:hms-string->seconds (configf:lookup test-conf "skip" "rundelay")))
			  (running-tests   (rmt:get-tests-for-runs-mindata #f full-test-name '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED") '() #f))
			  (completed-tests (rmt:get-tests-for-runs-mindata #f full-test-name '("COMPLETED") '("PASS" "FAIL" "ABORT") #f))
			  (last-run-times  (map db:mintest-get-event_time completed-tests))
			  (time-since-last (- (current-seconds) (if (null? last-run-times) 0 (apply max last-run-times)))))
		     (if (or (not (null? running-tests)) ;; have to skip if test is running
			     (> numseconds time-since-last))
			 (set! skip-test (conc "Skipping due to previous test run less than " (configf:lookup test-conf "skip" "rundelay") " ago"))))))
		 
		 (if skip-test
		     (begin
		       (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test)
		       (debug:print-info 1 "SKIPPING Test " full-test-name " due to " skip-test))
		     (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
			 (begin
			   (print "ERROR: Failed to launch the test. Exiting as soon as possible")
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587






1588
1589
1590
1591
1592
1593
1594
1595
1615
1616
1617
1618
1619
1620
1621




1622
1623
1624
1625
1626
1627

1628
1629
1630
1631
1632
1633
1634







-
-
-
-
+
+
+
+
+
+
-







				(debug:print-info 2 "still waiting, " (length tests) " tests still running")
				(thread-sleep! 10)
				(let ((new-tests (proc-get-tests run-id)))
				  (if (null? new-tests)
				      (debug:print-info 1 "Run completed according to zero tests matching provided criteria.")
				      (loop (car new-tests)(cdr new-tests)))))
			       ((archive)
				(if (not toplevel-with-children)
				    (case (string->symbol (args:get-arg "-archive"))
				      ((save save-remove keep-html)
				       (debug:print-info 0 "Estimating disk space usage for " test-fulln)
				(if (and run-dir (not toplevel-with-children))
				    (let ((ddir (conc run-dir "/")))
				      (case (string->symbol (args:get-arg "-archive"))
					((save save-remove keep-html)
					 (if (file-exists? ddir)
					     (debug:print-info 0 "Estimating disk space usage for " test-fulln ": " (common:get-disk-space-used ddir)))))))
				       (debug:print-info 0 "   " (common:get-disk-space-used (conc run-dir "/"))))))
				(if (not (null? tal))
				    (loop (car tal)(cdr tal))))
			       )))
		       )
		     (if worker-thread (thread-join! worker-thread))))))
	   ;; remove the run if zero tests remain
	   (if (eq? action 'remove-runs)