Megatest

Diff
Login

Differences From Artifact [de4f2b1394]:

To Artifact [ebf1e29df4]:


220
221
222
223
224
225
226
227

228
229
230
231
232
233
234
220
221
222
223
224
225
226

227
228
229
230
231
232
233
234







-
+







    (common:check-db-dir-and-exit-if-insufficient)
    
    ;; override the number of reruns from the configs
    (if (and config-reruns
	     (> run-count config-reruns))
	(set! run-count config-reruns))
    
    (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
    ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))

    (let ((sighand (lambda (signum)
		     ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting
		     (set! *time-to-exit* #t)
		     (print "Received signal " signum ", cleaning up before exit. Please wait...")
		     (let ((th1 (make-thread (lambda ()
					       (let ((tdbdat (tasks:open-db)))
721
722
723
724
725
726
727
728

729
730
731
732
733
734
735
721
722
723
724
725
726
727

728
729
730
731
732
733
734
735







-
+







					(debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met)
					'())))
	 (non-completed           (filter (lambda (x)             ;; remove hed from not completed list, duh, of course it is not completed!
					    (not (equal? x hed)))
					  (runs:calc-not-completed prereqs-not-met)))
	 (loop-list               (list hed tal reg reruns))
	 ;; configure the load runner
	 (numcpus                 (common:get-num-cpus))
	 (numcpus                 (common:get-num-cpus #f))
	 (maxload                 (string->number (or (configf:lookup *configdat* "jobtools" "maxload") "3")))
	 (waitdelay               (string->number (or (configf:lookup *configdat* "jobtools" "waitdelay") "60"))))
    (debug:print-info 4 *default-log-port* "have-resources: " have-resources " prereqs-not-met: (" 
		      (string-intersperse 
		       (map (lambda (t)
			      (if (vector? t)
				  (conc (db:test-get-state t) "/" (db:test-get-status t))
926
927
928
929
930
931
932
933

934
935
936
937
938
939
940
926
927
928
929
930
931
932

933
934
935
936
937
938
939
940







-
+







		     (else
		      (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60)
			  (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests and we've tried at least 10 times to run it. Giving up now."))
		      ;; (debug:print 0 *default-log-port* "         prereqs: " prereqs-not-met)
		      (hash-table-set! test-registry hed 'removed)
		      (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "TEN_STRIKES" #f)
		      ;; I'm unclear on if this roll up is needed - it may be the root cause of the "all set to FAIL" bug.
		      (rmt:roll-up-pass-fail-counts run-id test-name item-path #f "FAIL") ;; treat as FAIL
		      (rmt:roll-up-pass-fail-counts run-id test-name item-path #f "FAIL" #f) ;; treat as FAIL
		      (list (if (null? tal)(car newtal)(car tal))
			    tal
			    reg
			    reruns)))))
	      ;; can't drop this - maybe running? Just keep trying
	      (let ((runable-tests (runs:runable-tests prereqs-not-met)))
		(if (null? runable-tests)
1322
1323
1324
1325
1326
1327
1328
1329

1330
1331
1332
1333
1334
1335
1336
1322
1323
1324
1325
1326
1327
1328

1329
1330
1331
1332
1333
1334
1335
1336







-
+







	  (debug:print-info 4 *default-log-port* "Exiting loop with...\n  hed=" hed "\n  tal=" tal "\n  reruns=" reruns))
	 )))
    ;; now *if* -run-wait we wait for all tests to be done
    ;; Now wait for any RUNNING tests to complete (if in run-wait mode)
    (thread-sleep! 5) ;; I think there is a race condition here. Let states/statuses settle
    (let wait-loop ((num-running      (rmt:get-count-tests-running-for-run-id run-id))
		    (prev-num-running 0))
      ;; (debug:print 0 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running)
      ;; (BB> "num-running=" num-running ", prev-num-running=" prev-num-running)
      (if (and (or (args:get-arg "-run-wait")
		   (equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
	       (> num-running 0))
	  (begin
	    ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
	    ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0))
	    (if (> (current-seconds)(+ last-time-incomplete 900))
1542
1543
1544
1545
1546
1547
1548
1549

1550
1551
1552
1553
1554
1555
1556
1542
1543
1544
1545
1546
1547
1548

1549
1550
1551
1552
1553
1554
1555
1556







-
+







		  ((and skip-check
			(configf:lookup test-conf "skip" "rundelay"))
		   ;; run-ids = #f means *all* runs
		   (let* ((numseconds      (common:hms-string->seconds (configf:lookup test-conf "skip" "rundelay")))
			  (running-tests   (rmt:get-tests-for-runs-mindata #f full-test-name '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED") '() #f))
			  (completed-tests (rmt:get-tests-for-runs-mindata #f full-test-name '("COMPLETED" "INCOMPLETE") '("PASS" "FAIL" "ABORT") #f)) ;; ironically INCOMPLETE is same as COMPLETED in this contex
			  (last-run-times  (map db:mintest-get-event_time completed-tests))
			  (time-since-last (- (current-seconds) (if (null? last-run-times) 0 (apply max last-run-times)))))
			  (time-since-last (- (current-seconds) (if (null? last-run-times) 0 (common:max last-run-times)))))
		     (if (or (not (null? running-tests)) ;; have to skip if test is running
			     (> numseconds time-since-last))
			 (set! skip-test (conc "Skipping due to previous test run less than " (configf:lookup test-conf "skip" "rundelay") " ago"))))))
		 
		 (if skip-test
		     (begin
		       (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test)
1667
1668
1669
1670
1671
1672
1673
1674

1675
1676
1677
1678
1679
1680
1681
1667
1668
1669
1670
1671
1672
1673

1674
1675
1676
1677
1678
1679
1680
1681







-
+







		(lasttpath "/does/not/exist/I/hope")
		(worker-thread #f))
	   (debug:print-info 4 *default-log-port* "runs:operate-on run=" run ", header=" header)
	   (if (not (null? tests))
	       (begin
		 (case action
		   ((remove-runs)
		    (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
		    ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
		    ;; seek and kill in flight -runtests with % as testpatt here
		    ;; (if (equal? testpatt "%")
		    (tasks:kill-runner target run-name testpatt)
		    ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt))
		    (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((set-state-status)
		    (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))