Megatest

Diff
Login

Differences From Artifact [31523ae98e]:

To Artifact [396462afab]:


64
65
66
67
68
69
70

71
72
73
74
75
76
77
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78







+







    (for-each (lambda (keyval)
		(setenv (car keyval)(cadr keyval)))
	      keyvals)
    ;; Set up various and sundry known vars here
    (setenv "MT_RUN_AREA_HOME" toppath)
    (setenv "MT_RUNNAME" runname)
    (setenv "MT_TARGET"  target)
    (setenv "MT_TESTSUITENAME" (common:get-testsuite-name))
    (set! envdat (append 
		  envdat
		  (list (list "MT_RUN_AREA_HOME" toppath)
			(list "MT_RUNNAME"       runname)
			(list "MT_TARGET"        target))))
    ;; Now can read the runconfigs file
    ;; 
157
158
159
160
161
162
163
164
165
166



167
168
169
170
171
172
173
158
159
160
161
162
163
164



165
166
167
168
169
170
171
172
173
174







-
-
-
+
+
+







    (if (> (- currtime lasttime) waitval)
	(begin
	  (hash-table-set! *runs:denoise* key currtime)
	  #t)
	#f)))

(define (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)
  (thread-sleep! (cond
		  ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while
		  (else 0)))
  ;;(thread-sleep! (cond
  ;;      	  ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while
  ;;      	  (else 0)))
  (let* ((num-running             (rmt:get-count-tests-running run-id))
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (config-lookup *configdat* "jobgroups" jobgroup)))
				    (if (string? jobg-count)
					(string->number jobg-count)
					jobg-count))))
    (if (> (+ num-running num-running-in-jobgroup) 0)
211
212
213
214
215
216
217
218



219
220
221
222

223
224


225
226
227
228
229
230
231
232


233
234
235
236
237
238
239
212
213
214
215
216
217
218

219
220
221
222
223
224
225
226


227
228


229
230
231
232


233
234
235
236
237
238
239
240
241







-
+
+
+




+
-
-
+
+
-
-




-
-
+
+







	 (test-records       (make-hash-table))
	 ;; need to process runconfigs before generating these lists
	 (all-tests-registry #f)  ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names
	 (all-test-names     #f)  ;; (hash-table-keys all-tests-registry))
	 (test-names         #f)  ;; (tests:filter-test-names all-test-names test-patts))
	 (required-tests     #f)  ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work
	 (task-key           (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
	 (tasks-db           (tasks:open-db)))
	 (tdbdat             (tasks:open-db)))

    (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))

    (set-signal-handler! signal/int
			 (lambda (signum)
			   (signal-mask! signum)
			   (print "Received signal " signum ", cleaning up before exit. Please wait...")
			   (let ((tdb (tasks:open-db)))
			     (tasks:set-state-given-param-key tdb task-key "killed")
			   (let ((tdbdat (tasks:open-db)))
			     (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "killed"))
			     ;; (sqlite3:interrupt! tdb) ;; seems silly?
			     (sqlite3:finalize! tdb))
			   (print "Killed by signal " signum ". Exiting")
			   (exit)))

    ;; register this run in monitor.db
    (tasks:add tasks-db "run-tests" user target runname test-patts task-key) ;; params)
    (tasks:set-state-given-param-key tasks-db task-key "running")
    (tasks:add (db:delay-if-busy tdbdat) "run-tests" user target runname test-patts task-key) ;; params)
    (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "running")
    (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process
    (if (file-exists? runconfigf)
	(setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target)
	(debug:print 0 "WARNING: You do not have a run config file: " runconfigf))

    ;; Now generate all the tests lists
    (set! all-tests-registry (tests:get-all))
390
391
392
393
394
395
396
397
398



399
400
401
402
403
404
405
392
393
394
395
396
397
398


399
400
401
402
403
404
405
406
407
408







-
-
+
+
+







		  (if (not (hash-table-ref/default flags "-preclean" #f))
		      (hash-table-set! flags "-preclean" #t))
		  (if (not (hash-table-ref/default flags "-rerun" #f))
		      (hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS"))
		  (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))))
	  (debug:print-info 0 "No tests to run")))
    (debug:print-info 4 "All done by here")
    (tasks:set-state-given-param-key tasks-db task-key "done")
    (sqlite3:finalize! tasks-db)))
    (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "done")
    ;; (sqlite3:finalize! tasks-db)
    ))


;; loop logic. These are used in runs:run-tests-queue to make it a bit more readable.
;;
;; If reg not full and have items in tal then loop with (car tal)(cdr tal) reg reruns
;; If reg is full (i.e. length >= n
;;   loop with (car reg) tal (cdr reg) reruns
549
550
551
552
553
554
555
556

557
558
559


560
561
562
563
564
565
566
552
553
554
555
556
557
558

559
560
561
562
563
564
565
566
567
568
569
570
571







-
+



+
+







			(runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull)
			reruns)))
	      (list (car newtal)(append (cdr newtal) reg) '() reruns))))

     ((and (null? fails)
	   (null? prereq-fails)
	   (null? non-completed))
      (if  (runs:can-keep-running? hed 5)
      (if  (runs:can-keep-running? hed 20)
	  (begin
	    (runs:inc-cant-run-tests hed)
	    (debug:print-info 1 "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0))
	    ;; getting here likely means the system is way overloaded, kill a full minute before continuing
	    (thread-sleep! 60)
	    ;; num-retries code was here
	    ;; we use this opportunity to move contents of reg to tal
	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
	  (begin
	    (debug:print-info 1 "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
	    (let ((test-id (rmt:get-test-id run-id hed "")))
	      (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))
667
668
669
670
671
672
673
674
675
676
677






678
679
680



681
682
683
684
685




686
687

688
689
690
691
692
693
694
695
696
697
672
673
674
675
676
677
678




679
680
681
682
683
684



685
686
687





688
689
690
691


692



693
694
695
696
697
698
699







-
-
-
-
+
+
+
+
+
+
-
-
-
+
+
+
-
-
-
-
-
+
+
+
+
-
-
+
-
-
-







		reruns)
	  #f))
     
     ;; Register tests 
     ;;
     ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f))
      (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" )
      (if #t ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs
	  (begin
	    (rmt:general-call 'register-test run-id run-id test-name item-path)
	    (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done))
      ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs
      (let register-loop ((numtries 15))
	(rmt:general-call 'register-test run-id run-id test-name item-path)
	(thread-sleep! 0.5)
	(if (rmt:get-test-id run-id test-name item-path)
	    (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)
	  (let ((th (make-thread (lambda ()
				   (mutex-lock! registry-mutex)
				   (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'start)
	    (if (> numtries 0)
		(register-loop (- numtries 1))
		(debug:print 0 "ERROR: failed to register test " (runs:make-full-test-name test-name item-path)))))
				   (mutex-unlock! registry-mutex)
				   ;; If haven't done it before register a top level test if this is an itemized test
				   (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done))
				       (rmt:general-call 'register-test run-id run-id test-name ""))
				   (rmt:general-call 'register-test run-id run-id test-name item-path)
      (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done))
	  (begin
	    (rmt:general-call 'register-test run-id run-id test-name "")
	    (if (rmt:get-test-id run-id test-name "")
				   (mutex-lock! registry-mutex)
				   (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)
		(hash-table-set! test-registry (runs:make-full-test-name test-name "") 'done))))
				   (mutex-unlock! registry-mutex))
				 (conc test-name "/" item-path))))
	    (thread-start! th)))
      (runs:shrink-can-run-more-tests-count)   ;; DELAY TWEAKER (still needed?)
      (if (and (null? tal)(null? reg))
	  (list hed tal (append reg (list hed)) reruns)
	  (list (runs:queue-next-hed tal reg reglen regfull)
		(runs:queue-next-tal tal reg reglen regfull)
		;; NB// Here we are building reg as we register tests
		;; if regfull we must pop the front item off reg
888
889
890
891
892
893
894
895


896
897
898
899
900
901
902
890
891
892
893
894
895
896

897
898
899
900
901
902
903
904
905







-
+
+







	(max-retries           (config-lookup *configdat* "setup" "maxretries"))
	(max-concurrent-jobs   (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				 (if (and mcj (string->number mcj))
				     (string->number mcj)
				     1))) ;; length of the register queue ahead
	(reglen                (if (number? reglen-in) reglen-in 1))
	(last-time-incomplete  (- (current-seconds) 900)) ;; force at least one clean up cycle
	(last-time-some-running (current-seconds)))
	(last-time-some-running (current-seconds))
	(tdbdat                (tasks:open-db)))

    ;; Initialize the test-registery hash with tests that already have a record
    ;; convert state to symbol and use that as the hash value
    (for-each (lambda (trec)
		(let ((id (db:test-get-id        trec))
		      (tn (db:test-get-testname  trec))
		      (ip (db:test-get-item-path trec))
936
937
938
939
940
941
942





943

944
945
946
947
948
949
950
939
940
941
942
943
944
945
946
947
948
949
950

951
952
953
954
955
956
957
958







+
+
+
+
+
-
+







	     (items       (tests:testqueue-get-items      test-record))
	     (item-path   (item-list->path itemdat))
	     (tfullname   (runs:make-full-test-name test-name item-path))
	     (newtal      (append tal (list hed)))
	     (regfull     (>= (length reg) reglen))
	     (num-running (rmt:get-count-tests-running-for-run-id run-id)))

	;; every couple minutes verify the server is there for this run
	(if (and (common:low-noise-print 60 "try start server"  run-id)
		 (tasks:need-server run-id))
	    (tasks:start-and-wait-for-server tdbdat run-id 10))
	
      (if (> num-running 0)
	(if (> num-running 0)
	  (set! last-time-some-running (current-seconds)))

      (if (> (current-seconds)(+ last-time-some-running 240))
	  (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
	;; (debug:print 0 "max-tries-hash: " (hash-table->alist *max-tries-hash*))

	;; Ensure all top level tests get registered. This way they show up as "NOT_STARTED" on the dashboard
1114
1115
1116
1117
1118
1119
1120
1121

1122
1123
1124
1125
1126
1127
1128
1122
1123
1124
1125
1126
1127
1128

1129
1130
1131
1132
1133
1134
1135
1136







-
+







	    (if (> (current-seconds)(+ last-time-incomplete 900))
		(begin
		  (debug:print-info 0 "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name))
		  (set! last-time-incomplete (current-seconds))
		  (rmt:find-and-mark-incomplete run-id #f)))
	    (if (not (eq? num-running prev-num-running))
		(debug:print-info 0 "run-wait specified, waiting on " num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))
	    (thread-sleep! 15)
	    (thread-sleep! 5)
	    ;; (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running))))
	    (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running))))
    ;; LET* ((test-record
    ;; we get here on "drop through". All done!
    (debug:print-info 1 "All tests launched")))

(define (runs:calc-fails prereqs-not-met)
1394
1395
1396
1397
1398
1399
1400
1401

1402
1403
1404
1405
1406
1407
1408
1402
1403
1404
1405
1406
1407
1408

1409
1410
1411
1412
1413
1414
1415
1416







-
+







;;    'set-state-status
;;
;; NB// should pass in keys?
;;
(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(remove-data-only #f))
  (common:clear-caches) ;; clear all caches
  (let* ((db           #f)
	 (tasks-db     (tasks:open-db))
	 (tdbdat       (tasks:open-db))
	 (keys         (rmt:get-keys))
	 (rundat       (mt:get-runs-by-patt keys runnamepatt target))
	 (header       (vector-ref rundat 0))
	 (runs         (vector-ref rundat 1))
	 (states       (if state  (string-split state  ",") '()))
	 (statuses     (if status (string-split status ",") '()))
	 (state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f))))
1431
1432
1433
1434
1435
1436
1437

1438
1439
1440

1441
1442
1443

1444
1445
1446
1447
1448
1449
1450
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448

1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460







+


-
+



+







			       '()))
		(lasttpath "/does/not/exist/I/hope"))
	   (debug:print-info 4 "runs:operate-on run=" run ", header=" header)
	   (if (not (null? tests))
	       (begin
		 (case action
		   ((remove-runs)
		    (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
		    ;; seek and kill in flight -runtests with % as testpatt here
		    (if (equal? testpatt "%")
			(tasks:kill-runner tasks-db target run-name)
			(tasks:kill-runner (db:delay-if-busy tdbdat) target run-name)
			(debug:print 0 "not attempting to kill any run launcher processes as testpatt is " testpatt))
		    (debug:print 1 "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((set-state-status)
		    (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
		    (debug:print 1 "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((print-run)
		    (debug:print 1 "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
		    action)
		   ((run-wait)
		    (debug:print 1 "Waiting for run " runkey ", run=" runnamepatt " to complete"))
		   (else
1546
1547
1548
1549
1550
1551
1552
1553


1554
1555
1556
1557
1558
1559
1560
1556
1557
1558
1559
1560
1561
1562

1563
1564
1565
1566
1567
1568
1569
1570
1571







-
+
+







		       ;;    (if (null? (glob (conc runpath "/*")))
		       ;;        (begin
		       ;; 	 (debug:print 1 "Removing run dir " runpath)
		       ;; 	 (system (conc "rmdir -p " runpath))))
		       )))))
	 ))
     runs)
    (sqlite3:finalize! tasks-db))
    ;; (sqlite3:finalize! (db:delay-if-busy tdbdat))
    )
  #t)

(define (runs:remove-test-directory db test remove-data-only)
  (let* ((run-dir       (db:test-get-rundir test))    ;; run dir is from the link tree
	 (real-dir      (if (file-exists? run-dir)
			    (resolve-pathname run-dir)
			    #f)))