Megatest

Check-in [8280550d67]
Login
Overview
Comment:trim some load control sleeps in runs.scm that are not needed as we have new load control ==/9.8/2.2/WARN/1203/mars==
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65-cleanup
Files: files | file ages | folders
SHA1: 8280550d675d1c3e45309c3ba2f49e462b7b606f
User & Date: matt on 2020-08-27 07:10:36
Original Comment: trim some load control sleeps in runs.scm that are not needed as we have new load control
Other Links: branch diff | manifest | tags
Context
2020-08-27
07:56
Comment out a telemetry call check-in: def8386816 user: matt tags: v1.65-cleanup
07:10
trim some load control sleeps in runs.scm that are not needed as we have new load control ==/9.8/2.2/WARN/1203/mars== check-in: 8280550d67 user: matt tags: v1.65-cleanup
2020-08-26
23:04
Tweaked server gate - still not right :( ==/FAIL/orion/== check-in: 7e26fb2f0c user: mrwellan tags: v1.65-cleanup
Changes

Modified runs.scm from [96a5410a91] to [b6fddf76c2].

87
88
89
90
91
92
93
94

95
96
97
98
99
100
101
87
88
89
90
91
92
93

94
95
96
97
98
99
100
101







-
+







					 ((< age 10)  #t)
					 (else #f))))
				    lock-files)))
	  (if fresh-locks
	      (begin
		(if (runs:lownoise "runners-softlock-wait" 360)
		    (debug:print-info 0 *default-log-port* "Other runners in flight, giving up some time..."))
		(thread-sleep! 10))
		(thread-sleep! 2))
	      (begin
		(if (runs:lownoise "runners-softlock-nowait" 360)
		    (debug:print-info 0 *default-log-port* "No runners in flight, updating softlock"))
		(let* ((ouf (open-output-file my-lock-file)))
		  (with-output-to-port ouf
		    (lambda ()(print (current-seconds))))
		  (close-output-port ouf))))
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328











329
330
331
332
333
334
335
311
312
313
314
315
316
317











318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335







-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+







(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)

  ;; Take advantage of a good place to exit if running the one-pass methodology
  (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
	   (args:get-arg "-one-pass"))
      (exit 0))

  (thread-sleep! (cond ;; BB: check with Matt.  Should this sleep move
		       ;; to cond clauses below where we determine we
		       ;; have too many jobs running rather than each
		       ;; time the and condition above is true (which
		  ;; seems like always)?
		  ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way
		   0)
        	  ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
		   (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
		   10)  ;; obviously haven't had any work to do for a while
		  (else 0)))
;;  (thread-sleep! (cond ;; BB: check with Matt.  Should this sleep move
;;		       ;; to cond clauses below where we determine we
;;		       ;; have too many jobs running rather than each
;;		       ;; time the and condition above is true (which
;;		  ;; seems like always)?
;;		  ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way
;;		   0)
;;        	  ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
;;		   (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
;;		   10)  ;; obviously haven't had any work to do for a while
;;		  (else 0)))
;;		   ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
;;		   (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)
;;		   )))
  
  (let* ((num-running             (rmt:get-count-tests-running run-id #f)) ;; fastmode=no
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
991
992
993
994
995
996
997
998

999
1000
1001
1002
1003
1004
1005
991
992
993
994
995
996
997

998
999
1000
1001
1002
1003
1004
1005







-
+







	   (null? non-completed))
     (debug:print-info 4 *default-log-port* "cond branch - "  "ei-4")
      (if  (runs:can-keep-running? hed 20)
	  (begin
	    (runs:inc-cant-run-tests hed)
	    (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; 
	    ;; getting here likely means the system is way overloaded, kill a full minute before continuing
	    (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
	    (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
	    ;; num-retries code was here
	    ;; we use this opportunity to move contents of reg to tal
	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
	  (begin
	    (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
	    (let ((test-id (rmt:get-test-id run-id hed "")))
	      (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))
1805
1806
1807
1808
1809
1810
1811
1812

1813
1814
1815
1816
1817
1818
1819
1805
1806
1807
1808
1809
1810
1811

1812
1813
1814
1815
1816
1817
1818
1819







-
+







		  (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id
				    ". Running as pid " (current-process-id) " on " (get-host-name))
		  (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
		  (rmt:find-and-mark-incomplete run-id #f)
		  (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
				    " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
				    (time->string (seconds->local-time (current-seconds))))))
	    (thread-sleep! 5)
	    (thread-sleep! 1) ;; (if (>= num-running max-concurrent-jobs) 5 1))
	    (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes
		       num-running))))
    ;; LET* ((test-record
    ;; we get here on "drop through". All done!
    ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. 
    ;; (debug:print-info 0 *default-log-port* "Calling Post Hook")    
    ;; (runs:run-post-hook run-id)
2533
2534
2535
2536
2537
2538
2539
2540

2541
2542
2543
2544
2545
2546
2547
2533
2534
2535
2536
2537
2538
2539

2540
2541
2542
2543
2544
2545
2546
2547







-
+







                                  (debug:print-info 2 *default-log-port* "new state " new-state ", new status " new-status )
                                  (mt:test-set-state-status-by-id run-id test-id new-state new-status #f))
				(if (not (null? tal))
				    (loop (car tal)(cdr tal))))
			       ((run-wait)
                                ;; BB TODO - manage has-subrun case
				(debug:print-info 2 *default-log-port* "still waiting, " (length tests) " tests still running")
				(thread-sleep! 10)
				(thread-sleep! 5)
				(let ((new-tests (proc-get-tests run-id)))
				  (if (null? new-tests)
				      (debug:print-info 1 *default-log-port* "Run completed according to zero tests matching provided criteria.")
				      (loop (car new-tests)(cdr new-tests)))))
			       ((archive)
                                ;; BB TODO - manage has-subrun case
				(if (and run-dir (not toplevel-with-children))