Megatest

Check-in [8280550d67]
Login
Overview
Comment:trim some load control sleeps in runs.scm that are not needed as we have new load control ==/9.8/2.2/WARN/1203/mars==
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65-cleanup
Files: files | file ages | folders
SHA1: 8280550d675d1c3e45309c3ba2f49e462b7b606f
User & Date: matt on 2020-08-27 07:10:36
Original Comment: trim some load control sleeps in runs.scm that are not needed as we have new load control
Other Links: branch diff | manifest | tags
Context
2020-08-27
07:56
Comment out a telemetry call check-in: def8386816 user: matt tags: v1.65-cleanup
07:10
trim some load control sleeps in runs.scm that are not needed as we have new load control ==/9.8/2.2/WARN/1203/mars== check-in: 8280550d67 user: matt tags: v1.65-cleanup
2020-08-26
23:04
Tweaked server gate - still not right :( ==/FAIL/orion/== check-in: 7e26fb2f0c user: mrwellan tags: v1.65-cleanup
Changes

Modified runs.scm from [96a5410a91] to [b6fddf76c2].

87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
					 ((< age 10)  #t)
					 (else #f))))
				    lock-files)))
	  (if fresh-locks
	      (begin
		(if (runs:lownoise "runners-softlock-wait" 360)
		    (debug:print-info 0 *default-log-port* "Other runners in flight, giving up some time..."))
		(thread-sleep! 10))
	      (begin
		(if (runs:lownoise "runners-softlock-nowait" 360)
		    (debug:print-info 0 *default-log-port* "No runners in flight, updating softlock"))
		(let* ((ouf (open-output-file my-lock-file)))
		  (with-output-to-port ouf
		    (lambda ()(print (current-seconds))))
		  (close-output-port ouf))))







|







87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
					 ((< age 10)  #t)
					 (else #f))))
				    lock-files)))
	  (if fresh-locks
	      (begin
		(if (runs:lownoise "runners-softlock-wait" 360)
		    (debug:print-info 0 *default-log-port* "Other runners in flight, giving up some time..."))
		(thread-sleep! 2))
	      (begin
		(if (runs:lownoise "runners-softlock-nowait" 360)
		    (debug:print-info 0 *default-log-port* "No runners in flight, updating softlock"))
		(let* ((ouf (open-output-file my-lock-file)))
		  (with-output-to-port ouf
		    (lambda ()(print (current-seconds))))
		  (close-output-port ouf))))
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)

  ;; Take advantage of a good place to exit if running the one-pass methodology
  (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
	   (args:get-arg "-one-pass"))
      (exit 0))

  (thread-sleep! (cond ;; BB: check with Matt.  Should this sleep move
		       ;; to cond clauses below where we determine we
		       ;; have too many jobs running rather than each
		       ;; time the and condition above is true (which
		  ;; seems like always)?
		  ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way
		   0)
        	  ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
		   (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
		   10)  ;; obviously haven't had any work to do for a while
		  (else 0)))
;;		   ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
;;		   (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)
;;		   )))
  
  (let* ((num-running             (rmt:get-count-tests-running run-id #f)) ;; fastmode=no
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))







|
|
|
|
|
|
|
|
|
|
|







311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)

  ;; Take advantage of a good place to exit if running the one-pass methodology
  (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
	   (args:get-arg "-one-pass"))
      (exit 0))

;;  (thread-sleep! (cond ;; BB: check with Matt.  Should this sleep move
;;		       ;; to cond clauses below where we determine we
;;		       ;; have too many jobs running rather than each
;;		       ;; time the and condition above is true (which
;;		  ;; seems like always)?
;;		  ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way
;;		   0)
;;        	  ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
;;		   (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
;;		   10)  ;; obviously haven't had any work to do for a while
;;		  (else 0)))
;;		   ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
;;		   (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)
;;		   )))
  
  (let* ((num-running             (rmt:get-count-tests-running run-id #f)) ;; fastmode=no
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
	   (null? non-completed))
     (debug:print-info 4 *default-log-port* "cond branch - "  "ei-4")
      (if  (runs:can-keep-running? hed 20)
	  (begin
	    (runs:inc-cant-run-tests hed)
	    (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; 
	    ;; getting here likely means the system is way overloaded, kill a full minute before continuing
	    (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
	    ;; num-retries code was here
	    ;; we use this opportunity to move contents of reg to tal
	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
	  (begin
	    (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
	    (let ((test-id (rmt:get-test-id run-id hed "")))
	      (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))







|







991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
	   (null? non-completed))
     (debug:print-info 4 *default-log-port* "cond branch - "  "ei-4")
      (if  (runs:can-keep-running? hed 20)
	  (begin
	    (runs:inc-cant-run-tests hed)
	    (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; 
	    ;; getting here likely means the system is way overloaded, kill a full minute before continuing
	    (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
	    ;; num-retries code was here
	    ;; we use this opportunity to move contents of reg to tal
	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
	  (begin
	    (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
	    (let ((test-id (rmt:get-test-id run-id hed "")))
	      (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
		  (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id
				    ". Running as pid " (current-process-id) " on " (get-host-name))
		  (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
		  (rmt:find-and-mark-incomplete run-id #f)
		  (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
				    " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
				    (time->string (seconds->local-time (current-seconds))))))
	    (thread-sleep! 5)
	    (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes
		       num-running))))
    ;; LET* ((test-record
    ;; we get here on "drop through". All done!
    ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. 
    ;; (debug:print-info 0 *default-log-port* "Calling Post Hook")    
    ;; (runs:run-post-hook run-id)







|







1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
		  (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id
				    ". Running as pid " (current-process-id) " on " (get-host-name))
		  (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
		  (rmt:find-and-mark-incomplete run-id #f)
		  (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
				    " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
				    (time->string (seconds->local-time (current-seconds))))))
	    (thread-sleep! 1) ;; (if (>= num-running max-concurrent-jobs) 5 1))
	    (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes
		       num-running))))
    ;; LET* ((test-record
    ;; we get here on "drop through". All done!
    ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. 
    ;; (debug:print-info 0 *default-log-port* "Calling Post Hook")    
    ;; (runs:run-post-hook run-id)
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
                                  (debug:print-info 2 *default-log-port* "new state " new-state ", new status " new-status )
                                  (mt:test-set-state-status-by-id run-id test-id new-state new-status #f))
				(if (not (null? tal))
				    (loop (car tal)(cdr tal))))
			       ((run-wait)
                                ;; BB TODO - manage has-subrun case
				(debug:print-info 2 *default-log-port* "still waiting, " (length tests) " tests still running")
				(thread-sleep! 10)
				(let ((new-tests (proc-get-tests run-id)))
				  (if (null? new-tests)
				      (debug:print-info 1 *default-log-port* "Run completed according to zero tests matching provided criteria.")
				      (loop (car new-tests)(cdr new-tests)))))
			       ((archive)
                                ;; BB TODO - manage has-subrun case
				(if (and run-dir (not toplevel-with-children))







|







2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
                                  (debug:print-info 2 *default-log-port* "new state " new-state ", new status " new-status )
                                  (mt:test-set-state-status-by-id run-id test-id new-state new-status #f))
				(if (not (null? tal))
				    (loop (car tal)(cdr tal))))
			       ((run-wait)
                                ;; BB TODO - manage has-subrun case
				(debug:print-info 2 *default-log-port* "still waiting, " (length tests) " tests still running")
				(thread-sleep! 5)
				(let ((new-tests (proc-get-tests run-id)))
				  (if (null? new-tests)
				      (debug:print-info 1 *default-log-port* "Run completed according to zero tests matching provided criteria.")
				      (loop (car new-tests)(cdr new-tests)))))
			       ((archive)
                                ;; BB TODO - manage has-subrun case
				(if (and run-dir (not toplevel-with-children))