Megatest

Check-in [9771b5d5a9]
Login
Overview
Comment:load control is working but servers are still getting wedged over time
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80
Files: files | file ages | folders
SHA1: 9771b5d5a97110ffe8a2cc963916648db3c2dd35
User & Date: matt on 2023-04-10 06:09:34
Other Links: branch diff | manifest | tags
Context
2023-04-10
07:58
fixed start up wedging check-in: ce4cc8997a user: matt tags: v1.80
06:09
load control is working but servers are still getting wedged over time check-in: 9771b5d5a9 user: matt tags: v1.80
2023-04-09
22:14
flag loaded at 50 threads. check-in: 4c1e85ecfb user: matt tags: v1.80
Changes

Modified tcp-transportmod.scm from [bee5aadbcc] to [f7ad6026cc].

229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
			 (thread-sleep! delay-wait)))))
	     (case status
	       ((busy) ;; result will be how long the server wants you to delay
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is overloaded, will try again in "result" seconds.")
		(thread-sleep! (if (number? result) result 2))
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, will try again in a 1/4 second.")
		(tt:backoff-incr (tt-host conn)(tt-port conn))
		result) ;; (tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else ;; did not receive properly formated result
	     (if (not res) ;; tt:handler is telling us that communication failed
		 (let* ((host    (tt-conn-host conn))
			(port    (tt-conn-port conn))







|
|







229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
			 (thread-sleep! delay-wait)))))
	     (case status
	       ((busy) ;; result will be how long the server wants you to delay
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is overloaded, will try again in "result" seconds.")
		(thread-sleep! (if (number? result) result 2))
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.")
		(tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn))
		result) ;; (tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else ;; did not receive properly formated result
	     (if (not res) ;; tt:handler is telling us that communication failed
		 (let* ((host    (tt-conn-host conn))
			(port    (tt-conn-port conn))
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359

360
361
362
363
364
365
366
367
  (let* ((host-port (conc host":"port))
	 (bkoff     (hash-table-ref/default *tt:backoff-smoothing* host-port #f)))
    (if bkoff
	(let* ((wait-delay (tt:backoff-wait-delay bkoff))
	       (last-ioerr (tt:backoff-last-ioerr bkoff))
	       (last-adj-t (tt:backoff-last-adj-t bkoff))
	       (delta      (- (current-seconds) last-adj-t))
	       (adj        (* delta 0.01)) ;; it takes ten seconds to recover from hitting an io err
	       (new-wait   (if (> wait-delay 0)
			       (if (> adj wait-delay)
				   0
				   (- wait-delay adj))
			       0)))
	  (if (> new-wait 0)
	      (begin

		(debug:print-info 0 *default-log-port* "Server loaded, DelayWait: "new-wait)
		(tt:backoff-wait-delay-set! bkoff new-wait)
		(tt:backoff-last-adj-t-set! bkoff (current-seconds))
		(thread-sleep! new-wait))
	      (hash-table-delete! *tt:backoff-smoothing* host-port))))))

(define (tt:send-receive-direct host port dat #!key (ping-mode #f)(tries-remaining 25))
  (assert (number? port) "FATAL: tt:send-receive-direct called with port not a number "port)







|







>
|







345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
  (let* ((host-port (conc host":"port))
	 (bkoff     (hash-table-ref/default *tt:backoff-smoothing* host-port #f)))
    (if bkoff
	(let* ((wait-delay (tt:backoff-wait-delay bkoff))
	       (last-ioerr (tt:backoff-last-ioerr bkoff))
	       (last-adj-t (tt:backoff-last-adj-t bkoff))
	       (delta      (- (current-seconds) last-adj-t))
	       (adj        (* delta 0.001)) ;; it takes 100 seconds to recover from hitting an io err
	       (new-wait   (if (> wait-delay 0)
			       (if (> adj wait-delay)
				   0
				   (- wait-delay adj))
			       0)))
	  (if (> new-wait 0)
	      (begin
		(if (common:low-noise-print 10 "delay wait message")
		    (debug:print-info 0 *default-log-port* "Server loaded, DelayWait: "new-wait))
		(tt:backoff-wait-delay-set! bkoff new-wait)
		(tt:backoff-last-adj-t-set! bkoff (current-seconds))
		(thread-sleep! new-wait))
	      (hash-table-delete! *tt:backoff-smoothing* host-port))))))

(define (tt:send-receive-direct host port dat #!key (ping-mode #f)(tries-remaining 25))
  (assert (number? port) "FATAL: tt:send-receive-direct called with port not a number "port)
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
	  (full-err-print exn  "ERROR: i/o error")
	  (tt:backoff-incr host port)
	  #f)
     (exn (i/o net)
	  (if ping-mode
	      #f
	      (if (>= tries-remaining 0)
		  (let* ((backoff-delay (* (- 26 tries-remaining) 0.5)))
		    (debug:print 0 *default-log-port* "WARNING: TCP overload, trying again in "backoff-delay"s.")
		    (thread-sleep! backoff-delay)
		    (tt:backoff-incr host port)
		    (retry))
		  (assert #f "FATAL: Too many retries in tt:send-receive-direct"))))
     (exn ()
	  (full-err-print exn "Unhandled exception from client side.")







|







400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
	  (full-err-print exn  "ERROR: i/o error")
	  (tt:backoff-incr host port)
	  #f)
     (exn (i/o net)
	  (if ping-mode
	      #f
	      (if (>= tries-remaining 0)
		  (let* ((backoff-delay (* (- 26 tries-remaining) 0.1)))
		    (debug:print 0 *default-log-port* "WARNING: TCP overload, trying again in "backoff-delay"s.")
		    (thread-sleep! backoff-delay)
		    (tt:backoff-incr host port)
		    (retry))
		  (assert #f "FATAL: Too many retries in tt:send-receive-direct"))))
     (exn ()
	  (full-err-print exn "Unhandled exception from client side.")

Modified utils/load-the-db.scm from [46853b5895] to [92b9fb2b93].

1
2
3
4
5
6
7
8
9




10
11
12
13

14
15
16
17
18
19
20
21
22
23





24

25
26
27
28
29
30
;; start the repl and then load this file

(define start-time (current-seconds))

(let loop ((last-print 0)
	   (num-calls  0))
  (let ((all-run-ids (rmt:get-all-run-ids))
	(do-print    (> (- (current-seconds) last-print) 2))
	(max-query   0))




    (for-each
     (lambda (run-id)
       ;; (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)
       (let* ((all-run-data (rmt:get-tests-for-run run-id "%" '() '() #f #f #f #f #f #f #f #f)))

	 (for-each
	  (lambda (testdat)
	    (let* ((test-id (vector-ref testdat 0))
		   (start-at (current-milliseconds))
		   (testinfo (rmt:get-test-info-by-id run-id test-id))
		   (query-time (- (current-milliseconds) start-at)))
	      (if (> query-time max-query)
		  (set! max-query query-time))))
	  all-run-data)
	 (if do-print





	     (print "Running "(- (current-seconds) start-time)"s, run "run-id" has "(length all-run-data)" tests, max query "max-query))))

     all-run-ids)
    (loop (if do-print
	      (current-seconds)
	      last-print)
	  (+ num-calls (length all-run-ids)))))







|
|
|
>
>
>
>




>










>
>
>
>
>
|
>




|

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
;; start the repl and then load this file

(define start-time (current-seconds))

(let loop ((last-print 0)
	   (num-calls  0))
  (let* ((all-run-ids (rmt:get-all-run-ids))
	 (do-print    (> (- (current-seconds) last-print) 2))
	 (max-query   0)
	 (num-calls   (+ num-calls
			 1                    ;; account for call above
			 (length all-run-ids) ;; account for the get-tests-for-run in the for-each below
			 )))
    (for-each
     (lambda (run-id)
       ;; (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)
       (let* ((all-run-data (rmt:get-tests-for-run run-id "%" '() '() #f #f #f #f #f #f #f #f)))
	 (set! num-calls (+ num-calls (length all-run-data)))
	 (for-each
	  (lambda (testdat)
	    (let* ((test-id (vector-ref testdat 0))
		   (start-at (current-milliseconds))
		   (testinfo (rmt:get-test-info-by-id run-id test-id))
		   (query-time (- (current-milliseconds) start-at)))
	      (if (> query-time max-query)
		  (set! max-query query-time))))
	  all-run-data)
	 (if do-print
	     (let* ((run-time (- (current-seconds) start-time))
		    (qry-rate (if (> run-time 0)
				  (inexact->exact (round (/  num-calls run-time)))
				  -1)))
		(print "Running "run-time"s, run "run-id
		    " has "(length all-run-data)" tests, max query "max-query
		    "ms with avg query rate "qry-rate" qry/s")))))
     all-run-ids)
    (loop (if do-print
	      (current-seconds)
	      last-print)
	  num-calls)))