Megatest

Check-in [7dcbd017e3]
Login
Overview
Comment:Added better fallback on communication failure.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80
Files: files | file ages | folders
SHA1: 7dcbd017e3cc12525fbffde21829cf66bd3339d4
User & Date: matt on 2023-04-08 11:48:18
Other Links: branch diff | manifest | tags
Context
2023-04-09
14:45
Clean up uses of tcp. tcp6 did not seem to work. Increased tcp backlog (didn't seem to help) and improved backoff check-in: 4860a4e6aa user: matt tags: v1.80
2023-04-08
11:48
Added better fallback on communication failure. check-in: 7dcbd017e3 user: matt tags: v1.80
02:45
Removed assert from simple lock. Removed use of statement cache which fixed bind issues check-in: 532f050f93 user: matt tags: v1.80
Changes

Modified tcp-transportmod.scm from [a71da4bf27] to [32789e6d1a].

179
180
181
182
183
184
185
186
187






188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
		 (begin
		   (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
		   (server-start-proc)
		   (tt-last-serv-start-set! ttdat (current-seconds))))
	     (thread-sleep! 1)
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))

(define (tt:ping host port server-id)
  (let*  ((res (tt:send-receive-direct host port `(ping #f #f #f)))) ;; please send me your server-id






    ;;
    ;; need two threads, one a 5 second timer
    ;;
    (match res
      ((status errmsg result meta)
       (if (equal? result server-id)
	   (let* ((server-state (alist-ref 'sstate meta)))
	     ;; (debug:print 0 *default-log-port* "Ping to "host":"port" successful.")
	     (or server-state 'unk)) ;; then we are good
	   (begin
	     (debug:print 0 *default-log-port* "WARNING: server-id does not match, expected: "server-id", got: "result)
	     #f)))
      (else
       ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res)
       #f))))

;; client side handler
;;
;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
  ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.







|
|
>
>
>
>
>
>














|







179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
		 (begin
		   (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
		   (server-start-proc)
		   (tt-last-serv-start-set! ttdat (current-seconds))))
	     (thread-sleep! 1)
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))

(define (tt:ping host port server-id #!optional (tries-left 5))
  (let*  ((res      (tt:send-receive-direct host port `(ping #f #f #f))) ;; please send me your server-id
	  (try-again (lambda ()
		       (if (> tries-left 0)
			   (begin
			     (thread-sleep! 1)
			     (tt:ping host port server-id (- tries-left 1)))
			   #f))))
    ;;
    ;; need two threads, one a 5 second timer
    ;;
    (match res
      ((status errmsg result meta)
       (if (equal? result server-id)
	   (let* ((server-state (alist-ref 'sstate meta)))
	     ;; (debug:print 0 *default-log-port* "Ping to "host":"port" successful.")
	     (or server-state 'unk)) ;; then we are good
	   (begin
	     (debug:print 0 *default-log-port* "WARNING: server-id does not match, expected: "server-id", got: "result)
	     #f)))
      (else
       ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res)
       (try-again)))))

;; client side handler
;;
;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
  ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244





245
246
247
248
249
250
251
252
253


254

255

256



257
258
259
260
261
262
263
264
265
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, will try again in a 1/4 second.")
		(thread-sleep! 0.25)
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else
	     (if (not res)
		 (let* ((host    (tt-conn-host conn))
			(port    (tt-conn-port conn))
			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
			(pid     (tt-conn-pid  conn))
			(servinf (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname)))
		   (hash-table-set! (tt-conns ttdat) dbfname #f)
		   (if (file-exists? servinf)
		       (begin





			 (debug:print 0 *default-log-port* "INFO: no ping response from server "host":"port" for "dbfname)
			 (if (and (file-exists? servinf)
				  (> (- (current-seconds)(file-modification-time servinf)) 60))
			     (begin
			       (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.")
			       (handle-exceptions
				   exn
				 #f
				 (delete-file* servinf)))))


		       (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf))

		   (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))

		 (assert #f "FATAL: tt:handler received bad data "res)))))



	(begin
	  (thread-sleep! 1) ;; give it a rest and try again
	  (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))

(define (tt:bid-for-servership run-id)
  #f)

;; gets server info and appends path to server file
;; sorts by age, oldest first







|
|








>
>
>
>
>
|
|
|
|
|
|
|
|
|
>
>
|
>
|
>
|
>
>
>

|







234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, will try again in a 1/4 second.")
		(thread-sleep! 0.25)
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else ;; did not receive properly formated result
	     (if (not res) ;; tt:handler is telling us that communication failed
		 (let* ((host    (tt-conn-host conn))
			(port    (tt-conn-port conn))
			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
			(pid     (tt-conn-pid  conn))
			(servinf (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname)))
		   (hash-table-set! (tt-conns ttdat) dbfname #f)
		   (if (file-exists? servinf)
		       (begin
			 (if (< attemptnum 5)
			     (begin
			       (thread-sleep! 1)
			       (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
			     (begin
			       (debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname)
			       (if (and (file-exists? servinf)
					(> (- (current-seconds)(file-modification-time servinf)) 60))
				   (begin
				     (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.")
				     (handle-exceptions
					 exn
				       #f
				       (delete-file* servinf))
				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))))
		       (begin ;; no server file, delay and try again
			 (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)
			 (thread-sleep! 1)
			 (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))
		 (begin ;; this case is where res is malformed. Probably should abort
		   (assert #f "FATAL: tt:handler received bad data "res)
		   ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")
		   ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)
		   )))))
	(begin
	  (thread-sleep! 1) ;; no conn yet set up, give it a rest and try again
	  (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))

(define (tt:bid-for-servership run-id)
  #f)

;; gets server info and appends path to server file
;; sorts by age, oldest first

Added utils/load-the-db.scm version [46853b5895].





























































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
;; start the repl and then load this file

(define start-time (current-seconds))

(let loop ((last-print 0)
	   (num-calls  0))
  (let ((all-run-ids (rmt:get-all-run-ids))
	(do-print    (> (- (current-seconds) last-print) 2))
	(max-query   0))
    (for-each
     (lambda (run-id)
       ;; (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)
       (let* ((all-run-data (rmt:get-tests-for-run run-id "%" '() '() #f #f #f #f #f #f #f #f)))
	 (for-each
	  (lambda (testdat)
	    (let* ((test-id (vector-ref testdat 0))
		   (start-at (current-milliseconds))
		   (testinfo (rmt:get-test-info-by-id run-id test-id))
		   (query-time (- (current-milliseconds) start-at)))
	      (if (> query-time max-query)
		  (set! max-query query-time))))
	  all-run-data)
	 (if do-print
	     (print "Running "(- (current-seconds) start-time)"s, run "run-id" has "(length all-run-data)" tests, max query "max-query))))
     all-run-ids)
    (loop (if do-print
	      (current-seconds)
	      last-print)
	  (+ num-calls (length all-run-ids)))))