Megatest

Check-in [ce4cc8997a]
Login
Overview
Comment:fixed start up wedging
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80
Files: files | file ages | folders
SHA1: ce4cc8997a57e8e9962dc5105982b58c7f74d307
User & Date: matt on 2023-04-10 07:58:47
Other Links: branch diff | manifest | tags
Context
2023-04-10
11:58
Merged fork check-in: 962cf22780 user: mrwellan tags: v1.80
07:58
fixed start up wedging check-in: ce4cc8997a user: matt tags: v1.80
06:09
load control is working but servers are still getting wedged over time check-in: 9771b5d5a9 user: matt tags: v1.80
Changes

Modified tcp-transportmod.scm from [f7ad6026cc] to [f46139fb7e].

244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265






266
267
268
269
270
271
272
			(port    (tt-conn-port conn))
			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
			(pid     (tt-conn-pid  conn))
			(servinf (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname)))
		   (hash-table-set! (tt-conns ttdat) dbfname #f)
		   (if (file-exists? servinf)
		       (begin
			 (if (< attemptnum 5)
			     (begin
			       (thread-sleep! 1)
			       (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
			     (begin
			       (debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname)
			       (if (and (file-exists? servinf)
					(> (- (current-seconds)(file-modification-time servinf)) 60))
				   (begin
				     (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.")
				     (handle-exceptions
					 exn
				       #f
				       (delete-file* servinf))
				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))))






		       (begin ;; no server file, delay and try again
			 (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)
			 (thread-sleep! 1)
			 (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))
		 (begin ;; this case is where res is malformed. Probably should abort
		   (assert #f "FATAL: tt:handler received bad data "res)
		   ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")







|

|











|
>
>
>
>
>
>







244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
			(port    (tt-conn-port conn))
			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
			(pid     (tt-conn-pid  conn))
			(servinf (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname)))
		   (hash-table-set! (tt-conns ttdat) dbfname #f)
		   (if (file-exists? servinf)
		       (begin
			 (if (< attemptnum 3)
			     (begin
			       (thread-sleep! 0.25)
			       (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
			     (begin
			       (debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname)
			       (if (and (file-exists? servinf)
					(> (- (current-seconds)(file-modification-time servinf)) 60))
				   (begin
				     (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.")
				     (handle-exceptions
					 exn
				       #f
				       (delete-file* servinf))
				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
				   (begin
				     ;; start server - addressed in client-connect-to-server
				     ;; delay        - addressed in client-connect-to-server
				     ;; try again
				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
				   ))))
		       (begin ;; no server file, delay and try again
			 (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)
			 (thread-sleep! 1)
			 (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))
		 (begin ;; this case is where res is malformed. Probably should abort
		   (assert #f "FATAL: tt:handler received bad data "res)
		   ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")
399
400
401
402
403
404
405




406
407
408
409
410
411
412

413
414
415
416
417
418
419
     (exn (io-error)
	  (full-err-print exn  "ERROR: i/o error")
	  (tt:backoff-incr host port)
	  #f)
     (exn (i/o net)
	  (if ping-mode
	      #f




	      (if (>= tries-remaining 0)
		  (let* ((backoff-delay (* (- 26 tries-remaining) 0.1)))
		    (debug:print 0 *default-log-port* "WARNING: TCP overload, trying again in "backoff-delay"s.")
		    (thread-sleep! backoff-delay)
		    (tt:backoff-incr host port)
		    (retry))
		  (assert #f "FATAL: Too many retries in tt:send-receive-direct"))))

     (exn ()
	  (full-err-print exn "Unhandled exception from client side.")
	  #f))))


;;======================================================================
;; server







>
>
>
>
|
|
|
|
|
|
|
>







405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
     (exn (io-error)
	  (full-err-print exn  "ERROR: i/o error")
	  (tt:backoff-incr host port)
	  #f)
     (exn (i/o net)
	  (if ping-mode
	      #f
	      (cond
	       ((>  tries-remaining 4) ;; server likely defunct
		(tt:backoff-incr host port)
		#f)
	       ((>= tries-remaining 0)
		(let* ((backoff-delay (* (- 26 tries-remaining) 0.1)))
		  (debug:print 0 *default-log-port* "WARNING: TCP overload, trying again in "backoff-delay"s.")
		  (thread-sleep! backoff-delay)
		  (tt:backoff-incr host port)
		  (retry))
		(assert #f "FATAL: Too many retries in tt:send-receive-direct"))
	       (else #f))))
     (exn ()
	  (full-err-print exn "Unhandled exception from client side.")
	  #f))))


;;======================================================================
;; server
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
        (dbprep-rx    (regexp "^SERVER: dbprep"))
        (dbprep-found 0)
	(bad-dat      (list #f #f #f #f #f #f logf)))
     (let ((fdat     (handle-exceptions
			 exn
		       (begin
			 ;; WARNING: this is potentially dangerous to blanket ignore the errors
			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn=" exn)
			 '()) ;; no idea what went wrong, call it a bad server, return empty list
		       (with-input-from-file logf read-lines))))
       (if (null? fdat) ;; bad data, return bad-dat
	   bad-dat
	   (let loop ((inl  (car fdat))
		      (tail (cdr fdat))
		      (lnum 0))







|







659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
        (dbprep-rx    (regexp "^SERVER: dbprep"))
        (dbprep-found 0)
	(bad-dat      (list #f #f #f #f #f #f logf)))
     (let ((fdat     (handle-exceptions
			 exn
		       (begin
			 ;; WARNING: this is potentially dangerous to blanket ignore the errors
			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn))
			 '()) ;; no idea what went wrong, call it a bad server, return empty list
		       (with-input-from-file logf read-lines))))
       (if (null? fdat) ;; bad data, return bad-dat
	   bad-dat
	   (let loop ((inl  (car fdat))
		      (tail (cdr fdat))
		      (lnum 0))
742
743
744
745
746
747
748
749
750






751
752
753
754
755
756
757
			 (let* ((indat         (deserialize))
				(result        #f)
				(exn-result    #f)
				(stdout-result (with-output-to-string
						 (lambda ()
						   (let ((res (handle-exceptions
								  exn
								(begin
								  (set! exn-result (condition->list exn))






								  #f)
								(handler indat))))
						     (set! result res)))))
				(full-result (list result exn-result (if (equal? stdout-result "") #f stdout-result))))
			   (handle-exceptions
			       exn
			     (begin







|
|
>
>
>
>
>
>







753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
			 (let* ((indat         (deserialize))
				(result        #f)
				(exn-result    #f)
				(stdout-result (with-output-to-string
						 (lambda ()
						   (let ((res (handle-exceptions
								  exn
								(let* ((errdat (condition->list exn)))
								  (set! exn-result errdat)
								  (debug:print 0 *default-log-port* "ERROR: handler exception, these are bad, will exit in five seconds.")
								  (pp errdat *default-log-port*)
								  ;; these are always bad, set up an exit thread
								  (thread-start! (make-thread (lambda ()
												(thread-sleep! 5)
												(exit))))
								  #f)
								(handler indat))))
						     (set! result res)))))
				(full-result (list result exn-result (if (equal? stdout-result "") #f stdout-result))))
			   (handle-exceptions
			       exn
			     (begin