Megatest

Diff
Login

Differences From Artifact [a1fcad65c5]:

To Artifact [46ac0a4bbf]:


109
110
111
112
113
114
115

116
117
118
119
120
121
122
123
124
125
126
127
128
129
130



131
132
133
134
135
136

137
138



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169

170
171
172
173

174
175
176
177
178
179
180
181
182

183
184
185
186
187
188
189


190

191
192
193
194
195
196
197
198

;; parameters
;;
(define tt-server-timeout-param (make-parameter 600))

;; make ttdat visible
(define *server-info* #f)


(define (tt:make-remote areapath)
  (make-tt areapath: areapath))

;; 1 ... or #f
;; and check that dbfname matches. FIXME: the propagation of dbfname and run-id
;; might not make the best sense
;;
(define (tt:valid-run-id run-id dbfname)
  (and (or (number? run-id)
	   (not run-id))
       (equal? (dbfile:run-id->dbfname run-id) dbfname)))

(tcp-buffer-size 2048)
;; (max-connections 4096) 




;; do all the busy work of finding and setting up conn for
;; connecting to a server
;; 
(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
  (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)

  (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
	 (server-start-proc (lambda ()



			      (tt:server-process-run
			       (tt-areapath ttdat)
			       testsuite ;; (dbfile:testsuite-name)
			       (common:find-local-megatest)
			       run-id))))
    (if conn
	(begin 
          ; (debug:print-info 0 *default-log-port* "already connected to the server")
           conn) ;; we are already connected to the server
	(let* ((sdat (tt:get-current-server-info ttdat dbfname)))
	  (match sdat
	    ((host port start-time server-id pid dbfname2 servinffile)
	     (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.")
             ;(debug:print-info 0 *default-log-port* "in match servinffile:" servinffile)
	     (let* ((host-port (conc host":"port))
		    (conn (make-tt-conn
			   host: host
			   port: port
			   host-port: host-port
			   dbfname: dbfname
			   servinf-file: servinffile
			   server-id: server-id
			   server-start: start-time
			   pid: pid)))
	       ;; verify we can talk to this server
	       (let* ((result   (tt:timed-ping host port server-id))
		      (ping-res (car result))
		      (ping     (cdr result)))
                 (debug:print-info 0 *default-log-port* "ping time: " ping)
		 (case ping-res
		   ((running)

		    (hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good?
		    conn)
		   ((starting)
		    (thread-sleep! 0.5)

		    (tt:client-connect-to-server ttdat dbfname run-id testsuite))
		   (else
		    (let* ((curr-secs (current-seconds)))
		      ;; rm the (last server) would go here
		      (if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
			  (begin
			    (tt-last-serv-start-set! ttdat curr-secs)
			    (server-start-proc))) ;; start server if 30 sec since last attempt
		      (thread-sleep! 1)

		      (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
	    (else ;; no good server found, if haven't started server in > 5 secs, start another
	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; BUG - grow this number really do not want to swamp the machine with servers
		 (begin
		   (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
		   (server-start-proc)
		   (tt-last-serv-start-set! ttdat (current-seconds))))


	     (thread-sleep! 1)

	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))

(define (tt:timed-ping host port server-id)
  (let* ((start-time (current-milliseconds))
	 (result     (tt:ping host port server-id)))
    (cons result (- (current-milliseconds) start-time))))
    








>














|
>
>
>




|

>
|
|
>
>
>
|
|
|
|
|


|





|














|


>




>
|






|

>
|

|

|

|
>
>

>
|







109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212

;; parameters
;;
(define tt-server-timeout-param (make-parameter 600))

;; make ttdat visible
(define *server-info* #f)
(define *server-run*  #t)

(define (tt:make-remote areapath)
  (make-tt areapath: areapath))

;; 1 ... or #f
;; and check that dbfname matches. FIXME: the propagation of dbfname and run-id
;; might not make the best sense
;;
(define (tt:valid-run-id run-id dbfname)
  (and (or (number? run-id)
	   (not run-id))
       (equal? (dbfile:run-id->dbfname run-id) dbfname)))

(tcp-buffer-size 2048)
;; (max-connections 4096)

(define (tt:get-conn ttdat dbfname)
  (hash-table-ref/default (tt-conns ttdat) dbfname #f))

;; do all the busy work of finding and setting up conn for
;; connecting to a server
;; 
(define (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)
  (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
  (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id)
  (let* ((conn              (tt:get-conn ttdat dbfname))
	 (server-start-proc (or server-start-proc
				(lambda ()
				  (assert (equal? dbfname "main.db") ;; only main.db is started here
					  "FATAL: called server-start-proc for db other than main.db")
				  (tt:server-process-run
				   (tt-areapath ttdat)
				   testsuite ;; (dbfile:testsuite-name)
				   (common:find-local-megatest)
				   run-id)))))
    (if conn
	(begin 
          (debug:print-info 2 *default-log-port* "already connected to a server")
           conn) ;; we are already connected to the server
	(let* ((sdat (tt:get-current-server-info ttdat dbfname)))
	  (match sdat
	    ((host port start-time server-id pid dbfname2 servinffile)
	     (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.")
             (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile)
	     (let* ((host-port (conc host":"port))
		    (conn (make-tt-conn
			   host: host
			   port: port
			   host-port: host-port
			   dbfname: dbfname
			   servinf-file: servinffile
			   server-id: server-id
			   server-start: start-time
			   pid: pid)))
	       ;; verify we can talk to this server
	       (let* ((result   (tt:timed-ping host port server-id))
		      (ping-res (car result))
		      (ping     (cdr result)))
                 (debug:print-info 2 *default-log-port* "host " host " port " port " ping time: " ping " result " ping-res)
		 (case ping-res
		   ((running)
                    (debug:print-info 2 *default-log-port* "Setting conn = " conn " in hash table")
		    (hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good?
		    conn)
		   ((starting)
		    (thread-sleep! 0.5)
                    (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect")
		    (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))
		   (else
		    (let* ((curr-secs (current-seconds)))
		      ;; rm the (last server) would go here
		      (if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
			  (begin
			    (tt-last-serv-start-set! ttdat curr-secs)
			    (server-start-proc))) ;; start server if 10 sec since last attempt
		      (thread-sleep! 1)
                      (debug:print-info 2 *default-log-port* "server ping result was neither running nor starting. Retrying connect")
		      (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))
	    (else ;; no good server found, if haven't started server in > 5 secs, start another
	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers
		 (begin
		   (debug:print-info 0 *default-log-port* "Starting server for "dbfname)
		   (server-start-proc)
		   (tt-last-serv-start-set! ttdat (current-seconds))
                   (thread-sleep! 3)
                   ))
	     (thread-sleep! 1)
             (debug:print-info 0 *default-log-port* "Connect to server for " dbfname)
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))

(define (tt:timed-ping host port server-id)
  (let* ((start-time (current-milliseconds))
	 (result     (tt:ping host port server-id)))
    (cons result (- (current-milliseconds) start-time))))
    

220
221
222
223
224
225
226
227

228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246

247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
       ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res)
       (try-again)))))

;; client side handler
;;
;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)

  ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
  (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
    (if conn
	;; have connection, call the server
	(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
	  ;; res is (status errmsg result meta)
         ; (debug:print 0 *default-log-port* "conn:" conn " res: " res)
	  (match res
	    ((status errmsg result meta)
	     (if (list? meta)
		 (let* ((delay-wait (alist-ref 'delay-wait meta)))
		   (if (and (number? delay-wait)
			    (> delay-wait 0))
		       (begin
			 (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds")
			 (thread-sleep! delay-wait)))))
	     (case status
	       ((busy) ;; result will be how long the server wants you to delay
		(let* ((dly  (if (number? result) result 0.1)))

		  (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, will try again in "dly" seconds.")
		  (thread-sleep! dly)
		  (tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.")
		(tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn))
		result) ;; (tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else ;; did not receive properly formated result
	     (if (not res) ;; tt:handler is telling us that communication failed
		 (let* ((host    (tt-conn-host conn))
			(port    (tt-conn-port conn))
			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
			(pid     (tt-conn-pid  conn))
                        ;;(servinf (tt-conn-servinf-file conn))) 
			(servinf (tt-servinf-file ttdat))) ;; (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) ;; TODO, use (server:get-servinfo-dir areapath)
		   (hash-table-set! (tt-conns ttdat) dbfname #f)
		   (if (and servinf (file-exists? servinf))
		       (begin
			 (if (< attemptnum 10)
			     (begin
			       (thread-sleep! 0.5)
			       (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
			     (begin
			       (debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname)
			       (if (and (file-exists? servinf)
					(> (- (current-seconds)(file-modification-time servinf)) 60))
				   (begin
				     (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.")
				     (handle-exceptions
					 exn
				       #f
				       (delete-file* servinf))
				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
				   (begin
				     ;; start server - addressed in client-connect-to-server
				     ;; delay        - addressed in client-connect-to-server
				     ;; try again
				     (thread-sleep! 0.25) ;; dunno, I think this needs to be here
				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
				   ))))
		       (begin ;; no server file, delay and try again
			 (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)
			 (thread-sleep! 0.5)
			 (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))
		 (begin ;; this case is where res is malformed. Probably should abort
		   (assert #f "FATAL: tt:handler received bad data "res)
		   ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")
		   ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)
		   )))))
	(begin
	  (thread-sleep! 1) ;; no conn yet set up, give it a rest and try again
	  (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))

(define (tt:bid-for-servership run-id)
  #f)

;; gets server info and appends path to server file
;; sorts by age, oldest first
;;







|
>
|
|




<











|
>
|

|



|



|






|





|










|





|


|

|



|



|







234
235
236
237
238
239
240
241
242
243
244
245
246
247
248

249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
       ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res)
       (try-again)))))

;; client side handler
;;
;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)
  ;; connect-to-server will start a server if needed.
  (let* ((areapath (tt-areapath ttdat))
	 (conn     (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))) ;; looks up conn keyed by dbfname
    (if conn
	;; have connection, call the server
	(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
	  ;; res is (status errmsg result meta)

	  (match res
	    ((status errmsg result meta)
	     (if (list? meta)
		 (let* ((delay-wait (alist-ref 'delay-wait meta)))
		   (if (and (number? delay-wait)
			    (> delay-wait 0))
		       (begin
			 (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds")
			 (thread-sleep! delay-wait)))))
	     (case status
	       ((busy) ;; result will be how long the server wants you to delay
		(let* ((raw-dly  (if (number? result) result 0.1))
		       (dly      (+ raw-dly (/ attemptnum 10)))) ;; (* raw-dly (/ attemptnum 2))))
		  (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, cmd is "cmd", will try again in "dly" seconds. This is attempt "(- attemptnum 1))
		  (thread-sleep! dly)
		  (tt:handler  ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)))
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.")
		(tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn))
		result) ;; (tt:handler  ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else ;; did not receive properly formated result
	     (if (not res) ;; tt:send-receive telling us that communication failed
		 (let* ((host    (tt-conn-host conn))
			(port    (tt-conn-port conn))
			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
			(pid     (tt-conn-pid  conn))
                        ;;(servinf (tt-conn-servinf-file conn))) 
			(servinf (tt-servinf-file ttdat))) ;; (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) ;; TODO, use (server:get-servinfo-dir areapath)
		   (hash-table-set! (tt-conns ttdat) dbfname #f) ;; clear out the conn for this dbfname to force finding new server
		   (if (and servinf (file-exists? servinf))
		       (begin
			 (if (< attemptnum 10)
			     (begin
			       (thread-sleep! 0.5)
			       (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))
			     (begin
			       (debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname)
			       (if (and (file-exists? servinf)
					(> (- (current-seconds)(file-modification-time servinf)) 60))
				   (begin
				     (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.")
				     (handle-exceptions
					 exn
				       #f
				       (delete-file* servinf))
				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))
				   (begin
				     ;; start server - addressed in client-connect-to-server
				     ;; delay        - addressed in client-connect-to-server
				     ;; try again
				     (thread-sleep! 0.25) ;; dunno, I think this needs to be here
				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))
				   ))))
		       (begin ;; no server file, delay and try again
			 (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ")
			 (thread-sleep! 0.5)
			 (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))))
		 (begin ;; this case is where res is malformed. Probably should abort
		   (assert #f "FATAL: tt:handler received bad data "res)
		   ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")
		   ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)
		   )))))
	(begin
	  (thread-sleep! 1) ;; no conn yet set up, give it a rest and try again
	  (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)))))

(define (tt:bid-for-servership run-id)
  #f)

;; gets server info and appends path to server file
;; sorts by age, oldest first
;;
462
463
464
465
466
467
468

469
470
471

472
473
474
475
476
477
478
479
;; This is the routine called in megatest.scm to start a server
;;
;; Server viability is checked in keep-running. Blindly start and run here.
;;
(define (tt:start-server areapath run-id dbfname-in handler keys)
  (assert areapath "FATAL: areapath not provided for tt:start-server")
  ;; is there already a server for this dbfile? Then exit.

  (let* ((ttdat   (make-tt areapath: areapath))
	 (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
	 (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead

    (if (> (length servers) 4)
	(begin
	  (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
	  (exit))
	(let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
	  (tt-handler-set! ttdat (handler dbstruct))
	  (let* ((tcp-thread (make-thread
			      (lambda ()







>



>
|







477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
;; This is the routine called in megatest.scm to start a server
;;
;; Server viability is checked in keep-running. Blindly start and run here.
;;
(define (tt:start-server areapath run-id dbfname-in handler keys)
  (assert areapath "FATAL: areapath not provided for tt:start-server")
  ;; is there already a server for this dbfile? Then exit.
  (debug:print 2 *default-log-port* "tt:start-server: " dbfname-in)
  (let* ((ttdat   (make-tt areapath: areapath))
	 (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
	 (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
         (debug:print 0 *default-log-port* "Found " (length servers) " already running for " dbfname)
    (if (> (length servers) 0)
	(begin
	  (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
	  (exit))
	(let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
	  (tt-handler-set! ttdat (handler dbstruct))
	  (let* ((tcp-thread (make-thread
			      (lambda ()
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552



553
554
555
556
557
558
559
  ;; wait for a port before creating the registration file
  ;;
  (let* ((db-locked-in #f)
	 (areapath     (tt-areapath ttdat))
	 (nosyncdbpath (conc areapath"/.mtdb"))
	 (cleanup (lambda ()
		    (if (tt-cleanup-proc ttdat)
			((tt-cleanup-proc ttdat)))
		    (dbfile:with-no-sync-db nosyncdbpath
					    (lambda (db)
					      (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct)))
						(debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname)
						(db:no-sync-del! db dbfname)
						#;(if dbtmpname
						    (delete-file dbtmpname))))))))
    (set! *server-info* ttdat)
    (let loop ((count 0))
      (if (> count 240)
	  (begin
	    (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
	    (exit 1))
	  (if (not (tt-port ttdat)) ;; no connection yet
	      (begin
		(thread-sleep! 0.25)
		(loop (+ count 1))))))
    
    (tt:create-server-registration-file ttdat dbfname)
    ;; now start watching the last-access, if it hasn't been touched
    ;; in over ten seconds we exit
    (thread-sleep! 0.05) ;; any real need for delay here?
    (let loop ()
      (let* ((servers (tt:get-server-info-sorted ttdat dbfname))
	     (ok      (cond



		       ((null? servers) #f) ;; not ok
		       ((equal? (list-ref (car servers) 6) ;; compare the servinfofile
				(tt-servinf-file ttdat))
			(let* ((res (if db-locked-in
					#t
					(let* ((lock-result  ;; this is the primary lock - need to double verify that got it
						(dbfile:with-no-sync-db







|



|

<
|


















>
>
>







537
538
539
540
541
542
543
544
545
546
547
548
549

550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
  ;; wait for a port before creating the registration file
  ;;
  (let* ((db-locked-in #f)
	 (areapath     (tt-areapath ttdat))
	 (nosyncdbpath (conc areapath"/.mtdb"))
	 (cleanup (lambda ()
		    (if (tt-cleanup-proc ttdat)
			((tt-cleanup-proc ttdat))) ;; removes .servinfo file
		    (dbfile:with-no-sync-db nosyncdbpath
					    (lambda (db)
					      (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct)))
						;; (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname)
						(db:no-sync-del! db dbfname)

                                                ))))))
    (set! *server-info* ttdat)
    (let loop ((count 0))
      (if (> count 240)
	  (begin
	    (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
	    (exit 1))
	  (if (not (tt-port ttdat)) ;; no connection yet
	      (begin
		(thread-sleep! 0.25)
		(loop (+ count 1))))))
    
    (tt:create-server-registration-file ttdat dbfname)
    ;; now start watching the last-access, if it hasn't been touched
    ;; in over ten seconds we exit
    (thread-sleep! 0.05) ;; any real need for delay here?
    (let loop ()
      (let* ((servers (tt:get-server-info-sorted ttdat dbfname))
	     (ok      (cond
		       ((not *server-run*)
			(debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.")
			#f)
		       ((null? servers) #f) ;; not ok
		       ((equal? (list-ref (car servers) 6) ;; compare the servinfofile
				(tt-servinf-file ttdat))
			(let* ((res (if db-locked-in
					#t
					(let* ((lock-result  ;; this is the primary lock - need to double verify that got it
						(dbfile:with-no-sync-db
574
575
576
577
578
579
580

581
582
583
584
585
586
587
						(debug:print 0 *default-log-port* "Failed to get server lock for "dbfname)
						#f))))))
			  (if (and res (common:low-noise-print 120 "top server message"))
			      (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for "
						dbfname" on "(tt-host ttdat)":"(tt-port ttdat)))
			  res))
		       (else

			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
			(let* ((leadsrv (car servers)))
			  (match leadsrv
			    ((host port startseconds server-id pid dbfname servinfofile)
			     (let* ((result  (tt:timed-ping host port server-id))
				    (res     (car result))
				    (ping    (cdr result)))







>







593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
						(debug:print 0 *default-log-port* "Failed to get server lock for "dbfname)
						#f))))))
			  (if (and res (common:low-noise-print 120 "top server message"))
			      (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for "
						dbfname" on "(tt-host ttdat)":"(tt-port ttdat)))
			  res))
		       (else
                        ;; wrong servinfo file
			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
			(let* ((leadsrv (car servers)))
			  (match leadsrv
			    ((host port startseconds server-id pid dbfname servinfofile)
			     (let* ((result  (tt:timed-ping host port server-id))
				    (res     (car result))
				    (ping    (cdr result)))
614
615
616
617
618
619
620
621

622
623
624
625
626
627
628
629
	      (cleanup)
	      (exit)))

	(let* ((last-update (dbr:dbstruct-last-update dbstruct))
	       (curr-secs   (current-seconds)))
	  (if (and (eq? (tt-state ttdat) 'running)
		   (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db?
	      (begin

		(set! (file-modification-time (tt-servinf-file ttdat)) (current-seconds))
		((dbr:dbstruct-sync-proc dbstruct) last-update)
		(dbr:dbstruct-last-update-set! dbstruct curr-secs))))
	  
	(if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param))
	    (begin
	      (thread-sleep! 5)
	      (loop)))))







|
>
|







634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
	      (cleanup)
	      (exit)))

	(let* ((last-update (dbr:dbstruct-last-update dbstruct))
	       (curr-secs   (current-seconds)))
	  (if (and (eq? (tt-state ttdat) 'running)
		   (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db?
	      (let* ((sinfo-file (tt-servinf-file ttdat)))
		;; (debug:print 0 *default-log-port* "INFO: touching "sinfo-file)
		(set! (file-modification-time sinfo-file) (current-seconds))
		((dbr:dbstruct-sync-proc dbstruct) last-update)
		(dbr:dbstruct-last-update-set! dbstruct curr-secs))))
	  
	(if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param))
	    (begin
	      (thread-sleep! 5)
	      (loop)))))
702
703
704
705
706
707
708
709















710

711
712
713
714
715
716
717
718
719
720
721
722
723
724
725

726
727
728
729
730
731
732
;; find valid server
;; get servers listed, last part of name must match :<dbfname>
;; if more than one, wait one second and look again
;; future: ping oldest, if alive remove other :<dbfname> files
;;
(define (tt:find-server areapath dbfname)
  (let* ((servdir  (tt:get-servinfo-dir areapath))
	 (sfiles   (glob (conc servdir"/*:"dbfname))))















    sfiles))


;; given a path to a server info file return: host port startseconds server-id pid dbfname logf
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
;;
(define (tt:server-get-info logf)
  (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx    (regexp "^SERVER: dbprep"))
        (dbprep-found 0)
	(bad-dat      (list #f #f #f #f #f #f logf)))
     (let ((fdat     (handle-exceptions
			 exn
		       (begin
			 ;; WARNING: this is potentially dangerous to blanket ignore the errors
			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn))

			 '()) ;; no idea what went wrong, call it a bad server, return empty list
		       (with-input-from-file logf read-lines))))
       (if (null? fdat) ;; bad data, return bad-dat
	   bad-dat
	   (let loop ((inl  (car fdat))
		      (tail (cdr fdat))
		      (lnum 0))







|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>













|
|
>







723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
;; find valid server
;; get servers listed, last part of name must match :<dbfname>
;; if more than one, wait one second and look again
;; future: ping oldest, if alive remove other :<dbfname> files
;;
(define (tt:find-server areapath dbfname)
  (let* ((servdir  (tt:get-servinfo-dir areapath))
	 (sfiles   (glob (conc servdir"/*:"dbfname)))
	 (goodfiles '()))

    ;; filter the files here by looking in processes table (if we are not main.db)
    ;; and or look at the time stamp on the servinfo file, a running server will
    ;; touch the file every minute (again, this will only apply for main.db)
    (for-each (lambda (fname)
		(let* ((age (- (current-seconds)(file-modification-time fname))))
		  (if (> age 200) ;; can't trust it if over twenty seconds old
		      (begin
			(debug:print 0 *default-log-port* "WARNING: removing stale servinfo file "fname)
			(handle-exceptions
			 exn
			 (debug:print 0 *default-log-port* "WARNING: error attempting to remove stale servinfo file "fname)
			 (delete-file fname))) ;; 
		      (set! goodfiles (cons fname goodfiles)))))
	      sfiles)
    goodfiles))

;; given a path to a server info file return: host port startseconds server-id pid dbfname logf
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
;;
(define (tt:server-get-info logf)
  (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx    (regexp "^SERVER: dbprep"))
        (dbprep-found 0)
	(bad-dat      (list #f #f #f #f #f #f logf)))
     (let ((fdat     (handle-exceptions
			 exn
		       (begin
			 ;; BUG, TODO: add err checking, for now blanket ignore the errors?
			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf
					   ", exn="(condition->list exn))
			 '()) ;; no idea what went wrong, call it a bad server, return empty list
		       (with-input-from-file logf read-lines))))
       (if (null? fdat) ;; bad data, return bad-dat
	   bad-dat
	   (let loop ((inl  (car fdat))
		      (tail (cdr fdat))
		      (lnum 0))
748
749
750
751
752
753
754







755
756
757
758
759
760
761
762
763
764
765


766
767

768
769
770
771
772

773
774
775

776
777
778

779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796

797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
			    (string->number pid)
			    dbfname
			    logf))
		     (else
		      (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst)
		      bad-dat)))))))))








;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area
  (assert areapath  "FATAL: tt:server-process-run called without areapath defined.")
  (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
  (assert mtexe     "FATAL: tt:server-process-run called without mtexe defined.")
  ;; mtest -server - -m testsuite:ext-tests -db 6.db
  (let* ((dbfname  (dbmod:run-id->dbfname run-id))


	 (load     (get-normalized-cpu-load))
	 (trying   (length (tt:find-server areapath dbfname)))

	 (nrun     (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
    (cond
     ((> load 2.0)
      (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server.")
      (thread-sleep! 1))

     ((> nrun 100)
      (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
      (thread-sleep! 1))

     ((> trying 4)
      (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
      (thread-sleep! 1))

     (else
      (if (not (file-exists? (conc areapath"/logs")))
	      (create-directory (conc areapath"/logs") #t))
	  (let* ((logfile   (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
		 (cmdln     (conc
			     mtexe
			     " -startdir "areapath
			     " -server - ";; (or target-host "-")
			     " -m testsuite:"testsuite
			     " -db "dbfname ;; (dbmod:run-id->dbfname run-id)
			     " " profile-mode
			     (conc " >> " logfile " 2>&1 &"))))
	    ;; we want the remote server to start in *toppath* so push there
	    ;; (push-directory areapath) ;; use cd in the command line instead
	    (debug:print 0 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
	    ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))

	    (system cmdln)

	    ;; ;; use below to go back to nbfake - nbfake does cause trouble ...
	    ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
	    ;; (setenv "NBFAKE_LOG" logfile)
	    ;; (system (conc "cd "areapath" ; nbfake " cmdln))
	    ;; (unsetenv "NBFAKE_QUIET")
	    ;; (unsetenv "NBFAKE_LOG")
	    
	    ;;(pop-directory)
	    )))))

;;======================================================================
;; tcp connection stuff
;;======================================================================

;; find a port and start tcp-server. This only starts the tcp portion of
;; the server, look at (tt:start-server ...) above for the entry point







>
>
>
>
>
>
>










|
>
>
|
|
>
|
|
|
|
|
>
|
|
|
>
|
|
|
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

|
>
|
|
|
|
|
|
|
|
|







786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
			    (string->number pid)
			    dbfname
			    logf))
		     (else
		      (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst)
		      bad-dat)))))))))

(define *last-server-start* (make-hash-table))

(define (tt:too-recent-server-start dbfname)
  (let* ((last-run-time (hash-table-ref/default *last-server-start* dbfname #f)))
    (and last-run-time
	 (< (- (current-seconds) last-run-time) 5))))
    
;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area
  (assert areapath  "FATAL: tt:server-process-run called without areapath defined.")
  (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
  (assert mtexe     "FATAL: tt:server-process-run called without mtexe defined.")
  ;; mtest -server - -m testsuite:ext-tests -db 6.db
  (let* ((dbfname  (dbmod:run-id->dbfname run-id)))
    (if (tt:too-recent-server-start dbfname)
	#f
	(let* ((load     (get-normalized-cpu-load))
	       (srvrs    (tt:find-server areapath dbfname))
	       (trying   (length srvrs))
	       (nrun     (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
	  (cond
	   ((> load 2.0)
	    (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes")
	    (thread-sleep! 1)
	    #f)
	   ((> nrun 100)
	    (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
	    (thread-sleep! 1)
	    #f)
	   ((> trying 2)
	    (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
	    (thread-sleep! 1)
	    #f)
	   (else
	    (if (not (file-exists? (conc areapath"/logs")))
		(create-directory (conc areapath"/logs") #t))
	    (let* ((logfile   (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
		   (cmdln     (conc
			       mtexe
			       " -startdir "areapath
			       " -server - ";; (or target-host "-")
			       " -m testsuite:"testsuite
			       " -db "dbfname ;; (dbmod:run-id->dbfname run-id)
			       " " profile-mode
			       (conc " >> " logfile " 2>&1 &"))))
	      ;; we want the remote server to start in *toppath* so push there
	      ;; (push-directory areapath) ;; use cd in the command line instead
	      (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
	      ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))

	      (system cmdln)
	      (hash-table-set! *last-server-start* dbfname (current-seconds))
	      ;; ;; use below to go back to nbfake - nbfake does cause trouble ...
	      ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
	      ;; (setenv "NBFAKE_LOG" logfile)
	      ;; (system (conc "cd "areapath" ; nbfake " cmdln))
	      ;; (unsetenv "NBFAKE_QUIET")
	      ;; (unsetenv "NBFAKE_LOG")
	      
	      ;;(pop-directory)
	      #t)))))))

;;======================================================================
;; tcp connection stuff
;;======================================================================

;; find a port and start tcp-server. This only starts the tcp portion of
;; the server, look at (tt:start-server ...) above for the entry point