Megatest

Check-in [677b6ef8e8]
Login
Overview
Comment:server start smooth, but initial data load to inmem broken.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80-tcp-inmem
Files: files | file ages | folders
SHA1: 677b6ef8e8e338f699d7b7bca2fa6077538e6724
User & Date: matt on 2023-02-21 11:39:16
Other Links: branch diff | manifest | tags
Context
2023-02-21
15:40
wip check-in: b51df9a459 user: matt tags: v1.80-tcp-inmem
11:39
server start smooth, but initial data load to inmem broken. check-in: 677b6ef8e8 user: matt tags: v1.80-tcp-inmem
2023-02-20
20:05
Hacky solution for no-sync db check-in: b8f2578046 user: matt tags: v1.80-tcp-inmem
Changes

Modified api.scm from [2d5deac3b3] to [a4804c006e].

145
146
147
148
149
150
151
152
153
154
155
156
157
158
159

    ;; TASKS
    tasks-add
    tasks-set-state-given-param-key
    ))

(define *db-write-mutexes* (make-hash-table))

;; These are called by the server on recipt of /api calls
;;    - keep it simple, only return the actual result of the call, i.e. no meta info here
;;
;;    - returns #( flag result )
;;
(define (api:execute-requests dbstruct dat)
  (db:open-no-sync-db) ;; sets *no-sync-db*







|







145
146
147
148
149
150
151
152
153
154
155
156
157
158
159

    ;; TASKS
    tasks-add
    tasks-set-state-given-param-key
    ))

(define *db-write-mutexes* (make-hash-table))
(define *server-signature* #f)
;; These are called by the server on recipt of /api calls
;;    - keep it simple, only return the actual result of the call, i.e. no meta info here
;;
;;    - returns #( flag result )
;;
(define (api:execute-requests dbstruct dat)
  (db:open-no-sync-db) ;; sets *no-sync-db*
229
230
231
232
233
234
235
236

237
238



239
240




241

242
243
244
245
246
247
248
249
250
251
252
253

254
255
256
257
258
259

260
261
262
263
264
265
266
267
              #;(common:telemetry-log (conc "api-out:"(->string cmd))
              payload: `((params . ,params)
              (ok-res . #f)))
              (vector #t res))))))))

;; indat is (cmd run-id params meta)
;;
;; WARNING: Do not print anything in this function as it reads/writes to current in/out port

;;
(define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params)



  (lambda ()
    (let* ((indat (deserialize)))




      (set! *api-process-request-count* (+ *api-process-request-count* 1))

      (match indat
	((cmd run-id params meta)
	 (let* ((status  (cond
			  ;; turn off busy throttling while trying to get things stable
			  ;; ((> *api-process-request-count* 50) 'busy)
			  ;; ((> *api-process-request-count* 25) 'loaded)
			  (else 'ok)))
		(errmsg  (case status
			   ((busy)   (conc "Server overloaded, "*api-process-request-count*" threads in flight"))
			   ((loaded) (conc "Server loaded, "*api-process-request-count*" threads in flight"))
			   (else     #f)))
		(result  (case status

			   ((busy loaded) #f)
			   (else
			    (case cmd
			      ((ping) (tt:mk-signature *toppath*))
			      (else
			       (api:dispatch-request dbstruct cmd run-id params))))))

		(payload (list status errmsg result '())))
	   (set! *api-process-request-count* (- *api-process-request-count* 1))
	   (serialize payload)))
	(else
	 (assert #f "FATAL: failed to deserialize indat "indat))))))
       

(define (api:dispatch-request dbstruct cmd run-id params)







|
>


>
>
>

|
>
>
>
>
|
>



<
|
|


|
|


>
|


|


>
|







229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253

254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
              #;(common:telemetry-log (conc "api-out:"(->string cmd))
              payload: `((params . ,params)
              (ok-res . #f)))
              (vector #t res))))))))

;; indat is (cmd run-id params meta)
;;
;; WARNING: Do not print anything in the lambda of this function as it
;;          reads/writes to current in/out port
;;
(define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params)
  (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.")
  (if (not *server-signature*)
      (set! *server-signature* (tt:mk-signature *toppath*)))
  (lambda ()
    (let* ((indat      (deserialize))
	   (newcount   (+ *api-process-request-count* 1))
	   (delay-wait (if (> newcount 10)
			   (- newcount 10)
			   0)))
      (set! *api-process-request-count* newcount)
      (set! *db-last-access* (current-seconds))
      (match indat
	((cmd run-id params meta)
	 (let* ((status  (cond

			  ;; ((> newcount 30) 'busy)
			  ;; ((> newcount 15) 'loaded)
			  (else 'ok)))
		(errmsg  (case status
			   ((busy)   (conc "Server overloaded, "newcount" threads in flight"))
			   ((loaded) (conc "Server loaded, "newcount" threads in flight"))
			   (else     #f)))
		(result  (case status
			   ((busy)  (- newcount 29))
			   ((loaded) #f)
			   (else
			    (case cmd
			      ((ping) *server-signature*)
			      (else
			       (api:dispatch-request dbstruct cmd run-id params))))))
		(meta   `((wait . ,delay-wait)))
		(payload (list status errmsg result meta)))
	   (set! *api-process-request-count* (- *api-process-request-count* 1))
	   (serialize payload)))
	(else
	 (assert #f "FATAL: failed to deserialize indat "indat))))))
       

(define (api:dispatch-request dbstruct cmd run-id params)

Modified dbfile.scm from [1ed5efa652] to [8707a0c314].

102
103
104
105
106
107
108

109
110
111
112
113
114
115
(define *db-sync-in-progress* #f)
(define *db-with-db-mutex*    (make-mutex))
(define *max-api-process-requests* 0)
(define *api-process-request-count* 0)
(define *db-write-access*     #t)
(define *db-last-sync*        0)                 ;; last time the sync to megatest.db happened
(define *db-multi-sync-mutex* (make-mutex))      ;; protect access to *db-sync-in-progress*, *db-last-sync*


(define (db:generic-error-printout exn . message)
  (print-call-chain (current-error-port))
  (apply dbfile:print-err message)
  (dbfile:print-err
    ", error: "     ((condition-property-accessor 'exn 'message)   exn)
    ", arguments: " ((condition-property-accessor 'exn 'arguments) exn)







>







102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
(define *db-sync-in-progress* #f)
(define *db-with-db-mutex*    (make-mutex))
(define *max-api-process-requests* 0)
(define *api-process-request-count* 0)
(define *db-write-access*     #t)
(define *db-last-sync*        0)                 ;; last time the sync to megatest.db happened
(define *db-multi-sync-mutex* (make-mutex))      ;; protect access to *db-sync-in-progress*, *db-last-sync*
(define *db-last-access*      (current-seconds))

(define (db:generic-error-printout exn . message)
  (print-call-chain (current-error-port))
  (apply dbfile:print-err message)
  (dbfile:print-err
    ", error: "     ((condition-property-accessor 'exn 'message)   exn)
    ", arguments: " ((condition-property-accessor 'exn 'arguments) exn)

Modified dbmod.scm from [e83bf190a4] to [dd9c226075].

69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
	(let* ((last-update (dbr:dbstruct-last-update dbstruct))
	       (curr-secs   (current-seconds)))
	  (if (> (- curr-secs last-update) 2)
	      (begin
		((dbr:dbstruct-sync-proc dbstruct) last-update)
		(dbr:dbstruct-last-update-set! dbstruct curr-secs)))
	  dbstruct)
	(let* ((newdbstruct (dbmod:open-dbmoddb areapath run-id init-proc keys syncdir: 'fromdisk)))
	  (hash-table-set! *dbmod:nfs-db-handles* dbfname newdbstruct)
	  newdbstruct))))

;;======================================================================
;; The inmem one-db file per server method goes in here
;;======================================================================








|







69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
	(let* ((last-update (dbr:dbstruct-last-update dbstruct))
	       (curr-secs   (current-seconds)))
	  (if (> (- curr-secs last-update) 2)
	      (begin
		((dbr:dbstruct-sync-proc dbstruct) last-update)
		(dbr:dbstruct-last-update-set! dbstruct curr-secs)))
	  dbstruct)
	(let* ((newdbstruct (dbmod:open-dbmoddb areapath run-id dbfname init-proc keys syncdir: 'fromdisk)))
	  (hash-table-set! *dbmod:nfs-db-handles* dbfname newdbstruct)
	  newdbstruct))))

;;======================================================================
;; The inmem one-db file per server method goes in here
;;======================================================================

108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
;;
;; Updates fields in dbstruct
;; Returns dbstruct
;;
;; * This routine creates the db if not found
;; * Probably can get rid of the dbstruct-in
;; 
(define (dbmod:open-dbmoddb areapath run-id init-proc keys
			    #!key (dbstruct-in #f)
			    (syncdir 'todisk))
  (let* ((dbstruct     (or dbstruct-in (make-dbr:dbstruct areapath: areapath)))
	 (dbfname      (dbmod:run-id->dbfname run-id))
	 (dbpath       (dbmod:get-dbdir dbstruct run-id))             ;; directory where all the .db files are kept
	 (dbfullname   (dbmod:run-id->full-dbfname dbstruct run-id))
	 (dbexists     (file-exists? dbfullname))
	 (inmem        (dbmod:open-inmem-db init-proc))
	 (write-access (file-write-access? dbpath))
	 (db           (dbfile:with-simple-file-lock
			(conc dbfullname".lock")







|



|







108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
;;
;; Updates fields in dbstruct
;; Returns dbstruct
;;
;; * This routine creates the db if not found
;; * Probably can get rid of the dbstruct-in
;; 
(define (dbmod:open-dbmoddb areapath run-id dbfname-in init-proc keys
			    #!key (dbstruct-in #f)
			    (syncdir 'todisk))
  (let* ((dbstruct     (or dbstruct-in (make-dbr:dbstruct areapath: areapath)))
	 (dbfname      (or dbfname-in (dbmod:run-id->dbfname run-id)))
	 (dbpath       (dbmod:get-dbdir dbstruct run-id))             ;; directory where all the .db files are kept
	 (dbfullname   (dbmod:run-id->full-dbfname dbstruct run-id))
	 (dbexists     (file-exists? dbfullname))
	 (inmem        (dbmod:open-inmem-db init-proc))
	 (write-access (file-write-access? dbpath))
	 (db           (dbfile:with-simple-file-lock
			(conc dbfullname".lock")

Modified megatest.scm from [3813f2fa2d] to [839a9089d2].

601
602
603
604
605
606
607

608
609
610
611
612
613
614
615
616
;;
(if (or (args:get-arg "-log")(args:get-arg "-server")) ;; redirect the log always when a server
    (handle-exceptions
	exn
      (begin
	(print "ERROR: Failed to switch to log output. " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
      (let* ((tl   (or (args:get-arg "-log")(launch:setup)))   ;; run launch:setup if -server, ensure we do NOT run launch:setup if -log specified

	     (logf (or (args:get-arg "-log") ;; use -log unless we are a server, then craft a logfile name
		       (conc tl "/logs/server-" (current-process-id) "-" (get-host-name) ".log")))
	     (oup  (open-logfile logf)))
	(if (not (args:get-arg "-log"))
	    (hash-table-set! args:arg-hash "-log" logf)) ;; fake out future queries of -log
	(debug:print-info 0 *default-log-port* "Sending log output to " logf)
	(set! *default-log-port* oup))))

(if (or (args:get-arg "-h")







>

|







601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
;;
(if (or (args:get-arg "-log")(args:get-arg "-server")) ;; redirect the log always when a server
    (handle-exceptions
	exn
      (begin
	(print "ERROR: Failed to switch to log output. " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
      (let* ((tl   (or (args:get-arg "-log")(launch:setup)))   ;; run launch:setup if -server, ensure we do NOT run launch:setup if -log specified
	     (dbname (args:get-arg "-db"))   ;; for the server logfile name
	     (logf (or (args:get-arg "-log") ;; use -log unless we are a server, then craft a logfile name
		       (conc tl "/logs/server-"(or dbname "unk")"-"(current-process-id) "-" (get-host-name) ".log")))
	     (oup  (open-logfile logf)))
	(if (not (args:get-arg "-log"))
	    (hash-table-set! args:arg-hash "-log" logf)) ;; fake out future queries of -log
	(debug:print-info 0 *default-log-port* "Sending log output to " logf)
	(set! *default-log-port* oup))))

(if (or (args:get-arg "-h")
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
;; Start the server - can be done in conjunction with -runall or -runtests (one day...)
;;   we start the server if not running else start the client thread
;;======================================================================

;; Server? Start up here.
;;
(if (args:get-arg "-server")
    (let* ((run-id     (args:get-arg "-run-id"))
	   (dbfname    (args:get-arg "-db"))
	   (tl         (launch:setup))
	   (keys       (keys:config-get-fields *configdat*)))
      (case (rmt:transport-mode)
	((http)(http-transport:launch))
	((tcp)
	 (debug:print 0 *default-log-port* "INFO: Running using tcp method.")
	 (if run-id
	     (tt:start-server tl run-id dbfname api:tcp-dispatch-request-make-handler keys)
	     (begin
	       (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -run-id is required.")
	       (exit 1))))
	(else (debug:print 0 *default-log-port* "ERROR: rmt:transport-mode value not recognised "(rmt:transport-mode))))
      (set! *didsomething* #t)))

;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to
;; a specific Megatest area. Detail are being hashed out and this may change.
;;







|







|
|

|







934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
;; Start the server - can be done in conjunction with -runall or -runtests (one day...)
;;   we start the server if not running else start the client thread
;;======================================================================

;; Server? Start up here.
;;
(if (args:get-arg "-server")
    (let* (;; (run-id     (args:get-arg "-run-id"))
	   (dbfname    (args:get-arg "-db"))
	   (tl         (launch:setup))
	   (keys       (keys:config-get-fields *configdat*)))
      (case (rmt:transport-mode)
	((http)(http-transport:launch))
	((tcp)
	 (debug:print 0 *default-log-port* "INFO: Running using tcp method.")
	 (if dbfname
	     (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys)
	     (begin
	       (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.")
	       (exit 1))))
	(else (debug:print 0 *default-log-port* "ERROR: rmt:transport-mode value not recognised "(rmt:transport-mode))))
      (set! *didsomething* #t)))

;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to
;; a specific Megatest area. Detail are being hashed out and this may change.
;;

Modified tcp-transportmod.scm from [a92ff9f544] to [2a0b975322].

135
136
137
138
139
140
141
142
143
144
145
146
147
148


149
150
151

152
153
154
155
156
157
158
			   pid: pid)))
	       (hash-table-set! (tt-conns ttdat) dbfname conn)
	       ;; verify we can talk to this server
	       (if (tt:ping host port server-id)
		   conn
		   (let* ((curr-secs (current-seconds)))
		     ;; rm the (last server) would go here
		     (if (> (- curr-secs (tt-last-serv-start ttdat)) 30)
			 (begin
			   (tt-last-serv-start-set! ttdat curr-secs)
			   (server-start-proc))) ;; don't try and start server unless 30 sec has gone by since last attempt
		     (thread-sleep! 1)
		     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))
	    (else


	     (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
	     (tt-last-serv-start-set! ttdat (current-seconds))
	     (server-start-proc)

	     (thread-sleep! 1)
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
    
(define (tt:ping host port server-id)
  (let*  ((res (tt:send-receive-direct host port `(ping #f #f #f)))) ;; please send me your server-id
    ;;
    ;; need two threads, one a 5 second timer







|






>
>
|
<
|
>







135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151

152
153
154
155
156
157
158
159
160
			   pid: pid)))
	       (hash-table-set! (tt-conns ttdat) dbfname conn)
	       ;; verify we can talk to this server
	       (if (tt:ping host port server-id)
		   conn
		   (let* ((curr-secs (current-seconds)))
		     ;; rm the (last server) would go here
		     (if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
			 (begin
			   (tt-last-serv-start-set! ttdat curr-secs)
			   (server-start-proc))) ;; don't try and start server unless 30 sec has gone by since last attempt
		     (thread-sleep! 1)
		     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))
	    (else
	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; really do not want to swamp the machine with servers
		 (begin
		   (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)

		   (server-start-proc)
		   (tt-last-serv-start-set! ttdat (current-seconds))))
	     (thread-sleep! 1)
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
    
(define (tt:ping host port server-id)
  (let*  ((res (tt:send-receive-direct host port `(ping #f #f #f)))) ;; please send me your server-id
    ;;
    ;; need two threads, one a 5 second timer
168
169
170
171
172
173
174


175
176
177
178
179
180
181
182
183







184
185
186
187
188
189
190
191
192
193
194
195
196


197


198


199


200
201
202
203
204
205
206
207
	     #f)))
      (else
       ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res)
       #f))))

;; client side handler
;;


(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
  ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
  (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
    (if conn
	;; have connection, call the server
	(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
	  ;; res is (status errmsg result meta)
	  (match res
	    ((status errmsg result meta)







	     (case status
	       ((busy)
		(debug:print 0 *default-log-port* "WARNING: server is overloaded, will try again in few seconds.")
		(thread-sleep! 2)
		(tt:handler  ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server is loaded, will try again in a second.")
		(thread-sleep! 1)
		(tt:handler  ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else
	     (if (not res)


		 (begin ;; let* ((srvfile (tt-conn-servinf-file ))) ;; server likely died


		   (hash-table-set! (tt-conns ttdat) dbfname #f)


		   (debug:print 0 *default-log-port* "INFO: connection to server broken, reconnecting.")


		   (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))
		 (assert #f "FATAL: tt:handler received bad data "res)))))
	(begin
	  (thread-sleep! 1) ;; give it a rest and try again
	  (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))

	;; no conn yet, find and or start and find a server
;; 	(let* ((server (tt:find-server ttdat dbfname)))







>
>









>
>
>
>
>
>
>

|
|
|
|

|
|
|




>
>
|
>
>

>
>
|
>
>
|







170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
	     #f)))
      (else
       ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res)
       #f))))

;; client side handler
;;
;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
  ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
  (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
    (if conn
	;; have connection, call the server
	(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
	  ;; res is (status errmsg result meta)
	  (match res
	    ((status errmsg result meta)
	     (if (list? meta)
		 (let* ((delay-wait (alist-ref 'delay-wait meta)))
		   (if (and (number? delay-wait)
			    (> delay-wait 0))
		       (begin
			 (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds")
			 (thread-sleep! delay-wait)))))
	     (case status
	       ((busy) ;; result will be how long the server wants you to delay
		(debug:print 0 *default-log-port* "WARNING: server is overloaded, will try again in "result" seconds.")
		(thread-sleep! (if (number? result) result 2))
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server is loaded, will try again in a 1/4 second.")
		(thread-sleep! 0.25)
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else
	     (if (not res)
		 (let* ((host    (tt-conn-host conn))
			(port    (tt-conn-port conn))
			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
			(pid     (tt-conn-pid  conn))
			(servinf (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname)))
		   (hash-table-set! (tt-conns ttdat) dbfname #f)
		   (if (file-exists? servinf)
		       (begin
			 (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", attempting to remove servinfo file.")
			 (delete-file* servinf))
		       (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf))
		   (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
		 (assert #f "FATAL: tt:handler received bad data "res)))))
	(begin
	  (thread-sleep! 1) ;; give it a rest and try again
	  (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))

	;; no conn yet, find and or start and find a server
;; 	(let* ((server (tt:find-server ttdat dbfname)))
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
;;
(define (tt:start-server areapath run-id dbfname handler keys)
  (assert areapath "FATAL: areapath not provided for tt:start-server")
  ;; is there already a server for this dbfile? Then exit.
  (let* ((ttdat   (make-tt areapath: areapath)))
    ;; (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
    ;; (if (null? servers)
    (let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id (dbfile:db-init-proc) keys)))
      (tt-handler-set! ttdat (handler dbstruct))
      (let* ((tcp-thread (make-thread
			  (lambda ()
			    (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data
			  "tcp-server-thread"))
	     (run-thread (make-thread
			  (lambda ()







|







308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
;;
(define (tt:start-server areapath run-id dbfname handler keys)
  (assert areapath "FATAL: areapath not provided for tt:start-server")
  ;; is there already a server for this dbfile? Then exit.
  (let* ((ttdat   (make-tt areapath: areapath)))
    ;; (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
    ;; (if (null? servers)
    (let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
      (tt-handler-set! ttdat (handler dbstruct))
      (let* ((tcp-thread (make-thread
			  (lambda ()
			    (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data
			  "tcp-server-thread"))
	     (run-thread (make-thread
			  (lambda ()
321
322
323
324
325
326
327
328

329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365


366
367

368
369
370
371
372
373
374
375



376
377
378
379
380
381
382
    ;; (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
    ;; (exit)))))
    ))

(define (tt:keep-running ttdat dbfname dbstruct)
  ;; verfiy conn for ready
  ;; listener socket has been started by this stage
  (thread-sleep! 1)

  (let* ((cleanup (lambda ()
		    (if (tt-cleanup-proc ttdat)
			((tt-cleanup-proc ttdat))))))
    (let loop ((count 0))
      (if (> count 60)
	  (begin
	    (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
	    (exit 1))
	  (if (not (tt-port ttdat)) ;; no connection yet
	      (let* ((last-update (dbr:dbstruct-last-update dbstruct))
		     (curr-secs   (current-seconds)))
		(if (> (- curr-secs last-update) 3) ;; every 3-4 seconds
		    (begin
		      ((dbr:dbstruct-sync-proc dbstruct) last-update)
		      (dbr:dbstruct-last-update-set! curr-secs)))
		(thread-sleep! 1)
		(loop (+ count 1))))))
    
    (tt:create-server-registration-file ttdat dbfname)
    ;; now start watching the last-access, if it hasn't been touched
    ;; in over ten seconds we exit
    (thread-sleep! 1)
    (let loop ()
      (let* ((servers (tt:get-server-info-sorted ttdat dbfname))
	     (ok      (cond
		       ((null? servers) #f) ;; not ok
		       ((equal? (list-ref (car servers) 6) ;; compare the servinfofile
				(tt-servinf-file ttdat))
			(debug:print-info 0 *default-log-port* "Keep running, I'm the top server.")
			#t)
		       (else
			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
			(let* ((leadsrv (car servers)))
			  (match leadsrv
			    ((host port startseconds server-id pid dbfname servinfofile)
			     (if (tt:ping host port server-id)
				 #f ;; not the server, but all good, want to exit


				 (begin
				   ;; what to do here? remove the offending file?

				   (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile)
				   (delete-file* servinfofile)
				   #t ;; not the server but the server is not reachable
				   )))
			    (else
			     (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv)
			     (assert #f "Bad server record "leadsrv))))))))
	(if (not ok)



	    (begin
	      (cleanup)
	      (exit)))
	
	(if (< (- (current-seconds) (tt-last-access ttdat)) 60)
	    (begin
	      (thread-sleep! 5)







|
>











|



|





|















>
>
|
<
>
|
|
|
|
|


|
>
>
>







340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388

389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
    ;; (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
    ;; (exit)))))
    ))

(define (tt:keep-running ttdat dbfname dbstruct)
  ;; verfiy conn for ready
  ;; listener socket has been started by this stage
  ;; wait for a port before creating the registration file
  ;;
  (let* ((cleanup (lambda ()
		    (if (tt-cleanup-proc ttdat)
			((tt-cleanup-proc ttdat))))))
    (let loop ((count 0))
      (if (> count 60)
	  (begin
	    (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
	    (exit 1))
	  (if (not (tt-port ttdat)) ;; no connection yet
	      (let* ((last-update (dbr:dbstruct-last-update dbstruct))
		     (curr-secs   (current-seconds)))
		(if (> (- curr-secs last-update) 3) ;; every 3-4 seconds update the db? maybe this should be refresh the inmem?
		    (begin
		      ((dbr:dbstruct-sync-proc dbstruct) last-update)
		      (dbr:dbstruct-last-update-set! curr-secs)))
		(thread-sleep! 0.25)
		(loop (+ count 1))))))
    
    (tt:create-server-registration-file ttdat dbfname)
    ;; now start watching the last-access, if it hasn't been touched
    ;; in over ten seconds we exit
    (thread-sleep! 0.05) ;; any real need for delay here?
    (let loop ()
      (let* ((servers (tt:get-server-info-sorted ttdat dbfname))
	     (ok      (cond
		       ((null? servers) #f) ;; not ok
		       ((equal? (list-ref (car servers) 6) ;; compare the servinfofile
				(tt-servinf-file ttdat))
			(debug:print-info 0 *default-log-port* "Keep running, I'm the top server.")
			#t)
		       (else
			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
			(let* ((leadsrv (car servers)))
			  (match leadsrv
			    ((host port startseconds server-id pid dbfname servinfofile)
			     (if (tt:ping host port server-id)
				 #f ;; not the server, but all good, want to exit
				 (if (and (file-exists? servinfofile)
					  (> (- (current-seconds)(file-modification-time servinfofile)) 5))
				     (begin

				       ;; can't ping and file has been on disk 5 seconds, go ahead and try to remove it
				       (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile)
				       (delete-file* servinfofile)
				       #t) ;; not the server but the server is not reachable
				     #t)))
			    (else ;; should never get here
			     (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv)
			     (assert #f "Bad server record "leadsrv))))))))
	(if ok
	    ;; (if (> *api-process-request-count* 0) ;; have requests in flight
	    ;;	(tt-last-access-set! ttdat (current-seconds)))
	    (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access
	    (begin
	      (cleanup)
	      (exit)))
	
	(if (< (- (current-seconds) (tt-last-access ttdat)) 60)
	    (begin
	      (thread-sleep! 5)
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479

480
481

482
483
484
485
486
487
488
489
490
491

492

493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
;;
(define (tt:server-get-info logf)
  (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx    (regexp "^SERVER: dbprep"))
        (dbprep-found 0)
	(bad-dat      (list #f #f #f #f #f #f logf)))
    (handle-exceptions
     exn
     (begin
       ;; WARNING: this is potentially dangerous to blanket ignore the errors
       (if (file-exists? logf)
	   (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn))
       bad-dat) ;; no idea what went wrong, call it a bad server
     (with-input-from-file
	 logf

       (lambda ()
	 (let loop ((inl  (read-line))

		    (lnum 0))
	   (if (not (eof-object? inl))
	       (let ((mlst (string-match server-rx inl))
		     (dbprep (string-match dbprep-rx inl)))
		 (if dbprep (set! dbprep-found 1))
		 (if (not mlst)
		     (if (< lnum 500) ;; give up if more than 500 lines of server log read
			 (loop (read-line)(+ lnum 1))
			 (begin 
                           (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf )

                           bad-dat))

		     (match mlst
			    ((_ host port start server-id pid dbfname)
			     (list host
				   (string->number port)
				   (string->number start)
				   server-id
				   (string->number pid)
				   dbfname
				   logf))
			    (else
			     (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst)
			     bad-dat))))
	       (begin 
		 (if dbprep-found
		     (begin
		       (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time))
		       (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting?
		     (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))))
		 bad-dat))))))))

;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area
  (assert areapath  "FATAL: tt:server-process-run called without areapath defined.")
  (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
  (assert mtexe     "FATAL: tt:server-process-run called without mtexe defined.")
  (let* ((logfile   (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
	 (cmdln     (conc
		     mtexe
		     " -server - ";; (or target-host "-")
		     " -m testsuite:" testsuite
		     " -run-id " (or run-id "main")
		     " -db "  (dbmod:run-id->dbfname run-id)
		     " " profile-mode
		     ))) ;; (conc " >> " logfile " 2>&1 &")))))
    ;; we want the remote server to start in *toppath* so push there
    ;; (push-directory areapath) ;; use cd in the command line instead
    (debug:print 0 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)"...")
    ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))







|
|
|
|
<
|
|
|
<
>
|
|
>
|
<
|
|
|
|
|
|
<
<
>
|
>
|
|
|
|
|
|
|
|
|
|
|
<
<
<
<
<
<
<
|















|







489
490
491
492
493
494
495
496
497
498
499

500
501
502

503
504
505
506
507

508
509
510
511
512
513


514
515
516
517
518
519
520
521
522
523
524
525
526
527







528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
;;
(define (tt:server-get-info logf)
  (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx    (regexp "^SERVER: dbprep"))
        (dbprep-found 0)
	(bad-dat      (list #f #f #f #f #f #f logf)))
     (let ((fdat     (handle-exceptions
			 exn
		       (begin
			 ;; WARNING: this is potentially dangerous to blanket ignore the errors

			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn=" exn)
			 '()) ;; no idea what went wrong, call it a bad server, return empty list
		       (with-input-from-file logf read-lines))))

       (if (null? fdat) ;; bad data, return bad-dat
	   bad-dat
	   (let loop ((inl  (car fdat))
		      (tail (cdr fdat))
		      (lnum 0))

	     (let ((mlst (string-match server-rx inl))
		   (dbprep (string-match dbprep-rx inl)))
	       (if dbprep (set! dbprep-found 1))
	       (if (not mlst)
		   (if (> lnum 500) ;; give up if more than 500 lines of server log read
		       bad-dat


		       (if (null? tail)
			   bad-dat
			   (loop (car tail)(cdr tail)(+ lnum 1))))
		   (match mlst ;; have a not null list
		     ((_ host port start server-id pid dbfname)
		      (list host
			    (string->number port)
			    (string->number start)
			    server-id
			    (string->number pid)
			    dbfname
			    logf))
		     (else
		      (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst)







		      bad-dat)))))))))

;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area
  (assert areapath  "FATAL: tt:server-process-run called without areapath defined.")
  (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
  (assert mtexe     "FATAL: tt:server-process-run called without mtexe defined.")
  (let* ((logfile   (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
	 (cmdln     (conc
		     mtexe
		     " -server - ";; (or target-host "-")
		     " -m testsuite:" testsuite
		     ;; " -run-id " (or run-id "main") ;; NO, we do NOT want to have run id as part of this
		     " -db "  (dbmod:run-id->dbfname run-id)
		     " " profile-mode
		     ))) ;; (conc " >> " logfile " 2>&1 &")))))
    ;; we want the remote server to start in *toppath* so push there
    ;; (push-directory areapath) ;; use cd in the command line instead
    (debug:print 0 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)"...")
    ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
  (let* ((tlsn (tcp-listen port 1000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]])
	 (addr  (tt:get-best-guess-address (get-host-name)))) ;; (get-my-best-address))) ;; (hostinfo-addresses (host-information (current-hostname)))
    (tt-port-set!      uconn port)
    (tt-host-set!      uconn addr)
    (tt-host-port-set! uconn (conc addr":"port))
    (tt-socket-set!    uconn tlsn)
    uconn))



;;======================================================================
;; utils
;;======================================================================

;; Generate a unique signature for this server
(define (tt:mk-signature areapath)







<
<







595
596
597
598
599
600
601


602
603
604
605
606
607
608
  (let* ((tlsn (tcp-listen port 1000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]])
	 (addr  (tt:get-best-guess-address (get-host-name)))) ;; (get-my-best-address))) ;; (hostinfo-addresses (host-information (current-hostname)))
    (tt-port-set!      uconn port)
    (tt-host-set!      uconn addr)
    (tt-host-port-set! uconn (conc addr":"port))
    (tt-socket-set!    uconn tlsn)
    uconn))



;;======================================================================
;; utils
;;======================================================================

;; Generate a unique signature for this server
(define (tt:mk-signature areapath)