Megatest

Check-in [c88c8f26e0]
Login
Overview
Comment:Overhauled rpc-transport:launch and rpc-transport:run to account for things introduced to http-transport like inmem db maintenance ; made and used rpc-transport:server-shutdown ; made (more) opinionaltedly encapsulated procedures for task unit
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | rpc-transport
Files: files | file ages | folders
SHA1: c88c8f26e022c48a91a0dd152eb6058e6167d669
User & Date: bjbarcla on 2016-11-02 00:44:01
Other Links: branch diff | manifest | tags
Context
2016-11-02
03:09
made changes to fix tcp-buffer size to 0 so rpc worked. small fight here between http-transport and rpc-transport here... hopefully this patch does the job. check-in: 58d1011733 user: bjbarcla tags: rpc-transport
00:44
Overhauled rpc-transport:launch and rpc-transport:run to account for things introduced to http-transport like inmem db maintenance ; made and used rpc-transport:server-shutdown ; made (more) opinionaltedly encapsulated procedures for task unit check-in: c88c8f26e0 user: bjbarcla tags: rpc-transport
2016-11-01
02:46
picked nit check-in: e9f1cb9e4b user: bjbarcla tags: rpc-transport
Changes

Modified http-transport.scm from [f8800e432b] to [8236b0c25e].

383
384
385
386
387
388
389
390

391
392
393
394
395
396
397
383
384
385
386
387
388
389

390
391
392
393
394
395
396
397







-
+







				    (begin
				      (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server " server-id " for run " run-id)
				      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature")
				      (exit))
				    (loop start-time
					  (equal? sdat last-sdat)
					  sdat)))))))
         (iface       (car server-info))
         (iface       (car server-info)) ;; BB> this represents ip address, not interface (like eth0 as I would expect from the term)
         (port        (cadr server-info))
         (last-access 0)
	 (server-timeout (server:get-timeout)))
    (let loop ((count         0)
	       (server-state 'available)
	       (bad-sync-count 0))

Modified rmt.scm from [ec0b0ba85a] to [fa9d72cfee].

72
73
74
75
76
77
78


79

80
81
82
83
84
85
86
87
88
89
90

91
92
93
94
95
96
97
98

99
100
101
102
103
104
105
72
73
74
75
76
77
78
79
80

81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100

101
102
103
104
105
106
107
108







+
+
-
+











+







-
+







	    #f))))

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected


  ;; clean out old connections
  ;; side-effect: clean out old connections
  ;; (mutex-lock! *db-multi-sync-mutex*)
  (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin
    (for-each 
     (lambda (run-id)
       (let ((connection (hash-table-ref/default *runremote* run-id #f)))
         (if (and (vector? connection)
        	  (< (http-transport:server-dat-get-last-access connection) expire-time)) ;; BB> BBTODO: make this generic, not http transport specific.
             (begin
               (debug:print-info 0 *default-log-port* "Discarding connection to server for run-id " run-id ", too long between accesses")
               (hash-table-delete! *runremote* run-id)))))
     (hash-table-keys *runremote*)))
  
  ;; (mutex-unlock! *db-multi-sync-mutex*)
  ;; (mutex-lock! *send-receive-mutex*)
  (let* ((run-id          (if rid rid 0))
	 (connection-info (rmt:get-connection-info run-id)))
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((transport-type (vector-ref connection-info 6))
	(let* ((transport-type (vector-ref connection-info 6)) ;; BB: assumes all transport-type'-servertdat vector's item 6 ids transport type
               (dat     (case transport-type ;; BB: replaced *transport-type* global with run-id specific transport-type, item 6 in server-info vector which was populated by *-transport:client-connect with >> (vector iface port api-uri api-url api-req (current-seconds) 'http  ) <<
			  ((http)(condition-case
				  (http-transport:client-api-send-receive run-id connection-info cmd params)
				  ((commfail)(vector #f "communications fail"))
				  ((exn)(vector #f "other fail"))))
                          ;;((rpc) (rpc-transport:client-api-send-receive run-id connection-info cmd params)) ;; BB: let us error out for now
			  (else  

Modified rpc-transport.scm from [702db23fbc] to [bd4853520b].

19
20
21
22
23
24
25




26
27
28
29
30
31
32
33
34
35
36
































37
38
39
40
41
42
43
44
45
46
47
48












49
50
51
52



53
54



55
56
57
58
59


60
61
62
63
64





65


66
67







68
69
70
71

72
73
74
75
76



77

78









79
80
81
82


83
84
85
86
87
88
89
90
91





92

93
94
95
96

97
98
99
100
101
102
103
104
105
106
107










108
109


110

111
112

113
114
115
116
117
118
119
120
121
122
123
124
125
126

127






128






129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148















































































































149
150
151
152
153
154
155
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78






79
80
81
82
83
84
85
86
87
88
89
90




91
92
93


94
95
96





97
98





99
100
101
102
103
104
105
106


107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126

127
128
129
130
131
132
133
134
135
136
137
138
139

140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156

157
158
159
160
161
162
163
164
165
166
167
168
169
170
171


172
173
174
175
176
177
178
179
180
181
182
183
184
185

186
187

188



189
190



191




192
193

194
195
196
197
198
199
200
201
202
203
204
205
206




















207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324







+
+
+
+











+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+






-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
+
+
+
-
-
+
+
+
-
-
-
-
-
+
+
-
-
-
-
-
+
+
+
+
+

+
+
-
-
+
+
+
+
+
+
+




+





+
+
+
-
+

+
+
+
+
+
+
+
+
+


-

+
+









+
+
+
+
+
-
+




+









-
-
+
+
+
+
+
+
+
+
+
+


+
+
-
+

-
+
-
-
-


-
-
-

-
-
-
-

+
-
+
+
+
+
+
+

+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







(declare (uses common))
(declare (uses db))
(declare (uses tests))
(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running.

(include "common_records.scm")
(include "db_records.scm")

(define *heartbeat-mutex* (make-mutex))
(define *server-loop-heart-beat* (current-seconds))


;; procstr is the name of the procedure to be called as a string
(define (rpc-transport:autoremote procstr params)
  (handle-exceptions
   exn
   (begin
     (debug:print 1 *default-log-port* "Remote failed for " proc " " params)
     (apply (eval (string->symbol procstr)) params))
   ;; (if *runremote*
   ;;    (apply (eval (string->symbol (conc "remote:" procstr))) params)
   (apply (eval (string->symbol procstr)) params)))

;; retry an operation (depends on srfi-18)
(define (retry-thunk the-thunk #!key (accept-result? (lambda (x) x)) (retries 4) (wait-seconds-between-tries 0.2) (failure-value #f))
  (let loop ((res (the-thunk)) (retries-left retries))
    (cond
     ((accept-result? res) res)
     ((> retries-left 0)
      (thread-sleep! wait-seconds-between-tries)
      (loop (the-thunk) (sub1 retries-left)))
     (else failure-value))))


(define (rpc-transport:server-shutdown server-id rpc:listener #!key (from-on-exit #f))
  (BB> "rpc-transport:server-shutdown entered.")
  (on-exit (lambda () #t)) ;; turn off on-exit stuff
  ;;(tcp-close rpc:listener) ;; gotta exit nicely
  ;;(tasks:bb-server-set-state! server-id "stopped")


  ;; TODO: (low) the following is extraordinaritly slow.  Maybe we don't even need portlogger for rpc anyway??  the exception-based failover when ports are taken is fast!
  ;;(BB> "before plog rel")
  ;;(portlogger:open-run-close portlogger:set-port (rpc:default-server-port) "released")
  
  (set! *time-to-exit* #t)
  (BB> "before db:sync-touched")
  (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t))
  (BB> "before bb-server-delete-record")
  (tasks:bb-server-delete-record server-id " rpc-transport:keep-running complete")
  (BB> "Before (exit)")
  (unless from-on-exit (exit))
  )


;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (rpc-transport:launch run-id)
  (let* ((tdbdat (tasks:open-db)))
    (BB> "rpc-transport:launch fired for run-id="run-id)
    (set! *run-id*   run-id)
    (if (args:get-arg "-daemonize")
        (daemon:ize))
    (if (server:check-if-running run-id)
  (BB> "rpc-transport:launch fired for run-id="run-id)
  (set! *run-id*   run-id)

  ;; send to background if requested
  (when (args:get-arg "-daemonize")
      (daemon:ize)
      (when *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
        (current-error-port *alt-log-file*)
        (current-output-port *alt-log-file*)))

  ;; double check we dont alrady have a running server for this run-id
  (when (server:check-if-running run-id)
        (begin
          (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running")
          (exit 0)))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'rpc))
    (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running")
    (exit 0))

               (remtries  4))
      (if (not server-id)
  ;; let's get a server-id for this server
  ;;   if at first we do not suceed, try 3 more times.
  (let ((server-id (retry-thunk
          (if (> remtries 0)
              (begin
                (thread-sleep! 2)
                (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'rpc)
                      (- remtries 1)))
                    (lambda () (tasks:bb-server-lock-slot run-id 'rpc))
                    retries: 4)))
              (begin
                ;; since we didn't get the server lock we are going to clean up and bail out
                (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
                (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " rpc-transport:launch")))
          (begin
    (when (not server-id) ;; dang we couldn't get a server-id.
      ;; since we didn't get the server lock we are going to clean up and bail out
      (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
      (tasks:bb-server-delete-records-for-this-pid " rpc-transport:launch")
      (exit 1))

    ;; we got a server-id (and a corresponding entry in servers table in globally shared mdb)
    ;; all systems go.  Proceed to setup rpc server.  
            (rpc-transport:run (if (args:get-arg "-server")(args:get-arg "-server") "-") run-id server-id)
            (exit))))))
    (rpc-transport:run
     (if (args:get-arg "-server")
         (args:get-arg "-server")
         "-")
     run-id
     server-id)
    (exit)))

(define *rpc-listener-port* #f)
(define *rpc-listener-port-bind-timestamp* #f)

(define *on-exit-flag #f)
(define (rpc-transport:run hostn run-id server-id)
  (BB> "rpc-transport:run fired for hostn="hostn" run-id="run-id" server-id="server-id)
  (debug:print 2 *default-log-port* "Attempting to start the rpc server ...")
   ;; (trace rpc:publish-procedure!)

  ;;======================================================================
  ;;	  start of publish-procedure section
  ;;======================================================================
  (rpc:publish-procedure! 'server:login server:login)
  (rpc:publish-procedure! 'server:login server:login) ;; this allows client to validate it is the same megatest instance as the server.  No security here, just making sure we're in the right room.
  (rpc:publish-procedure! 'testing (lambda () "Just testing"))

  ;; BB: BBTODO: publish procedure to receive request from client's rpc:send-receive/rpc-transport:client-api-send-receive call
  
  ;;======================================================================
  ;;	  end of publish-procedure section
  ;;======================================================================



  (BB> "flag1")
  (let* ((db              #f)
         (tdbdat          (tasks:open-db))
	 (hostname        (let ((res (get-host-name))) (BB> "hostname="res) res))
         (server-start-time (current-seconds))
         (server-timeout (server:get-timeout))
	 (ipaddrstr       (let* ((ipstr (if (string=? "-" hostn)
					   ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
					   (server:get-best-guess-address hostname)
					   #f))
                                 (res (if ipstr ipstr hostn)))
                            (BB> "ipaddrstr="res)                             
                            res)) ;; hostname))) 
	 (start-port      (let ((res (portlogger:open-run-close portlogger:find-port))) (BB> "start-port="res) res))
	 (link-tree-path  (configf:lookup *configdat* "setup" "linktree"))

         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
         ;; rpc:listener is the tcp-listen result from inside the find-free-port-and-open complex.
         ;;   It is our handle on the listening tcp port
         ;;   We will attach this to our rpc server with rpc:make-server in thread th1 .
	 (rpc:listener    (rpc-transport:find-free-port-and-open start-port))
	 (rpc:listener    (rpc-transport:find-free-port-and-open start-port)) 
	 (th1             (make-thread
			   (lambda ()
			     ((rpc:make-server rpc:listener) #t))
			   "rpc:server"))
         
			   ;; (cute (rpc:make-server rpc:listener) "rpc:server")
			   ;; 'rpc:server))
	 (hostname        (if (string=? "-" hostn)
			      (get-host-name) 
			      hostn))
	 (ipaddrstr       (if (string=? "-" hostn)
			      (server:get-best-guess-address hostname) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
			      #f))
	 (portnum         (let ((res (rpc:default-server-port))) (BB> "rpc:default-server-port="res" rpc-listener-port="*rpc-listener-port*) res))
	 (host:port       (conc (if ipaddrstr ipaddrstr hostname) ":" portnum))
	 (tdb             (tasks:open-db)))
	 (host:port       (conc (if ipaddrstr ipaddrstr hostname) ":" portnum)))

    ;; if rpc found it needed a different port than portlogger provided, keep portlogger in the loop.
    (when (not (equal? start-port portnum))
      (BB> "portlogger proffered "start-port" but rpc grabbed "portnum)
      (portlogger:open-run-close portlogger:set-port start-port "released")
      (portlogger:open-run-close portlogger:take-port portnum))
    ;;============================================================
    ;;  activate thread th1 to attach opened tcp port to rpc server
    ;;=============================================================
    (BB> "Got here before thread start of rpc listener")
    (thread-start! th1)


    (BB> "started thread th1="th1)
    (BB> "started rpc server thread th1="th1)
    (set! db *inmemdb*)
    (tasks:server-set-interface-port 
o    (tasks:bb-server-set-interface-port server-id ipaddrstr portnum)
     (db:delay-if-busy tdbdat)
     server-id 
     ipaddrstr portnum)
    (debug:print 0 *default-log-port* "Server started on " host:port)
    
    ;; (trace rpc:publish-procedure!)
    ;; (rpc:publish-procedure! 'server:login server:login)
    ;; (rpc:publish-procedure! 'testing (lambda () "Just testing"))

    ;;======================================================================
    ;;	  ;; end of publish-procedure section
    ;;======================================================================
    ;;
    (on-exit (lambda ()
               (rpc-transport:server-shutdown server-id rpc:listener from-on-exit: #t)))
	       (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "stopped")))
    
    ;; check again for running servers for this run-id in case one has snuck in since we checked last in rpc-transport:launch
    (if (not (equal? server-id (tasks:bb-server-am-i-the-server? run-id)));; try to ensure no double registering of servers
        (begin ;; i am not the server, another server snuck in and beat this one to the punch
          (tcp-close rpc:listener) ;; gotta exit nicely and free up that tcp port
          (tasks:bb-server-set-state! server-id "collision"))

        (begin ;; i am the server
          ;; setup the in-memory db
          (set! *inmemdb*  (db:setup run-id))
          (db:get-db *inmemdb* run-id)

          ;; let's make it official
    (set! *rpc:listener* rpc:listener)
    (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")
    (set! *inmemdb*  (db:setup run-id))
    ;; if none running or if > 20 seconds since 
    ;; server last used then start shutdown
    (let loop ((count 0))
      (thread-sleep! 5) ;; no need to do this very often
      (let ((numrunning -1)) ;; (db:get-count-tests-running db)))
	(if (or (> numrunning 0)
		(> (+ *last-db-access* 60)(current-seconds)))
	    (begin
	      (debug:print-info 0 *default-log-port* "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*))
	      (loop (+ 1 count)))
	    (begin
	      (debug:print-info 0 *default-log-port* "Starting to shutdown the server side")
	      (open-run-close tasks:server-delete-record tasks:open-db server-id " rpc-transport:try-start-server stop")
	      (thread-sleep! 10)
	      (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*)
	      (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting")
	      ))))))
          (set! *rpc:listener* rpc:listener) 
          (tasks:bb-server-set-state! server-id "running") ;; update our mdb servers entry

          
          
          ;; this let loop will hold open this thread until we want the server to shut down.
          ;;   if no requests received within the last 20 seconds :
          ;;   database hasnt changed in ??
          ;;

          ;; begin new loop
          (let loop ((count          0)
                     (bad-sync-count 0))

            ;; Use this opportunity to sync the inmemdb to db
            (let ((start-time (current-milliseconds))
                  (sync-time  #f)
                  (rem-time   #f))
              ;; inmemdb is a dbstruct
              (condition-case
               (db:sync-touched *inmemdb* *run-id* force-sync: #t)
               ((sync-failed)(cond
                              ((> bad-sync-count 10) ;; time to give up
                               (rpc-transport:server-shutdown server-id rpc:listener))
                              (else ;; (> bad-sync-count 0)  ;; we've had a fail or two, delay and loop
                               (thread-sleep! 5)
                               (loop count (+ bad-sync-count 1)))))
               ((exn)
                (debug:print-error 0 *default-log-port* "error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server")
                (rpc-transport:server-shutdown server-id rpc:listener)))
              (set! sync-time  (- (current-milliseconds) start-time))
              (set! rem-time (quotient (- 4000 sync-time) 1000))
              (debug:print 4 *default-log-port* "SYNC: time= " sync-time ", rem-time=" rem-time)
              
              (if (and (<= rem-time 4)
                       (> rem-time 0))
                  (thread-sleep! rem-time)
                  (thread-sleep! 4))) ;; fallback for if the math is changed ...
            
            (if (< count 1) ;; 3x3 = 9 secs aprox
                (loop (+ count 1) bad-sync-count))

            ;; BB: don't see how this is possible with RPC
            ;; ;; Check that iface and port have not changed (can happen if server port collides)
            ;; (mutex-lock! *heartbeat-mutex*)
            ;; (set! sdat *server-info*)
            ;; (mutex-unlock! *heartbeat-mutex*)
            
            ;; (if (or (not (equal? sdat (list iface port)))
            ;;         (not server-id))
            ;;     (begin 
            ;;       (debug:print-info 0 *default-log-port* "interface changed, refreshing iface and port info")
            ;;       (set! iface (car sdat))
            ;;       (set! port  (cadr sdat))))
            
            ;; Transfer *last-db-access* to last-access to use in checking that we are still alive
            (mutex-lock! *heartbeat-mutex*)
            (set! last-access *last-db-access*)
            (mutex-unlock! *heartbeat-mutex*)
            
            ;; (debug:print 11 *default-log-port* "last-access=" last-access ", server-timeout=" server-timeout)
            ;;
            ;; no_traffic, no running tests, if server 0, no running servers
            ;;
            ;; (let ((wait-on-running (configf:lookup *configdat* "server" b"wait-on-running"))) ;; wait on running tasks (if not true then exit on time out)
            ;;
            (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600))
                   (adjusted-timeout (if (> hrs-since-start 1)
                                         (- server-timeout (inexact->exact (round (* hrs-since-start 60))))  ;; subtract 60 seconds per hour
                                         server-timeout)))
              (if (common:low-noise-print 120 "server timeout")
                  (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout))
              (if (and *server-run*
                       (> (+ last-access server-timeout)
                          (current-seconds)))
                  (begin
                    (if (common:low-noise-print 120 "server continuing")
                        (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)))
                    ;;
                    ;; Consider implementing some smarts here to re-insert the record or kill self is
                    ;; the db indicates so
                    ;;
                    ;; (if (tasks:server-am-i-the-server? tdb run-id)
                    ;;     (tasks:server-set-state! tdb server-id "running"))
                    ;;
                    (loop 0 bad-sync-count))
                  (rpc-transport:server-shutdown server-id rpc:listener))))
          ;; end new loop
          
          ;; ;; begin old loop
          ;; (let loop ((count 0))
          ;;   (BB> "Found top of rpc-transport:run stay-alive loop.")
          ;;   (thread-sleep! 5) ;; no need to do this very often
          ;;   (let ((numrunning -1)) ;; (db:get-count-tests-running db)))
          ;;     (if (or (> numrunning 0)
          ;;             (> (+ *last-db-access* 60)(current-seconds)))
          ;;         (begin
          ;;           (debug:print-info 0 *default-log-port* "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*))
          ;;           (loop (+ 1 count)))
          ;;         (begin
          ;;           (debug:print-info 0 *default-log-port* "Starting to shutdown the server side")
          ;;           (open-run-close tasks:server-delete-record tasks:open-db server-id " rpc-transport:try-start-server stop")
          ;;           (thread-sleep! 10)
          ;;           (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*)
          ;;           (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting")
          ;;           ))))
          ;; ;; end old loop


          ))))


(define (rpc-transport:find-free-port-and-open port #!key )
  (handle-exceptions
   exn
   (begin
     (print "Failed to bind to port " (rpc:default-server-port) ", trying next port")
     (rpc-transport:find-free-port-and-open (+ port 1)))
193
194
195
196
197
198
199
200

201
202
203
204
205
206
207
362
363
364
365
366
367
368

369
370
371
372
373
374
375
376







-
+







		  (begin
		    (server:try-running run-id)
		    (thread-sleep! 2)
		    (rpc-transport:client-setup run-id (- remtries 1)))))
 	    (let* ((server-db-info (open-run-close tasks:get-server-info tasks:open-db run-id)))
 	      (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	      (if server-db-info
 		  (let* ((iface     (tasks:hostinfo-get-interface server-db-info))
		  (let* ((iface     (tasks:hostinfo-get-interface server-db-info))
 			 (port      (tasks:hostinfo-get-port      server-db-info))
			 (server-dat (list iface port #f #f #f))
 			 (ping-res  ((rpc:procedure 'server:login host port) *toppath*)))
 		    (if start-res
 			(begin
 			  (hash-table-set! *runremote* run-id server-dat)
			  server-dat)

Modified server.scm from [c9c38a4420] to [b3bc3d6537].

258
259
260
261
262
263
264



265
266
267
268
269
270
271
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274







+
+
+







       (if (eof-object? inl)
	   (case (string->symbol res)
	     ((NOREPLY)  #f)
	     ((LOGIN_OK) #t)
	     (else       #f))
	   (loop (read-line) inl))))))


;; Client will call this procedure on the server via the low-level transport (http/rpc/etc) to verify its toppath matches the server's toppath.
;; A true result means client and server are associated with same megatest instance, share the same megatest.config, etc...)  A false result means the client should not talk to this server.
(define (server:login toppath)
  (lambda (toppath)
    (set! *last-db-access* (current-seconds))
    (if (equal? *toppath* toppath)
	(begin
	  ;; (debug:print-info 2 *default-log-port* "login successful")
	  #t)

Modified tasks.scm from [c4bf88d4c8] to [afe54500ce].

289
290
291
292
293
294
295




296
297
298
299
300
301
302
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306







+
+
+
+







		 (remtries 100))
	(if (member port used-ports)
	    (if (> remtries 0)
		(loop (get-rand-port)(- remtries 1))
		(get-rand-port))
	    port))))))

;; there can be multiple servers spawned for the same runid.  we want exactly zero or one servers per runid.  The caller is a nascent server.  It wants to know if it should proceed or if it is redundant.  this function chooses a winner and tells me if I am the winner.  Alternative is lots of runaway servers.  Nobody wants that, trust me.
;;
;; algo: get all server info entries for this runid.  Each nascent server will insert an entry for its runid before getting here.  Entries are visible globally.  If current hostname and current processid match first entry, then yes I am the server;  return server-id as my prize for winning.  Otherwise, I am not the server;  return #f.
;;
(define (tasks:server-am-i-the-server? mdb run-id)
  (let* ((all    (tasks:server-get-servers-vying-for-run-id mdb run-id))
	 (first  (if (null? all)
		     #f;; (begin (debug:print-error 0 *default-log-port* "no servers listed, should be at least one by now.") 
		       ;;      (sqlite3:finalize! mdb)
		       ;;      (exit 1))
		     (car (db:get-rows all)))))
331
332
333
334
335
336
337












338
339
340
341
342
343





344
345
346
347
348
349
350
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371







+
+
+
+
+
+
+
+
+
+
+
+






+
+
+
+
+









;; BB> bb opinion - want to push responsibility into api (encapsulation), like waiting if db is busy and finding the db handle in the first place.  why should the caller need to be concerned??  If my opinion carries, we'll remove the bb- and make other needful adjustments.
(define (bb-mdb-inserter mdb-expecting-proc mdbless-args)
  (let ((mdb (db:delay-if-busy (tasks:open-db))))
    (apply mdb-expecting-proc (cons mdb mdbless-args))))

(define (tasks:bb-server-lock-slot . args)
  (bb-mdb-inserter tasks:server-lock-slot args))

(define (tasks:bb-server-set-interface-port . args)
  (bb-mdb-inserter tasks:server-set-interface-port args))

(define (tasks:bb-server-am-i-the-server? . args)
  (bb-mdb-inserter tasks:server-am-i-the-server? args))

(define (tasks:bb-server-set-state! . args)
  (bb-mdb-inserter tasks:server-set-state! args))
  
(define (tasks:bb-get-server-info . args)
  (bb-mdb-inserter tasks:get-server-info args))

(define (tasks:bb-num-in-available-state . args)
  (bb-mdb-inserter tasks:num-in-available-state args))

(define (tasks:bb-server-delete-records-for-this-pid . args)
  (bb-mdb-inserter tasks:server-delete-records-for-this-pid args))

(define (tasks:bb-server-delete-record . args)
  (bb-mdb-inserter tasks:server-delete-record args))


;; BB: renaming tasks:get-server to get-server-info to make clear we aren't creating servers here
(define (tasks:get-server-info mdb run-id #!key (retries 10))
  (let ((res  #f)
	(best #f))
    (handle-exceptions