Megatest

Check-in [1cf31de4df]
Login
Overview
Comment:sync point
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | rpc-transport
Files: files | file ages | folders
SHA1: 1cf31de4df37a06d5df9b2cf671860544560e17d
User & Date: bjbarcla on 2016-11-15 16:37:14
Other Links: branch diff | manifest | tags
Context
2016-11-15
18:14
fixed bug where client stopped working when server needed restart (cached rpc stub was not refreshing when port changed) check-in: cf3341f204 user: bjbarcla tags: rpc-transport
16:37
sync point check-in: 1cf31de4df user: bjbarcla tags: rpc-transport
14:02
wrapped access to *runremote* has with mutes check-in: 3dffa0e4f9 user: bjbarcla tags: rpc-transport
Changes

Modified client.scm from [d92954c0d9] to [65a69d122a].

49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
;;   (case (server:get-transport)
;;     ((rpc)  (rpc:client-connect  iface port))
;;     ((http) (http:client-connect iface port))
;;     ((zmq)  (zmq:client-connect  iface port))
;;     (else   (rpc:client-connect  iface port))))

(define (client:setup run-id #!key (remaining-tries 10))
  (BB> "Entered client:setup with run-id="run-id" and remaining-tries="remaining-tries)
   
  (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries)
  (let* ((server-dat (tasks:bb-get-server-info run-id))
         (transport (if (and server-dat (vector? server-dat)) (string->symbol (tasks:hostinfo-get-transport server-dat)) 'noserver)))
    
    (case transport
      ((noserver) ;; no server registered







|







49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
;;   (case (server:get-transport)
;;     ((rpc)  (rpc:client-connect  iface port))
;;     ((http) (http:client-connect iface port))
;;     ((zmq)  (zmq:client-connect  iface port))
;;     (else   (rpc:client-connect  iface port))))

(define (client:setup run-id #!key (remaining-tries 10))
  ;;(BB> "Entered client:setup with run-id="run-id" and remaining-tries="remaining-tries)
   
  (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries)
  (let* ((server-dat (tasks:bb-get-server-info run-id))
         (transport (if (and server-dat (vector? server-dat)) (string->symbol (tasks:hostinfo-get-transport server-dat)) 'noserver)))
    
    (case transport
      ((noserver) ;; no server registered

Modified common_records.scm from [6bf211fc41] to [c5826de33b].

115
116
117
118
119
120
121





122
123
124
125
126
127
128
       (let* ((this-loc (vector-ref frame 0))
              (this-func (cadr (string-split this-loc " "))))
         (if (equal? this-func "BB>")
             (set! location this-loc))))
     stack)
    (let ((dp-args (append (list 0 *default-log-port* location"   "  ) in-args)))
      (apply debug:print dp-args))))






(define (debug:print-error n e . params)
  ;; normal print
  (if (debug:debug-mode n)
      (with-output-to-port (or e (current-error-port))
	(lambda ()
	  (if *logging*







>
>
>
>
>







115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
       (let* ((this-loc (vector-ref frame 0))
              (this-func (cadr (string-split this-loc " "))))
         (if (equal? this-func "BB>")
             (set! location this-loc))))
     stack)
    (let ((dp-args (append (list 0 *default-log-port* location"   "  ) in-args)))
      (apply debug:print dp-args))))

(define (BB> . in-args)
  (apply print "BB> " in-args)
  "shouldn't do anything")


(define (debug:print-error n e . params)
  ;; normal print
  (if (debug:debug-mode n)
      (with-output-to-port (or e (current-error-port))
	(lambda ()
	  (if *logging*

Modified http-transport.scm from [05b9c5420a] to [e7487d8749].

337
338
339
340
341
342
343

344
345
346
347
348
349
350
      (conc "http://" 
	    (http-transport:server-dat-get-iface vec)
	    ":"
	    (http-transport:server-dat-get-port  vec))
      #f))

(define (http-transport:server-dat-update-last-access vec)

  (if (vector? vec)
      (vector-set! vec 5 (current-seconds))
      (begin
	(print-call-chain (current-error-port))
	(debug:print-error 0 *default-log-port* "call to http-transport:server-dat-update-last-access with non-vector!!"))))

;;







>







337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
      (conc "http://" 
	    (http-transport:server-dat-get-iface vec)
	    ":"
	    (http-transport:server-dat-get-port  vec))
      #f))

(define (http-transport:server-dat-update-last-access vec)
  (BB> "entered http-transport:server-dat-update-last-access vec="vec)
  (if (vector? vec)
      (vector-set! vec 5 (current-seconds))
      (begin
	(print-call-chain (current-error-port))
	(debug:print-error 0 *default-log-port* "call to http-transport:server-dat-update-last-access with non-vector!!"))))

;;

Modified rmt.scm from [117ca0eaf3] to [eac4b98e46].

111
112
113
114
115
116
117



118
119

120
121
122
123
124
125
126
        ;; otherwise pick the global default as preference. (set in common.scm)
        *transport-type*)))

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;



(define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected
  ;; side-effect: clean out old connections

  (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin
    (for-each 
     (lambda (run-id)
       (let ((connection (rmt:get-cinfo run-id)))
         (if (and (vector? connection)
        	  (< (http-transport:server-dat-get-last-access connection) expire-time)) ;; BB> BBTODO: make this generic, not http transport specific.
             (begin







>
>
>


>







111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
        ;; otherwise pick the global default as preference. (set in common.scm)
        *transport-type*)))

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;

(define *rmt:srmutex* (make-mutex))

(define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected
  ;; side-effect: clean out old connections
  (mutex-lock! *rmt:srmutex*)
  (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin
    (for-each 
     (lambda (run-id)
       (let ((connection (rmt:get-cinfo run-id)))
         (if (and (vector? connection)
        	  (< (http-transport:server-dat-get-last-access connection) expire-time)) ;; BB> BBTODO: make this generic, not http transport specific.
             (begin
134
135
136
137
138
139
140


141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158

159
160
161
162
163
164
165
166
167
168
169
170
171

172
173
174
175
176
177
178
    (if connection-info
	;; use the server if have connection info
	(let* ((transport-type (rmt:run-id->transport-type run-id))
               
               ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
               ;;  Here, we make request to remote server
               ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


               (dat     (case transport-type ;; BB: replaced *transport-type* global with run-id specific transport-type
			  ((http)(condition-case
				  (http-transport:client-api-send-receive run-id connection-info cmd params)
				  ((commfail)(vector #f "communications fail"))
				  ((exn)(vector #f "other fail"))))
                          ((rpc) (rpc-transport:client-api-send-receive run-id connection-info cmd params)) ;; BB: let us error out for now
			  (else  
                           (debug:print-error 0 *default-log-port* "(1) Transport [" transport-type
                                              "] specified for run-id [" run-id
                                              "] is not implemented in rmt:send-receive.  Cannot proceed." (symbol? transport-type))
                           (vector #f (conc "transport ["transport-type"] unimplemented")))))
               
               
	       (success (if (vector? dat) (vector-ref dat 0) #f))
	       (res     (if (vector? dat) (vector-ref dat 1) #f)))
	  (if (vector? connection-info)(http-transport:server-dat-update-last-access connection-info)) ;; BB> BBTODO: make this generic, not http transport specific.
	  (if success
	      (begin

		;; (mutex-unlock! *send-receive-mutex*)
		(case transport-type 
		  ((http rpc) res) ;; (db:string->obj res))
                  (else
                   (debug:print-error 0 *default-log-port* "(2) Transport [" transport-type
                                      "] specified for run-id [" run-id
                                      "] is not implemented in rmt:send-receive.  Cannot proceed. Also unexpected since this branch follows success which would follow a suported transport...")
                   #f)
                  )) ;; (vector-ref res 1)))
              
              ;; no success...
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 *default-log-port* "WARNING: Communication failed, trying call to rmt:send-receive again.")

                (case transport-type
                  
                  ((http rpc)
                   (hash-table-delete! *runremote* run-id) ;; don't keep using the same connection
                   ;; NOTE: killing server causes this process to block forever. No idea why. Dec 2. 
                   ;; (if (eq? (modulo attemptnum 5) 0)
                   ;;     (tasks:kill-server-run-id run-id tag: "api-send-receive-failed"))







>
>
|
|
|
|
|
|
|
|
|
|
|







>













>







138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
    (if connection-info
	;; use the server if have connection info
	(let* ((transport-type (rmt:run-id->transport-type run-id))
               
               ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
               ;;  Here, we make request to remote server
               ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
               (dat     (begin
                          
                          (case transport-type ;; BB: replaced *transport-type* global with run-id specific transport-type
                            ((http)(condition-case
                                    (http-transport:client-api-send-receive run-id connection-info cmd params) 
                                    ((commfail)(vector #f "communications fail"))
                                    ((exn)(vector #f "other fail"))))
                            ((rpc) (rpc-transport:client-api-send-receive run-id connection-info cmd params)) ;; BB: let us error out for now
                            (else  
                             (debug:print-error 0 *default-log-port* "(1) Transport [" transport-type
                                                "] specified for run-id [" run-id
                                                "] is not implemented in rmt:send-receive.  Cannot proceed." (symbol? transport-type))
                             (vector #f (conc "transport ["transport-type"] unimplemented"))))))
               
               
	       (success (if (vector? dat) (vector-ref dat 0) #f))
	       (res     (if (vector? dat) (vector-ref dat 1) #f)))
	  (if (vector? connection-info)(http-transport:server-dat-update-last-access connection-info)) ;; BB> BBTODO: make this generic, not http transport specific.
	  (if success
	      (begin
                (mutex-unlock! *rmt:srmutex*)
		;; (mutex-unlock! *send-receive-mutex*)
		(case transport-type 
		  ((http rpc) res) ;; (db:string->obj res))
                  (else
                   (debug:print-error 0 *default-log-port* "(2) Transport [" transport-type
                                      "] specified for run-id [" run-id
                                      "] is not implemented in rmt:send-receive.  Cannot proceed. Also unexpected since this branch follows success which would follow a suported transport...")
                   #f)
                  )) ;; (vector-ref res 1)))
              
              ;; no success...
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 *default-log-port* "WARNING: Communication failed, trying call to rmt:send-receive again.")
                (mutex-unlock! *rmt:srmutex*)
                (case transport-type
                  
                  ((http rpc)
                   (hash-table-delete! *runremote* run-id) ;; don't keep using the same connection
                   ;; NOTE: killing server causes this process to block forever. No idea why. Dec 2. 
                   ;; (if (eq? (modulo attemptnum 5) 0)
                   ;;     (tasks:kill-server-run-id run-id tag: "api-send-receive-failed"))
186
187
188
189
190
191
192
193
194
195
196
197
198
199

200
201
202
203
204
205
206
                   ;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
                   ;; (thread-sleep! 2)
                   (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))
                  (else
                   (debug:print-error 0 *default-log-port* "(3) Transport [" transport-type
                                      "] specified for run-id [" run-id
                                      "] is not implemented in rmt:send-receive.  Cannot proceed.")
                   #f)))))
        
	;; no connection info; try to start a server
	;;
	;; Note: The tasks db was checked for a server in starting mode in the rmt:get-connection-info call
	;;
        (begin

          (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
          (thread-sleep! (random 5)) ;; give some time to settle and minimize collison?
          (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))
  

(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)







|






>







194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
                   ;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
                   ;; (thread-sleep! 2)
                   (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))
                  (else
                   (debug:print-error 0 *default-log-port* "(3) Transport [" transport-type
                                      "] specified for run-id [" run-id
                                      "] is not implemented in rmt:send-receive.  Cannot proceed.")
                   (exit 1))))))
        
	;; no connection info; try to start a server
	;;
	;; Note: The tasks db was checked for a server in starting mode in the rmt:get-connection-info call
	;;
        (begin
          (mutex-unlock! *rmt:srmutex*)
          (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
          (thread-sleep! (random 5)) ;; give some time to settle and minimize collison?
          (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))
  

(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)

Modified rpc-transport.scm from [6a9546f520] to [d50c6c50d7].

253
254
255
256
257
258
259

260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285

286
287
288
289
290
291
292
293
294
295
296
297
298
299
300

301
302
303
304
305
306
307
          (hash-table-set! *api-exec-ht* '(iface . port) res)
          res))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; this client-side procedure makes rpc call to server and returns result
;;
(define (rpc-transport:client-api-send-receive run-id serverdat cmd params #!key (numretries 3))

  (if (not (vector? serverdat))
      (begin
        (BB> "WHAT?? for run-id="run-id", serverdat="serverdat)
        (print-call-chain)
        (exit 1)))
  (let* ((iface (rpc-transport:server-dat-get-iface serverdat))
         (port  (rpc-transport:server-dat-get-port serverdat))
         (res #f)
         (api-exec (rpc-transport:get-api-exec iface port))  
         (send-receive (lambda ()
                         (tcp-buffer-size 0)
                         (set! res (retry-thunk
                                    (lambda ()
                                      (condition-case
                                       ;;(vector #t (run-remote cmd params))
                                       (vector 'success (api-exec cmd params))
                                       [x (exn i/o net) (vector 'comms-fail (conc "communications fail ["(->string x)"]") x)]
                                       [x () (vector 'other-fail "other fail ["(->string x)"]" x)]))
                                    chatty: #f
                                    accept-result?: (lambda(x)
                                                      (and (vector? x) (vector-ref x 0)))
                                    retries: 4
                                    back-off-factor: 1.5
                                    random-wait: 0.2
                                    retry-delay: 0.1
                                    final-failure-returns-actual: #t))

                         res
                         ))
         (th1 (make-thread send-receive "send-receive"))
         (time-out-reached #f)
         (time-out     (lambda ()
			      (thread-sleep! 45)
                              (set! time-out-reached #t)
                              (thread-terminate! th1)
			      #f))

         (th2 (make-thread time-out     "time out")))
	 (thread-start! th1)
	 (thread-start! th2)
	 (thread-join! th1)
	 (thread-terminate! th2)

	 (debug:print-info 11 *default-log-port* "got res=" res)
	 (if (vector? res)
             (case (vector-ref res 0)
               ((success) (vector #t (vector-ref res 1)))
               ((comms-fail)
                (debug:print 0 *default-log-port* "WARNING: comms failure for rpc request")
                ;;(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))







>


















|







>















>







253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
          (hash-table-set! *api-exec-ht* '(iface . port) res)
          res))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; this client-side procedure makes rpc call to server and returns result
;;
(define (rpc-transport:client-api-send-receive run-id serverdat cmd params #!key (numretries 3))
  (BB> "entered nsport:client-api-send-receive with run-id="run-id " serverdat="serverdat" cmd="cmd" params="params" numretries="numretries)
  (if (not (vector? serverdat))
      (begin
        (BB> "WHAT?? for run-id="run-id", serverdat="serverdat)
        (print-call-chain)
        (exit 1)))
  (let* ((iface (rpc-transport:server-dat-get-iface serverdat))
         (port  (rpc-transport:server-dat-get-port serverdat))
         (res #f)
         (api-exec (rpc-transport:get-api-exec iface port))  
         (send-receive (lambda ()
                         (tcp-buffer-size 0)
                         (set! res (retry-thunk
                                    (lambda ()
                                      (condition-case
                                       ;;(vector #t (run-remote cmd params))
                                       (vector 'success (api-exec cmd params))
                                       [x (exn i/o net) (vector 'comms-fail (conc "communications fail ["(->string x)"]") x)]
                                       [x () (vector 'other-fail "other fail ["(->string x)"]" x)]))
                                    chatty: #t
                                    accept-result?: (lambda(x)
                                                      (and (vector? x) (vector-ref x 0)))
                                    retries: 4
                                    back-off-factor: 1.5
                                    random-wait: 0.2
                                    retry-delay: 0.1
                                    final-failure-returns-actual: #t))
                         (BB> "HEY res="res)
                         res
                         ))
         (th1 (make-thread send-receive "send-receive"))
         (time-out-reached #f)
         (time-out     (lambda ()
			      (thread-sleep! 45)
                              (set! time-out-reached #t)
                              (thread-terminate! th1)
			      #f))

         (th2 (make-thread time-out     "time out")))
	 (thread-start! th1)
	 (thread-start! th2)
	 (thread-join! th1)
	 (thread-terminate! th2)
         (BB> "alt got res="res)
	 (debug:print-info 11 *default-log-port* "got res=" res)
	 (if (vector? res)
             (case (vector-ref res 0)
               ((success) (vector #t (vector-ref res 1)))
               ((comms-fail)
                (debug:print 0 *default-log-port* "WARNING: comms failure for rpc request")
                ;;(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
583
584
585
586
587
588
589
590
591
          (tasks:bb-server-force-clean-run-record  run-id iface port
                                                   " rpc-transport:client-setup (server-dat = #t)")
          (if (> remtries 2)
              (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
              (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time
          (server:try-running run-id)
          (thread-sleep! 5)   ;; give server a little time to start up
          (client:setup run-id remaining-tries: (sub1 remtries))
          " rpc-transport:client-setup (server-dat = #t)"))))







|
<
586
587
588
589
590
591
592
593

          (tasks:bb-server-force-clean-run-record  run-id iface port
                                                   " rpc-transport:client-setup (server-dat = #t)")
          (if (> remtries 2)
              (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
              (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time
          (server:try-running run-id)
          (thread-sleep! 5)   ;; give server a little time to start up
          (client:setup run-id remaining-tries: (sub1 remtries))))))