Megatest

Check-in [b3a83a0cec]
Login
Overview
Comment:Added a little time spread on client starting servers - try to avoid startup storms
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | try-nanomsg
Files: files | file ages | folders
SHA1: b3a83a0cec1ff4307b0218ec05dd90c0b4adba2b
User & Date: matt on 2014-11-29 21:50:06
Other Links: branch diff | manifest | tags
Context
2014-11-29
22:44
Re-enable closing connections if open longer than the server timeout and no accesses Closed-Leaf check-in: cec4ee3511 user: matt tags: try-nanomsg
21:50
Added a little time spread on client starting servers - try to avoid startup storms check-in: b3a83a0cec user: matt tags: try-nanomsg
20:33
http transport with try-nanomsg changes completed first pass Validation 100% check-in: b260c4c5d9 user: matt tags: try-nanomsg
Changes

Modified client.scm from [8f8fe7296f] to [72d1a98b4a].

90
91
92
93
94
95
96



97
98
99

100
101
102
103
104
105
106
107
108
109
110
111
112
		      (hash-table-delete! *runremote* run-id)
		      (tasks:kill-server-run-id run-id)
		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
							   run-id 
							   (tasks:hostinfo-get-interface server-dat)
							   (tasks:hostinfo-get-port      server-dat)
							   " client:setup (server-dat = #t)")



		      (server:try-running run-id)
		      (thread-sleep! 5) ;; give server a little time to start up
		      (client:setup run-id remaining-tries: (- remaining-tries 1)))))

	      (begin    ;; no server registered
		(let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id)))
		  (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
		  (if (< num-available 2)
		      (server:try-running run-id))
		  (thread-sleep! 5) ;; give server a little time to start up
		  (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))

;; 	(let ((host-info (hash-table-ref/default *runremote* run-id #f)))
;; 	  (if host-info ;; this is a bit circular. the host-info *is* the start-res FIXME
;; 	      (let* ((iface     (http-transport:server-dat-get-iface host-info))
;; 		     (port      (http-transport:server-dat-get-port  host-info))
;; 		     (start-res (case *transport-type* 







>
>
>

|
|
>





|







90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
		      (hash-table-delete! *runremote* run-id)
		      (tasks:kill-server-run-id run-id)
		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
							   run-id 
							   (tasks:hostinfo-get-interface server-dat)
							   (tasks:hostinfo-get-port      server-dat)
							   " client:setup (server-dat = #t)")
		      (if (> remaining-tries 8)
			  (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
			  (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time
		      (server:try-running run-id)
		      (thread-sleep! 5)   ;; give server a little time to start up
		      (client:setup run-id remaining-tries: (- remaining-tries 1))
		      )))
	      (begin    ;; no server registered
		(let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id)))
		  (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
		  (if (< num-available 2)
		      (server:try-running run-id))
		  (thread-sleep! (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
		  (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))

;; 	(let ((host-info (hash-table-ref/default *runremote* run-id #f)))
;; 	  (if host-info ;; this is a bit circular. the host-info *is* the start-res FIXME
;; 	      (let* ((iface     (http-transport:server-dat-get-iface host-info))
;; 		     (port      (http-transport:server-dat-get-port  host-info))
;; 		     (start-res (case *transport-type* 

Modified http-transport.scm from [4218ba432d] to [012f18812a].

285
286
287
288
289
290
291
292


293
294
295
296
297
298
299
					   exn
					   (begin
					     (set! success #f)
					     (debug:print 0 "WARNING: failure in with-input-from-request to " fullurl ".")
					     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
					     (hash-table-delete! *runremote* run-id)
					     ;; Killing associated server to allow clean retry.")
					     ;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine.


					     #f)
					   (with-input-from-request ;; was dat
					    fullurl 
					    (list (cons 'key "thekey")
						  (cons 'cmd cmd)
						  (cons 'params sparams))
					    read-string))







|
>
>







285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
					   exn
					   (begin
					     (set! success #f)
					     (debug:print 0 "WARNING: failure in with-input-from-request to " fullurl ".")
					     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
					     (hash-table-delete! *runremote* run-id)
					     ;; Killing associated server to allow clean retry.")
					     (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine?
					     (signal (make-composite-condition
						      (make-property-condition 'commfail 'message "failed to connect to server")))
					     #f)
					   (with-input-from-request ;; was dat
					    fullurl 
					    (list (cons 'key "thekey")
						  (cons 'cmd cmd)
						  (cons 'params sparams))
					    read-string))

Modified rmt.scm from [fbd1a17708] to [df66a53f8b].

85
86
87
88
89
90
91

92

93
94
95
96
97
98
99
  ;; (mutex-lock! *send-receive-mutex*)
  (let* ((run-id          (if rid rid 0))
	 (connection-info (rmt:get-connection-info run-id)))
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (case *transport-type*

			  ((http)(http-transport:client-api-send-receive run-id connection-info cmd params))

			  ((nmsg)(condition-case
				  (nmsg-transport:client-api-send-receive run-id connection-info cmd params)
				  ((timeout)(vector #f "timeout talking to server"))))
			  (else  (exit))))
	       (success (if (and dat (vector? dat)) (vector-ref dat 0) #f))
	       (res     (if (and dat (vector? dat)) (vector-ref dat 1) #f)))
	  (http-transport:server-dat-update-last-access connection-info)







>
|
>







85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
  ;; (mutex-lock! *send-receive-mutex*)
  (let* ((run-id          (if rid rid 0))
	 (connection-info (rmt:get-connection-info run-id)))
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (case *transport-type*
			  ((http)(condition-case
				  (http-transport:client-api-send-receive run-id connection-info cmd params)
				  ((commfail)(vector #f "communications fail"))))
			  ((nmsg)(condition-case
				  (nmsg-transport:client-api-send-receive run-id connection-info cmd params)
				  ((timeout)(vector #f "timeout talking to server"))))
			  (else  (exit))))
	       (success (if (and dat (vector? dat)) (vector-ref dat 0) #f))
	       (res     (if (and dat (vector? dat)) (vector-ref dat 1) #f)))
	  (http-transport:server-dat-update-last-access connection-info)