Megatest

Check-in [b3a83a0cec]
Login
Overview
Comment: Added a little time spread on client starting servers - try to avoid startup storms
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | try-nanomsg
Files: files | file ages | folders
SHA1: b3a83a0cec1ff4307b0218ec05dd90c0b4adba2b
User & Date: matt on 2014-11-29 21:50:06
Other Links: branch diff | manifest | tags
Context
2014-11-29
22:44
Re-enable closing connections if open longer than the server timeout and no accesses Closed-Leaf check-in: cec4ee3511 user: matt tags: try-nanomsg
21:50
Added a little time spread on client starting servers - try to avoid startup storms check-in: b3a83a0cec user: matt tags: try-nanomsg
20:33
http transport with try-nanomsg changes completed first pass Validation 100% check-in: b260c4c5d9 user: matt tags: try-nanomsg
Changes

Modified client.scm from [8f8fe7296f] to [72d1a98b4a].

90
91
92
93
94
95
96



97
98
99



100
101
102
103
104
105

106
107
108
109
110
111
112
90
91
92
93
94
95
96
97
98
99
100


101
102
103
104
105
106
107
108

109
110
111
112
113
114
115
116







+
+
+

-
-
+
+
+





-
+







		      (hash-table-delete! *runremote* run-id)
		      (tasks:kill-server-run-id run-id)
		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
							   run-id 
							   (tasks:hostinfo-get-interface server-dat)
							   (tasks:hostinfo-get-port      server-dat)
							   " client:setup (server-dat = #t)")
		      (if (> remaining-tries 8)
			  (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
			  (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time
		      (server:try-running run-id)
		      (thread-sleep! 5) ;; give server a little time to start up
		      (client:setup run-id remaining-tries: (- remaining-tries 1)))))
		      (thread-sleep! 5)   ;; give server a little time to start up
		      (client:setup run-id remaining-tries: (- remaining-tries 1))
		      )))
	      (begin    ;; no server registered
		(let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id)))
		  (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
		  (if (< num-available 2)
		      (server:try-running run-id))
		  (thread-sleep! 5) ;; give server a little time to start up
		  (thread-sleep! (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
		  (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))

;; 	(let ((host-info (hash-table-ref/default *runremote* run-id #f)))
;; 	  (if host-info ;; this is a bit circular. the host-info *is* the start-res FIXME
;; 	      (let* ((iface     (http-transport:server-dat-get-iface host-info))
;; 		     (port      (http-transport:server-dat-get-port  host-info))
;; 		     (start-res (case *transport-type* 

Modified http-transport.scm from [4218ba432d] to [012f18812a].

285
286
287
288
289
290
291
292



293
294
295
296
297
298
299
285
286
287
288
289
290
291

292
293
294
295
296
297
298
299
300
301







-
+
+
+







					   exn
					   (begin
					     (set! success #f)
					     (debug:print 0 "WARNING: failure in with-input-from-request to " fullurl ".")
					     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
					     (hash-table-delete! *runremote* run-id)
					     ;; Killing associated server to allow clean retry.
					     ;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine.
					     (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine?
					     (signal (make-composite-condition
						      (make-property-condition 'commfail 'message "failed to connect to server")))
					     #f)
					   (with-input-from-request ;; was dat
					    fullurl 
					    (list (cons 'key "thekey")
						  (cons 'cmd cmd)
						  (cons 'params sparams))
					    read-string))

Modified rmt.scm from [fbd1a17708] to [df66a53f8b].

85
86
87
88
89
90
91

92


93
94
95
96
97
98
99
85
86
87
88
89
90
91
92

93
94
95
96
97
98
99
100
101







+
-
+
+







  ;; (mutex-lock! *send-receive-mutex*)
  (let* ((run-id          (if rid rid 0))
	 (connection-info (rmt:get-connection-info run-id)))
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (case *transport-type*
			  ((http)(condition-case
			  ((http)(http-transport:client-api-send-receive run-id connection-info cmd params))
				  (http-transport:client-api-send-receive run-id connection-info cmd params)
				  ((commfail)(vector #f "communications fail"))))
			  ((nmsg)(condition-case
				  (nmsg-transport:client-api-send-receive run-id connection-info cmd params)
				  ((timeout)(vector #f "timeout talking to server"))))
			  (else  (exit))))
	       (success (if (and dat (vector? dat)) (vector-ref dat 0) #f))
	       (res     (if (and dat (vector? dat)) (vector-ref dat 1) #f)))
	  (http-transport:server-dat-update-last-access connection-info)