Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -92,19 +92,23 @@ (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat) " client:setup (server-dat = #t)") + (if (> remaining-tries 8) + (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little + (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time (server:try-running run-id) - (thread-sleep! 5) ;; give server a little time to start up - (client:setup run-id remaining-tries: (- remaining-tries 1))))) + (thread-sleep! 5) ;; give server a little time to start up + (client:setup run-id remaining-tries: (- remaining-tries 1)) + ))) (begin ;; no server registered (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id))) (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) (if (< num-available 2) (server:try-running run-id)) - (thread-sleep! 5) ;; give server a little time to start up + (thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. (client:setup run-id remaining-tries: (- remaining-tries 1))))))))) ;; (let ((host-info (hash-table-ref/default *runremote* run-id #f))) ;; (if host-info ;; this is a bit circular. the host-info *is* the start-res FIXME ;; (let* ((iface (http-transport:server-dat-get-iface host-info)) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -287,11 +287,13 @@ (set! success #f) (debug:print 0 "WARNING: failure in with-input-from-request to " fullurl ".") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (hash-table-delete! *runremote* run-id) ;; Killing associated server to allow clean retry.") - ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine. + (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? + (signal (make-composite-condition + (make-property-condition 'commfail 'message "failed to connect to server"))) #f) (with-input-from-request ;; was dat fullurl (list (cons 'key "thekey") (cons 'cmd cmd) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -87,11 +87,13 @@ (connection-info (rmt:get-connection-info run-id))) ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also) (if connection-info ;; use the server if have connection info (let* ((dat (case *transport-type* - ((http)(http-transport:client-api-send-receive run-id connection-info cmd params)) + ((http)(condition-case + (http-transport:client-api-send-receive run-id connection-info cmd params) + ((commfail)(vector #f "communications fail")))) ((nmsg)(condition-case (nmsg-transport:client-api-send-receive run-id connection-info cmd params) ((timeout)(vector #f "timeout talking to server")))) (else (exit)))) (success (if (and dat (vector? dat)) (vector-ref dat 0) #f))