Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -52,11 +52,13 @@ (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported (5)") (exit)))) (define (client:setup run-id #!key (remaining-tries 10) (failed-connects 0)) (case (server:get-transport) - ((rpc) (rpc-transport:client-setup run-id remaining-tries: remaining-tries failed-connects: failed-connects)) ;;(client:setup-rpc run-id)) + ((rpc) (let ((res (rpc-transport:client-setup run-id remaining-tries: remaining-tries failed-connects: failed-connects))) + (remote-conndat-set! *runremote* runremote-server-dat) + res)) ((http)(client:setup-http run-id remaining-tries: remaining-tries failed-connects: failed-connects)) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported (6)") (exit)))) ;; (client:setup-rpc run-id)))) @@ -156,10 +158,30 @@ ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; + +(define (client:setup-rpc run-id #!key (remaining-tries 10) (failed-connects 0)) + (debug:print-info 2 *default-log-port* "client:setup-rpc remaining-tries=" remaining-tries) + (let* ((server-dat (tasks:get-server (db:delay-if-busy (tasks:open-db)) run-id)) + (num-available (tasks:num-in-available-state (db:delay-if-busy (tasks:open-db)) run-id))) + (cond + ((<= remaining-tries 0) + (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id) + (exit 1)) + (server-dat + (debug:print-info 4 *default-log-port* "client:setup-rpc server-dat=" server-dat ", remaining-tries=" remaining-tries) + + (rpc-transport:client-setup run-id server-dat remaining-tries: remaining-tries)) + (else + (if (< num-available 2) + (server:try-running run-id)) + (thread-sleep! (+ 2 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. + (client:setup run-id remaining-tries: (- remaining-tries 1)))))) + + (define (client:setup-http run-id #!key (remaining-tries 10) (failed-connects 0)) (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) (let* ((tdbdat (tasks:open-db))) (if (<= remaining-tries 0) (begin Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -335,11 +335,11 @@ ;; (define (http-transport:client-connect iface port) (let* ((api-url (conc "http://" iface ":" port "/api")) (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) (api-req (make-request method: 'POST uri: api-uri)) - (server-dat (vector iface port api-uri api-url api-req (current-seconds)))) + (server-dat (vector iface port api-uri api-url api-req (current-seconds) 'http))) server-dat)) ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; Index: rpc-transport.scm ================================================================== --- rpc-transport.scm +++ rpc-transport.scm @@ -52,19 +52,10 @@ (mutex-unlock! *heartbeat-mutex*) res)) - ;; (handle-exceptions - ;; exn - ;; (begin - ;; (debug:print 0 *default-log-port* "Remote failed for " proc " " params " exn="exn) - ;; (apply (eval (string->symbol procstr)) params)) - ;; ;; (if *runremote* - ;; ;; (apply (eval (string->symbol (conc "remote:" procstr))) params) - ;; (apply (eval (string->symbol procstr)) params))) - ;; retry an operation (depends on srfi-18) ;; ================== ;; idea here is to avoid spending time on coding retrying something. Trying to be generic here. ;; ;; Exception handling: @@ -619,14 +610,14 @@ (BB> "Self test fail. login-res="login-res" testing-res="testing-res" *toppath*="*toppath*) #f)) res)) -(define (rpc-transport:client-setup run-id server-dat #!key (remtries 10)) - ;;(BB> "entered rpc-transport:client-setup with run-id="run-id" and server-dat="server-dat" and retries="remtries) +(define (rpc-transport:client-setup run-id server-dat #!key (remaining-tries 10)) + ;;(BB> "entered rpc-transport:client-setup with run-id="run-id" and server-dat="server-dat" and retries="remaining-tries) (tcp-buffer-size 0) - (debug:print-info 0 *default-log-port* "rpc-transport:client-setup run-id="run-id" server-dat=" server-dat ", remaining-tries=" remtries) + (debug:print-info 0 *default-log-port* "rpc-transport:client-setup run-id="run-id" server-dat=" server-dat ", remaining-tries=" remaining-tries) (let* ((iface (tasks:hostinfo-get-interface server-dat)) (hostname (tasks:hostinfo-get-hostname server-dat)) (port (tasks:hostinfo-get-port server-dat)) (runremote-server-dat (vector iface port #f #f #f (current-seconds) 'rpc)) ;; http version := (vector iface port api-uri api-url api-req (current-seconds) 'http ) (ping-res (retry-thunk (lambda () ;; make 3 attempts to ping. @@ -635,18 +626,17 @@ retries: 3))) ;; we got here from rmt:get-connection-info on the condition that *runremote* has no entry for run-id... (if ping-res (begin (debug:print-info 0 *default-log-port* "rpc-transport:client-setup CONNECTION ESTABLISHED run-id="run-id" server-dat=" server-dat) - (rmt:set-cinfo run-id runremote-server-dat) ;; (hash-table-set! *runremote* run-id runremote-server-dat) ;; side-effect - *runremote* cache init fpr rmt:* runremote-server-dat) (begin ;; login failed but have a server record, clean out the record and try again (debug:print-info 0 *default-log-port* "rpc-transport:client-setup UNABLE TO CONNECT run-id="run-id" server-dat=" server-dat) (tasks:kill-server-run-id run-id) (tasks:server-force-clean-run-record (db:delay-if-busy (tasks:open-db)) run-id iface port " rpc-transport:client-setup (server-dat = #t)") - (if (> remtries 2) + (if (> remaining-tries 2) (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time (server:try-running run-id) (thread-sleep! 5) ;; give server a little time to start up - (client:setup run-id remaining-tries: (sub1 remtries)))))) + (client:setup run-id remaining-tries: (sub1 remaining-tries))))))