Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -53,50 +53,60 @@ ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; -(define (client:setup run-id #!key (remaining-tries 10)) +(define (client:setup run-id #!key (remaining-tries 20) (failed-connects 0)) (debug:print 0 "INFO: client:setup remaining-tries=" remaining-tries) (if (<= remaining-tries 0) (begin (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id) (exit 1)) (let ((server-dat (and run-id (hash-table-ref/default *runremote* run-id #f)))) (if server-dat (let ((start-res (http-transport:client-connect run-id ;; NB// confusion over server-dat and connection result! - (tasks:hostinfo-get-interface server-dat) - (tasks:hostinfo-get-port server-dat)))) + (car server-dat) + (cadr server-dat)))) (if start-res ;; sucessful login? start-res - (begin ;; login failed - (hash-table-delete! *runremote* run-id) - (open-run-close tasks:server-force-clean-run-record - tasks:open-db - run-id - (tasks:hostinfo-get-interface server-dat) - (tasks:hostinfo-get-port server-dat)) - (thread-sleep! 5) - (client:setup run-id remaining-tries: (- remaining-tries 1))))) + (if (and (< remaining-tries 10) + (odd? remaining-tries)) + (begin ;; login failed + (hash-table-delete! *runremote* run-id) + (open-run-close tasks:server-force-clean-run-record + tasks:open-db + run-id + (car server-dat) + (cadr server-dat)) + (thread-sleep! 5) + (client:setup run-id remaining-tries: (- remaining-tries 1))) + (begin + (thread-sleep! 5) + (client:setup run-id remaining-tries: (- remaining-tries 1)))))) (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id))) (if server-dat (let ((start-res (http-transport:client-connect run-id (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat)))) (if start-res start-res - (begin ;; login failed - (hash-table-delete! *runremote* run-id) - (open-run-close tasks:server-force-clean-run-record - tasks:open-db - run-id - (tasks:hostinfo-get-interface server-dat) - (tasks:hostinfo-get-port server-dat)) - (thread-sleep! 2) - (server:try-running run-id) - (thread-sleep! 5) ;; give server a little time to start up - (client:setup run-id remaining-tries: (- remaining-tries 1))))) + (if (and (< remaining-tries 10) + (odd? remaining-tries)) + (begin ;; login failed + (hash-table-delete! *runremote* run-id) + (open-run-close tasks:server-force-clean-run-record + tasks:open-db + run-id + (tasks:hostinfo-get-interface server-dat) + (tasks:hostinfo-get-port server-dat)) + (thread-sleep! 2) + (server:try-running run-id) + (thread-sleep! 5) ;; give server a little time to start up + (client:setup run-id remaining-tries: (- remaining-tries 1))) + (begin + (thread-sleep! 5) + (client:setup run-id remaining-tries: (- remaining-tries 1)))))) (begin ;; no server registered (thread-sleep! 2) (server:try-running run-id) (thread-sleep! 5) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1))))))))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -107,11 +107,12 @@ ;; > file 2>&1 (define (server:try-running run-id) (let* ((rand-name (random 100)) (cmdln (conc (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest") - " -server - -run-id " run-id " name=" rand-name " > " *toppath* "/db/" run-id "-" rand-name ".log 2>&1 &"))) + ;; " -server - -run-id " run-id " name=" rand-name " > " *toppath* "/db/" run-id "-" rand-name ".log 2>&1 &"))) + " -server - -run-id " run-id " > " *toppath* "/db/" run-id ".log 2>&1 &"))) (debug:print 0 "INFO: Starting server (" cmdln ") as none running ...") (push-directory *toppath*) (system cmdln) (pop-directory)))