Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -53,54 +53,58 @@ ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; -(define (client:setup run-id #!key (remaining-tries 3)) +(define (client:setup run-id #!key (remaining-tries 10)) + (debug:print 0 "INFO: client:setup remaining-tries=" remaining-tries) (if (<= remaining-tries 0) (begin (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id) (exit 1)) (let ((server-dat (and run-id (hash-table-ref/default *runremote* run-id #f)))) (if server-dat - (let ((start-res (http-transport:client-connect run-id + (let ((start-res (http-transport:client-connect run-id ;; NB// confusion over server-dat and connection result! (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat)))) (if start-res ;; sucessful login? (begin - (hash-table-set! *runremote* run-id server-dat) - server-dat) + (hash-table-set! *runremote* run-id start-res) + start-res) (begin ;; login failed (hash-table-delete! *runremote* run-id) (open-run-close tasks:server-force-clean-run-record tasks:open-db run-id (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat)) + (thread-sleep! 5) (client:setup run-id remaining-tries: (- remaining-tries 1))))) (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id))) (if server-dat (let ((start-res (http-transport:client-connect run-id (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat)))) (if start-res (begin - (hash-table-set! *runremote* run-id server-dat) - server-dat) + (hash-table-set! *runremote* run-id start-res) + start-res) (begin ;; login failed (hash-table-delete! *runremote* run-id) (open-run-close tasks:server-force-clean-run-record tasks:open-db run-id (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat)) + (thread-sleep! 2) (server:try-running run-id) - (thread-sleep! 3) ;; give server a little time to start up + (thread-sleep! 5) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1))))) (begin ;; no server registered + (thread-sleep! 2) (server:try-running run-id) - (thread-sleep! 3) ;; give server a little time to start up + (thread-sleep! 5) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1))))))))) ;; keep this as a function to ease future (define (client:start run-id server-info) (http-transport:client-connect run-id Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -400,10 +400,14 @@ ;; (define (http-transport:launch run-id) (set! *run-id* run-id) (if (args:get-arg "-daemonize") (daemon:ize)) + (if (server:check-if-running run-id) + (begin + (debug:print 0 "INFO: Server for run-id " run-id " already running") + (exit 0))) (let ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id))) (if (not server-id) (begin ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -35,22 +35,36 @@ ;;====================================================================== ;; cmd is a symbol ;; vars is a json string encoding the parameters for the call ;; -(define (rmt:send-receive cmd run-id params) - (let* ((connection-info (hash-table-ref/default *runremote* run-id #f)) +(define (rmt:send-receive cmd rid params) + (let* ((run-id (if rid rid 0)) + (connection-info (let ((cinfo (hash-table-ref/default *runremote* run-id #f))) + (if cinfo + cinfo + (let loop ((numtries 100)) + (thread-sleep! 1) + (let ((res (client:setup run-id))) + (if res + res + (if (> numtries 0) + (loop (- numtries 1)) + (begin + (debug:print 0 "ERROR: 100 tries and no server, giving up") + (exit 1))))))))) (jparams (db:obj->string params)) (res (http-transport:client-api-send-receive run-id connection-info cmd jparams))) (if res (db:string->obj res) ;; (rmt:json-str->dat res) (let ((new-connection-info (client:setup run-id))) (debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.") - (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params))))) + (rmt:send-receive cmd run-id params))))) (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) - (let* ((jparams (db:obj->string params)) ;; (rmt:dat->json-str params)) + (let* ((run-id (if run-id run-id 0)) + (jparams (db:obj->string params)) ;; (rmt:dat->json-str params)) (res (http-transport:client-api-send-receive run-id connection-info cmd jparams numretries: 3))) (if res (db:string->obj res) ;; (rmt:json-str->dat res) ;; this one does NOT keep trying res))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -103,13 +103,15 @@ ;; with spiffy or rpc this simply returns the return data to be returned ;; (define (server:reply return-addr query-sig success/fail result) (db:obj->string (vector success/fail query-sig result))) +;; > file 2>&1 (define (server:try-running run-id) - (let ((cmdln (conc (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest") - " -server - -run-id " run-id " &> " *toppath* "/db/" run-id ".log &"))) + (let* ((rand-name (random 100)) + (cmdln (conc (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest") + " -server - -run-id " run-id " name=" rand-name " > " *toppath* "/db/" run-id "-" rand-name ".log 2>&1 &"))) (debug:print 0 "INFO: Starting server (" cmdln ") as none running ...") (push-directory *toppath*) (system cmdln) (pop-directory))) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -92,17 +92,16 @@ (define (tasks:hostinfo-get-pid vec) (vector-ref vec 5)) (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) (define (tasks:server-lock-slot mdb run-id) (tasks:server-clean-out-old-records-for-run-id mdb run-id) - (server:check-if-running run-id) (if (< (tasks:num-in-available-state mdb run-id) 4) (begin (tasks:server-set-available mdb run-id) (thread-sleep! 2) ;; Try removing this. It may not be needed. (tasks:server-am-i-the-server? mdb run-id)) - #f)) + #f)) ;; register that this server may come online (first to register goes though with the process) (define (tasks:server-set-available mdb run-id) (sqlite3:execute mdb Index: tests/Makefile ================================================================== --- tests/Makefile +++ tests/Makefile @@ -22,11 +22,11 @@ all : test1 test2 test3 test4 test5 test6 test7 test8 test9 server : cd ..;make;make install - cd fullrun;../../bin/megatest -server - -debug 22 -run-id $(RUNID) + cd fullrun;../../bin/megatest -server - -debug $(DEBUG) -run-id $(RUNID) stopserver : cd ..;make && make install cd fullrun;$(MEGATEST) -stop-server 0