Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -108,11 +108,11 @@ (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) (thread-sleep! 1) (client:setup-http areapath remaining-tries: (- remaining-tries 1)) ))) (begin ;; no server registered - (server:kind-run areapath) + ;; (server:kind-run areapath) + (server:start-and-wait areapath) (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. - (server:start-and-wait areapath) (client:setup-http areapath remaining-tries: (- remaining-tries 1))))))))) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -150,11 +150,12 @@ (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) - (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds + (server-timeout (or (server:get-timeout) 100)) + (force-server #f)) ;; default to 100 seconds ;; launching and hosts (defstruct host (reachable #f) (last-update 0) @@ -1013,10 +1014,22 @@ ;; (define (common:use-cache?) (not (or (args:get-arg "-no-cache") (and *configdat* (equal? (configf:lookup *configdat* "setup" "use-cache") "no"))))) + +;; force use of server? +;; +(define (common:force-server?) + (let ((force-setting (configf:lookup "server" "force")) + (force-type (if force-setting (string->symbol force-setting) #f))) + (case force-type + ((#f) #f) + ((always) #t) + ((test) (if (args:get-arg "-execute") ;; we are in a test + #t + #f))))) ;;====================================================================== ;; M I S C L I S T S ;;====================================================================== Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -107,11 +107,12 @@ (remote-hh-dat-set! runremote (common:get-homehost)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a read - ((and (cdr (remote-hh-dat runremote)) ;; on homehost + ((and (not (remote-force-server runremote)) ;; honor forced use of server + (cdr (remote-hh-dat runremote)) ;; on homehost (member cmd api:read-only-queries)) ;; this is a read (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") (rmt:open-qry-close-locally cmd 0 params)) @@ -124,19 +125,21 @@ (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a write, we already have a server - ((and (cdr (remote-hh-dat runremote)) ;; on homehost + ((and (not (remote-force-server runremote)) ;; honor forced use of server + (cdr (remote-hh-dat runremote)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write - (remote-server-url runremote)) ;; have a server + (remote-server-url runremote)) ;; have a server (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") (rmt:open-qry-close-locally cmd 0 params)) ;; on homehost, no server contact made and this is a write, passively start a server - ((and (cdr (remote-hh-dat runremote)) ; new + ((and (not (remote-force-server runremote)) ;; honor forced use of server + (cdr (remote-hh-dat runremote)) ;; new (not (remote-server-url runremote)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if server-url @@ -149,14 +152,16 @@ ((and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost (not (remote-conndat runremote))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) (mutex-unlock! *rmt-mutex*) (server:start-and-wait *toppath*) + (if (common:force-server?)(remote-force-server-set! runremote #t)) (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as ;; all set up if get this far, dispatch the query - ((cdr (remote-hh-dat runremote)) ;; we are on homehost + ((and (not (remote-force-server runremote)) + (cdr (remote-hh-dat runremote))) ;; we are on homehost (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 7") (rmt:open-qry-close-locally cmd (if rid rid 0) params)) ;; not on homehost, do server query Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -257,22 +257,28 @@ *my-client-signature*))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched (define (server:kind-run areapath) - (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun - (call-num (car last-run-dat)) - (when-run (cadr last-run-dat)) - (run-delay (+ (case call-num - ((0) 0) - ((1) 20) - ((2) 300) - (else 600)) - (random 5)))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously - (if (> (- (current-seconds) when-run) run-delay) - (server:run areapath)) - (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))) + (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? + (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun + (call-num (car last-run-dat)) + (when-run (cadr last-run-dat)) + (run-delay (+ (case call-num + ((0) 0) + ((1) 20) + ((2) 300) + (else 600)) + (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously + (lock-file (conc areapath "/logs/server-start.lock"))) + (if (> (- (current-seconds) when-run) run-delay) + (begin + (common:simple-file-lock lock-file expire-time: 15) + (server:run areapath) + (thread-sleep! 5) ;; don't release the lock for at least a few seconds + (common:simple-file-release-lock lock-file))) + (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) (let loop ((server-url (server:check-if-running areapath))) (if (or server-url