Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -136,11 +136,11 @@ (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) - (server-url (if *toppath* (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*) #f)) + (server-url (if *toppath* (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -399,11 +399,11 @@ (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") ;;(BB> "http-transport: ->running") - (server:write-dotserver *toppath* (conc iface ":" port)) + (server:write-dotserver *toppath* iface port (current-process-id) 'http) (thread-start! *watchdog*) (server:complete-attempt *toppath*)) (begin ;; gotta exit nicely ;;(BB> "http-transport: ->collision") (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") @@ -428,11 +428,12 @@ (if (or (not (equal? sdat (list iface port))) (not server-id)) (begin (debug:print-info 0 *default-log-port* "interface changed, refreshing iface and port info") (set! iface (car sdat)) - (set! port (cadr sdat)))) + (set! port (cadr sdat)) + (server:write-dotserver *toppath* iface port (current-process-id) 'http))) ;; Transfer *db-last-access* to last-access to use in checking that we are still alive (mutex-lock! *heartbeat-mutex*) (set! last-access *db-last-access*) (mutex-unlock! *heartbeat-mutex*) @@ -447,25 +448,30 @@ (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) - (if (and *server-run* + (cond + ((not (server:confirm-dotserver *toppath* iface port (current-process-id) 'http)) + (debug:print-info 0 *default-log-port* "Server .server file does not exist or contents do not match. Initiate server shutdown.") + (http-transport:server-shutdown server-id port)) + ((and *server-run* (> (+ last-access server-timeout) (current-seconds))) - (begin - (if (common:low-noise-print 120 "server continuing") - (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))) - ;; - ;; Consider implementing some smarts here to re-insert the record or kill self is - ;; the db indicates so - ;; - ;; (if (tasks:server-am-i-the-server? tdb run-id) - ;; (tasks:server-set-state! tdb server-id "running")) - ;; - (loop 0 server-state bad-sync-count (current-milliseconds))) - (http-transport:server-shutdown server-id port)))))) + (if (common:low-noise-print 120 "server continuing") + (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))) + ;; + ;; Consider implementing some smarts here to re-insert the record or kill self is + ;; the db indicates so + ;; + ;; (if (tasks:server-am-i-the-server? tdb run-id) + ;; (tasks:server-set-state! tdb server-id "running")) + ;; + (loop 0 server-state bad-sync-count (current-milliseconds))) + (else + (debug:print-info 0 *default-log-port* "Server timeed out. seconds since last db access: " (- (current-seconds) last-access)) + (http-transport:server-shutdown server-id port))))))) ;; code cut out from above ;; ;; (condition-case ;; ;; (if (and (member (mutex-state *db-sync-mutex*) '(abandoned not-abandoned)) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -31,12 +31,13 @@ ;;====================================================================== ;; if a server is either running or in the process of starting call client:setup ;; else return #f to let the calling proc know that there is no server available ;; -(define (rmt:get-connection-info run-id) - (let ((cinfo (remote-conndat *runremote*))) +(define (rmt:get-connection-info areapath) ;; TODO: push areapath down. + (let ((cinfo (remote-conndat *runremote*)) + (run-id 0)) (if cinfo cinfo (if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id) (client:setup run-id) #f)))) @@ -114,43 +115,57 @@ ;; (mutex-unlock! *rmt-mutex*) ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") ;; (rmt:open-qry-close-locally cmd 0 params)) - ;; no server contact made and this is a write, passively start a server - ((and (not (remote-server-url *runremote*)) + ;; on homehost, no server contact made and this is a write, passively start a server + ((and (cdr (remote-hh-dat *runremote*)) ; new + (not (remote-server-url *runremote*)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") - (let ((serverconn (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call - (if serverconn - (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed + (let ((server-url (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call + (if server-url + (remote-server-url-set! *runremote* server-url) ;; the string can be consumed by the client setup if needed (if (not (server:start-attempted? *toppath*)) (server:kind-run *toppath*)))) - (if (cdr (remote-hh-dat *runremote*)) ;; we are on the homehost, just do the call - (begin - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") - (rmt:open-qry-close-locally cmd 0 params)) - (begin ;; not on homehost, start server and wait - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.2") - (tasks:start-and-wait-for-server (tasks:open-db) 0 15) - (rmt:send-receive cmd rid params attemptnum: attemptnum)))) + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") + (rmt:open-qry-close-locally cmd 0 params)) + + + + ;;; + ;; (begin ;; not on homehost, start server and wait + ;; (mutex-unlock! *rmt-mutex*) + ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.2") + ;; (tasks:start-and-wait-for-server (tasks:open-db) 0 15) + ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;) ;) +;;;; + ;; if not on homehost ensure we have a connection to a live server ;; NOTE: we *have* a homehost record by now - ((and (not (cdr (remote-hh-dat *runremote*))) ;; are we on a homehost? + ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost + (not (remote-conndat *runremote*)) ;; and no connection + (server:read-dotserver *toppath*)) ;; .server file exists + ;; something caused the server entry in tdb to disappear, but the server is still running + (server:remove-dotserver-file *toppath* ".*") + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 20") + (rmt:send-receive cmd rid params attemptnum: (add1 attemptnum))) + ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost (not (remote-conndat *runremote*))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) (tasks:start-and-wait-for-server (tasks:open-db) 0 15) - (remote-conndat-set! *runremote* (rmt:get-connection-info 0)) ;; calls client:setup which calls client:setup-http - (rmt:send-receive cmd rid params attemptnum: attemptnum)) + (remote-conndat-set! *runremote* (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http + (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as ;; all set up if get this far, dispatch the query ((cdr (remote-hh-dat *runremote*)) ;; we are on homehost (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 7") (rmt:open-qry-close-locally cmd (if rid rid 0) params)) + ;; not on homehost, do server query (else (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") (let* ((conninfo (remote-conndat *runremote*)) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -8,11 +8,11 @@ ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. (require-extension (srfi 18) extras tcp s11n) -(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils) +(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils posix-extras) ;; (use zmq) (use spiffy uri-common intarweb http-client spiffy-request-vars) (declare (unit server)) @@ -49,16 +49,16 @@ ;; start_server ;; (define (server:launch run-id transport-type) ;;(BB> "server:launch fired for run-id="run-id" transport-type="transport-type) - (let ((attempt-in-progress (server:start-attempted? *toppath*))) + (let ((attempt-in-progress (server:start-attempted? *toppath*))) ; check for .server-starting (when attempt-in-progress (debug:print-info 0 *default-log-port* "Server start attempt in progress in other process (=> "attempt-in-progress"<=). Aborting server launch attempt in this process ("(current-process-id)")") (exit))) - (let ((dotserver-url (server:check-if-running *toppath*))) + (let ((dotserver-url (server:check-if-running *toppath*))) ;; check for .server (when dotserver-url (debug:print-info 0 *default-log-port* "Server already running (=> "dotserver-url"<=). Aborting server launch attempt in this process ("(current-process-id)")") (exit) )) @@ -202,68 +202,129 @@ (with-input-from-file flagfile (lambda () (read-line)))) ((file-exists? flagfile) ;; it is stale. (server:complete-attempt areapath) #f) (else #f))))) - + (define (server:read-dotserver areapath) (let ((dotfile (conc areapath "/.server"))) (handle-exceptions exn #f ;; if things go wrong pretend we can't see the file - (if (and (file-exists? dotfile) - (file-read-access? dotfile)) - (with-input-from-file - dotfile - (lambda () - (read-line))) - #f)))) + (cond + ((not (file-exists? dotfile)) + #f) + ((not (file-read-access? dotfile)) + #f) + ((> (server:dotserver-age-seconds areapath) (+ 5 (server:get-timeout))) + (server:remove-dotserver-file areapath ".*") + #f) + (else + (let* ((line + (with-input-from-file + dotfile + (lambda () + (read-line)))) + (tokens (if (string? line) (string-split line ":") #f))) + (cond + ((eq? 4 (length tokens)) + tokens) + (else #f)))))))) + +(define (server:read-dotserver->url areapath) + (let ((dotserver-tokens (server:read-dotserver areapath))) + (if dotserver-tokens + (conc (list-ref dotserver-tokens 0) ":" (list-ref dotserver-tokens 1)) + #f))) ;; write a .server file in *toppath* with hostport ;; return #t on success, #f otherwise ;; -(define (server:write-dotserver areapath hostport) +(define (server:write-dotserver areapath host port pid transport) (let ((lock-file (conc areapath "/.server.lock")) (server-file (conc areapath "/.server"))) (if (common:simple-file-lock lock-file) (let ((res (handle-exceptions exn #f ;; failed for some reason, for the moment simply return #f (with-output-to-file server-file (lambda () - (print hostport))) + (print (conc host ":" port ":" pid ":" transport)))) #t))) - (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " created") + (debug:print-info 0 *default-log-port* "server file " server-file " for " host ":" port " created pid="pid) (common:simple-file-release-lock lock-file) res) #f))) + +;; this will check that the .server file present matches the server calling this procedure. +;; if parameters match (this-pid and transport) the file will be touched and #t returned +;; otherwise #f will be returned. +(define (server:confirm-dotserver areapath this-iface this-port this-pid this-transport) + (let* ((tokens (server:read-dotserver areapath))) + (cond + ((not tokens) + (debug:print-info 0 *default-log-port* "INFO: .server file does not exist.") + #f) + ((not (eq? 4 (length tokens))) + (debug:print-info 0 *default-log-port* "INFO: .server file is corrupt. There are not 4 tokens as expeted; there are "(length tokens)".") + #f) + ((not (equal? this-iface (list-ref tokens 0))) + (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for iface, server has value >"(list-ref tokens 0)"< but this server's value is >"this-iface"<") + #f) + ((not (equal? (->string this-port) (list-ref tokens 1))) + (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for port, .server has value >"(list-ref tokens 1)"< but this server's value is >"(->string this-port)"<") + #f) + ((not (equal? (->string this-pid) (list-ref tokens 2))) + (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for pid, .server has value >"(list-ref tokens 2)"< but this server's value is >"(->string this-pid)"<") + #f) + ((not (equal? (->string this-transport) (->string (list-ref tokens 3)))) + (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for transport, .server has value >"(list-ref tokens 3)"< but this server's value is >"this-transport"<") + #f) + (else (server:touch-dotserver areapath) + #t)))) + +(define (server:touch-dotserver areapath) + (let ((server-file (conc areapath "/.server"))) + (change-file-times server-file (current-seconds) (current-seconds)))) + +(define (server:dotserver-age-seconds areapath) + (let ((server-file (conc areapath "/.server"))) + (begin + (handle-exceptions + exn + #f + (- (current-seconds) + (file-modification-time server-file)))))) + (define (server:remove-dotserver-file areapath hostport) - (let ((dotserver (server:read-dotserver areapath)) + (let ((dotserver-url (server:read-dotserver->url areapath)) (server-file (conc areapath "/.server")) (lock-file (conc areapath "/.server.lock"))) - (if (and dotserver (string-match (conc ".*:" hostport "$") dotserver)) ;; port matches, good enough info to decide to remove the file + (if (and dotserver-url (string-match (conc ".*:" hostport "$") dotserver-url)) ;; port matches, good enough info to decide to remove the file (if (common:simple-file-lock lock-file) (begin (handle-exceptions exn #f (delete-file* server-file)) (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " removed") - (common:simple-file-release-lock lock-file)))))) + (common:simple-file-release-lock lock-file)) + (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - could not get lock.")) + (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - dotserver-url("dotserver-url") did not match hostport pattern ("hostport")")))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) - (let* ((dotserver (server:read-dotserver areapath))) ;; tdbdat (tasks:open-db))) - (if dotserver + (let* ((dotserver-url (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db))) + (if dotserver-url (let* ((res (case *transport-type* - ((http)(server:ping-server dotserver)) + ((http)(server:ping-server dotserver-url)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res - dotserver + dotserver-url (begin (server:remove-dotserver-file areapath ".*") ;; remove stale dotserver #f))) #f))) @@ -272,11 +333,11 @@ ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; (define (server:ping host-port-in #!key (do-exit #f)) (let ((host:port (if (not host-port-in) ;; use read-dotserver to find - (server:read-dotserver *toppath*) + (server:read-dotserver->url *toppath*) (if (number? host-port-in) ;; we were handed a server-id (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) ;; (print "srec: " srec " host-port-in: " host-port-in) (if srec (conc (vector-ref srec 3) ":" (vector-ref srec 4))