Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -136,11 +136,11 @@ (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) - (server-url (if *toppath* (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*) #f)) + (server-url (if *toppath* (server:read-dotserver->server-url *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -399,11 +399,11 @@ (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") ;;(BB> "http-transport: ->running") - (server:write-dotserver *toppath* (conc iface ":" port)) + (server:write-dotserver *toppath* iface port (current-process-id) 'http) (thread-start! *watchdog*) (server:complete-attempt *toppath*)) (begin ;; gotta exit nicely ;;(BB> "http-transport: ->collision") (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") @@ -456,11 +456,17 @@ (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))) ;; ;; Consider implementing some smarts here to re-insert the record or kill self is ;; the db indicates so - ;; + ;; + ;; BB - added this because servers are hanging about, alive and well + ;; but in defunctdefault state in tdb and a .server file + ;; preventing replacement servers from starting. + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") + + ;; ;; (if (tasks:server-am-i-the-server? tdb run-id) ;; (tasks:server-set-state! tdb server-id "running")) ;; (loop 0 server-state bad-sync-count (current-milliseconds))) (http-transport:server-shutdown server-id port)))))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -118,11 +118,11 @@ ;; no server contact made and this is a write, passively start a server ((and (not (remote-server-url *runremote*)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") - (let ((serverconn (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call + (let ((serverconn (server:read-dotserver->server-url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if serverconn (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed (if (not (server:start-attempted? *toppath*)) (server:kind-run *toppath*)))) (if (cdr (remote-hh-dat *runremote*)) ;; we are on the homehost, just do the call Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -137,11 +137,11 @@ (push-directory areapath) (cond (attempt-in-progress (debug:print 0 *default-log-port* "INFO: Not trying to start server because attempt is in progress: "attempt-in-progress)) (dot-server-url - (debug:print 0 *default-log-port* "INFO: Not trying to start server because one is already running : "dot-server-url)) + (debug:print 0 *default-log-port* "INFO: Not trying to start server because one is already running : "dot-server-url)) (else (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) ;; host.domain.tld match host? @@ -208,11 +208,12 @@ (with-input-from-file flagfile (lambda () (read-line)))) ((file-exists? flagfile) ;; it is stale. (server:complete-attempt areapath) #f) (else #f))))) - + + (define (server:read-dotserver areapath) (let ((dotfile (conc areapath "/.server"))) (handle-exceptions exn #f ;; if things go wrong pretend we can't see the file @@ -222,34 +223,51 @@ dotfile (lambda () (read-line))) #f)))) +(define (server:read-dotserver->server-url areapath) + (let* ((temp (server:read-dotserver areapath)) + (tokens (if temp (string-split temp ":") '()))) + (if (eq? 3 (length tokens)) + (string-join (list-ref tokens 0) ":" (list-ref tokens 1)) + #f))) + +(define (server:read-dotserver->pid areapath) + (let* ((temp (server:read-dotserver areapath)) + (tokens (if temp (string-split temp ":") '()))) + (if (eq? 3 (length tokens)) + (list-ref tokens 2) + #f))) + +(define (server:running-or-starting? areapath) ;; Note: may be unreiable on non-homehost due to NFS lag + (or (server:read-dotserver areapath) (server:start-attempted? areapath))) + ;; write a .server file in *toppath* with hostport ;; return #t on success, #f otherwise ;; -(define (server:write-dotserver areapath hostport) +(define (server:write-dotserver areapath hostportpid) (let ((lock-file (conc areapath "/.server.lock")) (server-file (conc areapath "/.server"))) (if (common:simple-file-lock lock-file) (let ((res (handle-exceptions exn #f ;; failed for some reason, for the moment simply return #f (with-output-to-file server-file (lambda () - (print hostport))) + (print hostportpid))) #t))) - (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " created") + (debug:print-info 0 *default-log-port* "server file " server-file " for " hostportpid " created") (common:simple-file-release-lock lock-file) res) #f))) (define (server:remove-dotserver-file areapath hostport) - (let ((dotserver (server:read-dotserver areapath)) + (let ((serverurl (server:read-dotserver->server-url areapath)) (server-file (conc areapath "/.server")) (lock-file (conc areapath "/.server.lock"))) - (if (and dotserver (string-match (conc ".*:" hostport "$") dotserver)) ;; port matches, good enough info to decide to remove the file + (if (and serverurl (string-match (conc ".*:" hostport "$") serverurl)) ;; port matches, good enough info to decide to remove the file (if (common:simple-file-lock lock-file) (begin (handle-exceptions exn #f @@ -258,20 +276,20 @@ (common:simple-file-release-lock lock-file)))))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) - (let* ((dotserver (server:read-dotserver areapath))) ;; tdbdat (tasks:open-db))) - (if dotserver + (let* ((serverurl (server:read-dotserver->server-url areapath))) ;; tdbdat (tasks:open-db))) + (if serverurl (let* ((res (case *transport-type* - ((http)(server:ping-server dotserver)) + ((http)(server:ping-server serverurl)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res - dotserver + serverurl (begin - (server:remove-dotserver-file areapath ".*") ;; remove stale dotserver + (server:remove-dotserver-file areapath ".*") ;; remove stale dotserver file #f))) #f))) ;; called in megatest.scm, host-port is string hostname:port ;; @@ -278,11 +296,11 @@ ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; (define (server:ping host-port-in #!key (do-exit #f)) (let ((host:port (if (not host-port-in) ;; use read-dotserver to find - (server:read-dotserver *toppath*) + (server:read-dotserver->server-url *toppath*) (if (number? host-port-in) ;; we were handed a server-id (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) ;; (print "srec: " srec " host-port-in: " host-port-in) (if srec (conc (vector-ref srec 3) ":" (vector-ref srec 4)) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -354,29 +354,14 @@ ;; strftime('%s','now')-heartbeat < 10 AND mt_version = ? "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers WHERE run_id=? AND state='running' ORDER BY start_time DESC LIMIT 1;" run-id) ;; (common:version-signature) run-id) res))) - + (define (tasks:server-running-or-starting? mdb run-id) - (let ((res #f)) - (sqlite3:for-each-row - (lambda (id) - (set! res id)) - mdb ;; NEEDS dbprep ADDED - "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND (strftime('%s','now') - start_time) < 60));" run-id) - res)) - -(define (tasks:server-running? mdb run-id) - (let ((res #f)) - (sqlite3:for-each-row - (lambda (id) - (set! res id)) - mdb ;; NEEDS dbprep ADDED - "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id) - res)) - + (server:running-or-starting? *toppath*)) + (define (tasks:need-server run-id) (equal? (configf:lookup *configdat* "server" "required") "yes")) ;; (maxqry (cdr (rmt:get-max-query-average run-id))) ;; (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))