@@ -202,10 +202,11 @@ )) (thread-sleep! 1) (debug:print-info 0 *default-log-port* "Connect to server for " dbfname) (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) +;; returns ( result . ping_time ) (define (tt:timed-ping host port server-id) (let* ((start-time (current-milliseconds)) (result (tt:ping host port server-id))) (cons result (- (current-milliseconds) start-time)))) @@ -472,181 +473,153 @@ ;; start the listener and start responding to requests ;; ;; NOTE: organise by dbfname, not run-id so we don't need ;; to pull in more modules ;; -;; This is the routine called in megatest.scm to start a server +;; This is the routine called in megatest.scm to start a server. NOTE: sequence is different for main.db vs. X.db ;; ;; Server viability is checked in keep-running. Blindly start and run here. ;; (define (tt:start-server areapath run-id dbfname-in handler keys) (assert areapath "FATAL: areapath not provided for tt:start-server") - ;; is there already a server for this dbfile? Then exit. - (debug:print 2 *default-log-port* "tt:start-server: " dbfname-in) (let* ((ttdat (make-tt areapath: areapath)) - (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) - (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead - (debug:print 0 *default-log-port* "Found " (length servers) " already running for " dbfname) - (if (> (length servers) 0) - (begin - (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.") - (exit)) - (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys))) - (tt-handler-set! ttdat (handler dbstruct)) - (let* ((tcp-thread (make-thread - (lambda () - (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data - "tcp-server-thread")) - (run-thread (make-thread - (lambda () - (tt:keep-running ttdat dbfname dbstruct))))) - (thread-start! tcp-thread) - (thread-start! run-thread) - - (let* ((areapath (tt-areapath ttdat)) - (nosyncdbpath (conc areapath"/.mtdb"))) - ;; this didn't seem to work, is port not available yet? - (let loop ((count 0)) - (if (tt-port ttdat) - (begin - (procinf-port-set! *procinf* (tt-port ttdat)) - (procinf-dbname-set! *procinf* dbfname) - (dbfile:with-no-sync-db - nosyncdbpath - (lambda (nsdb) - (dbfile:insert-or-update-process nsdb *procinf*)))) - (if (< count 10) - (begin - (thread-sleep! 0.25) - (loop (+ count 1))) - (debug:print 0 *default-log-port* "ERROR: (tt-port ttdat) no port set!")))) - - (thread-join! run-thread) ;; run thread will exit on timeout or other conditions - ;; replace with call to (dbfile:set-process-done nsdb host pid reason) - (procinf-status-set! *procinf* "done") - (procinf-end-set! *procinf* (current-seconds)) - ;; either convert this to use set-process-done or get rid of set-process-done - (dbfile:with-no-sync-db - nosyncdbpath - (lambda (nsdb) - (dbfile:insert-or-update-process nsdb *procinf*))) - (debug:print 0 *default-log-port* "Exiting now.") - (exit))))))) + (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))) + (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys))) + (tt-handler-set! ttdat (handler dbstruct)) + (let* ((servinf-created #f) + (tcp-thread (make-thread + (lambda () + ;; NOTE: tt-port and tt-host are set in connect-listener which is called under tt:start-tcp-server + (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data + "tcp-server-thread")) + (run-thread (make-thread + (lambda () + (tt:keep-running ttdat dbfname dbstruct))))) + (thread-start! tcp-thread) + + (let* ((areapath (tt-areapath ttdat)) + (nosyncdbpath (conc areapath"/.mtdb")) + (servers ;; (tt:find-server areapath dbfname))) + (tt:get-server-info-sorted ttdat dbfname)) ;; (host port startseconds server-id servinfofile) + (good-srvrs + ;; contact servers via ping, if no response remove the .servinfo file + (let loop ((servrs servers) + (prime-host #f) + (result '())) + (if (null? servrs) + (reverse result) + (let* ((servdat (car servrs))) + (match servdat + ((host port startseconds server-id servinfofile) + (let* ((ping-res (tt:timed-ping host port server-id)) + (good-ping (match ping-res + ((result . ping-time) + (not result)) ;; we couldn't reach the server or it was not a megatest server + (else #f))) ;; the ping failed completely? + (same-host (or (not prime-host) ;; i.e. this is the first host + (equal? prime-host host))) + (keep-srv (and good-ping same-host))) + (if keep-srv + (loop (cdr servrs) + host + (cons servdat result)) + (begin + (handle-exceptions + exn + (debug:print-info 0 *default-log-port* "Error removing server info file: "servinfofile", " + (condition->list exn)) + (delete-file* servinfofile)) + (loop (cdr servrs) prime-host result))))) + (else + ;; can't delete it as we don't have a filename. NOTE: Should really never get here. + (debug:print-info 0 *default-log-port* "ERROR: bad servinfo record \""servdat"\"") + (loop (cdr servrs) prime-host result)) ;; drop + ))))) + (home-host (if (null? good-srvrs) + #f + (caar good-srvrs)))) + ;; by here we have a trustworthy list of servers and we have removed the .servinfo file for any unresponsive servers + ;; and the list is in good-srvrs + (cond + ((not home-host) ;; no servers yet, go ahead and start + (debug:print-info 0 *default-log-port* "No servers yet, starting on "(get-host-name))) + ((> (length good-srvrs) 2) ;; don't need more, just exit + (debug:print-info 0 *default-log-port* "Have "(length good-srvrs)", no need for more, exiting.") + (exit)) + ((not (equal? home-host (get-host-name))) ;; there is a home-host and we are not on it + (debug:print-info 0 *default-log-port* "Prime main server is on host "home-host", but we are on host "(get-host-name)", exiting.") + (exit)) + (else + (debug:print-info 0 *default-log-port* "Starting on host "(get-host-name)", along with "(length good-srvrs)" other servers."))) + + ;; this didn't seem to work, is port not available yet? + (let loop ((count 0)) + (if (tt-port ttdat) + (begin + (procinf-port-set! *procinf* (tt-port ttdat)) + (procinf-dbname-set! *procinf* dbfname) + (dbfile:with-no-sync-db + nosyncdbpath + (lambda (nsdb) + (dbfile:insert-or-update-process nsdb *procinf*)))) + (if (< count 10) + (begin + (thread-sleep! 0.25) + (loop (+ count 1))) + (begin + (debug:print 0 *default-log-port* "ERROR: (tt-port ttdat) no port set! Exiting.") + (exit))))) + + ;; create a servinfo file start keep-running + (tt:create-server-registration-file ttdat dbfname) + (procinf-status-set! *procinf* "running") + (dbfile:with-no-sync-db + nosyncdbpath + (lambda (nsdb) + (dbfile:insert-or-update-process nsdb *procinf*))) + (thread-start! run-thread) + + (thread-join! run-thread) ;; run thread will exit on timeout or other conditions + + ;; replace with call to (dbfile:set-process-done nsdb host pid reason) + (procinf-status-set! *procinf* "done") + (procinf-end-set! *procinf* (current-seconds)) + ;; either convert this to use set-process-done or get rid of set-process-done + (dbfile:with-no-sync-db + nosyncdbpath + (lambda (nsdb) + (dbfile:insert-or-update-process nsdb *procinf*))) + (debug:print 0 *default-log-port* "Exiting now.") + (exit)))))) (define (tt:keep-running ttdat dbfname dbstruct) - ;; verfiy conn for ready - ;; listener socket has been started by this stage - ;; wait for a port before creating the registration file - ;; - (let* ((db-locked-in #f) - (areapath (tt-areapath ttdat)) - (nosyncdbpath (conc areapath"/.mtdb"))) - (set! *server-info* ttdat) - (let loop ((count 0)) - (if (> count 240) - (begin - (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.") - (exit 1)) - (if (not (tt-port ttdat)) ;; no connection yet - (begin - (thread-sleep! 0.25) - (loop (+ count 1)))))) - - (tt:create-server-registration-file ttdat dbfname) - ;; now start watching the last-access, if it hasn't been touched - ;; in over ten seconds we exit - (thread-sleep! 0.05) ;; any real need for delay here? - (let loop () - (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) - (ok (cond - ((not *server-run*) - (debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.") - #f) - ((null? servers) #f) ;; not ok - ((equal? (list-ref (car servers) 6) ;; compare the servinfofile - (tt-servinf-file ttdat)) - (let* ((res (if db-locked-in - #t - ;; - ;; let's replace the below "winning" lock method with: - ;; 1. create a lock file with pid etc. - ;; 2. if there are no other lock files make an entry in the no-sync db - ;; 3. gather the lock entries, apply the "winner" heuristic - ;; 4. if I'm the winner, set tt-state to 'running else set to 'notthewinner - ;; - ;; New idea: - ;; 1. check all processes entries that match the db - ;; 2. sort by fixed heuristic - ;; 3. if I'm number one, set state to 'running and db-locked-in to #t - (let* ((candidates (map dbfile:row->procinf - (dbfile:with-no-sync-db - nosyncdbpath - (lambda (nsdb) - (dbfile:get-process-options nsdb "server" dbfname))))) - (primecand (begin - (assert (not (null? candidates)) - "HOW CAN WE NOT BE IN THE PROCESSES DB AS A SERVER?") - (car candidates)))) - - ;; compare primecand with myself - ;; if not me check that it is reachable - ;; if reachable exit - #f) - - #;(let* ((lock-result ;; this is the primary lock - need to double verify that got it - (dbfile:with-no-sync-db - nosyncdbpath - (lambda (db) - (db:no-sync-lock-and-check db dbfname - (tt-servinf-file ttdat) - ;; (dbr:dbstruct-dbtmpname dbstruct) - )))) - (success (car lock-result))) - (if success - (begin - (tt-state-set! ttdat 'running) - (debug:print 0 *default-log-port* "Got server lock for " dbfname) - (set! db-locked-in #t) - #t) - (begin - (debug:print 0 *default-log-port* "Failed to get server lock for "dbfname) - #f)))))) - (if (and res (common:low-noise-print 120 "top server message")) - (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for " - dbfname" on "(tt-host ttdat)":"(tt-port ttdat))) - res)) - (else - ;; wrong servinfo file - (debug:print-info 0 *default-log-port* "I'm not the lead server: "servers) - (let* ((leadsrv (car servers))) - (match leadsrv - ((host port startseconds server-id pid dbfname servinfofile) - (let* ((result (tt:timed-ping host port server-id)) - (res (car result)) - (ping (cdr result))) - (debug:print-info 0 *default-log-port* "Ping to "host":"port", with server-id "server-id - ", and file "servinfofile" returned "res) - (if res - #f ;; not the server, but all good, want to exit - (if (and (file-exists? servinfofile) - (> (- (current-seconds)(file-modification-time servinfofile)) 30)) - (begin - ;; can't ping and file has been on disk 15 seconds, go ahead and try to remove it - (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile) - (handle-exceptions - exn - (debug:print-info 0 *default-log-port* "Error removing server info file: "servinfofile) - (delete-file* servinfofile) - ) - #t) ;; not the server but the server is not reachable - (begin - (debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", will try again.") - (thread-sleep! 1) ;; just because - #t))))) - (else ;; should never get here - (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv) - (assert #f "Bad server record "leadsrv)))))))) + + ;; at this point the server is running and responding to calls, we just monitor + ;; for db calls and exit if there are none. + + ;; if I am not in the first 3 servers, exit + + (let loop () + (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) + (home-host (if (null? servers) + #f + (caar servers))) + (my-index (list-index (lambda (x) + (equal? (list-ref x 6) + (tt-servinf-file ttdat))) + servers)) + (ok (cond + ((not *server-run*) + (debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.") + #f) + ((null? servers) + (debug:print 0 *default-log-port* "WARNING: no servinfo files found, this cannot be.") + #f) ;; not ok + ((> my-index 2) + (debug:print 0 *default-log-port* "WARNING: there are more than two servers ahead of me, I'm not needed, exiting.") + #f) ;; not ok to not be in first three + (else #t)))) (if ok (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access (begin (debug:print 0 *default-log-port* "Exiting immediately") (tt:shutdown-server ttdat) @@ -664,12 +637,12 @@ (if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param)) (begin (thread-sleep! 5) (loop))))) - ;; (cleanup) ;; all done by tt:shutdown-server - (debug:print 0 *default-log-port* "INFO: Server timed out, exiting from tt:keep-running."))) + ;; (cleanup) ;; all done by tt:shutdown-server + (debug:print 0 *default-log-port* "INFO: Server timed out, exiting from tt:keep-running.")) (define (tt:shutdown-server ttdat) (let* ((host (tt-host ttdat)) (port (tt-port ttdat))