@@ -177,15 +177,18 @@ run-id))))) (if conn (begin (debug:print-info 2 *default-log-port* "already connected to a server") conn) ;; we are already connected to the server - (let* ((sdats (tt:get-server-info-sorted ttdat dbfname)) + + ;; no conn + (let* ((sdats (tt:get-server-info-sorted ttdat dbfname)) (sdat (if (null? sdats) #f (car sdats)))) - (match sdat + (debug:print-info 2 *default-log-port* "found sdat " sdat) + (match sdat ((host port start-time server-id pid dbfname2 servinffile) (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.") (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile) (let* ((host-port (conc host":"port)) (conn (make-tt-conn @@ -211,37 +214,37 @@ (thread-sleep! 0.5) (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect") (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)) (else (let* ((curr-secs (current-seconds))) - (if (not ping-res) ;; the server is actually dead, remove the .servinfo file + ;; rm the (last server) would go here + (if (> (- curr-secs (tt-last-serv-start ttdat)) 10) (begin (debug:print-info 0 *default-log-port* "Unreachable server at " host":"port" with servinfo file "servinffile", removing it") (if (file-exists? servinffile) (handle-exceptions exn #f - (delete-file servinffile))))) - ;; rm the (last server) would go here - (if (> (- curr-secs (tt-last-serv-start ttdat)) 10) - (begin + (delete-file servinffile))) (tt-last-serv-start-set! ttdat curr-secs) + (debug:print-info 0 *default-log-port* "Starting a new server on " (get-host-name)) (server-start-proc))) ;; start server if 10 sec since last attempt (thread-sleep! 1) - (debug:print-info 0 *default-log-port* "server ping result was "ping-res" neither running nor starting. Retrying connect") + (debug:print-info 0 *default-log-port* "Retrying connect") (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) + (else ;; no good server found, if haven't started server in > 5 secs, start another (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers (begin - (debug:print-info 0 *default-log-port* "Starting server for "dbfname) + (debug:print-info 0 *default-log-port* "Starting server for "dbfname " on " (get-host-name)) (server-start-proc) (tt-last-serv-start-set! ttdat (current-seconds)) - (thread-sleep! 3) + (thread-sleep! 6) )) (thread-sleep! 1) - (debug:print-info 0 *default-log-port* "Connect to server for " dbfname) + (debug:print-info 0 *default-log-port* "Connect to server from " (get-host-name) " for " dbfname) (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) ;; returns ( result . ping_time ) (define (tt:timed-ping host port server-id) (let* ((start-time (current-milliseconds)) @@ -590,10 +593,11 @@ (begin (debug:print 0 *default-log-port* "ERROR: (tt-port ttdat) no port set! Exiting.") (exit))))) ;; create a servinfo file start keep-running + (debug:print 0 *default-log-port* "Creating servinfo file for " dbfname) (tt:create-server-registration-file ttdat dbfname) (procinf-status-set! *procinf* "running") (tt-state-set! ttdat 'running) (dbfile:with-no-sync-db nosyncdbpath @@ -904,18 +908,23 @@ ;; (connect-listener uconn port))) (define (setup-listener-portlogger uconn) (let ((port (portlogger:open-run-close portlogger:find-port))) (assert (tt? uconn) "FATAL: setup-listener called with wrong struct "uconn) + (debug:print 2 *default-log-port* "setup-listener-portlogger got port " port) (handle-exceptions exn (if (< port 65535) (begin (portlogger:open-run-close portlogger:set-failed port) (thread-sleep! 0.25) (setup-listener-portlogger uconn)) - #f) + (begin + (debug:print 0 *default-log-port* "setup-listener-portlogger: could not get a port") + #f + ) + ) (connect-listener uconn port)))) (define (connect-listener uconn port) ;; (tcp-listener-socket LISTENER)(socket-name so) ;; sockaddr-address, sockaddr-port, sockaddr->string