Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -217,12 +217,16 @@ (pid (tt-conn-pid conn)) (servinf (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) (hash-table-set! (tt-conns ttdat) dbfname #f) (if (file-exists? servinf) (begin - (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", attempting to remove servinfo file.") - (delete-file* servinf)) + (debug:print 0 *default-log-port* "INFO: no ping response from server "host":"port" for "dbfname) + (if (and (file-exists? servinf) + (> (- (current-seconds)(file-modification-time servinf)) 60)) + (begin + (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.") + (delete-file* servinf)))) (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) (assert #f "FATAL: tt:handler received bad data "res))))) (begin (thread-sleep! 1) ;; give it a rest and try again @@ -372,20 +376,21 @@ (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) (ok (cond ((null? servers) #f) ;; not ok ((equal? (list-ref (car servers) 6) ;; compare the servinfofile (tt-servinf-file ttdat)) - (debug:print-info 0 *default-log-port* "Keep running, I'm the top server.") + (debug:print-info 0 *default-log-port* "Keep running, I'm the top server on "(tt-host ttdat)":"(tt-port ttdat)) (if db-locked-in #t (let* ((lockinfo (dbfile:with-no-sync-db nosyncdbpath (lambda (db) (db:no-sync-get-lock db dbfname)))) (success (car lockinfo))) (if success (begin (debug:print 0 *default-log-port* "Got server lock for "dbfname) + (set! db-locked-in #t) #t) (begin (debug:print 0 *default-log-port* "Failed to get server lock for "dbfname) #f))))) (else @@ -394,17 +399,19 @@ (match leadsrv ((host port startseconds server-id pid dbfname servinfofile) (if (tt:ping host port server-id) #f ;; not the server, but all good, want to exit (if (and (file-exists? servinfofile) - (> (- (current-seconds)(file-modification-time servinfofile)) 5)) + (> (- (current-seconds)(file-modification-time servinfofile)) 15)) (begin ;; can't ping and file has been on disk 5 seconds, go ahead and try to remove it (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile) (delete-file* servinfofile) #t) ;; not the server but the server is not reachable - #t))) + (begin + (debug:print 0 *default-log-port* "I'm not the server but will try again since "servinfofile" is fresh") + #t)))) (else ;; should never get here (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv) (assert #f "Bad server record "leadsrv)))))))) (if ok ;; (if (> *api-process-request-count* 0) ;; have requests in flight @@ -616,11 +623,13 @@ (define (setup-listener uconn #!optional (port 4242)) (assert (tt? uconn) "FATAL: setup-listener called with wrong struct "uconn) (handle-exceptions exn (if (< port 65535) - (setup-listener uconn (+ port 1)) + (begin + (thread-sleep! 0.25) + (setup-listener uconn (+ port 1))) #f) (connect-listener uconn port))) (define (connect-listener uconn port) ;; (tcp-listener-socket LISTENER)(socket-name so)