Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -97,10 +97,11 @@ (host-port #f) (cmd-thread #f) (ro-mode #f) (ro-mode-checked #f) (last-access (current-seconds)) + (servinf-file #f) ) (define (tt:make-remote areapath) (make-tt areapath: areapath)) @@ -115,20 +116,21 @@ testsuite ;; (dbfile:testsuite-name) (common:find-local-megatest) run-id)))) (if conn conn ;; we are already connected to the server - (let* ((sdat (tt:get-current-server-info ttdat dbfname run-id))) + (let* ((sdat (tt:get-current-server-info ttdat dbfname))) (match sdat - ((host port start-time server-id pid dbfname2) + ((host port start-time server-id pid dbfname2 servinffile) (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.") (let* ((host-port (conc host":"port)) (conn (make-tt-conn host: host port: port host-port: host-port dbfname: dbfname + servinf-file: servinffile server-id: server-id server-start: start-time pid: pid))) (hash-table-set! (tt-conns ttdat) dbfname conn) ;; verify we can talk to this server @@ -138,10 +140,11 @@ ;; rm the (last server) would go here (server-start-proc) (thread-sleep! 1) (tt:client-connect-to-server ttdat dbfname run-id testsuite))))) (else + (debug:print-info 0 *default-log-port* "Number of records did not match expected. Bad server info?") (server-start-proc) (thread-sleep! 1) (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) (define (tt:ping host port server-id) @@ -150,16 +153,18 @@ ;; need two threads, one a 5 second timer ;; (match res ((status errmsg result meta) (if (equal? result server-id) - #t ;; then we are good + (begin + (debug:print 0 *default-log-port* "Ping to "host":"port" successful.") + #t) ;; then we are good (begin (debug:print 0 *default-log-port* "WARNING: server-id does not match, expected: "server-id", got: "result) #f))) (else - (debug:print 0 *default-log-port* "res not in form (status errmsg resutl meta), got: "res) + ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res) #f)))) ;; client side handler ;; (define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe) @@ -208,12 +213,28 @@ ;; readonly-mode dbfname testsuite mtexe))))))) (define (tt:bid-for-servership run-id) #f) -(define (tt:get-current-server-info ttdat dbfname run-id) +;; gets server info and appends path to server file +;; sorts by age, oldest first +;; +;; returns list of (host port startseconds server-id servinfofile) +;; +(define (tt:get-server-info-sorted ttdat dbfname) + (let* ((areapath (tt-areapath ttdat)) + (sfiles (tt:find-server areapath dbfname)) + (sdats (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read + (sorted (sort sdats (lambda (a b) + (< (list-ref a 2)(list-ref b 2)))))) + sorted)) + +(define (tt:get-current-server-info ttdat dbfname) (assert (tt-areapath ttdat) "FATAL: areapath not set in ttdat.") + ;; + ;; TODO - replace most of below with tt;get-server-info-sorted + ;; (let* ((areapath (tt-areapath ttdat)) (sfiles (tt:find-server areapath dbfname)) (sdats (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read (sorted (sort sdats (lambda (a b) (< (list-ref a 2)(list-ref b 2)))))) @@ -257,75 +278,110 @@ ;; NOTE: organise by dbfname, not run-id so we don't need ;; to pull in more modules ;; ;; This is the routine called in megatest.scm to start a server ;; +;; Server viability is checked in keep-running. Blindly start and run here. +;; (define (tt:start-server areapath run-id dbfname handler keys) (assert areapath "FATAL: areapath not provided for tt:start-server") ;; is there already a server for this dbfile? Then exit. - (let* ((ttdat (make-tt areapath: areapath)) - (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead - (if (null? servers) - (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id (dbfile:db-init-proc) keys))) - (tt-handler-set! ttdat (handler dbstruct)) - (let* ((tcp-thread (make-thread - (lambda () - (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data - "tcp-server-thread")) - (run-thread (make-thread - (lambda () - (tt:keep-running ttdat dbfname dbstruct))))) - (thread-start! tcp-thread) - (thread-start! run-thread) - (thread-join! run-thread) ;; run thread will exit on timeout or other conditions - ;; - ;; set a flag here to tell tcp-thread to stop running - ;; - ;; (thread-join! tcp-thread) ;; can't wait - ;; - ;; remove the servinfo file - ;; - ;; close the database, remove lock in on-disk db - ;; - ;; close the listener ports - ;; - (exit))) - (begin - (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.") - (exit))))) + (let* ((ttdat (make-tt areapath: areapath))) + ;; (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead + ;; (if (null? servers) + (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id (dbfile:db-init-proc) keys))) + (tt-handler-set! ttdat (handler dbstruct)) + (let* ((tcp-thread (make-thread + (lambda () + (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data + "tcp-server-thread")) + (run-thread (make-thread + (lambda () + (tt:keep-running ttdat dbfname dbstruct))))) + (thread-start! tcp-thread) + (thread-start! run-thread) + (thread-join! run-thread) ;; run thread will exit on timeout or other conditions + ;; + ;; set a flag here to tell tcp-thread to stop running + ;; + ;; (thread-join! tcp-thread) ;; can't wait + ;; + ;; remove the servinfo file + ;; + ;; close the database, remove lock in on-disk db + ;; + ;; close the listener ports + ;; + (exit))) + ;;(begin + ;; (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.") + ;; (exit))))) + )) (define (tt:keep-running ttdat dbfname dbstruct) ;; verfiy conn for ready ;; listener socket has been started by this stage (thread-sleep! 1) - (let loop ((count 0)) - (if (> count 60) - (begin - (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.") - (exit 1)) - (if (not (tt-port ttdat)) ;; no connection yet - (let* ((last-update (dbr:dbstruct-last-update dbstruct)) - (curr-secs (current-seconds))) - (if (> (- curr-secs last-update) 3) ;; every 3-4 seconds - (begin - ((dbr:dbstruct-sync-proc dbstruct) last-update) - (dbr:dbstruct-last-update-set! curr-secs))) - (thread-sleep! 1) - (loop (+ count 1)))))) - - (tt:create-server-registration-file ttdat dbfname) - ;; now start watching the last-access, if it hasn't been touched - ;; in over ten seconds we exit - (let loop () - (if (< (- (current-seconds) (tt-last-access ttdat)) 60) - (begin - (thread-sleep! 2) - (loop)))) - (if (tt-cleanup-proc ttdat) - ((tt-cleanup-proc ttdat))) - (debug:print 0 *default-log-port* "INFO: Server timed out, exiting.")) - + (let* ((cleanup (lambda () + (if (tt-cleanup-proc ttdat) + ((tt-cleanup-proc ttdat)))))) + (let loop ((count 0)) + (if (> count 60) + (begin + (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.") + (exit 1)) + (if (not (tt-port ttdat)) ;; no connection yet + (let* ((last-update (dbr:dbstruct-last-update dbstruct)) + (curr-secs (current-seconds))) + (if (> (- curr-secs last-update) 3) ;; every 3-4 seconds + (begin + ((dbr:dbstruct-sync-proc dbstruct) last-update) + (dbr:dbstruct-last-update-set! curr-secs))) + (thread-sleep! 1) + (loop (+ count 1)))))) + + (tt:create-server-registration-file ttdat dbfname) + ;; now start watching the last-access, if it hasn't been touched + ;; in over ten seconds we exit + (thread-sleep! 1) + (let loop () + (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) + (ok (cond + ((null? servers) #f) ;; not ok + ((equal? (list-ref (car servers) 6) ;; compare the servinfofile + (tt-servinf-file ttdat)) + (debug:print-info 0 *default-log-port* "Keep running, I'm the top server.") + #t) + (else + (debug:print-info 0 *default-log-port* "I'm not the lead server: "servers) + (let* ((leadsrv (car servers))) + (match leadsrv + ((host port startseconds server-id pid dbfname servinfofile) + (if (tt:ping host port server-id) + #f ;; not the server, but all good, want to exit + (begin + ;; what to do here? remove the offending file? + (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile) + (delete-file* servinfofile) + #t ;; not the server but the server is not reachable + ))) + (else + (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv) + (assert #f "Bad server record "leadsrv)))))))) + (if (not ok) + (begin + (cleanup) + (exit))) + + (if (< (- (current-seconds) (tt-last-access ttdat)) 60) + (begin + (thread-sleep! 5) + (loop))))) + (cleanup) + (debug:print 0 *default-log-port* "INFO: Server timed out, exiting."))) + + ;; ;; given an already set up uconn start the cmd-loop ;; ;; ;; (define (tt:cmd-loop ttdat) ;; (let* ((serv-listener (-socket uconn)) ;; (listener (lambda () @@ -380,10 +436,11 @@ (serv-id (tt:mk-signature areapath)) (clean-proc (lambda () (delete-file* servinf)))) (assert (and host port) "FATAL: tt:create-server-registration-file called with no conn, dbfname="dbfname) (tt-cleanup-proc-set! ttdat clean-proc) + (tt-servinf-file-set! ttdat servinf) (with-output-to-file servinf (lambda () (print "SERVER STARTED: "host":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)" dbfname: "dbfname))) serv-id)) @@ -395,19 +452,19 @@ (define (tt:find-server areapath dbfname) (let* ((servdir (tt:get-servinfo-dir areapath)) (sfiles (glob (conc servdir"/*:"dbfname)))) sfiles)) -;; given a path to a server info file return: host port startseconds server-id +;; given a path to a server info file return: host port startseconds server-id pid dbfname logf ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 ;; (define (tt:server-get-info logf) (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id (dbprep-rx (regexp "^SERVER: dbprep")) (dbprep-found 0) - (bad-dat (list #f #f #f #f #f #f))) + (bad-dat (list #f #f #f #f #f #f logf))) (handle-exceptions exn (begin ;; WARNING: this is potentially dangerous to blanket ignore the errors (if (file-exists? logf) @@ -433,11 +490,12 @@ (list host (string->number port) (string->number start) server-id (string->number pid) - dbfname)) + dbfname + logf)) (else (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst) bad-dat)))) (begin (if dbprep-found @@ -468,12 +526,14 @@ ;; we want the remote server to start in *toppath* so push there ;; (push-directory areapath) ;; use cd in the command line instead (debug:print 0 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)"...") ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... + (setenv "NBFAKE_LOG" logfile) (system (conc "cd "areapath" ; nbfake " cmdln)) (unsetenv "NBFAKE_QUIET") + (unsetenv "NBFAKE_LOG") ;;(pop-directory) )) ;;====================================================================== ;; tcp connection stuff