Index: TODO ================================================================== --- TODO +++ TODO @@ -16,10 +16,17 @@ # along with Megatest. If not, see . TODO ==== +23WW48 +. Allow two or three servers to run for any given db +. Update avg call count/sec every 30 sec in no-sync +. get server uses no-sync process info to decide which server to suggest +. Use process table to decide who will do sync back +. Fix metadat being synced over and over + 23WW47 . Finding server .. look at .servinfo for likely prime main .. ask the .servinfo prime main for real prime main .. save prime main (for how long, 10 seconds or 10 minutes?) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -728,11 +728,11 @@ ;; #t ;; #f))) ;; timeout is hms string: 1h 5m 3s, default is 1 minute ;; This is currently broken. Just use the number of hours with no unit. -;; Default is 60 seconds. +;; Default is 600 seconds. ;; (define (server:expiration-timeout) (let* ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (string? tmo) (let* ((num (string->number tmo))) Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -185,17 +185,26 @@ (thread-sleep! 0.5) (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect") (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)) (else (let* ((curr-secs (current-seconds))) + (if (not ping-res) ;; the server is actually dead, remove the .servinfo file + (begin + (debug:print-info 0 *default-log-port* "Unreachable server at " + host":"port" with servinfo file "servinffile", removing it") + (if (file-exists? servinffile) + (handle-exceptions + exn + #f + (delete-file servinffile))))) ;; rm the (last server) would go here (if (> (- curr-secs (tt-last-serv-start ttdat)) 10) (begin (tt-last-serv-start-set! ttdat curr-secs) (server-start-proc))) ;; start server if 10 sec since last attempt (thread-sleep! 1) - (debug:print-info 0 *default-log-port* "server ping result was neither running nor starting. Retrying connect") + (debug:print-info 0 *default-log-port* "server ping result was "ping-res" neither running nor starting. Retrying connect") (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) (else ;; no good server found, if haven't started server in > 5 secs, start another (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers (begin (debug:print-info 0 *default-log-port* "Starting server for "dbfname) @@ -566,10 +575,12 @@ (dbfile:insert-or-update-process nsdb *procinf*))) (thread-start! run-thread) (thread-join! run-thread) ;; run thread will exit on timeout or other conditions + ;; (tcp-close (tt-socket ttdat)) ;; close up ports here + ;; replace with call to (dbfile:set-process-done nsdb host pid reason) (procinf-status-set! *procinf* "done") (procinf-end-set! *procinf* (current-seconds)) ;; either convert this to use set-process-done or get rid of set-process-done (dbfile:with-no-sync-db @@ -629,11 +640,11 @@ (if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param)) (begin (thread-sleep! 5) (loop))))) - ;; (cleanup) ;; all done by tt:shutdown-server + (tt:shutdown-server ttdat) (debug:print 0 *default-log-port* "INFO: Server timed out, exiting from tt:keep-running."))) (define (tt:shutdown-server ttdat) (let* ((host (tt-host ttdat)) @@ -641,11 +652,10 @@ (sinf (tt-servinf-file ttdat))) (tt-state-set! ttdat 'shutdown) (portlogger:open-run-close portlogger:set-port port "released") (if (file-exists? sinf) (delete-file* sinf)) - (tcp-close (tt-socket ttdat)) ;; close up ports here )) ;; return servid ;; side-effects: ;; ttdat-cleanup-proc is populated with function to remove the serverinfo file @@ -772,24 +782,29 @@ #f) (else (if (not (file-exists? (conc areapath"/logs"))) (create-directory (conc areapath"/logs") #t)) (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) - (cmdln (conc + (cmdln (conc mtexe " -startdir "areapath " -server - ";; (or target-host "-") " -m testsuite:"testsuite " -db "dbfname ;; (dbmod:run-id->dbfname run-id) " " profile-mode - (conc " >> " logfile " 2>&1 &")))) + #;(conc " >> " logfile " 2>&1 &")))) ;; we want the remote server to start in *toppath* so push there ;; (push-directory areapath) ;; use cd in the command line instead (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath) ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) - (system cmdln) + (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... + (setenv "NBFAKE_LOG" logfile) + (system (conc "cd "areapath" ; nbfake " cmdln)) + (unsetenv "NBFAKE_QUIET") + (unsetenv "NBFAKE_LOG") + ;; (system cmdln) (hash-table-set! *last-server-start* dbfname (current-seconds)) ;; ;; use below to go back to nbfake - nbfake does cause trouble ... ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... ;; (setenv "NBFAKE_LOG" logfile) ;; (system (conc "cd "areapath" ; nbfake " cmdln))