@@ -111,10 +111,11 @@ ;; (define tt-server-timeout-param (make-parameter 600)) ;; make ttdat visible (define *server-info* #f) +(define *server-run* #t) (define (tt:make-remote areapath) (make-tt areapath: areapath)) ;; 1 ... or #f @@ -125,29 +126,31 @@ (and (or (number? run-id) (not run-id)) (equal? (dbfile:run-id->dbfname run-id) dbfname))) (tcp-buffer-size 2048) -;; (max-connections 4096) +;; (max-connections 4096) + +(define (tt:get-conn ttdat dbfname) + (hash-table-ref/default (tt-conns ttdat) dbfname #f)) ;; do all the busy work of finding and setting up conn for ;; connecting to a server ;; -(define (tt:client-connect-to-server ttdat dbfname run-id testsuite) +(define (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc) (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id) (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id) - (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f)) - (server-start-proc (lambda () - (print "dbfname: "dbfname) - (assert (equal? dbfname "main.db") ;; only main.db is started here - "FATAL: called server-start-proc for db other than main.db") - (tt:server-process-run - (tt-areapath ttdat) - testsuite ;; (dbfile:testsuite-name) - (common:find-local-megatest) - run-id)) - )) + (let* ((conn (tt:get-conn ttdat dbfname)) + (server-start-proc (or server-start-proc + (lambda () + (assert (equal? dbfname "main.db") ;; only main.db is started here + "FATAL: called server-start-proc for db other than main.db") + (tt:server-process-run + (tt-areapath ttdat) + testsuite ;; (dbfile:testsuite-name) + (common:find-local-megatest) + run-id))))) (if conn (begin (debug:print-info 2 *default-log-port* "already connected to a server") conn) ;; we are already connected to the server (let* ((sdat (tt:get-current-server-info ttdat dbfname))) @@ -176,21 +179,21 @@ (hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good? conn) ((starting) (thread-sleep! 0.5) (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect") - (tt:client-connect-to-server ttdat dbfname run-id testsuite)) + (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)) (else (let* ((curr-secs (current-seconds))) ;; rm the (last server) would go here (if (> (- curr-secs (tt-last-serv-start ttdat)) 10) (begin (tt-last-serv-start-set! ttdat curr-secs) (server-start-proc))) ;; start server if 10 sec since last attempt (thread-sleep! 1) (debug:print-info 2 *default-log-port* "server ping result was neither running nor starting. Retrying connect") - (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) + (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) (else ;; no good server found, if haven't started server in > 5 secs, start another (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers (begin (debug:print-info 0 *default-log-port* "Starting server for "dbfname) (server-start-proc) @@ -197,11 +200,11 @@ (tt-last-serv-start-set! ttdat (current-seconds)) (thread-sleep! 3) )) (thread-sleep! 1) (debug:print-info 0 *default-log-port* "Connect to server for " dbfname) - (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) + (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) (define (tt:timed-ping host port server-id) (let* ((start-time (current-milliseconds)) (result (tt:ping host port server-id))) (cons result (- (current-milliseconds) start-time)))) @@ -233,14 +236,14 @@ ;; client side handler ;; ;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest") ;; -(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe) +(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc) ;; connect-to-server will start a server if needed. (let* ((areapath (tt-areapath ttdat)) - (conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; looks up conn keyed by dbfname + (conn (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))) ;; looks up conn keyed by dbfname (if conn ;; have connection, call the server (let* ((res (tt:send-receive ttdat conn cmd run-id params))) ;; res is (status errmsg result meta) (match res @@ -560,10 +563,13 @@ ;; in over ten seconds we exit (thread-sleep! 0.05) ;; any real need for delay here? (let loop () (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) (ok (cond + ((not *server-run*) + (debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.") + #f) ((null? servers) #f) ;; not ok ((equal? (list-ref (car servers) 6) ;; compare the servinfofile (tt-servinf-file ttdat)) (let* ((res (if db-locked-in #t @@ -629,12 +635,13 @@ (let* ((last-update (dbr:dbstruct-last-update dbstruct)) (curr-secs (current-seconds))) (if (and (eq? (tt-state ttdat) 'running) (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db? - (begin - (set! (file-modification-time (tt-servinf-file ttdat)) (current-seconds)) + (let* ((sinfo-file (tt-servinf-file ttdat))) + ;; (debug:print 0 *default-log-port* "INFO: touching "sinfo-file) + (set! (file-modification-time sinfo-file) (current-seconds)) ((dbr:dbstruct-sync-proc dbstruct) last-update) (dbr:dbstruct-last-update-set! dbstruct curr-secs)))) (if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param)) (begin @@ -717,17 +724,25 @@ ;; if more than one, wait one second and look again ;; future: ping oldest, if alive remove other : files ;; (define (tt:find-server areapath dbfname) (let* ((servdir (tt:get-servinfo-dir areapath)) - (sfiles (glob (conc servdir"/*:"dbfname)))) - + (sfiles (glob (conc servdir"/*:"dbfname))) + (goodfiles '())) + ;; filter the files here by looking in processes table (if we are not main.db) ;; and or look at the time stamp on the servinfo file, a running server will ;; touch the file every minute (again, this will only apply for main.db) - - sfiles)) + (for-each (lambda (fname) + (let* ((age (- (current-seconds)(file-modification-time fname)))) + (if (> age 10) ;; can't trust it if over ten seconds old + (begin + (debug:print 0 *default-log-port* "WARNING: removing stale servinfo file "fname) + (delete-file fname)) + (set! goodfiles (cons fname goodfiles))))) + sfiles) + goodfiles)) ;; given a path to a server info file return: host port startseconds server-id pid dbfname logf ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 ;; @@ -769,10 +784,17 @@ logf)) (else (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst) bad-dat))))))))) +(define *last-server-start* (make-hash-table)) + +(define (tt:too-recent-server-start dbfname) + (let* ((last-run-time (hash-table-ref/default *last-server-start* dbfname #f))) + (and last-run-time + (< (- (current-seconds) last-run-time) 5)))) + ;; Given an area path, start a server process ### NOTE ### > file 2>&1 ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; @@ -779,55 +801,58 @@ (define (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area (assert areapath "FATAL: tt:server-process-run called without areapath defined.") (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.") (assert mtexe "FATAL: tt:server-process-run called without mtexe defined.") ;; mtest -server - -m testsuite:ext-tests -db 6.db - (let* ((dbfname (dbmod:run-id->dbfname run-id)) - (load (get-normalized-cpu-load)) - (srvrs (tt:find-server areapath dbfname)) - (trying (length srvrs)) - (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname)))) - (cond - ((> load 2.0) - (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes") - (thread-sleep! 1) - #f) - ((> nrun 100) - (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.") - (thread-sleep! 1) - #f) - ((> trying 2) - (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.") - (thread-sleep! 1) - #f) - (else - (if (not (file-exists? (conc areapath"/logs"))) - (create-directory (conc areapath"/logs") #t)) - (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) - (cmdln (conc - mtexe - " -startdir "areapath - " -server - ";; (or target-host "-") - " -m testsuite:"testsuite - " -db "dbfname ;; (dbmod:run-id->dbfname run-id) - " " profile-mode - (conc " >> " logfile " 2>&1 &")))) - ;; we want the remote server to start in *toppath* so push there - ;; (push-directory areapath) ;; use cd in the command line instead - (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath) - ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) - - (system cmdln) - ;; ;; use below to go back to nbfake - nbfake does cause trouble ... - ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... - ;; (setenv "NBFAKE_LOG" logfile) - ;; (system (conc "cd "areapath" ; nbfake " cmdln)) - ;; (unsetenv "NBFAKE_QUIET") - ;; (unsetenv "NBFAKE_LOG") - - ;;(pop-directory) - #t))))) + (let* ((dbfname (dbmod:run-id->dbfname run-id))) + (if (tt:too-recent-server-start dbfname) + #f + (let* ((load (get-normalized-cpu-load)) + (srvrs (tt:find-server areapath dbfname)) + (trying (length srvrs)) + (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname)))) + (cond + ((> load 2.0) + (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes") + (thread-sleep! 1) + #f) + ((> nrun 100) + (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.") + (thread-sleep! 1) + #f) + ((> trying 2) + (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.") + (thread-sleep! 1) + #f) + (else + (if (not (file-exists? (conc areapath"/logs"))) + (create-directory (conc areapath"/logs") #t)) + (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) + (cmdln (conc + mtexe + " -startdir "areapath + " -server - ";; (or target-host "-") + " -m testsuite:"testsuite + " -db "dbfname ;; (dbmod:run-id->dbfname run-id) + " " profile-mode + (conc " >> " logfile " 2>&1 &")))) + ;; we want the remote server to start in *toppath* so push there + ;; (push-directory areapath) ;; use cd in the command line instead + (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath) + ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) + + (system cmdln) + (hash-table-set! *last-server-start* dbfname (current-seconds)) + ;; ;; use below to go back to nbfake - nbfake does cause trouble ... + ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... + ;; (setenv "NBFAKE_LOG" logfile) + ;; (system (conc "cd "areapath" ; nbfake " cmdln)) + ;; (unsetenv "NBFAKE_QUIET") + ;; (unsetenv "NBFAKE_LOG") + + ;;(pop-directory) + #t))))))) ;;====================================================================== ;; tcp connection stuff ;;======================================================================