@@ -135,15 +135,19 @@ (define (tt:client-connect-to-server ttdat dbfname run-id testsuite) (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id) (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id) (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f)) (server-start-proc (lambda () + (print "dbfname: "dbfname) + (assert (equal? dbfname "main.db") ;; only main.db is started here + "FATAL: called server-start-proc for db other than main.db") (tt:server-process-run (tt-areapath ttdat) testsuite ;; (dbfile:testsuite-name) (common:find-local-megatest) - run-id)))) + run-id)) + )) (if conn (begin (debug:print-info 2 *default-log-port* "already connected to a server") conn) ;; we are already connected to the server (let* ((sdat (tt:get-current-server-info ttdat dbfname))) @@ -716,10 +720,15 @@ ;; future: ping oldest, if alive remove other : files ;; (define (tt:find-server areapath dbfname) (let* ((servdir (tt:get-servinfo-dir areapath)) (sfiles (glob (conc servdir"/*:"dbfname)))) + + ;; filter the files here by looking in processes table (if we are not main.db) + ;; and or look at the time stamp on the servinfo file, a running server will + ;; touch the file every minute (again, this will only apply for main.db) + sfiles)) ;; given a path to a server info file return: host port startseconds server-id pid dbfname logf ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 @@ -730,12 +739,13 @@ (dbprep-found 0) (bad-dat (list #f #f #f #f #f #f logf))) (let ((fdat (handle-exceptions exn (begin - ;; WARNING: this is potentially dangerous to blanket ignore the errors - (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn)) + ;; BUG, TODO: add err checking, for now blanket ignore the errors? + (debug:print-info 0 *default-log-port* "Unable to get server info from "logf + ", exn="(condition->list exn)) '()) ;; no idea what went wrong, call it a bad server, return empty list (with-input-from-file logf read-lines)))) (if (null? fdat) ;; bad data, return bad-dat bad-dat (let loop ((inl (car fdat)) @@ -773,22 +783,26 @@ (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.") (assert mtexe "FATAL: tt:server-process-run called without mtexe defined.") ;; mtest -server - -m testsuite:ext-tests -db 6.db (let* ((dbfname (dbmod:run-id->dbfname run-id)) (load (get-normalized-cpu-load)) - (trying (length (tt:find-server areapath dbfname))) + (srvrs (tt:find-server areapath dbfname)) + (trying (length srvrs)) (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname)))) (cond ((> load 2.0) - (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server.") - (thread-sleep! 1)) + (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes") + (thread-sleep! 1) + #f) ((> nrun 100) (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.") - (thread-sleep! 1)) - ((> trying 4) + (thread-sleep! 1) + #f) + ((> trying 2) (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.") - (thread-sleep! 1)) + (thread-sleep! 1) + #f) (else (if (not (file-exists? (conc areapath"/logs"))) (create-directory (conc areapath"/logs") #t)) (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) (cmdln (conc @@ -811,11 +825,11 @@ ;; (system (conc "cd "areapath" ; nbfake " cmdln)) ;; (unsetenv "NBFAKE_QUIET") ;; (unsetenv "NBFAKE_LOG") ;;(pop-directory) - ))))) + #t))))) ;;====================================================================== ;; tcp connection stuff ;;======================================================================