Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -147,11 +147,11 @@ tasks-add tasks-set-state-given-param-key )) (define *db-write-mutexes* (make-hash-table)) - +(define *server-signature* #f) ;; These are called by the server on recipt of /api calls ;; - keep it simple, only return the actual result of the call, i.e. no meta info here ;; ;; - returns #( flag result ) ;; @@ -231,35 +231,45 @@ (ok-res . #f))) (vector #t res)))))))) ;; indat is (cmd run-id params meta) ;; -;; WARNING: Do not print anything in this function as it reads/writes to current in/out port +;; WARNING: Do not print anything in the lambda of this function as it +;; reads/writes to current in/out port ;; (define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params) + (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.") + (if (not *server-signature*) + (set! *server-signature* (tt:mk-signature *toppath*))) (lambda () - (let* ((indat (deserialize))) - (set! *api-process-request-count* (+ *api-process-request-count* 1)) + (let* ((indat (deserialize)) + (newcount (+ *api-process-request-count* 1)) + (delay-wait (if (> newcount 10) + (- newcount 10) + 0))) + (set! *api-process-request-count* newcount) + (set! *db-last-access* (current-seconds)) (match indat ((cmd run-id params meta) (let* ((status (cond - ;; turn off busy throttling while trying to get things stable - ;; ((> *api-process-request-count* 50) 'busy) - ;; ((> *api-process-request-count* 25) 'loaded) + ;; ((> newcount 30) 'busy) + ;; ((> newcount 15) 'loaded) (else 'ok))) (errmsg (case status - ((busy) (conc "Server overloaded, "*api-process-request-count*" threads in flight")) - ((loaded) (conc "Server loaded, "*api-process-request-count*" threads in flight")) + ((busy) (conc "Server overloaded, "newcount" threads in flight")) + ((loaded) (conc "Server loaded, "newcount" threads in flight")) (else #f))) (result (case status - ((busy loaded) #f) + ((busy) (- newcount 29)) + ((loaded) #f) (else (case cmd - ((ping) (tt:mk-signature *toppath*)) + ((ping) *server-signature*) (else (api:dispatch-request dbstruct cmd run-id params)))))) - (payload (list status errmsg result '()))) + (meta `((wait . ,delay-wait))) + (payload (list status errmsg result meta))) (set! *api-process-request-count* (- *api-process-request-count* 1)) (serialize payload))) (else (assert #f "FATAL: failed to deserialize indat "indat)))))) Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -104,10 +104,11 @@ (define *max-api-process-requests* 0) (define *api-process-request-count* 0) (define *db-write-access* #t) (define *db-last-sync* 0) ;; last time the sync to megatest.db happened (define *db-multi-sync-mutex* (make-mutex)) ;; protect access to *db-sync-in-progress*, *db-last-sync* +(define *db-last-access* (current-seconds)) (define (db:generic-error-printout exn . message) (print-call-chain (current-error-port)) (apply dbfile:print-err message) (dbfile:print-err Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -71,11 +71,11 @@ (if (> (- curr-secs last-update) 2) (begin ((dbr:dbstruct-sync-proc dbstruct) last-update) (dbr:dbstruct-last-update-set! dbstruct curr-secs))) dbstruct) - (let* ((newdbstruct (dbmod:open-dbmoddb areapath run-id init-proc keys syncdir: 'fromdisk))) + (let* ((newdbstruct (dbmod:open-dbmoddb areapath run-id dbfname init-proc keys syncdir: 'fromdisk))) (hash-table-set! *dbmod:nfs-db-handles* dbfname newdbstruct) newdbstruct)))) ;;====================================================================== ;; The inmem one-db file per server method goes in here @@ -110,15 +110,15 @@ ;; Returns dbstruct ;; ;; * This routine creates the db if not found ;; * Probably can get rid of the dbstruct-in ;; -(define (dbmod:open-dbmoddb areapath run-id init-proc keys +(define (dbmod:open-dbmoddb areapath run-id dbfname-in init-proc keys #!key (dbstruct-in #f) (syncdir 'todisk)) (let* ((dbstruct (or dbstruct-in (make-dbr:dbstruct areapath: areapath))) - (dbfname (dbmod:run-id->dbfname run-id)) + (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) (dbpath (dbmod:get-dbdir dbstruct run-id)) ;; directory where all the .db files are kept (dbfullname (dbmod:run-id->full-dbfname dbstruct run-id)) (dbexists (file-exists? dbfullname)) (inmem (dbmod:open-inmem-db init-proc)) (write-access (file-write-access? dbpath)) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -603,12 +603,13 @@ (handle-exceptions exn (begin (print "ERROR: Failed to switch to log output. " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)) (let* ((tl (or (args:get-arg "-log")(launch:setup))) ;; run launch:setup if -server, ensure we do NOT run launch:setup if -log specified + (dbname (args:get-arg "-db")) ;; for the server logfile name (logf (or (args:get-arg "-log") ;; use -log unless we are a server, then craft a logfile name - (conc tl "/logs/server-" (current-process-id) "-" (get-host-name) ".log"))) + (conc tl "/logs/server-"(or dbname "unk")"-"(current-process-id) "-" (get-host-name) ".log"))) (oup (open-logfile logf))) (if (not (args:get-arg "-log")) (hash-table-set! args:arg-hash "-log" logf)) ;; fake out future queries of -log (debug:print-info 0 *default-log-port* "Sending log output to " logf) (set! *default-log-port* oup)))) @@ -935,22 +936,22 @@ ;;====================================================================== ;; Server? Start up here. ;; (if (args:get-arg "-server") - (let* ((run-id (args:get-arg "-run-id")) + (let* (;; (run-id (args:get-arg "-run-id")) (dbfname (args:get-arg "-db")) (tl (launch:setup)) (keys (keys:config-get-fields *configdat*))) (case (rmt:transport-mode) ((http)(http-transport:launch)) ((tcp) (debug:print 0 *default-log-port* "INFO: Running using tcp method.") - (if run-id - (tt:start-server tl run-id dbfname api:tcp-dispatch-request-make-handler keys) + (if dbfname + (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys) (begin - (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -run-id is required.") + (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.") (exit 1)))) (else (debug:print 0 *default-log-port* "ERROR: rmt:transport-mode value not recognised "(rmt:transport-mode)))) (set! *didsomething* #t))) ;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -137,20 +137,22 @@ ;; verify we can talk to this server (if (tt:ping host port server-id) conn (let* ((curr-secs (current-seconds))) ;; rm the (last server) would go here - (if (> (- curr-secs (tt-last-serv-start ttdat)) 30) + (if (> (- curr-secs (tt-last-serv-start ttdat)) 10) (begin (tt-last-serv-start-set! ttdat curr-secs) (server-start-proc))) ;; don't try and start server unless 30 sec has gone by since last attempt (thread-sleep! 1) (tt:client-connect-to-server ttdat dbfname run-id testsuite))))) (else - (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname) - (tt-last-serv-start-set! ttdat (current-seconds)) - (server-start-proc) + (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; really do not want to swamp the machine with servers + (begin + (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname) + (server-start-proc) + (tt-last-serv-start-set! ttdat (current-seconds)))) (thread-sleep! 1) (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) (define (tt:ping host port server-id) (let* ((res (tt:send-receive-direct host port `(ping #f #f #f)))) ;; please send me your server-id @@ -170,36 +172,53 @@ ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res) #f)))) ;; client side handler ;; +;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest") +;; (define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe) ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now. (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f))) (if conn ;; have connection, call the server (let* ((res (tt:send-receive ttdat conn cmd run-id params))) ;; res is (status errmsg result meta) (match res ((status errmsg result meta) + (if (list? meta) + (let* ((delay-wait (alist-ref 'delay-wait meta))) + (if (and (number? delay-wait) + (> delay-wait 0)) + (begin + (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds") + (thread-sleep! delay-wait))))) (case status - ((busy) - (debug:print 0 *default-log-port* "WARNING: server is overloaded, will try again in few seconds.") - (thread-sleep! 2) - (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)) + ((busy) ;; result will be how long the server wants you to delay + (debug:print 0 *default-log-port* "WARNING: server is overloaded, will try again in "result" seconds.") + (thread-sleep! (if (number? result) result 2)) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) ((loaded) - (debug:print 0 *default-log-port* "WARNING: server is loaded, will try again in a second.") - (thread-sleep! 1) - (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)) + (debug:print 0 *default-log-port* "WARNING: server is loaded, will try again in a 1/4 second.") + (thread-sleep! 0.25) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) (else result))) (else (if (not res) - (begin ;; let* ((srvfile (tt-conn-servinf-file ))) ;; server likely died + (let* ((host (tt-conn-host conn)) + (port (tt-conn-port conn)) + ;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db + (pid (tt-conn-pid conn)) + (servinf (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) (hash-table-set! (tt-conns ttdat) dbfname #f) - (debug:print 0 *default-log-port* "INFO: connection to server broken, reconnecting.") - (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)) + (if (file-exists? servinf) + (begin + (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", attempting to remove servinfo file.") + (delete-file* servinf)) + (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) (assert #f "FATAL: tt:handler received bad data "res))))) (begin (thread-sleep! 1) ;; give it a rest and try again (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))))) @@ -291,11 +310,11 @@ (assert areapath "FATAL: areapath not provided for tt:start-server") ;; is there already a server for this dbfile? Then exit. (let* ((ttdat (make-tt areapath: areapath))) ;; (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead ;; (if (null? servers) - (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id (dbfile:db-init-proc) keys))) + (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys))) (tt-handler-set! ttdat (handler dbstruct)) (let* ((tcp-thread (make-thread (lambda () (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data "tcp-server-thread")) @@ -323,11 +342,12 @@ )) (define (tt:keep-running ttdat dbfname dbstruct) ;; verfiy conn for ready ;; listener socket has been started by this stage - (thread-sleep! 1) + ;; wait for a port before creating the registration file + ;; (let* ((cleanup (lambda () (if (tt-cleanup-proc ttdat) ((tt-cleanup-proc ttdat)))))) (let loop ((count 0)) (if (> count 60) @@ -335,21 +355,21 @@ (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.") (exit 1)) (if (not (tt-port ttdat)) ;; no connection yet (let* ((last-update (dbr:dbstruct-last-update dbstruct)) (curr-secs (current-seconds))) - (if (> (- curr-secs last-update) 3) ;; every 3-4 seconds + (if (> (- curr-secs last-update) 3) ;; every 3-4 seconds update the db? maybe this should be refresh the inmem? (begin ((dbr:dbstruct-sync-proc dbstruct) last-update) (dbr:dbstruct-last-update-set! curr-secs))) - (thread-sleep! 1) + (thread-sleep! 0.25) (loop (+ count 1)))))) (tt:create-server-registration-file ttdat dbfname) ;; now start watching the last-access, if it hasn't been touched ;; in over ten seconds we exit - (thread-sleep! 1) + (thread-sleep! 0.05) ;; any real need for delay here? (let loop () (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) (ok (cond ((null? servers) #f) ;; not ok ((equal? (list-ref (car servers) 6) ;; compare the servinfofile @@ -361,20 +381,25 @@ (let* ((leadsrv (car servers))) (match leadsrv ((host port startseconds server-id pid dbfname servinfofile) (if (tt:ping host port server-id) #f ;; not the server, but all good, want to exit - (begin - ;; what to do here? remove the offending file? - (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile) - (delete-file* servinfofile) - #t ;; not the server but the server is not reachable - ))) - (else + (if (and (file-exists? servinfofile) + (> (- (current-seconds)(file-modification-time servinfofile)) 5)) + (begin + ;; can't ping and file has been on disk 5 seconds, go ahead and try to remove it + (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile) + (delete-file* servinfofile) + #t) ;; not the server but the server is not reachable + #t))) + (else ;; should never get here (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv) (assert #f "Bad server record "leadsrv)))))))) - (if (not ok) + (if ok + ;; (if (> *api-process-request-count* 0) ;; have requests in flight + ;; (tt-last-access-set! ttdat (current-seconds))) + (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access (begin (cleanup) (exit))) (if (< (- (current-seconds) (tt-last-access ttdat)) 60) @@ -466,51 +491,43 @@ (define (tt:server-get-info logf) (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id (dbprep-rx (regexp "^SERVER: dbprep")) (dbprep-found 0) (bad-dat (list #f #f #f #f #f #f logf))) - (handle-exceptions - exn - (begin - ;; WARNING: this is potentially dangerous to blanket ignore the errors - (if (file-exists? logf) - (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn)) - bad-dat) ;; no idea what went wrong, call it a bad server - (with-input-from-file - logf - (lambda () - (let loop ((inl (read-line)) - (lnum 0)) - (if (not (eof-object? inl)) - (let ((mlst (string-match server-rx inl)) - (dbprep (string-match dbprep-rx inl))) - (if dbprep (set! dbprep-found 1)) - (if (not mlst) - (if (< lnum 500) ;; give up if more than 500 lines of server log read - (loop (read-line)(+ lnum 1)) - (begin - (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) - bad-dat)) - (match mlst - ((_ host port start server-id pid dbfname) - (list host - (string->number port) - (string->number start) - server-id - (string->number pid) - dbfname - logf)) - (else - (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst) - bad-dat)))) - (begin - (if dbprep-found - (begin - (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) - (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) - bad-dat)))))))) + (let ((fdat (handle-exceptions + exn + (begin + ;; WARNING: this is potentially dangerous to blanket ignore the errors + (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn=" exn) + '()) ;; no idea what went wrong, call it a bad server, return empty list + (with-input-from-file logf read-lines)))) + (if (null? fdat) ;; bad data, return bad-dat + bad-dat + (let loop ((inl (car fdat)) + (tail (cdr fdat)) + (lnum 0)) + (let ((mlst (string-match server-rx inl)) + (dbprep (string-match dbprep-rx inl))) + (if dbprep (set! dbprep-found 1)) + (if (not mlst) + (if (> lnum 500) ;; give up if more than 500 lines of server log read + bad-dat + (if (null? tail) + bad-dat + (loop (car tail)(cdr tail)(+ lnum 1)))) + (match mlst ;; have a not null list + ((_ host port start server-id pid dbfname) + (list host + (string->number port) + (string->number start) + server-id + (string->number pid) + dbfname + logf)) + (else + (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst) + bad-dat))))))))) ;; Given an area path, start a server process ### NOTE ### > file 2>&1 ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. @@ -522,11 +539,11 @@ (let* ((logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) (cmdln (conc mtexe " -server - ";; (or target-host "-") " -m testsuite:" testsuite - " -run-id " (or run-id "main") + ;; " -run-id " (or run-id "main") ;; NO, we do NOT want to have run id as part of this " -db " (dbmod:run-id->dbfname run-id) " " profile-mode ))) ;; (conc " >> " logfile " 2>&1 &"))))) ;; we want the remote server to start in *toppath* so push there ;; (push-directory areapath) ;; use cd in the command line instead @@ -580,12 +597,10 @@ (tt-port-set! uconn port) (tt-host-set! uconn addr) (tt-host-port-set! uconn (conc addr":"port)) (tt-socket-set! uconn tlsn) uconn)) - - ;;====================================================================== ;; utils ;;======================================================================