Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -411,11 +411,12 @@ (define (http-transport:keep-running) ;; if none running or if > 20 seconds since ;; server last used then start shutdown ;; This thread waits for the server to come alive (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server") - (let* ((sdat #f) + (let* ((servinfofile #f) + (sdat #f) (no-sync-db (db:open-no-sync-db)) (tmp-area (common:get-db-tmp-area)) (started-file (conc tmp-area "/.server-started")) (server-start-time (current-seconds)) (server-info (let loop ((start-time (current-seconds)) @@ -432,10 +433,11 @@ (> (- (current-seconds) start-time) 2)) (let* ((servinfodir (conc *toppath*"/.servinfo")) (ipaddr (car sdat)) (port (cadr sdat)) (servinf (conc servinfodir"/"ipaddr":"port))) + (set! servinfofile servinf) (if (not (file-exists? servinfodir)) (create-directory servinfodir #t)) (with-output-to-file servinf (lambda () (let* ((serv-id (server:mk-signature))) @@ -496,29 +498,16 @@ (if (not server-going) ;; *dbstruct-dbs* (begin (debug:print 0 *default-log-port* "SERVER: dbprep") (set! *dbstruct-dbs* (db:setup #t)) ;; run-id)) FIXME!!! (set! server-going #t) - (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. - - ;; (thread-start! *watchdog*) - ) + (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version))) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. (if (and no-sync-db (common:low-noise-print 5 "sync-all")) ;; cheesy way to reduce frequency of running sync :) (begin (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S")) - - ;; This is tougher than it seems - have to deal with multiple dbs - ;; (db:process-transaction-queue *dbstruct-dbs*) - - (db:all-db-sync *dbstruct-dbs*) - - ;; (db:do-sync no-sync-db) - ;; (db:run-lock-and-sync *no-sync-db*) - ) - ) - ) + (db:all-db-sync *dbstruct-dbs*)))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) @@ -561,22 +550,10 @@ (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) (cond - #;((and *server-run* - (> (- (current-seconds) server-start-time) 420)) ;; let's try server replacement - ;; ((adj-proc-load . 0.056875) (adj-core-load . 0.11375) (1m-load . 0.91) (5m-load . 0.77) (15m-load . 1.0) (proc . 16) (core . 8) (phys . 1)) - (let* ((loaddat (common:get-normalized-cpu-load #f)) - (adj-proc-load (alist-ref 'adj-proc-load loaddat)) - (adj-core-load (alist-ref 'adj-core-load loaddat)) - (adj-load (max adj-proc-load adj-core-load))) - (if (< adj-load 2) ;; reduce chance of runaway - (server:run *toppath*)) - (db:all-db-sync *dbstruct-dbs*) - (thread-sleep! 30) - (http-transport:server-shutdown port))) ((and *server-run* (> (+ last-access server-timeout) (current-seconds))) (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) @@ -584,14 +561,16 @@ (handle-exceptions exn (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk? exn=" exn) (if (and (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter (not *server-overloaded*)) - (change-file-times server-log-file curr-time curr-time) - (if (common:low-noise-print 120 "start new server") - (server:kind-run *toppath*) ;; server:kind-run uses [servers] numservers - ))))) + (change-file-times servinfofile curr-time curr-time))) + (if (or (common:low-noise-print 120 "start new server") + (> *api-process-request-count* 50)) ;; if this server is kind of busy start up another + (begin + (debug:print-info 0 *default-log-port* "Server is busy, start another if possible...") + (server:kind-run *toppath*))))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -493,11 +493,11 @@ ((best-five)(names->dats (best-five))) ((all-valid)(names->dats all-valid)) ((best) (let* ((best-five (best-five)) (len (length best-five))) (hash-table-ref serversdat (list-ref best-five (random len))))) - + ((count)(length all-valid)) (else (debug:print 0 *default-log-port* "ERROR: invalid command "mode) #f))) (begin (server:run areapath) @@ -516,11 +516,12 @@ ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least seconds old ;; (server:wait-for-server-start-last-flag areapath) - (server:run areapath) + (if (< (server:choose-server areapath 'count) 10) + (server:run areapath)) #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((lock-file (conc areapath "/logs/server-start.lock"))) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 25) (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag)