Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -125,10 +125,12 @@ (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f #f "remote must be called with a vector") ) + ((> *api-process-request-count* 20) + (vector #f 'overloaded)) (else (let* ((cmd-in (vector-ref dat 0)) (cmd (if (symbol? cmd-in) cmd-in (string->symbol cmd-in))) @@ -271,16 +273,17 @@ ;; TESTMETA ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) ;; TASKS ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)))))) + ;; save all stats + (let ((delta-t (- (current-milliseconds) + start-t))) + (hash-table-set! *db-api-call-time* cmd + (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if (not writecmd-in-readonly-mode) - (let ((delta-t (- (current-milliseconds) - start-t))) - (hash-table-set! *db-api-call-time* cmd - (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))) - (vector #t res)) + (vector #t res) (vector #f res))))))) ;; http-server send-response ;; api:process-request ;; db:* Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -78,11 +78,11 @@ (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; - (let* ((server-dat (server:get-first-best areapath)) + (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) (runremote (or area-dat *runremote*))) (if (not server-dat) ;; no server found (client:setup-http areapath remaining-tries: (- remaining-tries 1)) (let ((host (cadr server-dat)) (port (caddr server-dat))) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -150,12 +150,14 @@ (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) - (server-timeout (or (server:get-timeout) 100)) - (force-server #f)) ;; default to 100 seconds + (server-timeout (or (server:get-timeout) 100)) ;; default to 100 seconds + (force-server #f) + (ro-mode #f) + (ro-mode-checked #f)) ;; flag that indicates we have checked for ro-mode ;; launching and hosts (defstruct host (reachable #f) (last-update 0) @@ -1062,20 +1064,25 @@ ;; force use of server? ;; (define (common:force-server?) (let* ((force-setting (configf:lookup *configdat* "server" "force")) - (force-type (if force-setting (string->symbol force-setting) #f))) - (case force-type - ((#f) #f) - ((always) #t) - ((test) (if (args:get-arg "-execute") ;; we are in a test - #t - #f)) - (else - (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.") - #t)))) ;; default to requiring server + (force-type (if force-setting (string->symbol force-setting) #f)) + (force-result (case force-type + ((#f) #f) + ((always) #t) + ((test) (if (args:get-arg "-execute") ;; we are in a test + #t + #f)) + (else + (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.") + #t)))) ;; default to requiring server + (if force-result + (begin + (debug:print-info 0 *default-log-port* "forcing use of server, force setting is \"" force-setting "\".") + #t) + #f))) ;;====================================================================== ;; M I S C L I S T S ;;====================================================================== Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -2096,18 +2096,22 @@ ;; generate stats from *db-api-call-time* (let ((ordered-keys (sort (hash-table-keys *db-api-call-time*) (lambda (a b) (let ((sum-a (common:sum (hash-table-ref *db-api-call-time* a))) (sum-b (common:sum (hash-table-ref *db-api-call-time* b)))) - (> sum-a sum-b)))))) + (> sum-a sum-b))))) + (total 0)) (for-each (lambda (cmd-key) (let* ((dat (hash-table-ref *db-api-call-time* cmd-key)) - (avg (if (> (length dat) 0) + (num (length dat)) + (avg (if (> num 0) (/ (common:sum dat)(length dat))))) + (set! total (+ total num)) (debug:print-info 0 *default-log-port* cmd-key "\tavg: " avg " max: " (common:max dat) " min: " (common:min-max < dat) " num: " (length dat)))) - ordered-keys))) + ordered-keys) + (debug:print-info 0 *default-log-port* "TOTAL: " total " api calls since start."))) (define (db:get-all-run-ids dbstruct) (db:with-db dbstruct #f Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -418,11 +418,14 @@ (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) (begin (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) (flush-output *default-log-port*))) - + (if (common:low-noise-print 60 "dbstats") + (begin + (debug:print 0 *default-log-port* "Server stats:") + (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -785,11 +785,11 @@ (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config (rundir (if (and runname target linktree)(conc linktree "/" target "/" runname) #f)) (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) - (cancreate (and rundir (common:file-exists? rundir)(file-write-access? rundir))) + (cancreate (and rundir (common:file-exists? rundir)(file-write-access? rundir))) (cxt (hash-table-ref/default *contexts* toppath #f))) ;; create our cxt for this area if it doesn't already exist (if (not cxt)(hash-table-set! *contexts* toppath (make-cxt))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -56,15 +56,26 @@ ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; - (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value - (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas - (dbfile (conc *toppath* "/megatest.db")) - (readonly-mode (not (file-write-access? dbfile))) ;; TODO: use dbstruct or runremote to figure this out in future - (runremote (or area-dat *runremote*))) + (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value + (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas + (runremote (or area-dat + *runremote*)) + (readonly-mode (if (and runremote + (remote-ro-mode-checked runremote)) + (remote-ro-mode runremote) + (let* ((dbfile (conc *toppath* "/megatest.db")) + (ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future + (if runremote + (begin + (remote-ro-mode-set! runremote ro-mode) + (remote-ro-mode-checked-set! runremote #t) + ro-mode) + ro-mode))))) + ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) (cond ;; give up if more than 15 attempts ((> attemptnum 15) (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.") @@ -94,11 +105,11 @@ (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") (remote-conndat-set! runremote #f) (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a record for our connection for given area - ((not runremote) + ((not runremote) ;; can remove this one. should never get here. (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a homehost record @@ -156,11 +167,12 @@ (not (remote-conndat runremote))) (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost (not (remote-conndat runremote)))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) (mutex-unlock! *rmt-mutex*) - (server:start-and-wait *toppath*) + (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? + (server:start-and-wait *toppath*)) (remote-force-server-set! runremote (common:force-server?)) (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as ;; all set up if get this far, dispatch the query ((and (not (remote-force-server runremote)) @@ -195,18 +207,24 @@ res) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " is unknown") (mutex-unlock! *rmt-mutex*) (exit 1))) - (begin - (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) - (remote-conndat-set! runremote #f) - (remote-server-url-set! runremote #f) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") - (mutex-unlock! *rmt-mutex*) - (server:start-and-wait *toppath*) - (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) + (if (eq? res 'overloaded) + (let ((wait-delay (+ attemptnum (* attemptnum 10)))) + (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") + (thread-sleep! wait-delay) + (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) + (begin + (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) + (remote-conndat-set! runremote #f) + (remote-server-url-set! runremote #f) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") + (mutex-unlock! *rmt-mutex*) + (if (not (server:check-if-running *toppath*)) + (server:start-and-wait *toppath*)) + (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))))))))) ;; (define (rmt:update-db-stats run-id rawcmd params duration) ;; (mutex-lock! *db-stats-mutex*) ;; (handle-exceptions ;; exn Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -167,19 +167,19 @@ (day-seconds (* 24 60 60))) ;; if the directory exists continue to get the list ;; otherwise attempt to create the logs dir and then ;; continue (if (if (directory-exists? (conc areapath "/logs")) - #t + '() (if (file-write-access? areapath) (begin (condition-case (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list."))) (directory-exists? (conc areapath "/logs"))) - #f)) + '())) (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) (num-serv-logs (length server-logs))) (if (null? server-logs) '() (let loop ((hed (car server-logs)) @@ -218,21 +218,24 @@ ;; (define (server:get-best srvlst) (let ((now (current-seconds))) (sort (filter (lambda (rec) - (let ((start-time (list-ref rec 3)) - (mod-time (list-ref rec 0))) - ;; (print "start-time: " start-time " mod-time: " mod-time) - (and start-time mod-time - (> (- now start-time) 0) ;; been running at least 0 seconds - (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds - (< (- now start-time) - (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) - 180) - (random 360))) ;; under one hour running time +/- 180 - ))) + (if (and (list? rec) + (> (length rec) 2)) + (let ((start-time (list-ref rec 3)) + (mod-time (list-ref rec 0))) + ;; (print "start-time: " start-time " mod-time: " mod-time) + (and start-time mod-time + (> (- now start-time) 0) ;; been running at least 0 seconds + (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds + (< (- now start-time) + (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) + 180) + (random 360))) ;; under one hour running time +/- 180 + )) + #f)) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) @@ -240,10 +243,20 @@ (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and srvrs (not (null? srvrs))) (car srvrs) #f))) + +(define (server:get-rand-best areapath) + (let ((srvrs (server:get-best (server:get-list areapath)))) + (if (and (list? srvrs) + (not (null? srvrs))) + (let* ((len (length srvrs)) + (idx (random len))) + (list-ref srvrs idx)) + #f))) + (define (server:record->url servr) (match-let (((mod-time host port start-time pid) servr)) (if (and host port) @@ -292,13 +305,20 @@ (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; -(define (server:check-if-running areapath) - (let* ((servers (server:get-best (server:get-list areapath)))) - (if (null? servers) +(define (server:check-if-running areapath #!key (numservers "2")) + (let* ((ns (string->number + (or (configf:lookup *configdat* "server" "numservers") numservers))) + (servers (server:get-best (server:get-list areapath)))) + ;; (print "servers: " servers " ns: " ns) + (if (or (and servers + (null? servers)) + (not servers) + (and (list? servers) + (< (length servers) (random ns)))) ;; somewhere between 0 and numservers #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res