Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -126,11 +126,11 @@ (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) ((> *api-process-request-count* 20) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") - (vector #f (vector 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! + (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else (let* ((cmd-in (vector-ref dat 0)) (cmd (if (symbol? cmd-in) cmd-in (string->symbol cmd-in))) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -225,19 +225,21 @@ ;; process and return it. (let* ((send-recieve (lambda () (mutex-lock! *http-mutex*) ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines) ;; ((exn http client-error) e (print e))) - (set! res (vector + (set! res (vector ;;; DON'T FORGET - THIS IS THE CLIENT SIDE! NOTE: consider moving this to client.scm since we are only supporting http transport at this time. success (db:string->obj (handle-exceptions exn - (begin + (let ((call-chain (get-call-chain)) + (msg ((condition-property-accessor 'exn 'message) exn))) (set! success #f) (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* " message: " msg) + (debug:print 0 *default-log-port* " cmd: " cmd " params: " params) (if runremote (remote-conndat-set! runremote #f)) ;; Killing associated server to allow clean retry.") ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? (mutex-unlock! *http-mutex*) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -796,11 +796,11 @@ (server:kill server))))) (sort servers (lambda (a b) (let ((ma (or (any->number (car a)) 9e9)) (mb (or (any->number (car b)) 9e9))) (> ma mb))))) - (debug:print-info 1 *default-log-port* "Done with listservers") + ;; (debug:print-info 1 *default-log-port* "Done with listservers") (set! *didsomething* #t) (exit)) (exit)))) ;; must do, would have to add checks to many/all calls below Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -196,14 +196,18 @@ (if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) (mutex-unlock! *rmt-mutex*) (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end - (if (and (symbol? res) - (eq? res 'overloaded)) + (if (and (vector? res) + (eq? (vector-length res) 2) + (eq? (vector-ref res 2) 'overloaded)) ;; since we are looking at the data to carry the error we'll use a fairly obtuse combo to minimise the chances of some sort of collision. (let ((wait-delay (+ attemptnum (* attemptnum 10)))) (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") + (mutex-lock! *rmt-mutex*) + (set! *runremote* #f) ;; force starting over + (mutex-unlock! *rmt-mutex*) (thread-sleep! wait-delay) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) res) ;; All good, return res (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -253,10 +253,14 @@ (thread-start! th1) (thread-join! th2))))) (set-signal-handler! signal/int sighand) (set-signal-handler! signal/term sighand)) + ;; force the starting of a server + (debug:print 0 *default-log-port* "waiting on server...") + (server:start-and-wait *toppath*) + (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process (set! runconf (if (file-exists? runconfigf) (setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target) (begin (debug:print 0 *default-log-port* "WARNING: You do not have a run config file: " runconfigf) @@ -1176,14 +1180,15 @@ newtal: newtal itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) - ;; every couple minutes verify the server is there for this run - ;; (if (and (common:low-noise-print 60 "try start server" run-id) - ;; (tasks:need-server run-id)) - ;; (tasks:start-and-wait-for-server tdbdat run-id 10)) ;; NOTE: delay and wait is done under the hood + + ;; every 15 minutes verify the server is there for this run + (if (and (common:low-noise-print 240 "try start server" run-id) + (not (server:check-if-running *toppath*))) + (server:kind-run *toppath*)) (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -218,31 +218,35 @@ ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; and servers should stick around for about two hours or so. ;; (define (server:get-best srvlst) - (let ((now (current-seconds))) - (sort - (filter (lambda (rec) - (if (and (list? rec) - (> (length rec) 2)) - (let ((start-time (list-ref rec 3)) - (mod-time (list-ref rec 0))) - ;; (print "start-time: " start-time " mod-time: " mod-time) - (and start-time mod-time - (> (- now start-time) 0) ;; been running at least 0 seconds - (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds - (< (- now start-time) - (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) - 180) - (random 360))) ;; under one hour running time +/- 180 - )) - #f)) - srvlst) - (lambda (a b) - (< (list-ref a 3) - (list-ref b 3)))))) + (let* ((nums (server:get-num-servers)) + (now (current-seconds)) + (slst (sort + (filter (lambda (rec) + (if (and (list? rec) + (> (length rec) 2)) + (let ((start-time (list-ref rec 3)) + (mod-time (list-ref rec 0))) + ;; (print "start-time: " start-time " mod-time: " mod-time) + (and start-time mod-time + (> (- now start-time) 0) ;; been running at least 0 seconds + (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds + (< (- now start-time) + (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) + 180) + (random 360))) ;; under one hour running time +/- 180 + )) + #f)) + srvlst) + (lambda (a b) + (< (list-ref a 3) + (list-ref b 3)))))) + (if (> (length slst) nums) + (take slst nums) + slst))) (define (server:get-first-best areapath) (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and srvrs (not (null? srvrs))) @@ -305,16 +309,20 @@ (server:kind-run areapath)) (thread-sleep! 5) (loop (server:check-if-running areapath))))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. + +(define (server:get-num-servers #!key (numservers 2)) + (let ((ns (string->number + (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) + (or ns numservers))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; -(define (server:check-if-running areapath #!key (numservers "2")) - (let* ((ns (string->number - (or (configf:lookup *configdat* "server" "numservers") numservers))) +(define (server:check-if-running areapath) ;; #!key (numservers "2")) + (let* ((ns (server:get-num-servers)) (servers (server:get-best (server:get-list areapath)))) ;; (print "servers: " servers " ns: " ns) (if (or (and servers (null? servers)) (not servers)