Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -71,12 +71,12 @@ (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) (if (< num-available 2) (server:try-running run-id)) (thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. (client:setup run-id remaining-tries: (- remaining-tries 1)))))) - ((http) (BB> "have http") (client:setup-http run-id server-dat remaining-tries)) - ((rpc) (BB> "have rpc") (rpc-transport:client-setup run-id server-dat remtries: remaining-tries)) + ((http) (client:setup-http run-id server-dat remaining-tries)) + ((rpc) (rpc-transport:client-setup run-id server-dat remtries: remaining-tries)) (else (debug:print-error 0 *default-log-port* "(6) Transport [" transport "] specified for run-id [" run-id "] is not implemented in client:setup. Cannot proceed.") (exit 1))))) Index: common_records.scm ================================================================== --- common_records.scm +++ common_records.scm @@ -117,15 +117,10 @@ (if (equal? this-func "BB>") (set! location this-loc)))) stack) (let ((dp-args (append (list 0 *default-log-port* location" " ) in-args))) (apply debug:print dp-args)))) - -(define (BB> . in-args) - (apply print "BB> " in-args) - "shouldn't do anything") - (define (debug:print-error n e . params) ;; normal print (if (debug:debug-mode n) (with-output-to-port (or e (current-error-port)) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -339,11 +339,11 @@ ":" (http-transport:server-dat-get-port vec)) #f)) (define (http-transport:server-dat-update-last-access vec) - (BB> "entered http-transport:server-dat-update-last-access vec="vec) + ;;(BB> "entered http-transport:server-dat-update-last-access vec="vec) (if (vector? vec) (vector-set! vec 5 (current-seconds)) (begin (print-call-chain (current-error-port)) (debug:print-error 0 *default-log-port* "call to http-transport:server-dat-update-last-access with non-vector!!")))) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -733,16 +733,17 @@ ;; Start the server - can be done in conjunction with -runall or -runtests (one day...) ;; we start the server if not running else start the client thread ;;====================================================================== (if (args:get-arg "-server") - + ;; Server? Start up here. ;; (let* ((tl (launch:setup)) (run-id (and (args:get-arg "-run-id") (string->number (args:get-arg "-run-id"))))) + (BB> "megatest -server called; starting server") (if run-id (begin (server:launch run-id (->string *transport-type*)) (set! *didsomething* #t)) (debug:print-error 0 *default-log-port* "server requires run-id be specified with -run-id"))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -126,11 +126,11 @@ (exit 1)) (mutex-lock! *rmt:srmutex*) ;; deadlock is here! ;; expire connections - (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin + (let ((expire-time (- (current-seconds) (server:get-timeout) 60))) ;; don't forget the 60 second margin (for-each (lambda (run-id) (let ((connection (rmt:get-cinfo run-id))) (if (and (vector? connection) (< (http-transport:server-dat-get-last-access connection) expire-time)) ;; BB> BBTODO: make this generic, not http transport specific. @@ -140,11 +140,11 @@ (hash-table-keys *runremote*))) (let* ((run-id (if rid rid 0)) (connection-info (rmt:get-connection-info-start-server-if-none run-id))) ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also) - (BB> "in rmt:send-receive; run-id="run-id";;connection-info="connection-info) + ;;(BB> "in rmt:send-receive; run-id="run-id";;connection-info="connection-info) (if connection-info ;; use the server if have connection info (let* ((transport-type (rmt:run-id->transport-type run-id)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -165,11 +165,11 @@ (vector #f (conc "transport ["transport-type"] unimplemented")))))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) - (BB> "in rmt:send-receive; transport-type="transport-type" success="success" connection-info="connection-info" res="res " dat="dat) + ;;(BB> "in rmt:send-receive; transport-type="transport-type" success="success" connection-info="connection-info" res="res " dat="dat) (if (and success (vector? connection-info)) (http-transport:server-dat-update-last-access connection-info)) ;; BB> BBTODO: make this generic, not http transport specific. (if success (begin (mutex-unlock! *rmt:srmutex*) Index: rpc-transport.scm ================================================================== --- rpc-transport.scm +++ rpc-transport.scm @@ -46,11 +46,11 @@ (res (vector-ref resdat 1))) (mutex-lock! *heartbeat-mutex*) (set! *last-db-access* (current-seconds)) ;; bump *last-db-access*; this will renew keep-running thread's lease on life for another (server:get-timeout) seconds - (BB> "in api-exec; last-db-access updated to "*last-db-access*) + ;;(BB> "in api-exec; last-db-access updated to "*last-db-access*) (mutex-unlock! *heartbeat-mutex*) res)) @@ -160,13 +160,13 @@ ;;(portlogger:open-run-close portlogger:set-port (rpc:default-server-port) "released") (set! *time-to-exit* #t) (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) (tasks:bb-server-delete-record server-id " rpc-transport:keep-running complete") - (BB> "Before (exit) (from-on-exit="from-on-exit")") + ;;(BB> "Before (exit) (from-on-exit="from-on-exit")") (unless from-on-exit (exit)) ;; sometimes we hang (around) here with 100% cpu. - (BB> "After") + ;;(BB> "After") ;; strace reveals endless: ;; getrusage(RUSAGE_SELF, {ru_utime={413, 917868}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 9874}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 13874}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 105880}, ru_stime={0, 60003}, ...}) = 0 @@ -255,11 +255,11 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; this client-side procedure makes rpc call to server and returns result ;; (define (rpc-transport:client-api-send-receive run-id serverdat cmd params #!key (numretries 3)) - (BB> "entered rpc-transport:client-api-send-receive with run-id="run-id " serverdat="serverdat" cmd="cmd" params="params" numretries="numretries) + ;;(BB> "entered rpc-transport:client-api-send-receive with run-id="run-id " serverdat="serverdat" cmd="cmd" params="params" numretries="numretries) (if (not (vector? serverdat)) (begin (BB> "WHAT?? for run-id="run-id", serverdat="serverdat) (print-call-chain) (exit 1))) @@ -274,19 +274,19 @@ (condition-case ;;(vector #t (run-remote cmd params)) (vector 'success (api-exec cmd params)) [x (exn i/o net) (vector 'comms-fail (conc "communications fail ["(->string x)"]") x)] [x () (vector 'other-fail "other fail ["(->string x)"]" x)])) - chatty: #t + chatty: #f accept-result?: (lambda(x) (and (vector? x) (vector-ref x 0))) retries: 4 back-off-factor: 1.5 random-wait: 0.2 retry-delay: 0.1 final-failure-returns-actual: #t)) - (BB> "HEY res="res) + ;;(BB> "HEY res="res) res )) (th1 (make-thread send-receive "send-receive")) (time-out-reached #f) (time-out (lambda () @@ -298,11 +298,11 @@ (th2 (make-thread time-out "time out"))) (thread-start! th1) (thread-start! th2) (thread-join! th1) (thread-terminate! th2) - (BB> "alt got res="res) + ;;(BB> "alt got res="res) (debug:print-info 11 *default-log-port* "got res=" res) (if (vector? res) (case (vector-ref res 0) ((success) (vector #t (vector-ref res 1))) ((comms-fail) @@ -398,12 +398,14 @@ (set! db *inmemdb*) (debug:print 0 *default-log-port* "Server started on " host:port) - (thread-sleep! 5) - (if (rpc-transport:self-test run-id ipaddrstr portnum) + ;;(thread-sleep! 5) + + (if (retry-thunk (lambda () + (rpc-transport:self-test run-id ipaddrstr portnum))) (debug:print 0 *default-log-port* "INFO: rpc self test passed!") (begin (debug:print 0 *default-log-port* "Error: rpc listener did not pass self test. Shutting down. On: " host:port) (exit))) @@ -479,11 +481,10 @@ ;; (set! port (cadr sdat)))) ;; Transfer *last-db-access* to last-access to use in checking that we are still alive (mutex-lock! *heartbeat-mutex*) (set! last-access *last-db-access*) - (BB> "in rpc-transport:run ; last-access="last-access) (mutex-unlock! *heartbeat-mutex*) ;; (debug:print 11 *default-log-port* "last-access=" last-access ", server-timeout=" server-timeout) ;; ;; no_traffic, no running tests, if server 0, no running servers @@ -509,11 +510,11 @@ (if (tasks:bb-server-am-i-the-server? run-id) (tasks:bb-server-set-state! server-id "running")) ;; (loop 0 bad-sync-count)) (begin - (BB> "SERVER SHUTDOWN CALLED! last-access="last-access" current-seconds="(current-seconds)" server-timeout="server-timeout) + ;;(BB> "SERVER SHUTDOWN CALLED! last-access="last-access" current-seconds="(current-seconds)" server-timeout="server-timeout) (rpc-transport:server-shutdown server-id rpc:listener))))) ;; end new loop )))) @@ -552,29 +553,29 @@ (login-res ((rpc:procedure 'server:login host port) *toppath*)) (res (and login-res (equal? testing-res "Just testing")))) (if login-res (begin - (BB> "Self test PASS. login-res="login-res" testing-res="testing-res" *toppath*="*toppath*) + ;;(BB> "Self test PASS. login-res="login-res" testing-res="testing-res" *toppath*="*toppath*) #t) (begin - (BB> "Self test fail. login-res="login-res" testing-res="testing-res" *toppath*="*toppath*) + ;;(BB> "Self test fail. login-res="login-res" testing-res="testing-res" *toppath*="*toppath*) #f)) res)) (define (rpc-transport:client-setup run-id server-dat #!key (remtries 10)) - (BB> "entered rpc-transport:client-setup with run-id="run-id" and server-dat="server-dat" and retries="remtries) + ;;(BB> "entered rpc-transport:client-setup with run-id="run-id" and server-dat="server-dat" and retries="remtries) (tcp-buffer-size 0) (debug:print-info 0 *default-log-port* "rpc-transport:client-setup run-id="run-id" server-dat=" server-dat ", remaining-tries=" remtries) (let* ((iface (tasks:hostinfo-get-interface server-dat)) (hostname (tasks:hostinfo-get-hostname server-dat)) (port (tasks:hostinfo-get-port server-dat)) (runremote-server-dat (vector iface port #f #f #f (current-seconds) 'rpc)) ;; http version := (vector iface port api-uri api-url api-req (current-seconds) 'http ) (ping-res (retry-thunk (lambda () ;; make 3 attempts to ping. ((rpc:procedure 'server:login iface port) *toppath*)) - chatty: #t + chatty: #f retries: 3))) ;; we got here from rmt:get-connection-info on the condition that *runremote* has no entry for run-id... (if ping-res (begin (debug:print-info 0 *default-log-port* "rpc-transport:client-setup CONNECTION ESTABLISHED run-id="run-id" server-dat=" server-dat) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -280,9 +280,9 @@ (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (string->number tmo)) (* 60 60 (string->number tmo)) ;; (* 3 24 60 60) ;; default to three days - (* 60 1) ;; default to one minute + (* 60 3) ;; default to three minutes ;; (* 60 60 25) ;; default to 25 hours )))