Index: http-transportmod.scm ================================================================== --- http-transportmod.scm +++ http-transportmod.scm @@ -97,18 +97,23 @@ ;; Configurations for server (tcp-buffer-size 2048) (max-connections 2048) (defstruct servdat - host - port - uuid - dbfile - ) + (host #f) + (port #f) + (uuid #f) + (dbfile #f) + (api-url #f) + (api-uri #f) + (api-req #f)) (define (servdat->url sdat) - (conc (srvdat-host sdat)":"(srvdat-port srvdat))) + (conc (servdat-host sdat)":"(servdat-port sdat))) + +;; this must be set up prior to making calls +(define api-proc (make-parameter conc)) (define (http-transport:make-server-url hostport) (if (not hostport) #f (conc "http://" (car hostport) ":" (cadr hostport)))) @@ -150,11 +155,10 @@ (signal (make-composite-condition (make-property-condition 'server 'message "server error"))))) - ;; http-transport:handle-directory) ;; simple-directory-handler) ;; Setup the web server and a /ctrl interface ;; (vhost-map `(((* any) . ,(lambda (continue) ;; open the db on the first call ;; This is were we set up the database connections @@ -163,11 +167,11 @@ (res #f)) (cond ((equal? (uri-path (request-uri (current-request))) '(/ "api")) (send-response ;; the $ is the request vars proc - body: ((http-get-function 'api:process-request) *dbstruct-db* $) + body: ((api-proc) *dbstruct-db* $) headers: '((content-type text/plain))) (mutex-lock! *heartbeat-mutex*) (set! *db-last-access* (current-seconds)) (mutex-unlock! *heartbeat-mutex*)) ((equal? (uri-path (request-uri (current-request))) @@ -293,16 +297,20 @@ (define (http-transport:inc-requests-and-prep-to-close-all-connections) (mutex-lock! *http-mutex*) (set! *http-requests-in-progress* (+ 1 *http-requests-in-progress*))) +;; Turn off proxy handling +(define (http-transport:client-turn-off-proxy) + (determine-proxy (constantly #f))) ;; From (chicken base) + ;; serverdat contains uuid to be used for connection validation ;; -;; TODO: translate http-transport:client-connect to use servdat +;; NOTE: serverdat must be initialized once by servdat-init ;; -(define (http-transport:basic-send-receive serverdat cmd params #!key (numretries 3)) - (let* ((fullurl (servdat->url serverdat)) +(define (http-transport:send-receive serverdat cmd params #!key (numretries 3)) + (let* ((fullurl (servdat-api-uri serverdat)) ;; gets uri for /api (res #f) (success #t) (server-id (servdat-uuid serverdat))) ;; set up the http-client here (max-retry-attempts 1) @@ -317,11 +325,11 @@ #t ;; success (with-input-from-request fullurl (list (cons 'key server-id) (cons 'cmd cmd) - (cons 'params sparams)) + (cons 'params params)) read))))) (time-out (lambda () (thread-sleep! 45) (debug:print 0 *default-log-port* "WARNING: send-receive took more than 45 seconds!!") #f)) @@ -330,131 +338,11 @@ (thread-start! th1) (thread-start! th2) (thread-join! th1) (close-idle-connections!) (thread-terminate! th2) - (if (vector? res) - (if (vector-ref res 0) ;; this is the first flag or the second flag? - (let* ((res-dat (vector-ref res 1))) - (if (and (string? res-dat) (string-contains res-dat "server-id mismatch")) - (signal (make-composite-condition - (make-property-condition - 'servermismatch - 'message (vector-ref res 1)))) - res)) ;; this is the *inner* vector? seriously? why? - (if (debug:debug-mode 11) - (let ((call-chain (get-call-chain))) ;; note: this code also called in nmsg-transport - consider consolidating it - (print-call-chain (current-error-port)) - (debug:print-error 11 *default-log-port* "error above occured at server, res=" res) ;; " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 11 *default-log-port* " server call chain:") - (pp (vector-ref res 1) (current-error-port)) - (signal (vector-ref res 0))) - res)) - (signal (make-composite-condition - (make-property-condition - 'timeout - 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server"))))))) - -;; Send "cmd" with json payload "params" to serverdat and receive result -;; -(define (http-transport:client-api-send-receive run-id serverdat cmd params #!key (numretries 3)(area-dat #f)) - (let* ((fullurl (if (vector? serverdat) - (http-transport:server-dat-get-api-req serverdat) - (begin - (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info") - (exit 1)))) - (res (vector #f "uninitialized")) - (success #t) - (sparams (db:obj->string params transport: 'http)) - (runremote (or area-dat *runremote*)) - (server-id (if (vector? serverdat) - (http-transport:server-dat-get-server-id serverdat) - (begin - (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info") - (exit 1))))) - (debug:print-info 11 *default-log-port* "cmd=" cmd " fullurl=" fullurl " server-id=" server-id " current time:" (current-seconds)) - - ;; set up the http-client here - (max-retry-attempts 1) - ;; consider all requests indempotent - (retry-request? (lambda (request) - #f)) - ;; send the data and get the response - ;; extract the needed info from the http data and - ;; process and return it. - (let* ((send-recieve (lambda () - (mutex-lock! *http-mutex*) - ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines) - ;; ((exn http client-error) e (print e))) - (set! res (vector ;;; DON'T FORGET - THIS IS THE CLIENT SIDE! NOTE: consider moving this to client.scm since we are only supporting http transport at this time. - success - (db:string->obj - (handle-exceptions - exn - (let ((call-chain (get-call-chain)) - (msg ((condition-property-accessor 'exn 'message) exn))) - (set! success #f) - (if (debug:debug-mode 1) - (debug:print-info 0 *default-log-port* "couldn't talk to server, trying again ...") - (begin - (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") - (debug:print 0 *default-log-port* " message: " msg ", exn=" exn) - (debug:print 0 *default-log-port* " cmd: " cmd " params: " params " key:" (or server-id "thekey")) - (debug:print 0 *default-log-port* " call-chain: " call-chain))) - (if runremote - (remote-conndat-set! runremote #f)) - ;; Killing associated server to allow clean retry.") - ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? - (mutex-unlock! *http-mutex*) - ;;; (signal (make-composite-condition - ;;; (make-property-condition 'commfail 'message "failed to connect to server"))) - ;;; "communications failed" - (db:obj->string #f)) - (with-input-from-request ;; was dat - fullurl - (list (cons 'key (or server-id "thekey")) - (cons 'cmd cmd) - (cons 'params sparams)) - read-string)) - transport: 'http) - 0)) ;; added this speculatively - ;; Shouldn't this be a call to the managed call-all-connections stuff above? - (close-idle-connections!) - (mutex-unlock! *http-mutex*) - )) - (time-out (lambda () - (thread-sleep! 45) - (debug:print 0 *default-log-port* "WARNING: send-receive took more than 45 seconds!!") - #f)) - (th1 (make-thread send-recieve "with-input-from-request")) - (th2 (make-thread time-out "time out"))) - (thread-start! th1) - (thread-start! th2) - (thread-join! th1) - (vector-set! res 0 success) - (thread-terminate! th2) - (if (vector? res) - (if (vector-ref res 0) ;; this is the first flag or the second flag? - (let* ((res-dat (vector-ref res 1))) - (if (and (string? res-dat) (string-contains res-dat "server-id mismatch")) - (signal (make-composite-condition - (make-property-condition - 'servermismatch - 'message (vector-ref res 1)))) - res)) ;; this is the *inner* vector? seriously? why? - (if (debug:debug-mode 11) - (let ((call-chain (get-call-chain))) ;; note: this code also called in nmsg-transport - consider consolidating it - (print-call-chain (current-error-port)) - (debug:print-error 11 *default-log-port* "error above occured at server, res=" res) ;; " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 11 *default-log-port* " server call chain:") - (pp (vector-ref res 1) (current-error-port)) - (signal (vector-ref res 0))) - res)) - (signal (make-composite-condition - (make-property-condition - 'timeout - 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server"))))))) + res))) ;; careful closing of connections stored in *runremote* ;; (define (http-transport:close-connections #!key (area-dat #f)) (let* ((runremote (or area-dat *runremote*)) @@ -507,14 +395,35 @@ (let* ((api-url (conc "http://" iface ":" port "/api")) (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) (api-req (make-request method: 'POST uri: api-uri)) (server-dat (vector iface port api-uri api-url api-req (current-seconds) server-id))) server-dat)) + +;; initialize servdat for client side +(define (servdat-init sdat-in iface port uuid) + (let* ((sdat (or sdat-in (make-servdat)))) + (if uuid (servdat-uuid-set! sdat uuid)) + (servdat-host-set! sdat iface) + (servdat-port-set! sdat port) + (servdat-api-url-set! sdat (conc "http://" iface ":" port "/api")) + (servdat-api-uri-set! sdat (uri-reference (servdat->url sdat))) + (servdat-api-req-set! sdat (make-request method: 'POST + uri: (servdat-api-uri sdat))) + sdat)) ;;====================================================================== ;; NEW SERVER METHOD ;;====================================================================== + +;; only use for main.db - need to re-write some of this :( +;; +(define (get-lock-db sdat dbfile) + (let* ((dbh (db:open-run-db dbfile db:initialize-db)) + (res (db:get-iam-server-lock dbh dbfile))) + (sqlite3:finalize! dbh) + res)) + (define *srvpktspec* `((server (host . h) (port . p) (servkey . k) @@ -629,10 +538,74 @@ ;;====================================================================== ;; END NEW SERVER METHOD ;;====================================================================== + +(define (http-transport:wait-for-server pkts-dir db-file server-key) + (let* ((sdat *server-info*)) + (let loop ((start-time (current-seconds)) + (changed #t) + (last-sdat "not this")) + (begin ;; let ((sdat #f)) + (thread-sleep! 0.01) + (debug:print-info 0 *default-log-port* "Waiting for server alive signature") + (mutex-lock! *heartbeat-mutex*) + (set! sdat *server-info*) + (mutex-unlock! *heartbeat-mutex*) + (if (and sdat + (not changed) + (> (- (current-seconds) start-time) 2)) + (begin + (debug:print-info 0 *default-log-port* "Received server alive signature, now attempting to lock in server") + ;; create a server pkt in *toppath*/.meta/srvpkts + + ;; TODO: + ;; 1. change sdat to stuct + ;; 2. add uuid to struct + ;; 3. update uuid in sdat here + ;; + (servdat-uuid-set! sdat + (register-server + pkts-dir *srvpktspec* + (get-host-name) + (servdat-port sdat) server-key + (servdat-host sdat) db-file)) + + ;; now read pkts and see if we are a contender + (let* ((all-pkts (get-all-server-pkts pkts-dir *srvpktspec*)) + (viables (get-viable-servers all-pkts db-file)) + (best-srv (get-best-candidate viables db-file)) + (best-srv-key (if best-srv (alist-ref 'servkey best-srv) #f))) + (debug:print 0 *default-log-port* "best-srv-key: "best-srv-key", server-key: "server-key) + ;; am I the best-srv, compare server-keys to know + (if (equal? best-srv-key server-key) + (if (get-lock-db sdat db-file) ;; (db:get-iam-server-lock *dbstruct-db* *toppath* run-id) + (debug:print 0 *default-log-port* "I'm the server!") + (servdat-dbfile-set! sdat db-file)) + (begin + (debug:print 0 *default-log-port* "I'm not the server, exiting.") + (bdat-time-to-exit-set! *bdat* #t) + (thread-sleep! 0.2) + (exit))) + (begin + (debug:print 0 *default-log-port* + "Keys do not match "best-srv-key", "server-key", exiting.") + (bdat-time-to-exit-set! *bdat* #t) + (thread-sleep! 0.2) + (exit))) + sdat)) + (begin + (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) + (sleep 4) + (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes + (begin + (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") + (exit)) + (loop start-time + (equal? sdat last-sdat) + sdat))))))) ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; (define (http-transport:keep-running dbname) @@ -639,11 +612,11 @@ ;; if none running or if > 20 seconds since ;; server last used then start shutdown ;; This thread waits for the server to come alive (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server") (let* ((run-id (let ((rid (args:get-arg "-run-id"))) ;; consider getting rid of the -run-id mechanism - (if rid + (if rid ;; replace with -db (string->number rid) #f))) (db-file (if dbname (db:dbname->path *toppath* dbname) (db:run-id->path *toppath* run-id))) @@ -650,79 +623,11 @@ (sdat #f) ;; (tmp-area (common:get-db-tmp-area)) (server-start-time (current-seconds)) (pkts-dir (get-pkts-dir)) (server-key (server:mk-signature)) - (server-info (let loop ((start-time (current-seconds)) - (changed #t) - (last-sdat "not this")) - (begin ;; let ((sdat #f)) - (thread-sleep! 0.01) - (debug:print-info 0 *default-log-port* "Waiting for server alive signature") - (mutex-lock! *heartbeat-mutex*) - (set! sdat *server-info*) - (mutex-unlock! *heartbeat-mutex*) - (if (and sdat - (not changed) - (> (- (current-seconds) start-time) 2)) - (begin - (debug:print-info 0 *default-log-port* "Received server alive signature, now attempting to lock in server") - ;; create a server pkt in *toppath*/.meta/srvpkts - - ;; TODO: - ;; 1. change sdat to stuct - ;; 2. add uuid to struct - ;; 3. update uuid in sdat here - ;; - (servdat-uuid-set! sdat - (register-server - pkts-dir *srvpktspec* - (get-host-name) - (servdat-port sdat) server-key - (servdat-host sdat) db-file)) - - ;; now read pkts and see if we are a contender - (let* ((all-pkts (get-all-server-pkts pkts-dir *srvpktspec*)) - (viables (get-viable-servers all-pkts db-file)) - (best-srv (get-best-candidate viables db-file)) - (best-srv-key (if best-srv (alist-ref 'servkey best-srv) #f))) - (debug:print 0 *default-log-port* "best-srv-key: "best-srv-key", server-key: "server-key) - ;; am I the best-srv, compare server-keys to know - (if (equal? best-srv-key server-key) - (if (db:get-main-lock db-file) - (begin - (debug:print 0 *default-log-port* "I'm the server!") - (servdat-dbfile-set! sdat db-file)) - (begin - (debug:print 0 *default-log-port* "I'm not the server, exiting.") - (bdat-time-to-exit-set! *bdat* #t) - (thread-sleep! 0.2) - (exit))) - (begin - (debug:print 0 *default-log-port* - "Keys do not match "best-srv-key", "server-key", exiting.") - (bdat-time-to-exit-set! *bdat* #t) - (thread-sleep! 0.2) - (exit))) - sdat)) - (begin - (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) - (sleep 4) - (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes - (begin - (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") - #;(common:save-pkt `((action . died) - (T . server) - (pid . ,(current-process-id)) - (ipaddr . ,(car sdat)) - (port . ,(cadr sdat)) - (msg . "Transport died?")) - *configdat* #t) - (exit)) - (loop start-time - (equal? sdat last-sdat) - sdat))))))) + (server-info (http-transport:wait-for-server pkts-dir db-file server-key )) (iface (servdat-host server-info)) (port (servdat-port server-info)) (last-access 0) (server-timeout (server:expiration-timeout)) (server-log-file (args:get-arg "-log"))) ;; always set when we are a server @@ -887,29 +792,10 @@ (thread-sleep! 0.252) ;; give the server time to settle before starting the keep-running monitor. (thread-start! th3) (set! *didsomething* #t) (thread-join! th2) (exit)))) - -;; (define (http-transport:server-signal-handler signum) -;; (signal-mask! signum) -;; (handle-exceptions -;; exn -;; (debug:print 0 *default-log-port* " ... exiting ...") -;; (let ((th1 (make-thread (lambda () -;; (thread-sleep! 1)) -;; "eat response")) -;; (th2 (make-thread (lambda () -;; (debug:print-error 0 *default-log-port* "Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.") -;; (thread-sleep! 3) ;; give the flush three seconds to do it's stuff -;; (debug:print 0 *default-log-port* " Done.") -;; (exit 4)) -;; "exit on ^C timer"))) -;; (thread-start! th2) -;; (thread-start! th1) -;; (thread-join! th2)))) - ;; Generate a unique signature for this server (define (server:mk-signature) (message-digest-string (md5-primitive) (with-output-to-string Index: rmtmod.scm ================================================================== --- rmtmod.scm +++ rmtmod.scm @@ -476,11 +476,11 @@ ;; is needed to deal with ;; attemtped ;; communication to ;; servers that have gone ;; away - (http-transport:client-api-send-receive 0 conninfo cmd params) + (http-transport:send-receive 0 conninfo cmd params) ((servermismatch) (vector #f "Server id mismatch" )) ((commfail)(vector #f "communications fail")) ((exn)(vector #f "other fail" (print-call-chain))))) (dat (if (and (vector? dat-in) ;; ... check it is a correct size (> (vector-length dat-in) 1)) @@ -1741,22 +1741,12 @@ (common:set-last-run-version))) ;; This login does no retries under the hood - it acts a bit like a ping. ;; Deprecated for nmsg-transport. ;; -(define (rmt:login-no-auto-client-setup connection-info) - (rmt:send-receive-no-auto-client-setup connection-info 'login 0 (list *toppath* megatest-version *my-client-signature*))) - -(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) - (let* ((run-id (if run-id run-id 0)) - (res (handle-exceptions - exn - #f - (http-transport:client-api-send-receive run-id connection-info cmd params)))) - (if (and res (vector-ref res 0)) - (vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!! - #f))) +;; (define (rmt:login-no-auto-client-setup connection-info) +;; (rmt:send-receive-no-auto-client-setup connection-info 'login 0 (list *toppath* megatest-version *my-client-signature*))) (define (std-exit-procedure) ;;(common:telemetry-log-close) (on-exit (lambda () 0)) ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) @@ -1896,58 +1886,20 @@ ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; -(define (server:ping host-port-in server-id #!key (do-exit #f)) - (let ((host:port (if (not host-port-in) ;; use read-dotserver to find - #f ;; (server:check-if-running *toppath*) - ;; (if (number? host-port-in) ;; we were handed a server-id - ;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) - ;; ;; (print "srec: " srec " host-port-in: " host-port-in) - ;; (if srec - ;; (conc (vector-ref srec 3) ":" (vector-ref srec 4)) - ;; (conc "no such server-id " host-port-in))) - host-port-in))) ;; ) - (let* ((host-port (if host:port - (let ((slst (string-split host:port ":"))) - (if (eq? (length slst) 2) - (list (car slst)(string->number (cadr slst))) - #f)) - #f))) -;; (toppath (launch:setup))) - ;; (print "host-port=" host-port) - (if (not host-port) - (begin - (if host-port-in - (debug:print 0 *default-log-port* "ERROR: bad host:port")) - (if do-exit (exit 1)) - #f) - (let* ((iface (car host-port)) - (port (cadr host-port)) - (server-dat (http-transport:client-connect iface port server-id)) - (login-res (rmt:login-no-auto-client-setup server-dat))) - (if (and (list? login-res) - (car login-res)) - (begin - ;; (print "LOGIN_OK") - (if do-exit (exit 0)) - #t) - (begin - ;; (print "LOGIN_FAILED") - (if do-exit (exit 1)) - #f))))))) +(define (server:ping host port server-id #!key (do-exit #f)) + (let* ((sdat (servdat-init #f host port server-id))) + (http-transport:send-receive sdat 'ping '()))) ;; ping the given server ;; (define (server:check-server server-record) (let* ((server-url (server:record->url server-record)) (server-id (server:record->id server-record)) - (res (case *transport-type* - ((http)(server:ping server-url server-id)) - ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) - ))) + (res (server:ping server-url server-id))) (if res server-url #f))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. @@ -1974,11 +1926,11 @@ ;; run-id to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least 3 seconds old - (server:wait-for-server-start-last-flag areapath) + ;; (server:wait-for-server-start-last-flag areapath) (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun (call-num (car last-run-dat)) (when-run (cadr last-run-dat)) (run-delay (+ (case call-num @@ -2005,11 +1957,12 @@ ((http) (http:client-connect iface port)) ((zmq) (zmq:client-connect iface port)) (else (rpc:client-connect iface port)))) (define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0)) - (client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects) + (print "got here") + ;; (client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects) #;(case (server:get-transport) ((rpc) (rpc-transport:client-setup remaining-tries: remaining-tries failed-connects: failed-connects)) ;;(client:setup-rpc run-id)) ((http)(client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects)) (else (rpc-transport:client-setup remaining-tries: remaining-tries failed-connects: failed-connects)))) ;; (client:setup-rpc run-id)))) @@ -2024,11 +1977,11 @@ ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; -(define (client:setup-http areapath #!key (remaining-tries 100) (failed-connects 0)(area-dat #f)) +#;(define (client:setup-http areapath #!key (remaining-tries 100) (failed-connects 0)(area-dat #f)) (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) (server:start-and-wait areapath) (if (<= remaining-tries 0) (begin (debug:print-error 0 *default-log-port* "failed to start or connect to server") Index: servermod.scm ================================================================== --- servermod.scm +++ servermod.scm @@ -62,79 +62,12 @@ (define (server:make-server-url hostport) (if (not hostport) #f (conc "http://" (car hostport) ":" (cadr hostport)))) -;;====================================================================== -;; logic for getting homehost. Returns (host . at-home) -;; IF *toppath* is not set, wait up to five seconds trying every two seconds -;; (this is to accomodate the watchdog) -;; -#;(define (common:get-homehost #!key (trynum 5)) - (assert *toppath* "ERROR: common:get-homehost called before launch:setup. This is fatal.") - ;; called often especially at start up. use mutex to eliminate collisions - (mutex-lock! *homehost-mutex*) - (cond - (*home-host* - (mutex-unlock! *homehost-mutex*) - *home-host*) - ((not *toppath*) - (mutex-unlock! *homehost-mutex*) - ;; (launch:setup) ;; safely mutexed now - (if (> trynum 0) - (begin - (thread-sleep! 2) - (common:get-homehost trynum: (- trynum 1))) - #f)) - (else - (let* ((currhost (get-host-name)) - (bestadrs (server:get-best-guess-address currhost)) - ;; first look in config, then look in file .homehost, create it if not found - (homehost (or (configf:lookup *configdat* "server" "homehost" ) - (handle-exceptions - exn - (if (> trynum 0) - (let ((delay-time (* (- 5 trynum) 5))) - (mutex-unlock! *homehost-mutex*) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying " - delay-time " seconds and trying again, message: " ((condition-property-accessor 'exn 'message) exn) - ", exn=" exn) - (thread-sleep! delay-time) - (common:get-homehost trynum: (- trynum 1))) - (begin - (mutex-unlock! *homehost-mutex*) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time) - "] Failed to read .homehost file after trying five times. Giving up and exiting, message: " - ((condition-property-accessor 'exn 'message) exn)) - (exit 1))) - (let ((hhf (conc *toppath* "/.homehost"))) - (if (common:file-exists? hhf) - (with-input-from-file hhf read-line) - (if (file-writable? *toppath*) - (begin - (with-output-to-file hhf - (lambda () - (print bestadrs))) - (begin - (mutex-unlock! *homehost-mutex*) - (car (common:get-homehost)))) - #f)))))) - (at-home (or (equal? homehost currhost) - (equal? homehost bestadrs)))) - (set! *home-host* (cons homehost at-home)) - (mutex-unlock! *homehost-mutex*) - *home-host*)))) - -;;====================================================================== -;; am I on the homehost? -;; -#;(define (common:on-homehost?) - (let ((hh (common:get-homehost))) - (if hh - (cdr hh) - #f))) - +;; reuse this for server load? +;; #;(define (common:wait-for-homehost-load maxnormload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (common:get-homehost))) (hh (if hh-dat (car hh-dat) #f))) @@ -146,18 +79,14 @@ ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area (let* ((curr-host (get-host-name)) - ;; (attempt-in-progress (server:start-attempted? areapath)) - ;; (dot-server-url (server:check-if-running areapath)) (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) - ;; (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) - ;; (target-host (car homehost)) (testsuite (common:get-area-name)) - (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) + (logfile (conc areapath "/logs/server.log")) (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") "")) (cmdln (conc (common:get-megatest-exe) " -server " (or (get-host-name) "-") @@ -167,35 +96,17 @@ "") ;; " -log " logfile " -m testsuite:" testsuite " " profile-mode )) - ;; (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") - ;; (thread-start! log-rotate) - - ;; host.domain.tld match host? - #;(if (and target-host - ;; look at target host, is it host.domain.tld or ip address and does it - ;; match current ip or hostname - (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) - (not (equal? curr-ip target-host))) - (begin - (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) - (setenv "TARGETHOST" target-host))) - (setenv "NBFAKE_LOG" logfile) - ;; (thread-sleep! (/ (pseudo-random-integer 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time - ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever - #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit)) (system (conc "nbfake " cmdln)) (unsetenv "NBFAKE_LOG") - ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) - ;; (thread-join! log-rotate) (pop-directory))) (define (server:record->url servr) (handle-exceptions exn @@ -206,38 +117,10 @@ servr)) (if (and host port) (conc host ":" port) #f)))) -;; wait for server=start-last to be three seconds old -;; -(define (server:wait-for-server-start-last-flag areapath) - (let* ((start-flag (conc areapath "/logs/server-start-last")) - ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) - (reftime (configf:lookup-number *configdat* "server" "idletime" default: 4)) - (server-key (conc (get-host-name) "-" (current-process-id)))) - (if (file-exists? start-flag) - (let* ((fmodtime (file-modification-time start-flag)) - (delta (- (current-seconds) fmodtime)) - (all-go (> delta reftime))) - (if (and all-go - (begin - (debug:print-info 0 *default-log-port* "Writing " start-flag) - (with-output-to-file start-flag - (lambda () - (print server-key))) - (thread-sleep! 0.254) - (let ((res (with-input-from-file start-flag - (lambda () - (read-line))))) - (equal? server-key res)))) - #t ;; (system (conc "touch " start-flag)) ;; lazy but safe - (begin - (debug:print-info 0 *default-log-port* "Gating server start, last start: " - fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go) - (thread-sleep! reftime) - (server:wait-for-server-start-last-flag areapath))))))) (define (server:kill servr) (handle-exceptions exn (begin