Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -379,16 +379,20 @@ ;; (define (api:process-request dbstruct $) ;; the $ is the request vars proc (set! *api-process-request-count* (+ *api-process-request-count* 1)) (let* ((cmd ($ 'cmd)) (paramsj ($ 'params)) + (key ($ 'key))) (params (db:string->obj paramsj transport: 'http)) ;; incoming data from the POST (or is it a GET?) - (resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) - (success (vector-ref resdat 0)) - (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) - (if (not success) - (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) + (debug:print 0 *default-log-port* "cmd:" cmd " with params " params "key " key) + (if (equal? key *server-id**) + (let* ((resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) + (success (vector-ref resdat 0)) + (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) + (debug:print 0 *default-log-port* "res:" res) + (if (not success) + (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) (if (> *api-process-request-count* *max-api-process-requests*) (set! *max-api-process-requests* *api-process-request-count*)) (set! *api-process-request-count* (- *api-process-request-count* 1)) ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds ;; (rmt:dat->json-str @@ -396,7 +400,10 @@ ;; (list? res) ;; (number? res) ;; (boolean? res)) ;; res ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) - (db:obj->string res transport: 'http))) + (db:obj->string res transport: 'http)) + (begin + (debug:print 0 *default-log-port* "Login failed server-id mismatch: " server-signature ", " *server-id*) + (db:obj->string (conc "Login failed server-id mismatch: " server-signature ", " *server-id*) transport: 'http))))) Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -88,18 +88,25 @@ (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) (runremote (or area-dat *runremote*))) (if (not server-dat) ;; no server found (client:setup-http areapath remaining-tries: (- remaining-tries 1)) (let ((host (cadr server-dat)) - (port (caddr server-dat))) + (port (caddr server-dat)) + (server-id (caddr (cddr server-dat)))) (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) (if (and (not area-dat) (not *runremote*)) - (set! *runremote* (make-remote))) - (if (and host port) + (begin + (set! *runremote* (make-remote)) + (let* ((server-info (remote-server-info *runremote*))) + (if server-info + (begin + (remote-server-url-set! *runremote* (server:record->url server-info)) + (remote-server-id-set! *runremote* (server:record->id server-info))))))) + (if (and host port server-id) (let* ((start-res (case *transport-type* - ((http)(http-transport:client-connect host port)))) + ((http)(http-transport:client-connect host port server-id)))) (ping-res (case *transport-type* ((http)(rmt:login-no-auto-client-setup start-res))))) (if (and start-res ping-res) (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -296,11 +296,13 @@ (define (common:logpro-exit-code->test-status exit-code) (status-sym->string (common:logpro-exit-code->status-sym exit-code))) (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) - (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) + (server-url #f) ;; (server:check-if-running *toppath*) #f)) + (server-id #f) + (server-info (if *toppath* (server:check-if-running *toppath*))) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (server:expiration-timeout)) (force-server #f) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -4342,18 +4342,19 @@ set-verbosity killserver )) (define (db:login dbstruct calling-path calling-version client-signature) - (cond + (cond ((not (equal? calling-path *toppath*)) (list #f "Login failed due to mismatch paths: " calling-path ", " *toppath*)) ;; ((not (equal? *run-id* run-id)) ;; (list #f "Login failed due to mismatch run-id: " run-id ", " *run-id*)) ((not (equal? megatest-version calling-version)) (list #t (conc "Login warning due to mismatch megatest version: " calling-version ", " megatest-version))) - (else + + (else (hash-table-set! *logged-in-clients* client-signature (current-seconds)) '(#t "successful login")))) (define (db:general-call dbstruct stmtname params) (let ((query (let ((q (alist-ref (if (string? stmtname) Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -713,11 +713,11 @@ ;; colnames) (set! rownum 1) (for-each (lambda (server) (set! colnum 0) - (match-let (((mod-time host port start-time pid) + (match-let (((mod-time host port start-time server-id pid) server)) (let* ((uptime (- (current-seconds) mod-time)) (runtime (if start-time (- mod-time start-time) 0)) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -246,12 +246,19 @@ (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info") (exit 1)))) (res (vector #f "uninitialized")) (success #t) (sparams (db:obj->string params transport: 'http)) - (runremote (or area-dat *runremote*))) - (debug:print-info 11 *default-log-port* "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n") + (runremote (or area-dat *runremote*)) + (server-id (if (vector? serverdat) + (http-transport:server-dat-get-server-id serverdat) + (begin + (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info") + (exit 1))))) + ;(debug:print-info 0 *default-log-port* "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n runremote=" runremote " serverdat=" serverdat " server-id" server-id "current time:" (current-seconds) ) + ;(debug:print-info 0 *default-log-port* "cmd=" cmd " fullurl=" fullurl " server-id=" server-id " current time:" (current-seconds)) + ;; set up the http-client here (max-retry-attempts 1) ;; consider all requests indempotent (retry-request? (lambda (request) #f)) @@ -268,16 +275,16 @@ (handle-exceptions exn (let ((call-chain (get-call-chain)) (msg ((condition-property-accessor 'exn 'message) exn))) (set! success #f) - (if (debug:debug-mode 1) + (if (debug:debug-mode 3) (debug:print-info 0 *default-log-port* "couldn't talk to server, trying again ...") (begin (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") (debug:print 0 *default-log-port* " message: " msg ", exn=" exn) - (debug:print 0 *default-log-port* " cmd: " cmd " params: " params) + (debug:print 0 *default-log-port* " cmd: " cmd " params: " params " key:" (or server-id "thekey")) (debug:print 0 *default-log-port* " call-chain: " call-chain))) (if runremote (remote-conndat-set! runremote #f)) ;; Killing associated server to allow clean retry.") ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? @@ -286,11 +293,11 @@ ;;; (make-property-condition 'commfail 'message "failed to connect to server"))) ;;; "communications failed" (db:obj->string #f)) (with-input-from-request ;; was dat fullurl - (list (cons 'key (or *server-id* "thekey")) + (list (cons 'key (or server-id "thekey")) (cons 'cmd cmd) (cons 'params sparams)) read-string)) transport: 'http) 0)) ;; added this speculatively @@ -305,12 +312,13 @@ (th1 (make-thread send-recieve "with-input-from-request")) (th2 (make-thread time-out "time out"))) (thread-start! th1) (thread-start! th2) (thread-join! th1) + (vector-set! res 0 success) (thread-terminate! th2) - (debug:print-info 11 *default-log-port* "got res=" res) + (debug:print-info 0 *default-log-port* "got res=" res) (if (vector? res) (if (vector-ref res 0) ;; this is the first flag or the second flag? res ;; this is the *inner* vector? seriously? why? (if (debug:debug-mode 11) (let ((call-chain (get-call-chain))) ;; note: this code also called in nmsg-transport - consider consolidating it @@ -350,11 +358,12 @@ (define (http-transport:server-dat-get-port vec) (vector-ref vec 1)) (define (http-transport:server-dat-get-api-uri vec) (vector-ref vec 2)) (define (http-transport:server-dat-get-api-url vec) (vector-ref vec 3)) (define (http-transport:server-dat-get-api-req vec) (vector-ref vec 4)) (define (http-transport:server-dat-get-last-access vec) (vector-ref vec 5)) -(define (http-transport:server-dat-get-socket vec) (vector-ref vec 6)) +;(define (http-transport:server-dat-get-socket vec) (vector-ref vec 6)) +(define (http-transport:server-dat-get-server-id vec) (vector-ref vec 6)) (define (http-transport:server-dat-make-url vec) (if (and (http-transport:server-dat-get-iface vec) (http-transport:server-dat-get-port vec)) (conc "http://" @@ -371,15 +380,15 @@ (debug:print-error 0 *default-log-port* "call to http-transport:server-dat-update-last-access with non-vector!!")))) ;; ;; connect ;; -(define (http-transport:client-connect iface port) +(define (http-transport:client-connect iface port server-id) (let* ((api-url (conc "http://" iface ":" port "/api")) (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) (api-req (make-request method: 'POST uri: api-uri)) - (server-dat (vector iface port api-uri api-url api-req (current-seconds)))) + (server-dat (vector iface port api-uri api-url api-req (current-seconds) server-id))) server-dat)) ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; @@ -475,21 +484,27 @@ (let ((new-iface (car sdat)) (new-port (cadr sdat))) (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info") (set! iface new-iface) (set! port new-port) - (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) + (if (not *server-id*) + (set! *server-id* (server:mk-signature))) + (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) + (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) (flush-output *default-log-port*))) ;; Transfer *db-last-access* to last-access to use in checking that we are still alive (mutex-lock! *heartbeat-mutex*) (set! last-access *db-last-access*) (mutex-unlock! *heartbeat-mutex*) (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) (begin - (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) + (if (not *server-id*) + (set! *server-id* (server:mk-signature))) + (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) + (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -852,11 +852,11 @@ )) (if (args:get-arg "-ping") (let* ((server-id (string->number (args:get-arg "-ping"))) ;; extract run-id (i.e. no ":" (host:port (args:get-arg "-ping"))) - (server:ping (or server-id host:port) do-exit: #t))) + (server:ping (or server-id host:port) #f do-exit: #t))) ;;====================================================================== ;; Capture, save and manipulate environments ;;====================================================================== Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -100,10 +100,15 @@ ;; DOT INIT_RUNREMOTE -> MUTEXLOCK; ;; ensure we have a record for our connection for given area (if (not runremote) ;; can remove this one. should never get here. (begin (set! *runremote* (make-remote)) + (let* ((server-info (remote-server-info *runremote*))) + (if server-info + (begin + (remote-server-url-set! *runremote* (server:record->url server-info)) + (remote-server-id-set! *runremote* (server:record->id server-info))))) (set! runremote *runremote*))) ;; new runremote will come from this on next iteration ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; ;; DOT SET_HOMEHOST -> MUTEXLOCK; @@ -174,12 +179,17 @@ ;;DOT CASE6 -> "rmt:send-receive"; ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat runremote)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url runremote) ;; have a server - (not (server:ping (remote-server-url runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. + (not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. (set! *runremote* (make-remote)) + (let* ((server-info (remote-server-info *runremote*))) + (if server-info + (begin + (remote-server-url-set! *runremote* (server:record->url server-info)) + (remote-server-id-set! *runremote* (server:record->id server-info))))) (remote-force-server-set! runremote (common:force-server?)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") (rmt:send-receive cmd rid params attemptnum: attemptnum)) @@ -202,20 +212,22 @@ ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required (cdr (remote-hh-dat runremote)) ;; have homehost (not (remote-server-url runremote)) ;; no connection yet (not (member cmd api:read-only-queries))) ;; not a read-only query (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") - (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call - (if server-url - (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed + (let ((server-info (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call + (if server-info + (begin + (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed + (remote-server-id-set! runremote (server:record->id server-info))) (if (common:force-server?) (server:start-and-wait *toppath*) - (server:kind-run *toppath*)))) + (server:kind-run *toppath*))) (remote-force-server-set! runremote (common:force-server?)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8.1") - (rmt:open-qry-close-locally cmd 0 params)) + (rmt:open-qry-close-locally cmd 0 params))) ;;DOT CASE9 [label="force server\nnot on homehost"]; ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9}; ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive"; ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -93,10 +93,11 @@ (define (server:mk-signature) (message-digest-string (md5-primitive) (with-output-to-string (lambda () (write (list (current-directory) + (current-process-id) (argv))))))) ;; When using zmq this would send the message back (two step process) ;; with spiffy or rpc this simply returns the return data to be returned ;; @@ -165,16 +166,16 @@ (pop-directory))) ;; given a path to a server log return: host port startseconds ;; (define (server:logf-get-start-info logf) - (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs + (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)"))) ;; SERVER STARTED: host:port AT timesecs (handle-exceptions exn (begin (print "failed to get server info from " logf ", exn=" exn) - (list #f #f #f)) ;; no idea what went wrong, call it a bad server + (list #f #f #f #f)) ;; no idea what went wrong, call it a bad server (with-input-from-file logf (lambda () (let loop ((inl (read-line)) (lnum 0)) @@ -181,16 +182,17 @@ (if (not (eof-object? inl)) (let ((mlst (string-match rx inl))) (if (not mlst) (if (< lnum 500) ;; give up if more than 500 lines of server log read (loop (read-line)(+ lnum 1)) - (list #f #f #f)) + (list #f #f #f #f)) (let ((dat (cdr mlst))) (list (car dat) ;; host (string->number (cadr dat)) ;; port - (string->number (caddr dat)))))) - (list #f #f #f)))))))) + (string->number (caddr dat)) + (cadr (cddr dat)))))) + (list #f #f #f #f)))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time pid ) ;; (define (server:get-list areapath #!key (limit #f)) @@ -217,11 +219,11 @@ (tal (cdr server-logs)) (res '())) (let* ((mod-time (handle-exceptions exn (begin - (print "failed to get modification time on " hed ", exn=" exn) + (debug:print 0 *default-log-port* "failed to get modification time on " hed ", exn=" exn) (current-seconds)) ;; 0 (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted (down-time (- (current-seconds) mod-time)) (serv-dat (if (or (< num-serv-logs 10) (< down-time 900)) ;; day-seconds)) @@ -242,11 +244,11 @@ (define (server:get-num-alive srvlst) (let ((num-alive 0)) (for-each (lambda (server) - (match-let (((mod-time host port start-time pid) + (match-let (((mod-time host port start-time server-id pid) server)) (let* ((uptime (- (current-seconds) mod-time)) (runtime (if start-time (- mod-time start-time) 0))) @@ -305,13 +307,19 @@ (let* ((len (length srvrs)) (idx (random len))) (list-ref srvrs idx)) #f))) +(define (server:record->id servr) + (match-let (((mod-time host port start-time server-id pid) + servr)) + (if server-id + server-id + #f))) (define (server:record->url servr) - (match-let (((mod-time host port start-time pid) + (match-let (((mod-time host port start-time server-id pid) servr)) (if (and host port) (conc host ":" port) #f))) @@ -378,15 +386,15 @@ ;; this one seems to be the general entry point ;; (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) - (let loop ((server-url (server:check-if-running areapath)) + (let loop ((server-info (server:check-if-running areapath)) (try-num 0)) - (if (or server-url + (if (or server-info (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. - server-url + (server:record->url server-info) (let ((num-ok (length (server:get-best (server:get-list areapath))))) (if (and (> try-num 0) ;; first time through simply wait a little while then try again (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one (server:kind-run areapath)) (thread-sleep! 5) @@ -403,11 +411,11 @@ ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) (let* ((ns (server:get-num-servers)) (servers (server:get-best (server:get-list areapath)))) - ;; (print "servers: " servers " ns: " ns) + (print "in server:check-if-running servers: " servers " ns: " ns) (if (or (and servers (null? servers)) (not servers) (and (list? servers) (< (length servers) (random ns)))) ;; somewhere between 0 and numservers @@ -414,21 +422,22 @@ #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res - res + hed (if (null? tal) #f (loop (car tal)(cdr tal))))))))) ;; ping the given server ;; (define (server:check-server server-record) (let* ((server-url (server:record->url server-record)) + (server-id (server:record->id server-record)) (res (case *transport-type* - ((http)(server:ping server-url)) + ((http)(server:ping server-url server-id)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res server-url #f))) @@ -441,11 +450,11 @@ ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; -(define (server:ping host-port-in #!key (do-exit #f)) +(define (server:ping host-port-in server-id #!key (do-exit #f)) (let ((host:port (if (not host-port-in) ;; use read-dotserver to find #f ;; (server:check-if-running *toppath*) ;; (if (number? host-port-in) ;; we were handed a server-id ;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) ;; ;; (print "srec: " srec " host-port-in: " host-port-in) @@ -467,11 +476,11 @@ (debug:print 0 *default-log-port* "ERROR: bad host:port")) (if do-exit (exit 1)) #f) (let* ((iface (car host-port)) (port (cadr host-port)) - (server-dat (http-transport:client-connect iface port)) + (server-dat (http-transport:client-connect iface port server-id)) (login-res (rmt:login-no-auto-client-setup server-dat))) (if (and (list? login-res) (car login-res)) (begin ;; (print "LOGIN_OK")