Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -12,11 +12,11 @@ ;; C L I E N T S ;;====================================================================== (require-extension (srfi 18) extras tcp s11n) -(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) +(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest matchable) ;; (use zmq) (use (prefix sqlite3 sqlite3:)) (use spiffy uri-common intarweb http-client spiffy-request-vars uri-common intarweb directory-utils) @@ -48,102 +48,16 @@ ((rpc) (rpc:client-connect iface port)) ((http) (http:client-connect iface port)) ((zmq) (zmq:client-connect iface port)) (else (rpc:client-connect iface port)))) -(define (client:setup run-id #!key (remaining-tries 10) (failed-connects 0)) +(define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0)) (case (server:get-transport) ((rpc) (rpc-transport:client-setup run-id remaining-tries: remaining-tries failed-connects: failed-connects)) ;;(client:setup-rpc run-id)) - ((http)(client:setup-http run-id remaining-tries: remaining-tries failed-connects: failed-connects)) + ((http)(client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects)) (else (rpc-transport:client-setup run-id remaining-tries: remaining-tries failed-connects: failed-connects)))) ;; (client:setup-rpc run-id)))) -;; (define (client:login-no-auto-setup server-info run-id) -;; (case (server:get-transport) -;; ((rpc) (rpc:login-no-auto-client-setup server-info run-id)) -;; ((http) (rmt:login-no-auto-client-setup server-info run-id)) -;; (else (rpc:login-no-auto-client-setup server-info run-id)))) -;; -;; (define (client:setup-rpc run-id) -;; (debug:print 0 *default-log-port* "INFO: client:setup remaining-tries=" remaining-tries) -;; (if (<= remaining-tries 0) -;; (begin -;; (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id) -;; (exit 1)) -;; (let ((host-info (hash-table-ref/default *runremote* run-id #f))) -;; (debug:print-info 0 *default-log-port* "client:setup host-info=" host-info ", remaining-tries=" remaining-tries) -;; (if host-info -;; (let* ((iface (car host-info)) -;; (port (cadr host-info)) -;; (start-res (client:connect iface port)) -;; ;; (ping-res (server:ping-server run-id iface port)) -;; (ping-res (client:login-no-auto-setup start-res run-id))) -;; (if ping-res ;; sucessful login? -;; (begin -;; (hash-table-set! *runremote* run-id start-res) -;; start-res) ;; return the server info -;; (if (member remaining-tries '(3 4 6)) -;; (begin ;; login failed -;; (debug:print 25 *default-log-port* "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" host-info) -;; (hash-table-delete! *runremote* run-id) -;; (open-run-close tasks:server-force-clean-run-record -;; tasks:open-db -;; run-id -;; (car host-info) -;; (cadr host-info) -;; " client:setup (host-info=#t)") -;; (thread-sleep! 5) -;; (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1))) -;; (begin -;; (debug:print 25 *default-log-port* "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info) -;; (thread-sleep! 5) -;; (client:setup run-id remaining-tries: (- remaining-tries 1)))))) -;; ;; YUK: rename server-dat here -;; (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id))) -;; (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) -;; (if server-dat -;; (let* ((iface (tasks:hostinfo-get-interface server-dat)) -;; (port (tasks:hostinfo-get-port server-dat)) -;; (start-res (http-transport:client-connect iface port)) -;; ;; (ping-res (server:ping-server run-id iface port)) -;; (ping-res (rmt:login-no-auto-client-setup start-res run-id))) -;; (if start-res -;; (begin -;; (hash-table-set! *runremote* run-id start-res) -;; start-res) -;; (if (member remaining-tries '(2 5)) -;; (begin ;; login failed -;; (debug:print 25 *default-log-port* "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) -;; (hash-table-delete! *runremote* run-id) -;; (open-run-close tasks:server-force-clean-run-record -;; tasks:open-db -;; run-id -;; (tasks:hostinfo-get-interface server-dat) -;; (tasks:hostinfo-get-port server-dat) -;; " client:setup (server-dat = #t)") -;; (thread-sleep! 2) -;; (server:try-running run-id) -;; (thread-sleep! 10) ;; give server a little time to start up -;; (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1))) -;; (begin -;; (debug:print 25 *default-log-port* "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) -;; (thread-sleep! 5) -;; (client:setup run-id remaining-tries: (- remaining-tries 1)))))) -;; (begin ;; no server registered -;; (if (eq? remaining-tries 2) -;; (begin -;; ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)") -;; (client:setup run-id remaining-tries: 10)) -;; (begin -;; (thread-sleep! 2) -;; (debug:print 25 *default-log-port* "INFO: client:setup start-res (not defined here), run-id=" run-id ", server-dat=" server-dat) -;; (if (< (open-run-close tasks:num-in-available-state tasks:open-db run-id) 3) -;; (begin -;; ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)") -;; (server:try-running run-id))) -;; (thread-sleep! 10) ;; give server a little time to start up -;; (client:setup run-id remaining-tries: (- remaining-tries 1))))))))))) - ;; Do all the connection work, look up the transport type and set up the ;; connection if required. ;; ;; There are two scenarios. ;; 1. We are a test manager and we received *transport-type* and *runremote* via cmdline @@ -152,100 +66,50 @@ ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; -(define (client:setup-http run-id #!key (remaining-tries 10) (failed-connects 0)) + +(define (client:setup-http areapath #!key (remaining-tries 100) (failed-connects 0)) (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) - (let* ((tdbdat (tasks:open-db))) - (if (<= remaining-tries 0) - (begin - (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id) - (exit 1)) - (let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) - (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if server-dat - (let* ((iface (tasks:hostinfo-get-interface server-dat)) - (hostname (tasks:hostinfo-get-hostname server-dat)) - (port (tasks:hostinfo-get-port server-dat)) - (start-res (case *transport-type* - ((http)(http-transport:client-connect iface port)) - ;;((nmsg)(nmsg-transport:client-connect hostname port)) - )) - (ping-res (case *transport-type* - ((http)(rmt:login-no-auto-client-setup start-res)) - ;; ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id))) - ;; (if logininfo - ;; (car (vector-ref logininfo 1)) - ;; #f))) - - ))) - (if (and start-res - ping-res) - (begin - (remote-conndat-set! *runremote* start-res) ;; (hash-table-set! *runremote* run-id start-res) - (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) - start-res) - (begin ;; login failed but have a server record, clean out the record and try again - (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) - (case *transport-type* - ((http)(http-transport:close-connections run-id))) - (remote-conndat-set! *runremote* #f) ;; (hash-table-delete! *runremote* run-id) - (tasks:kill-server-run-id run-id) - (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) - run-id - (tasks:hostinfo-get-interface server-dat) - (tasks:hostinfo-get-port server-dat) - " client:setup (server-dat = #t)") - (if (> remaining-tries 8) - (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little - (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time - (server:try-running *toppath*) - (thread-sleep! 5) ;; give server a little time to start up - (client:setup run-id remaining-tries: (- remaining-tries 1)) - ))) - (begin ;; no server registered - (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id))) - (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) - (if (< num-available 2) - (server:try-running *toppath*)) - (thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. - (client:setup run-id remaining-tries: (- remaining-tries 1))))))))) - -;; keep this as a function to ease future -(define (client:start run-id server-info) - (http-transport:client-connect (tasks:hostinfo-get-interface server-info) - (tasks:hostinfo-get-port server-info))) - -;; ;; client:signal-handler -;; (define (client:signal-handler signum) -;; (signal-mask! signum) -;; (set! *time-to-exit* #t) -;; (handle-exceptions -;; exn -;; (debug:print 0 *default-log-port* " ... exiting ...") -;; (let ((th1 (make-thread (lambda () -;; "") ;; do nothing for now (was flush out last call if applicable) -;; "eat response")) -;; (th2 (make-thread (lambda () -;; (debug:print-error 0 *default-log-port* "Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.") -;; (thread-sleep! 1) ;; give the flush one second to do it's stuff -;; (debug:print 0 *default-log-port* " Done.") -;; (exit 4)) -;; "exit on ^C timer"))) -;; (thread-start! th2) -;; (thread-start! th1) -;; (thread-join! th2)))) -;; -;; ;; client:launch -;; ;; Need to set the signal handler somewhere other than here as this -;; ;; routine will go away. -;; ;; -;; (define (client:launch run-id) -;; (set-signal-handler! signal/int client:signal-handler) -;; (set-signal-handler! signal/term client:signal-handler) -;; (if (client:setup run-id) -;; (debug:print-info 2 *default-log-port* "connected as client") -;; (begin -;; (debug:print-error 0 *default-log-port* "Failed to connect as client") -;; (exit)))) -;; + (server:start-and-wait areapath) + (if (<= remaining-tries 0) + (begin + (debug:print-error 0 *default-log-port* "failed to start or connect to server") + (exit 1)) + ;; + ;; Alternatively here, we can get the list of candidate servers and work our way + ;; through them searching for a good one. + ;; + (let* ((server-dat (server:get-first-best areapath))) + (if (not server-dat) ;; no server found + (client:setup-http areapath remaining-tries: (- remaining-tries 1)) + (let ((host (cadr server-dat)) + (port (caddr server-dat))) + (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) + (if (not *runremote*)(set! *runremote* (make-remote))) + (if (and host port) + (let* ((start-res (case *transport-type* + ((http)(http-transport:client-connect host port)))) + (ping-res (case *transport-type* + ((http)(rmt:login-no-auto-client-setup start-res))))) + (if (and start-res + ping-res) + (begin + (remote-conndat-set! *runremote* start-res) ;; (hash-table-set! *runremote* run-id start-res) + (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) + start-res) + (begin ;; login failed but have a server record, clean out the record and try again + (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) + (case *transport-type* + ((http)(http-transport:close-connections run-id))) + (remote-conndat-set! *runremote* #f) ;; (hash-table-delete! *runremote* run-id) + (thread-sleep! 1) + (client:setup-http areapath remaining-tries: (- remaining-tries 1)) + ))) + (begin ;; no server registered + (server:kind-run areapath) + (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) + (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. + (server:start-and-wait areapath) + (client:setup-http areapath remaining-tries: (- remaining-tries 1))))))))) + Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -662,11 +662,11 @@ (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum)))) (set! colnum (+ 1 colnum)))) vals) (set! rownum (+ rownum 1))) (iup:attribute-set! servers-matrix "REDRAW" "ALL"))) - (sort servers (lambda (a b)(< (car a)(car b)))))))))) + (sort servers (lambda (a b)(> (car a)(car b)))))))))) (set! colnum 0) (for-each (lambda (colname) (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum)) (set! colnum (+ colnum 1))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -36,12 +36,12 @@ (define (rmt:get-connection-info areapath) ;; TODO: push areapath down. (let ((cinfo (remote-conndat *runremote*)) (run-id 0)) (if cinfo cinfo - (if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id) - (client:setup run-id) + (if (server:check-if-running areapath) + (client:setup areapath) #f)))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) @@ -93,11 +93,11 @@ ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*) ;; have a server (not (server:check-if-running *toppath*))) ;; server has died. - (set! *runremote* #f) + (set! *runremote* (make-remote)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a write, we already have a server @@ -123,11 +123,11 @@ ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost (not (remote-conndat *runremote*))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) - (tasks:start-and-wait-for-server (tasks:open-db) 0 15) + (server:start-and-wait *toppath*) (remote-conndat-set! *runremote* (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as ;; all set up if get this far, dispatch the query ((cdr (remote-hh-dat *runremote*)) ;; we are on homehost (mutex-unlock! *rmt-mutex*) @@ -166,11 +166,11 @@ (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! *runremote* #f) (remote-server-url-set! *runremote* #f) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") (mutex-unlock! *rmt-mutex*) - (tasks:start-and-wait-for-server (tasks:open-db) 0 15) + (server:start-and-wait *toppath*) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) ;; (define (rmt:update-db-stats run-id rawcmd params duration) ;; (mutex-lock! *db-stats-mutex*) ;; (handle-exceptions Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -86,14 +86,10 @@ ;; (send-message pubsock target send-more: #t) ;; (send-message pubsock (case (server:get-transport) ((rpc) (db:obj->string (vector success/fail query-sig result))) ((http) (db:obj->string (vector success/fail query-sig result))) - ((zmq) - (let ((pub-socket (vector-ref *runremote* 1))) - (send-message pub-socket return-addr send-more: #t) - (send-message pub-socket (db:obj->string (vector success/fail query-sig result))))) ((fs) result) (else (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) result))) @@ -132,11 +128,11 @@ (not (equal? curr-ip target-host))) (begin (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) - (setenv "TARGETHOST_LOGF" "server.log") ;; logfile) + (setenv "TARGETHOST_LOGF" logfile) (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) @@ -220,10 +216,17 @@ ))) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) + +(define (server:get-first-best areapath) + (let ((srvrs (server:get-best (server:get-list areapath)))) + (if (and srvrs + (not (null? srvrs))) + (car srvrs) + #f))) (define (server:record->url servr) (match-let (((mod-time host port start-time pid) servr)) (if (and host port) @@ -243,10 +246,21 @@ (if (or (not last-run-time) (> (- (current-seconds) last-run-time) 30)) (begin (server:run areapath) (hash-table-set! *server-kind-run* areapath (current-seconds)))))) + +(define (server:start-and-wait areapath #!key (timeout 60)) + (let ((give-up-time (+ (current-seconds) timeout))) + (let loop ((server-url (server:check-if-running areapath))) + (if (or server-url + (> (current-seconds) give-up-time)) + server-url + (begin + (server:kind-run areapath) + (thread-sleep! 5) + (loop (server:check-if-running areapath))))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. (define (server:dotserver-age-seconds areapath) (let ((server-file (conc areapath "/.server"))) @@ -265,17 +279,17 @@ (dotserver-url (if best-server (server:record->url best-server) #f))) ;; (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db))) (if dotserver-url (let* ((res (case *transport-type* - ((http)(server:ping-server dotserver-url)) + ((http)(server:ping dotserver-url)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res dotserver-url (begin - (server:kill best-server) + ;; (server:kill best-server) #f))) #f))) (define (server:kill servr) (match-let (((mod-time hostname port start-time pid) @@ -287,25 +301,25 @@ ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; (define (server:ping host-port-in #!key (do-exit #f)) (let ((host:port (if (not host-port-in) ;; use read-dotserver to find - (server:check-if-running *toppath*) - (if (number? host-port-in) ;; we were handed a server-id - (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) - ;; (print "srec: " srec " host-port-in: " host-port-in) - (if srec - (conc (vector-ref srec 3) ":" (vector-ref srec 4)) - (conc "no such server-id " host-port-in))) - host-port-in)))) + #f ;; (server:check-if-running *toppath*) + ;; (if (number? host-port-in) ;; we were handed a server-id + ;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) + ;; ;; (print "srec: " srec " host-port-in: " host-port-in) + ;; (if srec + ;; (conc (vector-ref srec 3) ":" (vector-ref srec 4)) + ;; (conc "no such server-id " host-port-in))) + host-port-in))) ;; ) (let* ((host-port (if host:port (let ((slst (string-split host:port ":"))) (if (eq? (length slst) 2) (list (car slst)(string->number (cadr slst))) #f)) - #f)) - (toppath (launch:setup))) + #f))) +;; (toppath (launch:setup))) ;; (print "host-port=" host-port) (if (not host-port) (begin (if host-port-in (debug:print 0 *default-log-port* "ERROR: bad host:port")) @@ -316,15 +330,17 @@ (server-dat (http-transport:client-connect iface port)) (login-res (rmt:login-no-auto-client-setup server-dat))) (if (and (list? login-res) (car login-res)) (begin - (print "LOGIN_OK") - (if do-exit (exit 0))) + ;; (print "LOGIN_OK") + (if do-exit (exit 0)) + #t) (begin - (print "LOGIN_FAILED") - (if do-exit (exit 1))))))))) + ;; (print "LOGIN_FAILED") + (if do-exit (exit 1)) + #f))))))) ;; run ping in separate process, safest way in some cases ;; (define (server:ping-server ifaceport) (with-input-from-pipe