Changes In Branch rpc-transport Through [1387b44afb] Excluding Merge-Ins
This is equivalent to a diff from cfb9ac119d to 1387b44afb
2016-11-01
| ||
02:46 | picked nit check-in: e9f1cb9e4b user: bjbarcla tags: rpc-transport | |
02:45 | tested; found bugs; fixed bugs check-in: 1387b44afb user: bjbarcla tags: rpc-transport | |
00:56 | did a little tidying up check-in: febd54396c user: bjbarcla tags: rpc-transport | |
2016-10-27
| ||
15:21 | Run tab resize fixed check-in: 5ff16368ff user: ritikaag tags: v1.62 | |
2016-10-26
| ||
14:48 | branch for rpc support check-in: d306d8dea0 user: bjbarcla tags: rpc-transport | |
14:21 | added support for -kill-servers and -transport switches on megatest check-in: cfb9ac119d user: bjbarcla tags: v1.62 | |
2016-10-25
| ||
21:02 | fixed -list-servers and -stop-server switches on megatest check-in: 73502a1626 user: bjbarcla tags: v1.62 | |
Modified client.scm from [c5821d20e2] to [69eebee107].
︙ | ︙ | |||
39 40 41 42 43 44 45 | ;; Not currently used! But, I think it *should* be used!!! (define (client:logout serverdat) (let ((ok (and (socket? serverdat) (cdb:logout serverdat *toppath* (client:get-signature))))) ok)) | > > | | | | | | | > > > > | > > > > > > > > > > > > > | > > > > > | > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > > | 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 | ;; Not currently used! But, I think it *should* be used!!! (define (client:logout serverdat) (let ((ok (and (socket? serverdat) (cdb:logout serverdat *toppath* (client:get-signature))))) ok)) ;; BB: commenting out orphan code ;;;;; ;; (define (client:connect iface port) ;; (case (server:get-transport) ;; ((rpc) (rpc:client-connect iface port)) ;; ((http) (http:client-connect iface port)) ;; ((zmq) (zmq:client-connect iface port)) ;; (else (rpc:client-connect iface port)))) (define (client:setup run-id #!key (remaining-tries 10)) (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) (let* ((server-dat (tasks:bb-get-server-info run-id)) (transport (if server-dat (string->symbol (tasks:hostinfo-get-transport server-dat)) 'noserver))) ;;(BB> "transport >"transport"< string? transport >"(string? transport)"< symbol? transport >"(symbol? transport)"<") (case transport ((noserver) ;; no server registered (if (<= remaining-tries 0) (begin (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id) (exit 1)) (begin (let ((num-available (tasks:bb-num-in-available-state run-id))) (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) (if (< num-available 2) (server:try-running run-id)) (thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. (client:setup run-id remaining-tries: (- remaining-tries 1)))))) ((http)(client:setup-http run-id server-dat remaining-tries)) ;; ((rpc) (rpc-transport:client-setup run-id)) ;;(client:setup-rpc run-id)) rpc not implemented; want to see a failure here for now. (else (debug:print-error 0 *default-log-port* "Transport [" transport "] specified for run-id [" run-id "] is not implemented in client:setup. Cannot proceed.") (exit 1))))) ;; client:setup-http ;; ;; For http transport, robustly ensure an advertised-running server is actually working and responding, and ;; establish tcp connection to server. For servers marked running but not responding, kill them and clear from mdb ;; (define (client:setup-http run-id server-dat remaining-tries) (let* ((iface (tasks:hostinfo-get-interface server-dat)) (hostname (tasks:hostinfo-get-hostname server-dat)) (port (tasks:hostinfo-get-port server-dat)) (start-res (http-transport:client-connect iface port)) (ping-res (rmt:login-no-auto-client-setup start-res run-id))) (if (and start-res ping-res) (begin (hash-table-set! *runremote* run-id start-res) ;; side-effect - *runremote* cache init fpr rmt:* (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) start-res) (begin ;; login failed but have a server record, clean out the record and try again (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) (http-transport:close-connections run-id) (hash-table-delete! *runremote* run-id) (tasks:kill-server-run-id run-id) (tasks:bb-server-force-clean-run-record run-id iface port " client:setup (server-dat = #t)") (if (> remaining-tries 8) (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time (server:try-running run-id) (thread-sleep! 5) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1)) )))) ;; (define (client:login-no-auto-setup server-info run-id) ;; (case (server:get-transport) ;; ((rpc) (rpc:login-no-auto-client-setup server-info run-id)) ;; ((http) (rmt:login-no-auto-client-setup server-info run-id)) ;; (else (rpc:login-no-auto-client-setup server-info run-id)))) ;; |
︙ | ︙ | |||
93 94 95 96 97 98 99 | ;; (thread-sleep! 5) ;; (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1))) ;; (begin ;; (debug:print 25 *default-log-port* "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info) ;; (thread-sleep! 5) ;; (client:setup run-id remaining-tries: (- remaining-tries 1)))))) ;; ;; YUK: rename server-dat here | | | 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | ;; (thread-sleep! 5) ;; (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1))) ;; (begin ;; (debug:print 25 *default-log-port* "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info) ;; (thread-sleep! 5) ;; (client:setup run-id remaining-tries: (- remaining-tries 1)))))) ;; ;; YUK: rename server-dat here ;; (let* ((server-dat (open-run-close tasks:get-server-info tasks:open-db run-id))) ;; (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) ;; (if server-dat ;; (let* ((iface (tasks:hostinfo-get-interface server-dat)) ;; (port (tasks:hostinfo-get-port server-dat)) ;; (start-res (http-transport:client-connect iface port)) ;; ;; (ping-res (server:ping-server run-id iface port)) ;; (ping-res (rmt:login-no-auto-client-setup start-res run-id))) |
︙ | ︙ | |||
150 151 152 153 154 155 156 | ;; 2. We are a run tests, list runs or other interactive process and we must figure out ;; *transport-type* and *runremote* from the monitor.db ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; | < < < < < < < < < < < < < < < < < < < < < < < < < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | | | | | | 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 | ;; 2. We are a run tests, list runs or other interactive process and we must figure out ;; *transport-type* and *runremote* from the monitor.db ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; ;; BB: commenting out orphan code. ;; ;; ;; keep this as a function to ease future ;; (define (client:start run-id server-info) ;; (http-transport:client-connect (tasks:hostinfo-get-interface server-info) ;; (tasks:hostinfo-get-port server-info))) ;; ;; client:signal-handler ;; (define (client:signal-handler signum) ;; (signal-mask! signum) ;; (set! *time-to-exit* #t) ;; (handle-exceptions ;; exn |
︙ | ︙ |
Modified http-transport.scm from [13883e3b0d] to [f8800e432b].
︙ | ︙ | |||
321 322 323 324 325 326 327 | (define (make-http-transport:server-dat)(make-vector 6)) (define (http-transport:server-dat-get-iface vec) (vector-ref vec 0)) (define (http-transport:server-dat-get-port vec) (vector-ref vec 1)) (define (http-transport:server-dat-get-api-uri vec) (vector-ref vec 2)) (define (http-transport:server-dat-get-api-url vec) (vector-ref vec 3)) (define (http-transport:server-dat-get-api-req vec) (vector-ref vec 4)) (define (http-transport:server-dat-get-last-access vec) (vector-ref vec 5)) | | | 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 | (define (make-http-transport:server-dat)(make-vector 6)) (define (http-transport:server-dat-get-iface vec) (vector-ref vec 0)) (define (http-transport:server-dat-get-port vec) (vector-ref vec 1)) (define (http-transport:server-dat-get-api-uri vec) (vector-ref vec 2)) (define (http-transport:server-dat-get-api-url vec) (vector-ref vec 3)) (define (http-transport:server-dat-get-api-req vec) (vector-ref vec 4)) (define (http-transport:server-dat-get-last-access vec) (vector-ref vec 5)) (define (http-transport:server-dat-get-transport vec) (vector-ref vec 6)) (define (http-transport:server-dat-make-url vec) (if (and (http-transport:server-dat-get-iface vec) (http-transport:server-dat-get-port vec)) (conc "http://" (http-transport:server-dat-get-iface vec) ":" |
︙ | ︙ | |||
346 347 348 349 350 351 352 | ;; ;; connect ;; (define (http-transport:client-connect iface port) (let* ((api-url (conc "http://" iface ":" port "/api")) (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) (api-req (make-request method: 'POST uri: api-uri)) | | | 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 | ;; ;; connect ;; (define (http-transport:client-connect iface port) (let* ((api-url (conc "http://" iface ":" port "/api")) (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) (api-req (make-request method: 'POST uri: api-uri)) (server-dat (vector iface port api-uri api-url api-req (current-seconds) 'http))) server-dat)) ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; (define (http-transport:keep-running server-id run-id) ;; if none running or if > 20 seconds since |
︙ | ︙ | |||
533 534 535 536 537 538 539 | (begin (current-error-port *alt-log-file*) (current-output-port *alt-log-file*))))) (if (server:check-if-running run-id) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0))) | | | | 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 | (begin (current-error-port *alt-log-file*) (current-output-port *alt-log-file*))))) (if (server:check-if-running run-id) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0))) (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'http)) (remtries 4)) (if (not server-id) (if (> remtries 0) (begin (thread-sleep! 2) (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'http) (- remtries 1))) (begin ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch") )) (let* ((th2 (make-thread (lambda () |
︙ | ︙ |
Modified launch.scm from [a58a11e1e1] to [af266dccde].
︙ | ︙ | |||
698 699 700 701 702 703 704 | ;; megatest.config (cache if all vars avail) ;; returns: ;; *toppath* ;; side effects: ;; sets; *configdat* (megatest.config info) ;; *runconfigdat* (runconfigs.config info) ;; *configstatus* (status of the read data) | | | 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 | ;; megatest.config (cache if all vars avail) ;; returns: ;; *toppath* ;; side effects: ;; sets; *configdat* (megatest.config info) ;; *runconfigdat* (runconfigs.config info) ;; *configstatus* (status of the read data) ;; *transport-type* (define (launch:setup #!key (force #f)) (let* ((toppath (or *toppath* (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath (runname (common:args-get-runname)) (target (common:args-get-target)) (linktree (common:get-linktree)) (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config |
︙ | ︙ | |||
825 826 827 828 829 830 831 832 833 834 835 836 837 838 | (debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config") ))) (if (and *toppath* (directory-exists? *toppath*)) (setenv "MT_RUN_AREA_HOME" *toppath*) (begin (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area."))) *toppath*)) (define (get-best-disk confdat testconfig) (let* ((disks (or (and testconfig (hash-table-ref/default testconfig "disks" #f)) (hash-table-ref/default confdat "disks" #f))) (minspace (let ((m (configf:lookup confdat "setup" "minspace"))) (string->number (or m "10000"))))) | > > | 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 | (debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config") ))) (if (and *toppath* (directory-exists? *toppath*)) (setenv "MT_RUN_AREA_HOME" *toppath*) (begin (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area."))) (server:set-transport) ;;(BB> "Transport is >"*transport-type*"<") *toppath*)) (define (get-best-disk confdat testconfig) (let* ((disks (or (and testconfig (hash-table-ref/default testconfig "disks" #f)) (hash-table-ref/default confdat "disks" #f))) (minspace (let ((m (configf:lookup confdat "setup" "minspace"))) (string->number (or m "10000"))))) |
︙ | ︙ |
Modified rmt.scm from [bb562bf1d7] to [f6f47d3907].
︙ | ︙ | |||
79 80 81 82 83 84 85 | ;; clean out old connections ;; (mutex-lock! *db-multi-sync-mutex*) (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin (for-each (lambda (run-id) (let ((connection (hash-table-ref/default *runremote* run-id #f))) (if (and (vector? connection) | | | > < | < | > > > | | | > | | | | | | | | | | | | | | | | 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | ;; clean out old connections ;; (mutex-lock! *db-multi-sync-mutex*) (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin (for-each (lambda (run-id) (let ((connection (hash-table-ref/default *runremote* run-id #f))) (if (and (vector? connection) (< (http-transport:server-dat-get-last-access connection) expire-time)) ;; BB> BBTODO: make this generic, not http transport specific. (begin (debug:print-info 0 *default-log-port* "Discarding connection to server for run-id " run-id ", too long between accesses") ;; bb- disabling nanomsg ;; SHOULD CLOSE THE CONNECTION HERE ;; (case *transport-type* ;; ((nmsg)(nn-close (http-transport:server-dat-get-socket ;; (hash-table-ref *runremote* run-id))))) (hash-table-delete! *runremote* run-id))))) (hash-table-keys *runremote*))) ;; (mutex-unlock! *db-multi-sync-mutex*) ;; (mutex-lock! *send-receive-mutex*) (let* ((run-id (if rid rid 0)) (connection-info (rmt:get-connection-info run-id))) ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also) (if connection-info ;; use the server if have connection info (let* ((transport-type (vector-ref connection-info 6)) (dat (case transport-type ;; BB: replaced *transport-type* global with run-id specific transport-type, item 6 in server-info vector which was populated by *-transport:client-connect with >> (vector iface port api-uri api-url api-req (current-seconds) 'http ) << ((http)(condition-case (http-transport:client-api-send-receive run-id connection-info cmd params) ((commfail)(vector #f "communications fail")) ((exn)(vector #f "other fail")))) ;;((rpc) (rpc-transport:client-api-send-receive run-id connection-info cmd params)) ;; BB: let us error out for now (else (debug:print-error 0 *default-log-port* "Transport [" transport "] specified for run-id [" run-id "] is not implemented in rmt:send-receive. Cannot proceed.") (exit 1)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (vector? connection-info)(http-transport:server-dat-update-last-access connection-info)) (if success (begin ;; (mutex-unlock! *send-receive-mutex*) (case *transport-type* ((http rpc) res) ;; (db:string->obj res)) ;; ((nmsg) res) )) ;; (vector-ref res 1))) (begin ;; let ((new-connection-info (client:setup run-id))) (debug:print 0 *default-log-port* "WARNING: Communication failed, trying call to rmt:send-receive again.") (case *transport-type* ;; ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info)))) ((http) (hash-table-delete! *runremote* run-id) ;; don't keep using the same connection ;; NOTE: killing server causes this process to block forever. No idea why. Dec 2. ;; (if (eq? (modulo attemptnum 5) 0) ;; (tasks:kill-server-run-id run-id tag: "api-send-receive-failed")) ;; (mutex-unlock! *send-receive-mutex*) ;; close the mutex here to allow other threads access to communications (tasks:start-and-wait-for-server (tasks:open-db) run-id 15) ;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1)))))) ;; no longer killing the server in http-transport:client-api-send-receive ;; may kill it here but what are the criteria? ;; start with three calls then kill server ;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id)) ;; (thread-sleep! 2) (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))))))) ;; no connection info? try to start a server, or access locally if no ;; server and the query is read-only ;; ;; Note: The tasks db was checked for a server in starting mode in the rmt:get-connection-info call ;; (if (and (< attemptnum 15) (member cmd api:write-queries)) (let ((faststart (configf:lookup *configdat* "server" "faststart"))) (hash-table-delete! *runremote* run-id) ;; (mutex-unlock! *send-receive-mutex*) (if (and faststart (equal? faststart "no")) (begin (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10) |
︙ | ︙ |
Modified rpc-transport.scm from [62a65daa58] to [702db23fbc].
︙ | ︙ | |||
45 46 47 48 49 50 51 | (set! *run-id* run-id) (if (args:get-arg "-daemonize") (daemon:ize)) (if (server:check-if-running run-id) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0))) | | | > > > > > | > | | | | > > | | | > > | | | | | | | | > > > | | 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | (set! *run-id* run-id) (if (args:get-arg "-daemonize") (daemon:ize)) (if (server:check-if-running run-id) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0))) (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'rpc)) (remtries 4)) (if (not server-id) (if (> remtries 0) (begin (thread-sleep! 2) (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'rpc) (- remtries 1))) (begin ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " rpc-transport:launch"))) (begin (rpc-transport:run (if (args:get-arg "-server")(args:get-arg "-server") "-") run-id server-id) (exit)))))) (define *rpc-listener-port* #f) (define *rpc-listener-port-bind-timestamp* #f) (define (rpc-transport:run hostn run-id server-id) (BB> "rpc-transport:run fired for hostn="hostn" run-id="run-id" server-id="server-id) (debug:print 2 *default-log-port* "Attempting to start the rpc server ...") ;; (trace rpc:publish-procedure!) (rpc:publish-procedure! 'server:login server:login) (rpc:publish-procedure! 'testing (lambda () "Just testing")) (BB> "flag1") (let* ((db #f) (tdbdat (tasks:open-db)) (hostname (let ((res (get-host-name))) (BB> "hostname="res) res)) (ipaddrstr (let* ((ipstr (if (string=? "-" hostn) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") (server:get-best-guess-address hostname) #f)) (res (if ipstr ipstr hostn))) (BB> "ipaddrstr="res) res)) ;; hostname))) (start-port (let ((res (portlogger:open-run-close portlogger:find-port))) (BB> "start-port="res) res)) (link-tree-path (configf:lookup *configdat* "setup" "linktree")) (rpc:listener (rpc-transport:find-free-port-and-open start-port)) (th1 (make-thread (lambda () ((rpc:make-server rpc:listener) #t)) "rpc:server")) ;; (cute (rpc:make-server rpc:listener) "rpc:server") ;; 'rpc:server)) (hostname (if (string=? "-" hostn) (get-host-name) hostn)) (ipaddrstr (if (string=? "-" hostn) (server:get-best-guess-address hostname) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") #f)) (portnum (let ((res (rpc:default-server-port))) (BB> "rpc:default-server-port="res" rpc-listener-port="*rpc-listener-port*) res)) (host:port (conc (if ipaddrstr ipaddrstr hostname) ":" portnum)) (tdb (tasks:open-db))) (BB> "Got here before thread start of rpc listener") (thread-start! th1) (BB> "started thread th1="th1) (set! db *inmemdb*) (tasks:server-set-interface-port (db:delay-if-busy tdbdat) server-id ipaddrstr portnum) (debug:print 0 *default-log-port* "Server started on " host:port) ;; (trace rpc:publish-procedure!) ;; (rpc:publish-procedure! 'server:login server:login) ;; (rpc:publish-procedure! 'testing (lambda () "Just testing")) ;;====================================================================== ;; ;; end of publish-procedure section ;;====================================================================== ;; (on-exit (lambda () (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "stopped"))) (set! *rpc:listener* rpc:listener) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") (set! *inmemdb* (db:setup run-id)) ;; if none running or if > 20 seconds since ;; server last used then start shutdown (let loop ((count 0)) (thread-sleep! 5) ;; no need to do this very often (let ((numrunning -1)) ;; (db:get-count-tests-running db))) (if (or (> numrunning 0) (> (+ *last-db-access* 60)(current-seconds))) (begin (debug:print-info 0 *default-log-port* "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*)) (loop (+ 1 count))) (begin (debug:print-info 0 *default-log-port* "Starting to shutdown the server side") (open-run-close tasks:server-delete-record tasks:open-db server-id " rpc-transport:try-start-server stop") (thread-sleep! 10) (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*) (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") )))))) (define (rpc-transport:find-free-port-and-open port #!key ) (handle-exceptions exn (begin (print "Failed to bind to port " (rpc:default-server-port) ", trying next port") (rpc-transport:find-free-port-and-open (+ port 1))) (rpc:default-server-port port) (set! *rpc-listener-port* port) ;; a bit paranoid about rpc:default-server-port parameter not changing across threads (as params are wont to do). keeping this global in my back pocket in case this causes problems (set! *rpc-listener-port-bind-timestamp* (current-milliseconds)) ;; may want to test how long it has been since the last bind attempt happened... (tcp-read-timeout 240000) (BB> "rpc-transport> attempting to bind tcp port "port) (tcp-listen (rpc:default-server-port) 10000))) (define (rpc-transport:ping run-id host port) (handle-exceptions exn (begin (print "SERVER_NOT_FOUND") (exit 1)) (let ((login-res ((rpc:procedure 'server:login host port) *toppath*))) |
︙ | ︙ | |||
177 178 179 180 181 182 183 | (let ((server-dat (list iface port #f #f #f))) (hash-table-set! *runremote* run-id server-dat) server-dat) (begin (server:try-running run-id) (thread-sleep! 2) (rpc-transport:client-setup run-id (- remtries 1))))) | | | 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 | (let ((server-dat (list iface port #f #f #f))) (hash-table-set! *runremote* run-id server-dat) server-dat) (begin (server:try-running run-id) (thread-sleep! 2) (rpc-transport:client-setup run-id (- remtries 1))))) (let* ((server-db-info (open-run-close tasks:get-server-info tasks:open-db run-id))) (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) (if server-db-info (let* ((iface (tasks:hostinfo-get-interface server-db-info)) (port (tasks:hostinfo-get-port server-db-info)) (server-dat (list iface port #f #f #f)) (ping-res ((rpc:procedure 'server:login host port) *toppath*))) (if start-res |
︙ | ︙ |
Modified server.scm from [19061b35b0] to [c9c38a4420].
︙ | ︙ | |||
59 60 61 62 63 64 65 | ;; (else (debug:print-error 0 *default-log-port* "No known transport set, transport=" transport ", using rpc") ;; (rpc-transport:launch run-id))))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== | > | | < < | | | | | | | > > > > > > > | 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 | ;; (else (debug:print-error 0 *default-log-port* "No known transport set, transport=" transport ", using rpc") ;; (rpc-transport:launch run-id))))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== ;; set global *transport-type* based on -transport switch and serer/transport configuration. default http otherwise. ;; called by launch:setup (define (server:set-transport) (let ((ttype (string->symbol (or (args:get-arg "-transport") (configf:lookup *configdat* "server" "transport") "http")))) (set! *transport-type* ttype) ttype)) ;; Get the transport -- DO NOT call this from client code. In client code, this is run-id sensitive and not a global (define (server:get-transport) (if *transport-type* *transport-type* (server:set-transport))) ;; Generate a unique signature for this server (define (server:mk-signature) (message-digest-string (md5-primitive) (with-output-to-string (lambda () (write (list (current-directory) (argv))))))) |
︙ | ︙ | |||
178 179 180 181 182 183 184 | (define (server:try-running run-id) (if (eq? run-id 0) (server:run run-id) (rmt:start-server run-id))) (define (server:check-if-running run-id) (let ((tdbdat (tasks:open-db))) | | | 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 | (define (server:try-running run-id) (if (eq? run-id 0) (server:run run-id) (rmt:start-server run-id))) (define (server:check-if-running run-id) (let ((tdbdat (tasks:open-db))) (let loop ((server (tasks:get-server-info (db:delay-if-busy tdbdat) run-id)) (trycount 0)) (if server ;; note: client:start will set *runremote*. this needs to be changed ;; also, client:start will login to the server, also need to change that. ;; ;; client:start returns #t if login was successful. ;; |
︙ | ︙ | |||
213 214 215 216 217 218 219 | (define (server:ping run-id host:port) (let ((tdbdat (tasks:open-db))) (let* ((host-port (let ((slst (string-split host:port ":"))) (if (eq? (length slst) 2) (list (car slst)(string->number (cadr slst))) #f))) (toppath (launch:setup)) | | | 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 | (define (server:ping run-id host:port) (let ((tdbdat (tasks:open-db))) (let* ((host-port (let ((slst (string-split host:port ":"))) (if (eq? (length slst) 2) (list (car slst)(string->number (cadr slst))) #f))) (toppath (launch:setup)) (server-db-dat (if (not host-port)(tasks:get-server-info (db:delay-if-busy tdbdat) run-id) #f))) (if (not run-id) (begin (debug:print-error 0 *default-log-port* "must specify run-id when doing ping, -run-id n") (print "ERROR: No run-id") (exit 1)) (if (and (not host-port) (not server-db-dat)) |
︙ | ︙ |
Modified tasks.scm from [a06114a2ac] to [c4bf88d4c8].
︙ | ︙ | |||
168 169 170 171 172 173 174 | (define (tasks:hostinfo-get-interface vec) (vector-ref vec 1)) (define (tasks:hostinfo-get-port vec) (vector-ref vec 2)) (define (tasks:hostinfo-get-pubport vec) (vector-ref vec 3)) (define (tasks:hostinfo-get-transport vec) (vector-ref vec 4)) (define (tasks:hostinfo-get-pid vec) (vector-ref vec 5)) (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) | | | | | | 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | (define (tasks:hostinfo-get-interface vec) (vector-ref vec 1)) (define (tasks:hostinfo-get-port vec) (vector-ref vec 2)) (define (tasks:hostinfo-get-pubport vec) (vector-ref vec 3)) (define (tasks:hostinfo-get-transport vec) (vector-ref vec 4)) (define (tasks:hostinfo-get-pid vec) (vector-ref vec 5)) (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) (define (tasks:server-lock-slot mdb run-id transport-type) (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") (if (< (tasks:num-in-available-state mdb run-id) 4) (begin (tasks:server-set-available mdb run-id transport-type) (thread-sleep! (/ (random 1500) 1000)) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed. (tasks:server-am-i-the-server? mdb run-id)) #f)) ;; register that this server may come online (first to register goes though with the process) (define (tasks:server-set-available mdb run-id transport-type) (sqlite3:execute mdb "INSERT INTO servers (pid,hostname,port,pubport,start_time, priority,state,mt_version,heartbeat, interface,transport,run_id) VALUES(?, ?, ?, ?, strftime('%s','now'), ?, ?, ?,-1,?, ?, ?);" (current-process-id) ;; pid (get-host-name) ;; hostname -1 ;; port -1 ;; pubport (random 1000) ;; priority (used a tiebreaker on get-available) "available" ;; state (common:version-signature) ;; mt_version -1 ;; interface ;; (conc (server:get-transport)) ;; transport (conc transport-type) ;; transport run-id )) (define (tasks:num-in-available-state mdb run-id) (let ((res 0)) (sqlite3:for-each-row (lambda (num-in-queue) |
︙ | ︙ | |||
325 326 327 328 329 330 331 | (lambda (a . b) (set! res (cons (apply vector a b) res))) mdb (conc "SELECT " selstr " FROM servers WHERE run_id=? AND state in ('available','running','dbprep') ORDER BY start_time DESC;") run-id) (vector header res))) | > > > > > > > > > > > > > > > | | | | | | 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 | (lambda (a . b) (set! res (cons (apply vector a b) res))) mdb (conc "SELECT " selstr " FROM servers WHERE run_id=? AND state in ('available','running','dbprep') ORDER BY start_time DESC;") run-id) (vector header res))) ;; BB> bb opinion - want to push responsibility into api (encapsulation), like waiting if db is busy and finding the db handle in the first place. why should the caller need to be concerned?? If my opinion carries, we'll remove the bb- and make other needful adjustments. (define (bb-mdb-inserter mdb-expecting-proc mdbless-args) (let ((mdb (db:delay-if-busy (tasks:open-db)))) (apply mdb-expecting-proc (cons mdb mdbless-args)))) (define (tasks:bb-get-server-info . args) (bb-mdb-inserter tasks:get-server-info args)) (define (tasks:bb-num-in-available-state . args) (bb-mdb-inserter tasks:num-in-available-state args)) ;; BB: renaming tasks:get-server to get-server-info to make clear we aren't creating servers here (define (tasks:get-server-info mdb run-id #!key (retries 10)) (let ((res #f) (best #f)) (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* "WARNING: tasks:get-server-info db access error.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 *default-log-port* " for run " run-id) (print-call-chain (current-error-port)) (if (> retries 0) (begin (debug:print 0 *default-log-port* " trying call to tasks:get-server-info again in 10 seconds") (thread-sleep! 10) (tasks:get-server-info mdb run-id retries: (- retries 0))) (debug:print 0 *default-log-port* "10 tries of tasks:get-server-info all crashed and burned. Giving up and returning \"no server found\""))) (sqlite3:for-each-row (lambda (id interface port pubport transport pid hostname) (set! res (vector id interface port pubport transport pid hostname))) mdb ;; removed: ;; strftime('%s','now')-heartbeat < 10 AND mt_version = ? "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers |
︙ | ︙ | |||
392 393 394 395 396 397 398 | ;; (else ;; #f)))) ;; try to start a server and wait for it to be available ;; (define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries) ;; ensure a server is running for this run | | | | 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 | ;; (else ;; #f)))) ;; try to start a server and wait for it to be available ;; (define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries) ;; ensure a server is running for this run (let loop ((server-dat (tasks:get-server-info (db:delay-if-busy tdbdat) run-id)) (delay-time 0)) (if (and (not server-dat) (< delay-time delay-max-tries)) (begin (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id) (debug:print 0 *default-log-port* "Try starting server for run-id " run-id)) (thread-sleep! (/ (random 2000) 1000)) (server:kind-run run-id) (thread-sleep! (min delay-time 1)) (loop (tasks:get-server-info (db:delay-if-busy tdbdat) run-id)(+ delay-time 1)))))) (define (tasks:get-all-servers mdb) (let ((res '())) (sqlite3:for-each-row (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res))) |
︙ | ︙ | |||
441 442 443 444 445 446 447 | (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")) ;; look up a server by run-id and send it a kill, also delete the record for that server ;; (define (tasks:kill-server-run-id run-id #!key (tag "default")) (let* ((tdbdat (tasks:open-db)) | | | | 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 | (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")) ;; look up a server by run-id and send it a kill, also delete the record for that server ;; (define (tasks:kill-server-run-id run-id #!key (tag "default")) (let* ((tdbdat (tasks:open-db)) (sdat (tasks:get-server-info (db:delay-if-busy tdbdat) run-id))) (if sdat (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed") (debug:print-info 0 *default-log-port* "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid kill-switch: "-9") ;; BB: added -9, let's not be kind here. we need it to die (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 *default-log-port* "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) ;;====================================================================== ;; M O N I T O R S |
︙ | ︙ |