Changes In Branch v1.63-tdb-dotserver-refactor Excluding Merge-Ins
This is equivalent to a diff from 2148615256 to 5cb3a069f8
2017-01-03
| ||
16:07 | wip Closed-Leaf check-in: 5cb3a069f8 user: bjbarcla tags: v1.63-tdb-dotserver-refactor | |
14:08 | wip check-in: e617679e45 user: bjbarcla tags: v1.63-tdb-dotserver-refactor | |
13:55 | Merged in v1.63-server-fix branch to v1.63 check-in: 3e16b59518 user: mrwellan tags: v1.63 | |
2016-12-29
| ||
15:16 | fixed bug with kind-run call Closed-Leaf check-in: 2148615256 user: bjbarcla tags: v1.63-server-fix | |
00:14 | added script manyservers.sh which will be basis for a test of server start & collision resiliency check-in: 3d418034bd user: bjbarcla tags: v1.63-server-fix | |
Modified common.scm from [4b29489636] to [d04656dbed].
︙ | ︙ | |||
134 135 136 137 138 139 140 | (define *run-info-cache* (make-hash-table)) ;; run info is stable, no need to reget (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) | | | 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | (define *run-info-cache* (make-hash-table)) ;; run info is stable, no need to reget (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:read-dotserver->server-url *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds ;; launching and hosts (defstruct host |
︙ | ︙ |
Modified http-transport.scm from [85b3cef6fd] to [e1abcb3338].
︙ | ︙ | |||
397 398 399 400 401 402 403 | (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") ;;(BB> "http-transport: ->dbprep") (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") ;;(BB> "http-transport: ->running") | | | | 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 | (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") ;;(BB> "http-transport: ->dbprep") (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") ;;(BB> "http-transport: ->running") (server:write-dotserver *toppath* iface port (current-process-id) 'http) ;; create file .server (thread-start! *watchdog*) (server:complete-attempt *toppath*)) ;; delete file .starting-server (begin ;; gotta exit nicely ;;(BB> "http-transport: ->collision") (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") (http-transport:server-shutdown server-id port)))))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. |
︙ | ︙ | |||
454 455 456 457 458 459 460 | (current-seconds))) (begin (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))) ;; ;; Consider implementing some smarts here to re-insert the record or kill self is ;; the db indicates so | | > > > > > > | 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 | (current-seconds))) (begin (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))) ;; ;; Consider implementing some smarts here to re-insert the record or kill self is ;; the db indicates so ;; ;; BB - added this because servers are hanging about, alive and well ;; but in defunctdefault state in tdb and a .server file ;; preventing replacement servers from starting. (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") ;; ;; (if (tasks:server-am-i-the-server? tdb run-id) ;; (tasks:server-set-state! tdb server-id "running")) ;; (loop 0 server-state bad-sync-count (current-milliseconds))) (http-transport:server-shutdown server-id port)))))) ;; code cut out from above |
︙ | ︙ |
Modified rmt.scm from [c70f311cf0] to [193d5930f9].
︙ | ︙ | |||
116 117 118 119 120 121 122 | ;; (rmt:open-qry-close-locally cmd 0 params)) ;; no server contact made and this is a write, passively start a server ((and (not (remote-server-url *runremote*)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") | | | 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 | ;; (rmt:open-qry-close-locally cmd 0 params)) ;; no server contact made and this is a write, passively start a server ((and (not (remote-server-url *runremote*)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") (let ((serverconn (server:read-dotserver->server-url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if serverconn (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed (if (not (server:start-attempted? *toppath*)) (server:kind-run *toppath*)))) (if (cdr (remote-hh-dat *runremote*)) ;; we are on the homehost, just do the call (begin (mutex-unlock! *rmt-mutex*) |
︙ | ︙ |
Modified server.scm from [24b58ed32b] to [055a606575].
︙ | ︙ | |||
135 136 137 138 139 140 141 | (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) (cond (attempt-in-progress (debug:print 0 *default-log-port* "INFO: Not trying to start server because attempt is in progress: "attempt-in-progress)) (dot-server-url | | | 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 | (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) (cond (attempt-in-progress (debug:print 0 *default-log-port* "INFO: Not trying to start server because attempt is in progress: "attempt-in-progress)) (dot-server-url (debug:print 0 *default-log-port* "INFO: Not trying to start server because one is already running : "dot-server-url)) (else (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) ;; host.domain.tld match host? (if (and target-host ;; look at target host, is it host.domain.tld or ip address and does it |
︙ | ︙ | |||
206 207 208 209 210 211 212 | (file-modification-time flagfile)) 15)) ;; exists and less than 15 seconds old (with-input-from-file flagfile (lambda () (read-line)))) ((file-exists? flagfile) ;; it is stale. (server:complete-attempt areapath) #f) (else #f))))) | | > > > > > > > > > > > > > > > > > > > > > > > > > | | > | | | | | | | | | | | 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 | (file-modification-time flagfile)) 15)) ;; exists and less than 15 seconds old (with-input-from-file flagfile (lambda () (read-line)))) ((file-exists? flagfile) ;; it is stale. (server:complete-attempt areapath) #f) (else #f))))) (define (server:read-dotserver areapath) (let ((dotfile (conc areapath "/.server"))) (handle-exceptions exn #f ;; if things go wrong pretend we can't see the file (if (and (file-exists? dotfile) (file-read-access? dotfile)) (with-input-from-file dotfile (lambda () (read-line))) #f)))) (define (server:read-dotserver->server-url areapath) (let* ((temp (server:read-dotserver areapath)) (tokens (if temp (string-split temp ":") '()))) (if (eq? 4 (length tokens)) (string-join (list-ref tokens 0) ":" (list-ref tokens 1)) #f))) (define (server:read-dotserver->pid areapath) (let* ((temp (server:read-dotserver areapath)) (tokens (if temp (string-split temp ":") '()))) (if (eq? 4 (length tokens)) (list-ref tokens 2) #f))) (define (server:read-dotserver->transport areapath) (let* ((temp (server:read-dotserver areapath)) (tokens (if temp (string-split temp ":") '()))) (if (eq? 4 (length tokens)) (string->symbol (list-ref tokens 3)) #f))) (define (server:running-or-starting? 
areapath) ;; Note: may be unreiable on non-homehost due to NFS lag (or (server:read-dotserver areapath) (server:start-attempted? areapath))) ;; write a .server file in *toppath* with hostport ;; return #t on success, #f otherwise ;; (define (server:write-dotserver areapath host port pid transport) (let ((lock-file (conc areapath "/.server.lock")) (server-file (conc areapath "/.server")) (payload (conc host ":" port ":" pid ":" transport))) (if (common:simple-file-lock lock-file) (let ((res (handle-exceptions exn #f ;; failed for some reason, for the moment simply return #f (with-output-to-file server-file (lambda () (print payload))) #t))) (debug:print-info 0 *default-log-port* "server file " server-file " for " payload " created") (common:simple-file-release-lock lock-file) res) #f))) (define (server:remove-dotserver-file areapath hostport) (let ((serverurl (server:read-dotserver->server-url areapath)) (server-file (conc areapath "/.server")) (lock-file (conc areapath "/.server.lock"))) (if (and serverurl (string-match (conc ".*:" hostport "$") serverurl)) ;; port matches, good enough info to decide to remove the file (if (common:simple-file-lock lock-file) (begin (handle-exceptions exn #f (delete-file* server-file)) (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " removed") (common:simple-file-release-lock lock-file)))))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. 
;; (define (server:check-if-running areapath) (let* ((serverurl (server:read-dotserver->server-url areapath))) ;; tdbdat (tasks:open-db))) (if serverurl (let* ((res (case *transport-type* ((http)(server:ping-server serverurl)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res serverurl (begin (server:remove-dotserver-file areapath ".*") ;; remove stale dotserver file #f))) #f))) ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; (define (server:ping host-port-in #!key (do-exit #f)) (let ((host:port (if (not host-port-in) ;; use read-dotserver to find (server:read-dotserver->server-url *toppath*) (if (number? host-port-in) ;; we were handed a server-id (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) ;; (print "srec: " srec " host-port-in: " host-port-in) (if srec (conc (vector-ref srec 3) ":" (vector-ref srec 4)) (conc "no such server-id " host-port-in))) host-port-in)))) |
︙ | ︙ |
Modified tasks.scm from [a0c6ff1ee2] to [65c2fe0cbf].
︙ | ︙ | |||
328 329 330 331 332 333 334 335 336 337 338 339 340 341 | (conc "SELECT " selstr " FROM servers WHERE state in ('available','running','dbprep') ORDER BY start_time DESC;") ) (vector header res))) (define (tasks:get-server mdb run-id #!key (retries 10)) (let ((res #f) (best #f)) (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* "WARNING: tasks:get-server db access error.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 *default-log-port* " for run " run-id) | > > > > | 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 | (conc "SELECT " selstr " FROM servers WHERE state in ('available','running','dbprep') ORDER BY start_time DESC;") ) (vector header res))) (define (tasks:get-server mdb run-id #!key (retries 10)) (let ((res #f) (best #f)) (set! res (vector id interface port pubport transport pid hostname))) (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* "WARNING: tasks:get-server db access error.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 *default-log-port* " for run " run-id) |
︙ | ︙ | |||
352 353 354 355 356 357 358 | mdb ;; removed: ;; strftime('%s','now')-heartbeat < 10 AND mt_version = ? "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers WHERE run_id=? AND state='running' ORDER BY start_time DESC LIMIT 1;" run-id) ;; (common:version-signature) run-id) res))) | | < < < < < < < | < < < < < < < < | | 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 | mdb ;; removed: ;; strftime('%s','now')-heartbeat < 10 AND mt_version = ? "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers WHERE run_id=? AND state='running' ORDER BY start_time DESC LIMIT 1;" run-id) ;; (common:version-signature) run-id) res))) (define (tasks:server-running-or-starting? mdb run-id) (server:running-or-starting? *toppath*)) (define (tasks:need-server run-id) (equal? (configf:lookup *configdat* "server" "required") "yes")) ;; (maxqry (cdr (rmt:get-max-query-average run-id))) ;; (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) ;; (cond ;; (forced |
︙ | ︙ |