Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -107,14 +107,12 @@ (if server-info (begin (remote-server-url-set! *runremote* (server:record->url server-info)) (remote-server-id-set! *runremote* (server:record->id server-info))))))) (if (and host port server-id) - (let* ((start-res (case *transport-type* - ((http)(http-transport:client-connect host port server-id)))) - (ping-res (case *transport-type* - ((http)(rmt:login-no-auto-client-setup start-res))))) + (let* ((start-res (http-transport:client-connect host port server-id)) + (ping-res (rmt:login-no-auto-client-setup start-res))) (if (and start-res ping-res) (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago (if runremote (begin Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -142,10 +142,11 @@ (define *passnum* 0) ;; when running track calls to run-tests or similar ;; (define *alt-log-file* #f) ;; used by -log ;; (define *common:denoise* (make-hash-table)) ;; for low noise printing (define *default-log-port* (current-error-port)) (define *time-zero* (current-seconds)) ;; for the watchdog +(define *on-exit-procs* '()) ;; add procs to this list to be executed on exit (define *default-area-tag* "local") ;; DATABASE ;; (define *dbstruct-dbs* #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this. ;; db stats @@ -3344,11 +3345,11 @@ pktsdirs)) ;;====================================================================== ;; use-lt is use linktree "lt" link to find pkts dir (define (common:save-pkt pktalist-in mtconf use-lt #!key (add-only #f)) ;; add-only saves the pkt only if there is a parent already - (if (or add-only + (if (or (not add-only) (hash-table-exists? *pkts-info* 'last-parent)) (let* ((parent (hash-table-ref/default *pkts-info* 'last-parent #f)) (pktalist (if parent (cons `(parent . ,parent) pktalist-in) @@ -3359,10 +3360,11 @@ (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f) (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) (pktsdir (car pktsdirs))) ;; assume it is there (hash-table-set! *pkts-info* 'pkts-dir pktsdir) pktsdir)))) + (debug:print 0 *default-log-port* "pktsdir: "pktsdir) (handle-exceptions exn (debug:print-info 0 "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!! (if (not (file-exists? pktsdir)) (create-directory pktsdir #t)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -4586,22 +4586,28 @@ )) (define (std-exit-procedure) ;;(common:telemetry-log-close) - (on-exit (lambda () 0)) + (on-exit (lambda () 0)) ;; why is this here? ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) (let ((no-hurry (if *time-to-exit* ;; hurry up #f (begin (set! *time-to-exit* #t) #t)))) (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.") - (if (and no-hurry (debug:debug-mode 18)) + (if (and no-hurry + (debug:debug-mode 18)) (rmt:print-db-stats)) (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds (if *dbstruct-dbs* (db:close-all *dbstruct-dbs*)) ;; one second allocated + (if (list? *on-exit-procs*) + (for-each + (lambda (proc) + (proc)) + *on-exit-procs*)) (if *task-db* (let ((db (cdr *task-db*))) (if (sqlite3:database? db) (begin (sqlite3:interrupt! db) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -426,13 +426,27 @@ (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) (if (and sdat (not changed) (> (- (current-seconds) start-time) 2)) - (begin + (let* ((servinfodir (conc *toppath*"/.servinfo")) + (ipaddr (car sdat)) + (port (cadr sdat)) + (servinf (conc servinfodir"/"ipaddr":"port))) + (if (not (file-exists? servinfodir)) + (create-directory servinfodir #t)) + (with-output-to-file servinf + (lambda () + (print "SERVER STARTED: "ipaddr":"port" AT "(current-seconds)" server-id: "(server:get-client-signature)) + (print "started: "(seconds->year-week/day-time (current-seconds))))) + (set! *on-exit-procs* (cons + (lambda () + (delete-file* servinf)) + *on-exit-procs*)) + ;; put data about this server into a simple flat file host.port (debug:print-info 0 *default-log-port* "Received server alive signature") - (common:save-pkt `((action . alive) + #;(common:save-pkt `((action . alive) (T . server) (pid . ,(current-process-id)) (ipaddr . ,(car sdat)) (port . ,(cadr sdat))) *configdat* #t) @@ -439,13 +453,16 @@ sdat) (begin (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes - (begin + (let* ((ipaddr (car sdat)) + (port (cadr sdat)) + (servinf (conc *toppath*"/.servinfo/"ipaddr":"port))) (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") - (common:save-pkt `((action . died) + ;; (delete-file* servinf) ;; handled by on-exit, can be removed + #;(common:save-pkt `((action . died) (T . server) (pid . ,(current-process-id)) (ipaddr . ,(car sdat)) (port . ,(cadr sdat)) (msg . "Transport died?")) @@ -600,14 +617,17 @@ ;; (/ *total-non-write-delay* ;; *number-non-write-queries*)) ;; " ms") (db:print-current-query-stats) - (common:save-pkt `((action . exit) + #;(common:save-pkt `((action . exit) (T . server) (pid . ,(current-process-id))) - *configdat* #t) + *configdat* #t) + + ;; remove .servinfo file(s) here + (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") (exit))) ;; all routes though here end in exit ... ;; @@ -640,14 +660,14 @@ #;(let* ((num-alive (server:get-num-alive (server:get-list *toppath*)))) (if (> num-alive 3) (begin (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up")) (exit)))) - (common:save-pkt `((action . start) - (T . server) - (pid . ,(current-process-id))) - *configdat* #t) + #;(common:save-pkt `((action . start) + (T . server) + (pid . ,(current-process-id))) + *configdat* #t) (let* ((th2 (make-thread (lambda () (debug:print-info 0 *default-log-port* "Server run thread started") (http-transport:run (if (args:get-arg "-server") (args:get-arg "-server") Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -326,10 +326,12 @@ (list-ref b 3)))))) (if (> (length slst) nums) (take slst nums) slst))) +;; switch from server:get-list to server:get-servers-info +;; (define (server:get-first-best areapath) (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and srvrs (not (null? srvrs))) (car srvrs) @@ -377,11 +379,11 @@ ;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough. ;; if it is old enough, overwrite it and wait 0.25 seconds. ;; if it then has the wrong server key, wait + 1 and call this function recursively. ;; -(define (server:wait-for-server-start-last-flag areapath) +#;(define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last")) ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) (server-key (conc (get-host-name) "-" (current-process-id)))) (if (file-exists? start-flag) @@ -405,11 +407,50 @@ (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) (thread-sleep! ( + 1 idletime)) (server:wait-for-server-start-last-flag areapath))))))) +;; oldest server alive determines host then choose random of youngest +;; five servers on that host +;; +(define (server:get-servers-info areapath) + (let* ((servinfodir (conc *toppath*"/.servinfo")) + (allfiles (glob (conc servinfodir"/*"))) + (res (make-hash-table))) + (for-each + (lambda (f) + (let* ((hostport (pathname-strip-directory f)) + (serverdat (server:logf-get-start-info f))) + (hash-table-set! res hostport serverdat))) + allfiles) + res)) +;; oldest server alive determines host then choose random of youngest +;; five servers on that host +;; +(define (server:choose-server areapath) + ;; age is current-starttime + ;; find oldest alive + ;; 1. sort by age ascending and ping until good + ;; find alive rand from youngest + ;; 1. sort by age descending + ;; 2. take five + ;; 3. check alive, discard if not and repeat + (let* ((serversdat (server:get-servers-info areapath)) + (by-time-asc (sort (hash-table-keys serversdat) + (lambda (a b) + (>= (list-ref (hash-table-ref serversdat a) 2) + (list-ref (hash-table-ref serversdat b) 2)))))) + (if (not (null? by-time-asc)) + (let* ((oldest (last by-time-asc)) + (oldest-dat (hash-table-ref serversdat oldest)) + (host (list-ref oldest-dat 1)) + (all-valid (filter (lambda (x)(equal? host (list-ref x 1))) by-time-asc))) + (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) + (print "youngest: "(hash-table-ref serversdat (car all-valid))) + (car all-valid)) + #f))) ;; kind start up of server, wait before allowing another server for a given ;; area to be launched ;; (define (server:kind-run areapath)