Index: archive.scm ================================================================== --- archive.scm +++ archive.scm @@ -346,11 +346,11 @@ (rsync-exe (or (configf:lookup *configdat* "archive" "rsync") "rsync")) (print-prefix "Running: ") (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target-patt run-patt "megatest-db")) (archive-dir (if archive-info (cdr archive-info) #f)) (archive-id (if archive-info (car archive-info) -1)) - (home-host (common:get-homehost)) + (home-host (server:choose-server *toppath* 'homehost)) (archive-time (seconds->std-time-str (current-seconds))) (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time)) (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db")) (dbfile (conc archive-staging-db "/megatest.db"))) (create-directory archive-staging-db #t) Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -89,53 +89,52 @@ (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; - (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) + (let* ((server-dat (server:choose-server areapath 'best)) (runremote (or area-dat *runremote*))) (if (not server-dat) ;; no server found (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)) - (let ((host (cadr server-dat)) - (port (caddr server-dat)) - (server-id (caddr (cddr server-dat)))) - (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if (and (not area-dat) - (not *runremote*)) - (begin - (set! *runremote* (make-remote)) - (let* ((server-info (remote-server-info *runremote*))) - (if server-info - (begin - (remote-server-url-set! *runremote* (server:record->url server-info)) - (remote-server-id-set! *runremote* (server:record->id server-info))))))) - (if (and host port server-id) - (let* ((start-res (case *transport-type* - ((http)(http-transport:client-connect host port server-id)))) - (ping-res (case *transport-type* - ((http)(rmt:login-no-auto-client-setup start-res))))) - (if (and start-res - ping-res) - (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago - (if runremote - (begin - (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res) - (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) - start-res) - (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))) - (begin ;; login failed but have a server record, clean out the record and try again - (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 - (case *transport-type* - ((http)(http-transport:close-connections))) - (if *runremote* - (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) - ) - (thread-sleep! 1) - (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)) - ))) - (begin ;; no server registered - ;; (server:kind-run areapath) - (server:start-and-wait areapath) - (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) - (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. - (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))))))) + (match server-dat + ((host port start-time server-id) + (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) + (if (and (not area-dat) + (not *runremote*)) + (begin + (set! *runremote* (make-remote)) + (let* ((server-info (remote-server-info *runremote*))) + (if server-info + (begin + (remote-server-url-set! *runremote* (server:record->url server-info)) + (remote-server-id-set! *runremote* (server:record->id server-info))))))) + (if (and host port server-id) + (let* ((start-res (http-transport:client-connect host port server-id)) + (ping-res (rmt:login-no-auto-client-setup start-res))) + (if (and start-res + ping-res) + (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago + (if runremote + (begin + (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res) + (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) + start-res) + (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))) + (begin ;; login failed but have a server record, clean out the record and try again + (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 + (case *transport-type* + ((http)(http-transport:close-connections))) + (if *runremote* + (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) + ) + (thread-sleep! 1) + (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)) + ))) + (begin ;; no server registered + ;; (server:kind-run areapath) + (server:start-and-wait areapath) + (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) + (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. + (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))) + (else + (debug:print 0 *default-log-port* "ERROR: malformed server-dat="server-dat))))))) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -142,10 +142,11 @@ (define *passnum* 0) ;; when running track calls to run-tests or similar ;; (define *alt-log-file* #f) ;; used by -log ;; (define *common:denoise* (make-hash-table)) ;; for low noise printing (define *default-log-port* (current-error-port)) (define *time-zero* (current-seconds)) ;; for the watchdog +(define *on-exit-procs* '()) ;; add procs to this list to be executed on exit (define *default-area-tag* "local") ;; DATABASE ;; (define *dbstruct-dbs* #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this. ;; db stats @@ -315,11 +316,12 @@ (define (common:logpro-exit-code->test-status exit-code) (status-sym->string (common:logpro-exit-code->status-sym exit-code))) (defstruct remote - (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) + (hh-dat (or (server:choose-server *toppath* 'homehost) + (cons #f #f))) (server-url #f) ;; (server:check-if-running *toppath*) #f)) (server-id #f) (server-info (if *toppath* (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (connect-time (current-seconds)) @@ -1305,72 +1307,10 @@ ;;====================================================================== ;; logic for getting homehost. Returns (host . at-home) ;; IF *toppath* is not set, wait up to five seconds trying every two seconds ;; (this is to accomodate the watchdog) ;; -(define (common:get-homehost #!key (trynum 5)) - ;; called often especially at start up. use mutex to eliminate collisions - (mutex-lock! *homehost-mutex*) - (cond - (*home-host* - (mutex-unlock! *homehost-mutex*) - *home-host*) - ((not *toppath*) - (mutex-unlock! *homehost-mutex*) - (launch:setup) ;; safely mutexed now - (if (> trynum 0) - (begin - (thread-sleep! 2) - (common:get-homehost trynum: (- trynum 1))) - #f)) - (else - (let* ((currhost (get-host-name)) - (bestadrs (server:get-best-guess-address currhost)) - ;; first look in config, then look in file .homehost, create it if not found - (homehost (or (configf:lookup *configdat* "server" "homehost" ) - (handle-exceptions - exn - (if (> trynum 0) - (let ((delay-time (* (- 5 trynum) 5))) - (mutex-unlock! *homehost-mutex*) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying " - delay-time " seconds and trying again, message: " ((condition-property-accessor 'exn 'message) exn) - ", exn=" exn) - (thread-sleep! delay-time) - (common:get-homehost trynum: (- trynum 1))) - (begin - (mutex-unlock! *homehost-mutex*) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time) - "] Failed to read .homehost file after trying five times. Giving up and exiting, message: " - ((condition-property-accessor 'exn 'message) exn)) - (exit 1))) - (let ((hhf (conc *toppath* "/.homehost"))) - (if (common:file-exists? hhf) - (with-input-from-file hhf read-line) - (if (file-write-access? *toppath*) - (begin - (with-output-to-file hhf - (lambda () - (print bestadrs))) - (begin - (mutex-unlock! *homehost-mutex*) - (car (common:get-homehost)))) - #f)))))) - (at-home (or (equal? homehost currhost) - (equal? homehost bestadrs)))) - (set! *home-host* (cons homehost at-home)) - (mutex-unlock! *homehost-mutex*) - *home-host*)))) - -;;====================================================================== -;; am I on the homehost? -;; -(define (common:on-homehost?) - (let ((hh (common:get-homehost))) - (if hh - (cdr hh) - #f))) ;;====================================================================== ;; do we honor the caches of the config files? ;; (define (common:use-cache?) @@ -2049,11 +1989,11 @@ (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) (define (common:wait-for-homehost-load maxnormload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f - (common:get-homehost))) + (server:choose-server *toppath* 'homehost))) (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxnormload msg hh))) (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) @@ -3344,11 +3284,11 @@ pktsdirs)) ;;====================================================================== ;; use-lt is use linktree "lt" link to find pkts dir (define (common:save-pkt pktalist-in mtconf use-lt #!key (add-only #f)) ;; add-only saves the pkt only if there is a parent already - (if (or add-only + (if (or (not add-only) (hash-table-exists? *pkts-info* 'last-parent)) (let* ((parent (hash-table-ref/default *pkts-info* 'last-parent #f)) (pktalist (if parent (cons `(parent . ,parent) pktalist-in) @@ -3359,10 +3299,11 @@ (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f) (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) (pktsdir (car pktsdirs))) ;; assume it is there (hash-table-set! *pkts-info* 'pkts-dir pktsdir) pktsdir)))) + (debug:print 0 *default-log-port* "pktsdir: "pktsdir) (handle-exceptions exn (debug:print-info 0 "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!! (if (not (file-exists? pktsdir)) (create-directory pktsdir #t)) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -3808,13 +3808,13 @@ (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") (exit 1) ) ) - (if (not (common:on-homehost?)) + #;(if (not (common:on-homehost?)) (begin - (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (common:get-homehost)) + (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (server:get-homehost)) (debug:print 0 *default-log-port* "It will be slower.") )) (if (and (common:file-exists? mtdb-path) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -4586,22 +4586,28 @@ )) (define (std-exit-procedure) ;;(common:telemetry-log-close) - (on-exit (lambda () 0)) + (on-exit (lambda () 0)) ;; why is this here? ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) (let ((no-hurry (if *time-to-exit* ;; hurry up #f (begin (set! *time-to-exit* #t) #t)))) (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.") - (if (and no-hurry (debug:debug-mode 18)) + (if (and no-hurry + (debug:debug-mode 18)) (rmt:print-db-stats)) (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds (if *dbstruct-dbs* (db:close-all *dbstruct-dbs*)) ;; one second allocated + (if (list? *on-exit-procs*) + (for-each + (lambda (proc) + (proc)) + *on-exit-procs*)) (if *task-db* (let ((db (cdr *task-db*))) (if (sqlite3:database? db) (begin (sqlite3:interrupt! db) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -426,13 +426,27 @@ (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) (if (and sdat (not changed) (> (- (current-seconds) start-time) 2)) - (begin + (let* ((servinfodir (conc *toppath*"/.servinfo")) + (ipaddr (car sdat)) + (port (cadr sdat)) + (servinf (conc servinfodir"/"ipaddr":"port))) + (if (not (file-exists? servinfodir)) + (create-directory servinfodir #t)) + (with-output-to-file servinf + (lambda () + (print "SERVER STARTED: "ipaddr":"port" AT "(current-seconds)" server-id: "(server:get-client-signature)) + (print "started: "(seconds->year-week/day-time (current-seconds))))) + (set! *on-exit-procs* (cons + (lambda () + (delete-file* servinf)) + *on-exit-procs*)) + ;; put data about this server into a simple flat file host.port (debug:print-info 0 *default-log-port* "Received server alive signature") - (common:save-pkt `((action . alive) + #;(common:save-pkt `((action . alive) (T . server) (pid . ,(current-process-id)) (ipaddr . ,(car sdat)) (port . ,(cadr sdat))) *configdat* #t) @@ -439,13 +453,16 @@ sdat) (begin (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes - (begin + (let* ((ipaddr (car sdat)) + (port (cadr sdat)) + (servinf (conc *toppath*"/.servinfo/"ipaddr":"port))) (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") - (common:save-pkt `((action . died) + ;; (delete-file* servinf) ;; handled by on-exit, can be removed + #;(common:save-pkt `((action . died) (T . server) (pid . ,(current-process-id)) (ipaddr . ,(car sdat)) (port . ,(cadr sdat)) (msg . "Transport died?")) @@ -600,14 +617,17 @@ ;; (/ *total-non-write-delay* ;; *number-non-write-queries*)) ;; " ms") (db:print-current-query-stats) - (common:save-pkt `((action . exit) + #;(common:save-pkt `((action . exit) (T . server) (pid . ,(current-process-id))) - *configdat* #t) + *configdat* #t) + + ;; remove .servinfo file(s) here + (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") (exit))) ;; all routes though here end in exit ... ;; @@ -640,14 +660,14 @@ #;(let* ((num-alive (server:get-num-alive (server:get-list *toppath*)))) (if (> num-alive 3) (begin (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up")) (exit)))) - (common:save-pkt `((action . start) - (T . server) - (pid . ,(current-process-id))) - *configdat* #t) + #;(common:save-pkt `((action . start) + (T . server) + (pid . ,(current-process-id))) + *configdat* #t) (let* ((th2 (make-thread (lambda () (debug:print-info 0 *default-log-port* "Server run thread started") (http-transport:run (if (args:get-arg "-server") (args:get-arg "-server") Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -1563,11 +1563,11 @@ (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) ;; (list 'transport (conc *transport-type*)) ;; (list 'serverinf *server-info*) - (list 'homehost (let* ((hhdat (common:get-homehost))) + #;(list 'homehost (let* ((hhdat (server:get-homehost))) (if hhdat (car hhdat) #f))) (list 'serverurl (if *runremote* (remote-server-url *runremote*) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -656,13 +656,13 @@ (if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun") (debug:print 0 *default-log-port* (string-intersperse (argv) " "))) ;; some switches imply homehost. Exit here if not on homehost ;; -(let ((homehost-required (list "-cleanup-db" "-server"))) +(let ((homehost-required (list "-cleanup-db"))) (if (apply args:any? homehost-required) - (if (not (common:on-homehost?)) + (if (not (server:choose-server *toppath* 'home?)) (for-each (lambda (switch) (if (args:get-arg switch) (begin (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch @@ -2379,11 +2379,11 @@ (if (or (getenv "MT_RUNSCRIPT") (args:get-arg "-repl") (args:get-arg "-load")) (let* ((toppath (launch:setup)) (dbstructs (if (and toppath - (common:on-homehost?)) + (server:choose-server toppath 'home?)) (db:setup #t) #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f))) (if *toppath* (cond ((getenv "MT_RUNSCRIPT") Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -116,11 +116,12 @@ ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; ;; DOT SET_HOMEHOST -> MUTEXLOCK; ;; ensure we have a homehost record (if (not (pair? (remote-hh-dat runremote))) ;; not on homehost (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little - (remote-hh-dat-set! runremote (common:get-homehost))) + (let ((hh-data (server:choose-server areapath 'homehost))) + (remote-hh-dat-set! runremote (or hh-data (cons #f #f))))) ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) (cond #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.") Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -121,25 +121,26 @@ ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area - (let* ((curr-host (get-host-name)) + (let* (;; (curr-host (get-host-name)) ;; (attempt-in-progress (server:start-attempted? areapath)) ;; (dot-server-url (server:check-if-running areapath)) - (curr-ip (server:get-best-guess-address curr-host)) - (curr-pid (current-process-id)) - (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) - (target-host (car homehost)) + ;; (curr-ip (server:get-best-guess-address curr-host)) + ;; (curr-pid (current-process-id)) + ;; (homehost (server:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) + ;; (target-host (car homehost)) (testsuite (common:get-testsuite-name)) (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") "")) (cmdln (conc (common:get-megatest-exe) - " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") - " -daemonize " - "") + " -server - ";; (or target-host "-") + (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") + " -daemonize " + "") ;; " -log " logfile " -m testsuite:" testsuite " " profile-mode )) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? @@ -148,25 +149,25 @@ (push-directory areapath) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) ;; host.domain.tld match host? - (if (and target-host - ;; look at target host, is it host.domain.tld or ip address and does it - ;; match current ip or hostname - (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) - (not (equal? curr-ip target-host))) - (begin - (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) - (setenv "TARGETHOST" target-host))) - + ;; (if (and target-host + ;; ;; look at target host, is it host.domain.tld or ip address and does it + ;; ;; match current ip or hostname + ;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) + ;; (not (equal? curr-ip target-host))) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) + ;; (setenv "TARGETHOST" target-host))) + ;; (setenv "TARGETHOST_LOGF" logfile) (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") - (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) + ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds server-id ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let @@ -271,11 +272,11 @@ (> (length new-res) limit)) new-res ;; (take new-res limit) <= need intelligent sorting before this will work new-res) (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) -(define (server:get-num-alive srvlst) +#;(define (server:get-num-alive srvlst) (let ((num-alive 0)) (for-each (lambda (server) (handle-exceptions exn @@ -326,33 +327,35 @@ (list-ref b 3)))))) (if (> (length slst) nums) (take slst nums) slst))) -(define (server:get-first-best areapath) - (let ((srvrs (server:get-best (server:get-list areapath)))) - (if (and srvrs - (not (null? srvrs))) - (car srvrs) - #f))) - -(define (server:get-rand-best areapath) - (let ((srvrs (server:get-best (server:get-list areapath)))) - (if (and (list? srvrs) - (not (null? srvrs))) - (let* ((len (length srvrs)) - (idx (random len))) - (list-ref srvrs idx)) - #f))) +;; ;; switch from server:get-list to server:get-servers-info +;; ;; +;; (define (server:get-first-best areapath) +;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; (if (and srvrs +;; (not (null? srvrs))) +;; (car srvrs) +;; #f))) +;; +;; (define (server:get-rand-best areapath) +;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; (if (and (list? srvrs) +;; (not (null? srvrs))) +;; (let* ((len (length srvrs)) +;; (idx (random len))) +;; (list-ref srvrs idx)) +;; #f))) (define (server:record->id servr) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn) #f) - (match-let (((mod-time host port start-time server-id pid) + (match-let (((host port start-time server-id) servr)) (if server-id server-id #f)))) @@ -360,11 +363,11 @@ (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn) #f) - (match-let (((mod-time host port start-time server-id pid) + (match-let (((host port start-time server-id) servr)) (if (and host port) (conc host ":" port) #f)))) @@ -377,11 +380,11 @@ ;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough. ;; if it is old enough, overwrite it and wait 0.25 seconds. ;; if it then has the wrong server key, wait + 1 and call this function recursively. ;; -(define (server:wait-for-server-start-last-flag areapath) +#;(define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last")) ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) (server-key (conc (get-host-name) "-" (current-process-id)))) (if (file-exists? start-flag) @@ -405,20 +408,93 @@ (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) (thread-sleep! ( + 1 idletime)) (server:wait-for-server-start-last-flag areapath))))))) +;; oldest server alive determines host then choose random of youngest +;; five servers on that host +;; +(define (server:get-servers-info areapath) + (let* ((servinfodir (conc *toppath*"/.servinfo")) + (allfiles (glob (conc servinfodir"/*"))) + (res (make-hash-table))) + (for-each + (lambda (f) + (let* ((hostport (pathname-strip-directory f)) + (serverdat (server:logf-get-start-info f))) + (hash-table-set! res hostport serverdat))) + allfiles) + res)) + +;; oldest server alive determines host then choose random of youngest +;; five servers on that host +;; +;; mode: +;; best - get best server (random of newest five) +;; home - get home host based on oldest server +;; info - print info +(define (server:choose-server areapath #!optional (mode 'best)) + ;; age is current-starttime + ;; find oldest alive + ;; 1. sort by age ascending and ping until good + ;; find alive rand from youngest + ;; 1. sort by age descending + ;; 2. take five + ;; 3. check alive, discard if not and repeat + (let* ((serversdat (server:get-servers-info areapath)) + (by-time-asc (sort (hash-table-keys serversdat) ;; list of "host:port" + (lambda (a b) + (>= (list-ref (hash-table-ref serversdat a) 2) + (list-ref (hash-table-ref serversdat b) 2)))))) + (if (not (null? by-time-asc)) + (let* ((oldest (last by-time-asc)) + (oldest-dat (hash-table-ref serversdat oldest)) + (host (list-ref oldest-dat 0)) + (all-valid (filter (lambda (x) + (equal? host (list-ref (hash-table-ref serversdat x) 0))) + by-time-asc)) + (best-five (lambda () + (if (> (length all-valid) 5) + (map (lambda (x) + (hash-table-ref serversdat x)) + (take all-valid 5)) + all-valid))) + (names->dats (lambda (names) + (map (lambda (x)(hash-table-ref serversdat x)) names))) + (am-home? (lambda () + (let* ((currhost (get-host-name)) + (bestadrs (server:get-best-guess-address currhost))) + (or (equal? host currhost) + (equal? host bestadrs)))))) + (case mode + ((info) + (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) + (print "youngest: "(hash-table-ref serversdat (car all-valid)))) + ((home) host) + ((homehost) (cons host (am-home?))) ;; shut up old code + ((home?) (am-home?)) + ((best-five)(names->dats (best-five))) + ((all-valid)(names->dats all-valid)) + ((best) (let* ((best-five (best-five)) + (len (length best-five))) + (list-ref best-five (random len)))) + + (else + (debug:print 0 *default-log-port* "ERROR: invalid command "mode) + #f))) + #f))) - + ;; kind start up of server, wait before allowing another server for a given ;; area to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least seconds old - (server:wait-for-server-start-last-flag areapath) - (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? + ;; (server:wait-for-server-start-last-flag areapath) + (server:run areapath) + #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((lock-file (conc areapath "/logs/server-start.lock"))) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 25) (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) (system (conc "touch " start-flag)) ;; lazy but safe @@ -434,35 +510,33 @@ (let loop ((server-info (server:check-if-running areapath)) (try-num 0)) (if (or server-info (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. (server:record->url server-info) - (let ((num-ok (length (server:get-best (server:get-list areapath))))) + (let ((num-ok (length (server:choose-server areapath 'all-valid)))) (if (and (> try-num 0) ;; first time through simply wait a little while then try again (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one - (server:kind-run areapath)) + (server:run areapath)) (thread-sleep! 5) (loop (server:check-if-running areapath) (+ try-num 1))))))) -(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. - (define (server:get-num-servers #!key (numservers 2)) (let ((ns (string->number (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) (or ns numservers))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed - (servers (server:get-best (server:get-list areapath)))) + (servers (server:choose-server areapath 'best-five))) ;; (server:get-best (server:get-list areapath)))) (if (or (and servers (null? servers)) - (not servers) - (and (list? servers) - (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers + (not servers)) + ;; (and (list? servers) + ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res @@ -473,15 +547,12 @@ ;; ping the given server ;; (define (server:check-server server-record) (let* ((server-url (server:record->url server-record)) - (server-id (server:record->id server-record)) - (res (case *transport-type* - ((http)(server:ping server-url server-id)) - ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) - ))) + (server-id (server:record->id server-record)) + (res (server:ping server-url server-id))) (if res server-url #f))) (define (server:kill servr)