Index: archive.scm ================================================================== --- archive.scm +++ archive.scm @@ -346,11 +346,11 @@ (rsync-exe (or (configf:lookup *configdat* "archive" "rsync") "rsync")) (print-prefix "Running: ") (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target-patt run-patt "megatest-db")) (archive-dir (if archive-info (cdr archive-info) #f)) (archive-id (if archive-info (car archive-info) -1)) - (home-host (server:get-homehost)) + (home-host (server:choose-server *toppath* 'homehost)) (archive-time (seconds->std-time-str (current-seconds))) (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time)) (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db")) (dbfile (conc archive-staging-db "/megatest.db"))) (create-directory archive-staging-db #t) Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -89,51 +89,52 @@ (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; - (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) + (let* ((server-dat (server:choose-server areapath 'best)) (runremote (or area-dat *runremote*))) (if (not server-dat) ;; no server found (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)) - (let ((host (cadr server-dat)) - (port (caddr server-dat)) - (server-id (caddr (cddr server-dat)))) - (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if (and (not area-dat) - (not *runremote*)) - (begin - (set! *runremote* (make-remote)) - (let* ((server-info (remote-server-info *runremote*))) - (if server-info - (begin - (remote-server-url-set! *runremote* (server:record->url server-info)) - (remote-server-id-set! 
*runremote* (server:record->id server-info))))))) - (if (and host port server-id) - (let* ((start-res (http-transport:client-connect host port server-id)) - (ping-res (rmt:login-no-auto-client-setup start-res))) - (if (and start-res - ping-res) - (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago - (if runremote - (begin - (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res) - (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) - start-res) - (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))) - (begin ;; login failed but have a server record, clean out the record and try again - (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 - (case *transport-type* - ((http)(http-transport:close-connections))) - (if *runremote* - (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) - ) - (thread-sleep! 1) - (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)) - ))) - (begin ;; no server registered - ;; (server:kind-run areapath) - (server:start-and-wait areapath) - (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) - (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. - (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))))))) + (match server-dat + ((host port start-time server-id) + (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) + (if (and (not area-dat) + (not *runremote*)) + (begin + (set! 
*runremote* (make-remote)) + (let* ((server-info (remote-server-info *runremote*))) + (if server-info + (begin + (remote-server-url-set! *runremote* (server:record->url server-info)) + (remote-server-id-set! *runremote* (server:record->id server-info))))))) + (if (and host port server-id) + (let* ((start-res (http-transport:client-connect host port server-id)) + (ping-res (rmt:login-no-auto-client-setup start-res))) + (if (and start-res + ping-res) + (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago + (if runremote + (begin + (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res) + (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) + start-res) + (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))) + (begin ;; login failed but have a server record, clean out the record and try again + (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 + (case *transport-type* + ((http)(http-transport:close-connections))) + (if *runremote* + (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) + ) + (thread-sleep! 1) + (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)) + ))) + (begin ;; no server registered + ;; (server:kind-run areapath) + (server:start-and-wait areapath) + (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) + (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. 
+ (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))) + (else + (debug:print 0 *default-log-port* "ERROR: malformed server-dat="server-dat))))))) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -316,11 +316,12 @@ (define (common:logpro-exit-code->test-status exit-code) (status-sym->string (common:logpro-exit-code->status-sym exit-code))) (defstruct remote - (hh-dat (server:get-homehost)) ;; homehost record ( addr . hhflag ) + (hh-dat (or (server:choose-server *toppath* 'homehost) + (cons #f #f))) (server-url #f) ;; (server:check-if-running *toppath*) #f)) (server-id #f) (server-info (if *toppath* (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (connect-time (current-seconds)) @@ -1988,11 +1989,11 @@ (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) (define (common:wait-for-homehost-load maxnormload msg) - (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. + (let* ((hh-dat (if (server:choose-server *toppath* 'home?) ;; if we are on the homehost then pass in #f so the calls are local. common:on-homehost? is removed by this patch, so ask choose-server directly.
#f - (server:get-homehost))) + (server:choose-server *toppath* 'homehost))) (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxnormload msg hh))) (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -3808,11 +3808,11 @@ (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") (exit 1) ) ) - (if (not (common:on-homehost?)) + #;(if (not (common:on-homehost?)) (begin (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (server:get-homehost)) (debug:print 0 *default-log-port* "It will be slower.") )) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -1563,11 +1563,11 @@ (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) ;; (list 'transport (conc *transport-type*)) ;; (list 'serverinf *server-info*) - (list 'homehost (let* ((hhdat (server:get-homehost))) + #;(list 'homehost (let* ((hhdat (server:get-homehost))) (if hhdat (car hhdat) #f))) (list 'serverurl (if *runremote* (remote-server-url *runremote*) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -658,11 +658,11 @@ ;; some switches imply homehost. Exit here if not on homehost ;; (let ((homehost-required (list "-cleanup-db"))) (if (apply args:any? 
homehost-required) - (if (not (common:on-homehost?)) + (if (not (server:choose-server *toppath* 'home?)) (for-each (lambda (switch) (if (args:get-arg switch) (begin (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch @@ -2379,11 +2379,11 @@ (if (or (getenv "MT_RUNSCRIPT") (args:get-arg "-repl") (args:get-arg "-load")) (let* ((toppath (launch:setup)) (dbstructs (if (and toppath - (common:on-homehost?)) + (server:choose-server toppath 'home?)) (db:setup #t) #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f))) (if *toppath* (cond ((getenv "MT_RUNSCRIPT") Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -116,11 +116,12 @@ ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; ;; DOT SET_HOMEHOST -> MUTEXLOCK; ;; ensure we have a homehost record (if (not (pair? (remote-hh-dat runremote))) ;; not on homehost (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little - (remote-hh-dat-set! runremote (server:get-homehost))) + (let ((hh-data (server:choose-server areapath 'homehost))) + (remote-hh-dat-set! runremote (or hh-data (cons #f #f))))) ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) (cond #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.") Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -121,25 +121,26 @@ ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area - (let* ((curr-host (get-host-name)) + (let* (;; (curr-host (get-host-name)) ;; (attempt-in-progress (server:start-attempted? 
areapath)) ;; (dot-server-url (server:check-if-running areapath)) - (curr-ip (server:get-best-guess-address curr-host)) - (curr-pid (current-process-id)) - (homehost (server:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) - (target-host (car homehost)) + ;; (curr-ip (server:get-best-guess-address curr-host)) + ;; (curr-pid (current-process-id)) + ;; (homehost (server:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) + ;; (target-host (car homehost)) (testsuite (common:get-testsuite-name)) (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") "")) (cmdln (conc (common:get-megatest-exe) - " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") - " -daemonize " - "") + " -server - ";; (or target-host "-") + (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") + " -daemonize " + "") ;; " -log " logfile " -m testsuite:" testsuite " " profile-mode )) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? @@ -148,25 +149,25 @@ (push-directory areapath) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) ;; host.domain.tld match host? - (if (and target-host - ;; look at target host, is it host.domain.tld or ip address and does it - ;; match current ip or hostname - (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) - (not (equal? 
curr-ip target-host))) - (begin - (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) - (setenv "TARGETHOST" target-host))) - + ;; (if (and target-host + ;; ;; look at target host, is it host.domain.tld or ip address and does it + ;; ;; match current ip or hostname + ;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) + ;; (not (equal? curr-ip target-host))) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) + ;; (setenv "TARGETHOST" target-host))) + ;; (setenv "TARGETHOST_LOGF" logfile) (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") - (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) + ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds server-id ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let @@ -454,69 +455,36 @@ (best-five (lambda () (if (> (length all-valid) 5) (map (lambda (x) (hash-table-ref serversdat x)) (take all-valid 5)) - all-valid)))) + (map (lambda (x)(hash-table-ref serversdat x)) all-valid)))) ;; return server records (not names) in both branches + (names->dats (lambda (names) + (map (lambda (x)(hash-table-ref serversdat x)) names))) + (am-home? (lambda () + (let* ((currhost (get-host-name)) + (bestadrs (server:get-best-guess-address currhost))) + (or (equal? host currhost) + (equal? 
host bestadrs)))))) (case mode ((info) (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) (print "youngest: "(hash-table-ref serversdat (car all-valid)))) - ((home) host) - ((best-five)(best-five)) - ((valid) (map (lambda (x)(hash-table-ref serverdat x)) all-valid)) - ((best)(let* ((best-five (best-five)) - (len (length best-five))) - (list-ref best-five len))) + ((home) host) + ((homehost) (cons host (am-home?))) ;; shut up old code + ((home?) (am-home?)) + ((best-five)(best-five)) ;; best-five already yields records, do not look them up a second time + ((all-valid)(names->dats all-valid)) + ((best) (let* ((best-five (best-five)) + (len (length best-five))) + (list-ref best-five (random len)))) (else (debug:print 0 *default-log-port* "ERROR: invalid command "mode) #f))) #f))) -(define (server:get-homehost #!key (trynum 5)) - ;; called often especially at start up. use mutex to eliminate collisions - (mutex-lock! *homehost-mutex*) - (cond - (*home-host* - (mutex-unlock! *homehost-mutex*) - *home-host*) - ((not *toppath*) - (mutex-unlock! *homehost-mutex*) - (launch:setup) ;; safely mutexed now - (if (> trynum 0) - (begin - (thread-sleep! 2) - (server:get-homehost trynum: (- trynum 1))) - #f)) - (else - (let* ((currhost (get-host-name)) - (bestadrs (server:get-best-guess-address currhost)) - (homehost (server:choose-server *toppath* 'home)) - (at-home (or (equal? homehost currhost) - (equal? homehost bestadrs)))) - - ;; if no homehost start server, wait a bit and check again - (if homehost - (begin - (set! *home-host* (cons homehost at-home)) - (mutex-unlock! *homehost-mutex*) - *home-host*) - (begin - (server:kind-run *toppath*) - (thread-sleep! 5) - (server:get-homehost trynum: (- trynum 1)))))))) - -;;====================================================================== -;; am I on the homehost? -;; -(define (common:on-homehost?) 
- (let ((hh (server:get-homehost))) - (if hh - (cdr hh) - #f))) - ;; kind start up of server, wait before allowing another server for a given ;; area to be launched ;; (define (server:kind-run areapath) @@ -545,11 +513,11 @@ (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. (server:record->url server-info) (let ((num-ok (length (server:choose-server areapath 'all-valid)))) (if (and (> try-num 0) ;; first time through simply wait a little while then try again (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one - (server:kind-run areapath)) + (server:run areapath)) (thread-sleep! 5) (loop (server:check-if-running areapath) (+ try-num 1))))))) (define (server:get-num-servers #!key (numservers 2))