Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -131,11 +131,12 @@ (define *test-info* (make-hash-table)) ;; cache the test info records, update the state, status, run_duration etc. from testdat.db (define *run-info-cache* (make-hash-table)) ;; run info is stable, no need to reget (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) -;; Awful. Please FIXME + +;; cache environment vars for each run here (define *env-vars-by-run-id* (make-hash-table)) ;; Testconfig and runconfig caches. (define *testconfigs* (make-hash-table)) ;; test-name => testconfig (define *runconfigs* (make-hash-table)) ;; target => runconfig @@ -515,11 +516,11 @@ ;;====================================================================== (define (common:run-sync?) (let ((ohh (common:on-homehost?)) (srv (args:get-arg "-server"))) - (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv) + ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv) (and (common:on-homehost?) (args:get-arg "-server")))) ;;;; run-ids ;; if #f use *db-local-sync* : or 'local-sync-flags Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -371,11 +371,13 @@ (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) (if (and sdat (not changed) (> (- (current-seconds) start-time) 2)) - sdat + (begin + (debug:print-info 0 *default-log-port* "Received server alive signature") + sdat) (begin (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes (begin @@ -386,38 +388,46 @@ (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) (last-access 0) - (server-timeout (server:get-timeout))) + (server-timeout (server:get-timeout)) + (server-going #f)) (let loop ((count 0) (server-state 'available) - (bad-sync-count 0)) + (bad-sync-count 0) + (start-time (current-milliseconds))) ;; Use this opportunity to sync the tmp db to megatest.db - (if *dbstruct-db* - (let ((start-time (current-milliseconds)) - (sync-time #f) - (rem-time #f)) - (thread-sleep! 4)) - ;; Removed code is pasted below (keeping it around until we are clear it is not needed). - ;; no *dbstruct-db* yet, set running after our first pass through and start the db - (if (eq? server-state 'available) - (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers - (if (equal? new-server-id server-id) - (begin - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") - (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access - (set! *dbstruct-db* (db:setup)) ;; run-id)) - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") - (server:write-dotserver *toppath* (conc iface ":" port))) - (begin ;; gotta exit nicely - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") - (http-transport:server-shutdown server-id port)))))) + (if (not server-going) ;; *dbstruct-db* + ;; Removed code is pasted below (keeping it around until we are clear it is not needed). + ;; no *dbstruct-db* yet, set running after our first pass through and start the db + (if (eq? server-state 'available) + (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers + (if (equal? new-server-id server-id) + (begin + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") + (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access + (set! *dbstruct-db* (db:setup)) ;; run-id)) + (set! server-going #t) + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") + (server:write-dotserver *toppath* (conc iface ":" port)) + (delete-file* (conc *toppath* "/.starting-server"))) + (begin ;; gotta exit nicely + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") + (http-transport:server-shutdown server-id port)))))) + + ;; when things go wrong we don't want to be doing the various queries too often + ;; so we strive to run this stuff only every four seconds or so. + (let* ((sync-time (- (current-milliseconds) start-time)) + (rem-time (quotient (- 4000 sync-time) 1000))) + (if (and (<= rem-time 4) + (> rem-time 0)) + (thread-sleep! rem-time))) (if (< count 1) ;; 3x3 = 9 secs aprox - (loop (+ count 1) 'running bad-sync-count)) + (loop (+ count 1) 'running bad-sync-count (current-milliseconds))) ;; Check that iface and port have not changed (can happen if server port collides) (mutex-lock! *heartbeat-mutex*) (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) @@ -457,11 +467,11 @@ ;; the db indicates so ;; ;; (if (tasks:server-am-i-the-server? tdb run-id) ;; (tasks:server-set-state! tdb server-id "running")) ;; - (loop 0 server-state bad-sync-count)) + (loop 0 server-state bad-sync-count (current-milliseconds))) (http-transport:server-shutdown server-id port)))))) ;; code cut out from above ;; ;; (condition-case @@ -523,10 +533,14 @@ ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch run-id) + (with-output-to-file + (conc *toppath* "/.starting-server") + (lambda () + (print (current-process-id) " on " (get-host-name)))) (let* ((tdbdat (tasks:open-db))) (set! *run-id* run-id) (if (args:get-arg "-daemonize") (begin (daemon:ize) @@ -535,11 +549,13 @@ (current-error-port *alt-log-file*) (current-output-port *alt-log-file*))))) (if (server:check-if-running run-id) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") - (exit 0))) + (exit 0)) + (begin ;; ok, no server detected, clean out any lingering records + (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id "notresponding"))) (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)) (remtries 4)) (if (not server-id) (if (> remtries 0) (begin @@ -548,10 +564,11 @@ (- remtries 1))) (begin ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch") + (delete-file* (conc *toppath* "/.starting-server")) )) (let* ((th2 (make-thread (lambda () (debug:print-info 0 *default-log-port* "Server run thread started") (http-transport:run (if (args:get-arg "-server") Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -388,12 +388,12 @@ (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) - (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) - (tconfigreg (tests:get-all))) + (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) + (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) (if (list? cmdinfo) ;; ((testpath /tmp/mrwellan/jazzmind/src/example_run/tests/sqlitespeed) ;; (test-name sqlitespeed) (runscript runscript.rb) (db-host localhost) (run-id 1)) (let* ((testpath (assoc/default 'testpath cmdinfo)) ;; testpath is the test spec area (top-path (assoc/default 'toppath cmdinfo)) @@ -436,11 +436,12 @@ (change-directory top-path) (begin (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " top-path " not found") (thread-sleep! 10) (loop (+ count 1))))) - + (launch:setup) ;; should be properly in the top-path now + (set! tconfigreg (tests:get-all)) (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (if (eq? signum signal/stop) (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting.")) (set! *time-to-exit* #t) @@ -703,12 +704,14 @@ ;; *runconfigdat* (runconfigs.config info) ;; *configstatus* (status of the read data) ;; (define (launch:setup #!key (force #f)) (mutex-lock! *launch-setup-mutex*) - (if *toppath* + (if (and *toppath* + (eq? *configstatus* 'fulldata)) ;; got it all (begin + (debug:print 0 *default-log-port* "NOTE: skipping launch:setup-body call since we have fulldata") (mutex-unlock! *launch-setup-mutex*) *toppath*) (let ((res (launch:setup-body force: force))) (mutex-unlock! *launch-setup-mutex*) res))) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -697,45 +697,45 @@ (if (args:get-arg "-server") ;; Server? Start up here. ;; (let ((tl (launch:setup)) - (run-id (and (args:get-arg "-run-id") - (string->number (args:get-arg "-run-id")))) + ;; (run-id (and (args:get-arg "-run-id") + ;; (string->number (args:get-arg "-run-id")))) (transport-type (string->symbol (or (args:get-arg "-transport") "http")))) - (if run-id - (begin - (server:launch run-id transport-type) - (set! *didsomething* #t)) - (debug:print-error 0 *default-log-port* "server requires run-id be specified with -run-id"))) - - ;; Not a server? This section will decide how to communicate - ;; - ;; Setup client for all expect listed here - (if (null? (lset-intersection - equal? - (hash-table-keys args:arg-hash) - '("-list-servers" - "-stop-server" - "-kill-server" - "-show-cmdinfo" - "-list-runs" - "-ping"))) - (if (launch:setup) - (let ((run-id (and (args:get-arg "-run-id") - (string->number (args:get-arg "-run-id"))))) - ;; (set! *fdb* (filedb:open-db (conc *toppath* "/db/paths.db"))) - ;; if not list or kill then start a client (if appropriate) - (if (or (args-defined? "-h" "-version" "-create-megatest-area" "-create-test") - (eq? (length (hash-table-keys args:arg-hash)) 0)) - (debug:print-info 1 *default-log-port* "Server connection not needed") - (begin - ;; (if run-id - ;; (client:launch run-id) - ;; (client:launch 0) ;; without run-id we'll start a server for "0" - #t - )))))) + ;; (if run-id + ;; (begin + (server:launch 0 transport-type) + (set! *didsomething* #t))) +;; ;; (debug:print-error 0 *default-log-port* "server requires run-id be specified with -run-id"))) +;; +;; ;; Not a server? This section will decide how to communicate +;; ;; +;; ;; Setup client for all expect listed here +;; (if (null? (lset-intersection +;; equal? +;; (hash-table-keys args:arg-hash) +;; '("-list-servers" +;; "-stop-server" +;; "-kill-server" +;; "-show-cmdinfo" +;; "-list-runs" +;; "-ping"))) +;; (if (launch:setup) +;; (let ((run-id (and (args:get-arg "-run-id") +;; (string->number (args:get-arg "-run-id"))))) +;; ;; (set! *fdb* (filedb:open-db (conc *toppath* "/db/paths.db"))) +;; ;; if not list or kill then start a client (if appropriate) +;; (if (or (args-defined? "-h" "-version" "-create-megatest-area" "-create-test") +;; (eq? (length (hash-table-keys args:arg-hash)) 0)) +;; (debug:print-info 1 *default-log-port* "Server connection not needed") +;; (begin +;; ;; (if run-id +;; ;; (client:launch run-id) +;; ;; (client:launch 0) ;; without run-id we'll start a server for "0" +;; #t +;; )))))) (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server") (args:get-arg "-kill-server")) (let ((tl (launch:setup))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -59,17 +59,18 @@ ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in *runremote* ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; - (if (and #f ;; FORCE NO GO FOR RIGHT NOW + (if (and ;; #f ;; FORCE NO GO FOR RIGHT NOW (not *runremote*) ;; we trust *runremote* to reflect that a server was found previously (not (member cmd api:read-only-queries))) ;; we don't trust so much the list of write queries (let ((serverconn (server:check-if-running *toppath*))) (if serverconn (set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed - (server:kind-run *toppath*)))) + (if (not (server:start-attempted? *toppath*)) + (server:kind-run *toppath*))))) (rmt:open-qry-close-locally cmd (if rid rid 0) params)) ;; (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin ;; (for-each Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -107,22 +107,23 @@ ;; (define (server:run areapath) ;; areapath is ignored for now. (let* ((curr-host (get-host-name)) (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) - (target-host (configf:lookup *configdat* "server" "homehost" )) + (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) + (target-host (car homehost)) (testsuite (common:get-testsuite-name)) - (logfile (conc *toppath* "/logs/server-" curr-pid ".log")) + (logfile (conc *toppath* "/logs/server.log")) (cmdln (conc (common:get-megatest-exe) " -server " (or target-host "-") " -run-id " 0 (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") (conc " -daemonize -log " logfile) "") " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) ;; we want the remote server to start in *toppath* so push there (push-directory *toppath*) - (debug:print 0 *default-log-port* "INFO: Starting server (" cmdln ") as none running ...") + (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) ;; host.domain.tld match host? (if (and target-host ;; look at target host, is it host.domain.tld or ip address and does it @@ -157,15 +158,23 @@ (server:run areapath) (hash-table-set! *server-kind-run* areapath (current-seconds)))))) ;; The generic run a server command. Dispatches the call to server 0 if run-id != 0 ;; -(define (server:try-running run-id) - (if (eq? run-id 0) - (server:run run-id) - (rmt:start-server run-id))) +;; (define (server:try-running run-id) +;; (if (eq? run-id 0) +;; (server:run run-id) +;; (rmt:start-server run-id))) +(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. +(define (server:start-attempted? areapath) + (let ((flagfile (conc areapath "/.starting-server"))) + (and (file-exists? flagfile) + (< (- (current-seconds) + (file-modification-time flagfile)) + 15)))) ;; exists and less than 15 seconds old + (define (server:read-dotserver areapath) (let ((dotfile (conc areapath "/.server"))) (if (and (file-exists? dotfile) (file-read-access? dotfile)) (with-input-from-file