Index: rmtmod.scm ================================================================== --- rmtmod.scm +++ rmtmod.scm @@ -1443,36 +1443,10 @@ (define (common:api-changed?) (not (equal? (substring (->string megatest-version) 0 4) (substring (conc (common:get-last-run-version)) 0 4)))) -;;====================================================================== -;; Move me elsewhere ... -;; RADT => Why do we meed the version check here, this is called only if version misma -;; -#;(define (common:cleanup-db dbstruct #!key (full #f)) - (apply db:multi-db-sync - dbstruct - 'schema - ;; 'new2old - 'killservers - 'adj-target - ;; 'old2new - 'new2old - ;; (if full - '(dejunk) - ;; '()) - ) - (if (common:api-changed?) - (common:set-last-run-version))) - -;; This login does no retries under the hood - it acts a bit like a ping. -;; Deprecated for nmsg-transport. -;; -;; (define (rmt:login-no-auto-client-setup connection-info) -;; (rmt:send-receive-no-auto-client-setup connection-info 'login 0 (list *toppath* megatest-version *my-client-signature*))) - (define (std-exit-procedure) ;;(common:telemetry-log-close) (on-exit (lambda () 0)) ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) (let ((no-hurry (if (bdat-time-to-exit *bdat*) ;; hurry up @@ -1488,27 +1462,31 @@ (if *server-info* (let ((pkt-file (conc (get-pkts-dir *toppath*) "/" (servdat-uuid *server-info*) ".pkt")) (dbfile (servdat-dbfile *server-info*))) - (debug:print-info 0 *default-log-port* "removing pkt "pkt-file) - (delete-file* pkt-file) (if dbfile + (begin + + ;; do a final sync here + (if (string-match ".*/main.db$" dbfile) (begin + (debug:print-info 0 *default-log-port* "removing pkt "pkt-file) + (delete-file* pkt-file) (debug:print-info 0 *default-log-port* "Releasing lock for "dbfile) (db:with-lock-db (servdat-dbfile *server-info*) (lambda (dbh dbfile) (db:release-lock dbh dbfile)))) (let* ((sdat *server-info*)) ;; we have a run-id server (rmt:send-receive-real *rmt:remote* *toppath* (db:run-id->dbname #f) - 'register-server + 'deregister-server `(,(servdat-uuid sdat) ,(current-process-id) ,(servdat-host sdat) ;; iface - ,(servdat-port sdat)))))))) + ,(servdat-port sdat))))))))) (if (bdat-task-db *bdat*) ;; TODO: Check that this is correct for task db (let ((db (cdr (bdat-task-db *bdat*)))) (if (sqlite3:database? db) (begin (sqlite3:interrupt! db) @@ -1534,238 +1512,23 @@ ) ) 0) -;;====================================================================== -;; Force a megatest cleanup-db if version is changed and skip-version-check not specified -;; Do NOT check if not on homehost! -;; -#;(define (common:exit-on-version-changed) - (if (common:on-homehost?) - (if (common:api-changed?) - (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) - (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.db")) - (read-only (not (file-writable? dbfile))) - (dbstruct (db:setup #t))) - (debug:print 0 *default-log-port* - "WARNING: Version mismatch!\n" - " expected: " (common:version-signature) "\n" - " got: " (common:get-last-run-version)) - (cond - ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t) - ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only) - (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db - (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (print-call-chain (current-error-port)) - (exit 1)) - (common:cleanup-db dbstruct))) - ((not (common:file-exists? mtconf)) - (debug:print 0 *default-log-port* " megatest.config does not exist in this area. Cannot proceed with megatest version migration.") - (exit 1)) - ((not (common:file-exists? dbfile)) - (debug:print 0 *default-log-port* " megatest.db does not exist in this area. Cannot proceed with megatest version migration.") - (exit 1)) - ((not (eq? (current-user-id)(file-owner mtconf))) - (debug:print 0 *default-log-port* " You do not own megatest.db in this area. Cannot proceed with megatest version migration.") - (exit 1)) - (read-only - (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") - (exit 1)) - (else - (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"") - (exit 1))))))) -;;====================================================================== -;; (begin -;; (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") -;; (exit 1)))) (define (common:run-sync?) ;; (and (common:on-homehost?) (args:get-arg "-server")) - -;; this one seems to be the general entry point -;; -#;(define (server:start-and-wait areapath #!key (timeout 60)) - (let ((give-up-time (+ (current-seconds) timeout))) - (let loop ((server-info (server:check-if-running areapath)) - (try-num 0)) - (if (or server-info - (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. - (server:record->url server-info) - (let ((num-ok (length (server:get-best (server:get-list areapath))))) - (if (and (> try-num 0) ;; first time through simply wait a little while then try again - (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one - (server:kind-run areapath)) - (thread-sleep! 5) - (loop (server:check-if-running areapath) - (+ try-num 1))))))) - ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; (define (server:ping host port server-id #!key (do-exit #f)) (let* ((sdat (servdat-init #f host port server-id))) (rmt:send-receive sdat 'ping '()))) - -;; ping the given server -;; -#;(define (server:check-server server-record) - (let* ((server-url (server:record->url server-record)) - (server-id (server:record->id server-record)) - (res (server:ping server-url server-id))) - (if res - server-url - #f))) - -;; no longer care if multiple servers are started by accident. older -;; servers will drop off in time. -;; -;; defunct -;; -#;(define (server:check-if-running areapath) ;; #!key (numservers "2")) - (let* ((ns (server:get-num-servers)) - (servers (server:get-best (server:get-list areapath)))) - (if (or (and servers - (null? servers)) - (not servers) - (and (list? servers) - (< (length servers) (pseudo-random-integer ns)))) ;; somewhere between 0 and numservers - #f - (let loop ((hed (car servers)) - (tal (cdr servers))) - (let ((res (server:check-server hed))) - (if res - hed - (if (null? tal) - #f - (loop (car tal)(cdr tal))))))))) - -;; kind start up of servers, wait 40 seconds before allowing another server for a given -;; run-id to be launched -;; -;; defunct -;; -#;(define (server:kind-run areapath) - ;; look for $MT_RUN_AREA_HOME/logs/server-start-last - ;; and wait for it to be at least 3 seconds old - ;; (server:wait-for-server-start-last-flag areapath) - (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? - (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun - (call-num (car last-run-dat)) - (when-run (cadr last-run-dat)) - (run-delay (+ (case call-num - ((0) 0) - ((1) 20) - ((2) 300) - (else 600)) - (pseudo-random-integer 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously - (lock-file (conc areapath "/logs/server-start.lock"))) - (if (> (- (current-seconds) when-run) run-delay) - (let* ((start-flag (conc areapath "/logs/server-start-last"))) - (common:simple-file-lock-and-wait lock-file expire-time: 15) - (debug:print-info 0 *default-log-port* "server:kind-run: touching " start-flag) - (system (conc "touch " start-flag)) ;; lazy but safe - (server:run areapath) - (thread-sleep! 2) ;; don't release the lock for at least a few seconds - (common:simple-file-release-lock lock-file))) - (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) - -#;(define (client:connect iface port) - (http-transport:client-connect iface port) - #;(case (server:get-transport) - ((rpc) (rpc:client-connect iface port)) - ((http) (http:client-connect iface port)) - ((zmq) (zmq:client-connect iface port)) - (else (rpc:client-connect iface port)))) - -;; -;; defunct -;; -#;(define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0)) - (print "got here") - ;; (client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects) - #;(case (server:get-transport) - ((rpc) (rpc-transport:client-setup remaining-tries: remaining-tries failed-connects: failed-connects)) ;;(client:setup-rpc run-id)) - ((http)(client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects)) - (else (rpc-transport:client-setup remaining-tries: remaining-tries failed-connects: failed-connects)))) ;; (client:setup-rpc run-id)))) - -;; Do all the connection work, look up the transport type and set up the -;; connection if required. -;; -;; There are two scenarios. -;; 1. We are a test manager and we received *transport-type* and *runremote* via cmdline -;; 2. We are a run tests, list runs or other interactive process and we must figure out -;; *transport-type* and *runremote* from the monitor.db -;; -;; client:setup -;; -;; lookup_server, need to remove *runremote* stuff -;; - -#;(define (client:setup-http areapath #!key (remaining-tries 100) (failed-connects 0)(area-dat #f)) - (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) - (server:start-and-wait areapath) - (if (<= remaining-tries 0) - (begin - (debug:print-error 0 *default-log-port* "failed to start or connect to server") - (exit 1)) - ;; - ;; Alternatively here, we can get the list of candidate servers and work our way - ;; through them searching for a good one. - ;; - (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) - (runremote (or area-dat *runremote*))) - (if (not server-dat) ;; no server found - (client:setup-http areapath remaining-tries: (- remaining-tries 1)) - (let ((host (cadr server-dat)) - (port (caddr server-dat)) - (server-id (caddr (cddr server-dat)))) - (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if (and (not area-dat) - (not *runremote*)) - (begin - (set! *runremote* (make-and-init-remote)) - (let* ((server-info (remote-server-info *runremote*))) - (if server-info - (begin - (remote-server-url-set! *runremote* (server:record->url server-info)) - (remote-server-id-set! *runremote* (server:record->id server-info))))))) - (if (and host port server-id) - (let* ((start-res (case *transport-type* - ((http)(http-transport:client-connect host port server-id)))) - (ping-res (case *transport-type* - ((http)(rmt:login-no-auto-client-setup start-res))))) - (if (and start-res - ping-res) - (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago - (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res) - (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) - start-res) - (begin ;; login failed but have a server record, clean out the record and try again - (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 - (case *transport-type* - ((http)(http-transport:close-connections))) - (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) - (thread-sleep! 1) - (client:setup-http areapath remaining-tries: (- remaining-tries 1)) - ))) - (begin ;; no server registered - ;; (server:kind-run areapath) - (server:start-and-wait areapath) - (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) - (thread-sleep! 1) ;; (+ 5 (pseudo-random-integer (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. - (client:setup-http areapath remaining-tries: (- remaining-tries 1))))))))) - ;;====================================================================== ;; http-transportmod.scm contents moved here ;;====================================================================== @@ -2350,25 +2113,33 @@ (debug:print 0 *default-log-port* "SERVER: dbprep") (db:setup dbname) ;; sets *dbstruct-db* as side effect ;; IFF I'm not main, call into main and register self (if (not is-main) - (debug:print-info 0 - *default-log-port* - "Register server returned: " - (rmt:register-server *rmt:remote* *toppath* iface port server-key dbname))) - - (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. + (let ((res (rmt:register-server *rmt:remote* + *toppath* iface port + server-key dbname))) + (if (not res) ;; we are not the server! + (begin + (debug:print 0 *default-log-port* "We are not the server for "dbname", exiting.") + (exit))))) + (debug:print 0 *default-log-port* + "SERVER: running, megatest version: " + (common:get-full-version)) (if watchdog - (if (not (member (thread-state watchdog) '(ready running blocked sleeping dead))) + (if (not (member (thread-state watchdog) + '(ready running blocked + sleeping dead))) (begin (debug:print-info 0 *default-log-port* "Starting watchdog thread (in state "(thread-state watchdog)")") (thread-start! watchdog))) - (debug:print 0 *default-log-port* "ERROR: *watchdog* not setup, cannot start it.")))) + (debug:print 0 *default-log-port* "ERROR: *watchdog* not setup, cannot start it.")) + (loop (+ count 1) 'running bad-sync-count start-time))) - ;; when things go wrong we don't want to be doing the various queries too often - ;; so we strive to run this stuff only every four seconds or so. + ;; when things go wrong we don't want to be doing the various + ;; queries too often so we strive to run this stuff only every + ;; four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) (if (and (<= rem-time 4) (> rem-time 0)) (thread-sleep! rem-time))) Index: tests/unittests/basicserver.scm ================================================================== --- tests/unittests/basicserver.scm +++ tests/unittests/basicserver.scm @@ -26,17 +26,17 @@ launchmod) (trace-call-sites #t) (trace ;; db:get-dbdat ;; rmt:find-main-server - ;; rmt:send-receive-real - ;; rmt:send-receive + rmt:send-receive-real + rmt:send-receive ;; sexpr->string ;; server-ready? ;; rmt:register-server ;; rmt:open-main-connection - ;; rmt:general-open-connection + rmt:general-open-connection ;; rmt:get-conny ;; common:watchdog ;; rmt:find-main-server ;; get-all-server-pkts ;; get-viable-servers @@ -79,10 +79,11 @@ (thread-sleep! 2) (test #f #t (list? (rmt:general-open-connection *rmt:remote* *toppath* ".db/2.db"))) (test #f '("SYSTEM" "RELEASE") (rmt:get-keys)) (test #f 1 (rmt:send-receive 'register-run #f (list keyvals "run1" "new" "n/a" "justme" #f))) +(print "Got here.") (test #t 1 (rmt:send-receive 'register-run 1 (list keyvals "run2" "new" "n/a" "justme" #f))) (test #f 1 (rmt:register-run keyvals "run2" "new" "n/a" "justme" #f)) ;; (delete-file* "logs/1.log")