Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -228,12 +228,13 @@ ;; ;; called in http-transport and replicated in rmt.scm for *local* access. ;; (define (db:setup run-id) (assert *toppath* "FATAL: db:setup called before toppath is available.") - (let* ((dbstruct (make-dbr:dbstruct))) - (db:get-dbdat dbstruct *toppath* run-id) + (let* ((dbstruct (make-dbr:dbstruct)) + (db-file (db:run-id->path *toppath* run-id))) + (db:get-dbdat dbstruct *toppath* db-file) (set! *dbstruct-db* dbstruct) dbstruct)) ;;====================================================================== ;; setting/getting a lock on the db for only one server per db @@ -241,30 +242,36 @@ ;; NOTE: ;; These operate directly on the disk file, NOT on the inmemory db ;; The lockname is the filename (can have many to one, run-id to fname ;;====================================================================== -(define (db:get-iam-server-lock dbstruct apath run-id) - (let* ((dbh (db:get-ddb apath dbstruct run-id)) - (dbfname (db:run-id->path run-id))) - (sqlite3:with-transaction - dbh - (lambda () - (let* ((locked (db:get-locker dbh dbfname))) - (if (not locked) - (db:take-lock dbh dbfname))))))) +;; called before db is open? +;; +(define (db:get-iam-server-lock dbh dbfname) + (sqlite3:with-transaction + dbh + (lambda () + (let* ((locked (db:get-locker dbh dbfname))) + (if (not locked) + (db:take-lock dbh dbfname) + #f))))) ;; (exn sqlite3) (define (db:get-locker dbh dbfname) (condition-case - (sqlite3:first-row dbh "SELECT owner_id,owner_host,event_time FROM locks WHERE lockname=%;" dbfname) + (sqlite3:first-row dbh "SELECT owner_pid,owner_host,event_time FROM locks WHERE lockname=%;" dbfname) (exn (sqlite3) #f))) +;; should never fail because it is run in a transaction with a test for the lock +;; (define (db:take-lock dbh dbfname) - (condition-case - (sqlite3:first-row dbh "INSERT INTO locks lockname,owner_id,owner_host VALUES (?,?,?);" dbfname (current-process-id) (get-host-name)) - (exn (sqlite3) #f))) + ;; (condition-case + ;; (begin + (sqlite3:execute dbh "INSERT INTO locks (lockname,owner_pid,owner_host) VALUES (?,?,?);" dbfname (current-process-id) (get-host-name)) + ;; #t) + ;; (exn (sqlite3) #f))) + #t) (define (db:release-lock dbh dbfname) (sqlite3:execute dbh "DELETE FROM locks WHERE lockname=?;" dbfname)) ;;====================================================================== Index: http-transportmod.scm ================================================================== --- http-transportmod.scm +++ http-transportmod.scm @@ -96,11 +96,11 @@ ;; ;; Configurations for server (tcp-buffer-size 2048) (max-connections 2048) -(defstruct sdat +(defstruct servdat host port uuid) (define (http-transport:make-server-url hostport) @@ -226,11 +226,11 @@ (http-transport:try-start-server ipaddrstr (portlogger:open-run-close portlogger:find-port))) (begin (print "ERROR: Tried and tried but could not start the server")))) ;; any error in following steps will result in a retry - (set! *server-info* (make-sdat host: ipaddrstr port: portnum)) + (set! *server-info* (make-servdat host: ipaddrstr port: portnum)) (debug:print 0 *default-log-port* "INFO: Trying to start server on " ipaddrstr ":" portnum) ;; This starts the spiffy server ;; NEED WAY TO SET IP TO #f TO BIND ALL ;; (start-server bind-address: ipaddrstr port: portnum) (if config-hostname ;; this is a hint to bind directly @@ -447,10 +447,19 @@ server-dat)) ;;====================================================================== ;; NEW SERVER METHOD ;;====================================================================== + +;; only use for main.db - need to re-write some of this :( +;; +(define (get-lock-db sdat dbfile) + (let* ((dbh (db:open-run-db dbfile db:initialize-db)) + (res (db:get-iam-server-lock dbh dbfile))) + (sqlite3:finalize! dbh) + res)) + (define *srvpktspec* `((server (host . h) (port . p) (servkey . k) @@ -471,15 +480,10 @@ pktspec: pkt-spec ptype: 'server))) (debug:print 0 *default-log-port* "Server on "host":"port" registered in pkt "uuid) uuid)) -;; ya, fake it for now -;; -(define (register-server-in-db db-file) - #t) - (define (get-pkts-dir #!optional (apath #f)) (let* ((effective-toppath (or *toppath* apath))) (assert effective-toppath "ERROR: get-pkts-dir called without *toppath* set. Exiting.") (let* ((pdir (conc effective-toppath "/.meta/srvpkts"))) @@ -612,29 +616,34 @@ ;; TODO: ;; 1. change sdat to stuct ;; 2. add uuid to struct ;; 3. update uuid in sdat here ;; - (sdat-uuid-set! sdat + (servdat-uuid-set! sdat (register-server pkts-dir *srvpktspec* (get-host-name) - (sdat-port sdat) server-key - (sdat-host sdat) db-file)) + (servdat-port sdat) server-key + (servdat-host sdat) db-file)) ;; now read pkts and see if we are a contender (let* ((all-pkts (get-all-server-pkts pkts-dir *srvpktspec*)) (viables (get-viable-servers all-pkts db-file)) (best-srv (get-best-candidate viables db-file)) (best-srv-key (if best-srv (alist-ref 'servkey best-srv) #f))) (debug:print 0 *default-log-port* "best-srv-key: "best-srv-key", server-key: "server-key) ;; am I the best-srv, compare server-keys to know - (if (and (equal? best-srv-key server-key) - (register-server-in-db db-file)) - (if (db:get-iam-server-lock *dbstruct-db* *toppath* run-id) + (if (equal? best-srv-key server-key) + (if (get-lock-db sdat db-file) ;; (db:get-iam-server-lock *dbstruct-db* *toppath* run-id) (debug:print 0 *default-log-port* "I'm the server!") - (bdat-time-to-exit-set! *bdat* #t))) ;; nope, we are not needed, exit when can do + (begin + (debug:print 0 *default-log-port* "I'm not the server, exiting.") + (bdat-time-to-exit-set! *bdat* #t) + (exit))) + (begin + (debug:print 0 *default-log-port* "Keys do not match "best-srv-key", "server-key", exiting.") + (bdat-time-to-exit-set! *bdat* #t))) sdat)) (begin (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes @@ -649,12 +658,12 @@ *configdat* #t) (exit)) (loop start-time (equal? sdat last-sdat) sdat))))))) - (iface (car server-info)) - (port (cadr server-info)) + (iface (servdat-host server-info)) + (port (servdat-port server-info)) (last-access 0) (server-timeout (server:expiration-timeout)) (server-log-file (args:get-arg "-log"))) ;; always set when we are a server (let loop ((count 0) @@ -670,11 +679,11 @@ (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. (if watchdog (if (not (member (thread-state watchdog) '(ready running blocked sleeping dead))) (begin - (debug:print-info 0 "Starting watchdog thread (in state "(thread-state watchdog)")") + (debug:print-info 0 *default-log-port* "Starting watchdog thread (in state "(thread-state watchdog)")") (thread-start! watchdog))) (debug:print 0 *default-log-port* "ERROR: *watchdog* not setup, cannot start it.")))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. @@ -690,20 +699,20 @@ ;; Check that iface and port have not changed (can happen if server port collides) (mutex-lock! *heartbeat-mutex*) (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) - (if (or (not (equal? (sdat-host sdat) iface)) - (not (equal? (sdat-port sdat) port))) - (let ((new-iface (car sdat)) - (new-port (cadr sdat))) + (if (or (not (equal? (servdat-host sdat) iface)) + (not (equal? (servdat-port sdat) port))) + (let ((new-iface (servdat-host sdat)) + (new-port (servdat-port sdat))) (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info") (set! iface new-iface) (set! port new-port) - (if (not *server-id*) - (set! *server-id* (server:mk-signature))) - (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) + (if (not *server-id*) + (set! *server-id* (server:mk-signature))) + ;; (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) (flush-output *default-log-port*))) ;; Transfer *db-last-access* to last-access to use in checking that we are still alive (mutex-lock! *heartbeat-mutex*) @@ -712,13 +721,13 @@ (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) (begin (if (not *server-id*) (set! *server-id* (server:mk-signature))) - (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) - (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) - (flush-output *default-log-port*))) + ;; (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) + (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) + (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) Index: launchmod.scm ================================================================== --- launchmod.scm +++ launchmod.scm @@ -463,11 +463,11 @@ (set! tconfigreg (tests:get-all)) ;; mapping of testname => test source path (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (if (eq? signum signal/stop) (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting.")) - (set! (bdat-time-to-exit *bdat*) #t) + (bdat-time-to-exit-set! *bdat* #t) (print "Received signal " signum ", cleaning up before exit (set this test to COMPLETED/ABORT) . Please wait...") (let ((th1 (make-thread (lambda () (print "set test to COMPLETED/ABORT begin.") (rmt:test-set-state-status run-id test-id "COMPLETED" "ABORT" "received kill signal") (print "set test to COMPLETED/ABORT complete.") Index: rmtmod.scm ================================================================== --- rmtmod.scm +++ rmtmod.scm @@ -1767,10 +1767,16 @@ (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.") (if (and no-hurry (debug:debug-mode 18)) (rmt:print-db-stats)) (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated + (if *server-info* + (let ((pkt-file (conc (get-pkts-dir *toppath*) + "/" (servdat-uuid *server-info*) + ".pkt"))) + (debug:print-info 0 *default-log-port* "removing pkt "pkt-file) + (delete-file* pkt-file))) (if (bdat-task-db *bdat*) (let ((db (cdr (bdat-task-db *bdat*)))) (if (sqlite3:database? db) (begin (sqlite3:interrupt! db)