Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -57,10 +57,11 @@ (thread-start! (make-thread (lambda () (let loop () (if (and *toppath* (file-exists? (conc *toppath*"/stop-the-train"))) (let* ((msg (conc "ERROR: found file "*toppath*"/stop-the-train, exiting immediately"))) + ;; yes, print to current-output-port AND *default-log-port*, annoying but necessary I think (print msg) (debug:print 0 *default-log-port* msg) (exit 1))) (thread-sleep! 5) (loop)))))) Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -82,10 +82,11 @@ ;; this is one db per server (inmem #f) ;; handle for the in memory copy (dbfile #f) ;; path to the db file on disk (dbfname #f) ;; short name of db file on disk (used to validate accessing correct db) (ondiskdb #f) ;; handle for the on-disk file + (dbtmpname #f) ;; path to db file in /tmp (non-imem method) (dbdat #f) ;; create a dbdat for the downstream calls such as db:with-db (last-update 0) (sync-proc #f) ) @@ -516,10 +517,12 @@ (if newres newres res)) res))) +;; timestring+identifier+payload +;; locks are unique on identifier, payload is informational (define (db:extract-time-identifier instr) (let ((tokens (string-split instr "+"))) (match tokens ((t i)(cons (string->number t) i)) ((t) (cons (string->number t) #f)) @@ -526,12 +529,12 @@ (else (assert #f "FATAL: db:extract-time-identifier handed bad data "instr))))) ;; transaction protected lock aquisition ;; either: -;; fails returns (#f . lock-creation-time) -;; succeeds (returns (#t . lock-creation-time) +;; fails returns (#f lock-creation-time identifier) +;; succeeds (returns (#t lock-creation-time identifier) ;; use (db:no-sync-del! db keyname) to release the lock ;; (define (db:no-sync-get-lock-with-id db keyname identifier) (sqlite3:with-transaction db @@ -540,18 +543,19 @@ (let* ((curr-val (db:no-sync-get/default db keyname #f))) (if curr-val (match (db:extract-time-identifier curr-val) ;; result->timestamp, identifier ((timestamp ident) (if (equal? ident identifier) - #t ;; this *is* my lock - #f)) ;; nope, not my lock - (else #f)) ;; nope, not my lock - (let ((lock-value (if identifier + (cons #t timestamp) ;; this *is* my lock + (cons #f timestamp))) ;; nope, not my lock + (else (cons #f #f))) ;; nope, not my lock + (let ((curr-sec (current-seconds)) + (lock-value (if identifier (conc (current-seconds)"+"identifier) (current-seconds)))) (sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-value) - #t))) + (cons #t curr-sec)))) (exn (io-error) (dbfile:print-err "ERROR: i/o error with no-sync db. Check permissions, disk space etc. and try again.")) (exn (corrupt) (dbfile:print-err "ERROR: database no-sync db is corrupt. Repair it to proceed.")) (exn (busy) (dbfile:print-err "ERROR: database no-sync db is locked. Try copying to another location, remove original and copy back.")) (exn (permission)(dbfile:print-err "ERROR: database no-sync db has some permissions problem.")) (exn () ;; (status done) ;; I don't know how to detect status done but no data! Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -97,11 +97,11 @@ (dbfile (dbr:dbdat-dbfile dbdat))) ;; if nfs mode do a sync if delta > 2 (let* ((last-update (dbr:dbstruct-last-update dbstruct)) (sync-proc (dbr:dbstruct-sync-proc dbstruct)) (curr-secs (current-seconds))) - (if (> (- curr-secs last-update) 3) + (if (> (- curr-secs last-update) 5) (begin (sync-proc last-update) ;; MOVE THIS CALL TO INSIDE THE sync-proc CALL (dbr:dbstruct-last-update-set! dbstruct curr-secs) @@ -178,11 +178,12 @@ (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) (dbpath (dbmod:get-dbdir dbstruct)) ;; directory where all the .db files are kept (dbfullname (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id)) (dbexists (file-exists? dbfullname)) (tmpdir (conc "/tmp/"(current-user-name))) - (tmpdb (let* ((fname (conc tmpdir"/" (string-translate areapath "/" ".")"-"(current-process-id)"-"dbfname))) + (tmpdb (let* ((fname (conc tmpdir"/" (string-translate areapath "/" ".") ;; "-"(current-process-id) + "-"dbfname))) (if (not (file-exists? tmpdir))(create-directory tmpdir)) ;; check if tmpdb already exists, either delete it or ;; add something to the name fname)) (inmem (dbmod:open-inmem-db init-proc @@ -201,28 +202,30 @@ (exit))) ;; (assert (sqlite3:database? inmem) "FATAL: open-dbmoddb: inmem is not a db") ;; (assert (sqlite3:database? db) "FATAL: open-dbmoddb: db is not a db") (dbr:dbstruct-inmem-set! dbstruct inmem) (dbr:dbstruct-ondiskdb-set! dbstruct db) (dbr:dbstruct-dbfile-set! dbstruct dbfullname) + (dbr:dbstruct-dbtmpname-set! dbstruct tmpdb) (dbr:dbstruct-dbfname-set! dbstruct dbfname) (dbr:dbstruct-sync-proc-set! dbstruct (lambda (last-update) (if *sync-in-progress* (debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk") - (let* ((sync-cmd (conc "megatest -db2db -from "tmpdb" -to "dbfullname))) - (mutex-lock! *db-with-db-mutex*) ;; this mutex is used when overloaded or during a query that modifies the db + (let* ((sync-cmd (conc "megatest -db2db -from "tmpdb" -to "dbfullname"&"))) + ;; (mutex-lock! *db-with-db-mutex*) ;; this mutex is used when overloaded or during a query that modifies the db (set! *sync-in-progress* #t) ;; (if (eq? (dbfile:cache-method) 'inmem) ;; (dbmod:sync-gasket tables last-update inmem db ;; dbfullname syncdir keys) (thread-start! (make-thread (lambda () (debug:print-info "Running "sync-cmd) - (system sync-cmd)))) - (mutex-unlock! *db-with-db-mutex*) - (thread-sleep! 0.5) ;; ensure at least 1/2 second down time between sync calls - (set! *sync-in-progress* #f))))) + (system sync-cmd) + ;; (mutex-unlock! *db-with-db-mutex*) + ;; (thread-sleep! 0.5) ;; ensure at least 1/2 second down time between sync calls + (set! *sync-in-progress* #f)))) + )))) ;; (dbmod:sync-tables tables #f db inmem) ;; (if db (dbmod:sync-gasket tables #f inmem db dbfullname 'fromdest keys) ;; ) ;; load into inmem (dbr:dbstruct-last-update-set! dbstruct (current-seconds)) ;; should this be offset back in time by one second? dbstruct)) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -2573,26 +2573,32 @@ (let* ((duh (launch:setup)) (src-db (args:get-arg "-from")) (dest-db (args:get-arg "-to")) (sync-period (args:get-arg "-period")) ;; NOT IMPLEMENTED YET (sync-timeout (args:get-arg "-timeout")) ;; NOT IMPLEMENTED YET - (lockfile (conc dest-db".lock")) + (lockfile (conc dest-db".sync-lock")) (keys (db:get-keys #f)) ) (if (and src-db dest-db) (if (file-exists? src-db) - (begin - (debug:print-info 0 *default-log-port* "Attempting to sync data from "src-db" to "dest-db"...") - (if (not (file-exists? dest-db)) - (begin - (debug:print 0 *default-log-port* "Using copy to create "dest-db" from "src-db) - (file-copy src-db dest-db)) - (let ((res (dbmod:db-to-db-sync src-db dest-db 0 (dbfile:db-init-proc) keys))) - (if res - (debug:print-info 0 *default-log-port* "Synced " res " records from "src-db" to "dest-db) - (debug:print-info 0 *default-log-port* "No sync due to permissions or other issue."))))) + (if (file-exists? lockfile) + (debug:print 0 *default-log-port* "Lock "lockfile" exists, skipping sync...") + (begin + (with-output-to-file lockfile + (lambda () + (print (current-process-id)))) + (debug:print-info 0 *default-log-port* "Attempting to sync data from "src-db" to "dest-db"...") + (if (not (file-exists? dest-db)) + (begin + (debug:print 0 *default-log-port* "Using copy to create "dest-db" from "src-db) + (file-copy src-db dest-db)) + (let ((res (dbmod:db-to-db-sync src-db dest-db 0 (dbfile:db-init-proc) keys))) + (if res + (debug:print-info 0 *default-log-port* "Synced " res " records from "src-db" to "dest-db) + (debug:print-info 0 *default-log-port* "No sync due to permissions or other issue.")))) + (delete-file* lockfile))) (debug:print 0 *default-log-port* "No sync due to unreadble or non-existant source file"src-db)) (debug:print 0 *default-log-port* "Usage for -db2db; -to and -from must be specified")) (set! *didsomething* #t))) (if (args:get-arg "-list-test-time") Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -489,11 +489,15 @@ (cleanup (lambda () (if (tt-cleanup-proc ttdat) ((tt-cleanup-proc ttdat))) (dbfile:with-no-sync-db nosyncdbpath (lambda (db) - (db:no-sync-del! db dbfname)))))) + (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct))) + (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname) + (db:no-sync-del! db dbfname) + #;(if dbtmpname + (delete-file dbtmpname)))))))) (set! *server-info* ttdat) (let loop ((count 0)) (if (> count 240) (begin (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.") @@ -513,17 +517,19 @@ ((null? servers) #f) ;; not ok ((equal? (list-ref (car servers) 6) ;; compare the servinfofile (tt-servinf-file ttdat)) (let* ((res (if db-locked-in #t - (let* ((success (dbfile:with-no-sync-db - nosyncdbpath - (lambda (db) - (db:no-sync-get-lock-with-id db dbfname - ;; (tt-servinf-file ttdat) ;; does NOT work, must be unique to the dbname which seems silly but makes sense! - areapath ;; as good as anything - ))))) + (let* ((lock-result + (dbfile:with-no-sync-db + nosyncdbpath + (lambda (db) + (db:no-sync-get-lock-with-id db dbfname + ;; (tt-servinf-file ttdat) + (dbr:dbstruct-dbtmpname dbstruct) + )))) + (success (car lock-result))) (if success (begin (tt-state-set! ttdat 'running) (debug:print 0 *default-log-port* "Got server lock for " dbfname) @@ -530,12 +536,11 @@ (set! db-locked-in #t) #t) (begin (debug:print 0 *default-log-port* "Failed to get server lock for "dbfname) #f)))))) - (if (and res - (common:low-noise-print 120 "top server message")) + (if (and res (common:low-noise-print 120 "top server message")) (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for " dbfname" on "(tt-host ttdat)":"(tt-port ttdat))) res)) (else (debug:print-info 0 *default-log-port* "I'm not the lead server: "servers) @@ -564,12 +569,10 @@ #t))))) (else ;; should never get here (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv) (assert #f "Bad server record "leadsrv)))))))) (if ok - ;; (if (> *api-process-request-count* 0) ;; have requests in flight - ;; (tt-last-access-set! ttdat (current-seconds))) (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access (begin (debug:print 0 *default-log-port* "Exiting immediately") (cleanup) (exit))) @@ -647,11 +650,12 @@ (host (tt-host ttdat)) (port (tt-port ttdat)) (servinf (conc servdir"/"host":"port"-"(current-process-id)":"dbfname)) (serv-id (tt:mk-signature areapath)) (clean-proc (lambda () - (delete-file* servinf)))) + (delete-file* servinf) + ))) (assert (and host port) "FATAL: tt:create-server-registration-file called with no conn, dbfname="dbfname) (tt-cleanup-proc-set! ttdat clean-proc) (tt-servinf-file-set! ttdat servinf) (with-output-to-file servinf (lambda ()