Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -311,15 +311,25 @@ ;; (serialize payload) (api:unregister-thread (current-thread)) payload)) (else (assert #f "FATAL: failed to deserialize indat "indat)))))) - + +(define *api-halt-writes* #f) (define (api:dispatch-request dbstruct cmd run-id params) (if (not *no-sync-db*) (db:open-no-sync-db)) + (if (member cmd api:write-queries) + (let loop ((start-time (current-milliseconds))) + (if *api-halt-writes* + (begin + (thread-sleep! 0.2) + (if (< (- (current-milliseconds) start-time) + 5000) ;; hope it don't take more than five seconds to sync + (loop start-time) + (debug:print 0 *default-log-port* "ERROR: writes halted for more than 5 seconds, sync might be taking too long")))))) (case cmd ;;=============================================== ;; READ/WRITE QUERIES ;;=============================================== Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -225,50 +225,21 @@ (dbr:dbstruct-dbfname-set! dbstruct dbfname) (dbr:dbstruct-sync-proc-set! dbstruct (lambda (last-update) (if *sync-in-progress* (debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk") - (let* ((syncer-logfile (conc areapath"/logs/"dbfname"-syncer.log")) - (sync-cmd (if (eq? syncdir 'todisk) - (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "tmpdb" -to "dbfullname" -period 5 -timeout 10 > /dev/null 2&>1)&") - (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "dbfullname" -to "tmpdb" -period 5 -timeout 10 > /dev/null 2&>1)&"))) - (synclock-file (conc dbfullname".lock")) - (syncer-running-file (conc dbfullname"-sync-running")) - (synclock-mod-time (if (file-exists? synclock-file) - (handle-exceptions - exn - #f - (file-modification-time synclock-file)) - #f)) - (thethread (lambda () - (thread-start! - (make-thread - (lambda () - (set! *sync-in-progress* #t) - (debug:print-info "Running "sync-cmd) - (if (file-exists? syncer-running-file) - (debug:print-info 0 *default-log-port* "Syncer still running, skipping syncer start.") - (system sync-cmd)) - (set! *sync-in-progress* #f))))))) - (if ((if (eq? syncdir 'todisk) < >) ;; use less than for todisk, greater than for from disk - (file-modification-time tmpdb) - (file-modification-time dbfullname)) - (debug:print 4 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname) - (if synclock-mod-time - (if (> (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file - (begin - (handle-exceptions - exn - #f - (begin - (debug:print 0 *default-log-port* "Sync lock file " synclock-file "is older than 20 seconds (" synclock-mod-time " seconds). Removing it") - (delete-file synclock-file) - ) - ) - (thethread)) - (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found.")) - (thethread))))))) + (begin + ;; turn off writes - send busy or block? + ;; call db2db internally + ;; turn writes back on + ;; + (set! *api-halt-writes* #t) ;; do we need a mutex? + ;; (dbmod:db-to-db-sync src-db dest-db last-update (dbfile:db-init-proc) keys) + (debug:print-info 2 *default-log-port* "Internal sync running from "tmpdb" to "dbfullname) + (dbmod:db-to-db-sync tmpdb dbfullname last-update (dbfile:db-init-proc) keys) + (set! *api-halt-writes* #f) + )))) ;; (dbmod:sync-tables tables #f db cachedb) ;; (thread-sleep! 1) ;; let things settle before syncing in needed data (dbmod:sync-gasket tables #f cachedb db dbfullname 'fromdest keys) ;; ) ;; load into cachedb (dbr:dbstruct-last-update-set! dbstruct (+ (current-seconds) -10)) ;; should this be offset back in time by one second? @@ -857,5 +828,49 @@ (sqlite3:finalize! sdb) (sqlite3:finalize! ddb) res))) #f)) ) + + +;; ATTIC + + #;(let* ((syncer-logfile (conc areapath"/logs/"dbfname"-syncer.log")) + (sync-cmd (if (eq? syncdir 'todisk) + (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "tmpdb" -to "dbfullname" -period 5 -timeout 10 > /dev/null 2&>1)&") + (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "dbfullname" -to "tmpdb" -period 5 -timeout 10 > /dev/null 2&>1)&"))) + (synclock-file (conc dbfullname".lock")) + (syncer-running-file (conc dbfullname"-sync-running")) + (synclock-mod-time (if (file-exists? synclock-file) + (handle-exceptions + exn + #f + (file-modification-time synclock-file)) + #f)) + (thethread (lambda () + (thread-start! + (make-thread + (lambda () + (set! *sync-in-progress* #t) + (debug:print-info "Running "sync-cmd) + (if (file-exists? syncer-running-file) + (debug:print-info 0 *default-log-port* "Syncer still running, skipping syncer start.") + (system sync-cmd)) + (set! *sync-in-progress* #f))))))) + (if ((if (eq? syncdir 'todisk) < >) ;; use less than for todisk, greater than for from disk + (file-modification-time tmpdb) + (file-modification-time dbfullname)) + (debug:print 4 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname) + (if synclock-mod-time + (if (> (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file + (begin + (handle-exceptions + exn + #f + (begin + (debug:print 0 *default-log-port* "Sync lock file " synclock-file "is older than 20 seconds (" synclock-mod-time " seconds). Removing it") + (delete-file synclock-file) + ) + ) + (thethread)) + (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found.")) + (thethread)))) Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -258,11 +258,11 @@ (case status ((busy) ;; result will be how long the server wants you to delay (let* ((dly (if (number? result) result 0.1))) (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, will try again in "dly" seconds.") (thread-sleep! dly) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe))) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))) ((loaded) (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.") (tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn)) result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)) (else @@ -279,11 +279,11 @@ (if (and servinf (file-exists? servinf)) (begin (if (< attemptnum 10) (begin (thread-sleep! 0.5) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)) (begin (debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname) (if (and (file-exists? servinf) (> (- (current-seconds)(file-modification-time servinf)) 60)) (begin @@ -290,30 +290,30 @@ (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.") (handle-exceptions exn #f (delete-file* servinf)) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)) (begin ;; start server - addressed in client-connect-to-server ;; delay - addressed in client-connect-to-server ;; try again (thread-sleep! 0.25) ;; dunno, I think this needs to be here - (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)) )))) (begin ;; no server file, delay and try again (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ") (thread-sleep! 0.5) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)))) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)))) (begin ;; this case is where res is malformed. Probably should abort (assert #f "FATAL: tt:handler received bad data "res) ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.") ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe) ))))) (begin (thread-sleep! 1) ;; no conn yet set up, give it a rest and try again - (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe))))) + (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc))))) (define (tt:bid-for-servership run-id) #f) ;; gets server info and appends path to server file @@ -732,14 +732,17 @@ ;; filter the files here by looking in processes table (if we are not main.db) ;; and or look at the time stamp on the servinfo file, a running server will ;; touch the file every minute (again, this will only apply for main.db) (for-each (lambda (fname) (let* ((age (- (current-seconds)(file-modification-time fname)))) - (if (> age 10) ;; can't trust it if over ten seconds old + (if (> age 20) ;; can't trust it if over twenty seconds old (begin (debug:print 0 *default-log-port* "WARNING: removing stale servinfo file "fname) - (delete-file fname)) + (handle-exceptions + exn + (debug:print 0 *default-log-port* "WARNING: error attempting to remove stale servinfo file "fname) + (delete-file fname))) ;; (set! goodfiles (cons fname goodfiles))))) sfiles) goodfiles)) ;; given a path to a server info file return: host port startseconds server-id pid dbfname logf