Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -2293,14 +2293,24 @@ ;; 'new2old ) (set! *didsomething* #t))) (if (args:get-arg "-sync-to-megatest.db") - (let ((res (db:multi-db-sync - (db:setup #f) - 'new2old))) - (print "Synced " res " records to megatest.db") + (let* ((dbstruct (db:setup #f)) + (tmpdbpth (cdr (dbr:dbstruct-tmpdb dbstruct))) + (lockfile (conc tmpdbpth ".lock")) + (locked (common:simple-file-lock lockfile)) + (res (if locked + (db:multi-db-sync + dbstruct + 'new2old) + #f))) + (if res + (begin + (common:simple-file-release-lock lockfile) + (print "Synced " res " records to megatest.db")) + (print "Skipping sync, there is a sync in progress.")) (set! *didsomething* #t))) (if (args:get-arg "-sync-to") (let ((toppath (launch:setup))) (tasks:sync-to-postgres *configdat* (args:get-arg "-sync-to")) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -482,164 +482,208 @@ (string-intersperse (map number->string (u8vector->list (if res res (hostname->ip hostname)))) "."))) -(define server:sync-lock-token "SERVER_SYNC_LOCK") -(define (server:release-sync-lock) - (db:no-sync-del! *no-sync-db* server:sync-lock-token)) -(define (server:have-sync-lock?) - (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token)) - (have-lock? (car have-lock-pair)) - (lock-time (cdr have-lock-pair)) - (lock-age (- (current-seconds) lock-time))) - (cond - (have-lock? #t) - ((>lock-age - (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180))) - (server:release-sync-lock) - (server:have-sync-lock?)) - (else #f)))) +;; (define server:sync-lock-token "SERVER_SYNC_LOCK") +;; (define (server:release-sync-lock) +;; (db:no-sync-del! *no-sync-db* server:sync-lock-token)) +;; (define (server:have-sync-lock?) +;; (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token)) +;; (have-lock? (car have-lock-pair)) +;; (lock-time (cdr have-lock-pair)) +;; (lock-age (- (current-seconds) lock-time))) +;; (cond +;; (have-lock? #t) +;; ((>lock-age +;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180))) +;; (server:release-sync-lock) +;; (server:have-sync-lock?)) +;; (else #f)))) ;; moving this here as it needs access to db and cannot be in common. ;; (define (server:writable-watchdog dbstruct) - (thread-sleep! 0.05) ;; delay for startup - (let ((legacy-sync (common:run-sync?)) - (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300)) - (debug-mode (debug:debug-mode 1)) - (last-time (current-seconds)) - (no-sync-db (db:open-no-sync-db)) - (sync-duration 0) ;; run time of the sync in milliseconds - ;;(this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))) - ) - (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls - (debug:print-info 2 *default-log-port* "Periodic sync thread started.") - (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id) );; " this-wd-num="this-wd-num) - (if (and legacy-sync (not *time-to-exit*)) - (let* (;;(dbstruct (db:setup)) - (mtdb (dbr:dbstruct-mtdb dbstruct)) - (mtpath (db:dbdat-get-path mtdb)) - (tmp-area (common:get-db-tmp-area)) - (start-file (conc tmp-area "/.start-sync")) - (end-file (conc tmp-area "/.end-sync"))) - (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") - (let loop () - ;; sync for filesystem local db writes - ;; - (mutex-lock! *db-multi-sync-mutex*) - (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write - (sync-in-progress *db-sync-in-progress*) - - (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 5)) - (should-sync (and (not *time-to-exit*) - (> (- (current-seconds) *db-last-sync*) min-intersync-delay))) ;; sync every five seconds minimum, deprecated logic, can probably be removed - (start-time (current-seconds)) - (cpu-load-adj (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f))) - (mt-mod-time (file-modification-time mtpath)) - (last-sync-start (if (common:file-exists? start-file) - (file-modification-time start-file) - 0)) - (last-sync-end (if (common:file-exists? end-file) - (file-modification-time end-file) - 10)) - (sync-period (+ 3 (* cpu-load-adj 30))) ;; as adjusted load increases increase the sync period - (recently-synced (and (< (- start-time mt-mod-time) sync-period) ;; not useful if sync didn't modify megatest.db! - (< mt-mod-time last-sync-start))) - (sync-done (<= last-sync-start last-sync-end)) - (sync-stale (> start-time (+ last-sync-start sync-stale-seconds))) - (will-sync-pre (and (not *time-to-exit*) ;; do not start a sync if we are in the process of exiting - have-lock? - (or need-sync should-sync) - (or sync-done sync-stale) - (not sync-in-progress) - (not recently-synced))) - (will-sync (if will-sync-pre - ;; delay get lock until we decide to sync - (server:have-sync-lock?) - #f))) - ;; if another server is syncing, postpone sync - (if (and will-sync-pre (not will-sync)) - (set! *db-last-sync* start-time)) - (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop. need-sync="need-sync" sync-in-progress=" sync-in-progress - " should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync - " sync-done=" sync-done " sync-period=" sync-period) - (if (and (> sync-period 5) - (common:low-noise-print 30 "sync-period")) - (debug:print-info 0 *default-log-port* "Increased sync period due to long sync times, sync took: " sync-period " seconds.")) - ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) - ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) - (if will-sync (set! *db-sync-in-progress* #t)) - (mutex-unlock! *db-multi-sync-mutex*) - (if will-sync - (let (;; (max-sync-duration (configf:lookup-number *configdat* "server" "max-sync-duration")) ;; KEEPING THIS AVAILABLE BUT SHOULD NOT USE, I'M PRETTY SURE IT DOES NOT WORK! - (sync-start (current-milliseconds))) - (with-output-to-file start-file (lambda ()(print (current-process-id)))) - - ;; put lock here - - ;; (if (or (not max-sync-duration) - ;; (< sync-duration max-sync-duration)) ;; NOTE: db:sync-to-megatest.db keeps track of time of last sync and syncs incrementally - (let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive - (set! sync-duration (- (current-milliseconds) sync-start)) - (if (> res 0) ;; some records were transferred, keep the db alive - (begin - (mutex-lock! *heartbeat-mutex*) - (set! *db-last-access* (current-seconds)) - (mutex-unlock! *heartbeat-mutex*) - (debug:print-info 0 *default-log-port* "sync called, " res " records transferred.")) - (debug:print-info 2 *default-log-port* "sync called but zero records transferred"))))) -;; ;; TODO: factor this next routine out into a function -;; (with-input-from-pipe ;; this should not block other threads but need to verify this -;; (conc "megatest -sync-to-megatest.db -m testsuite:" (common:get-area-name) ":" *toppath*) -;; (lambda () -;; (let loop ((inl (read-line)) -;; (res #f)) -;; (if (eof-object? inl) -;; (begin -;; (set! sync-duration (- (current-milliseconds) sync-start)) -;; (cond -;; ((not res) -;; (debug:print 0 *default-log-port* "ERROR: sync from /tmp db to megatest.db appears to have failed. Recommended that you stop your runs and run \"megatest -cleanup-db\"")) -;; ((> res 0) -;; (mutex-lock! *heartbeat-mutex*) -;; (set! *db-last-access* (current-seconds)) -;; (mutex-unlock! *heartbeat-mutex*)))) -;; (let ((num-synced (let ((matches (string-match "^Synced (\\d+).*$" inl))) -;; (if matches -;; (string->number (cadr matches)) -;; #f)))) -;; (loop (read-line) -;; (or num-synced res)))))))))) - (if will-sync - (begin - (mutex-lock! *db-multi-sync-mutex*) - (set! *db-sync-in-progress* #f) - (set! *db-last-sync* start-time) - (with-output-to-file end-file (lambda ()(print (current-process-id)))) - - ;; release lock here - (server:release-sync-lock) - (mutex-unlock! *db-multi-sync-mutex*))) - (if (and debug-mode - (> (- start-time last-time) 60)) - (begin - (set! last-time start-time) - (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*)))))) - - ;; keep going unless time to exit - ;; - (if (not *time-to-exit*) - (let delay-loop ((count 0)) - ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) - - (if (and (not *time-to-exit*) - (< count 6)) ;; was 11, changing to 4. - (begin - (thread-sleep! 1) - (delay-loop (+ count 1)))) - (if (not *time-to-exit*) (loop)))) - ;; time to exit, close the no-sync db here - (db:no-sync-close-db no-sync-db) - (if (common:low-noise-print 30) - (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))))) ;;" this-wd-num="this-wd-num))))))) + (thread-sleep! 10) ;; delay for startup + (let* ((legacy-sync (common:run-sync?)) + (tmp-area (common:get-db-tmp-area)) + (tmp-db (conc tmp-area "/megatest.db")) + (staging-file (conc *toppath* "/.megatest.db")) + (mtdbfile (conc *toppath* "/megatest.db")) + (lockfile (conc tmp-db ".lock")) + (cmdline (conc "megatest -sync-to-megatest.db " + (if (args:get-arg "-log") + (conc " -log " (args:get-arg "-log")) + (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log")))) + (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 30))) + (if (and (not (args:get-arg "-sync-to-megatest.db")) ;; conditions under which we do not run the sync + (args:get-arg "-server")) + (let loop () + (thread-sleep! min-intersync-delay) + (if (not (common:file-exists? lockfile)) + (begin + (delete-file* staging-file) + (system (conc "sqlite3 " tmp-db " .dump | sqlite3 " staging-file)) + (delete-file* (conc mtdbfile ".backup")) + (system (conc "mv " staging-file " " mtdbfile)) + ;; (system "megatest -sync-to-megatest.db&")) + )) + + ;; keep going unless time to exit + ;; + (if (not *time-to-exit*) + (let delay-loop ((count 0)) + ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) + + (if (and (not *time-to-exit*) + (< count 6)) ;; was 11, changing to 4. + (begin + (thread-sleep! 1) + (delay-loop (+ count 1)))) + (if (not *time-to-exit*) (loop)))) + ;; time to exit, close the no-sync db here + ;; (db:no-sync-close-db no-sync-db) + (if (common:low-noise-print 30) + (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) + )))))) + +;; (let ((legacy-sync (common:run-sync?))) +;; (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300)) +;; (debug-mode (debug:debug-mode 1)) +;; (last-time (current-seconds)) +;; (no-sync-db (db:open-no-sync-db)) +;; (sync-duration 0) ;; run time of the sync in milliseconds +;; ;;(this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))) +;; ) +;; (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls +;; (debug:print-info 2 *default-log-port* "Periodic sync thread started.") +;; (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id) );; " this-wd-num="this-wd-num) +;; (if (and legacy-sync (not *time-to-exit*)) +;; (let* (;;(dbstruct (db:setup)) +;; (mtdb (dbr:dbstruct-mtdb dbstruct)) +;; (mtpath (db:dbdat-get-path mtdb)) +;; (tmp-area (common:get-db-tmp-area)) +;; (lockfile (conc tmp-area "/megatest.db.lock")) +;; (start-file (conc tmp-area "/.start-sync")) +;; (end-file (conc tmp-area "/.end-sync"))) +;; (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") +;; (let loop () +;; ;; sync for filesystem local db writes +;; ;; +;; +;; (mutex-lock! *db-multi-sync-mutex*) +;; (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write +;; (sync-in-progress *db-sync-in-progress*) +;; +;; (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 5)) +;; (should-sync (and (not *time-to-exit*) +;; (> (- (current-seconds) *db-last-sync*) min-intersync-delay))) ;; sync every five seconds minimum, deprecated logic, can probably be removed +;; (start-time (current-seconds)) +;; (cpu-load-adj (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f))) +;; (mt-mod-time (file-modification-time mtpath)) +;; (last-sync-start (if (common:file-exists? start-file) +;; (file-modification-time start-file) +;; 0)) +;; (last-sync-end (if (common:file-exists? end-file) +;; (file-modification-time end-file) +;; 10)) +;; (sync-period (+ 3 (* cpu-load-adj 30))) ;; as adjusted load increases increase the sync period +;; (recently-synced (and (< (- start-time mt-mod-time) sync-period) ;; not useful if sync didn't modify megatest.db! +;; (< mt-mod-time last-sync-start))) +;; (sync-done (<= last-sync-start last-sync-end)) +;; (sync-stale (> start-time (+ last-sync-start sync-stale-seconds))) +;; (will-sync-pre (and (not *time-to-exit*) ;; do not start a sync if we are in the process of exiting +;; have-lock? +;; (or need-sync should-sync) +;; (or sync-done sync-stale) +;; (not sync-in-progress) +;; (not recently-synced))) +;; (will-sync (if will-sync-pre +;; ;; delay get lock until we decide to sync +;; #t ;; (server:have-sync-lock?) +;; #f))) +;; ;; if another server is syncing, postpone sync +;; (if (and will-sync-pre (not will-sync)) +;; (set! *db-last-sync* start-time)) +;; (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop. need-sync="need-sync" sync-in-progress=" sync-in-progress +;; " should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync +;; " sync-done=" sync-done " sync-period=" sync-period) +;; (if (and (> sync-period 5) +;; (common:low-noise-print 30 "sync-period")) +;; (debug:print-info 0 *default-log-port* "Increased sync period due to long sync times, sync took: " sync-period " seconds.")) +;; ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) +;; ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) +;; (if will-sync (set! *db-sync-in-progress* #t)) +;; (mutex-unlock! *db-multi-sync-mutex*) +;; (if will-sync +;; (let (;; (max-sync-duration (configf:lookup-number *configdat* "server" "max-sync-duration")) ;; KEEPING THIS AVAILABLE BUT SHOULD NOT USE, I'M PRETTY SURE IT DOES NOT WORK! +;; (sync-start (current-milliseconds))) +;; (with-output-to-file start-file (lambda ()(print (current-process-id)))) +;; +;; ;; put lock here +;; +;; ;; (if (or (not max-sync-duration) +;; ;; (< sync-duration max-sync-duration)) ;; NOTE: db:sync-to-megatest.db keeps track of time of last sync and syncs incrementally +;; (let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive +;; (set! sync-duration (- (current-milliseconds) sync-start)) +;; (if (> res 0) ;; some records were transferred, keep the db alive +;; (begin +;; (mutex-lock! *heartbeat-mutex*) +;; (set! *db-last-access* (current-seconds)) +;; (mutex-unlock! *heartbeat-mutex*) +;; (debug:print-info 0 *default-log-port* "sync called, " res " records transferred.")) +;; (debug:print-info 2 *default-log-port* "sync called but zero records transferred"))))) +;; ;; ;; TODO: factor this next routine out into a function +;; ;; (with-input-from-pipe ;; this should not block other threads but need to verify this +;; ;; (conc "megatest -sync-to-megatest.db -m testsuite:" (common:get-area-name) ":" *toppath*) +;; ;; (lambda () +;; ;; (let loop ((inl (read-line)) +;; ;; (res #f)) +;; ;; (if (eof-object? inl) +;; ;; (begin +;; ;; (set! sync-duration (- (current-milliseconds) sync-start)) +;; ;; (cond +;; ;; ((not res) +;; ;; (debug:print 0 *default-log-port* "ERROR: sync from /tmp db to megatest.db appears to have failed. Recommended that you stop your runs and run \"megatest -cleanup-db\"")) +;; ;; ((> res 0) +;; ;; (mutex-lock! *heartbeat-mutex*) +;; ;; (set! *db-last-access* (current-seconds)) +;; ;; (mutex-unlock! *heartbeat-mutex*)))) +;; ;; (let ((num-synced (let ((matches (string-match "^Synced (\\d+).*$" inl))) +;; ;; (if matches +;; ;; (string->number (cadr matches)) +;; ;; #f)))) +;; ;; (loop (read-line) +;; ;; (or num-synced res)))))))))) +;; (if will-sync +;; (begin +;; (mutex-lock! *db-multi-sync-mutex*) +;; (set! *db-sync-in-progress* #f) +;; (set! *db-last-sync* start-time) +;; (with-output-to-file end-file (lambda ()(print (current-process-id)))) +;; +;; ;; release lock here +;; ;; (server:release-sync-lock) +;; (mutex-unlock! *db-multi-sync-mutex*))) +;; (if (and debug-mode +;; (> (- start-time last-time) 60)) +;; (begin +;; (set! last-time start-time) +;; (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*)))))) +;; +;; ;; keep going unless time to exit +;; ;; +;; (if (not *time-to-exit*) +;; (let delay-loop ((count 0)) +;; ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) +;; +;; (if (and (not *time-to-exit*) +;; (< count 6)) ;; was 11, changing to 4. +;; (begin +;; (thread-sleep! 1) +;; (delay-loop (+ count 1)))) +;; (if (not *time-to-exit*) (loop)))) +;; ;; time to exit, close the no-sync db here +;; (db:no-sync-close-db no-sync-db) +;; (if (common:low-noise-print 30) +;; (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))))) ;;" this-wd-num="this-wd-num)))))))