Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -62,10 +62,12 @@ synchash-get )) (define api:write-queries '( + get-keys-write ;; dummy "write" query to force server start + ;; SERVERS start-server kill-server ;; TESTS @@ -189,10 +191,11 @@ ;;====================================================================== ;; KEYS ((get-key-val-pairs) (apply db:get-key-val-pairs dbstruct params)) ((get-keys) (db:get-keys dbstruct)) + ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server ((get-key-vals) (apply db:get-key-vals dbstruct params)) ((get-target) (apply db:get-target dbstruct params)) ((get-targets) (db:get-targets dbstruct)) ;; ARCHIVES Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -553,35 +553,44 @@ ;; if #t use timestamps : or 'timestamps (define (common:sync-to-megatest.db dbstruct) (let ((start-time (current-seconds)) (res (db:multi-db-sync dbstruct 'new2old))) (let ((sync-time (- (current-seconds) start-time))) - (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds") + (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id)) (if (common:low-noise-print 30 "sync new to old") - (debug:print-info 0 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds"))) + (debug:print-info 0 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id)))) res)) + + + +(define *wdnum* 0) +(define *wdnum*mutex (make-mutex)) ;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp ;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) ;; (define (common:watchdog) + (thread-sleep! 0.05) ;; delay for startup (let ((legacy-sync (common:run-sync?)) (debug-mode (debug:debug-mode 1)) - (last-time (current-seconds))) - (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync) - (if legacy-sync + (last-time (current-seconds)) + (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))) + ) + (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num) + (if (and legacy-sync (not *time-to-exit*)) (let ((dbstruct (db:setup))) (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") (let loop () + (BB> "watchdog loop. pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) ;; sync for filesystem local db writes ;; (mutex-lock! *db-multi-sync-mutex*) - (let* ( - (need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write + (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write (sync-in-progress *db-sync-in-progress*) - (should-sync (> (- (current-seconds) *db-last-sync*) 5)) ;; sync every five seconds minimum + (should-sync (and (not *time-to-exit*) + (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum (will-sync (and (or need-sync should-sync) (not sync-in-progress))) (start-time (current-seconds))) ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) (if will-sync (set! *db-sync-in-progress* #t)) @@ -609,18 +618,20 @@ ;; keep going unless time to exit ;; (if (not *time-to-exit*) (let delay-loop ((count 0)) + (BB> "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) + (if (and (not *time-to-exit*) (< count 4)) ;; was 11, changing to 4. (begin (thread-sleep! 1) (delay-loop (+ count 1)))) - (loop))) + (if (not *time-to-exit*) (loop)))) (if (common:low-noise-print 30) - (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*))))))) + (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num))))))) (define (std-exit-procedure) (on-exit (lambda () 0)) (let ((no-hurry (if *time-to-exit* ;; hurry up #f @@ -641,11 +652,11 @@ ;; (vector-set! *task-db* 0 #f) (set! *task-db* #f))))) (if (and *runremote* (remote-conndat *runremote*)) (begin - (close-all-connections!))) ;; for http-client + (http-client#close-all-connections!))) ;; for http-client (if (not (eq? *default-log-port* (current-error-port))) (close-output-port *default-log-port*)) (set! *default-log-port* (current-error-port))) "Cleanup db exit thread")) (th2 (make-thread (lambda () (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...") Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -486,11 +486,11 @@ ;; (thread-sleep! rem-time) ;; (thread-sleep! 4))) ;; fallback for if the math is changed ... (define (http-transport:server-shutdown server-id port) (let ((tdbdat (tasks:open-db))) - (debug:print-info 0 *default-log-port* "Starting to shutdown the server.") + (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id)) ;; ;; start_shutdown ;; (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -350,11 +350,11 @@ ;; The watchdog is to keep an eye on things like db sync etc. ;; (define *watchdog* (make-thread common:watchdog "Watchdog thread")) (thread-start! *watchdog*) - +;; (BB> "thread-start! watchdog") (if (args:get-arg "-log") (let ((oup (open-output-file (args:get-arg "-log")))) (debug:print-info 0 *default-log-port* "Sending log output to " (args:get-arg "-log")) (set! *default-log-port* oup))) @@ -1990,11 +1990,11 @@ ;; Exit and clean up ;;====================================================================== (if (not *didsomething*) (debug:print 0 *default-log-port* help)) - +(BB> "thread-join! watchdog") (thread-join! *watchdog*) (set! *time-to-exit* #t) (if (not (eq? *globalexitstatus* 0)) (if (or (args:get-arg "-run")(args:get-arg "-runtests")(args:get-arg "-runall")) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -86,23 +86,38 @@ ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (member cmd api:read-only-queries)) ;; this is a read (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") (rmt:open-qry-close-locally cmd 0 params)) + + ;; on homehost and this is a write, we already have a server, but server has died + ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost + (not (member cmd api:read-only-queries)) ;; this is a write + (remote-server-url *runremote*) ;; have a server + (not (server:read-dotserver *toppath*))) ;; server has died. + (set! *runremote* #f) + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") + (rmt:send-receive cmd rid params attemptnum: attemptnum)) + ;; on homehost and this is a write, we already have a server ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*)) ;; have a server (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") (rmt:open-qry-close-locally cmd 0 params)) - ;; on homehost and this is a write, we have a server (we know because case 4 checked) - ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost - (not (member cmd api:read-only-queries))) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") - (rmt:open-qry-close-locally cmd 0 params)) + + ;; commented by bb; this was blocking server passive start on write on homehost (case 5) + ;; ;; on homehost and this is a write, we have a server (we know because case 4 checked) + ;; ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost + ;; (not (member cmd api:read-only-queries))) + ;; (mutex-unlock! *rmt-mutex*) + ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") + ;; (rmt:open-qry-close-locally cmd 0 params)) + + ;; no server contact made and this is a write, passively start a server ((and (not (remote-server-url *runremote*)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") (let ((serverconn (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call @@ -349,10 +364,15 @@ (define (rmt:get-keys) (if *db-keys* *db-keys* (let ((res (rmt:send-receive 'get-keys #f '()))) (set! *db-keys* res) res))) + +(define (rmt:get-keys-write) ;; dummy query to force server start + (let ((res (rmt:send-receive 'get-keys-write #f '()))) + (set! *db-keys* res) + res)) ;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe ;; to cache the resuls in a hash ;; (define (rmt:get-key-vals run-id)