Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -287,40 +287,42 @@ ;; (define *watchdog* (make-thread (lambda () (thread-sleep! 0.05) ;; delay for startup - (let loop () - ;; sync for filesystem local db writes - ;; - (let ((start-time (current-seconds)) - (servers-started (make-hash-table))) - (for-each - (lambda (run-id) - (mutex-lock! *db-multi-sync-mutex*) - (if (hash-table-ref/default *db-local-sync* run-id #f) - ;; (if (> (- start-time last-write) 5) ;; every five seconds - (begin ;; let ((sync-time (- (current-seconds) start-time))) - (db:multi-db-sync (list run-id) 'new2old) - (if (common:low-noise-print 30 "sync new to old") - (let ((sync-time (- (current-seconds) start-time))) - (debug:print-info 0 "Sync of newdb to olddb for run-id " run-id " completed in " sync-time " seconds") - (if (> sync-time 10) ;; took more than ten seconds, start a server for this run - (begin - (debug:print-info 0 "Sync is taking a long time, start up a server to assist for run " run-id) - (server:kind-run run-id))))) - (hash-table-delete! *db-local-sync* run-id))) - (mutex-unlock! *db-multi-sync-mutex*)) - (hash-table-keys *db-local-sync*))) - - ;; keep going unless time to exit - ;; - (if (not *time-to-exit*) - (begin - (thread-sleep! 1) ;; wait one second before syncing again - (loop))))) - "Watchdog thread")) + (let ((legacy-sync (configf:lookup *configdat* "setup" "megatest-db"))) + (let loop () + ;; sync for filesystem local db writes + ;; + (let ((start-time (current-seconds)) + (servers-started (make-hash-table))) + (for-each + (lambda (run-id) + (mutex-lock! *db-multi-sync-mutex*) + (if (and legacy-sync + (hash-table-ref/default *db-local-sync* run-id #f)) + ;; (if (> (- start-time last-write) 5) ;; every five seconds + (begin ;; let ((sync-time (- (current-seconds) start-time))) + (db:multi-db-sync (list run-id) 'new2old) + (if (common:low-noise-print 30 "sync new to old") + (let ((sync-time (- (current-seconds) start-time))) + (debug:print-info 0 "Sync of newdb to olddb for run-id " run-id " completed in " sync-time " seconds"))) + ;; (if (> sync-time 10) ;; took more than ten seconds, start a server for this run + ;; (begin + ;; (debug:print-info 0 "Sync is taking a long time, start up a server to assist for run " run-id) + ;; (server:kind-run run-id))))) + (hash-table-delete! *db-local-sync* run-id))) + (mutex-unlock! *db-multi-sync-mutex*)) + (hash-table-keys *db-local-sync*))) + + ;; keep going unless time to exit + ;; + (if (not *time-to-exit*) + (begin + (thread-sleep! 1) ;; wait one second before syncing again + (loop))))) + "Watchdog thread"))) (thread-start! *watchdog*) (if (args:get-arg "-log") (let ((oup (open-output-file (args:get-arg "-log")))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -110,29 +110,16 @@ ;; may kill it here but what are the criteria? ;; start with three calls then kill server (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id)) (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))))) - (let ((max-avg-qry (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) - (if (and (< attemptnum 10) - (configf:lookup *configdat* "server" "required")) - (begin - (debug:print-info 0 "Server required mode, attempting to start server and retry query in ten seconds") - (server:kind-run run-id) - (thread-sleep! 10) - (rmt:send-receive cmd rid params (+ attemptnum 1))) - ;; (if (rmt:write-frequency-over-limit? cmd run-id)(server:kind-run run-id)) - (let* ((curr-max (rmt:get-max-query-average run-id)) - (curr-max-val (cdr curr-max))) - (debug:print-info 4 "no server and read-only query, bypassing normal channel") - (if (> curr-max-val max-avg-qry) - (if (common:low-noise-print 10 "start server due to max average query too long") - (begin - (debug:print-info 0 "Max average query, " (inexact->exact (round curr-max-val)) "ms (" (car curr-max) ") exceeds " max-avg-qry "ms, try starting server ...") - (server:kind-run run-id)) - (debug:print-info 3 "Max average query, " (inexact->exact (round curr-max-val)) "ms (" (car curr-max) ") below " max-avg-qry "ms, not starting server..."))) - (rmt:open-qry-close-locally cmd run-id params))))))) + (if (and (< attemptnum 10) + (tasks:need-server run-id)) + (begin + (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10) + (rmt:send-receive cmd rid params (+ attemptnum 1))) + (rmt:open-qry-close-locally cmd run-id params))))) (define (rmt:update-db-stats run-id rawcmd params duration) (mutex-lock! *db-stats-mutex*) (handle-exceptions exn Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -216,11 +216,11 @@ (test-names #f) ;; (tests:filter-test-names all-test-names test-patts)) (required-tests #f) ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work (task-key (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id))) (tdbdat (tasks:open-db))) - (tasks:start-and-wait-for-server tdbdat run-id 10) + (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) (set-signal-handler! signal/int (lambda (signum) (signal-mask! signum) (print "Received signal " signum ", cleaning up before exit. Please wait...") @@ -935,14 +935,15 @@ (newtal (append tal (list hed))) (regfull (>= (length reg) reglen)) (num-running (rmt:get-count-tests-running-for-run-id run-id))) ;; every couple minutes verify the server is there for this run - (if (common:low-noise-print 60 "try start server" run-id) + (if (and (common:low-noise-print 60 "try start server" run-id) + (tasks:need-server run-id)) (tasks:start-and-wait-for-server tdbdat run-id 10)) - - (if (> num-running 0) + + (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running 240)) (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1))) ;; (debug:print 0 "max-tries-hash: " (hash-table->alist *max-tries-hash*)) @@ -1433,18 +1434,18 @@ (debug:print-info 4 "runs:operate-on run=" run ", header=" header) (if (not (null? tests)) (begin (case action ((remove-runs) - (tasks:start-and-wait-for-server tdbdat run-id 10) + (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) ;; seek and kill in flight -runtests with % as testpatt here (if (equal? testpatt "%") (tasks:kill-runner (db:delay-if-busy tdbdat) target run-name) (debug:print 0 "not attempting to kill any run launcher processes as testpatt is " testpatt)) (debug:print 1 "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((set-state-status) - (tasks:start-and-wait-for-server tdbdat run-id 10) + (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) (debug:print 1 "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((print-run) (debug:print 1 "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header) action) ((run-wait) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -344,10 +344,26 @@ (lambda (id) (set! res id)) mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND (strftime('%s','now') - start_time) < 60));" run-id) res)) + +(define (tasks:need-server run-id) + (let ((forced (configf:lookup *configdat* "server" "required")) + (maxqry (cdr (rmt:get-max-query-average run-id))) + (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) + (cond + (forced + (if (common:low-noise-print 60 run-id "server required is set") + (debug:print-info 0 "Server required is set, starting server.")) + #t) + ((> maxqry threshold) + (if (common:low-noise-print 60 run-id "Max query time execeeded") + (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, starting server.")) + #t) + (else + #f)))) ;; try to start a server and wait for it to be available ;; (define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries) ;; ensure a server is running for this run @@ -354,11 +370,12 @@ (let loop ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)) (delay-time 0)) (if (and (not server-dat) (< delay-time delay-max-tries)) (begin - (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)(debug:print 0 "Try starting server for run-id " run-id)) + (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id) + (debug:print 0 "Try starting server for run-id " run-id)) (server:kind-run run-id) (thread-sleep! (min delay-time 5)) (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1)))))) (define (tasks:get-all-servers mdb) Index: tests/fullrun/megatest.config ================================================================== --- tests/fullrun/megatest.config +++ tests/fullrun/megatest.config @@ -139,11 +139,11 @@ # Server is required - slower but more resistant to Sqlite issues. # required yes # Start server when average query takes longer than this -server-query-threshold 15 +server-query-threshold 55500 # daemonize yes # hostname #{scheme (get-host-name)} ## disks are: