Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -254,14 +254,16 @@ (if maindb (if (or (not (number? mtime)) (not (number? stime)) (> mtime stime) force-sync) - (let ((num-synced (db:sync-tables (db:sync-main-list maindb) maindb olddb))) - (dbr:dbstruct-set-stime! dbstruct (current-milliseconds)) - num-synced) - 0) + (begin + (db:delay-if-busy) + (let ((num-synced (db:sync-tables (db:sync-main-list maindb) maindb olddb))) + (dbr:dbstruct-set-stime! dbstruct (current-milliseconds)) + num-synced) + 0)) (begin ;; this can occur when using local access (i.e. not in a server) ;; need a flag to turn it off. ;; (debug:print 3 "WARNING: call to sync main.db to megatest.db but main not initialized") @@ -269,14 +271,16 @@ ;; any other runid is a run (if (or (not (number? mtime)) (not (number? stime)) (> mtime stime) force-sync) - (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb))) - (dbr:dbstruct-set-stime! dbstruct (current-milliseconds)) - num-synced) - 0)))) + (begin + (db:delay-if-busy) + (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb))) + (dbr:dbstruct-set-stime! dbstruct (current-milliseconds)) + num-synced) + 0))))) ;; close all opened run-id dbs (define (db:close-all dbstruct) ;; finalize main.db (db:sync-touched dbstruct 0 force-sync: #t) @@ -511,11 +515,13 @@ (let* ((toppath (launch:setup-for-run)) (dbstruct (if toppath (make-dbr:dbstruct path: toppath) #f)) (mtdb (if toppath (db:open-megatest-db))) (run-ids (if run-ids run-ids - (if toppath (db:get-all-run-ids mtdb)))) + (if toppath (begin + (db:delay-if-busy) + (db:get-all-run-ids mtdb))))) (mdb (tasks:open-db)) (servers (tasks:get-all-servers mdb))) ;; kill servers (if (member 'killservers options) @@ -526,24 +532,29 @@ servers)) ;; clear out junk records ;; (if (member 'dejunk options) - (db:clean-up mtdb)) + (begin + (db:delay-if-busy) + (db:clean-up mtdb))) ;; adjust test-ids to fit into proper range ;; (if (member 'adj-testids options) - (db:prep-megatest.db-for-migration mtdb)) + (begin + (db:delay-if-busy) + (db:prep-megatest.db-for-migration mtdb))) ;; sync runs, test_meta etc. ;; (if (member 'old2new options) (begin (db:sync-tables (db:sync-main-list mtdb) mtdb (db:get-db dbstruct #f)) (for-each (lambda (run-id) + (db:delay-if-busy) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)) (dbstruct (if toppath (make-dbr:dbstruct path: toppath local: #t) #f))) (debug:print 0 "INFO: Propagating " (length testrecs) " records for run-id=" run-id " to run specific db") (db:replace-test-records dbstruct run-id testrecs) (sqlite3:finalize! (dbr:dbstruct-get-rundb dbstruct)))) @@ -551,10 +562,11 @@ ;; now ensure all newdb data are synced to megatest.db (if (member 'new2old options) (for-each (lambda (run-id) + (db:delay-if-busy) (let ((fromdb (if toppath (make-dbr:dbstruct path: toppath local: #t) #f))) (if (eq? run-id 0) (db:sync-tables (db:sync-main-list dbstruct)(db:get-db fromdb run-id) mtdb) (db:sync-tables db:sync-tests-only (db:get-db fromdb run-id) mtdb)))) run-ids)) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -155,14 +155,16 @@ (http-transport:try-start-server run-id ipaddrstr (portlogger:open-run-close portlogger:find-port) server-id)) (begin + (tasks:wait-on-busy-monitor.db) (tasks:server-force-clean-run-record (tasks:get-db) run-id ipaddrstr portnum " http-transport:try-start-server") (print "ERROR: Tried and tried but could not start the server")))) ;; any error in following steps will result in a retry (set! *server-info* (list ipaddrstr portnum)) + (tasks:wait-on-busy-monitor.db) (tasks:server-set-interface-port (tasks:get-db) server-id ipaddrstr portnum) (debug:print 0 "INFO: Trying to start server on " ipaddrstr ":" portnum) @@ -173,10 +175,11 @@ (start-server port: portnum bind-address: (if (equal? config-hostname "-") ipaddrstr config-hostname)) (start-server port: portnum)) ;; (portlogger:open-run-close portlogger:set-port portnum "released") + (tasks:wait-on-busy-monitor.db) (tasks:server-force-clean-run-record (tasks:get-db) run-id ipaddrstr portnum " http-transport:try-start-server") (debug:print 1 "INFO: server has been stopped")))) ;;====================================================================== ;; S E R V E R U T I L I T I E S @@ -256,10 +259,11 @@ (close-all-connections!)) (debug:print 0 "WARNING: Failed to communicate with server, trying again, numretries left: " numretries) (http-transport:client-api-send-receive run-id serverdat cmd params numretries: (- numretries 1))) (begin (mutex-unlock! *http-mutex*) + (tasks:wait-on-busy-monitor.db) (tasks:kill-server-run-id run-id) #f)) (begin (debug:print-info 11 "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n") ;; set up the http-client here @@ -369,10 +373,11 @@ (debug:print-info 0 "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes (let ((tdb (tasks:open-db))) (debug:print 0 "ERROR: transport appears to have died, exiting server " server-id " for run " run-id) + (tasks:wait-on-busy-monitor.db) (tasks:server-delete-record tdb server-id "failed to start, never received server alive signature") (sqlite3:finalize! tdb) (exit)) (loop start-time (equal? sdat last-sdat) @@ -404,10 +409,11 @@ ;; ;; set_running after our first pass through and start the db ;; (if (eq? server-state 'available) (begin + (tasks:wait-on-busy-monitor.db) (tasks:server-set-state! tdb server-id "dbprep") (thread-sleep! 5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *inmemdb* (db:setup run-id)) (tasks:server-set-state! tdb server-id "running"))) @@ -463,10 +469,11 @@ (loop 0 server-state)) (begin (debug:print-info 0 "Starting to shutdown the server.") ;; need to delete only *my* server entry (future use) (set! *time-to-exit* #t) + (tasks:wait-on-busy-monitor.db) ;; wait here in addition to just before the shutting-down (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) ;; ;; start_shutdown ;; (tasks:server-set-state! tdb server-id "shutting-down") @@ -486,10 +493,11 @@ "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms") (debug:print-info 0 "Server shutdown complete. Exiting") + (tasks:wait-on-busy-monitor.db) (tasks:server-delete-record tdb server-id " http-transport:keep-running") (exit)))))) ;; all routes though here end in exit ... ;; @@ -506,21 +514,24 @@ (current-output-port *alt-log-file*))))) (if (server:check-if-running run-id) (begin (debug:print 0 "INFO: Server for run-id " run-id " already running") (exit 0))) + (tasks:wait-on-busy-monitor.db) (let loop ((server-id (tasks:server-lock-slot (tasks:get-db) run-id)) (remtries 4)) (if (not server-id) (if (> remtries 0) (begin (thread-sleep! 2) + (tasks:wait-on-busy-monitor.db) (loop (tasks:server-lock-slot (tasks:get-db) run-id) (- remtries 1))) (begin ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") + (tasks:wait-on-busy-monitor.db) (tasks:server-delete-records-for-this-pid (tasks:get-db) " http-transport:launch") )) (let* ((th2 (make-thread (lambda () (debug:print-info 0 "Server run thread started") (http-transport:run Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -500,19 +500,19 @@ (for-each (lambda (disk-num) (let* ((dirpath (cadr (assoc disk-num disks))) (freespc (cond ((not (directory? dirpath)) - (if (common:low-noise-print 20 "disks" disk-num) + (if (common:low-noise-print 50 "disks not a dir " disk-num) (debug:print 0 "WARNING: disk " disk-num " at path " dirpath " is not a directory - ignoring it.")) -1) ((not (file-write-access? dirpath)) - (if (common:low-noise-print 20 "disks" disk-num) + (if (common:low-noise-print 50 "disks not writeable " disk-num) (debug:print 0 "WARNING: disk " disk-num " at path " dirpath " is not writeable - ignoring it.")) -1) ((not (eq? (string-ref dirpath 0) #\/)) - (if (common:low-noise-print 20 "disks" disk-num) + (if (common:low-noise-print 50 "disks not a proper path " disk-num) (debug:print 0 "WARNING: disk " disk-num " at path " dirpath " is not a fully qualified path - ignoring it.")) -1) (else (get-df dirpath))))) (if (> freespc bestsize) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -97,14 +97,15 @@ (let ((max-avg-qry (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) (debug:print-info 4 "no server and read-only query, bypassing normal channel") ;; (if (rmt:write-frequency-over-limit? cmd run-id)(server:kind-run run-id)) (let ((curr-max (rmt:get-max-query-average run-id))) (if (> (cdr curr-max) max-avg-qry) - (begin - (debug:print-info 0 "Max average query, " (inexact->exact (round (cdr curr-max))) "ms (" (car curr-max) ") exceeds " max-avg-qry ", try starting server ...") - (server:kind-run run-id)) - (debug:print-info 3 "Max average query, " (inexact->exact (round (cdr curr-max))) "ms (" (car curr-max) ") below " max-avg-qry ", not starting server..."))) + (if (common:low-noise-print 10 "start server due to max average query too long") + (begin + (debug:print-info 0 "Max average query, " (inexact->exact (round (cdr curr-max))) "ms (" (car curr-max) ") exceeds " max-avg-qry ", try starting server ...") + (server:kind-run run-id) + (debug:print-info 3 "Max average query, " (inexact->exact (round (cdr curr-max))) "ms (" (car curr-max) ") below " max-avg-qry ", not starting server..."))))) (rmt:open-qry-close-locally cmd run-id params))))) (define (rmt:update-db-stats run-id rawcmd params duration) (mutex-lock! *db-stats-mutex*) (handle-exceptions Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -160,13 +160,13 @@ (hash-table-set! *runs:denoise* key currtime) #t) #f))) (define (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) - (thread-sleep! (cond - ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while - (else 0))) + ;;(thread-sleep! (cond + ;; ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while + ;; (else 0))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (config-lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) @@ -1117,11 +1117,11 @@ (debug:print-info 0 "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) (rmt:find-and-mark-incomplete run-id #f))) (if (not (eq? num-running prev-num-running)) (debug:print-info 0 "run-wait specified, waiting on " num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds))))) - (thread-sleep! 15) + (thread-sleep! 5) ;; (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! (debug:print-info 1 "All tests launched"))) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -51,10 +51,13 @@ (vector-ref *task-db* 1) (let* ((linktree (configf:lookup *configdat* "setup" "linktree")) (dbpath (conc linktree "/.db/monitor.db"))) dbpath))) +(define (tasks:wait-on-busy-monitor.db) + (tasks:wait-on-journal (tasks:get-task-db-path) 30)) + ;; If file exists AND ;; file readable ;; ==> open it ;; If file exists AND ;; file NOT readable