Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -195,11 +195,11 @@ rdb (let* ((dbpath (db:dbfile-path run-id)) ;; (conc toppath "/db/" run-id ".db")) (dbexists (file-exists? dbpath)) (inmem (if local #f (db:open-inmem-db))) (refdb (if local #f (db:open-inmem-db))) - (db (db:lock-create-open dbpath + (db (db:lock-create-open dbpath ;; this is the database physically on disk (lambda (db) (handle-exceptions exn (begin (release-dot-lock dbpath) @@ -234,14 +234,18 @@ (begin (dbr:dbstruct-set-localdb! dbstruct run-id db) ;; (dbr:dbstruct-set-inmem! dbstruct db) ;; direct access ... db) (begin (dbr:dbstruct-set-inmem! dbstruct inmem) + (sqlite3:execute db "DELETE FROM tests WHERE state='DELETED';") ;; they just slow us down in this context (db:sync-tables db:sync-tests-only db inmem) - (db:delay-if-busy dbpath: (db:dbdat-get-path refdb)) + (db:delay-if-busy refdb) ;; dbpath: (db:dbdat-get-path refdb)) (dbr:dbstruct-set-refdb! dbstruct refdb) (db:sync-tables db:sync-tests-only db refdb) + ;; sync once more to deal with delays + (db:sync-tables db:sync-tests-only db inmem) + (db:sync-tables db:sync-tests-only db refdb) inmem)))))) ;; This routine creates the db. It is only called if the db is not already ls opened ;; (define (db:open-main dbstruct) ;; (conc *toppath* "/megatest.db") (car *configinfo*))) @@ -292,10 +296,11 @@ (refdb (dbr:dbstruct-get-refdb dbstruct)) (olddb (dbr:dbstruct-get-olddb dbstruct)) ;; (runid (dbr:dbstruct-get-run-id dbstruct)) ) (debug:print-info 4 "Syncing for run-id: " run-id) + (mutex-lock! *http-mutex*) (if (eq? run-id 0) ;; runid equal to 0 is main.db (if maindb (if (or (not (number? mtime)) (not (number? stime)) @@ -322,12 +327,15 @@ (begin (db:delay-if-busy rundb) (db:delay-if-busy olddb) (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb))) (dbr:dbstruct-set-stime! dbstruct (current-milliseconds)) + (mutex-unlock! *http-mutex*) num-synced) - 0))))) + (begin + (mutex-unlock! *http-mutex*) + 0)))))) (define (db:close-main dbstruct) (let ((maindb (dbr:dbstruct-get-main dbstruct))) (if maindb (begin Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -493,11 +493,11 @@ "n/a (no queries)" (/ *total-non-write-delay* *number-non-write-queries*)) " ms") (debug:print-info 0 "Server shutdown complete. Exiting") - (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running") + (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete") (exit))) ;; all routes though here end in exit ... ;; ;; start_server? @@ -539,19 +539,12 @@ server-id)) "Server run")) (th3 (make-thread (lambda () (debug:print-info 0 "Server monitor thread started") (http-transport:keep-running server-id run-id)) "Keep running"))) - ;; Database connection - - - ;; don't start the db here - - ;; (set! *inmemdb* (db:setup run-id)) - - (thread-start! th2) + (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. (thread-start! th3) (set! *didsomething* #t) (thread-join! th2) (exit)))))) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -99,11 +99,13 @@ ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* ;; (let ((test-info (rmt:get-testinfo-state-status run-id test-id))) (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") - (debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed"))) + (begin + (debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed") + (exit)))) (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name)) (set! keys (rmt:get-keys)) ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process ;; one of these is defunct/redundant ... Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -86,12 +86,12 @@ (connection-info (rmt:get-connection-info run-id)) (jparams (db:obj->string params))) (if connection-info ;; use the server if have connection info (let* ((dat (http-transport:client-api-send-receive run-id connection-info cmd jparams)) - (res (if (vector? dat) (vector-ref dat 1) #f)) - (success (if (vector? dat) (vector-ref dat 0) #f))) + (res (if (and dat (vector? dat)) (vector-ref dat 1) #f)) + (success (if (and dat (vector? dat)) (vector-ref dat 0) #f))) (http-transport:server-dat-update-last-access connection-info) (if success (db:string->obj res) ;; (if (< attemptnum 100) ;; (begin @@ -108,11 +108,11 @@ ;; no longer killing the server in http-transport:client-api-send-receive ;; may kill it here but what are the criteria? ;; start with three calls then kill server (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id)) - + (thread-sleep! 2) (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))))) (if (and (< attemptnum 10) (tasks:need-server run-id)) (begin (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -554,11 +554,11 @@ (list (car newtal)(append (cdr newtal) reg) '() reruns)))) ((and (null? fails) (null? prereq-fails) (null? non-completed)) - (if (runs:can-keep-running? hed 5) + (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) (debug:print-info 1 "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal @@ -673,13 +673,18 @@ ;; Register tests ;; ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f)) (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs - (rmt:general-call 'register-test run-id run-id test-name item-path) - (if (rmt:get-test-id run-id test-name item-path) - (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)) + (let register-loop ((numtries 15)) + (rmt:general-call 'register-test run-id run-id test-name item-path) + (thread-sleep! 0.5) + (if (rmt:get-test-id run-id test-name item-path) + (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done) + (if (> numtries 0) + (register-loop (- numtries 1)) + (debug:print 0 "ERROR: failed to register test " (runs:make-full-test-name test-name item-path))))) (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done)) (begin (rmt:general-call 'register-test run-id run-id test-name "") (if (rmt:get-test-id run-id test-name "") (hash-table-set! test-registry (runs:make-full-test-name test-name "") 'done)))) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -167,11 +167,11 @@ (define (tasks:server-lock-slot mdb run-id) (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") (if (< (tasks:num-in-available-state mdb run-id) 4) (begin (tasks:server-set-available mdb run-id) - (thread-sleep! 2) ;; Try removing this. It may not be needed. + ;; (thread-sleep! 2) ;; Try removing this. It may not be needed. (tasks:server-am-i-the-server? mdb run-id)) #f)) ;; register that this server may come online (first to register goes though with the process) (define (tasks:server-set-available mdb run-id) @@ -344,10 +344,19 @@ (lambda (id) (set! res id)) mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND (strftime('%s','now') - start_time) < 60));" run-id) res)) + +(define (tasks:server-running? mdb run-id) + (let ((res #f)) + (sqlite3:for-each-row + (lambda (id) + (set! res id)) + mdb ;; NEEDS dbprep ADDED + "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id) + res)) (define (tasks:need-server run-id) (let ((forced (configf:lookup *configdat* "server" "required")) (maxqry (cdr (rmt:get-max-query-average run-id))) (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) @@ -405,10 +414,11 @@ (sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) (if sdat (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed") (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) Index: tests/fullrun/megatest.config ================================================================== --- tests/fullrun/megatest.config +++ tests/fullrun/megatest.config @@ -133,17 +133,18 @@ port 8080 # This server will keep running this number of hours after last access. # Three minutes is 0.05 hours # timeout 0.025 -timeout 0.01 +timeout 0.1 # Server is required - slower but more resistant to Sqlite issues. # required yes # Start server when average query takes longer than this -server-query-threshold 55500 +server-query-threshold 100 +# 55500 # daemonize yes # hostname #{scheme (get-host-name)} ## disks are: