Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -140,12 +140,12 @@ (sqlite3:execute db "PRAGMA synchronous = 0;") db) (let* ((parent-dir (pathname-directory fname)) (dir-writable (file-write-access? parent-dir))) (if dir-writable - (let ((lock (obtain-dot-lock fname 1 5 10)) - (exists (file-exists? fname)) + (let ((exists (file-exists? fname)) + (lock (obtain-dot-lock fname 1 5 10)) (db (sqlite3:open-database fname))) (sqlite3:set-busy-handler! db (make-busy-timeout 136000)) (sqlite3:execute db "PRAGMA synchronous = 0;") (if (not exists)(initproc db)) (release-dot-lock fname) @@ -154,11 +154,11 @@ (debug:print 0 "ERROR: no such db in non-writable dir " fname) (sqlite3:open-database fname)))))) ;; This routine creates the db. It is only called if the db is not already opened ;; -(define (db:open-rundb dbstruct run-id) ;; (conc *toppath* "/megatest.db") (car *configinfo*))) +(define (db:open-rundb dbstruct run-id #!key (attemptnum 0)) ;; (conc *toppath* "/megatest.db") (car *configinfo*))) (let* ((local (dbr:dbstruct-get-local dbstruct)) (rdb (if local (dbr:dbstruct-get-localdb dbstruct run-id) (dbr:dbstruct-get-inmem dbstruct)))) ;; (dbr:dbstruct-get-runrec dbstruct run-id 'inmem))) (if rdb @@ -167,17 +167,26 @@ (dbexists (file-exists? dbpath)) (inmem (if local #f (db:open-inmem-db))) (refdb (if local #f (db:open-inmem-db))) (db (db:lock-create-open dbpath (lambda (db) - (db:initialize-run-id-db db) - (sqlite3:execute - db - "INSERT OR IGNORE INTO tests (id,run_id,testname,event_time,item_path,state,status) VALUES (?,?,'bogustest',strftime('%s','now'),'nowherepath','DELETED','n/a');" - (* run-id 30000) ;; allow for up to 30k tests per run - run-id) - ))) ;; add strings db to rundb, not in use yet + (handle-exceptions + exn + (begin + (release-dot-lock dbpath) + (if (> attemptnum 2) + (debug:print 0 "ERROR: tried twice, cannot create/initialize db for run-id " run-id ", at path " dbpath) + (db:open-rundb dbstruct run-id attemptnum (+ attemptnum 1)))) + (db:initialize-run-id-db db) + (sqlite3:execute + db + "INSERT OR IGNORE INTO tests (id,run_id,testname,event_time,item_path,state,status) VALUES (?,?,'bogustest',strftime('%s','now'),'nowherepath','DELETED','n/a');" + (* run-id 30000) ;; allow for up to 30k tests per run + run-id) + ;; do a dummy query to test that the table exists and the db is truly readable + (sqlite3:execute db "SELECT * FROM tests WHERE id=?;" (* run-id 30000)) + )))) ;; add strings db to rundb, not in use yet ;; )) ;; (sqlite3:open-database dbpath)) (olddb (if *megatest-db* *megatest-db* (let ((db (db:open-megatest-db))) (set! *megatest-db* db) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -276,14 +276,14 @@ ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines) ;; ((exn http client-error) e (print e))) (set! res (handle-exceptions exn (begin - (debug:print 0 "WARNING: failure in with-input-from-request to " fullrul ". Killing associated server to allow clean retry.") + (debug:print 0 "WARNING: failure in with-input-from-request to " fullurl ". Killing associated server to allow clean retry.") (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (hash-table-delete! *runremote* run-id) - (tasks:kill-server-run-id run-id) + ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine. #f) (with-input-from-request ;; was dat fullurl (list (cons 'key "thekey") (cons 'cmd cmd) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -56,11 +56,11 @@ #f)))) ;; cmd is a symbol ;; vars is a json string encoding the parameters for the call ;; -(define (rmt:send-receive cmd rid params) +(define (rmt:send-receive cmd rid params #!key (attemptnum 0)) ;; clean out old connections (mutex-lock! *db-multi-sync-mutex*) (let ((expire-time (- (current-seconds) 60))) (for-each (lambda (run-id) @@ -93,11 +93,17 @@ (if res (db:string->obj res) (let ((new-connection-info (client:setup run-id))) (debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.") (hash-table-delete! *runremote* run-id) ;; don't keep using the same connection - (rmt:send-receive cmd run-id params)))) + + ;; no longer killing the server in http-transport:client-api-send-receive + ;; may kill it here but what are the criteria? + ;; start with three calls then kill server + (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id)) + + (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))))) (let ((max-avg-qry (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) (debug:print-info 4 "no server and read-only query, bypassing normal channel") ;; (if (rmt:write-frequency-over-limit? cmd run-id)(server:kind-run run-id)) (let ((curr-max (rmt:get-max-query-average run-id))) (if (> (cdr curr-max) max-avg-qry)