Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1367,12 +1367,12 @@ (define (db:get-status-from-final-status-file run-dir) (let ((infile (conc run-dir "/.final-status"))) ;; first verify we are able to write the output file (if (not (file-read-access? infile)) (begin - (debug:print 0 *default-log-port* "ERROR: cannot read " infile) - (debug:print 0 *default-log-port* "ERROR: run-dir is " run-dir) + (debug:print 2 *default-log-port* "ERROR: cannot read " infile) + (debug:print 2 *default-log-port* "ERROR: run-dir is " run-dir) #f ) (with-input-from-file infile read-lines) ))) Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -309,12 +309,10 @@ (define (dbfile:open-sqlite3-db dbpath init-proc) (let* ((dbexists (file-exists? dbpath)) (write-access (file-write-access? dbpath)) (db (dbfile:cautious-open-database dbpath init-proc))) #;(sqlite3:open-database dbpath) (dbfile:inc-db-open dbpath) - (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 10000)) - (sqlite3:execute db (conc "PRAGMA synchronous = 0;")) ;; (init-proc db) (make-dbr:dbdat dbfile: dbpath dbh: db read-only: (not write-access)))) (define (dbfile:print-and-exit . params) (with-output-to-port @@ -479,30 +477,20 @@ (dir-access (file-write-access? (pathname-directory fname))) (retry (lambda () (thread-sleep! delay-time) (if (> tries-left 0) (dbfile:cautious-open-database fname init-proc (- tries-left 1)))))) - (assert (>= tries-left 0) (conc "FATAL: too many attempts in dbfile:cautious-open-database of "fname", giving up.")) - - (if (and (file-write-access? fname) - (file-exists? busy-file)) - (begin - (dbfile:print-err "INFO: dbfile:cautious-open-database: journal file " busy-file " exists, trying again in few seconds.") - (thread-sleep! 1) - (if (eq? tries-left 2) - (begin - (dbfile:print-err "INFO: forcing journal rollup "busy-file) - (dbfile:brute-force-salvage-db fname))) - (dbfile:cautious-open-database fname init-proc (- tries-left 1))) (let* ((result (condition-case (if dir-access (dbfile:with-simple-file-lock (conc fname ".lock") (lambda () (let* ((db-exists (file-exists? fname)) (db (sqlite3:open-database fname))) ;; creates an empty db if it did not already exist. + (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 30000)) + (sqlite3:execute db (conc "PRAGMA synchronous = 0;")) (if (and init-proc (not db-exists)) (init-proc db)) db))) (begin (if (file-exists? fname ) @@ -527,13 +515,11 @@ (retry)) (exn () (dbfile:print-err exn "ERROR: Unknown error with database " fname " message: " ((condition-property-accessor 'exn 'message) exn)) (retry))))) - #;(if (file-write-access? fname) - (dbfile:simple-file-release-lock lock-file)) - result)))) + result))) (define (dbfile:brute-force-salvage-db fname) (let* ((backupfname (conc fname"-"(current-process-id)".bak")) (cmd (conc "cp "fname" "backupfname";mv "fname" "(conc fname ".delme;") "cp "backupfname" "fname))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -233,11 +233,11 @@ (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) (directory-exists? (conc areapath "/logs"))) '())) - ;; Get the list of server logs. First remove logs for servers that have exited. + ;; Get the list of server logs. (let* ( ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) (server-logs (glob (conc areapath "/logs/server-*-*.log"))) (num-serv-logs (length server-logs))) @@ -250,11 +250,11 @@ (tal (cdr server-logs)) (res '())) (let* ((mod-time (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "failed to get modification time on " hed ", exn=" exn) + (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) (current-seconds)) ;; 0 (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted (down-time (- (current-seconds) mod-time)) (serv-dat (if (or (< num-serv-logs 10) (< down-time 900)) ;; day-seconds))