Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -42,10 +42,20 @@ ;; (define (exit . code) ;; (if (null? code) ;; (old-exit) ;; (old-exit code))) +(define (stop-the-train) + (thread-start! (make-thread (lambda () + (let loop () + (if (and *toppath* + (file-exists? (conc *toppath*"/stop-the-train"))) + (begin + (debug:print 0 *default-log-port* "ERROR: found file "*toppath*"/stop-the-train, exiting immediately") + (exit 1))) + (thread-sleep! 5) + (loop)))))) ;; execute thunk, return value. If exception thrown, trap exception, return #f, and emit nonfatal condition note to *default-log-port* . ;; arguments - thunk, message (define (common:fail-safe thunk warning-message-on-exception) (handle-exceptions Index: commonmod.scm ================================================================== --- commonmod.scm +++ commonmod.scm @@ -208,11 +208,15 @@ (let ((adat (get-section cfgdat "areas"))) (map (lambda (entry) `(,(car entry) . ,(val->alist (cadr entry)))) adat))) - + +;;====================================================================== +;; misc stuff +;;====================================================================== + ;; (define (debug:print . params) #f) ;; (define (debug:print-info . params) #f) ;; ;; (define (set-functions dbgp dbgpinfo) ;; (set! debug:print dbgp) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -3786,14 +3786,15 @@ "dashboard:runs-tab-updater")) ;;====================================================================== ;; The heavy lifting starts here ;;====================================================================== +(stop-the-train) (define (main) ;; (print "Starting dashboard main") - + (let* ((mtdb-path (conc *toppath* "/.megatest/main.db")) (target (args:get-arg "-target")) (commondat (dboard:commondat-make))) (if target (begin Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -331,151 +331,10 @@ (with-output-to-port (current-error-port) (lambda () (apply print params)))) -;; open an sql database inside a file lock -;; returns: db existed-prior-to-opening -;; RA => Returns a db handler; sets the lock if opened in writable mode -;; -;; (define *db-open-mutex* (make-mutex)) -;; -#;(define (dbfile:lock-create-open fname initproc) - (let* ((parent-dir (or (pathname-directory fname)(current-directory))) ;; no parent? go local - (raw-fname (pathname-file fname)) - (dir-writable (file-write-access? parent-dir)) - (file-exists (file-exists? fname)) - (file-write (if file-exists - (file-write-access? fname) - dir-writable ))) - ;; (mutex-lock! *db-open-mutex*) ;; tried this mutex, not clear it helped. - (if file-write ;; dir-writable - (condition-case - (let* ((lockfname (conc fname ".lock")) - (readyfname (conc parent-dir "/.ready-" raw-fname)) - (readyexists (common:file-exists? readyfname))) - (if (not readyexists) - (common:simple-file-lock-and-wait lockfname)) - (let ((db (sqlite3:open-database fname))) - (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)) - (sqlite3:execute db "PRAGMA synchronous = 0;") - (if (not file-exists) - (initproc db)) - (if (not readyexists) - (begin - (common:simple-file-release-lock lockfname) - (with-output-to-file - readyfname - (lambda () - (print "Ready at " - (seconds->year-work-week/day-time - (current-seconds))))))) - db)) - (exn (io-error) (dbfile:print-and-exit "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again.")) - (exn (corrupt) (dbfile:print-and-exit "ERROR: database " fname " is corrupt. Repair it to proceed.")) - (exn (busy) (dbfile:print-and-exit "ERROR: database " fname " is locked. Try copying to another location, remove original and copy back.")) - (exn (permission)(dbfile:print-and-exit "ERROR: database " fname " has some permissions problem.")) - (exn () (dbfile:print-and-exit "ERROR: Unknown error with database " fname " message: " ((condition-property-accessor 'exn 'message) exn)))) - - (condition-case - (begin - (dbfile:print-err "WARNING: opening db in non-writable dir " fname) - (let ((db (sqlite3:open-database fname))) - (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)) - (sqlite3:execute db "PRAGMA synchronous = 0;") - ;; (mutex-unlock! *db-open-mutex*) - db)) - (exn (io-error) - (dbfile:print-and-exit - "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again.")) - (exn (corrupt) - (dbfile:print-and-exit - "ERROR: database " fname " is corrupt. Repair it to proceed.")) - (exn (busy) - (dbfile:print-and-exit - "ERROR: database " fname " is locked. Try copying to another location, remove original and copy back.")) - (exn (permission) - (dbfile:print-and-exit - "ERROR: database " fname " has some permissions problem.")) - (exn () - (dbfile:print-and-exit - "ERROR: Unknown error with database " fname " message: " ((condition-property-accessor 'exn 'message) exn)))) - ))) - - -;; This routine creates the db if not already present. It is only called if the db is not already opened -;; -#;(define (db:init-dbstruct dbstruct run-id init-proc #!key (do-sync #t)) - (let* ((subdb (dbfile:get-subdb dbstruct run-id)) - (tmpdb-stack (dbr:subdb-dbstack subdb)) - (max-stale-tmp (dbr:dbstruct-max-stale-secs dbstruct));; (configf:lookup-number *configdat* "server" "filling-db-max-stale-seconds" default: 10)) - (dbpath (dbr:dbstruct-tmppath dbstruct)) ;; (db:dbfile-path)) ;; path to tmp db area - (dbname (dbfile:run-id->dbname run-id)) - (dbexists (file-exists? dbpath)) - (areapath (dbr:dbstruct-areapath dbstruct)) - (mtdbfname (conc areapath "/"dbname)) - (mtdbexists (file-exists? mtdbfname)) - (mtdbmodtime (if mtdbexists (dbfile:lazy-sqlite-db-modification-time mtdbfname) #f)) - (mtdb (db:open-sqlite-db mtdbfname init-proc)) - ;; the reference db for syncing - (refdbfname (conc dbpath "/"dbname"_ref")) - (refndb (db:open-megatest-db refdbfname)) - ;; (mtdbpath (dbr:dbdat-dbfile mtdb)) - ;; the tmpdb - (tmpdbfname (conc dbpath"/"dbname)) ;; /tmp//.megatest/[main|1,2...].db - (tmpdb (db:open-megatest-db tmpdbfname)) ;; lock-create-open dbpath db:initialize-main-db)) - (dbfexists (common:file-exists? tmpdbfname)) ;; (conc dbpath "/megatest.db"))) - (tmpdbmodtime (if dbfexists (common:lazy-sqlite-db-modification-time tmpdbfname) #f)) - - (write-access (file-write-access? mtdbfname)) - - ;; (mtdbmodtime (if mtdbexists - ;; (common:lazy-sqlite-db-modification-time mtdbpath) - ;; #f)) ; moving this before db:open-megatest-db is - ;; called. if wal mode is on -WAL and -shm file get - ;; created with causing the tmpdbmodtime timestamp - ;; always greater than mtdbmodtime (tmpdbmodtime (if - ;; dbfexists (common:lazy-sqlite-db-modification-time - ;; tmpdbfname) #f)) if wal mode is on -WAL and -shm - ;; file get created when db:open-megatest-db is - ;; called. modtimedelta will always be < 10 so db in - ;; tmp not get synced (tmpdbmodtime (if dbfexists - ;; (db:get-last-update-time (car tmpdb)) #f)) (fmt - ;; (file-modification-time tmpdbfname)) - - (modtimedelta (and mtdbmodtime tmpdbmodtime (- mtdbmodtime tmpdbmodtime)))) - - (when write-access - (sqlite3:execute (dbr:dbdat-dbh mtdb) "drop trigger if exists update_tests_trigger") - (sqlite3:execute (dbr:dbdat-dbh mtdb) "drop trigger if exists update_runs_trigger")) - - ;; (print "mtdbmodtime " mtdbmodtime " tmpdbmodtime " tmpdbmodtime " mtdbpath " mtdbpath " " (conc *toppath* "/megatest.db")) - ;; (debug:print-info 13 *default-log-port* "db:open-db>> mtdbpath="mtdbpath" mtdbexists="mtdbexists" and write-access="write-access) - (if (and dbexists (not write-access)) - (begin - (set! *db-write-access* #f) - (dbr:subdb-read-only-set! subdb #t))) - (dbr:subdb-mtdb-set! subdb mtdb) - (dbr:subdb-tmpdb-set! subdb tmpdb) - (dbr:subdb-dbstack-set! subdb (make-stack)) ;; BB: why a stack? Why would the number of db's be indeterminate? Is this a legacy of 1.db 2.db .. ? - (stack-push! (dbr:subdb-dbstack subdb) tmpdb) ;; olddb is already a (cons db path) - (dbr:subdb-refndb-set! subdb refndb) - (if (and (or (not dbfexists) - (and modtimedelta - (> modtimedelta max-stale-tmp))) ;; if db in tmp is over ten seconds older than the file in MTRA then do a sync back - do-sync) - (begin - (dbfile:print-err "filling db " (dbr:dbdat-dbfile tmpdb) " with data \n from " (dbr:dbdat-dbfile mtdb) " mod time delta: " modtimedelta) - (db:sync-tables (db:sync-all-tables-list subdb) #f mtdb refndb tmpdb) - ;; touch tmp db to avoid wal mode wierdness - (set! (file-modification-time tmpdbfname) (current-seconds)) - (dbfile:print-err "INFO: db:sync-all-tables-list done.") - ) - (dbfile:print-err " db, " (dbr:dbdat-dbfile tmpdb) " already exists or fresh enough, not propogating data from\n " (dbr:dbdat-dbfile mtdb) " mod time delta: " modtimedelta) ) - ;; (db:multi-db-sync subdb 'old2new)) ;; migrate data from megatest.db automatically - tmpdb)) - (define (dbfile:cautious-open-database fname init-proc sync-mode journal-mode #!optional (tries-left 500)) (let* ((busy-file (conc fname"-journal")) (delay-time (* (- 51 tries-left) 1.1)) (write-access (file-write-access? fname)) @@ -485,11 +344,11 @@ (if (> tries-left 0) (dbfile:cautious-open-database fname init-proc sync-mode: sync-mode journal-mode: journal-mode (- tries-left 1)))))) (assert (>= tries-left 0) (conc "FATAL: too many attempts in dbfile:cautious-open-database of "fname", giving up.")) - + (if (and (file-write-access? fname) (file-exists? busy-file)) (begin (if (common:low-noise-print 120 busy-file) (dbfile:print-err "INFO: dbfile:cautious-open-database: journal file " @@ -498,82 +357,33 @@ (if (eq? tries-left 2) (begin (dbfile:print-err "INFO: forcing journal rollup "busy-file) (dbfile:brute-force-salvage-db fname))) (dbfile:cautious-open-database fname init-proc sync-mode: sync-mode journal-mode: journal-mode (- tries-left 1))) - + (let* ((result (condition-case - (if dir-access - (dbfile:with-simple-file-lock - (conc fname ".lock") - (lambda () - (let* ((db-exists (file-exists? fname)) - (db (sqlite3:open-database fname))) ;; creates an empty db if it did not already exist. - (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 30000)) - (if sync-mode - (sqlite3:execute db (conc "PRAGMA synchronous = "sync-mode";"))) - (if journal-mode - (sqlite3:execute db (conc "PRAGMA journal_mode = "journal-mode";"))) - (if (and init-proc (not db-exists)) - (init-proc db)) - db))) - (begin - (if (file-exists? fname ) - (let ((db (sqlite3:open-database fname))) - ;; pragmas synchronous not needed because this db is used read-only - ;; (sqlite3:execute db (conc "PRAGMA synchronous = "mode";") - db ) - (print "file doesn't exist: " fname)))) - (exn (io-error) - (dbfile:print-err exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again.") - (retry)) - (exn (corrupt) - (dbfile:print-err exn "ERROR: database " fname " is corrupt. Repair it to proceed.") - (retry)) - (exn (busy) - (dbfile:print-err exn "ERROR: database " fname - " is locked. Try copying to another location, remove original and copy back.") - (retry)) - (exn (permission)(dbfile:print-err exn "ERROR: database " fname " has some permissions problem.") - (retry)) - (exn () - (dbfile:print-err exn "ERROR: Unknown error with database " fname " message: " - ((condition-property-accessor 'exn 'message) exn)) - (retry))))) - result)))) - -(define (dbfile:brute-force-salvage-db fname) - (let* ((backupfname (conc fname"-"(current-process-id)".bak")) - (cmd (conc "cp "fname" "backupfname";mv "fname" "(conc fname ".delme;") - "cp "backupfname" "fname))) - (dbfile:print-err "WARNING: attempting recovery of file "fname" by running commands:\n" - " "cmd) - (system cmd))) - -#;(define (dbfile:cautious-open-database-orig fname init-proc #!optional (tries-left 50)) - (let* ((lock-file (conc fname".lock")) - (delay-time (* (- 51 tries-left) 1.1)) - (retry (lambda () - (thread-sleep! delay-time) - (if (> tries-left 0) - (dbfile:cautious-open-database fname init-proc (- tries-left 1)))))) - (assert (>= tries-left 0) (conc "FATAL: too many attempts in dbfile:cautious-open-database of "fname", giving up.")) - (if (and (file-write-access? fname) (not (dbfile:simple-file-lock lock-file expire-time: 3))) - (begin - (dbfile:print-err "INFO: dbfile:cautious-open-database: lock file " lock-file " exists, trying again in few seconds.") - (thread-sleep! 1) - (if (eq? tries-left 2) - (begin - (dbfile:print-err "INFO: stealing the lock "lock-file) - (delete-file* lock-file))) - (dbfile:cautious-open-database fname init-proc (- tries-left 1))) - (let* ((db-exists (file-exists? fname)) - (result (condition-case - (let* ((db (sqlite3:open-database fname))) - (if (and init-proc (not db-exists)) - (init-proc db)) - db) + (if dir-access + (dbfile:with-simple-file-lock + (conc fname ".lock") + (lambda () + (let* ((db-exists (file-exists? fname)) + (db (sqlite3:open-database fname))) ;; creates an empty db if it did not already exist. + (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 30000)) + (if sync-mode + (sqlite3:execute db (conc "PRAGMA synchronous = "sync-mode";"))) + (if journal-mode + (sqlite3:execute db (conc "PRAGMA journal_mode = "journal-mode";"))) + (if (and init-proc (not db-exists)) + (init-proc db)) + db))) + (begin + (if (file-exists? fname ) + (let ((db (sqlite3:open-database fname))) + ;; pragmas synchronous not needed because this db is used read-only + ;; (sqlite3:execute db (conc "PRAGMA synchronous = "mode";") + db ) + (print "file doesn't exist: " fname)))) (exn (io-error) (dbfile:print-err exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again.") (retry)) (exn (corrupt) (dbfile:print-err exn "ERROR: database " fname " is corrupt. Repair it to proceed.") @@ -586,14 +396,19 @@ (retry)) (exn () (dbfile:print-err exn "ERROR: Unknown error with database " fname " message: " ((condition-property-accessor 'exn 'message) exn)) (retry))))) - (if (file-write-access? fname) - (dbfile:simple-file-release-lock lock-file) - ) result)))) + +(define (dbfile:brute-force-salvage-db fname) + (let* ((backupfname (conc fname"-"(current-process-id)".bak")) + (cmd (conc "cp "fname" "backupfname";mv "fname" "(conc fname ".delme;") + "cp "backupfname" "fname))) + (dbfile:print-err "WARNING: attempting recovery of file "fname" by running commands:\n" + " "cmd) + (system cmd))) (define (dbfile:open-no-sync-db dbpath) (if *no-sync-db* *no-sync-db* Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -916,10 +916,16 @@ ;; *runconfigdat* (runconfigs.config info) ;; *configstatus* (status of the read data) ;; (define (launch:setup #!key (force-reread #f) (areapath #f)) (mutex-lock! *launch-setup-mutex*) + ;; this stops the train quickly for new processes + (if (and *toppath* + (file-exists? (conc *toppath*"/stop-the-train"))) + (begin + (debug:print 0 *default-log-port* "ERROR: found file "*toppath*"/stop-the-train, exiting immediately") + (exit 1))) (if (and *toppath* (eq? *configstatus* 'fulldata) (not force-reread)) ;; got it all (begin (debug:print 2 *default-log-port* "NOTE: skipping launch:setup-body call since we have fulldata") (mutex-unlock! *launch-setup-mutex*) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -557,10 +557,12 @@ ;; (if start-watchdog ;; (thread-start! *watchdog*)) #t ) +;; stop the train watchdog +(stop-the-train) ;; bracket open-output-file with code to make leading directory if it does not exist and handle exceptions (define (open-logfile logpath-in) (condition-case (let* ((log-dir (or (pathname-directory logpath-in) "."))