Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -493,13 +493,18 @@ (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) (debug:print 0 " src db: " (db:dbdat-get-path fromdb)) (for-each (lambda (dbdat) (debug:print 0 " dbpath: " (db:dbdat-get-path dbdat))) (cons todb slave-dbs)) - (if *server-run* ;; we are inside a server - (set! *time-to-exit* #t) ;; let watch dog know that it is time to die. - (exit 1))) + (if *server-run* ;; we are inside a server, throw a sync-failed error + (signal (make-composite-condition + (make-property-condition 'sync-failed 'message "db:sync-tables failed in a server context."))))) + + ;; (set! *time-to-exit* #t) ;; let watch dog know that it is time to die. + ;; (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") + ;; (portlogger:open-run-close portlogger:set-port port "released") + ;; (exit 1))) (cond ((not fromdb) (debug:print 3 "WARNING: db:sync-tables called with fromdb missing") -1) ((not todb) (debug:print 3 "WARNING: db:sync-tables called with todb missing") -2) ((not (sqlite3:database? (db:dbdat-get-db fromdb))) (debug:print 0 "ERROR: db:sync-tables called with fromdb not a database " fromdb) -3) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -392,45 +392,57 @@ (iface (car server-info)) (port (cadr server-info)) (last-access 0) (server-timeout (server:get-timeout))) (let loop ((count 0) - (server-state 'available)) - + (server-state 'available) + (bad-sync-count 0)) ;; Use this opportunity to sync the inmemdb to db - (let ((start-time (current-milliseconds)) - (sync-time #f) - (rem-time #f)) - ;; inmemdb is a dbstruct - (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) - (set! sync-time (- (current-milliseconds) start-time)) - (set! rem-time (quotient (- 4000 sync-time) 1000)) - (debug:print 2 "SYNC: time= " sync-time ", rem-time=" rem-time) - - ;; - ;; set_running after our first pass through and start the db - ;; - (if (eq? server-state 'available) - (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers - (if (equal? new-server-id server-id) - (begin - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") - (thread-sleep! 5) ;; give some margin for queries to complete before switching from file based access to server based access - (set! *inmemdb* (db:setup run-id)) - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")) - (begin ;; gotta exit nicely - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") - (http-transport:server-shutdown server-id port))))) - - (if (and (<= rem-time 4) - (> rem-time 0)) - (thread-sleep! rem-time) - (thread-sleep! 4))) ;; fallback for if the math is changed ... + (if *inmemdb* + (let ((start-time (current-milliseconds)) + (sync-time #f) + (rem-time #f)) + ;; inmemdb is a dbstruct + (condition-case + (db:sync-touched *inmemdb* *run-id* force-sync: #t) + ((sync-failed)(cond + ((> bad-sync-count 10) ;; time to give up + (http-transport:server-shutdown server-id port)) + (else ;; (> bad-sync-count 0) ;; we've had a fail or two, delay and loop + (thread-sleep! 5) + (loop count server-state (+ bad-sync-count 1))))) + ((exn) + (debug:print 0 "ERROR: error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server") + (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running crashed") + (exit))) + (set! sync-time (- (current-milliseconds) start-time)) + (set! rem-time (quotient (- 4000 sync-time) 1000)) + (debug:print 2 "SYNC: time= " sync-time ", rem-time=" rem-time) + + (if (and (<= rem-time 4) + (> rem-time 0)) + (thread-sleep! rem-time) + (thread-sleep! 4))) ;; fallback for if the math is changed ... + + ;; + ;; no *inmemdb* yet, set running after our first pass through and start the db + ;; + (if (eq? server-state 'available) + (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers + (if (equal? new-server-id server-id) + (begin + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") + (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access + (set! *inmemdb* (db:setup run-id)) + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")) + (begin ;; gotta exit nicely + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") + (http-transport:server-shutdown server-id port)))))) (if (< count 1) ;; 3x3 = 9 secs aprox - (loop (+ count 1) 'running)) + (loop (+ count 1) 'running bad-sync-count)) ;; Check that iface and port have not changed (can happen if server port collides) (mutex-lock! *heartbeat-mutex*) (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) @@ -463,11 +475,11 @@ ;; the db indicates so ;; ;; (if (tasks:server-am-i-the-server? tdb run-id) ;; (tasks:server-set-state! tdb server-id "running")) ;; - (loop 0 server-state)) + (loop 0 server-state bad-sync-count)) (http-transport:server-shutdown server-id port))))) (define (http-transport:server-shutdown server-id port) (let ((tdbdat (tasks:open-db))) (debug:print-info 0 "Starting to shutdown the server.") Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -588,11 +588,16 @@ (debug:print 0 "WARNING: linktree did not exist! Creating it now at " linktree) (create-directory linktree #t))) ;; (system (conc "mkdir -p " linktree)))) ;; create the directory for the tests dir links, this is needed no matter what... (if (and (not (directory-exists? lnkbase)) (not (file-exists? lnkbase))) - (create-directory lnkbase #t)) + (handle-exceptions + exn + (begin + (debug:print "ERROR: Problem creating linktree base at " lnkbase) + (print-error-message exn (current-error-port))) + (create-directory lnkbase #t))) ;; update the toptest record with its location rundir, cache the path ;; This wass highly inefficient, one db write for every subtest, potentially ;; thousands of unnecessary updates, cache the fact it was set and don't set it ;; again. Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1623,11 +1623,11 @@ (exit 3)) ((not runname) (debug:print 0 "ERROR: Missing required parameter for " switchname ", you must specify the run name with -runname runname") (exit 3)) (else - (let ((db #f) + (let (;; (db #f) (keys #f)) (if (not (launch:setup-for-run)) (begin (debug:print 0 "Failed to setup, exiting") (exit 1))) @@ -1641,12 +1641,13 @@ (if (hash-table-ref/default runconfig (args:get-arg "-reqtarg") #f) (keys:target-set-args keys (args:get-arg "-reqtarg") args:arg-hash) (begin (debug:print 0 "ERROR: [" (args:get-arg "-reqtarg") "] not found in " runconfigf) - (if db (sqlite3:finalize! db)) - (exit 1)))) + ;; (if db (sqlite3:finalize! db)) + (exit 1) + ))) (if (args:get-arg "-target") (keys:target-set-args keys (args:get-arg "-target" args:arg-hash) args:arg-hash))) (if (not (car *configinfo*)) (begin (debug:print 0 "ERROR: Attempted to " action-desc " but run area config file not found") @@ -1653,11 +1654,11 @@ (exit 1)) ;; Extract out stuff needed in most or many calls ;; here then call proc (let* ((keyvals (keys:target->keyval keys target))) (proc target runname keys keyvals))) - (if db (sqlite3:finalize! db)) + ;; (if db (sqlite3:finalize! db)) (set! *didsomething* #t)))))) ;;====================================================================== ;; Lock/unlock runs ;;======================================================================