Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -265,10 +265,12 @@ ;; send the data and get the response ;; extract the needed info from the http data and ;; process and return it. (let* ((send-recieve (lambda () (mutex-lock! *http-mutex*) + ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines) + ;; ((exn http client-error) e (print e))) (set! res (with-input-from-request ;; was dat fullurl (list (cons 'key "thekey") (cons 'cmd cmd) (cons 'params params)) @@ -376,10 +378,12 @@ ;; ;; set_running after our first pass through and start the db ;; (if (eq? server-state 'available) (begin + (tasks:server-set-state! tdb server-id "dbprep") + (thread-sleep! 5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *inmemdb* (db:setup run-id)) (tasks:server-set-state! tdb server-id "running"))) (if (and (<= rem-time 4) (> rem-time 0)) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -69,24 +69,23 @@ ;; NB// can cache the answer for server running for 10 seconds ... ;; (if (and (not (rmt:write-frequency-over-limit? cmd run-id)) (not (open-run-close tasks:server-running-or-starting? tasks:open-db run-id))) #f - (let loop ((numtries 100)) + (let loop ((numtries 2)) (let ((res (client:setup run-id))) (if res (hash-table-ref/default *runremote* run-id #f) ;; client:setup filled this in (hopefully) (if (> numtries 0) (begin ;; junk records can cause stuckness here. use this time to ;; clean out (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id "auto-start-clean-up") - (thread-sleep! 10) + (thread-sleep! 1) (loop (- numtries 1))) - (begin - (debug:print 0 "ERROR: 100 tries and no server, giving up") - (exit 1)))))))))) + #f) ;; try couple times to start server - give up and do local queries + ))))))) (jparams (db:obj->string params))) (if connection-info (let ((res (http-transport:client-api-send-receive run-id connection-info cmd jparams))) (if res (db:string->obj res) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -169,11 +169,11 @@ mdb "SELECT count(id) FROM servers WHERE run_id != 0 AND state = 'running';") res)) (define (tasks:server-clean-out-old-records-for-run-id mdb run-id tag) - (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 50 AND run_id=?;" + (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state in ('available','dbprep','shutting-down') AND (strftime('%s','now') - start_time) > 50 AND run_id=?;" (conc "defunct" tag) run-id)) (define (tasks:server-force-clean-running-records-for-run-id mdb run-id tag) (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=?;" (conc "defunct" tag) run-id)) @@ -188,11 +188,11 @@ (define (tasks:server-delete-record mdb server-id tag) (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;" (conc "defunct" tag) server-id) ;; use this opportuntity to clean out records over one month old or over 10 minutes old with port = -1 (i.e. a never used placeholder) - (sqlite3:execute mdb "DELETE FROM servers WHERE state not in ('running','shutting-down') AND (strftime('%s','now') - start_time) > 2628000;") + (sqlite3:execute mdb "DELETE FROM servers WHERE state not in ('running','shutting-down','dbprep') AND (strftime('%s','now') - start_time) > 2628000;") (sqlite3:execute mdb "DELETE FROM servers WHERE state like 'defunct%' AND port=-1 AND (strftime('%s','now') - start_time) > 600;") ) (define (tasks:server-set-state! mdb server-id state) (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;" state server-id)) @@ -264,11 +264,11 @@ (res '())) (sqlite3:for-each-row (lambda (a . b) (set! res (cons (apply vector a b) res))) mdb - (conc "SELECT " selstr " FROM servers WHERE run_id=? AND state in ('available','running') ORDER BY start_time DESC;") + (conc "SELECT " selstr " FROM servers WHERE run_id=? AND state in ('available','running','dbprep') ORDER BY start_time DESC;") run-id) (vector header res))) (define (tasks:get-server mdb run-id) (let ((res #f) @@ -287,12 +287,12 @@ (define (tasks:server-running-or-starting? mdb run-id) (let ((res #f)) (sqlite3:for-each-row (lambda (id) (set! res id)) - mdb - "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'available' AND (strftime('%s','now') - start_time) < 30));" run-id) + mdb ;; NEEDS dbprep ADDED + "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND (strftime('%s','now') - start_time) < 60));" run-id) res)) (define (tasks:get-all-servers mdb) (let ((res '())) (sqlite3:for-each-row @@ -333,10 +333,11 @@ (debug:print-info 1 "Sending signal/term to " pid " on " hostname) (process-signal pid signal/term) ;; local machine, send sig term (thread-sleep! 5) ;; give it five seconds to die peacefully then do a brutal kill (process-signal pid signal/kill)) (debug:print 0 "WARNING: Can't kill frozen server on remote host " hostname)))))) + ;;====================================================================== ;; Tasks and Task monitors ;;======================================================================