Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1961,16 +1961,19 @@ ;; Update run_stats for given run_id ;; input data is a list (state status count) ;; (define (db:update-run-stats dbstruct run-id stats) + (mutex-lock! *db-stats-mutex*) (db:with-db dbstruct #f #f + (lambda (db) ;; remove previous data + (let* ((stmt1 (sqlite3:prepare db "DELETE FROM run_stats WHERE run_id=? AND state=? AND status=?;")) (stmt2 (sqlite3:prepare db "INSERT INTO run_stats (run_id,state,status,count) VALUES (?,?,?,?);")) (res (sqlite3:with-transaction db @@ -1980,10 +1983,11 @@ (sqlite3:execute stmt1 run-id (car dat)(cadr dat)) (apply sqlite3:execute stmt2 run-id dat)) stats))))) (sqlite3:finalize! stmt1) (sqlite3:finalize! stmt2) + (mutex-unlock! *db-stats-mutex*) res)))) (define (db:get-main-run-stats dbstruct run-id) (db:with-db dbstruct Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -75,11 +75,11 @@ (set! *runremote* (make-remote)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a homehost record - ((not (pair? (remote-hh-dat *runremote*))) ;; have a homehost record? + ((not (pair? (remote-hh-dat *runremote*))) ;; not on homehost (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little (remote-hh-dat-set! *runremote* (common:get-homehost)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") (rmt:send-receive cmd rid params attemptnum: attemptnum)) @@ -141,18 +141,20 @@ ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;) ;) ;;;; ;; if not on homehost ensure we have a connection to a live server ;; NOTE: we *have* a homehost record by now - ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost - (not (remote-conndat *runremote*)) ;; and no connection - (server:read-dotserver *toppath*)) ;; .server file exists - ;; something caused the server entry in tdb to disappear, but the server is still running - (server:remove-dotserver-file *toppath* ".*") - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 20") - (rmt:send-receive cmd rid params attemptnum: (add1 attemptnum))) + + ;; ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost + ;; (not (remote-conndat *runremote*)) ;; and no connection + ;; (server:read-dotserver *toppath*)) ;; .server file exists + ;; ;; something caused the server entry in tdb to disappear, but the server is still running + ;; (server:remove-dotserver-file *toppath* ".*") + ;; (mutex-unlock! *rmt-mutex*) + ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 20") + ;; (rmt:send-receive cmd rid params attemptnum: (add1 attemptnum))) + ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost (not (remote-conndat *runremote*))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) (tasks:start-and-wait-for-server (tasks:open-db) 0 15) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -448,14 +448,16 @@ (reverse res))) ;; no elegance here ... ;; (define (tasks:kill-server hostname pid #!key (kill-switch "")) + (server:remove-dotserver-file *toppath* ".*") (debug:print-info 0 *default-log-port* "Attempting to kill server process " pid " on host " hostname) (setenv "TARGETHOST" hostname) (setenv "TARGETHOST_LOGF" "server-kills.log") (system (conc "nbfake kill "kill-switch" "pid)) + (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")) ;; look up a server by run-id and send it a kill, also delete the record for that server ;; @@ -466,10 +468,11 @@ (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed") (debug:print-info 0 *default-log-port* "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) + (server:remove-dotserver-file *toppath* ".*") (tasks:kill-server hostname pid) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 *default-log-port* "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) ))