Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -1,7 +1,7 @@ ;; Always use two digit decimal ;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.5107) +(define megatest-version 1.5108) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -296,18 +296,20 @@ (status (open-run-close tasks:server-alive? tasks:open-db #f hostname: hostname port: port)) (killed #f) (zmq-socket (if status (server:client-connect hostname port) #f))) ;; no need to login as status of #t indicates we are connecting to correct ;; server - (if (or (not status) ;; no point in keeping dead records in the db - (and khost-port ;; kill by host/port - (equal? hostname (car khost-port)) - (equal? port (string->number (cadr khost-port))))) + (if (not status) ;; no point in keeping dead records in the db + (open-run-close tasks:server-deregister tasks:open-db hostname port: port pid: pid)) + + (if (and khost-port ;; kill by host/port + (equal? hostname (car khost-port)) + (equal? port (string->number (cadr khost-port)))) (tasks:kill-server status hostname port pid)) (if (and kpid - (equal? hostname (car khost-port)) + (equal? hostname (get-host-name)) (equal? kpid pid)) ;;; YEP, ALL WITH PID WILL BE KILLED!!! (tasks:kill-server status hostname #f pid)) (format #t fmtstr id mt-ver pid hostname interface port start-time priority (if status "alive" "dead")))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -142,11 +142,11 @@ (set! server-info *server-info*) (mutex-unlock! *heartbeat-mutex*) ;; The logic here is that if the server loop gets stuck blocked in working ;; we don't want to update our heartbeat (set! pulse (- (current-seconds) server-loop-heartbeat)) - (debug:print-info 1 "Heartbeat period is " pulse " seconds on " (cadr server-info) ":" (caddr server-info) ", last db access is " *last-db-access*) + (debug:print-info 1 "Heartbeat period is " pulse " seconds on " (cadr server-info) ":" (caddr server-info) ", last db access is " (- (current-seconds) *last-db-access*) " seconds ago") (if (> pulse 11) ;; must stay less than 10 seconds (begin (debug:print 0 "ERROR: Heartbeat failed, committing servercide") (exit)) (open-run-close tasks:server-update-heartbeat tasks:open-db (car server-info))) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -124,11 +124,10 @@ (heartbeat-delta 99e9)) (sqlite3:for-each-row (lambda (delta) (set! heartbeat-delta delta)) mdb "SELECT strftime('%s','now')-heartbeat FROM servers WHERE id=?;" server-id) - (debug:print 1 "Found heartbeat-delta of " heartbeat-delta " for server with id " server-id) (< heartbeat-delta 10))) (define (tasks:client-register mdb pid hostname cmdline) (sqlite3:execute mdb @@ -187,21 +186,24 @@ (loop (car tal)(cdr tal)))))))))) (define (tasks:kill-server status hostname port pid) (debug:print-info 1 "Removing defunct server record for " hostname ":" port) (if port - (open-run-close tasks:server-deregister tasks:open-db hostname port: port) - (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid)) + (open-run-close tasks:server-deregister tasks:open-db hostname port: port) + (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid)) (if status ;; #t means alive (begin (if (equal? hostname (get-host-name)) - (begin - (debug:print 1 "Sending signal/term to " pid " on " hostname) - (process-signal pid signal/term) - (thread-sleep! 5) ;; give it five seconds to die peacefully then do a brutal kill - (process-signal pid signal/kill)) ;; local machine, send sig term + (handle-exceptions + exn + (debug:print-info 0 "server may or may not be dead, check for megatest -server running as pid " pid "\n" + " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 1 "Sending signal/term to " pid " on " hostname) + (process-signal pid signal/term) + (thread-sleep! 5) ;; give it five seconds to die peacefully then do a brutal kill + (process-signal pid signal/kill)) ;; local machine, send sig term (begin (debug:print-info 1 "Telling alive server on " hostname ":" port " to commit servercide") (cdb:kill-server zmq-socket)))) ;; remote machine, try telling server to commit suicide (begin (if status