Megatest

Check-in [6aea490eac]
Login
Overview
Comment:Added exception handling to server kill
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | v1.5108
Files: files | file ages | folders
SHA1: 6aea490eacae55955f4ae0e77cb7b1d4fa813643
User & Date: matt on 2012-11-03 18:20:15
Other Links: manifest | tags
Context
2012-11-03
19:47
Reversed order for selecting servers so oldest is always choosen. check-in: 9b1da0d111 user: mrwellan tags: trunk
18:20
Added exception handling to server kill check-in: 6aea490eac user: matt tags: trunk, v1.5108
17:18
Added more instrumentation info on server pulse. Removed rm of monitor.db in Makefile check-in: 04c31891a9 user: matt tags: trunk
Changes

Modified megatest-version.scm from [94672ddb9c] to [c0fe9af7ba].

1
2
3
4
5
6
7
;; Always use two digit decimal
;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00..

(declare (unit megatest-version))

(define megatest-version 1.5107)






|

1
2
3
4
5
6
7
;; Always use two digit decimal
;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00..

(declare (unit megatest-version))

(define megatest-version 1.5108)

Modified megatest.scm from [c4c3f3d52b] to [5c2117b3b6].

294
295
296
297
298
299
300
301


302
303
304
305
306
307
308
309
310
311
312
313
314
315
		      (state      (vector-ref server 7))
		      (mt-ver     (vector-ref server 8))
		      (status     (open-run-close tasks:server-alive? tasks:open-db #f hostname: hostname port: port))
		      (killed     #f)
		      (zmq-socket (if status (server:client-connect hostname port) #f)))
		 ;; no need to login as status of #t indicates we are connecting to correct 
		 ;; server
		 (if (or (not status)    ;; no point in keeping dead records in the db


			 (and khost-port ;; kill by host/port
			      (equal? hostname (car khost-port))
			      (equal? port (string->number (cadr khost-port)))))
		     (tasks:kill-server status hostname port pid))

		 (if (and kpid
			  (equal? hostname (car khost-port))
			  (equal? kpid pid)) ;;; YEP, ALL WITH PID WILL BE KILLED!!!
		     (tasks:kill-server status hostname #f pid))

		 (format #t fmtstr id mt-ver pid hostname interface port start-time priority 
			 (if status "alive" "dead"))))
	     servers)
	    (debug:print-info 1 "Done with listservers")







|
>
>
|

|



|







294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
		      (state      (vector-ref server 7))
		      (mt-ver     (vector-ref server 8))
		      (status     (open-run-close tasks:server-alive? tasks:open-db #f hostname: hostname port: port))
		      (killed     #f)
		      (zmq-socket (if status (server:client-connect hostname port) #f)))
		 ;; no need to login as status of #t indicates we are connecting to correct 
		 ;; server
		 (if (not status)    ;; no point in keeping dead records in the db
		      (open-run-close tasks:server-deregister tasks:open-db hostname port: port pid: pid))

		 (if (and khost-port ;; kill by host/port
			      (equal? hostname (car khost-port))
			  (equal? port (string->number (cadr khost-port))))
		     (tasks:kill-server status hostname port pid))

		 (if (and kpid
			  (equal? hostname (get-host-name))
			  (equal? kpid pid)) ;;; YEP, ALL WITH PID WILL BE KILLED!!!
		     (tasks:kill-server status hostname #f pid))

		 (format #t fmtstr id mt-ver pid hostname interface port start-time priority 
			 (if status "alive" "dead"))))
	     servers)
	    (debug:print-info 1 "Done with listservers")

Modified server.scm from [ebf5897ef4] to [d8bc67a0d6].

140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
	  (mutex-lock! *heartbeat-mutex*)
	  (set! server-loop-heartbeat *server-loop-heart-beat*)
	  (set! server-info *server-info*)
	  (mutex-unlock! *heartbeat-mutex*)
	  ;; The logic here is that if the server loop gets stuck blocked in working
	  ;; we don't want to update our heartbeat
	  (set! pulse (- (current-seconds) server-loop-heartbeat))
	  (debug:print-info 1 "Heartbeat period is " pulse " seconds on " (cadr server-info) ":" (caddr server-info) ", last db access is " *last-db-access*)
	  (if (> pulse 11) ;; must stay less than 10 seconds 
	      (begin
		(debug:print 0 "ERROR: Heartbeat failed, committing servercide")
		(exit))
	      (open-run-close tasks:server-update-heartbeat tasks:open-db (car server-info)))
	  (if (or (> numrunning 0) ;; stay alive for two days after last access
		  (> (+ *last-db-access* 







|







140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
	  (mutex-lock! *heartbeat-mutex*)
	  (set! server-loop-heartbeat *server-loop-heart-beat*)
	  (set! server-info *server-info*)
	  (mutex-unlock! *heartbeat-mutex*)
	  ;; The logic here is that if the server loop gets stuck blocked in working
	  ;; we don't want to update our heartbeat
	  (set! pulse (- (current-seconds) server-loop-heartbeat))
	  (debug:print-info 1 "Heartbeat period is " pulse " seconds on " (cadr server-info) ":" (caddr server-info) ", last db access is " (- (current-seconds) *last-db-access*) " seconds ago")
	  (if (> pulse 11) ;; must stay less than 10 seconds 
	      (begin
		(debug:print 0 "ERROR: Heartbeat failed, committing servercide")
		(exit))
	      (open-run-close tasks:server-update-heartbeat tasks:open-db (car server-info)))
	  (if (or (> numrunning 0) ;; stay alive for two days after last access
		  (> (+ *last-db-access* 

Modified tasks.scm from [6c83278de0] to [67418262aa].

122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
			 server-id
			 (tasks:server-get-server-id mdb hostname port pid)))
	 (heartbeat-delta 99e9))
    (sqlite3:for-each-row
     (lambda (delta)
       (set! heartbeat-delta delta))
     mdb "SELECT strftime('%s','now')-heartbeat FROM servers WHERE id=?;" server-id)
    (debug:print 1 "Found heartbeat-delta of " heartbeat-delta " for server with id " server-id)
    (< heartbeat-delta 10)))

(define (tasks:client-register mdb pid hostname cmdline)
  (sqlite3:execute
   mdb
   "INSERT OR REPLACE INTO clients (server_id,pid,hostname,cmdline,login_time) VALUES(?,?,?,?,strftime('%s','now'));")
  (tasks:server-get-server-id mdb hostname #f pid)







<







122
123
124
125
126
127
128

129
130
131
132
133
134
135
			 server-id
			 (tasks:server-get-server-id mdb hostname port pid)))
	 (heartbeat-delta 99e9))
    (sqlite3:for-each-row
     (lambda (delta)
       (set! heartbeat-delta delta))
     mdb "SELECT strftime('%s','now')-heartbeat FROM servers WHERE id=?;" server-id)

    (< heartbeat-delta 10)))

(define (tasks:client-register mdb pid hostname cmdline)
  (sqlite3:execute
   mdb
   "INSERT OR REPLACE INTO clients (server_id,pid,hostname,cmdline,login_time) VALUES(?,?,?,?,strftime('%s','now'));")
  (tasks:server-get-server-id mdb hostname #f pid)
191
192
193
194
195
196
197

198


199
200
201
202
203
204
205
  (if port
      (open-run-close tasks:server-deregister tasks:open-db  hostname port: port)
      (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid))
  
  (if status ;; #t means alive
      (begin
	(if (equal? hostname (get-host-name))

	    (begin


	      (debug:print 1 "Sending signal/term to " pid " on " hostname)
	      (process-signal pid signal/term)
	      (thread-sleep! 5) ;; give it five seconds to die peacefully then do a brutal kill
	      (process-signal pid signal/kill)) ;; local machine, send sig term
	    (begin
	      (debug:print-info 1 "Telling alive server on " hostname ":" port " to commit servercide")
	      (cdb:kill-server zmq-socket))))    ;; remote machine, try telling server to commit suicide







>
|
>
>







190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
  (if port
      (open-run-close tasks:server-deregister tasks:open-db  hostname port: port)
      (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid))
  
  (if status ;; #t means alive
      (begin
	(if (equal? hostname (get-host-name))
	    (handle-exceptions
	     exn
	     (debug:print-info 0 "server may or may not be dead, check for megatest -server running as pid " pid "\n"
			       "  EXCEPTION: " ((condition-property-accessor 'exn 'message) exn))
	      (debug:print 1 "Sending signal/term to " pid " on " hostname)
	      (process-signal pid signal/term)
	      (thread-sleep! 5) ;; give it five seconds to die peacefully then do a brutal kill
	      (process-signal pid signal/kill)) ;; local machine, send sig term
	    (begin
	      (debug:print-info 1 "Telling alive server on " hostname ":" port " to commit servercide")
	      (cdb:kill-server zmq-socket))))    ;; remote machine, try telling server to commit suicide