Megatest

Check-in [ece909ab1c]
Login
Overview
Comment:borked server heartbeat logic
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: ece909ab1c2ccad47eaf87bac003a67df8745579
User & Date: mrwellan on 2012-11-02 18:33:58
Other Links: manifest | tags
Context
2012-11-03
17:11
Heartbeat monitoring, on-the-fly server starting all working in simple manual testing check-in: 12d923326a user: matt tags: trunk
2012-11-02
18:33
borked server heartbeat logic check-in: ece909ab1c user: mrwellan tags: trunk
17:36
Added interface to the monitor db and appropriate handling thereof. check-in: 5f757480e6 user: mrwellan tags: trunk, v1.506
Changes

Modified megatest-version.scm from [4bf7ad110a] to [94672ddb9c].

1
2
3
4
5
6

7
1
2
3
4
5

6
7





-
+

;; Always use two digit decimal
;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00..

(declare (unit megatest-version))

(define megatest-version 1.5106)
(define megatest-version 1.5107)

Modified megatest.scm from [1433ed4462] to [71ef6a547c].

154
155
156
157
158
159
160

161
162
163
164
165
166
167
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168







+







			":value"
			":expected"
			":tol"
			":units"
			;; misc
			"-server"
			"-killserver"
			"-port"
			"-extract-ods"
			"-pathmod"
			"-env2file"
			"-setvars"
			"-set-state-status"
			"-debug" ;; for *verbosity* > 2
			"-gen-megatest-test"
302
303
304
305
306
307
308
309
310


311
312
313
314
315
316
317
318
319
320
321
322

323
324
325
326
327
328
329
303
304
305
306
307
308
309


310
311
312
313
314
315
316
317
318
319
320
321
322

323
324
325
326
327
328
329
330







-
-
+
+











-
+







			      (equal? hostname (car khost-port))
			      (equal? port (string->number (cadr khost-port)))))
		     (begin
		       (open-run-close tasks:server-deregister tasks:open-db  hostname port: port)
		       (if status ;; #t means alive
			   (begin
			     (if (equal? hostname (get-host-name))
				 (process-signal pid signal/term)
				 (cdb:kill-server zmq-socket))
				 (process-signal pid signal/term) ;; local machine, send sig term
				 (cdb:kill-server zmq-socket))    ;; remote machine, try telling server to commit suicide
			     (debug:print-info 1 "Killed server by host:port at " hostname ":" port))
			   (debug:print-info 1 "Removing defunct server record for " hostname ":" port))
		       (set! killed #t)))
		 (if (and kpid
			  ;; (equal? hostname (car khost-port))
			  (equal? kpid pid)) ;;; YEP, ALL WITH PID WILL BE KILLED!!!
		     (begin
		       (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid)
		       (set! killed #t)
		       (if status 
			   (if (equal? hostname (get-host-name))
			       (process-signal pid signal/term)
			       (process-signal pid signal/term) ;; local machine, send sig term
			       (debug:print 0 "WARNING: Can't kill a dead server on host " hostname)))
		       (debug:print-info 1 "Killed server by pid at " hostname ":" port)))
		 ;; (if zmq-socket (close-socket  zmq-socket))
		 (format #t fmtstr id mt-ver pid hostname interface port start-time priority 
			 (if status "alive" "dead"))))
	     servers)
	    (debug:print-info 1 "Done with listservers")

Modified server.scm from [000964c6b9] to [aa1a28766d].

25
26
27
28
29
30
31


32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49




50
51
52
53
54
55
56
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

51
52
53
54
55
56
57
58
59
60
61







+
+

















-
+
+
+
+







(include "db_records.scm")

(define (server:make-server-url hostport)
  (if (not hostport)
      #f
      (conc "tcp://" (car hostport) ":" (cadr hostport))))

(define  *server-loop-heart-beat* (list 'start (current-seconds)))

(define (server:run hostn)
  (debug:print 0 "Attempting to start the server ...")
  (if (not *toppath*)
      (if (not (setup-for-run))
	  (begin
	    (debug:print 0 "ERROR: cannot find megatest.config, cannot start server, exiting")
	    (exit))))
  (let* ((zmq-socket     #f)
	 (iface          (if (string=? "-" hostn)
			     "*" ;; (get-host-name) 
			     hostn))
	 (hostname       (get-host-name))
	 (ipaddrstr      (let ((ipstr (if (string=? "-" hostn)
					  (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
					  #f)))
			   (if ipstr ipstr hostname))))
    ;; (set! zmq-socket (server:find-free-port-and-open iface zmq-socket 5555 0))
    (set! zmq-socket (server:find-free-port-and-open ipaddrstr zmq-socket 5555 0))
    (set! zmq-socket (server:find-free-port-and-open ipaddrstr zmq-socket (if (args:get-arg "-port")
									      (string->number (args:get-arg "-port"))
									      5555)
						     0))
    (set! *cache-on* #t)
    
    ;; what to do when we quit
    ;;
    (on-exit (lambda ()
	       (if (and *toppath* *server-id*)
		   (begin
65
66
67
68
69
70
71




72
73
74




75
76
77
78
79
80
81
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94







+
+
+
+



+
+
+
+







			   (begin
			     (debug:print-info 0 "Queue not flushed, waiting ...")
			     (loop))))))))

    ;; The heavy lifting
    ;;
    (let loop ()
      ;; Ugly yuk. 
      (mutex-lock! *incoming-mutex*)
      (set! *server-loop-heart-beat* (list 'waiting (current-seconds)))
      (mutex-unlock! *incoming-mutex*)
      (let* ((rawmsg (receive-message* zmq-socket))
	     (params (db:string->obj rawmsg)) ;; (with-input-from-string rawmsg (lambda ()(deserialize))))
	     (res    #f))
	;;; Ugly yuk. 
	(mutex-lock! *incoming-mutex*)
	(set! *server-loop-heart-beat* (list 'working (current-seconds)))
	(mutex-unlock! *incoming-mutex*)
	(debug:print-info 12 "server=> received params=" params)
	(set! res (cdb:cached-access params))
	(debug:print-info 12 "server=> processed res=" res)
	(send-message zmq-socket (db:obj->string res))
	(if (not *time-to-exit*)
	    (loop)
	    (begin
92
93
94
95
96
97
98
99
100














101
102
103
104
105
106
107
105
106
107
108
109
110
111


112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132







-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+







  ;; server last used then start shutdown
  (let loop ((count 0))
    (thread-sleep! 3) ;; no need to do this very often
    (db:write-cached-data)
    ;; (print "Server running, count is " count)
    (if (< count 2) ;; 3x3 = 9 secs aprox
	(loop (+ count 1))
	(let ((numrunning (open-run-close db:get-count-tests-running #f)))
	  (open-run-close tasks:server-update-heartbeat  tasks:open-db *server-id*)
	(let ((numrunning            (open-run-close db:get-count-tests-running #f))
	      (server-loop-heartbeat #f))
	;;; Ugly yuk. 
	  (mutex-lock! *incoming-mutex*)
	  (set! server-loop-heartbeat *server-loop-heart-beat*)
	  (mutex-unlock! *incoming-mutex*)
	  ;; The logic here is that if the server loop gets stuck blocked in working
	  ;; we don't want to update our heartbeat
	  (let ((server-state  (car server-loop-heartbeat))
		(server-update (cadr server-loop-heartbeat)))
	    (if (or (eq? server-state 'waiting)
		    (< (- (current-seconds) server-update) 10))
		(open-run-close tasks:server-update-heartbeat  tasks:open-db *server-id*)
		(debug:print "ERROR: No heartbeat update, server appears stuck")))
	  (if (or (> numrunning 0) ;; stay alive for two days after last access
		  (> (+ *last-db-access* (* 48 60 60))(current-seconds)))
	      (begin
		(debug:print-info 2 "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*))
		(loop 0))
	      (begin
		(debug:print-info 0 "Starting to shutdown the server.")
170
171
172
173
174
175
176
177

178
179
180
181
182
183
184
195
196
197
198
199
200
201

202
203
204
205
206
207
208
209







-
+







(define (server:client-logout zmq-socket)
  (let ((ok (and (socket? zmq-socket)
		 (cdb:logout zmq-socket *toppath* (server:get-client-signature)))))
    ;; (close-socket zmq-socket)
    ok))

;; Do all the connection work, start a server if not already running
(define (server:client-setup #!key (numtries 10)(do-ping #f))
(define (server:client-setup #!key (numtries 50)(do-ping #f))
  (if (not *toppath*)
      (if (not (setup-for-run))
	  (begin
	    (debug:print 0 "ERROR: failed to find megatest.config, exiting")
	    (exit))))
  (let ((hostinfo   (open-run-close tasks:get-best-server tasks:open-db do-ping: do-ping)))
    (if hostinfo
210
211
212
213
214
215
216
217

218
219
220
221
222
223
224
235
236
237
238
239
240
241

242
243
244
245
246
247
248
249







-
+







	(if (> numtries 0)
	    (let ((exe (car (argv))))
	      (debug:print-info 1 "No server available, attempting to start one...")
	      (process-run exe (list "-server" "-" "-debug" (conc *verbosity*)))
	      (sleep 2)
	      ;; not doing ping, assume the server started and registered itself
	      (server:client-setup numtries: (- numtries 1) do-ping: #f))
	    (debug:print-info 1 "Too many retries, giving up")))))
	    (debug:print-info 1 "Too many attempts, giving up")))))

(define (server:launch)
  (if (not *toppath*)
      (if (not (setup-for-run))
	  (begin
	    (debug:print 0 "ERROR: cannot find megatest.config, exiting")
	    (exit))))

Modified tasks.scm from [3a1458e323] to [efc84088da].

117
118
119
120
121
122
123
124

125
126
127
128
129
130
131
117
118
119
120
121
122
123

124
125
126
127
128
129
130
131







-
+







			 server-id
			 (tasks:server-get-server-id mdb hostname port pid)))
	 (heartbeat-delta 99e9))
    (sqlite3:for-each-row
     (lambda (delta)
       (set! heartbeat-delta delta))
     mdb "SELECT strftime('%s','now')-heartbeat FROM servers WHERE id=?;" server-id)
    (> heartbeat-delta 10)))
    (< heartbeat-delta 10)))

(define (tasks:client-register mdb pid hostname cmdline)
  (sqlite3:execute
   mdb
   "INSERT OR REPLACE INTO clients (server_id,pid,hostname,cmdline,login_time) VALUES(?,?,?,?,strftime('%s','now'));")
  (tasks:server-get-server-id mdb)
  pid hostname cmdline)