Megatest

Check-in [c9858f0dcb]
Login
Overview
Comment:Some tweaks to accomodate long running server (lives for 48 hrs after last access)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk | v1.5102
Files: files | file ages | folders
SHA1: c9858f0dcbe977d02e8105cd87080970b25cf547
User & Date: mrwellan on 2012-11-01 14:08:00
Other Links: manifest | tags
Context
2012-11-02
00:28
Added check for version on client/server login. Converted to looking at heartbeat time instead of trying to connect to server check-in: af929ed4d8 user: matt tags: trunk, 1.5103
2012-11-01
14:08
Some tweaks to accomodate long running server (lives for 48 hrs after last access) check-in: c9858f0dcb user: mrwellan tags: trunk, v1.5102
11:21
bumped version to 1.5101 check-in: 6b3cce4bcf user: mrwellan tags: trunk, v1.5101
Changes

Modified megatest.scm from [100a519d71] to [140e7d9667].

287
288
289
290
291
292
293
294

295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314


315
316
317
318
319
320
321
322
323


324
325
326
327
328
329
330
331
332

333
334

335
336
337
338
339
340
341
287
288
289
290
291
292
293

294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312


313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333

334
335

336
337
338
339
340
341
342
343







-
+


















-
-
+
+









+
+








-
+

-
+







		      (pid        (vector-ref server 1))
		      (hostname   (vector-ref server 2))
		      (port       (vector-ref server 3))
		      (start-time (vector-ref server 4))
		      (priority   (vector-ref server 5))
		      (state      (vector-ref server 6))
		      (stat-numc  (server:ping hostname port))
		      (status     (car stat-numc))
		      (status     (car  stat-numc))
		      (numclients (cadr stat-numc))
		      (killed     #f)
		      (zmq-socket (if status (server:client-connect hostname port) #f)))
		 ;; no need to login as status of #t indicates we are connecting to correct 
		 ;; server
		 (if (or (not status)    ;; no point in keeping dead records in the db
			 (and khost-port ;; kill by host/port
			      (equal? hostname (car khost-port))
			      (equal? port (string->number (cadr khost-port)))))
		     (begin
		       (open-run-close tasks:server-deregister tasks:open-db  hostname port: port)
		       (if status ;; #t means alive
			   (begin
			     (cdb:kill-server zmq-socket)
			     (debug:print-info 1 "Killed server by host:port at " hostname ":" port))
			   (debug:print-info 1 "Removing defunct server record for " hostname ":" port))
		       (set! killed #t)))
		 (if (and kpid
			  (equal? hostname (car khost-port))
			  (equal? kpid pid))
			  ;; (equal? hostname (car khost-port))
			  (equal? kpid pid)) ;;; YEP, ALL WITH PID WILL BE KILLED!!!
		     (begin
		       (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid)
		       (set! killed #t)
		       (if status (cdb:kill-server zmq-socket))
		       (debug:print-info 1 "Killed server by pid at " hostname ":" port)))
		 ;; (if zmq-socket (close-socket  zmq-socket))
		 (format #t fmtstr id pid hostname port start-time priority 
			 status numclients)))
	     servers)
	    (debug:print-info 1 "Done with listservers")
	    (exit) ;; must do, would have to add checks to many/all calls below
	    (set! *didsomething* #t))))
    ;; if not list or kill then start a client (if appropriate)
    (if (or (args-defined? "-h" "-version" "-gen-megatest-area" "-gen-megatest-test")
	    (eq? (length (hash-table-keys args:arg-hash)) 0))
	(debug:print-info 1 "Server connection not needed")
	;; ping servers only if -runall -runtests
	(let ((ping (args-defined? "-runall" "-runtests" "-remove-runs" 
				   "-set-state-status" "-rerun" "-rollup" "-lock" "-unlock"
				   "-set-values" "-list-runs")))
				   "-set-values" "-list-runs" "-repl")))
	  (server:client-launch do-ping: ping))))
    

;;======================================================================
;; Remove old run(s)
;;======================================================================

;; since several actions can be specified on the command line the removal
;; is done first
(define (operate-on action)

Modified server.scm from [932059ea25] to [06d840e03e].

43
44
45
46
47
48
49
50

51
52
53
54
55
56
57
43
44
45
46
47
48
49

50
51
52
53
54
55
56
57







-
+







			   (if ipstr ipstr hostname))))
    (set! zmq-socket (server:find-free-port-and-open ipaddrstr zmq-socket 5555 0))
    (set! *cache-on* #t)
    
    ;; what to do when we quit
    ;;
    (on-exit (lambda ()
	       (open-run-close tasks:server-deregister-self tasks:open-db)
	       (open-run-close tasks:server-deregister-self tasks:open-db #f)
	       (let loop () 
		 (let ((queue-len 0))
		   (thread-sleep! (random 5))
		   (mutex-lock! *incoming-mutex*)
		   (set! queue-len (length *incoming-data*))
		   (mutex-unlock! *incoming-mutex*)
		   (if (> queue-len 0)
68
69
70
71
72
73
74

75
76
77
78
79
80
81
82
83
84
85
86
87

88
89
90
91
92
93
94


95
96
97


98
99
100
101
102
103


104
105
106

107
108
109
110
111
112
113
68
69
70
71
72
73
74
75
76

77
78
79
80
81
82
83
84
85
86

87
88
89
90
91
92


93
94
95


96
97
98
99
100
101


102
103
104
105

106
107
108
109
110
111
112
113







+

-










-
+





-
-
+
+

-
-
+
+




-
-
+
+


-
+







	(debug:print-info 12 "server=> received params=" params)
	(set! res (cdb:cached-access params))
	(debug:print-info 12 "server=> processed res=" res)
	(send-message zmq-socket (db:obj->string res))
	(if (not *time-to-exit*)
	    (loop)
	    (begin
	      (open-run-close tasks:server-deregister-self tasks:open-db #f)
	      (db:write-cached-data)
	      (open-run-close tasks:server-deregister-self tasks:open-db)
	      (exit)
	      ))))))

;; run server:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (server:keep-running)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  (let loop ((count 0))
    (thread-sleep! 1) ;; no need to do this very often
    (thread-sleep! 3) ;; no need to do this very often
    (db:write-cached-data)
    ;; (print "Server running, count is " count)
    (if (< count 10)
	(loop (+ count 1))
	(let ((numrunning (open-run-close db:get-count-tests-running #f)))
	  (if (or (> numrunning 0)
		  (> (+ *last-db-access* 60)(current-seconds)))
	  (if (or (> numrunning 0) ;; stay alive for two days after last access
		  (> (+ *last-db-access* (* 48 60 60))(current-seconds)))
	      (begin
		(debug:print-info 0 "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*))
		(loop 0)))
		(debug:print-info 2 "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*))
		(loop 0))
	      (begin
		(debug:print-info 0 "Starting to shutdown the server.")
		;; need to delete only *my* server entry (future use)
		(set! *time-to-exit* #t)
		(open-run-close tasks:server-deregister-self tasks:open-db)
		(thread-sleep! 5)
		(open-run-close tasks:server-deregister-self tasks:open-db #f)
		(thread-sleep! 1)
		(debug:print-info 0 "Max cached queries was " *max-cache-size*)
		(debug:print-info 0 "Server shutdown complete. Exiting")
		(exit))))))
		(exit)))))))

(define (server:find-free-port-and-open host s port #!key (trynum 50))
  (let ((s (if s s (make-socket 'rep)))
	(p (if (number? port) port 5555)))
    (handle-exceptions
     exn
     (begin
195
196
197
198
199
200
201
202
203



204
205
206
207
208
209
210
195
196
197
198
199
200
201


202
203
204
205
206
207
208
209
210
211







-
-
+
+
+







		     (debug:print-info 2 "Failed to login or connect to " conurl)
		     (set! *runremote* #f)
		     #f))))))
	(if (> numtries 0)
	    (let ((exe (car (argv))))
	      (debug:print-info 1 "No server available, attempting to start one...")
	      (process-run exe (list "-server" "-" "-debug" (conc *verbosity*)))
	      (sleep 5)
	      (server:client-setup numtries: (- numtries 1) do-ping: do-ping))
	      (sleep 2)
	      ;; not doing ping, assume the server started and registered itself
	      (server:client-setup numtries: (- numtries 1) do-ping: #f))
	    (debug:print-info 1 "Too many retries, giving up")))))

(define (server:launch)
  (let* ((toppath (setup-for-run)))
    (debug:print-info 0 "Starting the standalone server")
    (let ((hostinfo (open-run-close tasks:get-best-server tasks:open-db)))
      (if hostinfo

Modified tasks.scm from [29f8996e48] to [6027d829c1].

76
77
78
79
80
81
82

83
84
85
86

87
88
89
90
91
92


93
94
95
96
97
98
99
76
77
78
79
80
81
82
83
84
85
86

87
88
89
90
91


92
93
94
95
96
97
98
99
100







+



-
+




-
-
+
+







;; state: 'live, 'shutting-down, 'dead
(define (tasks:server-register mdb pid hostname port priority state)
  (sqlite3:execute 
   mdb 
   "INSERT OR REPLACE INTO servers (pid,hostname,port,start_time,priority,state) VALUES(?,?,?,strftime('%s','now'),?,?);"
   pid hostname port priority (conc state)))

;; NB// two servers with same pid on different hosts will be removed from the list if pid: is used!
(define (tasks:server-deregister mdb hostname #!key (port #f)(pid #f))
  (debug:print-info 11 "server-deregister " hostname ", port " port ", pid " pid)
  (if pid
      (sqlite3:execute mdb "DELETE FROM servers WHERE  hostname=? AND pid=?;" hostname pid)
      (sqlite3:execute mdb "DELETE FROM servers WHERE pid=?;" pid)
      (if port
	  (sqlite3:execute mdb "DELETE FROM servers WHERE  hostname=? AND port=?;" hostname port)
	  (debug:print 0 "ERROR: tasks:server-deregister called with neither pid nor port specified"))))

(define (tasks:server-deregister-self mdb)
  (tasks:server-deregister mdb (get-host-name) pid: (current-process-id)))
(define (tasks:server-deregister-self mdb hostname)
  (tasks:server-deregister mdb hostname pid: (current-process-id)))

(define (tasks:server-get-server-id mdb)
  ;; dunno yet
  0)

(define (tasks:client-register mdb pid hostname cmdline)
  (sqlite3:execute