Megatest

Check-in [bbd7cb0f5a]
Login
Overview
Comment:Random port assignment, minimize re-use of ports
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: bbd7cb0f5a1d5d9963c1b00c2ea566370747e4b9
User & Date: matt on 2014-02-24 19:44:31
Other Links: branch diff | manifest | tags
Context
2014-02-24
22:15
Added debugging tags to server state changes. Cleaned up dashboard to display new server data. check-in: 904e5f7d6c user: matt tags: v1.60
19:44
Random port assignment, minimize re-use of ports check-in: bbd7cb0f5a user: matt tags: v1.60
16:46
Trying to get reliable server starts in the face of ports, hosts, kills etc. What fun check-in: cb995a99df user: mrwellan tags: v1.60
Changes

Modified docs/manual/howto.txt from [d3633d3bbf] to [b28c4b0da6].

55
56
57
58
59
60
61








55
56
57
58
59
60
61
62
63
64
65
66
67
68
69







+
+
+
+
+
+
+
+
The runscript method is a brute force way to run scripts where the
user is responsible for setting STATE and STATUS

-------------------
runscript main.csh
-------------------

Debugging Server Problems
~~~~~~~~~~~~~~~~~~~~~~~~~

----------------
sudo lsof -i
sudo netstat -lptu
sudo netstat -tulpn
----------------

Modified http-transport.scm from [78b88f80d9] to [f8416b573d].

389
390
391
392
393
394
395
396

397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416


417
418
419
420
421
422
423
389
390
391
392
393
394
395

396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415

416
417
418
419
420
421
422
423
424







-
+



















-
+
+







	    (debug:print-info 0 "Average non-cached time   "
			      (if (eq? *number-non-write-queries* 0)
				  "n/a (no queries)"
				  (/ *total-non-write-delay* 
				     *number-non-write-queries*))
			      " ms")
	    (debug:print-info 0 "Server shutdown complete. Exiting")
	    (tasks:server-delete-record! tdb server-id)
	    (tasks:server-delete-record tdb server-id)
	    (exit))))))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id)
  (set! *run-id*   run-id)
  (if (args:get-arg "-daemonize")
      (daemon:ize))
  (if (server:check-if-running run-id)
      (begin
	(debug:print 0 "INFO: Server for run-id " run-id " already running")
	(exit 0)))
  (let ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id)))
    (if (not server-id)
	(begin
	  ;; since we didn't get the server lock we are going to clean up and bail out
	  (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
	  (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db))
	  (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db)
	  )
	(let* ((th2 (make-thread (lambda ()
				   (http-transport:run 
				    (if (args:get-arg "-server")
					(args:get-arg "-server")
					"-")
				    run-id
				    server-id)) "Server run"))

Modified tasks.scm from [0a12e30788] to [8d2eda9003].

126
127
128
129
130
131
132
133

134
135
136

137
138
139

140
141








142
143
144
145
146
147
148
149
150
151
152
153


154

155
156
157
158
159
160
161
162
163













164
165
166

167
168

169
170
171
172
173
174
175










176
177
178
179
180
181
182
126
127
128
129
130
131
132

133
134
135

136
137
138

139
140
141
142
143
144
145
146
147
148
149
150
151
152






153
154
155
156
157
158
159









160
161
162
163
164
165
166
167
168
169
170
171
172
173
174

175
176

177
178
179
180




181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197







-
+


-
+


-
+


+
+
+
+
+
+
+
+



-
-
-
-
-
-



+
+

+
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+


-
+

-
+



-
-
-
-
+
+
+
+
+
+
+
+
+
+







       (set! res num-in-queue))
     mdb
     "SELECT count(id) FROM servers WHERE run_id=? AND state = 'available';"
     run-id)
    res))

(define (tasks:server-clean-out-old-records-for-run-id mdb run-id)
  (sqlite3:execute mdb "DELETE FROM servers WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 30 AND run_id=?;" run-id))
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 300 AND run_id=?;" run-id))

(define (tasks:server-force-clean-running-records-for-run-id mdb run-id)
  (sqlite3:execute mdb "DELETE FROM servers WHERE state = 'running' AND run_id=?;" run-id))
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE state = 'running' AND run_id=?;" run-id))

(define (tasks:server-force-clean-run-record mdb run-id iface port)
  (sqlite3:execute mdb "DELETE FROM servers WHERE state = 'running' AND run_id=? AND interface=? AND port=?;"
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE state = 'running' AND run_id=? AND interface=? AND port=?;"
		   run-id iface port))

(define (tasks:server-delete-records-for-this-pid mdb)
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE hostname=? AND pid=?;" (get-host-name) (current-process-id)))

(define (tasks:server-delete-record mdb server-id)
  (sqlite3:execute mdb "UPDATE servers SET state='defunct' WHERE id=?;" server-id)
  ;; use this opportuntity to clean out records over one month old
  (sqlite3:execute mdb "DELETE FROM servers WHERE state not in ('running','shutting-down') AND (strftime('%s','now') - start_time) > 2628000;"))

(define (tasks:server-set-state! mdb server-id state)
  (sqlite3:execute mdb "UPDATE servers SET state=? WHERE id=?;" state server-id))

(define (tasks:server-delete-record! mdb server-id)
  (sqlite3:execute mdb "DELETE FROM servers WHERE id=?;" server-id))

(define (tasks:server-delete-records-for-this-pid mdb)
  (sqlite3:execute mdb "DELETE FROM servers WHERE hostname=? AND pid=?;" (get-host-name) (current-process-id)))

(define (tasks:server-set-interface-port mdb server-id interface port)
  (sqlite3:execute mdb "UPDATE servers SET interface=?,port=? WHERE id=?;" interface port server-id))

;; Get random port not used in long time
;;
(define (tasks:server-get-next-port mdb)
  (let* ((lownum        30000)
  (let ((res         #f)
	(port-param  (if (and (args:get-arg "-port")
			      (string->number (args:get-arg "-port")))
			 (string->number (args:get-arg "-port"))
			 #f))
	(config-port (if (and (config-lookup  *configdat* "server" "port")
			      (string->number (config-lookup  *configdat* "server" "port")))
			 (string->number (config-lookup  *configdat* "server" "port"))
			 #f)))
	(highnum        64000)
	(used-ports     '())
	(get-rand-port  (lambda ()
			  (+ lownum (random (- highnum lownum)))))
	(port-param     (if (and (args:get-arg "-port")
				 (string->number (args:get-arg "-port")))
			    (string->number (args:get-arg "-port"))
			    #f))
	;; (config-port    (if (and (config-lookup  *configdat* "server" "port")
	;; 			 (string->number (config-lookup  *configdat* "server" "port")))
	;; 		    (string->number (config-lookup  *configdat* "server" "port"))
	;; 		    #f))
	)
    (sqlite3:for-each-row
     (lambda (port)
       (set! res (+ port 1))) ;; set to next
       (set! used-ports (cons port used-ports)))
     mdb
     "SELECT max(port) FROM servers;")
     "SELECT port FROM servers;")
    (cond
     ((and port-param res)   (if (> res port-param) res port-param))
     (port-param             port-param)
     ((and config-port res)  (if (> res config-port) res config-port))
     (config-port            config-port)
     ((and res (> res 8080)) res)
     (else                   (+ 5000 (random 1001))))))
     ;; ((and config-port res)  (if (> res config-port) res config-port))
     ;; (config-port            config-port)
     (else
      (let loop ((port     (get-rand-port))
		 (remtries 100))
	(if (member port used-ports)
	    (if (> remtries 0)
		(loop (get-rand-port)(- remtries 1))
		(get-rand-port))
	    port))))))

(define (tasks:server-am-i-the-server? mdb run-id)
  (let* ((all    (tasks:server-get-servers-vying-for-run-id mdb run-id))
	 (first  (if (null? all)
		     (begin (debug:print 0 "ERROR: no servers listed, should be at least one by now.") 
			    (sqlite3:finalize! mdb)
			    (exit 1))