Megatest

Check-in [f8c63429f7]
Login
Overview
Comment:Tentative improvements to server starting - separate out server start from db start (dbprep).
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: f8c63429f726b99d5759e257fd6002aa23b73b80
User & Date: matt on 2014-09-09 23:06:00
Other Links: branch diff | manifest | tags
Context
2014-09-10
13:40
Server bypass options increased. Now should bypass the server unless in 'running' or 'dbprep' mode check-in: e29bdd1210 user: mrwellan tags: v1.60
2014-09-09
23:06
Tentative improvements to server starting - separate out server start from db start (dbprep). check-in: f8c63429f7 user: matt tags: v1.60
2014-09-05
11:53
Use nbfake for launching servers check-in: d8c9e3b4ca user: mrwellan tags: v1.60
Changes

Modified http-transport.scm from [36a92be4c5] to [4bbddddd32].

263
264
265
266
267
268
269


270
271
272
273
274
275
276
       (retry-request? (lambda (request)
			 #f))
       ;; send the data and get the response
       ;; extract the needed info from the http data and 
       ;; process and return it.
       (let* ((send-recieve (lambda ()
			      (mutex-lock! *http-mutex*)


			      (set! res (with-input-from-request ;; was dat
					 fullurl 
					 (list (cons 'key "thekey")
					       (cons 'cmd cmd)
					       (cons 'params params))
					 read-string))
			      ;; Shouldn't this be a call to the managed call-all-connections stuff above?







>
>







263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
       (retry-request? (lambda (request)
			 #f))
       ;; send the data and get the response
       ;; extract the needed info from the http data and 
       ;; process and return it.
       (let* ((send-recieve (lambda ()
			      (mutex-lock! *http-mutex*)
			      ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines)
			      ;;					       ((exn http client-error) e (print e)))
			      (set! res (with-input-from-request ;; was dat
					 fullurl 
					 (list (cons 'key "thekey")
					       (cons 'cmd cmd)
					       (cons 'params params))
					 read-string))
			      ;; Shouldn't this be a call to the managed call-all-connections stuff above?
374
375
376
377
378
379
380


381
382
383
384
385
386
387
	(debug:print 0 "SYNC: time= " sync-time ", rem-time=" rem-time)

      ;;
      ;; set_running after our first pass through and start the db
      ;;
      (if (eq? server-state 'available)
	  (begin


	    (set! *inmemdb*  (db:setup run-id))
	    (tasks:server-set-state! tdb server-id "running")))

      (if (and (<= rem-time 4)
	       (> rem-time 0))
	  (thread-sleep! rem-time)
	  (thread-sleep! 4))) ;; fallback for if the math is changed ...







>
>







376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
	(debug:print 0 "SYNC: time= " sync-time ", rem-time=" rem-time)

      ;;
      ;; set_running after our first pass through and start the db
      ;;
      (if (eq? server-state 'available)
	  (begin
	    (tasks:server-set-state! tdb server-id "dbprep")
	    (thread-sleep! 5) ;; give some margin for queries to complete before switching from file based access to server based access
	    (set! *inmemdb*  (db:setup run-id))
	    (tasks:server-set-state! tdb server-id "running")))

      (if (and (<= rem-time 4)
	       (> rem-time 0))
	  (thread-sleep! rem-time)
	  (thread-sleep! 4))) ;; fallback for if the math is changed ...

Modified rmt.scm from [caf7bcfa2c] to [124a9a8be5].

67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
				;; bypass starting the server. 
				;;
				;; NB// can cache the answer for server running for 10 seconds ...
				;;
				(if (and (not (rmt:write-frequency-over-limit? cmd run-id))
					 (not (open-run-close tasks:server-running-or-starting? tasks:open-db run-id)))
				    #f
				    (let loop ((numtries 100))
				      (let ((res (client:setup run-id)))
					(if res 
					    (hash-table-ref/default *runremote* run-id #f) ;; client:setup filled this in (hopefully)
					    (if (> numtries 0)
						(begin
						  ;; junk records can cause stuckness here. use this time to
						  ;; clean out
						  (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id "auto-start-clean-up")
						  (thread-sleep! 10)
						  (loop (- numtries 1)))
						(begin
						  (debug:print 0 "ERROR: 100 tries and no server, giving up")
						  (exit 1))))))))))
	 (jparams         (db:obj->string params)))
    (if connection-info
	(let ((res             (http-transport:client-api-send-receive run-id connection-info cmd jparams)))
	  (if res
	      (db:string->obj res)
	      (let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")







|








|

|
<
|







67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85

86
87
88
89
90
91
92
93
				;; bypass starting the server. 
				;;
				;; NB// can cache the answer for server running for 10 seconds ...
				;;
				(if (and (not (rmt:write-frequency-over-limit? cmd run-id))
					 (not (open-run-close tasks:server-running-or-starting? tasks:open-db run-id)))
				    #f
				    (let loop ((numtries 2))
				      (let ((res (client:setup run-id)))
					(if res 
					    (hash-table-ref/default *runremote* run-id #f) ;; client:setup filled this in (hopefully)
					    (if (> numtries 0)
						(begin
						  ;; junk records can cause stuckness here. use this time to
						  ;; clean out
						  (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id "auto-start-clean-up")
						  (thread-sleep! 1)
						  (loop (- numtries 1)))
						#f) ;; try couple times to start server - give up and do local queries

					    )))))))
	 (jparams         (db:obj->string params)))
    (if connection-info
	(let ((res             (http-transport:client-api-send-receive run-id connection-info cmd jparams)))
	  (if res
	      (db:string->obj res)
	      (let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")

Modified tasks.scm from [9d8c491acc] to [929251744a].

167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
     (lambda (num-running)
       (set! res num-running))
     mdb
     "SELECT count(id) FROM servers WHERE run_id != 0 AND state = 'running';")
    res))

(define (tasks:server-clean-out-old-records-for-run-id mdb run-id tag)
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 50 AND run_id=?;"
		   (conc "defunct" tag) run-id))

(define (tasks:server-force-clean-running-records-for-run-id mdb run-id tag)
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=?;"
		   (conc "defunct" tag) run-id))

(define (tasks:server-force-clean-run-record mdb run-id iface port tag)
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=? AND interface=? AND port=?;"
		   (conc "defunct" tag) run-id iface port))

(define (tasks:server-delete-records-for-this-pid mdb tag)
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE hostname=? AND pid=?;"
		   (conc "defunct" tag) (get-host-name) (current-process-id)))

(define (tasks:server-delete-record mdb server-id tag) 
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;"
		   (conc "defunct" tag) server-id)
  ;; use this opportuntity to clean out records over one month old or over 10 minutes old with port = -1 (i.e. a never used placeholder)
  (sqlite3:execute mdb "DELETE FROM servers WHERE state not in ('running','shutting-down') AND (strftime('%s','now') - start_time) > 2628000;")
  (sqlite3:execute mdb "DELETE FROM servers WHERE state like 'defunct%' AND port=-1 AND (strftime('%s','now') - start_time) > 600;")
  )

(define (tasks:server-set-state! mdb server-id state)
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;" state server-id))

(define (tasks:server-set-interface-port mdb server-id interface port)







|


















|







167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
     (lambda (num-running)
       (set! res num-running))
     mdb
     "SELECT count(id) FROM servers WHERE run_id != 0 AND state = 'running';")
    res))

(define (tasks:server-clean-out-old-records-for-run-id mdb run-id tag)
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state in ('available','dbprep','shutting-down') AND (strftime('%s','now') - start_time) > 50 AND run_id=?;"
		   (conc "defunct" tag) run-id))

(define (tasks:server-force-clean-running-records-for-run-id mdb run-id tag)
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=?;"
		   (conc "defunct" tag) run-id))

(define (tasks:server-force-clean-run-record mdb run-id iface port tag)
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=? AND interface=? AND port=?;"
		   (conc "defunct" tag) run-id iface port))

(define (tasks:server-delete-records-for-this-pid mdb tag)
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE hostname=? AND pid=?;"
		   (conc "defunct" tag) (get-host-name) (current-process-id)))

(define (tasks:server-delete-record mdb server-id tag) 
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;"
		   (conc "defunct" tag) server-id)
  ;; use this opportuntity to clean out records over one month old or over 10 minutes old with port = -1 (i.e. a never used placeholder)
  (sqlite3:execute mdb "DELETE FROM servers WHERE state not in ('running','shutting-down','dbprep') AND (strftime('%s','now') - start_time) > 2628000;")
  (sqlite3:execute mdb "DELETE FROM servers WHERE state like 'defunct%' AND port=-1 AND (strftime('%s','now') - start_time) > 600;")
  )

(define (tasks:server-set-state! mdb server-id state)
  (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;" state server-id))

(define (tasks:server-set-interface-port mdb server-id interface port)
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
   (let* ((header (list "id" "hostname" "pid" "interface" "port" "pubport" "state" "run_id" "priority" "start_time"))
	  (selstr (string-intersperse header ","))
	  (res    '()))
    (sqlite3:for-each-row
     (lambda (a . b)
       (set! res (cons (apply vector a b) res)))
     mdb
     (conc "SELECT " selstr " FROM servers WHERE run_id=? AND state in ('available','running') ORDER BY start_time DESC;")
     run-id)
    (vector header res)))

(define (tasks:get-server mdb run-id)
  (let ((res  #f)
	(best #f))
    (sqlite3:for-each-row







|







262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
   (let* ((header (list "id" "hostname" "pid" "interface" "port" "pubport" "state" "run_id" "priority" "start_time"))
	  (selstr (string-intersperse header ","))
	  (res    '()))
    (sqlite3:for-each-row
     (lambda (a . b)
       (set! res (cons (apply vector a b) res)))
     mdb
     (conc "SELECT " selstr " FROM servers WHERE run_id=? AND state in ('available','running','dbprep') ORDER BY start_time DESC;")
     run-id)
    (vector header res)))

(define (tasks:get-server mdb run-id)
  (let ((res  #f)
	(best #f))
    (sqlite3:for-each-row
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
    res))

(define (tasks:server-running-or-starting? mdb run-id)
  (let ((res #f))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     mdb
     "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'available' AND  (strftime('%s','now') - start_time) < 30));" run-id)
    res))

(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row
     (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id)
       ;;                       0   1        2         3    4       5          6        7     8          9          10        11     12







|
|







285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
    res))

(define (tasks:server-running-or-starting? mdb run-id)
  (let ((res #f))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     mdb ;; NEEDS dbprep ADDED
     "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND  (strftime('%s','now') - start_time) < 60));" run-id)
    res))

(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row
     (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id)
       ;;                       0   1        2         3    4       5          6        7     8          9          10        11     12
331
332
333
334
335
336
337

338
339
340
341
342
343
344
	    (if (equal? hostname (get-host-name))
		(begin
		  (debug:print-info 1 "Sending signal/term to " pid " on " hostname)
		  (process-signal pid signal/term)  ;; local machine, send sig term
		  (thread-sleep! 5)                 ;; give it five seconds to die peacefully then do a brutal kill
		  (process-signal pid signal/kill)) 
		(debug:print 0 "WARNING: Can't kill frozen server on remote host " hostname))))))


;;======================================================================
;; Tasks and Task monitors
;;======================================================================


;;======================================================================







>







331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
	    (if (equal? hostname (get-host-name))
		(begin
		  (debug:print-info 1 "Sending signal/term to " pid " on " hostname)
		  (process-signal pid signal/term)  ;; local machine, send sig term
		  (thread-sleep! 5)                 ;; give it five seconds to die peacefully then do a brutal kill
		  (process-signal pid signal/kill)) 
		(debug:print 0 "WARNING: Can't kill frozen server on remote host " hostname))))))


;;======================================================================
;; Tasks and Task monitors
;;======================================================================


;;======================================================================

tests/installall/config/megatest.config.dat became a regular file with contents [736a5da885].

tests/installall/config/runconfigs.config.dat became a regular file with contents [3b8f260acb].