Megatest

Check-in [ba2401c3f6]
Login
Overview
Comment:Force starting a server and wait for it when launching runs. This prevents server run-away but doesn't fix the underlying issue.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.64
Files: files | file ages | folders
SHA1: ba2401c3f651fbfd95ef94256dfc81f6828424fc
User & Date: matt on 2017-03-28 10:48:26
Other Links: branch diff | manifest | tags
Context
2017-03-28
10:54
Force starting a server and wait for it when launching runs. This prevents server run-away but doesn't fix the underlying issue. check-in: 27b1636e7b user: matt tags: v1.64
10:48
Force starting a server and wait for it when launching runs. This prevents server run-away but doesn't fix the underlying issue. check-in: ba2401c3f6 user: matt tags: v1.64
08:39
Fixed unit test running. To try do: cd tests;make all-rmt.log. Improved information from a server-side crash. check-in: 263cdeb6eb user: matt tags: v1.64
Changes

Modified api.scm from [ca9ed8f403] to [d7ff6e57f4].

124
125
126
127
128
129
130
131

132
133
134
135
136
137
138
124
125
126
127
128
129
130

131
132
133
134
135
136
137
138







-
+







     (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
     (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f (vector #f "remote must be called with a vector")))
    ((> *api-process-request-count* 20) ;; 20)
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
     (vector #f (vector 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
     (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
    (else  
     (let* ((cmd-in            (vector-ref dat 0))
            (cmd               (if (symbol? cmd-in)
				   cmd-in
				   (string->symbol cmd-in)))
            (params            (vector-ref dat 1))
            (start-t           (current-milliseconds))

Modified http-transport.scm from [6eea7c3a25] to [826398cdb7].

223
224
225
226
227
228
229
230

231
232
233
234
235


236
237
238


239
240
241
242
243
244
245
223
224
225
226
227
228
229

230
231
232
233
234

235
236
237
238

239
240
241
242
243
244
245
246
247







-
+




-
+
+


-
+
+







       ;; send the data and get the response
       ;; extract the needed info from the http data and 
       ;; process and return it.
       (let* ((send-recieve (lambda ()
			      (mutex-lock! *http-mutex*)
			      ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines)
			      ;;					       ((exn http client-error) e (print e)))
			      (set! res (vector
			      (set! res (vector                ;;; DON'T FORGET - THIS IS THE CLIENT SIDE! NOTE: consider moving this to client.scm since we are only supporting http transport at this time.
					 success
					 (db:string->obj 
					  (handle-exceptions
					      exn
					      (begin
					      (let ((call-chain (get-call-chain))
						    (msg        ((condition-property-accessor 'exn 'message) exn)))
						(set! success #f)
						(debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".")
						(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
						(debug:print 0 *default-log-port* " message: " msg)
						(debug:print 0 *default-log-port* " cmd: " cmd " params: " params)
						(if runremote
						    (remote-conndat-set! runremote #f))
						;; Killing associated server to allow clean retry.")
						;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine?
						(mutex-unlock! *http-mutex*)
					     ;;; (signal (make-composite-condition
					     ;;;          (make-property-condition 'commfail 'message "failed to connect to server")))

Modified megatest.scm from [84dec1a162] to [1cdd8cf912].

794
795
796
797
798
799
800
801

802
803
804
805
806
807
808
794
795
796
797
798
799
800

801
802
803
804
805
806
807
808







-
+







		     (begin
		       (debug:print-info 0 *default-log-port* "Attempting to kill server with pid " pid)
		       (server:kill server)))))
	     (sort servers (lambda (a b)
			     (let ((ma (or (any->number (car a)) 9e9))
				   (mb (or (any->number (car b)) 9e9)))
			       (> ma mb)))))
	    (debug:print-info 1 *default-log-port* "Done with listservers")
	    ;; (debug:print-info 1 *default-log-port* "Done with listservers")
	    (set! *didsomething* #t)
	    (exit))
	  (exit))))
      ;; must do, would have to add checks to many/all calls below

;;======================================================================
;; Weird special calls that need to run *after* the server has started?

Modified rmt.scm from [2aa9d28934] to [816af6a52f].

194
195
196
197
198
199
200
201
202



203
204



205
206
207
208
209
210
211
194
195
196
197
198
199
200


201
202
203
204
205
206
207
208
209
210
211
212
213
214
215







-
-
+
+
+


+
+
+







	     (success  (if (vector? dat) (vector-ref dat 0) #f))
	     (res      (if (vector? dat) (vector-ref dat 1) #f)))
	(if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time
	;; (mutex-unlock! *rmt-mutex*)
        (debug:print-info 13 *default-log-port* "rmt:send-receive, case  9. conninfo=" conninfo " dat=" dat " runremote = " runremote)
	(mutex-unlock! *rmt-mutex*)
	(if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end
	    (if (and (symbol? res)
		     (eq? res 'overloaded))
	    (if (and (vector? res)
		     (eq? (vector-length res) 2)
		     (eq? (vector-ref res 2) 'overloaded)) ;; since we are looking at the data to carry the error we'll use a fairly obtuse combo to minimise the chances of some sort of collision.
		(let ((wait-delay (+ attemptnum (* attemptnum 10))))
		  (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
		  (mutex-lock! *rmt-mutex*)
		  (set! *runremote* #f) ;; force starting over
		  (mutex-unlock! *rmt-mutex*)
		  (thread-sleep! wait-delay)
		  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
		res) ;; All good, return res
	    (begin
	      (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
	      (remote-conndat-set!    runremote #f)
	      (remote-server-url-set! runremote #f)

Modified runs.scm from [ad868adae8] to [b7a198d7dd].

251
252
253
254
255
256
257




258
259
260
261
262
263
264
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268







+
+
+
+







					       (exit 4)))))
		       (thread-start! th2)
		       (thread-start! th1)
		       (thread-join! th2)))))
      (set-signal-handler! signal/int sighand)
      (set-signal-handler! signal/term sighand))

    ;; force the starting of a server
    (debug:print 0 *default-log-port* "waiting on server...")
    (server:start-and-wait *toppath*)
    
    (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process
    (set! runconf (if (file-exists? runconfigf)
		      (setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target)
		      (begin
			(debug:print 0 *default-log-port* "WARNING: You do not have a run config file: " runconfigf)
			#f)))

1174
1175
1176
1177
1178
1179
1180

1181
1182
1183




1184
1185
1186
1187
1188
1189
1190
1191
1178
1179
1180
1181
1182
1183
1184
1185



1186
1187
1188
1189

1190
1191
1192
1193
1194
1195
1196







+
-
-
-
+
+
+
+
-







			   waitons:     waitons
			   testmode:    testmode
			   newtal:      newtal
			   itemmaps:    itemmaps
			   ;; prereqs-not-met: prereqs-not-met
			   )))
	(runs:dat-regfull-set! runsdat regfull)

	;; every couple minutes verify the server is there for this run
	;; (if (and (common:low-noise-print 60 "try start server"  run-id)
	;; 	 (tasks:need-server run-id))
	;; every 15 minutes verify the server is there for this run
	(if (and (common:low-noise-print 240 "try start server"  run-id)
		 (not (server:check-if-running *toppath*)))
	    (server:kind-run *toppath*))
	;;     (tasks:start-and-wait-for-server tdbdat run-id 10)) ;; NOTE: delay and wait is done under the hood
	
	(if (> num-running 0)
	  (set! last-time-some-running (current-seconds)))

      (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
	  (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
	;; (debug:print 0 *default-log-port* "max-tries-hash: " (hash-table->alist *max-tries-hash*))

Modified server.scm from [0a5d68ff36] to [d6964e3100].

216
217
218
219
220
221
222

223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
























244
245
246
247
248
249
250
216
217
218
219
220
221
222
223





















224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254







+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







;;
;; mod-time host port start-time pid
;;
;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off
;; and servers should stick around for about two hours or so.
;;
(define (server:get-best srvlst)
  (let* ((nums (server:get-num-servers))
  (let ((now (current-seconds)))
    (sort
     (filter (lambda (rec)
	       (if (and (list? rec)
			(> (length rec) 2))
		   (let ((start-time (list-ref rec 3))
			 (mod-time   (list-ref rec 0)))
		     ;; (print "start-time: " start-time " mod-time: " mod-time)
		     (and start-time mod-time
			  (> (- now start-time) 0)    ;; been running at least 0 seconds
			  (< (- now mod-time)   16)   ;; still alive - file touched in last 16 seconds
			  (< (- now start-time) 
			     (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600"))
				   180)
				(random 360))) ;; under one hour running time +/- 180
			  ))
		   #f))
	     srvlst)
     (lambda (a b)
       (< (list-ref a 3)
	  (list-ref b 3))))))
	 (now  (current-seconds))
	 (slst (sort
		(filter (lambda (rec)
			  (if (and (list? rec)
				   (> (length rec) 2))
			      (let ((start-time (list-ref rec 3))
				    (mod-time   (list-ref rec 0)))
				;; (print "start-time: " start-time " mod-time: " mod-time)
				(and start-time mod-time
				     (> (- now start-time) 0)    ;; been running at least 0 seconds
				     (< (- now mod-time)   16)   ;; still alive - file touched in last 16 seconds
				     (< (- now start-time) 
					(+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600"))
					      180)
					   (random 360))) ;; under one hour running time +/- 180
				     ))
			      #f))
			srvlst)
		(lambda (a b)
		  (< (list-ref a 3)
		     (list-ref b 3))))))
    (if (> (length slst) nums)
	(take slst nums)
	slst)))

(define (server:get-first-best areapath)
  (let ((srvrs (server:get-best (server:get-list areapath))))
    (if (and srvrs
	     (not (null? srvrs)))
	(car srvrs)
	#f)))
303
304
305
306
307
308
309





310
311
312
313
314


315
316
317
318
319
320
321
322
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321


322
323

324
325
326
327
328
329
330







+
+
+
+
+



-
-
+
+
-







	  (let ((num-ok (length (server:get-best (server:get-list areapath)))))
	    (if (< num-ok 1) ;; if there are no decent candidates for servers then try starting a new one
		(server:kind-run areapath))
	    (thread-sleep! 5)
	    (loop (server:check-if-running areapath)))))))

(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG.

(define (server:get-num-servers #!key (numservers 2))
  (let ((ns (string->number
	     (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
    (or ns numservers)))

;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath #!key (numservers "2"))
  (let* ((ns            (string->number
(define (server:check-if-running areapath) ;;  #!key (numservers "2"))
  (let* ((ns            (server:get-num-servers))
			 (or (configf:lookup *configdat* "server" "numservers") numservers)))
	 (servers       (server:get-best (server:get-list areapath))))
    ;; (print "servers: " servers " ns: " ns)
    (if (or (and servers
		 (null? servers))
	    (not servers)
	    (and (list? servers)
		 (< (length servers) (random ns)))) ;; somewhere between 0 and numservers