Megatest

Check-in [fd6bcc8b75]
Login
Overview
Comment:Corrected issue with server overload handling. Switch to using a less burdensome server ping instead of server:check-if-running
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.64
Files: files | file ages | folders
SHA1: fd6bcc8b75450d820420f86aa3bd4122833085b6
User & Date: matt on 2017-06-01 00:21:31
Other Links: branch diff | manifest | tags
Context
2017-06-01
22:08
Fixed sync to megatest.db, it was not respecting last_update and thus syncing everything. Added hard storage of last update in a no-sync db on /tmp check-in: b665288f56 user: matt tags: v1.64
00:21
Corrected issue with server overload handling. Switch to using a less burdensome server ping instead of server:check-if-running check-in: fd6bcc8b75 user: matt tags: v1.64
2017-05-31
23:05
Improved messages in couple noisy cases. Add flag for turning off touching of server logs when max api calls situation occurs check-in: bc2a1779bc user: matt tags: v1.64
Changes

Modified http-transport.scm from [2c955425b7] to [e6367359e1].

293
294
295
296
297
298
299





300
301


302
303
304
305
306
307
308
293
294
295
296
297
298
299
300
301
302
303
304


305
306
307
308
309
310
311
312
313







+
+
+
+
+
-
-
+
+







(define (http-transport:close-connections #!key (area-dat #f))
  (let* ((runremote  (or area-dat *runremote*))
	 (server-dat (if runremote
                         (remote-conndat runremote)
                         #f))) ;; (hash-table-ref/default *runremote* run-id #f)))
    (if (vector? server-dat)
	(let ((api-dat (http-transport:server-dat-get-api-uri server-dat)))
	  (handle-exceptions
	    exn
	    (begin
	      (print-call-chain *default-log-port*)
	      (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn)))
	  (close-connection! api-dat)
	  #t)
	    (close-connection! api-dat)
	    #t))
	#f)))


(define (make-http-transport:server-dat)(make-vector 6))
(define (http-transport:server-dat-get-iface         vec)    (vector-ref  vec 0))
(define (http-transport:server-dat-get-port          vec)    (vector-ref  vec 1))
(define (http-transport:server-dat-get-api-uri       vec)    (vector-ref  vec 2))
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448

449

450
451
452
453
454
455
456
437
438
439
440
441
442
443

444
445
446
447
448
449
450
451
452
453

454
455
456
457
458
459
460
461







-









+
-
+







	     (adjusted-timeout (if (> hrs-since-start 1)
				   (- server-timeout (inexact->exact (round (* hrs-since-start 60))))  ;; subtract 60 seconds per hour
				   server-timeout)))
	(if (common:low-noise-print 120 "server timeout")
	    (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout))
	(cond
         ((and *server-run*
	       (not *server-overloaded*)
	       (> (+ last-access server-timeout)
		  (current-seconds))
	       (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour.
          (if (common:low-noise-print 120 "server continuing")
              (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	      (let ((curr-time (current-seconds)))
		(handle-exceptions
		    exn
		    (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk?")
		  (if (not *server-overloaded*)
		  (change-file-times server-log-file curr-time curr-time))))
		      (change-file-times server-log-file curr-time curr-time)))))
          (loop 0 server-state bad-sync-count (current-milliseconds)))
         (else
          (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
          (http-transport:server-shutdown port)))))))

(define (http-transport:server-shutdown port)
  (begin

Modified megatest-version.scm from [132e4528f7] to [397d84c9d5].

1
2
3
4
5
6

7
1
2
3
4
5

6
7





-
+

;; Always use two or four digit decimal
;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..

(declare (unit megatest-version))

(define megatest-version 1.6416)
(define megatest-version 1.6417)

Modified rmt.scm from [f39fa419cf] to [7ffa1b548f].

131
132
133
134
135
136
137
138
139
140
141
142
143
144

145
146
147
148
149
150
151
131
132
133
134
135
136
137







138
139
140
141
142
143
144
145







-
-
-
-
-
-
-
+







     ;;DOT CASE4 -> "rmt:send-receive";
     ;; reset the connection if it has been unused too long
     ((and runremote
           (remote-conndat runremote)
	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #<request>.\n message: Server closed connection before sending response"
	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
      (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.")
      (let ((old-conn (remote-conndat runremote)))
	(handle-exceptions
	    exn
	    (begin
	      (print-call-chain *default-log-port*)
	      (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn)))
	  (http-transport:close-connections area-dat: runremote)))
      (http-transport:close-connections area-dat: runremote)
      (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))

     ;;DOT CASE5 [label="local\nread"];
     ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5};
     ;;DOT CASE5 -> "rmt:open-qry-close-locally";
254
255
256
257
258
259
260

261
262
263
264
265
266
267
268

269
270
271
272


273
274
275
276
277
278
279
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266


267
268
269
270
271
272
273
274
275







+








+


-
-
+
+







	(if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end
	    (if (and (vector? res)
		     (eq? (vector-length res) 2)
		     (eq? (vector-ref res 1) 'overloaded)) ;; since we are looking at the data to carry the error we'll use a fairly obtuse combo to minimise the chances of some sort of collision.
		(let ((wait-delay (+ attemptnum (* attemptnum 10))))
		  (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
		  (mutex-lock! *rmt-mutex*)
		  (http-transport:close-connections area-dat: runremote)
		  (set! *runremote* #f) ;; force starting over
		  (mutex-unlock! *rmt-mutex*)
		  (thread-sleep! wait-delay)
		  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
		res) ;; All good, return res
	    (begin
	      (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
	      (remote-conndat-set!    runremote #f)
	      (http-transport:close-connections area-dat: runremote)
	      (remote-server-url-set! runremote #f)
	      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9.1")
	      (if (not (server:check-if-running *toppath*))
		  (server:start-and-wait *toppath*))
	      ;; (if (not (server:check-if-running *toppath*))
	      ;; 	  (server:start-and-wait *toppath*))
	      (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))))))))

    ;;DOT }
    
;; (define (rmt:update-db-stats run-id rawcmd params duration)
;;   (mutex-lock! *db-stats-mutex*)
;;   (handle-exceptions

Modified runs.scm from [180cc9d9e4] to [50559e5a2a].

1267
1268
1269
1270
1271
1272
1273



1274

1275
1276
1277
1278
1279
1280
1281
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276

1277
1278
1279
1280
1281
1282
1283
1284







+
+
+
-
+







			   itemmaps:    itemmaps
			   ;; prereqs-not-met: prereqs-not-met
			   )))
	(runs:dat-regfull-set! runsdat regfull)

	;; every 15 minutes verify the server is there for this run
	(if (and (common:low-noise-print 240 "try start server"  run-id)
		 (not (or (and *runremote*
			       (remote-server-url *runremote*)
			       (server:ping (remote-server-url *runremote*)))
		 (not (server:check-if-running *toppath*)))
			  (server:check-if-running *toppath*))))
	    (server:kind-run *toppath*))
	
	(if (> num-running 0)
	  (set! last-time-some-running (current-seconds)))

      (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
	  (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))

Modified tasks.scm from [7b85a80157] to [5e421a88af].

193
194
195
196
197
198
199
200

201
202
203
204
205
206
207
193
194
195
196
197
198
199

200
201
202
203
204
205
206
207







-
+







    (setenv "TARGETHOST_LOGF" (conc logdir "server-kills.log"))

    (system (conc "nbfake kill "kill-switch" "pid))

    (when logfile
      (thread-sleep! 0.5)
      (if (file-exists? gzfile) (delete-file gzfile))
      (system (conc "gzip "logfile))
      (system (conc "gzip " logfile))
      
      (unsetenv "TARGETHOST_LOGF")
      (unsetenv "TARGETHOST"))))
    
 
;;======================================================================
;; M O N I T O R S