Megatest

Check-in [45da129709]
Login
Overview
Comment:Improved reliability but now have issue with connection.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.64
Files: files | file ages | folders
SHA1: 45da1297091429b7ae29dc66f4de59594c2ca311
User & Date: matt on 2017-03-28 00:35:23
Other Links: branch diff | manifest | tags
Context
2017-03-28
01:01
Should reset the connection struct when a connection cannot be made. check-in: 333f542a2e user: matt tags: v1.64
00:35
Improved reliability but now have issue with connection. check-in: 45da129709 user: matt tags: v1.64
2017-03-27
23:59
Protected the config-file call to delete-file with an exception handler. Fixed logic on connecting using CMDINFO. Fixed -list-servers and -kill-servers. Turned exception handler back on in portlogger. Removed the addition of a little noise from the server timeout handling in rmt.scm. check-in: 7b318f91bd user: matt tags: v1.64
Changes

Modified common.scm from [5a79ae269f] to [735742a8cc].

153
154
155
156
157
158
159
160

161
162
163
164
165
166
167
153
154
155
156
157
158
159

160
161
162
163
164
165
166
167







-
+








;; remote: record describing a connection to a server (url, connection
;; data, transport, timeout and read-only flags).
(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  ;; `and` instead of a one-armed `if`: when *toppath* is not set the slot
  ;; is #f rather than an unspecified (and therefore truthy) value.
  (server-url        (and *toppath* (server:check-if-running *toppath*)))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f) ;; connection data; #f means there is no current connection
  (transport         *transport-type*)
  ;; single definition of the slot (it was previously listed twice);
  ;; the value comes straight from server:get-timeout
  (server-timeout    (server:get-timeout))
  (force-server      #f)
  (ro-mode           #f)
  (ro-mode-checked   #f)) ;; flag that indicates we have checked for ro-mode

;; launching and hosts
(defstruct host
  (reachable    #f)

Modified launch.scm from [f8bf4a3053] to [57b9002fc8].

476
477
478
479
480
481
482
483
484
485



486
487
488
489
490
491
492
493
494
476
477
478
479
480
481
482



483
484
485


486
487
488
489
490
491
492







-
-
-
+
+
+
-
-







				     ping-res)
				;; (begin ;; let ((url  (http-transport:server-dat-make-url start-res)))
				(begin
				  (remote-conndat-set! *runremote* start-res)
				  ;; (remote-server-url-set! *runremote* url)
				  ;; (if (server:ping url)
				  (debug:print-info 0 *default-log-port* "connected to " host ":" port " using CMDINFO data."))
				;; (begin
				;; 	(debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " url)
				;; 	(remote-conndat-set! *runremote* #f)
				(begin
				  (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " host ":" port)
				  (remote-conndat-set! *runremote* #f))
				;; 	(remote-server-url-set! *runremote* #f))))
				(debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.")
				))
			  (begin
			    (debug:print-info 0 *default-log-port* (if host-port
								       (conc "received invalid host-port information " host-port)
								       "no host-port information received"))
			    ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare.
			    (set! needcare #t)))

Modified rmt.scm from [01e080d921] to [92ef1e6d91].

95
96
97
98
99
100
101
102

103
104

105
106
107
108
109
110
111
95
96
97
98
99
100
101

102
103

104
105
106
107
108
109
110
111







-
+

-
+







      (debug:print 0 *default-log-port* "WARNING: write transaction requested on a readonly area.  cmd="cmd" params="params)
      #f
      )

     ;; reset the connection if it has been unused too long
     ((and runremote
           (remote-conndat runremote)
	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))))) ;; NOTE: REMOVED the 30 second noise. If adding it back be sure to offset!! add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts
	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #<request>.\n message: Server closed connection before sending response"
	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
      (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.")
      (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a record for our connection for given area
     ((not runremote)                  ;; can remove this one. should never get here.         
      (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration
      (mutex-unlock! *rmt-mutex*)