Megatest

Check-in [a097994a72]
Login
Overview
Comment:Check that server isn't already running before doing kind-run. This was working fine with low load but when load got high it would run away.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | runaway-servers-fix
Files: files | file ages | folders
SHA1: a097994a72e93993fdfefea11d3fe73e73531f64
User & Date: matt on 2017-03-23 00:53:42
Other Links: branch diff | manifest | tags
Context
2017-03-23
11:34
Fixed wrong testsuite name calculation problem. check-in: 8c653071c7 user: matt tags: runaway-servers-fix
01:18
merged runaway servers fix check-in: c80ac44cb9 user: bjbarcla tags: v1.63
00:53
Check that server isn't already running before doing kind-run. This was working fine with low load but when load got high it would run away. check-in: a097994a72 user: matt tags: runaway-servers-fix
2017-03-22
14:44
applied patches for db init on non-homehost protection & no-cache mode check-in: 9ffefc583c user: bjbarcla tags: v1.63
Changes

Modified client.scm from [d740aa52d4] to [3c66569adb].

106
107
108
109
110
111
112
113


114
115
116
117
118
106
107
108
109
110
111
112

113
114
115
116

117
118







-
+
+


-


			  (case *transport-type* 
			    ((http)(http-transport:close-connections)))
			  (remote-conndat-set! runremote #f)  ;; (hash-table-delete! runremote run-id)
			  (thread-sleep! 1)
			  (client:setup-http areapath remaining-tries: (- remaining-tries 1))
			  )))
		  (begin    ;; no server registered
		    (server:kind-run areapath)
		    ;; (server:kind-run areapath)
		    (server:start-and-wait areapath)
		    (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries)
		    (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
		    (server:start-and-wait areapath)
		    (client:setup-http areapath remaining-tries: (- remaining-tries 1)))))))))

Modified common.scm from [3bdeeed025] to [ef5ed6728b].

148
149
150
151
152
153
154
155


156
157
158
159
160
161
162
148
149
150
151
152
153
154

155
156
157
158
159
160
161
162
163







-
+
+








(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f)
  (transport         *transport-type*)
  (server-timeout    (or (server:get-timeout) 100))) ;; default to 100 seconds
  (server-timeout    (or (server:get-timeout) 100))
  (force-server      #f)) ;; default to 100 seconds

;; launching and hosts
(defstruct host
  (reachable    #f)
  (last-update  0)
  (last-used    0)
  (last-cpuload 1))
1011
1012
1013
1014
1015
1016
1017












1018
1019
1020
1021
1022
1023
1024
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037







+
+
+
+
+
+
+
+
+
+
+
+








;; do we honor the caches of the config files?
;;
(define (common:use-cache?)
  (not (or (args:get-arg "-no-cache")
	   (and *configdat*
		(equal? (configf:lookup *configdat* "setup" "use-cache") "no")))))

;; force use of server?
;;
(define (common:force-server?)
  (let ((force-setting (configf:lookup "server" "force"))
	(force-type    (if force-setting (string->symbol force-setting) #f)))
    (case force-type
      ((#f)     #f)
      ((always) #t)
      ((test)   (if (args:get-arg "-execute") ;; we are in a test
		    #t
		    #f)))))

;;======================================================================
;; M I S C   L I S T S
;;======================================================================

;; items in lista are matched value and position in listb
;; return the remaining items in listb or #f

Modified rmt.scm from [37fe170e89] to [9187b44a8c].

105
106
107
108
109
110
111

112

113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128

129

130
131

132
133
134
135
136

137

138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

154
155
156

157

158
159
160
161
162
163
164
105
106
107
108
109
110
111
112

113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130

131
132

133
134
135
136
137
138
139

140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161

162
163
164
165
166
167
168
169







+
-
+
















+
-
+

-
+





+
-
+
















+



+
-
+







     ((not (pair? (remote-hh-dat runremote)))  ;; not on homehost
      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
      (remote-hh-dat-set! runremote (common:get-homehost))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  2")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; on homehost and this is a read
     ((and (not (remote-force-server runremote))      ;; honor forced use of server
     ((and (cdr (remote-hh-dat runremote))   ;; on homehost
	   (cdr (remote-hh-dat runremote))     ;; on homehost
           (member cmd api:read-only-queries)) ;; this is a read
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  3")
      (rmt:open-qry-close-locally cmd 0 params))

     ;; on homehost and this is a write, we already have a server, but server has died
     ((and (cdr (remote-hh-dat runremote))         ;; on homehost
           (not (member cmd api:read-only-queries))  ;; this is a write
           (remote-server-url runremote)           ;; have a server
           (not (server:check-if-running *toppath*)))  ;; server has died.
      (set! *runremote* (make-remote))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))

     ;; on homehost and this is a write, we already have a server
     ((and (not (remote-force-server runremote))     ;; honor forced use of server
     ((and (cdr (remote-hh-dat runremote))         ;; on homehost
	   (cdr (remote-hh-dat runremote))           ;; on homehost
           (not (member cmd api:read-only-queries))  ;; this is a write
           (remote-server-url runremote))          ;; have a server
           (remote-server-url runremote))            ;; have a server
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4")
      (rmt:open-qry-close-locally cmd 0 params))

     ;;  on homehost, no server contact made and this is a write, passively start a server 
     ((and (not (remote-force-server runremote)) ;; honor forced use of server
     ((and (cdr (remote-hh-dat runremote)) ; new
	   (cdr (remote-hh-dat runremote))       ;; new
           (not (remote-server-url runremote))
	   (not (member cmd api:read-only-queries)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
      (let ((server-url  (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
	(if server-url
	    (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed
	    (server:kind-run *toppath*)))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.1")
      (rmt:open-qry-close-locally cmd 0 params))

     ((and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
           (not (remote-conndat runremote)))            ;; and no connection
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6  hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
      (mutex-unlock! *rmt-mutex*)
      (server:start-and-wait *toppath*)
      (if (common:force-server?)(remote-force-server-set! runremote #t))
      (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
     ;; all set up if get this far, dispatch the query
     ((and (not (remote-force-server runremote))
     ((cdr (remote-hh-dat runremote)) ;; we are on homehost
	   (cdr (remote-hh-dat runremote))) ;; we are on homehost
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  7")
      (rmt:open-qry-close-locally cmd (if rid rid 0) params))

     ;; not on homehost, do server query
     (else
      (mutex-unlock! *rmt-mutex*)

Modified server.scm from [c9206854e2] to [8b1ebf4820].

255
256
257
258
259
260
261

262
263
264
265
266
267
268
269
270
271
272
273

















274
275
276
277
278
279
280
255
256
257
258
259
260
261
262












263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286







+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))

;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
(define (server:kind-run areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
  (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
         (call-num     (car last-run-dat))
         (when-run     (cadr last-run-dat))
         (run-delay    (+ (case call-num
                            ((0)    0)
                            ((1)   20)
                            ((2)  300)
                            (else 600))
                          (random 5)))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
    (if	(> (- (current-seconds) when-run) run-delay)
        (server:run areapath))
    (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))
      (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
	     (call-num     (car last-run-dat))
	     (when-run     (cadr last-run-dat))
	     (run-delay    (+ (case call-num
				((0)    0)
				((1)   20)
				((2)  300)
				(else 600))
			      (random 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	(if	(> (- (current-seconds) when-run) run-delay)
		(begin
		  (common:simple-file-lock lock-file expire-time: 15)
		  (server:run areapath)
		  (thread-sleep! 5) ;; don't release the lock for at least a few seconds
		  (common:simple-file-release-lock lock-file)))
	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))

(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-url (server:check-if-running areapath)))
      (if (or server-url
	      (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
	  server-url