Megatest

Check-in [57e72d2e76]
Login
Overview
Comment:merged in protection against too demanding clients
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.63
Files: files | file ages | folders
SHA1: 57e72d2e764ae10d38e3187fbdab7d23b90bb469
User & Date: bjbarcla on 2017-03-24 15:39:13
Other Links: branch diff | manifest | tags
Context
2017-03-24
22:39
Merged v1.63 changes into v1.64 check-in: b2e6452e2f user: matt tags: v1.64
15:39
merged in protection against too demanding clients Leaf check-in: 57e72d2e76 user: bjbarcla tags: v1.63
15:18
Reduced threshold for server pushback on clients to 20 api calls/parallel. Added couple more calls to check if server is running before starting a new one. check-in: 37e149d637 user: matt tags: multi-server-hack
11:19
Added some error handling on the locking calls. Silenced a dashboard message. check-in: a5dbcdd2ac user: matt tags: v1.63
Changes

Modified api.scm from [4067424284] to [466ca51286].

123
124
125
126
127
128
129


130
131
132
133
134
135
136
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138







+
+







     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer")
     (print-call-chain (current-error-port))
     (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
     (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f #f "remote must be called with a vector")       )
    ((> *api-process-request-count* 20)
     (vector #f 'overloaded))
    (else  
     (let* ((cmd-in (vector-ref dat 0))
            (cmd    (if (symbol? cmd-in)
                        cmd-in
                        (string->symbol cmd-in)))
            (params (vector-ref dat 1))
            (start-t (current-milliseconds))
269
270
271
272
273
274
275
276
277
278
279
280
281







282
283
284
285
286
287
288
271
272
273
274
275
276
277






278
279
280
281
282
283
284
285
286
287
288
289
290
291







-
-
-
-
-
-
+
+
+
+
+
+
+







                   ((ping)                         (current-process-id))

                   ;; TESTMETA
                   ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))

                   ;; TASKS 
                   ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))))))
       (if (not writecmd-in-readonly-mode)
           (let ((delta-t (- (current-milliseconds)
                             start-t)))
             (hash-table-set! *db-api-call-time* cmd
                              (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))
             (vector #t res))
       ;; save all stats
       (let ((delta-t (- (current-milliseconds)
			 start-t)))
	 (hash-table-set! *db-api-call-time* cmd
			  (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
       (if (not writecmd-in-readonly-mode)
	   (vector #t res)
           (vector #f res)))))))

;; http-server  send-response
;;                 api:process-request
;;                    db:*
;;
;; NB// Runs on the server as part of the server loop

Modified client.scm from [3c66569adb] to [5611deb23d].

76
77
78
79
80
81
82
83

84
85
86
87
88
89
90
76
77
78
79
80
81
82

83
84
85
86
87
88
89
90







-
+







      (begin
	(debug:print-error 0 *default-log-port* "failed to start or connect to server")
	(exit 1))
      ;;
      ;; Alternatively here, we can get the list of candidate servers and work our way
      ;; through them searching for a good one.
      ;;
      (let* ((server-dat (server:get-first-best areapath))
      (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath))
	     (runremote  (or area-dat *runremote*)))
	(if (not server-dat) ;; no server found
	    (client:setup-http areapath remaining-tries: (- remaining-tries 1))
	    (let ((host  (cadr  server-dat))
		  (port  (caddr server-dat)))
	      (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	      (if (and (not area-dat)

Modified common.scm from [4f52eb87fd] to [9d413de21a].

148
149
150
151
152
153
154
155
156




157
158
159
160
161
162
163
148
149
150
151
152
153
154


155
156
157
158
159
160
161
162
163
164
165







-
-
+
+
+
+








(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f)
  (transport         *transport-type*)
  (server-timeout    (or (server:get-timeout) 100))
  (force-server      #f)) ;; default to 100 seconds
  (server-timeout    (or (server:get-timeout) 100)) ;; default to 100 seconds
  (force-server      #f)
  (ro-mode           #f)  
  (ro-mode-checked   #f)) ;; flag that indicates we have checked for ro-mode

;; launching and hosts
(defstruct host
  (reachable    #f)
  (last-update  0)
  (last-used    0)
  (last-cpuload 1))
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076















1077
1078
1079
1080
1081
1082
1083
1062
1063
1064
1065
1066
1067
1068










1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090







-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







	   (and *configdat*
		(equal? (configf:lookup *configdat* "setup" "use-cache") "no")))))

;; force use of server?
;;
(define (common:force-server?)
  (let* ((force-setting (configf:lookup *configdat* "server" "force"))
	 (force-type    (if force-setting (string->symbol force-setting) #f)))
    (case force-type
      ((#f)     #f)
      ((always) #t)
      ((test)   (if (args:get-arg "-execute") ;; we are in a test
		    #t
		    #f))
      (else
       (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.")
       #t)))) ;; default to requiring server 
	 (force-type    (if force-setting (string->symbol force-setting) #f))
	 (force-result  (case force-type
			  ((#f)     #f)
			  ((always) #t)
			  ((test)   (if (args:get-arg "-execute") ;; we are in a test
					#t
					#f))
			  (else
			   (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.")
			   #t)))) ;; default to requiring server
    (if force-result
	(begin
	  (debug:print-info 0 *default-log-port* "forcing use of server, force setting is \"" force-setting "\".")
	  #t)
	#f)))

;;======================================================================
;; M I S C   L I S T S
;;======================================================================

;; items in lista are matched value and position in listb
;; return the remaining items in listb or #f

Modified db.scm from [df76958ce7] to [feaa389617].

2094
2095
2096
2097
2098
2099
2100
2101


2102
2103
2104

2105

2106

2107
2108


2109
2110
2111
2112
2113
2114
2115
2094
2095
2096
2097
2098
2099
2100

2101
2102
2103
2104
2105
2106

2107
2108
2109
2110

2111
2112
2113
2114
2115
2116
2117
2118
2119







-
+
+



+
-
+

+

-
+
+








(define (db:print-current-query-stats)
  ;; generate stats from *db-api-call-time*
  (let ((ordered-keys (sort (hash-table-keys *db-api-call-time*)
			    (lambda (a b)
			      (let ((sum-a (common:sum (hash-table-ref *db-api-call-time* a)))
				    (sum-b (common:sum (hash-table-ref *db-api-call-time* b))))
				(> sum-a sum-b))))))
				(> sum-a sum-b)))))
	(total        0))
    (for-each
     (lambda (cmd-key)
       (let* ((dat  (hash-table-ref *db-api-call-time* cmd-key))
	      (num  (length dat))
	      (avg  (if (> (length dat) 0)
	      (avg  (if (> num 0)
			(/ (common:sum dat)(length dat)))))
	 (set! total (+ total num))
	 (debug:print-info 0 *default-log-port* cmd-key "\tavg: " avg " max: " (common:max dat) " min: " (common:min-max < dat) " num: " (length dat))))
     ordered-keys)))
     ordered-keys)
    (debug:print-info 0 *default-log-port* "TOTAL: " total " api calls since start.")))

(define (db:get-all-run-ids dbstruct)
  (db:with-db
   dbstruct
   #f
   #f
   (lambda (db)

Modified http-transport.scm from [44c2ce6eea] to [9751cbc3b5].

416
417
418
419
420
421
422
423




424
425
426
427
428
429
430
416
417
418
419
420
421
422

423
424
425
426
427
428
429
430
431
432
433







-
+
+
+
+







      (set! last-access *db-last-access*)
      (mutex-unlock! *heartbeat-mutex*)
      
      (if (common:low-noise-print 120 (conc "server running on " iface ":" port))
	  (begin
	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds))
	    (flush-output *default-log-port*)))

      (if (common:low-noise-print 60 "dbstats")
	  (begin
	    (debug:print 0 *default-log-port* "Server stats:")
	    (db:print-current-query-stats)))
      (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600))
	     (adjusted-timeout (if (> hrs-since-start 1)
				   (- server-timeout (inexact->exact (round (* hrs-since-start 60))))  ;; subtract 60 seconds per hour
				   server-timeout)))
	(if (common:low-noise-print 120 "server timeout")
	    (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout))
	(cond

Modified launch.scm from [442d86a53d] to [7ecb4d1b9c].

783
784
785
786
787
788
789
790

791
792
793
794
795
796
797
783
784
785
786
787
788
789

790
791
792
793
794
795
796
797







-
+







	     (target   (common:args-get-target))
	     (linktree (common:get-linktree))
	     (sections (if target (list "default" target) #f)) ;; for runconfigs
	     (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config 
	     (rundir   (if (and runname target linktree)(conc linktree "/" target "/" runname) #f))
	     (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-"  megatest-version "-" megatest-fossil-hash)))
	     (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-"  megatest-version "-" megatest-fossil-hash)))
	 (cancreate (and rundir (common:file-exists? rundir)(file-write-access? rundir)))
	     (cancreate (and rundir (common:file-exists? rundir)(file-write-access? rundir)))
	     (cxt       (hash-table-ref/default *contexts* toppath #f)))

	;; create our cxt for this area if it doesn't already exist
	(if (not cxt)(hash-table-set! *contexts* toppath (make-cxt)))

	;; (print "runname: " runname " target: " target " mtcachef: " mtcachef " rccachef: " rccachef)
	(set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource

Modified rmt.scm from [1adf35b1f4] to [a6b31e3da3].

54
55
56
57
58
59
60
61
62
63
64
65
















66
67
68
69
70
71
72
54
55
56
57
58
59
60





61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83







-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







  ;; do all the prep locked under the rmt-mutex
  (mutex-lock! *rmt-mutex*)

  ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
  ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
  ;; 3. do the query, if on homehost use local access
  ;;
  (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value
         (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
         (dbfile (conc *toppath* "/megatest.db"))
         (readonly-mode (not (file-write-access? dbfile))) ;; TODO: use dbstruct or runremote to figure this out in future
	 (runremote  (or area-dat *runremote*)))
  (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
         (areapath      *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
	 (runremote     (or area-dat
			    *runremote*))
	 (readonly-mode (if (and runremote
				 (remote-ro-mode-checked runremote))
			    (remote-ro-mode runremote)
			    (let* ((dbfile  (conc *toppath* "/megatest.db"))
				   (ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future
			      (if runremote
				  (begin
				    (remote-ro-mode-set! runremote ro-mode)
				    (remote-ro-mode-checked-set! runremote #t)
				    ro-mode)
				  ro-mode)))))

    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
    (cond
     ;; give up if more than 15 attempts
     ((> attemptnum 15)
      (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.")
      (exit 1))

92
93
94
95
96
97
98
99

100
101
102
103
104
105
106
103
104
105
106
107
108
109

110
111
112
113
114
115
116
117







-
+







	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts
	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
      (remote-conndat-set! runremote #f)
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a record for our connection for given area
     ((not runremote)                     
     ((not runremote)                  ;; can remove this one. should never get here.         
      (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  1")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a homehost record
     ((not (pair? (remote-hh-dat runremote)))  ;; not on homehost
      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
154
155
156
157
158
159
160

161

162
163
164
165
166
167
168
165
166
167
168
169
170
171
172

173
174
175
176
177
178
179
180







+
-
+








     ((or (and (remote-force-server runremote)              ;; we are forcing a server and don't yet have a connection to one
	       (not (remote-conndat runremote)))
	  (and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
	       (not (remote-conndat runremote))))           ;; and no connection
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6  hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
      (mutex-unlock! *rmt-mutex*)
      (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up?
      (server:start-and-wait *toppath*)
	  (server:start-and-wait *toppath*))
      (remote-force-server-set! runremote (common:force-server?))
      (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
     ;; all set up if get this far, dispatch the query
     ((and (not (remote-force-server runremote))
	   (cdr (remote-hh-dat runremote))) ;; we are on homehost
      (mutex-unlock! *rmt-mutex*)
193
194
195
196
197
198
199





200
201
202
203
204
205
206
207









208
209
210
211
212
213
214
205
206
207
208
209
210
211
212
213
214
215
216








217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232







+
+
+
+
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+







	      ((http)
	       (mutex-unlock! *rmt-mutex*)
	       res)
	      (else
	       (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " is unknown")
	       (mutex-unlock! *rmt-mutex*)
	       (exit 1)))
	    (if (eq? res 'overloaded)
		(let ((wait-delay (+ attemptnum (* attemptnum 10))))
		  (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
		  (thread-sleep! wait-delay)
		  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
	    (begin
	      (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
	      (remote-conndat-set!    runremote #f)
	      (remote-server-url-set! runremote #f)
              (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9.1")
	      (mutex-unlock! *rmt-mutex*)
	      (server:start-and-wait *toppath*)
	      (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))))))))
		(begin
		  (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
		  (remote-conndat-set!    runremote #f)
		  (remote-server-url-set! runremote #f)
		  (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9.1")
		  (mutex-unlock! *rmt-mutex*)
		  (if (not (server:check-if-running *toppath*))
		      (server:start-and-wait *toppath*))
		  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))))

;; (define (rmt:update-db-stats run-id rawcmd params duration)
;;   (mutex-lock! *db-stats-mutex*)
;;   (handle-exceptions
;;    exn
;;    (begin
;;      (debug:print 0 *default-log-port* "WARNING: stats collection failed in update-db-stats")

Modified server.scm from [ba8be5ee9a] to [65b45e968a].

165
166
167
168
169
170
171
172

173
174
175
176
177
178
179
180

181
182
183
184
185
186
187
165
166
167
168
169
170
171

172
173
174
175
176
177
178
179

180
181
182
183
184
185
186
187







-
+







-
+







(define (server:get-list areapath #!key (limit #f))
  (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
	(day-seconds (* 24 60 60)))
    ;; if the directory exists continue to get the list
    ;; otherwise attempt to create the logs dir and then
    ;; continue
    (if (if (directory-exists? (conc areapath "/logs"))
	    #t
	    '()
	    (if (file-write-access? areapath)
		(begin
		  (condition-case
		      (create-directory (conc areapath "/logs") #t)
		    (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
		    (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list.")))
		  (directory-exists? (conc areapath "/logs")))
		#f))
		'()))
	(let* ((server-logs   (glob (conc areapath "/logs/server-*.log")))
	       (num-serv-logs (length server-logs)))
	  (if (null? server-logs)
	      '()
	      (let loop ((hed  (car server-logs))
			 (tal  (cdr server-logs))
			 (res '()))
216
217
218
219
220
221
222


223
224
225
226
227
228
229
230
231
232
233












234
235
236
237
238
239
240
241
242
243
244










245
246
247
248
249
250
251
216
217
218
219
220
221
222
223
224











225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264







+
+
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+











+
+
+
+
+
+
+
+
+
+







;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off
;; and servers should stick around for about two hours or so.
;;
(define (server:get-best srvlst)
  (let ((now (current-seconds)))
    (sort
     (filter (lambda (rec)
	       (if (and (list? rec)
			(> (length rec) 2))
	       (let ((start-time (list-ref rec 3))
		     (mod-time   (list-ref rec 0)))
		 ;; (print "start-time: " start-time " mod-time: " mod-time)
		 (and start-time mod-time
		      (> (- now start-time) 0)    ;; been running at least 0 seconds
		      (< (- now mod-time)   16)   ;; still alive - file touched in last 16 seconds
		      (< (- now start-time) 
                         (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600"))
                               180)
                            (random 360))) ;; under one hour running time +/- 180
		      )))
		   (let ((start-time (list-ref rec 3))
			 (mod-time   (list-ref rec 0)))
		     ;; (print "start-time: " start-time " mod-time: " mod-time)
		     (and start-time mod-time
			  (> (- now start-time) 0)    ;; been running at least 0 seconds
			  (< (- now mod-time)   16)   ;; still alive - file touched in last 16 seconds
			  (< (- now start-time) 
			     (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600"))
				   180)
				(random 360))) ;; under one hour running time +/- 180
			  ))
		   #f))
	     srvlst)
     (lambda (a b)
       (< (list-ref a 3)
	  (list-ref b 3))))))

(define (server:get-first-best areapath)
  (let ((srvrs (server:get-best (server:get-list areapath))))
    (if (and srvrs
	     (not (null? srvrs)))
	(car srvrs)
	#f)))

(define (server:get-rand-best areapath)
  (let ((srvrs (server:get-best (server:get-list areapath))))
    (if (and (list? srvrs)
	     (not (null? srvrs)))
	(let* ((len (length srvrs))
	       (idx (random len)))
	  (list-ref srvrs idx))
	#f)))


(define (server:record->url servr)
  (match-let (((mod-time host port start-time pid)
	       servr))
    (if (and host port)
	(conc host ":" port)
	#f)))
290
291
292
293
294
295
296
297
298
299










300
301
302
303
304
305
306
303
304
305
306
307
308
309



310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326







-
-
-
+
+
+
+
+
+
+
+
+
+







	    (thread-sleep! 5)
	    (loop (server:check-if-running areapath)))))))

(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG.

;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath)
  (let* ((servers       (server:get-best (server:get-list areapath))))
    (if (null? servers)
(define (server:check-if-running areapath #!key (numservers "2"))
  (let* ((ns            (string->number
			 (or (configf:lookup *configdat* "server" "numservers") numservers)))
	 (servers       (server:get-best (server:get-list areapath))))
    ;; (print "servers: " servers " ns: " ns)
    (if (or (and servers
		 (null? servers))
	    (not servers)
	    (and (list? servers)
		 (< (length servers) (random ns)))) ;; somewhere between 0 and numservers
        #f
        (let loop ((hed (car servers))
                   (tal (cdr servers)))
          (let ((res (server:check-server hed)))
            (if res
                res
                (if (null? tal)