Megatest

Check-in [99a40be38a]
Login
Overview
Comment:Merged in multi-server-hack
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.64
Files: files | file ages | folders
SHA1: 99a40be38a77905e586e8ea39694ac22a632538b
User & Date: matt on 2017-03-24 23:35:09
Other Links: branch diff | manifest | tags
Context
2017-03-26
17:47
Added browsing of logs to RH Button for tests from runs view buttons. Added (speculative) fix for unstable sort on steps. check-in: 89dd44cd11 user: matt tags: v1.64
2017-03-24
23:35
Merged in multi-server-hack check-in: 99a40be38a user: matt tags: v1.64
22:39
Merged v1.63 changes into v1.64 check-in: b2e6452e2f user: matt tags: v1.64
18:22
Fixed couple issues with the throttle by api load but still not sure it is working right. Cleaned up few other transport things. Closed-Leaf check-in: e7dcebe686 user: matt tags: multi-server-hack
Changes

Modified api.scm from [4eeb269c20] to [b8376b33fc].

114
115
116
117
118
119
120
121

122
123
124
125
126
127
128
129
130
131



132
133
134
135
136
137
138
139
140








141
142
143
144
145
146
147
114
115
116
117
118
119
120

121

122
123
124
125
126
127



128
129
130
131








132
133
134
135
136
137
138
139
140
141
142
143
144
145
146







-
+
-






-
-
-
+
+
+

-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+







;;    - keep it simple, only return the actual result of the call, i.e. no meta info here
;;
;;    - returns #( flag result )
;;
(define (api:execute-requests dbstruct dat)
  (handle-exceptions
   exn
   (let ((call-chain (get-call-chain))
   (let ((call-chain (get-call-chain)))
         )
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat)
     (print-call-chain (current-error-port))
     (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
     (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f #f "remote must be called with a vector")       )
    ((> *api-process-request-count* 20)
     (vector #f 'overloaded))
     (vector #f (vector #f "remote must be called with a vector")))
    ((> *api-process-request-count* 20) ;; 20)
     'overloaded) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
    (else  
     (let* ((cmd-in (vector-ref dat 0))
            (cmd    (if (symbol? cmd-in)
                        cmd-in
                        (string->symbol cmd-in)))
            (params (vector-ref dat 1))
            (start-t (current-milliseconds))
            (readonly-mode (dbr:dbstruct-read-only dbstruct))
            (readonly-command (member cmd api:read-only-queries))
     (let* ((cmd-in            (vector-ref dat 0))
            (cmd               (if (symbol? cmd-in)
				   cmd-in
				   (string->symbol cmd-in)))
            (params            (vector-ref dat 1))
            (start-t           (current-milliseconds))
            (readonly-mode     (dbr:dbstruct-read-only dbstruct))
            (readonly-command  (member cmd api:read-only-queries))
            (writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))
            (res    
             (if writecmd-in-readonly-mode
                 (conc "attempt to run write command "cmd" on a read-only database")
                 (case cmd
                   ;;===============================================
                   ;; READ/WRITE QUERIES
271
272
273
274
275
276
277
278





279
280
281
282
283
284
285
286



287
288
289
290
291
292
293
294
295
296
297
298
299
300






301
302
303
304
305
306
307
308
309
310
311
312
313
270
271
272
273
274
275
276

277
278
279
280
281
282
283
284
285
286



287
288
289
290
291
292
293
294
295
296
297
298
299
300



301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319







-
+
+
+
+
+





-
-
-
+
+
+











-
-
-
+
+
+
+
+
+













                   ((ping)                         (current-process-id))
		   ((get-changed-record-ids)       (apply db:get-changed-record-ids dbstruct params))
		   
                   ;; TESTMETA
                   ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))

                   ;; TASKS 
                   ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))))))
                   ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))
		   (else
		    (debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
		    (conc "ERROR: BAD api call " cmd))))))
       
       ;; save all stats
       (let ((delta-t (- (current-milliseconds)
			 start-t)))
	 (hash-table-set! *db-api-call-time* cmd
			  (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
       (if (not writecmd-in-readonly-mode)
	   (vector #t res)
           (vector #f res)))))))
       (if writecmd-in-readonly-mode
	   (vector #f res)
           (vector #t res)))))))

;; http-server  send-response
;;                 api:process-request
;;                    db:*
;;
;; NB// Runs on the server as part of the server loop
;;
(define (api:process-request dbstruct $) ;; the $ is the request vars proc
  (set! *api-process-request-count* (+ *api-process-request-count* 1))
  (let* ((cmd     ($ 'cmd))
	 (paramsj ($ 'params))
	 (params  (db:string->obj paramsj transport: 'http)) ;; (rmt:json-str->dat paramsj))
	 (resdat  (api:execute-requests dbstruct (vector cmd params))) ;; #( flag result )
	 (res     (vector-ref resdat 1)))
	 (params  (db:string->obj paramsj transport: 'http)) ;; incoming data from the POST (or is it a GET?)
	 (resdat  (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result )
	 (success (vector-ref resdat 0))
	 (res     (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?)
    (if (not success)
	(debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params))
    (if (> *api-process-request-count* *max-api-process-requests*)
	(set! *max-api-process-requests* *api-process-request-count*))
    (set! *api-process-request-count* (- *api-process-request-count* 1))
    ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds
    ;; (rmt:dat->json-str
    ;;  (if (or (string? res)
    ;;          (list?   res)
    ;;          (number? res)
    ;;          (boolean? res))
    ;;      res 
    ;;      (list "ERROR, not string, list, number or boolean" 1 cmd params res)))))
    (db:obj->string res transport: 'http)))

Modified http-transport.scm from [b05354fdaf] to [a7745a1b3a].

264
265
266
267
268
269
270
271
272


273
274
275
276
277
278
279
264
265
266
267
268
269
270


271
272
273
274
275
276
277
278
279







-
-
+
+







	      (th2 (make-thread time-out     "time out")))
	 (thread-start! th1)
	 (thread-start! th2)
	 (thread-join! th1)
	 (thread-terminate! th2)
	 (debug:print-info 11 *default-log-port* "got res=" res)
	 (if (vector? res)
	     (if (vector-ref res 0)
		 res
	     (if (vector-ref res 0) ;; this is the first flag or the second flag?
		 res ;; this is the *inner* vector? seriously? why?
                 (if (debug:debug-mode 11)
                     (begin ;; note: this code also called in nmsg-transport - consider consolidating it
                       (debug:print-error 11 *default-log-port* "error occured at server, info=" (vector-ref res 2))
                       (debug:print 11 *default-log-port* " client call chain:")
                       (print-call-chain (current-error-port))
                       (debug:print 11 *default-log-port* " server call chain:")
                       (pp (vector-ref res 1) (current-error-port))

Modified rmt.scm from [aea2992f6b] to [f051a84a44].

194
195
196
197
198
199
200
201

202
203
204
205
206



207
208
209
210
211

212
213
214
215

216
217
218
219
220





221
222
223
224



225
226
227
228
229
230
231
194
195
196
197
198
199
200

201





202
203
204





205
206
207
208
209
210





211
212
213
214
215




216
217
218
219
220
221
222
223
224
225







-
+
-
-
-
-
-
+
+
+
-
-
-
-
-
+




+
-
-
-
-
-
+
+
+
+
+
-
-
-
-
+
+
+







			 (else
			  (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported")
			  (exit))))
	     (success  (if (vector? dat) (vector-ref dat 0) #f))
	     (res      (if (vector? dat) (vector-ref dat 1) #f)))
	(if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time
	;; (mutex-unlock! *rmt-mutex*)
        (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9. conninfo=" conninfo " dat=" dat " runremote = "runremote)
        (debug:print-info 13 *default-log-port* "rmt:send-receive, case  9. conninfo=" conninfo " dat=" dat " runremote = " runremote)
	(if success
	    (case (remote-transport runremote)
	      ((http)
	       (mutex-unlock! *rmt-mutex*)
	       res)
	(mutex-unlock! *rmt-mutex*)
	(if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end
	    (if (and (symbol? res)
	      (else
	       (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " is unknown")
	       (mutex-unlock! *rmt-mutex*)
	       (exit 1)))
	    (if (eq? res 'overloaded)
		     (eq? res 'overloaded))
		(let ((wait-delay (+ attemptnum (* attemptnum 10))))
		  (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
		  (thread-sleep! wait-delay)
		  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
		res) ;; All good, return res
		(begin
		  (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
		  (remote-conndat-set!    runremote #f)
		  (remote-server-url-set! runremote #f)
		  (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9.1")
	    (begin
	      (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
	      (remote-conndat-set!    runremote #f)
	      (remote-server-url-set! runremote #f)
	      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9.1")
		  (mutex-unlock! *rmt-mutex*)
		  (if (not (server:check-if-running *toppath*))
		      (server:start-and-wait *toppath*))
		  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))))
	      (if (not (server:check-if-running *toppath*))
		  (server:start-and-wait *toppath*))
	      (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))))))))

;; (define (rmt:update-db-stats run-id rawcmd params duration)
;;   (mutex-lock! *db-stats-mutex*)
;;   (handle-exceptions
;;    exn
;;    (begin
;;      (debug:print 0 *default-log-port* "WARNING: stats collection failed in update-db-stats")

Modified server.scm from [f61159c89e] to [2e218ee65f].

138
139
140
141
142
143
144



145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
















161
162
163
164
165
166
167
138
139
140
141
142
143
144
145
146
147
















148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170







+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds
;;
(define (server:logf-get-start-info logf)
  (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs
    (handle-exceptions
	exn
	(list #f #f #f) ;; no idea what went wrong, call it a bad server
    (with-input-from-file
	logf
      (lambda ()
	(let loop ((inl  (read-line))
		   (lnum 0))
	  (if (not (eof-object? inl))
	      (let ((mlst (string-match rx inl)))
		(if (not mlst)
		    (if (< lnum 500) ;; give up if more than 500 lines of server log read
			(loop (read-line)(+ lnum 1))
			(list #f #f #f))
		    (let ((dat  (cdr mlst)))
		      (list (car dat) ;; host
			    (string->number (cadr dat)) ;; port
			    (string->number (caddr dat))))))
	      (list #f #f #f)))))))
      (with-input-from-file
	  logf
	(lambda ()
	  (let loop ((inl  (read-line))
		     (lnum 0))
	    (if (not (eof-object? inl))
		(let ((mlst (string-match rx inl)))
		  (if (not mlst)
		      (if (< lnum 500) ;; give up if more than 500 lines of server log read
			  (loop (read-line)(+ lnum 1))
			  (list #f #f #f))
		      (let ((dat  (cdr mlst)))
			(list (car dat) ;; host
			      (string->number (cadr dat)) ;; port
			      (string->number (caddr dat))))))
		(list #f #f #f))))))))

;; get a list of servers with all relevant data
;; ( mod-time host port start-time pid )
;;
(define (server:get-list areapath #!key (limit #f))
  (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
	(day-seconds (* 24 60 60)))
182
183
184
185
186
187
188
189

190
191


192
193
194
195
196
197
198
185
186
187
188
189
190
191

192


193
194
195
196
197
198
199
200
201







-
+
-
-
+
+







	       (num-serv-logs (length server-logs)))
	  (if (null? server-logs)
	      '()
	      (let loop ((hed  (car server-logs))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
                                   exn
				      exn
                                   0
                                   (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
				      (current-seconds) ;; 0
				    (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
		       (down-time (- (current-seconds) mod-time))
		       (serv-dat  (if (or (< num-serv-logs 10)
				  	  (< down-time day-seconds))
				     (server:logf-get-start-info hed)
				     '())) ;; don't waste time processing server files not touched in the past day if there are more than ten servers to look at
		       (serv-rec (cons mod-time serv-dat))
		       (fmatch   (string-match fname-rx hed))

Modified tests.scm from [bccd138868] to [897e386321].

484
485
486
487
488
489
490



491

492
493
494
495
496
497
498
484
485
486
487
488
489
490
491
492
493

494
495
496
497
498
499
500
501







+
+
+
-
+







		      (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename))
		  (common:simple-file-release-lock lockf)
		  (change-directory orig-dir)
		  ;; NB// tests:test-set-toplog! is remote internal...
		  (tests:test-set-toplog! run-id test-name outputfilename))
		;; didn't get the lock, check to see if current update started later than this 
		;; update, if so we can exit without doing any work
		(if (> my-start-time (handle-exceptions
					 exn
					 0
		(if (> my-start-time (file-modification-time lockf))
				       (file-modification-time lockf)))
		    ;; we started since current re-gen in flight, delay a little and try again
		    (begin
		      (debug:print-info 1 *default-log-port* "Waiting to update " outputfilename ", another test currently updating it")
		      (thread-sleep! (+ 5 (random 5))) ;; delay between 5 and 10 seconds
		      (loop (common:simple-file-lock lockf))))))))))

(define (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename)