Megatest

Check-in [e7dcebe686]
Login
Overview
Comment:Fixed couple issues with the throttle by api load but still not sure it is working right. Cleaned up few other transport things.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | multi-server-hack
Files: files | file ages | folders
SHA1: e7dcebe6869ab0bfa639c16b78ac1fc824d22630
User & Date: matt on 2017-03-24 18:22:57
Other Links: branch diff | manifest | tags
Context
2017-03-24
23:35
Merged in multi-server-hack check-in: 99a40be38a user: matt tags: v1.64
22:18
non-good commit of merge multi-server-hack into v1.64 Leaf check-in: 42e2fcd1c8 user: matt tags: private (unpublished)
18:22
Fixed couple issues with the throttle by api load but still not sure it is working right. Cleaned up few other transport things. Closed-Leaf check-in: e7dcebe686 user: matt tags: multi-server-hack
15:55
Default to treating a log as recent instead of ancient. This might be part of the run-away servers issue check-in: 31427e83dd user: matt tags: multi-server-hack
Changes

Modified api.scm from [466ca51286] to [510d0b0df2].

114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
;;    - keep it simple, only return the actual result of the call, i.e. no meta info here
;;
;;    - returns #( flag result )
;;
(define (api:execute-requests dbstruct dat)
  (handle-exceptions
   exn
   (let ((call-chain (get-call-chain))
         )
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer")
     (print-call-chain (current-error-port))
     (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
     (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f #f "remote must be called with a vector")       )
    ((> *api-process-request-count* 20)
     (vector #f 'overloaded))
    (else  
     (let* ((cmd-in (vector-ref dat 0))
            (cmd    (if (symbol? cmd-in)
                        cmd-in
                        (string->symbol cmd-in)))
            (params (vector-ref dat 1))
            (start-t (current-milliseconds))
            (readonly-mode (dbr:dbstruct-read-only dbstruct))
            (readonly-command (member cmd api:read-only-queries))
            (writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))
            (res    
             (if writecmd-in-readonly-mode
                 (conc "attempt to run write command "cmd" on a read-only database")
                 (case cmd
                   ;;===============================================
                   ;; READ/WRITE QUERIES







|
<






|
|
|

|
|
|
|
|
|
|
|







114
115
116
117
118
119
120
121

122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
;;    - keep it simple, only return the actual result of the call, i.e. no meta info here
;;
;;    - returns #( flag result )
;;
(define (api:execute-requests dbstruct dat)
  (handle-exceptions
   exn
   (let ((call-chain (get-call-chain)))

     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer")
     (print-call-chain (current-error-port))
     (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
     (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f (vector #f "remote must be called with a vector")))
    ((> *api-process-request-count* 20) ;; 20)
     'overloaded) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
    (else  
     (let* ((cmd-in            (vector-ref dat 0))
            (cmd               (if (symbol? cmd-in)
				   cmd-in
				   (string->symbol cmd-in)))
            (params            (vector-ref dat 1))
            (start-t           (current-milliseconds))
            (readonly-mode     (dbr:dbstruct-read-only dbstruct))
            (readonly-command  (member cmd api:read-only-queries))
            (writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))
            (res    
             (if writecmd-in-readonly-mode
                 (conc "attempt to run write command "cmd" on a read-only database")
                 (case cmd
                   ;;===============================================
                   ;; READ/WRITE QUERIES
270
271
272
273
274
275
276
277




278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298

299


300
301
302
303
304
305
306
307
308
309
310
311
312
                   ((sdb-qry)                      (apply sdb:qry params))
                   ((ping)                         (current-process-id))

                   ;; TESTMETA
                   ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))

                   ;; TASKS 
                   ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))))))




       ;; save all stats
       (let ((delta-t (- (current-milliseconds)
			 start-t)))
	 (hash-table-set! *db-api-call-time* cmd
			  (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
       (if (not writecmd-in-readonly-mode)
	   (vector #t res)
           (vector #f res)))))))

;; http-server  send-response
;;                 api:process-request
;;                    db:*
;;
;; NB// Runs on the server as part of the server loop
;;
(define (api:process-request dbstruct $) ;; the $ is the request vars proc
  (set! *api-process-request-count* (+ *api-process-request-count* 1))
  (let* ((cmd     ($ 'cmd))
	 (paramsj ($ 'params))
	 (params  (db:string->obj paramsj transport: 'http)) ;; (rmt:json-str->dat paramsj))
	 (resdat  (api:execute-requests dbstruct (vector cmd params))) ;; #( flag result )

	 (res     (vector-ref resdat 1)))


    (if (> *api-process-request-count* *max-api-process-requests*)
	(set! *max-api-process-requests* *api-process-request-count*))
    (set! *api-process-request-count* (- *api-process-request-count* 1))
    ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds
    ;; (rmt:dat->json-str
    ;;  (if (or (string? res)
    ;;          (list?   res)
    ;;          (number? res)
    ;;          (boolean? res))
    ;;      res 
    ;;      (list "ERROR, not string, list, number or boolean" 1 cmd params res)))))
    (db:obj->string res transport: 'http)))








|
>
>
>
>





|
|
|











|
|
>
|
>
>













269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
                   ((sdb-qry)                      (apply sdb:qry params))
                   ((ping)                         (current-process-id))

                   ;; TESTMETA
                   ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))

                   ;; TASKS 
                   ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))
		   (else
		    (debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
		    (conc "ERROR: BAD api call " cmd))))))
       
       ;; save all stats
       (let ((delta-t (- (current-milliseconds)
			 start-t)))
	 (hash-table-set! *db-api-call-time* cmd
			  (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
       (if writecmd-in-readonly-mode
	   (vector #f res)
           (vector #t res)))))))

;; http-server  send-response
;;                 api:process-request
;;                    db:*
;;
;; NB// Runs on the server as part of the server loop
;;
(define (api:process-request dbstruct $) ;; the $ is the request vars proc
  (set! *api-process-request-count* (+ *api-process-request-count* 1))
  (let* ((cmd     ($ 'cmd))
	 (paramsj ($ 'params))
	 (params  (db:string->obj paramsj transport: 'http)) ;; incoming data from the POST (or is it a GET?)
	 (resdat  (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result )
	 (success (vector-ref resdat 0))
	 (res     (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?)
    (if (not success)
	(debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params))
    (if (> *api-process-request-count* *max-api-process-requests*)
	(set! *max-api-process-requests* *api-process-request-count*))
    (set! *api-process-request-count* (- *api-process-request-count* 1))
    ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds
    ;; (rmt:dat->json-str
    ;;  (if (or (string? res)
    ;;          (list?   res)
    ;;          (number? res)
    ;;          (boolean? res))
    ;;      res 
    ;;      (list "ERROR, not string, list, number or boolean" 1 cmd params res)))))
    (db:obj->string res transport: 'http)))

Modified http-transport.scm from [9751cbc3b5] to [9769576fcd].

264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
	      (th2 (make-thread time-out     "time out")))
	 (thread-start! th1)
	 (thread-start! th2)
	 (thread-join! th1)
	 (thread-terminate! th2)
	 (debug:print-info 11 *default-log-port* "got res=" res)
	 (if (vector? res)
	     (if (vector-ref res 0)
		 res
                 (if (debug:debug-mode 11)
                     (begin ;; note: this code also called in nmsg-transport - consider consolidating it
                       (debug:print-error 11 *default-log-port* "error occured at server, info=" (vector-ref res 2))
                       (debug:print 11 *default-log-port* " client call chain:")
                       (print-call-chain (current-error-port))
                       (debug:print 11 *default-log-port* " server call chain:")
                       (pp (vector-ref res 1) (current-error-port))







|
|







264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
	      (th2 (make-thread time-out     "time out")))
	 (thread-start! th1)
	 (thread-start! th2)
	 (thread-join! th1)
	 (thread-terminate! th2)
	 (debug:print-info 11 *default-log-port* "got res=" res)
	 (if (vector? res)
	     (if (vector-ref res 0) ;; this is the first flag or the second flag?
		 res ;; this is the *inner* vector? seriously? why?
                 (if (debug:debug-mode 11)
                     (begin ;; note: this code also called in nmsg-transport - consider consolidating it
                       (debug:print-error 11 *default-log-port* "error occured at server, info=" (vector-ref res 2))
                       (debug:print 11 *default-log-port* " client call chain:")
                       (print-call-chain (current-error-port))
                       (debug:print 11 *default-log-port* " server call chain:")
                       (pp (vector-ref res 1) (current-error-port))

Modified rmt.scm from [a6b31e3da3] to [c559126227].

195
196
197
198
199
200
201
202
203
204
205
206

207
208
209
210
211
212
213
214
215
216

217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
			 (else
			  (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported")
			  (exit))))
	     (success  (if (vector? dat) (vector-ref dat 0) #f))
	     (res      (if (vector? dat) (vector-ref dat 1) #f)))
	(if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time
	;; (mutex-unlock! *rmt-mutex*)
        (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9. conninfo=" conninfo " dat=" dat " runremote = "runremote)
	(if success
	    (case (remote-transport runremote)
	      ((http)
	       (mutex-unlock! *rmt-mutex*)

	       res)
	      (else
	       (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " is unknown")
	       (mutex-unlock! *rmt-mutex*)
	       (exit 1)))
	    (if (eq? res 'overloaded)
		(let ((wait-delay (+ attemptnum (* attemptnum 10))))
		  (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
		  (thread-sleep! wait-delay)
		  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))

		(begin
		  (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
		  (remote-conndat-set!    runremote #f)
		  (remote-server-url-set! runremote #f)
		  (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9.1")
		  (mutex-unlock! *rmt-mutex*)
		  (if (not (server:check-if-running *toppath*))
		      (server:start-and-wait *toppath*))
		  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))))

;; (define (rmt:update-db-stats run-id rawcmd params duration)
;;   (mutex-lock! *db-stats-mutex*)
;;   (handle-exceptions
;;    exn
;;    (begin
;;      (debug:print 0 *default-log-port* "WARNING: stats collection failed in update-db-stats")







|
<
<
<
|
>
|
<
<
<
<
|




>
|
|
|
|
|
<
|
|
|







195
196
197
198
199
200
201
202



203
204
205




206
207
208
209
210
211
212
213
214
215
216

217
218
219
220
221
222
223
224
225
226
			 (else
			  (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported")
			  (exit))))
	     (success  (if (vector? dat) (vector-ref dat 0) #f))
	     (res      (if (vector? dat) (vector-ref dat 1) #f)))
	(if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time
	;; (mutex-unlock! *rmt-mutex*)
        (debug:print-info 13 *default-log-port* "rmt:send-receive, case  9. conninfo=" conninfo " dat=" dat " runremote = " runremote)



	(mutex-unlock! *rmt-mutex*)
	(if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end
	    (if (and (symbol? res)




		     (eq? res 'overloaded))
		(let ((wait-delay (+ attemptnum (* attemptnum 10))))
		  (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
		  (thread-sleep! wait-delay)
		  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
		res) ;; All good, return res
	    (begin
	      (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
	      (remote-conndat-set!    runremote #f)
	      (remote-server-url-set! runremote #f)
	      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9.1")

	      (if (not (server:check-if-running *toppath*))
		  (server:start-and-wait *toppath*))
	      (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))))))))

;; (define (rmt:update-db-stats run-id rawcmd params duration)
;;   (mutex-lock! *db-stats-mutex*)
;;   (handle-exceptions
;;    exn
;;    (begin
;;      (debug:print 0 *default-log-port* "WARNING: stats collection failed in update-db-stats")

Modified tests.scm from [bccd138868] to [897e386321].

484
485
486
487
488
489
490



491
492
493
494
495
496
497
498
		      (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename))
		  (common:simple-file-release-lock lockf)
		  (change-directory orig-dir)
		  ;; NB// tests:test-set-toplog! is remote internal...
		  (tests:test-set-toplog! run-id test-name outputfilename))
		;; didn't get the lock, check to see if current update started later than this 
		;; update, if so we can exit without doing any work



		(if (> my-start-time (file-modification-time lockf))
		    ;; we started since current re-gen in flight, delay a little and try again
		    (begin
		      (debug:print-info 1 *default-log-port* "Waiting to update " outputfilename ", another test currently updating it")
		      (thread-sleep! (+ 5 (random 5))) ;; delay between 5 and 10 seconds
		      (loop (common:simple-file-lock lockf))))))))))

(define (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename)







>
>
>
|







484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
		      (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename))
		  (common:simple-file-release-lock lockf)
		  (change-directory orig-dir)
		  ;; NB// tests:test-set-toplog! is remote internal...
		  (tests:test-set-toplog! run-id test-name outputfilename))
		;; didn't get the lock, check to see if current update started later than this 
		;; update, if so we can exit without doing any work
		(if (> my-start-time (handle-exceptions
					 exn
					 0
				       (file-modification-time lockf)))
		    ;; we started since current re-gen in flight, delay a little and try again
		    (begin
		      (debug:print-info 1 *default-log-port* "Waiting to update " outputfilename ", another test currently updating it")
		      (thread-sleep! (+ 5 (random 5))) ;; delay between 5 and 10 seconds
		      (loop (common:simple-file-lock lockf))))))))))

(define (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename)