Megatest

Check-in [6baac6187e]
Login
Overview
Comment:Show connection stats every 60 seconds. Remove stat of megatest.db from rmt:send-receive, it was happening on every call.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | multi-server-hack
Files: files | file ages | folders
SHA1: 6baac6187eacc0b730baefcba631fb01818c0a3a
User & Date: matt on 2017-03-24 13:51:20
Other Links: branch diff | manifest | tags
Context
2017-03-24
14:52
Tell calling client to wait and try again if server is overloaded (in terms of parallel api calls over 25) check-in: fd3c06195d user: matt tags: multi-server-hack
13:51
Show connection stats every 60 seconds. Remove stat of megatest.db from rmt:send-receive, it was happening on every call. check-in: 6baac6187e user: matt tags: multi-server-hack
11:27
Merged v1.63 changes to multi-server-hack check-in: 8a6ca9fd18 user: matt tags: multi-server-hack
Changes

Modified api.scm from [4067424284] to [d0edc7e79b].

269
270
271
272
273
274
275
276
277
278
279
280

281
282
283
284
285
286
287
288
                   ((ping)                         (current-process-id))

                   ;; TESTMETA
                   ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))

                   ;; TASKS 
                   ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))))))
       (if (not writecmd-in-readonly-mode)
           (let ((delta-t (- (current-milliseconds)
                             start-t)))
             (hash-table-set! *db-api-call-time* cmd
                              (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))

             (vector #t res))
           (vector #f res)))))))

;; http-server  send-response
;;                 api:process-request
;;                    db:*
;;
;; NB// Runs on the server as part of the server loop







|
|
|
|
|
>
|







269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
                   ((ping)                         (current-process-id))

                   ;; TESTMETA
                   ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))

                   ;; TASKS 
                   ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))))))
       ;; save all stats
       (let ((delta-t (- (current-milliseconds)
			 start-t)))
	 (hash-table-set! *db-api-call-time* cmd
			  (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
       (if (not writecmd-in-readonly-mode)
	   (vector #t res)
           (vector #f res)))))))

;; http-server  send-response
;;                 api:process-request
;;                    db:*
;;
;; NB// Runs on the server as part of the server loop

Modified common.scm from [4158ce55d8] to [9d413de21a].

148
149
150
151
152
153
154
155
156


157
158
159
160
161
162
163

(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f)
  (transport         *transport-type*)
  (server-timeout    (or (server:get-timeout) 100))
  (force-server      #f)) ;; default to 100 seconds



;; launching and hosts
(defstruct host
  (reachable    #f)
  (last-update  0)
  (last-used    0)
  (last-cpuload 1))







|
|
>
>







148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165

(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f)
  (transport         *transport-type*)
  (server-timeout    (or (server:get-timeout) 100)) ;; default to 100 seconds
  (force-server      #f)
  (ro-mode           #f)  
  (ro-mode-checked   #f)) ;; flag that indicates we have checked for ro-mode

;; launching and hosts
(defstruct host
  (reachable    #f)
  (last-update  0)
  (last-used    0)
  (last-cpuload 1))

Modified db.scm from [df76958ce7] to [feaa389617].

2094
2095
2096
2097
2098
2099
2100
2101

2102
2103
2104

2105
2106

2107
2108

2109
2110
2111
2112
2113
2114
2115

(define (db:print-current-query-stats)
  ;; generate stats from *db-api-call-time*
  (let ((ordered-keys (sort (hash-table-keys *db-api-call-time*)
			    (lambda (a b)
			      (let ((sum-a (common:sum (hash-table-ref *db-api-call-time* a)))
				    (sum-b (common:sum (hash-table-ref *db-api-call-time* b))))
				(> sum-a sum-b))))))

    (for-each
     (lambda (cmd-key)
       (let* ((dat  (hash-table-ref *db-api-call-time* cmd-key))

	      (avg  (if (> (length dat) 0)
			(/ (common:sum dat)(length dat)))))

	 (debug:print-info 0 *default-log-port* cmd-key "\tavg: " avg " max: " (common:max dat) " min: " (common:min-max < dat) " num: " (length dat))))
     ordered-keys)))


(define (db:get-all-run-ids dbstruct)
  (db:with-db
   dbstruct
   #f
   #f
   (lambda (db)







|
>



>
|

>

|
>







2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119

(define (db:print-current-query-stats)
  ;; generate stats from *db-api-call-time*
  (let ((ordered-keys (sort (hash-table-keys *db-api-call-time*)
			    (lambda (a b)
			      (let ((sum-a (common:sum (hash-table-ref *db-api-call-time* a)))
				    (sum-b (common:sum (hash-table-ref *db-api-call-time* b))))
				(> sum-a sum-b)))))
	(total        0))
    (for-each
     (lambda (cmd-key)
       (let* ((dat  (hash-table-ref *db-api-call-time* cmd-key))
	      (num  (length dat))
	      (avg  (if (> num 0)
			(/ (common:sum dat)(length dat)))))
	 (set! total (+ total num))
	 (debug:print-info 0 *default-log-port* cmd-key "\tavg: " avg " max: " (common:max dat) " min: " (common:min-max < dat) " num: " (length dat))))
     ordered-keys)
    (debug:print-info 0 *default-log-port* "TOTAL: " total " api calls since start.")))

(define (db:get-all-run-ids dbstruct)
  (db:with-db
   dbstruct
   #f
   #f
   (lambda (db)

Modified http-transport.scm from [44c2ce6eea] to [9751cbc3b5].

416
417
418
419
420
421
422
423



424
425
426
427
428
429
430
      (set! last-access *db-last-access*)
      (mutex-unlock! *heartbeat-mutex*)
      
      (if (common:low-noise-print 120 (conc "server running on " iface ":" port))
	  (begin
	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds))
	    (flush-output *default-log-port*)))




      (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600))
	     (adjusted-timeout (if (> hrs-since-start 1)
				   (- server-timeout (inexact->exact (round (* hrs-since-start 60))))  ;; subtract 60 seconds per hour
				   server-timeout)))
	(if (common:low-noise-print 120 "server timeout")
	    (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout))
	(cond







|
>
>
>







416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
      (set! last-access *db-last-access*)
      (mutex-unlock! *heartbeat-mutex*)
      
      (if (common:low-noise-print 120 (conc "server running on " iface ":" port))
	  (begin
	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds))
	    (flush-output *default-log-port*)))
      (if (common:low-noise-print 60 "dbstats")
	  (begin
	    (debug:print 0 *default-log-port* "Server stats:")
	    (db:print-current-query-stats)))
      (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600))
	     (adjusted-timeout (if (> hrs-since-start 1)
				   (- server-timeout (inexact->exact (round (* hrs-since-start 60))))  ;; subtract 60 seconds per hour
				   server-timeout)))
	(if (common:low-noise-print 120 "server timeout")
	    (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout))
	(cond

Modified launch.scm from [442d86a53d] to [7ecb4d1b9c].

783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
	     (target   (common:args-get-target))
	     (linktree (common:get-linktree))
	     (sections (if target (list "default" target) #f)) ;; for runconfigs
	     (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config 
	     (rundir   (if (and runname target linktree)(conc linktree "/" target "/" runname) #f))
	     (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-"  megatest-version "-" megatest-fossil-hash)))
	     (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-"  megatest-version "-" megatest-fossil-hash)))
	 (cancreate (and rundir (common:file-exists? rundir)(file-write-access? rundir)))
	     (cxt       (hash-table-ref/default *contexts* toppath #f)))

	;; create our cxt for this area if it doesn't already exist
	(if (not cxt)(hash-table-set! *contexts* toppath (make-cxt)))

	;; (print "runname: " runname " target: " target " mtcachef: " mtcachef " rccachef: " rccachef)
	(set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource







|







783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
	     (target   (common:args-get-target))
	     (linktree (common:get-linktree))
	     (sections (if target (list "default" target) #f)) ;; for runconfigs
	     (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config 
	     (rundir   (if (and runname target linktree)(conc linktree "/" target "/" runname) #f))
	     (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-"  megatest-version "-" megatest-fossil-hash)))
	     (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-"  megatest-version "-" megatest-fossil-hash)))
	     (cancreate (and rundir (common:file-exists? rundir)(file-write-access? rundir)))
	     (cxt       (hash-table-ref/default *contexts* toppath #f)))

	;; create our cxt for this area if it doesn't already exist
	(if (not cxt)(hash-table-set! *contexts* toppath (make-cxt)))

	;; (print "runname: " runname " target: " target " mtcachef: " mtcachef " rccachef: " rccachef)
	(set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource

Modified rmt.scm from [1adf35b1f4] to [4b028f3c38].

54
55
56
57
58
59
60
61
62





63
64
65






66
67
68
69
70
71
72
  ;; do all the prep locked under the rmt-mutex
  (mutex-lock! *rmt-mutex*)

  ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
  ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
  ;; 3. do the query, if on homehost use local access
  ;;
  (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value
         (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas





         (dbfile (conc *toppath* "/megatest.db"))
         (readonly-mode (not (file-write-access? dbfile))) ;; TODO: use dbstruct or runremote to figure this out in future
	 (runremote  (or area-dat *runremote*)))






    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
    (cond
     ;; give up if more than 15 attempts
     ((> attemptnum 15)
      (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.")
      (exit 1))








|
|
>
>
>
>
>
|
|
|
>
>
>
>
>
>







54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
  ;; do all the prep locked under the rmt-mutex
  (mutex-lock! *rmt-mutex*)

  ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
  ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
  ;; 3. do the query, if on homehost use local access
  ;;
  (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
         (areapath      *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
	 (runremote     (or area-dat
			    *runremote*))
	 (readonly-mode (if (and runremote
				 (remote-ro-mode-checked runremote))
			    (remote-ro-mode runremote)
			    (let* ((dbfile  (conc *toppath* "/megatest.db"))
				   (ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future
			      (if runremote
				  (begin
				    (remote-ro-mode-set! runremote ro-mode)
				    (remote-ro-mode-checked-set! runremote #t)
				    ro-mode)
				  ro-mode)))))

    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
    (cond
     ;; give up if more than 15 attempts
     ((> attemptnum 15)
      (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.")
      (exit 1))

92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts
	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
      (remote-conndat-set! runremote #f)
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a record for our connection for given area
     ((not runremote)                     
      (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  1")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a homehost record
     ((not (pair? (remote-hh-dat runremote)))  ;; not on homehost
      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little







|







103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts
	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
      (remote-conndat-set! runremote #f)
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a record for our connection for given area
     ((not runremote)                  ;; can remove this one. should never get here.         
      (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  1")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a homehost record
     ((not (pair? (remote-hh-dat runremote)))  ;; not on homehost
      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little

Modified server.scm from [a07a79fe32] to [65b45e968a].

244
245
246
247
248
249
250
251

252
253
254
255
256
257
258
    (if (and srvrs
	     (not (null? srvrs)))
	(car srvrs)
	#f)))

(define (server:get-rand-best areapath)
  (let ((srvrs (server:get-best (server:get-list areapath))))
    (if (list? srvrs)

	(let* ((len (length srvrs))
	       (idx (random len)))
	  (list-ref srvrs idx))
	#f)))


(define (server:record->url servr)







|
>







244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
    (if (and srvrs
	     (not (null? srvrs)))
	(car srvrs)
	#f)))

(define (server:get-rand-best areapath)
  (let ((srvrs (server:get-best (server:get-list areapath))))
    (if (and (list? srvrs)
	     (not (null? srvrs)))
	(let* ((len (length srvrs))
	       (idx (random len)))
	  (list-ref srvrs idx))
	#f)))


(define (server:record->url servr)