Megatest

Check-in [6baac6187e]
Login
Overview
Comment:Show connection stats every 60 seconds. Remove stat of megatest.db from rmt:send-receive, it was happening on every call.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | multi-server-hack
Files: files | file ages | folders
SHA1: 6baac6187eacc0b730baefcba631fb01818c0a3a
User & Date: matt on 2017-03-24 13:51:20
Other Links: branch diff | manifest | tags
Context
2017-03-24
14:52
Tell calling client to wait and try again if server is overloaded (in terms of parallel api calls over 25) check-in: fd3c06195d user: matt tags: multi-server-hack
13:51
Show connection stats every 60 seconds. Remove stat of megatest.db from rmt:send-receive, it was happening on every call. check-in: 6baac6187e user: matt tags: multi-server-hack
11:27
Merged v1.63 changes to multi-server-hack check-in: 8a6ca9fd18 user: matt tags: multi-server-hack
Changes

Modified api.scm from [4067424284] to [d0edc7e79b].

269
270
271
272
273
274
275
276
277
278
279
280
281







282
283
284
285
286
287
288
269
270
271
272
273
274
275






276
277
278
279
280
281
282
283
284
285
286
287
288
289







-
-
-
-
-
-
+
+
+
+
+
+
+







                   ((ping)                         (current-process-id))

                   ;; TESTMETA
                   ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))

                   ;; TASKS 
                   ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))))))
       (if (not writecmd-in-readonly-mode)
           (let ((delta-t (- (current-milliseconds)
                             start-t)))
             (hash-table-set! *db-api-call-time* cmd
                              (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))
             (vector #t res))
       ;; save all stats
       (let ((delta-t (- (current-milliseconds)
			 start-t)))
	 (hash-table-set! *db-api-call-time* cmd
			  (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
       (if (not writecmd-in-readonly-mode)
	   (vector #t res)
           (vector #f res)))))))

;; http-server  send-response
;;                 api:process-request
;;                    db:*
;;
;; NB// Runs on the server as part of the server loop

Modified common.scm from [4158ce55d8] to [9d413de21a].

148
149
150
151
152
153
154
155
156




157
158
159
160
161
162
163
148
149
150
151
152
153
154


155
156
157
158
159
160
161
162
163
164
165







-
-
+
+
+
+








;; Record holding a client's view of the server connection (see the
;; remote-* accessors used from rmt:send-receive).
;; NOTE(review): this span is taken from a rendered diff, so the lines removed
;; and the lines added by the check-in appear together; the markers below
;; follow the -/+ column of the check-in page.
(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f) ;; connection data; rmt:send-receive resets it to #f when the connection has expired
  (transport         *transport-type*)
  ;; --- removed by this check-in (2 lines) ---
  (server-timeout    (or (server:get-timeout) 100))
  (force-server      #f)) ;; default to 100 seconds
  ;; --- added by this check-in (4 lines) ---
  (server-timeout    (or (server:get-timeout) 100)) ;; default to 100 seconds
  (force-server      #f)
  (ro-mode           #f)  ;; cached result of the megatest.db write-access test done in rmt:send-receive
  (ro-mode-checked   #f)) ;; flag that indicates we have checked for ro-mode

;; launching and hosts
;; Per-host bookkeeping record.
;; NOTE(review): field meanings below are inferred from the names only;
;; confirm against the code that fills these records.
(defstruct host
  (reachable    #f) ;; defaults to #f; presumably set once the host has been probed
  (last-update  0)  ;; defaults to 0; presumably a timestamp of the last refresh
  (last-used    0)  ;; defaults to 0; presumably a timestamp of the last use
  (last-cpuload 1)) ;; defaults to 1; presumably the most recent cpu load reading

Modified db.scm from [df76958ce7] to [feaa389617].

2094
2095
2096
2097
2098
2099
2100
2101


2102
2103
2104

2105

2106

2107
2108


2109
2110
2111
2112
2113
2114
2115
2094
2095
2096
2097
2098
2099
2100

2101
2102
2103
2104
2105
2106

2107
2108
2109
2110

2111
2112
2113
2114
2115
2116
2117
2118
2119







-
+
+



+
-
+

+

-
+
+








;; Print one stats line per api command recorded in *db-api-call-time*,
;; ordered by the summed call time of each command (largest first), and,
;; in the version added by this check-in, a final TOTAL line with the
;; number of api calls counted.
;;
;; *db-api-call-time* maps a command to a list of elapsed times; the api
;; code conses a (current-milliseconds) delta onto that list for each call.
;;
;; NOTE(review): this span is taken from a rendered diff, so the lines removed
;; and the lines added by the check-in appear together; the markers below
;; follow the -/+ column of the check-in page.
(define (db:print-current-query-stats)
  ;; generate stats from *db-api-call-time*
  (let ((ordered-keys (sort (hash-table-keys *db-api-call-time*)
			    (lambda (a b)
			      (let ((sum-a (common:sum (hash-table-ref *db-api-call-time* a)))
				    (sum-b (common:sum (hash-table-ref *db-api-call-time* b))))
				;; --- removed by this check-in (1 line) ---
				(> sum-a sum-b))))))
				;; --- added by this check-in (2 lines): adds the running total binding ---
				(> sum-a sum-b)))))
	(total        0))
    (for-each
     (lambda (cmd-key)
       (let* ((dat  (hash-table-ref *db-api-call-time* cmd-key))
	      ;; --- added by this check-in (1 line) ---
	      (num  (length dat))
	      ;; --- removed by this check-in (1 line) ---
	      (avg  (if (> (length dat) 0)
	      ;; --- added by this check-in (1 line) ---
	      ;; one-armed if: avg has no specified value when there are no samples
	      (avg  (if (> num 0)
			(/ (common:sum dat)(length dat)))))
	 ;; --- added by this check-in (1 line): accumulate the call count across commands ---
	 (set! total (+ total num))
	 (debug:print-info 0 *default-log-port* cmd-key "\tavg: " avg " max: " (common:max dat) " min: " (common:min-max < dat) " num: " (length dat))))
     ;; --- removed by this check-in (1 line) ---
     ordered-keys)))
     ;; --- added by this check-in (2 lines) ---
     ordered-keys)
    (debug:print-info 0 *default-log-port* "TOTAL: " total " api calls since start.")))

(define (db:get-all-run-ids dbstruct)
  (db:with-db
   dbstruct
   #f
   #f
   (lambda (db)

Modified http-transport.scm from [44c2ce6eea] to [9751cbc3b5].

416
417
418
419
420
421
422
423




424
425
426
427
428
429
430
416
417
418
419
420
421
422

423
424
425
426
427
428
429
430
431
432
433







-
+
+
+
+







      (set! last-access *db-last-access*)
      (mutex-unlock! *heartbeat-mutex*)
      
      (if (common:low-noise-print 120 (conc "server running on " iface ":" port))
	  (begin
	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds))
	    (flush-output *default-log-port*)))

      (if (common:low-noise-print 60 "dbstats")
	  (begin
	    (debug:print 0 *default-log-port* "Server stats:")
	    (db:print-current-query-stats)))
      (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600))
	     (adjusted-timeout (if (> hrs-since-start 1)
				   (- server-timeout (inexact->exact (round (* hrs-since-start 60))))  ;; subtract 60 seconds per hour
				   server-timeout)))
	(if (common:low-noise-print 120 "server timeout")
	    (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout))
	(cond

Modified launch.scm from [442d86a53d] to [7ecb4d1b9c].

783
784
785
786
787
788
789
790

791
792
793
794
795
796
797
783
784
785
786
787
788
789

790
791
792
793
794
795
796
797







-
+







	     (target   (common:args-get-target))
	     (linktree (common:get-linktree))
	     (sections (if target (list "default" target) #f)) ;; for runconfigs
	     (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config 
	     (rundir   (if (and runname target linktree)(conc linktree "/" target "/" runname) #f))
	     (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-"  megatest-version "-" megatest-fossil-hash)))
	     (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-"  megatest-version "-" megatest-fossil-hash)))
	 (cancreate (and rundir (common:file-exists? rundir)(file-write-access? rundir)))
	     (cancreate (and rundir (common:file-exists? rundir)(file-write-access? rundir)))
	     (cxt       (hash-table-ref/default *contexts* toppath #f)))

	;; create our cxt for this area if it doesn't already exist
	(if (not cxt)(hash-table-set! *contexts* toppath (make-cxt)))

	;; (print "runname: " runname " target: " target " mtcachef: " mtcachef " rccachef: " rccachef)
	(set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource

Modified rmt.scm from [1adf35b1f4] to [4b028f3c38].

54
55
56
57
58
59
60
61
62
63
64
65
















66
67
68
69
70
71
72
54
55
56
57
58
59
60





61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83







-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







  ;; do all the prep locked under the rmt-mutex
  (mutex-lock! *rmt-mutex*)

  ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
  ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
  ;; 3. do the query, if on homehost use local access
  ;;
  (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value
         (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
         (dbfile (conc *toppath* "/megatest.db"))
         (readonly-mode (not (file-write-access? dbfile))) ;; TODO: use dbstruct or runremote to figure this out in future
	 (runremote  (or area-dat *runremote*)))
  (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
         (areapath      *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
	 (runremote     (or area-dat
			    *runremote*))
	 (readonly-mode (if (and runremote
				 (remote-ro-mode-checked runremote))
			    (remote-ro-mode runremote)
			    (let* ((dbfile  (conc *toppath* "/megatest.db"))
				   (ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future
			      (if runremote
				  (begin
				    (remote-ro-mode-set! runremote ro-mode)
				    (remote-ro-mode-checked-set! runremote #t)
				    ro-mode)
				  ro-mode)))))

    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
    (cond
     ;; give up if more than 15 attempts
     ((> attemptnum 15)
      (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.")
      (exit 1))

92
93
94
95
96
97
98
99

100
101
102
103
104
105
106
103
104
105
106
107
108
109

110
111
112
113
114
115
116
117







-
+







	   (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts
	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
      (remote-conndat-set! runremote #f)
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a record for our connection for given area
     ((not runremote)                     
     ((not runremote)                  ;; can remove this one. should never get here.         
      (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  1")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a homehost record
     ((not (pair? (remote-hh-dat runremote)))  ;; not on homehost
      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little

Modified server.scm from [a07a79fe32] to [65b45e968a].

244
245
246
247
248
249
250
251


252
253
254
255
256
257
258
244
245
246
247
248
249
250

251
252
253
254
255
256
257
258
259







-
+
+







    (if (and srvrs
	     (not (null? srvrs)))
	(car srvrs)
	#f)))

;; Return a randomly chosen element of the list produced by
;; (server:get-best (server:get-list areapath)), or #f when no such
;; element is available.
;;
;; NOTE(review): this span is taken from a rendered diff, so the line removed
;; and the lines added by the check-in appear together; the markers below
;; follow the -/+ column of the check-in page.
(define (server:get-rand-best areapath)
  (let ((srvrs (server:get-best (server:get-list areapath))))
    ;; --- removed by this check-in (1 line) ---
    (if (list? srvrs)
    ;; --- added by this check-in (2 lines) ---
    ;; the added null? test keeps an empty list out of the random pick below
    (if (and (list? srvrs)
	     (not (null? srvrs)))
	(let* ((len (length srvrs))
	       (idx (random len)))
	  (list-ref srvrs idx))
	#f)))


(define (server:record->url servr)