Megatest

Diff
Login

Differences From Artifact [51e718f694]:

To Artifact [8fc869d3b5]:


1
2
3
4
5
6
7
8
9
10
11
12

13
14
15
16
17
18


19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35







36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65






66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81




82
83
84
85


86
87
88







89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112



























113
114
115
116
117
118
119
120
121
122
123
















124
125

126
127
128
129
130
131
132
133









134
135
136

137
138
139
140

141
142
143
144
145



146
147
148
149

150
151
152
153
154




155
156
157


158
159
160
161
162
163
164
165
166












167
168

169
170
171
172




173
174
175
176
177
178
179
1
2
3
4
5
6
7
8
9
10
11

12
13
14
15
16
17
18
19
20
21
22
23
24









25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40




















41
42
43
44

45
46
47
48
49
50
51
52


53
54
55
56
57
58
59
60
61



62
63
64
65




66
67



68
69
70
71
72
73
74
























75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101











102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117


118








119
120
121
122
123
124
125
126
127



128




129





130
131
132




133





134
135
136
137



138
139









140
141
142
143
144
145
146
147
148
149
150
151
152

153




154
155
156
157
158
159
160
161
162
163
164











-
+






+
+




-
-
-
-
-
-
-
-
-




+
+
+
+
+
+
+





-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-




-
+
+
+
+
+
+


-
-









-
-
-
+
+
+
+
-
-
-
-
+
+
-
-
-
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
-
-
-
+
-
-
-
-
+
-
-
-
-
-
+
+
+
-
-
-
-
+
-
-
-
-
-
+
+
+
+
-
-
-
+
+
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+

-
+
-
-
-
-
+
+
+
+







;;======================================================================
;; Copyright 2006-2013, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.
;;======================================================================

(use json format) ;; RADT => purpose of json format??
(use format typed-records) ;; RADT => purpose of json format??

(declare (unit rmt))
(declare (uses api))
(declare (uses tdb))
(declare (uses http-transport))
;;(declare (uses nmsg-transport))
(include "common_records.scm")

;;
;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!!
;;

;; ;; For debugging add the following to ~/.megatestrc
;;
;; (require-library trace)
;; (import trace)
;; (trace
;; rmt:send-receive
;; api:execute-requests
;; )

;; generate entries for ~/.megatestrc with the following
;;
;;  grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u

(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        (if *toppath* (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f)
  (transport         *transport-type*)
  (server-timeout    (or (server:get-timeout) 100))) ;; default to 100 seconds

;;======================================================================
;;  S U P P O R T   F U N C T I O N S
;;======================================================================

;;
(define (rmt:write-frequency-over-limit? cmd run-id)
  (and (not (member cmd api:read-only-queries))
       (let* ((tmprec (hash-table-ref/default *write-frequency* run-id #f))
	      (record (if tmprec tmprec 
			  (let ((v (vector (current-seconds) 0)))
			    (hash-table-set! *write-frequency* run-id v)
			    v)))
	      (count  (+ 1 (vector-ref record 1)))
	      (start  (vector-ref record 0))
	      (queries-per-second (/ (* count 1.0)
				     (max (- (current-seconds) start) 1))))
	 (vector-set! record 1 count)
	 (if (and (> count 10)
		  (> queries-per-second 10))
	     (begin
	       (debug:print-info 1 *default-log-port* "db write rate too high, starting a server, count=" count " start=" start " run-id=" run-id " queries-per-second=" queries-per-second)
	       #t)
	     #f))))

;; if a server is either running or in the process of starting call client:setup
;; else return #f to let the calling proc know that there is no server available
;;
(define (rmt:get-connection-info run-id)
  (let ((cinfo (hash-table-ref/default *runremote* run-id #f)))
  (let ((cinfo *runremote*)) ;; (hash-table-ref/default *runremote* run-id #f)))
;; how about if rrr is a defstruct and we use a wrapper to access it (even better would be a macro)
;; look in common_records for with-mutex
;;
;; (with-mutex *rrr-mutex* rmt:dat-host *runremote*) => returns value
;; (with-mutex *rrr-mutex* rmt:
    (if cinfo
	cinfo
	;; NB// can cache the answer for server running for 10 seconds ...
	;;  ;; (and (not (rmt:write-frequency-over-limit? cmd run-id))
	(if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id)
	    (client:setup run-id)
	    #f))))

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected
  ;; clean out old connections
  ;; (mutex-lock! *db-multi-sync-mutex*)
  (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin

  ;; do all the prep locked under the rmt-mutex
  (mutex-lock! *rmt-mutex*)

    (for-each 
     (lambda (run-id)
       (let ((connection (hash-table-ref/default *runremote* run-id #f)))
         (if (and (vector? connection)
  ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in *runremote*
  ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
        	  (< (http-transport:server-dat-get-last-access connection) expire-time))
             (begin
               (debug:print-info 0 *default-log-port* "Discarding connection to server for run-id " run-id ", too long between accesses")
  ;; 3. do the query, if on homehost use local access
  ;;
  (let* ((start-time (current-seconds))) ;; snapshot time so all use cases get same value
    (cond
     ;; give up if more than 15 attempts
     ((> attemptnum 15)
      (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.")
               ;; bb- disabling nanomsg
               ;; SHOULD CLOSE THE CONNECTION HERE 
	       ;; (case *transport-type*
	       ;;   ((nmsg)(nn-close (http-transport:server-dat-get-socket 
	       ;;  		   (hash-table-ref *runremote* run-id)))))
               (hash-table-delete! *runremote* run-id)))))
     (hash-table-keys *runremote*)))
  ;; (mutex-unlock! *db-multi-sync-mutex*)
  ;; (mutex-lock! *send-receive-mutex*)
  (let* ((run-id          (if rid rid 0))
	 (connection-info (rmt:get-connection-info run-id)))
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (case *transport-type*
			  ((http)(condition-case
				  (http-transport:client-api-send-receive run-id connection-info cmd params)
				  ((commfail)(vector #f "communications fail"))
				  ((exn)(vector #f "other fail"))))
			  ;; ((nmsg)(condition-case
			  ;;         (nmsg-transport:client-api-send-receive run-id connection-info cmd params)
			  ;;         ((timeout)(vector #f "timeout talking to server"))))
			  (else  (exit))))
	       (success (if (vector? dat) (vector-ref dat 0) #f))
      (exit 1))
     ;; ensure we have a record for our connection for given area
     ((not *runremote*)                     
      (set! *runremote* (make-remote))
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a homehost record
     ((not (pair? (remote-hh-dat *runremote*)))  ;; have a homehost record?
      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
      (remote-hh-dat-set! *runremote* (common:get-homehost))
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; on homehost and this is a read
     ((and (cdr (remote-hh-dat *runremote*))   ;; on homehost
           (member cmd api:read-only-queries)) ;; this is a read
      (mutex-unlock! *rmt-mutex*)
      (rmt:open-qry-close-locally cmd 0 params))
     ;; on homehost and this is a write, we already have a server
     ((and (cdr (remote-hh-dat *runremote*))         ;; on homehost
           (not (member cmd api:read-only-queries))  ;; this is a write
           (remote-server-url *runremote*))          ;; have a server
      (mutex-unlock! *rmt-mutex*)
      (rmt:open-qry-close-locally cmd 0 params))
     ;; no server contact made and this is a write, try starting a server 
     ((and (not (remote-server-url *runremote*))
	   (not (member cmd api:read-only-queries)))
      (let ((serverconn (server:check-if-running *toppath*)))
	       (res     (if (vector? dat) (vector-ref dat 1) #f)))
	  (if (vector? connection-info)(http-transport:server-dat-update-last-access connection-info))
	  (if success
	      (begin
		;; (mutex-unlock! *send-receive-mutex*)
		(case *transport-type* 
		  ((http) res) ;; (db:string->obj res))
		  ;; ((nmsg) res)
                  )) ;; (vector-ref res 1)))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 *default-log-port* "WARNING: Communication failed, trying call to rmt:send-receive again.")
	(if serverconn
	    (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed
	    (if (not (server:start-attempted? *toppath*))
		(server:kind-run *toppath*))))
      (if (cdr (remote-hh-dat *runremote*)) ;; we are on the homehost, just do the call
          (begin
            (mutex-unlock! *rmt-mutex*)
            (rmt:open-qry-close-locally cmd 0 params))
          (begin
            (mutex-unlock! *rmt-mutex*)
            (rmt:send-receive cmd rid params attemptnum: attemptnum))))
     ;; if not on homehost ensure we have a connection to a live server
     ((or (not (pair? (remote-hh-dat *runremote*)))  ;; have a homehost record?
	  (not (cdr (remote-hh-dat *runremote*)))    ;; have record, are we on a homehost?
	  (not (remote-conndat *runremote*)))         ;; do we not have a connection?
      (remote-hh-dat-set!  *runremote* (common:get-homehost))
		;; (case *transport-type*
		;;   ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info))))
      (remote-conndat-set! *runremote* (rmt:get-connection-info 0))
		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection
		;; NOTE: killing server causes this process to block forever. No idea why. Dec 2. 
		;; (if (eq? (modulo attemptnum 5) 0)
		;;     (tasks:kill-server-run-id run-id tag: "api-send-receive-failed"))
		;; (mutex-unlock! *send-receive-mutex*) ;; close the mutex here to allow other threads access to communications
		(tasks:start-and-wait-for-server (tasks:open-db) run-id 15)
		;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1))))))

      (mutex-unlock! *rmt-mutex*)
      (server:kind-run *toppath*) ;; we need a sever
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; all set up if get this far, dispatch the query
     ((cdr (remote-hh-dat *runremote*)) ;; we are on homehost
      (mutex-unlock! *rmt-mutex*)
      (rmt:open-qry-close-locally cmd (if rid rid 0) params))
     ;; reset the connection if it has been unused too long
     ((and (remote-conndat *runremote*)
		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
	   (let ((expire-time (- start-time (remote-server-timeout *runremote*))))
		;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
		;; (thread-sleep! 2)
		(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))))
	;; no connection info? try to start a server, or access locally if no
	     (< (http-transport:server-dat-get-last-access connection) expire-time)))
	;; server and the query is read-only
	;;
	;; Note: The tasks db was checked for a server in starting mode in the rmt:get-connection-info call
	;;
	(if (and (< attemptnum 15)
      (remote-conndatr *runremote* #f))
     ;; not on homehost, do server query
     (else
		 (member cmd api:write-queries))
	    (let ((faststart (configf:lookup *configdat* "server" "faststart")))
	      (hash-table-delete! *runremote* run-id)
	      ;; (mutex-unlock! *send-receive-mutex*)
      (mutex-unlock! *rmt-mutex*)
	      (if (and faststart (equal? faststart "no"))
		  (begin
		    (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
		    (thread-sleep! (random 5)) ;; give some time to settle and minimize collison?
		    (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
      (let* ((conninfo (remote-conndat *runremote*))
	     (dat      (case (remote-transport *runremote*)
			 ((http)(condition-case
				 (http-transport:client-api-send-receive run-id conninfo cmd params)
		  (let ((start-time (current-milliseconds))
			(max-query  (string->number (or (configf:lookup *configdat* "server" "server-query-threshold")
							"300")))
				 ((commfail)(vector #f "communications fail"))
				 ((exn)(vector #f "other fail"))))
			(newres     (rmt:open-qry-close-locally cmd run-id params)))
		    (let ((delta (- (current-milliseconds) start-time)))
		      (if (> delta max-query)
			  (begin
			    (debug:print-info 0 *default-log-port* "Starting server as query time " delta " is over the limit of " max-query)
			    (server:kind-run run-id)))
		      ;; return the result!
		      newres)
		    )))
			 (else
			  (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported")
			  (exit))))
	     (success  (if (vector? dat) (vector-ref dat 0) #f))
	     (res      (if (vector? dat) (vector-ref dat 1) #f)))
	(if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time
	(if (and success res)
	    (case (remote-transport *runremote*)
	      ((http) res)
	      (else
	       (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " is unknown")
	       (exit 1)))
	    (begin
	      ;; (debug:print-error 0 *default-log-port* "Communication failed!")
	      (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
	      ;; (mutex-unlock! *send-receive-mutex*)
	      ;; (exit)
	      (rmt:open-qry-close-locally cmd run-id params)
	      )))))
	      (remote-conndat-set! *runremote* #f)
	      (server-url-set!     *runremote* #f)
	      (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
	      (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))))))))

(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 *default-log-port* "WARNING: stats collection failed in update-db-stats")
225
226
227
228
229
230
231
232
233
234




235

236


237
238
239


240
241
242
243
244
245
246
247
248
249
250
251
252


253
254
255
256
257
258
259



260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286










287
288
289
290
291
292
293
210
211
212
213
214
215
216



217
218
219
220
221
222

223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240


241
242
243
244





245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264










265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281







-
-
-
+
+
+
+

+
-
+
+



+
+











-
-
+
+


-
-
-
-
-
+
+
+

















-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+







				 (cons newmax-cmd currmax)
				 (cons 'none 0))
			     (loop (car tal)(cdr tal) newmax-cmd currmax)))))))
    (mutex-unlock! *db-stats-mutex*)
    res))

(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5))
  (let* ((dbstruct-local (db:open-local-db-handle))
	 (db-file-path   (db:dbfile-path 0))
	 ;; (read-only      (not (file-read-access? db-file-path)))
  (let* ((qry-is-write   (not (member cmd api:read-only-queries)))
	 (db-file-path   (db:dbfile-path)) ;;  0))
	 (dbstruct-local (db:setup))  ;; make-dbr:dbstruct path:  dbdir local: #t)))
	 (read-only      (not (file-write-access? db-file-path)))
	 (start          (current-milliseconds))
	 (resdat         (if (not (and read-only qry-is-write))
	 (resdat         (api:execute-requests dbstruct-local (vector (symbol->string cmd) params)))
			     (api:execute-requests dbstruct-local (vector (symbol->string cmd) params))
			     (vector #t '())))
	 (success        (vector-ref resdat 0))
	 (res            (vector-ref resdat 1))
	 (duration       (- (current-milliseconds) start)))
    (if (and read-only qry-is-write)
        (debug:print 0 *default-log-port* "ERROR: attempt to write to read-only database ignored. cmd=" cmd))
    (if (not success)
	(if (> remretries 0)
	    (begin
	      (debug:print-error 0 *default-log-port* "local query failed. Trying again.")
	      (thread-sleep! (/ (random 5000) 1000)) ;; some random delay 
	      (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1)))
	    (begin
	      (debug:print-error 0 *default-log-port* "too many retries in rmt:open-qry-close-locally, giving up")
	      #f))
	(begin
	  ;; (rmt:update-db-stats run-id cmd params duration)
	  ;; mark this run as dirty if this was a write
	  (if (not (member cmd api:read-only-queries))
	  ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it
	  (if qry-is-write
	      (let ((start-time (current-seconds)))
		(mutex-lock! *db-multi-sync-mutex*)
		;; (if (not (hash-table-ref/default *db-local-sync* run-id #f))
		;; just set it every time. Is a write more expensive than a read and does it matter?
		(hash-table-set! *db-local-sync* (or run-id 0) start-time) ;; the oldest "write"
		(mutex-unlock! *db-multi-sync-mutex*)))
	  res))))
		(set! *db-last-write* start-time) ;; the oldest "write"
                (mutex-unlock! *db-multi-sync-mutex*)))))
    res))

(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
  (let* ((run-id   (if run-id run-id 0))
	 ;; (jparams  (db:obj->string params)) ;; (rmt:dat->json-str params))
	 (res  	   (handle-exceptions
		    exn
		    #f
		    (http-transport:client-api-send-receive run-id connection-info cmd params))))
;;		    ((commfail) (vector #f "communications fail")))))
    (if (and res (vector-ref res 0))
	(vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!!
	#f)))
;; 	(db:string->obj (vector-ref dat 1))
;; 	(begin
;; 	  (debug:print-error 0 *default-log-port* "rmt:send-receive-no-auto-client-setup failed, attempting to continue. Got " dat)
;; 	  dat))))

;; Wrap json library for strings (why the ports crap in the first place?)
(define (rmt:dat->json-str dat)
  (with-output-to-string 
    (lambda ()
      (json-write dat))))

(define (rmt:json-str->dat json-str)
  (with-input-from-string json-str
    (lambda ()
      (json-read))))
;; ;; Wrap json library for strings (why the ports crap in the first place?)
;; (define (rmt:dat->json-str dat)
;;   (with-output-to-string 
;;     (lambda ()
;;       (json-write dat))))
;; 
;; (define (rmt:json-str->dat json-str)
;;   (with-input-from-string json-str
;;     (lambda ()
;;       (json-read))))

;;======================================================================
;;
;; A C T U A L   A P I   C A L L S  
;;
;;======================================================================

307
308
309
310
311
312
313
314
315
316



317
318
319
320
321
322
323
295
296
297
298
299
300
301



302
303
304
305
306
307
308
309
310
311







-
-
-
+
+
+








(define (rmt:login run-id)
  (rmt:send-receive 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))

;; This login does no retries under the hood - it acts a bit like a ping.
;; Deprecated for nmsg-transport.
;;
(define (rmt:login-no-auto-client-setup connection-info run-id)
  (case *transport-type*
    ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))
(define (rmt:login-no-auto-client-setup connection-info)
  (case *transport-type* ;; run-id of 0 is just a placeholder
    ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login 0 (list *toppath* megatest-version *my-client-signature*)))
    ;;((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*)))
    ))

;; hand off a call to one of the db:queries statements
;; added run-id to make looking up the correct db possible 
;;
(define (rmt:general-call stmtname run-id . params)
523
524
525
526
527
528
529
530
531


532
533
534
535
536
537
538
511
512
513
514
515
516
517


518
519
520
521
522
523
524
525
526







-
-
+
+







  (rmt:send-receive 'get-count-tests-running-for-testname run-id (list run-id testname)))

(define (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)
  (rmt:send-receive 'get-count-tests-running-in-jobgroup run-id (list run-id jobgroup)))

;; state and status are extra hints not usually used in the calculation
;;
(define (rmt:roll-up-pass-fail-counts run-id test-name item-path state status)
  (rmt:send-receive 'roll-up-pass-fail-counts run-id (list run-id test-name item-path state status)))
(define (rmt:roll-up-pass-fail-counts run-id test-name item-path state status comment)
  (rmt:send-receive 'roll-up-pass-fail-counts run-id (list run-id test-name item-path state status comment)))

(define (rmt:update-pass-fail-counts run-id test-name)
  (rmt:general-call 'update-pass-fail-counts run-id test-name test-name test-name))

(define (rmt:top-test-set-per-pf-counts run-id test-name)
  (rmt:send-receive 'top-test-set-per-pf-counts run-id (list run-id test-name)))