Megatest

Check-in [f0a33deb29]
Login
Overview
Comment:Added some defense to local queries
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60-broken-test1
Files: files | file ages | folders
SHA1: f0a33deb299155f0c7283659b8641d92dc6fd1ab
User & Date: matt on 2014-12-09 22:56:56
Other Links: branch diff | manifest | tags
Context
2014-12-09
23:00
Tweak dashboard updates check-in: 1b36724a57 user: matt tags: v1.60-broken-test1
22:56
Added some defense to local queries check-in: f0a33deb29 user: matt tags: v1.60-broken-test1
21:00
Hand-merged refactor of server launch/client start code check-in: e7355f3724 user: matt tags: v1.60-broken-test1
Changes

Modified client.scm from [4189e0da0c] to [e12d06ffcd].

205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
		;;   (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
		;;   (if (< num-available 2)
		;;       (server:try-running run-id))
		(tasks:start-and-wait-for-server tdbdat run-id delay-max-tries)
		;; (thread-sleep! (+ 2 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
		(client:setup run-id remaining-tries: (- remaining-tries 1))))))))

;; 	(let ((host-info (hash-table-ref/default *runremote* run-id #f)))
;; 	  (if host-info ;; this is a bit circular. the host-info *is* the start-res FIXME
;; 	      (let* ((iface     (http-transport:server-dat-get-iface host-info))
;; 		     (port      (http-transport:server-dat-get-port  host-info))
;; 		     (start-res (case *transport-type* 
;; 				  ((http)(http-transport:client-connect iface port))
;; 				  ((nmsg)(nmsg-transport:client-connect iface port)) ;; (http-transport:server-dat-get-socket host-info))
;; 				  (else #f)))
;; 		     (ping-res  (case *transport-type*
;; 				  ((http)(rmt:login-no-auto-client-setup start-res run-id))
;; 				  ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id)))
;; 					   (if logininfo
;; 					       (vector-ref (vector-ref logininfo 1) 1)
;; 					       #f)))
;; 				  (else #f))))
;; 		(if ping-res   ;; sucessful login?
;; 		    (begin
;; 		      (debug:print-info 2 "client:setup, ping is good using host-info=" host-info ", remaining-tries=" remaining-tries)
;; 		      start-res)  ;; return the server info
;; 		    ;; have host info but no ping. shutdown the current connection and try again
;; 		    (begin    ;; login failed
;; 		      (debug:print-info 1 "client:setup, ping is bad for start-res=" start-res " and *runremote*=" host-info)
;; 		      (case *transport-type*
;; 			((http)(http-transport:close-connections run-id)))
;; 		      (hash-table-delete! *runremote* run-id)
;; 		      (if (< remaining-tries 8)
;; 			  (thread-sleep! 5)
;; 			  (thread-sleep! 1))
;; 		      (client:setup run-id remaining-tries: (- remaining-tries 1)))))
;; 	      ;; YUK: rename server-dat here
;; 

;; keep this as a function to ease future 
(define (client:start run-id server-info)
  (http-transport:client-connect (tasks:hostinfo-get-interface server-info)
				 (tasks:hostinfo-get-port server-info)))

;; client:signal-handler
(define (client:signal-handler signum)







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







205
206
207
208
209
210
211
































212
213
214
215
216
217
218
		;;   (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
		;;   (if (< num-available 2)
		;;       (server:try-running run-id))
		(tasks:start-and-wait-for-server tdbdat run-id delay-max-tries)
		;; (thread-sleep! (+ 2 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
		(client:setup run-id remaining-tries: (- remaining-tries 1))))))))

































;; keep this as a function to ease future 
(define (client:start run-id server-info)
  (http-transport:client-connect (tasks:hostinfo-get-interface server-info)
				 (tasks:hostinfo-get-port server-info)))

;; client:signal-handler
(define (client:signal-handler signum)

Modified dashboard.scm from [6d6a8350b9] to [0a889b25e4].

1471
1472
1473
1474
1475
1476
1477
1478

1479
1480
1481
1482
1483
1484
1485
1486
1487
  (let* ((modtime         (dashboard:get-youngest-run-db-mod-time)) ;; (file-modification-time *db-file-path*))
	 (monitor-modtime (if (file-exists? *monitor-db-path*)
			      (file-modification-time *monitor-db-path*)
			      -1))
	 (run-update-time (current-seconds))
	 (recalc          (dashboard:recalc modtime *please-update-buttons* *last-db-update-time*)))
    (if (and (eq? *current-tab-number* 0)
	     (> monitor-modtime *last-monitor-update-time*))

	(begin
	  (set! *last-monitor-update-time* monitor-modtime)
	  (if dashboard:update-servers-table (dashboard:update-servers-table))))
    (if recalc
	(begin	
	  (case *current-tab-number* 
	    ((0) 
	     (if dashboard:update-summary-tab (dashboard:update-summary-tab)))
	    ((1) ;; The runs table is active







|
>

|







1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
  (let* ((modtime         (dashboard:get-youngest-run-db-mod-time)) ;; (file-modification-time *db-file-path*))
	 (monitor-modtime (if (file-exists? *monitor-db-path*)
			      (file-modification-time *monitor-db-path*)
			      -1))
	 (run-update-time (current-seconds))
	 (recalc          (dashboard:recalc modtime *please-update-buttons* *last-db-update-time*)))
    (if (and (eq? *current-tab-number* 0)
	     (or (> monitor-modtime *last-monitor-update-time*)
		 (> (- run-update-time *last-monitor-update-time*) 30))) ;; update every 1/2 minute just in case
	(begin
	  (set! *last-monitor-update-time* run-update-time) ;; monitor-modtime)
	  (if dashboard:update-servers-table (dashboard:update-servers-table))))
    (if recalc
	(begin	
	  (case *current-tab-number* 
	    ((0) 
	     (if dashboard:update-summary-tab (dashboard:update-summary-tab)))
	    ((1) ;; The runs table is active

Modified rmt.scm from [e7ce503f1e] to [e8763f368b].

26
27
28
29
30
31
32




33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
;; (require-library trace)
;; (import trace)
;; (trace
;; rmt:send-receive
;; api:execute-requests
;; )






;;======================================================================
;;  S U P P O R T   F U N C T I O N S
;;======================================================================

;; NOT USED
;;
(define (rmt:call-transport run-id connection-info cmd jparams)
  (case (server:get-transport)
    ((rpc)  ( rpc-transport:client-api-send-receive run-id connection-info cmd jparams))
    ((http) (http-transport:client-api-send-receive run-id connection-info cmd jparams))
    ((fs)   ( fs-transport:client-api-send-receive run-id connection-info cmd jparams))
    ((zmq)  (zmq-transport:client-api-send-receive run-id connection-info cmd jparams))
    (else   ( rpc-transport:client-api-send-receive run-id connection-info cmd jparams))))

;;
(define (rmt:write-frequency-over-limit? cmd run-id)
  (and (not (member cmd api:read-only-queries))
       (let* ((tmprec (hash-table-ref/default *write-frequency* run-id #f))
	      (record (if tmprec tmprec 
			  (let ((v (vector (current-seconds) 0)))







>
>
>
>





|

|
|
|
|
|
|
|







26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
;; (require-library trace)
;; (import trace)
;; (trace
;; rmt:send-receive
;; api:execute-requests
;; )

;; generate entries for ~/.megatestrc with the following
;;
;;  grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u


;;======================================================================
;;  S U P P O R T   F U N C T I O N S
;;======================================================================

;; NOT USED?
;;
;; (define (rmt:call-transport run-id connection-info cmd jparams)
;;   (case (server:get-transport)
;;     ((rpc)  ( rpc-transport:client-api-send-receive run-id connection-info cmd jparams))
;;     ((http) (http-transport:client-api-send-receive run-id connection-info cmd jparams))
;;     ((fs)   ( fs-transport:client-api-send-receive run-id connection-info cmd jparams))
;;     ((zmq)  (zmq-transport:client-api-send-receive run-id connection-info cmd jparams))
;;     (else   ( rpc-transport:client-api-send-receive run-id connection-info cmd jparams))))

;;
(define (rmt:write-frequency-over-limit? cmd run-id)
  (and (not (member cmd api:read-only-queries))
       (let* ((tmprec (hash-table-ref/default *write-frequency* run-id #f))
	      (record (if tmprec tmprec 
			  (let ((v (vector (current-seconds) 0)))
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217

218









219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
			     (if (> tot 10)
				 (cons newmax-cmd currmax)
				 (cons 'none 0))
			     (loop (car tal)(cdr tal) newmax-cmd currmax)))))))
    (mutex-unlock! *db-stats-mutex*)
    res))
	  
(define (rmt:open-qry-close-locally cmd run-id params)
  (let* ((dbstruct-local (if *dbstruct-db*
			     *dbstruct-db*
			     (let* ((dbdir (conc    (configf:lookup *configdat* "setup" "linktree") "/.db"))
				    (db (make-dbr:dbstruct path:  dbdir local: #t)))
			       (set! *dbstruct-db* db)
			       db)))
	 (db-file-path   (db:dbfile-path 0)))
    ;; (read-only      (not (file-read-access? db-file-path)))
    (let* ((start         (current-milliseconds))
	   (resdat        (api:execute-requests dbstruct-local (vector (symbol->string cmd) params)))
	   (res           (vector-ref resdat 1))

	   (duration      (- (current-milliseconds) start)))









      (rmt:update-db-stats run-id cmd params duration)
      ;; mark this run as dirty if this was a write
      (if (not (member cmd api:read-only-queries))
	  (let ((start-time (current-seconds)))
	    (mutex-lock! *db-multi-sync-mutex*)
	    ;; (if (not (hash-table-ref/default *db-local-sync* run-id #f))
	    ;; just set it every time. Is a write more expensive than a read and does it matter?
	    (hash-table-set! *db-local-sync* (or run-id 0) start-time) ;; the oldest "write"
	    (mutex-unlock! *db-multi-sync-mutex*)))
      res)))

(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
  (let* ((run-id   (if run-id run-id 0))
	 ;; (jparams  (db:obj->string params)) ;; (rmt:dat->json-str params))
	 (res  	   (handle-exceptions
		    exn
		    #f







|






|
|
|
|
|
>
|
>
>
>
>
>
>
>
>
>
|
|
|
|
|
|
|
|
|
|







203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
			     (if (> tot 10)
				 (cons newmax-cmd currmax)
				 (cons 'none 0))
			     (loop (car tal)(cdr tal) newmax-cmd currmax)))))))
    (mutex-unlock! *db-stats-mutex*)
    res))
	  
(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5))
  (let* ((dbstruct-local (if *dbstruct-db*
			     *dbstruct-db*
			     (let* ((dbdir (conc    (configf:lookup *configdat* "setup" "linktree") "/.db"))
				    (db (make-dbr:dbstruct path:  dbdir local: #t)))
			       (set! *dbstruct-db* db)
			       db)))
	 (db-file-path   (db:dbfile-path 0))
	 ;; (read-only      (not (file-read-access? db-file-path)))
	 (start          (current-milliseconds))
	 (resdat         (api:execute-requests dbstruct-local (vector (symbol->string cmd) params)))
	 (success        (vector-ref resdat 0))
	 (res            (vector-ref resdat 1))
	 (duration       (- (current-milliseconds) start)))
    (if (not success)
	(if (> remretries 0)
	    (begin
	      (debug:print 0 "ERROR: local query failed. Trying again.")
	      (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1)))
	    (begin
	      (debug:print 0 "ERROR: too many retries in rmt:open-qry-close-locally, giving up")
	      #f))
	(begin
	  (rmt:update-db-stats run-id cmd params duration)
	  ;; mark this run as dirty if this was a write
	  (if (not (member cmd api:read-only-queries))
	      (let ((start-time (current-seconds)))
		(mutex-lock! *db-multi-sync-mutex*)
		;; (if (not (hash-table-ref/default *db-local-sync* run-id #f))
		;; just set it every time. Is a write more expensive than a read and does it matter?
		(hash-table-set! *db-local-sync* (or run-id 0) start-time) ;; the oldest "write"
		(mutex-unlock! *db-multi-sync-mutex*)))
	  res))))

(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
  (let* ((run-id   (if run-id run-id 0))
	 ;; (jparams  (db:obj->string params)) ;; (rmt:dat->json-str params))
	 (res  	   (handle-exceptions
		    exn
		    #f

Modified runs.scm from [866c85daf6] to [3bd91bccb4].

158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
    (if (> (- currtime lasttime) waitval)
	(begin
	  (hash-table-set! *runs:denoise* key currtime)
	  #t)
	#f)))

(define (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)
  ;;(thread-sleep! (cond
  ;;      	  ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while
  ;;      	  (else 0)))
  (let* ((num-running             (rmt:get-count-tests-running run-id))
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (config-lookup *configdat* "jobgroups" jobgroup)))
				    (if (string? jobg-count)
					(string->number jobg-count)
					jobg-count))))
    (if (> (+ num-running num-running-in-jobgroup) 0)







|
|
|







158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
    (if (> (- currtime lasttime) waitval)
	(begin
	  (hash-table-set! *runs:denoise* key currtime)
	  #t)
	#f)))

(define (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)
  (thread-sleep! (cond
        	  ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while
        	  (else 0)))
  (let* ((num-running             (rmt:get-count-tests-running run-id))
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (config-lookup *configdat* "jobgroups" jobgroup)))
				    (if (string? jobg-count)
					(string->number jobg-count)
					jobg-count))))
    (if (> (+ num-running num-running-in-jobgroup) 0)

Modified tasks.scm from [8536412774] to [4936b038b4].

359
360
361
362
363
364
365
366

367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
     (lambda (id)
       (set! res id))
     mdb ;; NEEDS dbprep ADDED
     "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id)
    res))

(define (tasks:need-server run-id)
  (let ((forced (configf:lookup *configdat* "server" "required"))

	(maxqry (cdr (rmt:get-max-query-average run-id)))
	(threshold   (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
    (cond
     (forced 
      (if (common:low-noise-print 60 run-id "server required is set")
	  (debug:print-info 0 "Server required is set, starting server for run-id " run-id "."))
      #t)
     ((> maxqry threshold)
      (if (common:low-noise-print 60 run-id "Max query time execeeded")
	  (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, server needed for run-id " run-id "."))
      #t)
     (else
      #f))))

;; try to start a server and wait for it to be available
;;
(define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries)
  ;; ensure a server is running for this run
  (let loop ((server-running (tasks:server-running? (db:delay-if-busy tdbdat) run-id))
	     (delay-time 0))







|
>
|
|
|
|
|
|
|
|
|
|
|
|
|







359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
     (lambda (id)
       (set! res id))
     mdb ;; NEEDS dbprep ADDED
     "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id)
    res))

(define (tasks:need-server run-id)
  (configf:lookup *configdat* "server" "required"))

;; 	(maxqry (cdr (rmt:get-max-query-average run-id)))
;; 	(threshold   (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
;;     (cond
;;      (forced 
;;       (if (common:low-noise-print 60 run-id "server required is set")
;; 	  (debug:print-info 0 "Server required is set, starting server for run-id " run-id "."))
;;       #t)
;;      ((> maxqry threshold)
;;       (if (common:low-noise-print 60 run-id "Max query time execeeded")
;; 	  (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, server needed for run-id " run-id "."))
;;       #t)
;;      (else
;;       #f))))

;; try to start a server and wait for it to be available
;;
(define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries)
  ;; ensure a server is running for this run
  (let loop ((server-running (tasks:server-running? (db:delay-if-busy tdbdat) run-id))
	     (delay-time 0))