Megatest

Check-in [c8926e3cff]
Login
Overview
Comment:full fs access support and megatest.db turned off by default
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: c8926e3cff337af293928bd1bb2025ae7c85c05f
User & Date: matt on 2014-11-20 21:44:37
Other Links: branch diff | manifest | tags
Context
2014-11-20
22:20
full fs access support and megatest.db turned off by default check-in: 84336a6b8f user: matt tags: v1.60
21:44
full fs access support and megatest.db turned off by default check-in: c8926e3cff user: matt tags: v1.60
2014-11-19
14:36
Fixed bugs in exit handling check-in: bd18160690 user: mrwellan tags: v1.60
Changes

Modified megatest.scm from [b300543c22] to [16af3583d8].

285
286
287
288
289
290
291

292
293
294
295
296
297
298
299

300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328

;; The watchdog is to keep an eye on things like db sync etc.
;;
(define *watchdog*
  (make-thread 
   (lambda ()
     (thread-sleep! 0.05) ;; delay for startup

     (let loop ()
       ;; sync for filesystem local db writes
       ;;
       (let ((start-time      (current-seconds))
	     (servers-started (make-hash-table)))
	 (for-each 
	  (lambda (run-id)
	    (mutex-lock! *db-multi-sync-mutex*)

	    (if (hash-table-ref/default *db-local-sync* run-id #f)
		;; (if (> (- start-time last-write) 5) ;; every five seconds
		(begin ;; let ((sync-time (- (current-seconds) start-time)))
		  (db:multi-db-sync (list run-id) 'new2old)
		  (if (common:low-noise-print 30 "sync new to old")
		      (let ((sync-time (- (current-seconds) start-time)))
			(debug:print-info 0 "Sync of newdb to olddb for run-id " run-id " completed in " sync-time " seconds")
			(if (> sync-time 10) ;; took more than ten seconds, start a server for this run
			    (begin
			      (debug:print-info 0 "Sync is taking a long time, start up a server to assist for run " run-id)
			      (server:kind-run run-id)))))
		  (hash-table-delete! *db-local-sync* run-id)))
	    (mutex-unlock! *db-multi-sync-mutex*))
	  (hash-table-keys *db-local-sync*)))

       ;; keep going unless time to exit
       ;;
       (if (not *time-to-exit*)
	   (begin
	     (thread-sleep! 1) ;; wait one second before syncing again
	     (loop)))))
   "Watchdog thread"))

(thread-start! *watchdog*)

(if (args:get-arg "-log")
    (let ((oup (open-output-file (args:get-arg "-log"))))
      (debug:print-info 0 "Sending log output to " (args:get-arg "-log"))
      (current-error-port oup)







>
|
|
|
|
|
|
|
|
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|







285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330

;; The watchdog is to keep an eye on things like db sync etc.
;;
(define *watchdog*
  (make-thread 
   (lambda ()
     (thread-sleep! 0.05) ;; delay for startup
     (let ((legacy-sync (configf:lookup *configdat* "setup" "megatest-db")))
       (let loop ()
	 ;; sync for filesystem local db writes
	 ;;
	 (let ((start-time      (current-seconds))
	       (servers-started (make-hash-table)))
	   (for-each 
	    (lambda (run-id)
	      (mutex-lock! *db-multi-sync-mutex*)
	      (if (and legacy-sync 
		       (hash-table-ref/default *db-local-sync* run-id #f))
		  ;; (if (> (- start-time last-write) 5) ;; every five seconds
		  (begin ;; let ((sync-time (- (current-seconds) start-time)))
		    (db:multi-db-sync (list run-id) 'new2old)
		    (if (common:low-noise-print 30 "sync new to old")
			(let ((sync-time (- (current-seconds) start-time)))
			  (debug:print-info 0 "Sync of newdb to olddb for run-id " run-id " completed in " sync-time " seconds")))
		    ;; (if (> sync-time 10) ;; took more than ten seconds, start a server for this run
		    ;;     (begin
		    ;;       (debug:print-info 0 "Sync is taking a long time, start up a server to assist for run " run-id)
		    ;;       (server:kind-run run-id)))))
		    (hash-table-delete! *db-local-sync* run-id)))
	      (mutex-unlock! *db-multi-sync-mutex*))
	    (hash-table-keys *db-local-sync*)))

	 ;; keep going unless time to exit
	 ;;
	 (if (not *time-to-exit*)
	     (begin
	       (thread-sleep! 1) ;; wait one second before syncing again
	       (loop)))))
     "Watchdog thread")))

(thread-start! *watchdog*)

(if (args:get-arg "-log")
    (let ((oup (open-output-file (args:get-arg "-log"))))
      (debug:print-info 0 "Sending log output to " (args:get-arg "-log"))
      (current-error-port oup)

Modified rmt.scm from [714450135c] to [a4cf4136f4].

108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		(if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))

		(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))))
	(let ((max-avg-qry (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
	  (if (and (< attemptnum 10)
		   (configf:lookup *configdat* "server" "required"))
	      (begin
		(debug:print-info 0 "Server required mode, attempting to start server and retry query in ten seconds")
		(server:kind-run run-id)
		(thread-sleep! 10)
		(rmt:send-receive cmd rid params (+ attemptnum 1)))
	      ;; (if (rmt:write-frequency-over-limit? cmd run-id)(server:kind-run run-id))
	      (let* ((curr-max     (rmt:get-max-query-average run-id))
		     (curr-max-val (cdr curr-max)))
		(debug:print-info 4 "no server and read-only query, bypassing normal channel")
		(if (> curr-max-val max-avg-qry)
		    (if (common:low-noise-print 10 "start server due to max average query too long")
			(begin
			  (debug:print-info 0 "Max average query, " (inexact->exact (round curr-max-val)) "ms (" (car curr-max) ") exceeds " max-avg-qry "ms, try starting server ...")
			  (server:kind-run run-id))
			(debug:print-info 3 "Max average query, " (inexact->exact (round curr-max-val)) "ms (" (car curr-max) ") below " max-avg-qry "ms, not starting server...")))
		(rmt:open-qry-close-locally cmd run-id params)))))))

(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "WARNING: stats collection failed in update-db-stats")







<
|
|
|
<
|
<
|
<
<
<
<
<
<
<
<
<
<
|







108
109
110
111
112
113
114

115
116
117

118

119










120
121
122
123
124
125
126
127

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		(if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))

		(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))))

	(if (and (< attemptnum 10)
		 (tasks:need-server run-id))
	    (begin

	      (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)

	      (rmt:send-receive cmd rid params (+ attemptnum 1)))










	    (rmt:open-qry-close-locally cmd run-id params)))))

(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "WARNING: stats collection failed in update-db-stats")

Modified runs.scm from [380bd479a2] to [6ed325fc14].

214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
	 (all-tests-registry #f)  ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names
	 (all-test-names     #f)  ;; (hash-table-keys all-tests-registry))
	 (test-names         #f)  ;; (tests:filter-test-names all-test-names test-patts))
	 (required-tests     #f)  ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work
	 (task-key           (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
	 (tdbdat             (tasks:open-db)))

    (tasks:start-and-wait-for-server tdbdat run-id 10)

    (set-signal-handler! signal/int
			 (lambda (signum)
			   (signal-mask! signum)
			   (print "Received signal " signum ", cleaning up before exit. Please wait...")
			   (let ((tdbdat (tasks:open-db)))
			     (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "killed"))







|







214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
	 (all-tests-registry #f)  ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names
	 (all-test-names     #f)  ;; (hash-table-keys all-tests-registry))
	 (test-names         #f)  ;; (tests:filter-test-names all-test-names test-patts))
	 (required-tests     #f)  ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work
	 (task-key           (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
	 (tdbdat             (tasks:open-db)))

    (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))

    (set-signal-handler! signal/int
			 (lambda (signum)
			   (signal-mask! signum)
			   (print "Received signal " signum ", cleaning up before exit. Please wait...")
			   (let ((tdbdat (tasks:open-db)))
			     (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "killed"))
933
934
935
936
937
938
939
940

941
942
943
944
945
946
947
948
949
950
	     (item-path   (item-list->path itemdat))
	     (tfullname   (runs:make-full-test-name test-name item-path))
	     (newtal      (append tal (list hed)))
	     (regfull     (>= (length reg) reglen))
	     (num-running (rmt:get-count-tests-running-for-run-id run-id)))

	;; every couple minutes verify the server is there for this run
	(if (common:low-noise-print 60 "try start server"  run-id)

	    (tasks:start-and-wait-for-server tdbdat run-id 10))

      (if (> num-running 0)
	  (set! last-time-some-running (current-seconds)))

      (if (> (current-seconds)(+ last-time-some-running 240))
	  (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
	;; (debug:print 0 "max-tries-hash: " (hash-table->alist *max-tries-hash*))

	;; Ensure all top level tests get registered. This way they show up as "NOT_STARTED" on the dashboard







|
>

|
|







933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
	     (item-path   (item-list->path itemdat))
	     (tfullname   (runs:make-full-test-name test-name item-path))
	     (newtal      (append tal (list hed)))
	     (regfull     (>= (length reg) reglen))
	     (num-running (rmt:get-count-tests-running-for-run-id run-id)))

	;; every couple minutes verify the server is there for this run
	(if (and (common:low-noise-print 60 "try start server"  run-id)
		 (tasks:need-server run-id))
	    (tasks:start-and-wait-for-server tdbdat run-id 10))
	
	(if (> num-running 0)
	  (set! last-time-some-running (current-seconds)))

      (if (> (current-seconds)(+ last-time-some-running 240))
	  (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
	;; (debug:print 0 "max-tries-hash: " (hash-table->alist *max-tries-hash*))

	;; Ensure all top level tests get registered. This way they show up as "NOT_STARTED" on the dashboard
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
			       '()))
		(lasttpath "/does/not/exist/I/hope"))
	   (debug:print-info 4 "runs:operate-on run=" run ", header=" header)
	   (if (not (null? tests))
	       (begin
		 (case action
		   ((remove-runs)
		    (tasks:start-and-wait-for-server tdbdat run-id 10)
		    ;; seek and kill in flight -runtests with % as testpatt here
		    (if (equal? testpatt "%")
			(tasks:kill-runner (db:delay-if-busy tdbdat) target run-name)
			(debug:print 0 "not attempting to kill any run launcher processes as testpatt is " testpatt))
		    (debug:print 1 "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((set-state-status)
		    (tasks:start-and-wait-for-server tdbdat run-id 10)
		    (debug:print 1 "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((print-run)
		    (debug:print 1 "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
		    action)
		   ((run-wait)
		    (debug:print 1 "Waiting for run " runkey ", run=" runnamepatt " to complete"))
		   (else







|






|







1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
			       '()))
		(lasttpath "/does/not/exist/I/hope"))
	   (debug:print-info 4 "runs:operate-on run=" run ", header=" header)
	   (if (not (null? tests))
	       (begin
		 (case action
		   ((remove-runs)
		    (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
		    ;; seek and kill in flight -runtests with % as testpatt here
		    (if (equal? testpatt "%")
			(tasks:kill-runner (db:delay-if-busy tdbdat) target run-name)
			(debug:print 0 "not attempting to kill any run launcher processes as testpatt is " testpatt))
		    (debug:print 1 "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((set-state-status)
		    (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
		    (debug:print 1 "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((print-run)
		    (debug:print 1 "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
		    action)
		   ((run-wait)
		    (debug:print 1 "Waiting for run " runkey ", run=" runnamepatt " to complete"))
		   (else

Modified tasks.scm from [a0ed950b28] to [7c5174d4f3].

342
343
344
345
346
347
348
















349
350
351
352
353
354
355
356
357
358
359

360
361
362
363
364
365
366
  (let ((res #f))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     mdb ;; NEEDS dbprep ADDED
     "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND  (strftime('%s','now') - start_time) < 60));" run-id)
    res))

















;; try to start a server and wait for it to be available
;;
(define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries)
  ;; ensure a server is running for this run
  (let loop ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))
	     (delay-time 0))
      (if (and (not server-dat)
	       (< delay-time delay-max-tries))
	  (begin
	    (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)(debug:print 0 "Try starting server for run-id " run-id))

	    (server:kind-run run-id)
	    (thread-sleep! (min delay-time 5))
	    (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1))))))

(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>










|
>







342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
  (let ((res #f))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     mdb ;; NEEDS dbprep ADDED
     "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND  (strftime('%s','now') - start_time) < 60));" run-id)
    res))

(define (tasks:need-server run-id)
  (let ((forced (configf:lookup *configdat* "server" "required"))
	(maxqry (cdr (rmt:get-max-query-average run-id)))
	(threshold   (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
    (cond
     (forced 
      (if (common:low-noise-print 60 run-id "server required is set")
	  (debug:print-info 0 "Server required is set, starting server."))
      #t)
     ((> maxqry threshold)
      (if (common:low-noise-print 60 run-id "Max query time execeeded")
	  (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, starting server."))
      #t)
     (else
      #f))))

;; try to start a server and wait for it to be available
;;
(define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries)
  ;; ensure a server is running for this run
  (let loop ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))
	     (delay-time 0))
      (if (and (not server-dat)
	       (< delay-time delay-max-tries))
	  (begin
	    (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)
		(debug:print 0 "Try starting server for run-id " run-id))
	    (server:kind-run run-id)
	    (thread-sleep! (min delay-time 5))
	    (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1))))))

(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row

Modified tests/fullrun/megatest.config from [6b64710311] to [79e8e68f6b].

137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# timeout 0.025
timeout 0.01

# Server is required - slower but more resistant to Sqlite issues.
# required yes

# Start server when average query takes longer than this
server-query-threshold 15

# daemonize yes
# hostname #{scheme (get-host-name)}

## disks are:
## name host:/path/to/area
## -or-







|







137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# timeout 0.025
timeout 0.01

# Server is required - slower but more resistant to Sqlite issues.
# required yes

# Start server when average query takes longer than this
server-query-threshold 55500

# daemonize yes
# hostname #{scheme (get-host-name)}

## disks are:
## name host:/path/to/area
## -or-