Megatest

Changes On Branch 33e9582eb3ff65ce
Login

Changes In Branch rpc-transport-bbtest Through [33e9582eb3] Excluding Merge-Ins

This is equivalent to a diff from d223d55c09 to 33e9582eb3

2016-01-14
15:39
wip Closed-Leaf check-in: 9e927a9211 user: bjbarcla tags: rpc-transport, rpc-transport-bbtest
2016-01-08
16:55
factored out two functions from http-transport:keep-running to simplify: http-transport:get-server-info,http-transport:sync-inmemdb-to-db check-in: 33e9582eb3 user: bjbarcla tags: rpc-transport, rpc-transport-bbtest
2016-01-07
17:13
add bbtest rmt api func for testing check-in: 8b8c692892 user: bjbarcla tags: rpc-transport, rpc-transport-bbtest
2016-01-06
15:31
Create new branch named "rpc-support" Closed-Leaf check-in: d223d55c09 user: bjbarcla tags: rpc-transport
09:48
Some fixes to address issues created by the per-section config processing code check-in: 2316fa6bc4 user: mrwellan tags: v1.60

Modified Makefile from [c0b2515300] to [579f73e2e6].

1
2
3
4
5
6
7
8
9
10
11
12
# make install CSCOPTS='-accumulate-profile -profile-name $(PWD)/profile-ww$(shell date +%V.%u)'
PREFIX=$(PWD)
CSCOPTS= 
INSTALL=install
SRCFILES = common.scm items.scm launch.scm \
           ods.scm runconfig.scm server.scm configf.scm \
           db.scm keys.scm margs.scm megatest-version.scm \
           process.scm runs.scm tasks.scm tests.scm genexample.scm \
	   http-transport.scm nmsg-transport.scm filedb.scm \
           client.scm gutils.scm synchash.scm daemon.scm mt.scm dcommon.scm \
	   tree.scm ezsteps.scm lock-queue.scm sdb.scm \
	   rmt.scm api.scm tdb.scm rpc-transport.scm \




|







1
2
3
4
5
6
7
8
9
10
11
12
# make install CSCOPTS='-accumulate-profile -profile-name $(PWD)/profile-ww$(shell date +%V.%u)'
PREFIX=$(PWD)
CSCOPTS= 
INSTALL=install
SRCFILES = common.scm bb.scm items.scm launch.scm \
           ods.scm runconfig.scm server.scm configf.scm \
           db.scm keys.scm margs.scm megatest-version.scm \
           process.scm runs.scm tasks.scm tests.scm genexample.scm \
	   http-transport.scm nmsg-transport.scm filedb.scm \
           client.scm gutils.scm synchash.scm daemon.scm mt.scm dcommon.scm \
	   tree.scm ezsteps.scm lock-queue.scm sdb.scm \
	   rmt.scm api.scm tdb.scm rpc-transport.scm \

Modified api.scm from [7425d00411] to [ee00bee346].

9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
;;  PURPOSE.
;;======================================================================

(declare (unit api))
(declare (uses rmt))
(declare (uses db))
(declare (uses tasks))

;; allow these queries through without starting a server
;;
(define api:read-only-queries
  '(get-key-val-pairs
    get-keys
    test-toplevel-num-items
    get-test-info-by-id







|







9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
;;  PURPOSE.
;;======================================================================

(declare (unit api))
(declare (uses rmt))
(declare (uses db))
(declare (uses tasks))
(declare (uses bb))
;; allow these queries through without starting a server
;;
(define api:read-only-queries
  '(get-key-val-pairs
    get-keys
    test-toplevel-num-items
    get-test-info-by-id
53
54
55
56
57
58
59

60
61
62
63
64
65
66
    testmeta-get-record
    have-incompletes?
    synchash-get
    ))

(define api:write-queries
  '(

    ;; SERVERS
    start-server
    kill-server

    ;; TESTS
    test-set-state-status-by-id
    delete-test-records







>







53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
    testmeta-get-record
    have-incompletes?
    synchash-get
    ))

(define api:write-queries
  '(
    bbtest
    ;; SERVERS
    start-server
    kill-server

    ;; TESTS
    test-set-state-status-by-id
    delete-test-records
116
117
118
119
120
121
122



123
124
125
126
127
128
129
130
	#t 
	(let ((cmd    (vector-ref dat 0))
	      (params (vector-ref dat 1)))
	  (case (if (symbol? cmd)
		    cmd
		    (string->symbol cmd))




	    ;;===============================================
	    ;; READ/WRITE QUERIES
	    ;;===============================================

	    ;; SERVERS
	    ((start-server)                    (apply server:kind-run params))
	    ((kill-server)                     (set! *server-run* #f))








>
>
>
|







117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
	#t 
	(let ((cmd    (vector-ref dat 0))
	      (params (vector-ref dat 1)))
	  (case (if (symbol? cmd)
		    cmd
		    (string->symbol cmd))

            ;; Brandon's test to extend API
            ((bbtest) (apply bb:test params))

            ;;===============================================
	    ;; READ/WRITE QUERIES
	    ;;===============================================

	    ;; SERVERS
	    ((start-server)                    (apply server:kind-run params))
	    ((kill-server)                     (set! *server-run* #f))

Modified http-transport.scm from [d387fec12a] to [e7099e7d7f].

345
346
347
348
349
350
351



















































































352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390

391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
;;
(define (http-transport:client-connect iface port)
  (let* ((api-url      (conc "http://" iface ":" port "/api"))
	 (api-uri      (uri-reference (conc "http://" iface ":" port "/api")))
	 (api-req      (make-request method: 'POST uri: api-uri))
	 (server-dat   (vector iface port api-uri api-url api-req (current-seconds))))
    server-dat))




















































































;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (http-transport:keep-running server-id run-id)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (debug:print-info 0 "Starting the sync-back, keep alive thread in server for run-id=" run-id)
  (let* ((tdbdat      (tasks:open-db))
	 (server-start-time (current-seconds))
	 (server-info (let loop ((start-time (current-seconds))
				 (changed    #t)
				 (last-sdat  "not this"))
                        (let ((sdat #f))
			  (thread-sleep! 0.01)
			  (debug:print-info 0 "Waiting for server alive signature")
                          (mutex-lock! *heartbeat-mutex*)
                          (set! sdat *server-info*)
                          (mutex-unlock! *heartbeat-mutex*)
                          (if (and sdat
				   (not changed)
				   (> (- (current-seconds) start-time) 2))
			      sdat
                              (begin
				(debug:print-info 0 "Still waiting, last-sdat=" last-sdat)
                                (sleep 4)
				(if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
				    (begin
				      (debug:print 0 "ERROR: transport appears to have died, exiting server " server-id " for run " run-id)
				      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature")
				      (exit))
				    (loop start-time
					  (equal? sdat last-sdat)
					  sdat)))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (server-timeout (server:get-timeout)))

    (let loop ((count         0)
	       (server-state 'available)
	       (bad-sync-count 0))

      ;; Use this opportunity to sync the inmemdb to db
      (if *inmemdb* 
	  (let ((start-time (current-milliseconds))
		(sync-time  #f)
		(rem-time   #f))
	    ;; inmemdb is a dbstruct
	    (condition-case
	     (db:sync-touched *inmemdb* *run-id* force-sync: #t)
	     ((sync-failed)(cond
			    ((> bad-sync-count 10) ;; time to give up
			     (http-transport:server-shutdown server-id port))
			    (else ;; (> bad-sync-count 0)  ;; we've had a fail or two, delay and loop
			     (thread-sleep! 5)
			     (loop count server-state (+ bad-sync-count 1)))))
	     ((exn)
	      (debug:print 0 "ERROR: error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server")
	      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running crashed")
	      (exit)))
	    (set! sync-time  (- (current-milliseconds) start-time))
	    (set! rem-time (quotient (- 4000 sync-time) 1000))
	    (debug:print 4 "SYNC: time= " sync-time ", rem-time=" rem-time)
	    
	    (if (and (<= rem-time 4)
		     (> rem-time 0))
		(thread-sleep! rem-time)
		(thread-sleep! 4))) ;; fallback for if the math is changed ...

	  ;;
	  ;; no *inmemdb* yet, set running after our first pass through and start the db
	  ;;
	  (if (eq? server-state 'available)
	      (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers
		(if (equal? new-server-id server-id)
		    (begin
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
		      (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access
		      (set! *inmemdb*  (db:setup run-id))
		      ;; force initialization
		      ;; (db:get-db *inmemdb* #t)
		      (db:get-db *inmemdb* run-id)
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running"))
		    (begin ;; gotta exit nicely
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision")
		      (http-transport:server-shutdown server-id port))))))
      
      (if (< count 1) ;; 3x3 = 9 secs aprox
	  (loop (+ count 1) 'running bad-sync-count))
      
      ;; Check that iface and port have not changed (can happen if server port collides)
      (mutex-lock! *heartbeat-mutex*)
      (set! sdat *server-info*)
      (mutex-unlock! *heartbeat-mutex*)







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>











|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<




>





|
<
|
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446























447
448
449
450
451
452
453
454
455
456
457

458









459







460























461
462
463
464
465
466
467
;;
(define (http-transport:client-connect iface port)
  (let* ((api-url      (conc "http://" iface ":" port "/api"))
	 (api-uri      (uri-reference (conc "http://" iface ":" port "/api")))
	 (api-req      (make-request method: 'POST uri: api-uri))
	 (server-dat   (vector iface port api-uri api-url api-req (current-seconds))))
    server-dat))

;;; factored out of http-transport:keep-running
;; return #t if a bad sync occurred and a retry is warranted
;; return #f otherwise
;; side effect - cleans up and exits on exception.
(define (http-transport:sync-inmemdb-to-db tdbdat server-state run-id server-id bad-sync-count)
  (if *inmemdb* 
      (let ((start-time (current-milliseconds))
            (sync-time  #f)
            (rem-time   #f)
            (sync-retry #f))
        ;; inmemdb is a dbstruct
        (condition-case
         (db:sync-touched *inmemdb* *run-id* force-sync: #t)
         ((sync-failed)(cond
                        ((> bad-sync-count 10) ;; time to give up
                         (http-transport:server-shutdown server-id port))
                        (else ;; (> bad-sync-count 0)  ;; we've had a fail or two, delay and loop
                         (thread-sleep! 5)
                         (set! sync-retry #t))))
         ((exn)
          (debug:print 0 "ERROR: error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server")
          (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running crashed")
          (exit)))
        (if sync-retry
            #t ; return true - retry
            (begin
              (set! sync-time  (- (current-milliseconds) start-time))
              (set! rem-time (quotient (- 4000 sync-time) 1000))
              (debug:print 4 "SYNC: time= " sync-time ", rem-time=" rem-time)
              
              (if (and (<= rem-time 4)
                       (> rem-time 0))
                  (thread-sleep! rem-time)
                  (thread-sleep! 4)) ;; fallback for if the math is changed ...
              
              ;;
              ;; no *inmemdb* yet, set running after our first pass through and start the db
              ;;
              (if (eq? server-state 'available)
                  (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers
                    (if (equal? new-server-id server-id)
                        (begin
                          (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
                          (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access
                          (set! *inmemdb*  (db:setup run-id))
                          ;; force initialization
                          ;; (db:get-db *inmemdb* #t)
                          (db:get-db *inmemdb* run-id)
                          (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running"))
                        (begin ;; gotta exit nicely
                          (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision")
                          (http-transport:server-shutdown server-id port)))))
              #f))) ; return #f - don't retry
      #f)) ; return #f - don't retry since there is no inmemdb


;;; factored out of http-transport:keep-running
(define (http-transport:get-server-info tdbdat server-start-time server-id run-id)
  (let loop ((start-time (current-seconds))
             (changed    #t)
             (last-sdat  "not this"))
    (let ((sdat #f))
      (thread-sleep! 0.01)
      (debug:print-info 0 "Waiting for server alive signature")
      (mutex-lock!   *heartbeat-mutex*)
      (set! sdat     *server-info*)
      (mutex-unlock! *heartbeat-mutex*)
      (if (and sdat
               (not changed)
               (> (- (current-seconds) start-time) 2))
          sdat
          (begin
            (debug:print-info 0 "Still waiting, last-sdat=" last-sdat)
            (sleep 4)
            (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
                (begin
                  (debug:print 0 "ERROR: transport appears to have died, exiting server " server-id " for run " run-id)
                  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature")
                  (exit))
                (loop start-time
                      (equal? sdat last-sdat)
                      sdat)))))))

;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (http-transport:keep-running server-id run-id)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (debug:print-info 0 "Starting the sync-back, keep alive thread in server for run-id=" run-id)
  (let* ((tdbdat      (tasks:open-db))
	 (server-start-time (current-seconds))
	 (server-info (http-transport:get-server-info tdbdat server-start-time server-id run-id))























         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (server-timeout (server:get-timeout)))

    (let loop ((count         0)
	       (server-state 'available)
	       (bad-sync-count 0))

      ;; Use this opportunity to sync the inmemdb to db
      (let ((sync-retry (http-transport:sync-inmemdb-to-db tdbdat server-state run-id server-id bad-sync-count)))

        (if sync-retry









            (loop count server-state (+ bad-sync-count 1))))







            























      (if (< count 1) ;; 3x3 = 9 secs aprox
	  (loop (+ count 1) 'running bad-sync-count))
      
      ;; Check that iface and port have not changed (can happen if server port collides)
      (mutex-lock! *heartbeat-mutex*)
      (set! sdat *server-info*)
      (mutex-unlock! *heartbeat-mutex*)

Modified rmt.scm from [58033889c8] to [02ebc7dccc].

582
583
584
585
586
587
588




589
590
591
592
593
594
595
(define (rmt:get-runs-by-patt  keys runnamepatt targpatt offset limit fields) ;; fields of #f uses default
  (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields)))

(define (rmt:find-and-mark-incomplete run-id ovr-deadtime)
  (if (rmt:send-receive 'have-incompletes? run-id (list run-id ovr-deadtime))
      (rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime))))





;;======================================================================
;; M U L T I R U N   Q U E R I E S
;;======================================================================

;; Need to move this to multi-run section and make associated changes
(define (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f))
  (let ((run-ids (rmt:get-all-run-ids)))







>
>
>
>







582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
(define (rmt:get-runs-by-patt  keys runnamepatt targpatt offset limit fields) ;; fields of #f uses default
  (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields)))

(define (rmt:find-and-mark-incomplete run-id ovr-deadtime)
  (if (rmt:send-receive 'have-incompletes? run-id (list run-id ovr-deadtime))
      (rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime))))


(define (rmt:bbtest )
  (rmt:send-receive 'bbtest #f '()))

;;======================================================================
;; M U L T I R U N   Q U E R I E S
;;======================================================================

;; Need to move this to multi-run section and make associated changes
(define (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f))
  (let ((run-ids (rmt:get-all-run-ids)))