Megatest

Check-in [6b5c7789cb]
Login
Overview
Comment:improved server stability
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.63
Files: files | file ages | folders
SHA1: 6b5c7789cb09e64c09ec9326d70e20615984af3a
User & Date: bjbarcla on 2016-12-27 17:15:48
Other Links: branch diff | manifest | tags
Context
2016-12-28
16:35
fixed condition where watchdog continued to process many times rapidly when time-to-exit is #t check-in: 88034605c0 user: bjbarcla tags: v1.63
14:46
Manual merge to integ-home to fix a conflict check-in: 6673cb2ce0 user: mrwellan tags: integ-home
2016-12-27
17:15
improved server stability check-in: 6b5c7789cb user: bjbarcla tags: v1.63
2016-12-22
17:41
corrected close-all-connections in exit proc to avoid stack dump check-in: 9f7ecc5050 user: bjbarcla tags: v1.63
Changes

Modified api.scm from [19e6c44e9d] to [1debddd502].

60
61
62
63
64
65
66


67
68
69
70
71
72
73
    testmeta-get-record
    have-incompletes?
    synchash-get
    ))

(define api:write-queries
  '(


    ;; SERVERS
    start-server
    kill-server

    ;; TESTS
    test-set-state-status-by-id
    delete-test-records







>
>







60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
    testmeta-get-record
    have-incompletes?
    synchash-get
    ))

(define api:write-queries
  '(
    get-keys-write ;; dummy "write" query to force server start

    ;; SERVERS
    start-server
    kill-server

    ;; TESTS
    test-set-state-status-by-id
    delete-test-records
187
188
189
190
191
192
193

194
195
196
197
198
199
200
	    ;;======================================================================
	    ;; READ ONLY QUERIES
	    ;;======================================================================

	    ;; KEYS
	    ((get-key-val-pairs)               (apply db:get-key-val-pairs dbstruct params))
	    ((get-keys)                        (db:get-keys dbstruct))

	    ((get-key-vals)                    (apply db:get-key-vals dbstruct params))
	    ((get-target)                      (apply db:get-target dbstruct params))
	    ((get-targets)                     (db:get-targets dbstruct))

	    ;; ARCHIVES
	    ((test-get-archive-block-info)     (apply db:test-get-archive-block-info dbstruct params))
	    







>







189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
	    ;;======================================================================
	    ;; READ ONLY QUERIES
	    ;;======================================================================

	    ;; KEYS
	    ((get-key-val-pairs)               (apply db:get-key-val-pairs dbstruct params))
	    ((get-keys)                        (db:get-keys dbstruct))
            ((get-keys-write)                        (db:get-keys dbstruct)) ;; force a dummy "write" query to force server
	    ((get-key-vals)                    (apply db:get-key-vals dbstruct params))
	    ((get-target)                      (apply db:get-target dbstruct params))
	    ((get-targets)                     (db:get-targets dbstruct))

	    ;; ARCHIVES
	    ((test-get-archive-block-info)     (apply db:test-get-archive-block-info dbstruct params))
	    

Modified common.scm from [4f9e9ae478] to [ac422f66d5].

551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575

576
577
578
579
580
581
582
583
584
585
586
587
;;;; run-ids
;;    if #f use *db-local-sync* : or 'local-sync-flags
;;    if #t use timestamps      : or 'timestamps
(define (common:sync-to-megatest.db dbstruct) 
  (let ((start-time         (current-seconds))
	(res                (db:multi-db-sync dbstruct 'new2old)))
    (let ((sync-time (- (current-seconds) start-time)))
      (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds")
      (if (common:low-noise-print 30 "sync new to old")
	  (debug:print-info 0 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds")))
    res))

;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp
;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server)
;;
(define (common:watchdog)
  (thread-sleep! 0.05) ;; delay for startup
  (let ((legacy-sync (common:run-sync?))
	(debug-mode  (debug:debug-mode 1))
	(last-time   (current-seconds)))
    (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync)
    (if legacy-sync
	(let ((dbstruct (db:setup)))
	  (debug:print-info 0 *default-log-port* "Server running, periodic sync started.")
	  (let loop ()

	    ;; sync for filesystem local db writes
	    ;;
	    (mutex-lock! *db-multi-sync-mutex*)
	    (let* (
                   (need-sync        (>= *db-last-access* *db-last-sync*)) ;; no sync since last write
		   (sync-in-progress *db-sync-in-progress*)
		   (should-sync      (> (- (current-seconds) *db-last-sync*) 5)) ;; sync every five seconds minimum
		   (will-sync        (and (or need-sync should-sync)
					  (not sync-in-progress)))
		   (start-time       (current-seconds)))
	      ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync)
	      (if will-sync (set! *db-sync-in-progress* #t))







|

|










|




>



<
|







551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579

580
581
582
583
584
585
586
587
;;;; run-ids
;;    if #f use *db-local-sync* : or 'local-sync-flags
;;    if #t use timestamps      : or 'timestamps
(define (common:sync-to-megatest.db dbstruct) 
  (let ((start-time         (current-seconds))
	(res                (db:multi-db-sync dbstruct 'new2old)))
    (let ((sync-time (- (current-seconds) start-time)))
      (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id))
      (if (common:low-noise-print 30 "sync new to old")
	  (debug:print-info 0 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id))))
    res))

;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp
;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server)
;;
(define (common:watchdog)
  (thread-sleep! 0.05) ;; delay for startup
  (let ((legacy-sync (common:run-sync?))
	(debug-mode  (debug:debug-mode 1))
	(last-time   (current-seconds)))
    (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id))
    (if legacy-sync
	(let ((dbstruct (db:setup)))
	  (debug:print-info 0 *default-log-port* "Server running, periodic sync started.")
	  (let loop ()
            ;;(BB> "watchdog loop.  pid="(current-process-id))
	    ;; sync for filesystem local db writes
	    ;;
	    (mutex-lock! *db-multi-sync-mutex*)

	    (let* ((need-sync        (>= *db-last-access* *db-last-sync*)) ;; no sync since last write
		   (sync-in-progress *db-sync-in-progress*)
		   (should-sync      (> (- (current-seconds) *db-last-sync*) 5)) ;; sync every five seconds minimum
		   (will-sync        (and (or need-sync should-sync)
					  (not sync-in-progress)))
		   (start-time       (current-seconds)))
	      ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync)
	      (if will-sync (set! *db-sync-in-progress* #t))
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
		  (if (and (not *time-to-exit*)
			   (< count 4)) ;; was 11, changing to 4. 
		      (begin
			(thread-sleep! 1)
			(delay-loop (+ count 1))))
		  (loop)))
	    (if (common:low-noise-print 30)
		(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*)))))))

(define (std-exit-procedure)
  (on-exit (lambda () 0))
  (let ((no-hurry  (if *time-to-exit* ;; hurry up
		       #f
		       (begin
			 (set! *time-to-exit* #t)







|







614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
		  (if (and (not *time-to-exit*)
			   (< count 4)) ;; was 11, changing to 4. 
		      (begin
			(thread-sleep! 1)
			(delay-loop (+ count 1))))
		  (loop)))
	    (if (common:low-noise-print 30)
		(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id))))))))

(define (std-exit-procedure)
  (on-exit (lambda () 0))
  (let ((no-hurry  (if *time-to-exit* ;; hurry up
		       #f
		       (begin
			 (set! *time-to-exit* #t)

Modified http-transport.scm from [ec36961174] to [285317a3d3].

484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
;; (if (and (<= rem-time 4)
;; 	     (> rem-time 0))
;; 	(thread-sleep! rem-time)
;; 	(thread-sleep! 4))) ;; fallback for if the math is changed ...

(define (http-transport:server-shutdown server-id port)
  (let ((tdbdat (tasks:open-db)))
    (debug:print-info 0 *default-log-port* "Starting to shutdown the server.")
    ;;
    ;; start_shutdown
    ;;
    (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down")
    (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up
    (portlogger:open-run-close portlogger:set-port port "released")
    (thread-sleep! 5)







|







484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
;; (if (and (<= rem-time 4)
;; 	     (> rem-time 0))
;; 	(thread-sleep! rem-time)
;; 	(thread-sleep! 4))) ;; fallback for if the math is changed ...

(define (http-transport:server-shutdown server-id port)
  (let ((tdbdat (tasks:open-db)))
    (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id))
    ;;
    ;; start_shutdown
    ;;
    (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down")
    (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up
    (portlogger:open-run-close portlogger:set-port port "released")
    (thread-sleep! 5)

Modified rmt.scm from [7e5abf90aa] to [c70f311cf0].

84
85
86
87
88
89
90











91
92
93
94
95
96
97


98
99
100
101
102
103


104
105
106
107
108
109
110
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; on homehost and this is a read
     ((and (cdr (remote-hh-dat *runremote*))   ;; on homehost
           (member cmd api:read-only-queries)) ;; this is a read
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  3")
      (rmt:open-qry-close-locally cmd 0 params))











     ;; on homehost and this is a write, we already have a server
     ((and (cdr (remote-hh-dat *runremote*))         ;; on homehost
           (not (member cmd api:read-only-queries))  ;; this is a write
           (remote-server-url *runremote*))          ;; have a server
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4")
      (rmt:open-qry-close-locally cmd 0 params))


     ;; on homehost and this is a write, we have a server (we know because case 4 checked)
     ((and (cdr (remote-hh-dat *runremote*))         ;; on homehost
	   (not (member cmd api:read-only-queries)))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
      (rmt:open-qry-close-locally cmd 0 params))


     ;; no server contact made and this is a write, passively start a server 
     ((and (not (remote-server-url *runremote*))
	   (not (member cmd api:read-only-queries)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
      (let ((serverconn (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
	(if serverconn
	    (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed







>
>
>
>
>
>
>
>
>
>
>







>
>
|
|
|
|
|
|
>
>







84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; on homehost and this is a read
     ((and (cdr (remote-hh-dat *runremote*))   ;; on homehost
           (member cmd api:read-only-queries)) ;; this is a read
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  3")
      (rmt:open-qry-close-locally cmd 0 params))

     ;; on homehost and this is a write, we already have a server, but server has died
     ((and (cdr (remote-hh-dat *runremote*))         ;; on homehost
           (not (member cmd api:read-only-queries))  ;; this is a write
           (remote-server-url *runremote*)           ;; have a server
           (not (server:read-dotserver *toppath*)))  ;; server has died.
      (set! *runremote* #f)
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))

     ;; on homehost and this is a write, we already have a server
     ((and (cdr (remote-hh-dat *runremote*))         ;; on homehost
           (not (member cmd api:read-only-queries))  ;; this is a write
           (remote-server-url *runremote*))          ;; have a server
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4")
      (rmt:open-qry-close-locally cmd 0 params))

     ;; commented by bb; this was blocking server passive start on write on homehost (case 5)
     ;; ;; on homehost and this is a write, we have a server (we know because case 4 checked)
     ;; ((and (cdr (remote-hh-dat *runremote*))         ;; on homehost
     ;;       (not (member cmd api:read-only-queries)))
     ;;  (mutex-unlock! *rmt-mutex*)
     ;;  (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
     ;;  (rmt:open-qry-close-locally cmd 0 params))

     
     ;; no server contact made and this is a write, passively start a server 
     ((and (not (remote-server-url *runremote*))
	   (not (member cmd api:read-only-queries)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
      (let ((serverconn (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
	(if serverconn
	    (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed
347
348
349
350
351
352
353





354
355
356
357
358
359
360
  (rmt:send-receive 'get-key-val-pairs run-id (list run-id)))

(define (rmt:get-keys)
  (if *db-keys* *db-keys* 
     (let ((res (rmt:send-receive 'get-keys #f '())))
       (set! *db-keys* res)
       res)))






;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe
;; to cache the resuls in a hash
;;
(define (rmt:get-key-vals run-id)
  (or (hash-table-ref/default *keyvals* run-id #f)
      (let ((res (rmt:send-receive 'get-key-vals #f (list run-id))))







>
>
>
>
>







362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
  (rmt:send-receive 'get-key-val-pairs run-id (list run-id)))

(define (rmt:get-keys)
  (if *db-keys* *db-keys* 
     (let ((res (rmt:send-receive 'get-keys #f '())))
       (set! *db-keys* res)
       res)))

(define (rmt:get-keys-write) ;; dummy query to force server start
  (let ((res (rmt:send-receive 'get-keys-write #f '())))
    (set! *db-keys* res)
    res))

;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe
;; to cache the resuls in a hash
;;
(define (rmt:get-key-vals run-id)
  (or (hash-table-ref/default *keyvals* run-id #f)
      (let ((res (rmt:send-receive 'get-key-vals #f (list run-id))))