Megatest

Check-in [e75192ae30]
Login
Overview
Comment:wip to clean up server shutdown process
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.6584-nanomsg
Files: files | file ages | folders
SHA1: e75192ae303f9ab465ffb69b6c46b67da7649970
User & Date: matt on 2021-06-18 05:44:26
Other Links: branch diff | manifest | tags
Context
2021-06-18
22:29
Added clean exit for nn using 'quit check-in: 48b415009f user: matt tags: v1.6584-nanomsg
05:44
wip to clean up server shutdown process check-in: e75192ae30 user: matt tags: v1.6584-nanomsg
05:05
Removed need for watchdog check-in: b1c8817ecf user: matt tags: v1.6584-nanomsg
Changes

Modified commonmod.scm from [d20588a683] to [917d82cc3a].

226
227
228
229
230
231
232

233
234
235
236
237
238
239
240
241
242
243
244
245
246
(define *home-host*         #f)
;; (define *total-non-write-delay* 0)
(define *heartbeat-mutex*   (make-mutex))
(define *api-process-request-count* 0)
(define *max-api-process-requests* 0)
(define *server-overloaded*  #f)
(define *writes-total-delay*  0)


;; client
(define *rmt-mutex*         (make-mutex))     ;; remote access calls mutex 

;; RPC transport
(define *rpc:listener*      #f)

;; KEY info
;; (define *target*            (make-hash-table)) ;; cache the target here; target is keyval1/keyval2/.../keyvalN
(define *keys*              (make-hash-table)) ;; cache the keys here
(define *keyvals*           (make-hash-table))
(define *toptest-paths*     (make-hash-table)) ;; cache toptest path settings here
(define *test-paths*        (make-hash-table)) ;; cache test-id to test run paths here
(define *test-ids*          (make-hash-table)) ;; cache run-id, testname, and item-path => test-id







>




<
<
<







226
227
228
229
230
231
232
233
234
235
236
237



238
239
240
241
242
243
244
(define *home-host*         #f)
;; (define *total-non-write-delay* 0)
(define *heartbeat-mutex*   (make-mutex))
(define *api-process-request-count* 0)
(define *max-api-process-requests* 0)
(define *server-overloaded*  #f)
(define *writes-total-delay*  0)
(define *unclean-shutdown*  #t) ;; flag to clear on clean shutdown

;; client
(define *rmt-mutex*         (make-mutex))     ;; remote access calls mutex 




;; KEY info
;; (define *target*            (make-hash-table)) ;; cache the target here; target is keyval1/keyval2/.../keyvalN
(define *keys*              (make-hash-table)) ;; cache the keys here
(define *keyvals*           (make-hash-table))
(define *toptest-paths*     (make-hash-table)) ;; cache toptest path settings here
(define *test-paths*        (make-hash-table)) ;; cache test-id to test run paths here
(define *test-ids*          (make-hash-table)) ;; cache run-id, testname, and item-path => test-id

Modified rmtmod.scm from [be3b150966] to [dca8b0d5fd].

1457
1458
1459
1460
1461
1462
1463






































1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
(define (common:version-changed?)
  (not (equal? (common:get-last-run-version)
               (common:version-signature))))

(define (common:api-changed?)
  (not (equal? (substring (->string megatest-version) 0 4)
               (substring (conc (common:get-last-run-version)) 0 4))))







































(define (std-exit-procedure)
  ;;(common:telemetry-log-close)
  (on-exit (lambda () 0))
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
  (let ((no-hurry  (if (bdat-time-to-exit *bdat*) ;; hurry up
		       #f
		       (begin
			 (bdat-time-to-exit-set! *bdat* #t)
			 #t))))
    (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
    (if (and no-hurry (debug:debug-mode 18))
	(rmt:print-db-stats))
    (let ((th1 (make-thread
		(lambda () ;; thread for cleaning up, give it five seconds
		  (let* ((start-time (current-seconds)))
		    (if *server-info*
			(let ((dbfile   (servdat-dbfile *server-info*)))
			  (debug:print-info 0 *default-log-port* "dbfile is "dbfile)
			  (if dbfile
			      (let* ((am-server  (args:get-arg "-server"))
				     (dbfile     (args:get-arg "-db"))
				     (apath      *toppath*))
				;; do a final sync here
				(debug:print-info 0 *default-log-port* "Doing final sync for "apath" "dbfile" at "(current-seconds))
				(db:sync-inmem->disk *dbstruct-db* apath dbfile force-sync: #t)
				(if am-server
				    (if (string-match ".*/main.db$" dbfile)
					(let ((pkt-file (conc (get-pkts-dir *toppath*)
							      "/" (servdat-uuid *server-info*)
							      ".pkt")))
					  (debug:print-info 0 *default-log-port* "removing pkt "pkt-file)
					  (delete-file* pkt-file)
					  (debug:print-info 0 *default-log-port* "Releasing lock for "dbfile)
					  (db:with-lock-db (servdat-dbfile *server-info*)
							   (lambda (dbh dbfile)
							     (db:release-lock dbh dbfile))))
					(let* ((sdat *server-info*) ;; we have a run-id server
					       (host (servdat-host sdat))
					       (port (servdat-port sdat))
					       (uuid (servdat-uuid sdat)))
					  (debug:print-info 0 *default-log-port* "deregistering server "host":"port" with uuid "uuid)
					  )))))))
		    (debug:print-info 0 *default-log-port* "Shutdown activities completed in "(- (current-seconds) start-time)" seconds"))
		  ;; (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated
		  #;(if (bdat-task-db *bdat*)    ;; TODO: Check that this is correct for task db
		  (let ((db (cdr (bdat-task-db *bdat*))))
		  (if (sqlite3:database? db)
		  (begin
		  (debug:print-info 0 *default-log-port* "Closing down task db "db)







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
















|
<
<
|
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518


1519






1520
















1521
1522
1523
1524
1525
1526
1527
(define (common:version-changed?)
  (not (equal? (common:get-last-run-version)
               (common:version-signature))))

(define (common:api-changed?)
  (not (equal? (substring (->string megatest-version) 0 4)
               (substring (conc (common:get-last-run-version)) 0 4))))

(define (rmt:server-shutdown)
  (let ((dbfile   (servdat-dbfile *server-info*)))
    (debug:print-info 0 *default-log-port* "dbfile is "dbfile)
    (if dbfile
	(let* ((am-server  (args:get-arg "-server"))
	       (dbfile     (args:get-arg "-db"))
	       (apath      *toppath*))
	  ;; do a final sync here
	  (debug:print-info 0 *default-log-port* "Doing final sync for "apath" "dbfile" at "(current-seconds))
	  (db:sync-inmem->disk *dbstruct-db* apath dbfile force-sync: #t)
	  (if am-server
	      (if (string-match ".*/main.db$" dbfile)
		  (let ((pkt-file (conc (get-pkts-dir *toppath*)
					"/" (servdat-uuid *server-info*)
					".pkt")))
		    (debug:print-info 0 *default-log-port* "removing pkt "pkt-file)
		    (delete-file* pkt-file)
		    (debug:print-info 0 *default-log-port* "Releasing lock for "dbfile)
		    (db:with-lock-db (servdat-dbfile *server-info*)
				     (lambda (dbh dbfile)
				       (db:release-lock dbh dbfile))))
		  (let* ((sdat *server-info*) ;; we have a run-id server
			 (host (servdat-host sdat))
			 (port (servdat-port sdat))
			 (uuid (servdat-uuid sdat)))
		    (if (not (string-match ".db/main.db" (args:get-arg "-db")))
			(let* ((res (rmt:deregister-server *rmt:remote* ;; TODO/BUG: why is this requiring *rmt:remote*?
							   *toppath*
							   (servdat-host *server-info*)   ;; iface
							   (servdat-port *server-info*)
							   (servdat-uuid *server-info*)
							   (current-process-id)
							   )))
			  (debug:print-info 0 *default-log-port* "deregistered-server, res="res)))
		    
		    (debug:print-info 0 *default-log-port* "deregistering server "host":"port" with uuid "uuid)
		    )))))))

(define (std-exit-procedure)
  ;;(common:telemetry-log-close)
  (on-exit (lambda () 0))
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
  (let ((no-hurry  (if (bdat-time-to-exit *bdat*) ;; hurry up
		       #f
		       (begin
			 (bdat-time-to-exit-set! *bdat* #t)
			 #t))))
    (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
    (if (and no-hurry (debug:debug-mode 18))
	(rmt:print-db-stats))
    (let ((th1 (make-thread
		(lambda () ;; thread for cleaning up, give it five seconds
		  (let* ((start-time (current-seconds)))
		    (if (and *server-info*


			     *unclean-shutdown*)






			(rmt:server-shutdown))
















		    (debug:print-info 0 *default-log-port* "Shutdown activities completed in "(- (current-seconds) start-time)" seconds"))
		  ;; (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated
		  #;(if (bdat-task-db *bdat*)    ;; TODO: Check that this is correct for task db
		  (let ((db (cdr (bdat-task-db *bdat*))))
		  (if (sqlite3:database? db)
		  (begin
		  (debug:print-info 0 *default-log-port* "Closing down task db "db)
1589
1590
1591
1592
1593
1594
1595


1596
1597


1598
1599
1600

1601
1602
1603
1604
1605
1606
1607
	  (servdat-port-set! *server-info* port)
	  (servdat-status-set! *server-info* 'trying-port)
	  (servdat-trynum-set! *server-info* (+ (servdat-trynum *server-info*) 1)))
	(set! *server-info* (make-servdat host: ipaddrstr port: port)))
    (let* ((rep (rmt:try-start-server ipaddrstr port)))
      (let loop ((instr (nn-recv rep)))
	(let* ((data   (string->sexpr instr))


	       (res    (api:process-request *dbstruct-db* data))
	       (resdat (sexpr->string res)))


	  (set! *db-last-access* (current-seconds))
	  (nn-send rep resdat)
	  (loop (nn-recv rep)))))

    (let* ((portnum (servdat-port *server-info*)))
      (portlogger:open-run-close portlogger:set-port portnum "released")
      (debug:print 1 *default-log-port* "INFO: server has been stopped"))))

(define (rmt:try-start-server ipaddrstr portnum)
  (if *server-info* ;; update the server info as we might be trying next port
      (begin







>
>
|

>
>
|
|
|
>







1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
	  (servdat-port-set! *server-info* port)
	  (servdat-status-set! *server-info* 'trying-port)
	  (servdat-trynum-set! *server-info* (+ (servdat-trynum *server-info*) 1)))
	(set! *server-info* (make-servdat host: ipaddrstr port: port)))
    (let* ((rep (rmt:try-start-server ipaddrstr port)))
      (let loop ((instr (nn-recv rep)))
	(let* ((data   (string->sexpr instr))
	       (res    (case data
			 ((quit) 'quit)
			 (else    (api:process-request *dbstruct-db* data))))
	       (resdat (sexpr->string res)))
	  (if (not (eq? res 'quit))
	      (begin
		(set! *db-last-access* (current-seconds))
		(nn-send rep resdat)
		(loop (nn-recv rep)))))))
    ;; server exit stuff here
    (let* ((portnum (servdat-port *server-info*)))
      (portlogger:open-run-close portlogger:set-port portnum "released")
      (debug:print 1 *default-log-port* "INFO: server has been stopped"))))

(define (rmt:try-start-server ipaddrstr portnum)
  (if *server-info* ;; update the server info as we might be trying next port
      (begin
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164

2165
2166
2167
2168
2169
2170
2171
		 (> (+ last-access server-timeout)
		    (current-seconds)))
	    (if (common:low-noise-print 120 "server continuing")
		(debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)))
	    (loop 0 bad-sync-count (current-milliseconds)))
	   (else
	    (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
	    (if (not (string-match ".db/main.db" (args:get-arg "-db")))
		(let* ((res (rmt:deregister-server *rmt:remote* ;; TODO/BUG: why is this requiring *rmt:remote*?
						   *toppath*
						   (servdat-host *server-info*)   ;; iface
						   (servdat-port *server-info*)
						   (servdat-uuid *server-info*)
						   (current-process-id)
						   )))
		  (debug:print-info 0 *default-log-port* "deregistered-server, res="res)))
	    (http-transport:server-shutdown port))))))))

(define (http-transport:server-shutdown port)
  (begin
    ;;(BB> "http-transport:server-shutdown called")
    (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id))
    ;;
    ;; start_shutdown
    ;;

    ;; deregister the server


    
    (bdat-time-to-exit-set! *bdat* #t) ;; tell on-exit to be fast as we've already cleaned up
    ;; (portlogger:open-run-close portlogger:set-port port "released") ;; done in rmt:run
    (thread-sleep! 1)

    ;; (debug:print-info 0 *default-log-port* "Max cached queries was    " *max-cache-size*)
    ;; (debug:print-info 0 *default-log-port* "Number of cached writes   " *number-of-writes*)







|
|
|
|
|
|
|
|
|











|
>







2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
		 (> (+ last-access server-timeout)
		    (current-seconds)))
	    (if (common:low-noise-print 120 "server continuing")
		(debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)))
	    (loop 0 bad-sync-count (current-milliseconds)))
	   (else
	    (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))




	    ;; send self 'quit here



	    
	    (http-transport:server-shutdown port))))))))

(define (http-transport:server-shutdown port)
  (begin
    ;;(BB> "http-transport:server-shutdown called")
    (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id))
    ;;
    ;; start_shutdown
    ;;

    ;; deregister the server
    (rmt:server-shutdown)
    (set! *unclean-shutdown* #f)
    
    (bdat-time-to-exit-set! *bdat* #t) ;; tell on-exit to be fast as we've already cleaned up
    ;; (portlogger:open-run-close portlogger:set-port port "released") ;; done in rmt:run
    (thread-sleep! 1)

    ;; (debug:print-info 0 *default-log-port* "Max cached queries was    " *max-cache-size*)
    ;; (debug:print-info 0 *default-log-port* "Number of cached writes   " *number-of-writes*)