Megatest

Check-in [2725343c92]
Login
Overview
Comment:Proper calling of exit cleanup for servers. Go back to nbfake for running servers (proper logs kept). Remove .servinfo file for a server that does not respond to ping (returns #f).
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80-revolution
Files: files | file ages | folders
SHA1: 2725343c921c38fcc631f342252ca32b8c61557f
User & Date: matt on 2023-12-03 02:16:07
Other Links: branch diff | manifest | tags
Context
2023-12-03
17:49
Added todo items check-in: d4f2f2c1ef user: matt tags: v1.80-revolution, v1.8023
02:16
Proper calling of exit cleanup for servers. Go back to nbfake for running servers (proper logs kept). Remove .servinfo file for a server that does not respond to ping (returns #f). check-in: 2725343c92 user: matt tags: v1.80-revolution
2023-11-28
13:45
Bypass all the mutexes in dashboard. It seems to help with performance quite a bit. check-in: f4844a3801 user: mrwellan tags: v1.80-revolution, v1.8022
Changes

Modified TODO from [d1855eb2e9] to [4fe902f98b].

14
15
16
17
18
19
20







21
22
23
24
25
26
27
# 
#     You should have received a copy of the GNU General Public License
#     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

TODO
====








23WW47
. Finding server
.. look at .servinfo for likely prime main
.. ask the .servinfo prime main for real prime main
.. save prime main (for how long, 10 seconds or 10 minutes?)

. Starting prime main







>
>
>
>
>
>
>







14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# 
#     You should have received a copy of the GNU General Public License
#     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

TODO
====

23WW48
. Allow two or three servers to run for any given db
. Update avg call count/sec every 30 sec in no-sync
. get server uses no-sync process info to decide which server to suggest
. Use process table to decide who will do sync back
. Fix metadat being synced over and over

23WW47
. Finding server
.. look at .servinfo for likely prime main
.. ask the .servinfo prime main for real prime main
.. save prime main (for how long, 10 seconds or 10 minutes?)

. Starting prime main

Modified server.scm from [39953c681c] to [8ac9dab770].

726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
;;     (set! *db-last-access* (current-seconds)) ;; might not be needed.
;;     (if (equal? *toppath* toppath)
;; 	#t
;; 	#f)))

;; timeout is hms string: 1h 5m 3s, default is 1 minute
;; This is currently broken. Just use the number of hours with no unit.
;; Default is 60 seconds.
;;
(define (server:expiration-timeout)
  (let* ((tmo (configf:lookup *configdat* "server" "timeout")))
    (if (string? tmo)
	(let* ((num (string->number tmo)))
	  (if num
	      (* 3600 num)







|







726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
;;     (set! *db-last-access* (current-seconds)) ;; might not be needed.
;;     (if (equal? *toppath* toppath)
;; 	#t
;; 	#f)))

;; timeout is hms string: 1h 5m 3s, default is 1 minute
;; This is currently broken. Just use the number of hours with no unit.
;; Default is 600 seconds.
;;
(define (server:expiration-timeout)
  (let* ((tmo (configf:lookup *configdat* "server" "timeout")))
    (if (string? tmo)
	(let* ((num (string->number tmo)))
	  (if num
	      (* 3600 num)

Modified tcp-transportmod.scm from [fb9929d164] to [5cef09f36d].

183
184
185
186
187
188
189









190
191
192
193
194
195
196
197
198
199
200
201
202
203
		    conn)
		   ((starting)
		    (thread-sleep! 0.5)
                    (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect")
		    (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))
		   (else
		    (let* ((curr-secs (current-seconds)))









		      ;; rm the (last server) would go here
		      (if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
			  (begin
			    (tt-last-serv-start-set! ttdat curr-secs)
			    (server-start-proc))) ;; start server if 10 sec since last attempt
		      (thread-sleep! 1)
                      (debug:print-info 0 *default-log-port* "server ping result was neither running nor starting. Retrying connect")
		      (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))
	    (else ;; no good server found, if haven't started server in > 5 secs, start another
	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers
		 (begin
		   (debug:print-info 0 *default-log-port* "Starting server for "dbfname)
		   (server-start-proc)
		   (tt-last-serv-start-set! ttdat (current-seconds))







>
>
>
>
>
>
>
>
>






|







183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
		    conn)
		   ((starting)
		    (thread-sleep! 0.5)
                    (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect")
		    (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))
		   (else
		    (let* ((curr-secs (current-seconds)))
		      (if (not ping-res) ;; the server is actually dead, remove the .servinfo file
			  (begin
			    (debug:print-info 0 *default-log-port* "Unreachable server at "
					      host":"port" with servinfo file "servinffile", removing it")
			    (if (file-exists? servinffile)
				(handle-exceptions
				 exn
				 #f
				 (delete-file servinffile)))))
		      ;; rm the (last server) would go here
		      (if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
			  (begin
			    (tt-last-serv-start-set! ttdat curr-secs)
			    (server-start-proc))) ;; start server if 10 sec since last attempt
		      (thread-sleep! 1)
                      (debug:print-info 0 *default-log-port* "server ping result was "ping-res" neither running nor starting. Retrying connect")
		      (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))
	    (else ;; no good server found, if haven't started server in > 5 secs, start another
	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers
		 (begin
		   (debug:print-info 0 *default-log-port* "Starting server for "dbfname)
		   (server-start-proc)
		   (tt-last-serv-start-set! ttdat (current-seconds))
564
565
566
567
568
569
570


571
572
573
574
575
576
577
	   nosyncdbpath
	   (lambda (nsdb)
	     (dbfile:insert-or-update-process nsdb *procinf*)))
	  (thread-start! run-thread)

	  (thread-join! run-thread) ;; run thread will exit on timeout or other conditions
	  


	  ;; replace with call to (dbfile:set-process-done nsdb host pid reason)
	  (procinf-status-set! *procinf* "done")
	  (procinf-end-set! *procinf* (current-seconds))
	  ;; either convert this to use set-process-done or get rid of set-process-done
	  (dbfile:with-no-sync-db
	   nosyncdbpath
	   (lambda (nsdb)







>
>







573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
	   nosyncdbpath
	   (lambda (nsdb)
	     (dbfile:insert-or-update-process nsdb *procinf*)))
	  (thread-start! run-thread)

	  (thread-join! run-thread) ;; run thread will exit on timeout or other conditions
	  
	  ;; (tcp-close (tt-socket ttdat)) ;; close up ports here

	  ;; replace with call to (dbfile:set-process-done nsdb host pid reason)
	  (procinf-status-set! *procinf* "done")
	  (procinf-end-set! *procinf* (current-seconds))
	  ;; either convert this to use set-process-done or get rid of set-process-done
	  (dbfile:with-no-sync-db
	   nosyncdbpath
	   (lambda (nsdb)
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
		((dbr:dbstruct-sync-proc dbstruct) last-update)
		(dbr:dbstruct-last-update-set! dbstruct curr-secs))))
	
	(if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param))
	    (begin
	      (thread-sleep! 5)
	      (loop)))))
    ;; (cleanup) ;; all done by tt:shutdown-server
    (debug:print 0 *default-log-port* "INFO: Server timed out, exiting from tt:keep-running.")))


(define (tt:shutdown-server ttdat)
  (let* ((host (tt-host ttdat))
	 (port (tt-port ttdat))
	 (sinf (tt-servinf-file ttdat)))
    (tt-state-set! ttdat 'shutdown)
    (portlogger:open-run-close portlogger:set-port port "released")
    (if (file-exists? sinf)
	(delete-file* sinf))
    (tcp-close (tt-socket ttdat)) ;; close up ports here
    ))

;; return servid
;; side-effects:
;;   ttdat-cleanup-proc is populated with function to remove the serverinfo file
(define (tt:create-server-registration-file ttdat dbfname)
  (let* ((areapath (tt-areapath ttdat))







|











<







638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656

657
658
659
660
661
662
663
		((dbr:dbstruct-sync-proc dbstruct) last-update)
		(dbr:dbstruct-last-update-set! dbstruct curr-secs))))
	
	(if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param))
	    (begin
	      (thread-sleep! 5)
	      (loop)))))
    (tt:shutdown-server ttdat)
    (debug:print 0 *default-log-port* "INFO: Server timed out, exiting from tt:keep-running.")))


(define (tt:shutdown-server ttdat)
  (let* ((host (tt-host ttdat))
	 (port (tt-port ttdat))
	 (sinf (tt-servinf-file ttdat)))
    (tt-state-set! ttdat 'shutdown)
    (portlogger:open-run-close portlogger:set-port port "released")
    (if (file-exists? sinf)
	(delete-file* sinf))

    ))

;; return servid
;; side-effects:
;;   ttdat-cleanup-proc is populated with function to remove the serverinfo file
(define (tt:create-server-registration-file ttdat dbfname)
  (let* ((areapath (tt-areapath ttdat))
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789





790
791
792
793
794
795
796
797
	    (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
	    (thread-sleep! 1)
	    #f)
	   (else
	    (if (not (file-exists? (conc areapath"/logs")))
		(create-directory (conc areapath"/logs") #t))
	    (let* ((logfile   (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
		   (cmdln     (conc
			       mtexe
			       " -startdir "areapath
			       " -server - ";; (or target-host "-")
			       " -m testsuite:"testsuite
			       " -db "dbfname ;; (dbmod:run-id->dbfname run-id)
			       " " profile-mode
			       (conc " >> " logfile " 2>&1 &"))))
	      ;; we want the remote server to start in *toppath* so push there
	      ;; (push-directory areapath) ;; use cd in the command line instead
	      (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
	      ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))






	      (system cmdln)
	      (hash-table-set! *last-server-start* dbfname (current-seconds))
	      ;; ;; use below to go back to nbfake - nbfake does cause trouble ...
	      ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
	      ;; (setenv "NBFAKE_LOG" logfile)
	      ;; (system (conc "cd "areapath" ; nbfake " cmdln))
	      ;; (unsetenv "NBFAKE_QUIET")
	      ;; (unsetenv "NBFAKE_LOG")







|






|





>
>
>
>
>
|







780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
	    (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
	    (thread-sleep! 1)
	    #f)
	   (else
	    (if (not (file-exists? (conc areapath"/logs")))
		(create-directory (conc areapath"/logs") #t))
	    (let* ((logfile   (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
		   (cmdln (conc
			       mtexe
			       " -startdir "areapath
			       " -server - ";; (or target-host "-")
			       " -m testsuite:"testsuite
			       " -db "dbfname ;; (dbmod:run-id->dbfname run-id)
			       " " profile-mode
			       #;(conc " >> " logfile " 2>&1 &"))))
	      ;; we want the remote server to start in *toppath* so push there
	      ;; (push-directory areapath) ;; use cd in the command line instead
	      (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
	      ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))

	      (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
	      (setenv "NBFAKE_LOG" logfile)
	      (system (conc "cd "areapath" ; nbfake " cmdln))
	      (unsetenv "NBFAKE_QUIET")
	      (unsetenv "NBFAKE_LOG")
	      ;; (system cmdln)
	      (hash-table-set! *last-server-start* dbfname (current-seconds))
	      ;; ;; use below to go back to nbfake - nbfake does cause trouble ...
	      ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
	      ;; (setenv "NBFAKE_LOG" logfile)
	      ;; (system (conc "cd "areapath" ; nbfake " cmdln))
	      ;; (unsetenv "NBFAKE_QUIET")
	      ;; (unsetenv "NBFAKE_LOG")