Megatest

Check-in [7fb44b797e]
Login
Overview
Comment:reworked the sync locking, allow parallel servers (needs work to lock in to single machine)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80
Files: files | file ages | folders
SHA1: 7fb44b797e70c15c101f361a2d7c61f4d9852cef
User & Date: matt on 2023-05-11 09:08:14
Other Links: branch diff | manifest | tags
Context
2023-05-11
09:14
Merged fork check-in: b21843154b user: mrwellan tags: v1.80
09:08
reworked the sync locking, allow parallel servers (needs work to lock in to single machine) check-in: 7fb44b797e user: matt tags: v1.80
05:43
Use a key for the db lock-down that is unique to the db in .mtdb, this should elminate duplicate, overlapping servers. check-in: 138a40d18e user: matt tags: v1.80
Changes

Modified common.scm from [10e4ec655c] to [6b6394cde8].

55
56
57
58
59
60
61

62
63
64
65
66
67
68
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69







+








(define (stop-the-train)
  (thread-start! (make-thread (lambda ()
				(let loop ()
				  (if (and *toppath*
					   (file-exists? (conc *toppath*"/stop-the-train")))
				      (let* ((msg (conc "ERROR: found file "*toppath*"/stop-the-train, exiting immediately")))
					;; yes, print to current-output-port AND *default-log-port*, annoying but necessary I think
					(print msg)
					(debug:print 0 *default-log-port* msg)
					(exit 1)))
				  (thread-sleep! 5)
				  (loop))))))

;; execute thunk, return value.  If exception thrown, trap exception, return #f, and emit nonfatal condition note to *default-log-port* .

Modified dbfile.scm from [82b1ce2a6f] to [deb13d8b08].

80
81
82
83
84
85
86

87
88
89
90
91
92
93
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94







+







  ;;
  ;; for the inmem approach (see dbmod.scm)
  ;; this is one db per server
  (inmem     #f)  ;; handle for the in memory copy
  (dbfile    #f)  ;; path to the db file on disk
  (dbfname   #f)  ;; short name of db file on disk (used to validate accessing correct db)
  (ondiskdb  #f)  ;; handle for the on-disk file
  (dbtmpname #f)  ;; path to db file in /tmp (non-imem method)
  (dbdat     #f)  ;; create a dbdat for the downstream calls such as db:with-db
  (last-update 0)
  (sync-proc #f)
  )

;; NOTE: Need one dbr:subdb per main.db, 1.db ...
;;
514
515
516
517
518
519
520


521
522
523
524
525
526
527
528
529
530
531
532


533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548





549
550
551
552

553
554
555
556
557
558
559
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533


534
535
536
537
538
539
540
541
542
543
544
545
546
547




548
549
550
551
552
553
554
555

556
557
558
559
560
561
562
563







+
+










-
-
+
+












-
-
-
-
+
+
+
+
+



-
+







			  (string->number res)
			  #f)))
          (if newres
              newres
              res))
        res)))

;; timestring+identifier+payload
;; locks are unique on identifier, payload is informational
(define (db:extract-time-identifier instr)
  (let ((tokens (string-split instr "+")))
    (match tokens
      ((t i)(cons (string->number t) i))
      ((t)  (cons (string->number t) #f))
      (else
       (assert #f "FATAL: db:extract-time-identifier handed bad data "instr)))))

;; transaction protected lock aquisition
;; either:
;;    fails    returns  (#f . lock-creation-time)
;;    succeeds (returns (#t . lock-creation-time)
;;    fails    returns  (#f lock-creation-time identifier)
;;    succeeds (returns (#t lock-creation-time identifier)
;; use (db:no-sync-del! db keyname) to release the lock
;;
(define (db:no-sync-get-lock-with-id db keyname identifier)
  (sqlite3:with-transaction
   db
   (lambda ()
     (condition-case
      (let* ((curr-val (db:no-sync-get/default db keyname #f)))
	(if curr-val
	    (match (db:extract-time-identifier curr-val) ;; result->timestamp, identifier
	       ((timestamp ident)
		(if (equal? ident identifier)
		    #t     ;; this *is* my lock
		    #f))   ;; nope, not my lock
	       (else #f))  ;; nope, not my lock
	    (let ((lock-value (if identifier
		    (cons #t timestamp)    ;; this *is* my lock
		    (cons #f timestamp)))  ;; nope, not my lock
	       (else (cons #f #f)))  ;; nope, not my lock
	    (let ((curr-sec (current-seconds))
		  (lock-value (if identifier
				  (conc (current-seconds)"+"identifier)
				  (current-seconds))))
	      (sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-value)
	      #t)))
	      (cons #t curr-sec))))
      (exn (io-error)  (dbfile:print-err "ERROR: i/o error with no-sync db. Check permissions, disk space etc. and try again."))
      (exn (corrupt)   (dbfile:print-err "ERROR: database no-sync db is corrupt. Repair it to proceed."))
      (exn (busy)      (dbfile:print-err "ERROR: database no-sync db is locked. Try copying to another location, remove original and copy back."))
      (exn (permission)(dbfile:print-err "ERROR: database no-sync db has some permissions problem."))
      (exn () ;; (status done) ;; I don't know how to detect status done but no data!
	   (dbfile:print-err "ERROR: Unknown error with database no-sync db message: exn="(condition->list exn)", \n"
			     ((condition-property-accessor 'exn 'message) exn))

Modified dbmod.scm from [fa16c38514] to [a52cf6d3b4].

95
96
97
98
99
100
101
102

103
104
105
106
107
108
109
95
96
97
98
99
100
101

102
103
104
105
106
107
108
109







-
+







	 (dbdat     (dbmod:open-db dbstruct run-id (dbfile:db-init-proc)))
	 (dbh       (dbr:dbdat-dbh dbdat)) ;; this will be the inmem handle
	 (dbfile    (dbr:dbdat-dbfile dbdat)))
    ;; if nfs mode do a sync if delta > 2
    (let* ((last-update (dbr:dbstruct-last-update dbstruct))
	   (sync-proc   (dbr:dbstruct-sync-proc dbstruct))
	   (curr-secs   (current-seconds)))
      (if (> (- curr-secs last-update) 3)
      (if (> (- curr-secs last-update) 5)
	  (begin
	    (sync-proc last-update)

	    ;; MOVE THIS CALL TO INSIDE THE sync-proc CALL
	    (dbr:dbstruct-last-update-set! dbstruct curr-secs)
	    )))
    (assert (sqlite3:database? dbh) "FATAL: bad db handle in dbmod:with-db") 
176
177
178
179
180
181
182
183


184
185
186
187
188
189
190
176
177
178
179
180
181
182

183
184
185
186
187
188
189
190
191







-
+
+







			    (syncdir 'todisk))
  (let* ((dbstruct     (or dbstruct-in (make-dbr:dbstruct areapath: areapath)))
	 (dbfname      (or dbfname-in (dbmod:run-id->dbfname run-id)))
	 (dbpath       (dbmod:get-dbdir dbstruct))             ;; directory where all the .db files are kept
	 (dbfullname   (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id))
	 (dbexists     (file-exists? dbfullname))
	 (tmpdir       (conc "/tmp/"(current-user-name)))
	 (tmpdb        (let* ((fname (conc tmpdir"/" (string-translate areapath "/" ".")"-"(current-process-id)"-"dbfname)))
	 (tmpdb        (let* ((fname (conc tmpdir"/" (string-translate areapath "/" ".") ;; "-"(current-process-id)
					   "-"dbfname)))
			 (if (not (file-exists? tmpdir))(create-directory tmpdir))
			 ;; check if tmpdb already exists, either delete it or
			 ;; add something to the name
			 fname))
	 (inmem        (dbmod:open-inmem-db init-proc
					    ;; (if (eq? (dbfile:cache-method) 'inmem)
					    ;; 	#f
199
200
201
202
203
204
205

206
207
208
209
210
211
212


213
214
215
216
217
218
219
220
221
222
223





224
225
226
227
228
229
230
200
201
202
203
204
205
206
207
208
209
210
211
212


213
214
215
216
217
218
219
220
221




222
223
224
225
226
227
228
229
230
231
232
233







+





-
-
+
+







-
-
-
-
+
+
+
+
+







	(begin
	  (debug:print 0 *default-log-port* "ERROR: Failed to properly open "dbfname-in", exiting immediately.")
	  (exit)))    ;; (assert (sqlite3:database? inmem) "FATAL: open-dbmoddb: inmem is not a db")
    ;; (assert (sqlite3:database? db) "FATAL:  open-dbmoddb: db is not a db")
    (dbr:dbstruct-inmem-set!     dbstruct inmem)
    (dbr:dbstruct-ondiskdb-set!  dbstruct db)
    (dbr:dbstruct-dbfile-set!    dbstruct dbfullname)
    (dbr:dbstruct-dbtmpname-set! dbstruct tmpdb)
    (dbr:dbstruct-dbfname-set!   dbstruct dbfname)
    (dbr:dbstruct-sync-proc-set! dbstruct
				 (lambda (last-update)
				   (if *sync-in-progress*
				       (debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk")
				       (let* ((sync-cmd (conc "megatest -db2db -from "tmpdb" -to "dbfullname)))
					 (mutex-lock! *db-with-db-mutex*) ;; this mutex is used when overloaded or during a query that modifies the db
				       (let* ((sync-cmd (conc "megatest -db2db -from "tmpdb" -to "dbfullname"&")))
					 ;; (mutex-lock! *db-with-db-mutex*) ;; this mutex is used when overloaded or during a query that modifies the db
					 (set! *sync-in-progress* #t)
					 ;; (if (eq? (dbfile:cache-method) 'inmem)
					 ;;     (dbmod:sync-gasket tables last-update inmem db
					 ;; 			dbfullname syncdir keys)
					 (thread-start! (make-thread
							 (lambda ()
							   (debug:print-info "Running "sync-cmd)
							   (system sync-cmd))))
					 (mutex-unlock! *db-with-db-mutex*)
					 (thread-sleep! 0.5) ;; ensure at least 1/2 second down time between sync calls
					 (set! *sync-in-progress* #f)))))
							   (system sync-cmd)
							   ;; (mutex-unlock! *db-with-db-mutex*)
							   ;; (thread-sleep! 0.5) ;; ensure at least 1/2 second down time between sync calls
							   (set! *sync-in-progress* #f))))
					 ))))
    ;; (dbmod:sync-tables tables #f db inmem)
    ;; (if db
    (dbmod:sync-gasket tables #f inmem db dbfullname 'fromdest keys) ;; ) ;; load into inmem
    (dbr:dbstruct-last-update-set! dbstruct (current-seconds)) ;; should this be offset back in time by one second?
    dbstruct))

;;    (if (eq? syncdir 'todisk) ;; sync to disk normally, sync from in dashboard

Modified megatest.scm from [e2f14e189c] to [612430ed4b].

2571
2572
2573
2574
2575
2576
2577
2578

2579
2580
2581
2582
2583


2584
2585
2586
2587
2588
2589
2590
2591
2592
2593














2594
2595
2596
2597
2598
2599
2600
2571
2572
2573
2574
2575
2576
2577

2578
2579
2580
2581
2582
2583
2584
2585










2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606







-
+





+
+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+







;;
(if (args:get-arg "-db2db")
    (let* ((duh         (launch:setup))
	   (src-db      (args:get-arg "-from"))
	   (dest-db     (args:get-arg "-to"))
	   (sync-period (args:get-arg "-period"))    ;; NOT IMPLEMENTED YET
	   (sync-timeout (args:get-arg "-timeout"))  ;; NOT IMPLEMENTED YET
	   (lockfile    (conc dest-db".lock"))
	   (lockfile    (conc dest-db".sync-lock"))
	   (keys        (db:get-keys #f))
	   )
      
      (if (and src-db dest-db)
	  (if (file-exists? src-db)
	      (if (file-exists? lockfile)
		  (debug:print 0 *default-log-port* "Lock "lockfile" exists, skipping sync...")
	      (begin
		(debug:print-info 0 *default-log-port* "Attempting to sync data from "src-db" to "dest-db"...")
		(if (not (file-exists? dest-db))
		    (begin
		      (debug:print 0 *default-log-port* "Using copy to create "dest-db" from "src-db)
		      (file-copy src-db dest-db))
		    (let ((res (dbmod:db-to-db-sync src-db dest-db 0 (dbfile:db-init-proc) keys)))
		      (if res
			  (debug:print-info 0 *default-log-port* "Synced " res " records from "src-db" to "dest-db)
			  (debug:print-info 0 *default-log-port* "No sync due to permissions or other issue.")))))
		  (begin
		    (with-output-to-file lockfile
		      (lambda ()
			(print (current-process-id))))
		    (debug:print-info 0 *default-log-port* "Attempting to sync data from "src-db" to "dest-db"...")
		    (if (not (file-exists? dest-db))
			(begin
			  (debug:print 0 *default-log-port* "Using copy to create "dest-db" from "src-db)
			  (file-copy src-db dest-db))
			(let ((res (dbmod:db-to-db-sync src-db dest-db 0 (dbfile:db-init-proc) keys)))
			  (if res
			      (debug:print-info 0 *default-log-port* "Synced " res " records from "src-db" to "dest-db)
			      (debug:print-info 0 *default-log-port* "No sync due to permissions or other issue."))))
		    (delete-file* lockfile)))
	      (debug:print 0 *default-log-port* "No sync due to unreadble or non-existant source file"src-db))
	  (debug:print 0 *default-log-port* "Usage for -db2db; -to and -from must be specified"))
      (set! *didsomething* #t)))

(if (args:get-arg "-list-test-time")
     (let* ((toppath (launch:setup))) 
     (task:get-test-times)  

Modified tcp-transportmod.scm from [3d0d8b0130] to [f16326ea6c].

487
488
489
490
491
492
493


494



495
496
497
498
499
500
501
487
488
489
490
491
492
493
494
495

496
497
498
499
500
501
502
503
504
505







+
+
-
+
+
+







	 (areapath     (tt-areapath ttdat))
	 (nosyncdbpath (conc areapath"/.mtdb"))
	 (cleanup (lambda ()
		    (if (tt-cleanup-proc ttdat)
			((tt-cleanup-proc ttdat)))
		    (dbfile:with-no-sync-db nosyncdbpath
					    (lambda (db)
					      (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct)))
						(debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname)
					      (db:no-sync-del! db dbfname))))))
						(db:no-sync-del! db dbfname)
						#;(if dbtmpname
						    (delete-file dbtmpname))))))))
    (set! *server-info* ttdat)
    (let loop ((count 0))
      (if (> count 240)
	  (begin
	    (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
	    (exit 1))
	  (if (not (tt-port ttdat)) ;; no connection yet
511
512
513
514
515
516
517

518
519
520
521
522





523
524



525
526
527
528
529
530
531
532
533
534
535
536

537
538
539
540
541
542
543
515
516
517
518
519
520
521
522





523
524
525
526
527


528
529
530
531
532
533
534
535
536
537
538
539
540


541
542
543
544
545
546
547
548







+
-
-
-
-
-
+
+
+
+
+
-
-
+
+
+










-
-
+







      (let* ((servers (tt:get-server-info-sorted ttdat dbfname))
	     (ok      (cond
		       ((null? servers) #f) ;; not ok
		       ((equal? (list-ref (car servers) 6) ;; compare the servinfofile
				(tt-servinf-file ttdat))
			(let* ((res (if db-locked-in
					#t
					(let* ((lock-result
					(let* ((success (dbfile:with-no-sync-db
							 nosyncdbpath
							 (lambda (db)
							   (db:no-sync-get-lock-with-id db dbfname
											;; (tt-servinf-file ttdat) ;; does NOT work, must be unique to the dbname which seems silly but makes sense!
						(dbfile:with-no-sync-db
						 nosyncdbpath
						 (lambda (db)
						   (db:no-sync-get-lock-with-id db dbfname
										;; (tt-servinf-file ttdat)
											areapath ;; as good as anything
											)))))
										(dbr:dbstruct-dbtmpname dbstruct)
										))))
					       (success (car lock-result)))
					  (if success
					      (begin
						(tt-state-set! ttdat 'running)
						(debug:print 0 *default-log-port* "Got server lock for "
							     dbfname)
						(set! db-locked-in #t)
						#t)
					      (begin
						(debug:print 0 *default-log-port* "Failed to get server lock for "dbfname)
						#f))))))
			  (if (and res
				   (common:low-noise-print 120 "top server message"))
			  (if (and res (common:low-noise-print 120 "top server message"))
			      (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for "
						dbfname" on "(tt-host ttdat)":"(tt-port ttdat)))
			  res))
		       (else
			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
			(let* ((leadsrv (car servers)))
			  (match leadsrv
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
567
568
569
570
571
572
573


574
575
576
577
578
579
580







-
-







				       (debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", will try again.")
				       (thread-sleep! 1) ;; just because
				       #t)))))
			    (else ;; should never get here
			     (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv)
			     (assert #f "Bad server record "leadsrv))))))))
	(if ok
	    ;; (if (> *api-process-request-count* 0) ;; have requests in flight
	    ;;	(tt-last-access-set! ttdat (current-seconds)))
	    (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access
	    (begin
	      (debug:print 0 *default-log-port* "Exiting immediately")
	      (cleanup)
	      (exit)))

	(let* ((last-update (dbr:dbstruct-last-update dbstruct))
645
646
647
648
649
650
651
652


653
654
655
656
657
658
659
648
649
650
651
652
653
654

655
656
657
658
659
660
661
662
663







-
+
+







  (let* ((areapath (tt-areapath ttdat))
	 (servdir  (tt:get-servinfo-dir areapath))
	 (host     (tt-host ttdat))
	 (port     (tt-port ttdat))
	 (servinf (conc servdir"/"host":"port"-"(current-process-id)":"dbfname))
	 (serv-id (tt:mk-signature areapath))
	 (clean-proc (lambda ()
		       (delete-file* servinf))))
		       (delete-file* servinf)
		       )))
    (assert (and host port) "FATAL: tt:create-server-registration-file called with no conn, dbfname="dbfname)
    (tt-cleanup-proc-set! ttdat clean-proc)
    (tt-servinf-file-set! ttdat servinf)
    (with-output-to-file servinf
      (lambda ()
	(print "SERVER STARTED: "host":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)" dbfname: "dbfname)))
      serv-id))