Megatest

Check-in [76fac9ef30]
Login
Overview
Comment:Add back and improve waiting for journal plus few other performance helpers
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.70
Files: files | file ages | folders
SHA1: 76fac9ef301954903e31234b7af1e9bc3274e7f8
User & Date: matt on 2022-09-18 18:12:21
Other Links: branch diff | manifest | tags
Context
2022-09-18
19:31
Added brute-force-salvage-db back. It seems to work, at least sometimes. check-in: fb8df28768 user: matt tags: v1.70
18:12
Add back and improve waiting for journal plus few other performance helpers check-in: 76fac9ef30 user: matt tags: v1.70
12:50
Moved close-all-connections! to better spot. check-in: a847a95748 user: matt tags: v1.70
Changes

Modified api.scm from [736048365d] to [3289b67f67].

151
152
153
154
155
156
157
158





159
160
161
162

163
164
165
166
167
168
169
151
152
153
154
155
156
157

158
159
160
161
162
163
164
165

166
167
168
169
170
171
172
173







-
+
+
+
+
+



-
+







  (db:open-no-sync-db) ;; sets *no-sync-db*
;;   (handle-exceptions
;;    exn
;;    (let ((call-chain (get-call-chain)))
;;      (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn)
;;      (print-call-chain (current-error-port))
;;      (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
;;      (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
  ;;      (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
  (if (> *api-process-request-count* 200)
      (begin
	(debug:print 0 *default-log-port* "WARNING: Over 200 threads, overload, taking a five second nap.")
	(thread-sleep! 5))) ;; take a nap
   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f (vector #f "remote must be called with a vector")))
    ((> *api-process-request-count* 200) ;; 20)
    #;((> *api-process-request-count* 200) ;; 20)
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
     (set! *server-overloaded* #t)
     (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
    (else  
     (let* ((cmd-in            (vector-ref dat 0))
            (cmd               (if (symbol? cmd-in)
				   cmd-in

Modified db.scm from [974c310e18] to [131d871139].

655
656
657
658
659
660
661
662
663
664
665
666
667






668
669
670
671
672
673
674
675
676
677











678
679
680



681
682


683
684
685
686




687
688
689


690
691
692

693
694
695
696
697
698

699
700
701
702
703
704
705
706
655
656
657
658
659
660
661






662
663
664
665
666
667
668









669
670
671
672
673
674
675
676
677
678
679



680
681
682


683
684




685
686
687
688
689


690
691

692

693
694
695
696
697
698

699

700
701
702
703
704
705
706







-
-
-
-
-
-
+
+
+
+
+
+

-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+
-
-
+
+
-
-
-
-
+
+
+
+

-
-
+
+
-

-
+





-
+
-







    (tmp-area       (common:get-db-tmp-area))
    (dbfiles        (glob (conc tmp-area"/.megatest/*.db")))
    (sync-durations (make-hash-table))
    (no-sync-db        (db:open-no-sync-db)))
    (for-each
     (lambda (file)
       (debug:print-info 3 *default-log-port* "file: " file)
       (let* ((fname (conc (pathname-file file) ".db"))
	      (fulln (conc *toppath*"/.megatest/"fname))
	      (time1 (if (file-exists? file)
			 (file-modification-time file)
			 (begin
			   (debug:print-info 2 *default-log-port* "Sync - I do not see file "file)
       (let* ((fname       (conc (pathname-file file) ".db"))
	      (fulln       (conc *toppath*"/.megatest/"fname))
	      (time1       (if (file-exists? file)
			       (file-modification-time file)
			       (begin
				 (debug:print-info 2 *default-log-port* "Sync - I do not see file "file)
			   1)))
	      (time2 (if (file-exists? fulln)
			 (file-modification-time fulln)
			 (begin
			   (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln)
			   0)))
	      (changed (> time1 time2))
	      (do-cp (cond
		      ((not (file-exists? fulln)) ;; shouldn't happen, but this might recover
		       (debug:print-info 2 *default-log-port* "File "fulln" not found! Copying "fname" to "fulln)
	      (time2       (if (file-exists? fulln)
			       (file-modification-time fulln)
			       (begin
				 (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln)
				 0)))
	      (changed      (> (- time1 time2) (+ (random 5) 1)))  ;; it has been at some few seconds since last synced
	      (changed10    (> (- time1 time2) 10)) ;; it has been at least ten seconds since sync'd
	      (jfile-exists (file-exists? (conc file"-journal"))) ;; i.e. are we busy?
	      (do-cp        (cond
			     ((not (file-exists? fulln)) ;; shouldn't happen, but this might recover
			      (cons #t (conc "File "fulln" not found! Copying "fname" to "fulln)))
		       #t)
		      (changed ;; (and changed
		       ;; (> (- (current-seconds) time1) 3)) ;; if file is changed and three seconds have passed.
			     ((and (not jfile-exists) changed)
			      (cons #t "not busy, changed")) ;; not busy and changed
			     ((and jfile-exists changed10)
		       #t)
		      ((and changed *time-to-exit*) ;; last sync
			      (cons #t "busy but not synced in a while")) ;; busy but not sync'd in over 10 seconds
			     ((and changed *time-to-exit*)
		       #t)
		      (else
		       #f))))
	 (if do-cp
			      (cons #t "Time to exit, forced final sync")) ;; last sync
			     (else
			      (cons #f "No sync needed")))))
	 (if (car do-cp)
	     (let* ((start-time (current-milliseconds))
              (fname (pathname-file file))
              (runid (if (string= fname "main") #f (string->number fname)))
		    (fname (pathname-file file))
		    (runid (if (string= fname "main") #f (string->number fname))))
              )
	       (debug:print-info 3 *default-log-port* "db:all-db-sync: fname: "
				 fname", delta: " (- time1 time2) " seconds")
				 fname", delta: " (- time1 time2) " seconds, reason: "(cdr do-cp))
	       (db:lock-and-delta-sync no-sync-db dbstruct fname runid (db:get-keys dbstruct) db:initialize-main-db)
	       (hash-table-set! sync-durations (conc fname".db")
				(- (current-milliseconds) start-time)))
	     (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date")
         )))
     dbfiles
     dbfiles)
     )
    (if dbdat (dbfile:add-dbdat dbstruct #f dbdat)))
  #t)

;; options:
;;
;;  'killservers  - kills all servers
;;  'dejunk       - removes junk records
1861
1862
1863
1864
1865
1866
1867
1868

1869
1870
1871
1872
1873
1874
1875

1876

1877


1878
1879
1880
1881
1882
1883
1884
1861
1862
1863
1864
1865
1866
1867

1868
1869
1870
1871
1872
1873
1874

1875
1876
1877

1878
1879
1880
1881
1882
1883
1884
1885
1886







-
+






-
+

+
-
+
+







	 (contour   (or contour-in ""))  ;; empty string to force no hierarcy and be backwards compatible.
	 (comma     (if (> (length keys) 0) "," ""))
	 (andstr    (if (> (length keys) 0) " AND " ""))
	 (valslots  (keys->valslots keys)) ;; ?,?,? ...
	 (allvals   (append (list runname state status user contour) (map cadr keyvals)))
	 (qryvals   (append (list runname) (map cadr keyvals)))
	 (key=?str  (string-intersperse (map (lambda (k)(conc k "=?")) keys) " AND ")))
  (debug:print 0 *default-log-port* "Got here 0.")
    ;; (debug:print 0 *default-log-port* "Got here 0.")
    (debug:print 3 *default-log-port* "keys: " keys " allvals: " allvals " keyvals: " keyvals " key=?str is " key=?str)
    (debug:print 2 *default-log-port* "NOTE: using target " (string-intersperse (map cadr keyvals) "/") " for this run")
    (if (and runname (null? (filter (lambda (x)(not x)) keyvals))) ;; there must be a better way to "apply and"
	(db:with-db
	 dbstruct #f #f
	 (lambda (dbdat db)
  (debug:print 0 *default-log-port* "Got here 1.")
	   ;; (debug:print 0 *default-log-port* "Got here 1.")
	   (let ((res #f))
	     (apply sqlite3:execute db
	     (apply sqlite3:execute db (conc "INSERT OR IGNORE INTO runs (runname,state,status,owner,event_time,contour" comma keystr ") VALUES (?,?,?,?,strftime('%s','now'),?" comma valslots ");")
		    (conc "INSERT OR IGNORE INTO runs (runname,state,status,owner,event_time,contour"
			  comma keystr ") VALUES (?,?,?,?,strftime('%s','now'),?" comma valslots ");")
		    allvals)
	     (apply sqlite3:for-each-row 
		    (lambda (id)
		      (set! res id))
		    db
		    (let ((qry (conc "SELECT id FROM runs WHERE (runname=? " andstr key=?str ");")))
		      qry)

Modified dbfile.scm from [769f7dede8] to [3ea5a94d80].

467
468
469
470
471
472
473
474

475
476
477
478
479
480
481
482
483














484
485
486
487
488
489
490
467
468
469
470
471
472
473

474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504







-
+









+
+
+
+
+
+
+
+
+
+
+
+
+
+







	  (dbfile:print-err "INFO: db:sync-all-tables-list done.")
	  )
	(dbfile:print-err " db, " (dbr:dbdat-dbfile tmpdb) " already exists or fresh enough, not propogating data from\n     " (dbr:dbdat-dbfile mtdb) " mod time delta: " modtimedelta) )
    ;; (db:multi-db-sync subdb 'old2new))  ;; migrate data from megatest.db automatically
    tmpdb))
		

(define (dbfile:cautious-open-database fname init-proc #!optional (tries-left 50))
(define (dbfile:cautious-open-database fname init-proc #!optional (tries-left 500))

  (let* ((busy-file  (conc fname"-journal"))
	 (delay-time (* (- 51 tries-left) 1.1))
      	 (write-access (file-write-access? fname))
         (dir-access (file-write-access? (pathname-directory fname)))
         (retry      (lambda ()
		       (thread-sleep! delay-time)
		       (if (> tries-left 0)
			   (dbfile:cautious-open-database fname init-proc (- tries-left 1))))))
    (assert (>= tries-left 0) (conc "FATAL: too many attempts in dbfile:cautious-open-database of "fname", giving up."))

    (if (and (file-write-access? fname)
	     (file-exists? busy-file))
	(begin
	  (if (common:low-noise-print 120 busy-file)
	      (dbfile:print-err "INFO: dbfile:cautious-open-database: journal file "
				busy-file" exists, trying again in few seconds."))
	  (thread-sleep! 1)
	;; (if (eq? tries-left 2)
	;;     (begin
	;; 	(dbfile:print-err "INFO: forcing journal rollup "busy-file)
	;; 	(dbfile:brute-force-salvage-db fname)))
	  (dbfile:cautious-open-database fname init-proc (- tries-left 1)))

	(let* ((result (condition-case
		         (if dir-access
			   (dbfile:with-simple-file-lock
			    (conc fname ".lock")
			    (lambda ()
			      (let* ((db-exists (file-exists? fname))
515
516
517
518
519
520
521
522

523
524
525
526
527
528
529
529
530
531
532
533
534
535

536
537
538
539
540
541
542
543







-
+







			      (retry))
			 (exn (permission)(dbfile:print-err exn "ERROR: database " fname " has some permissions problem.")
			      (retry))
			 (exn ()
			      (dbfile:print-err exn "ERROR: Unknown error with database " fname " message: "
						((condition-property-accessor 'exn 'message) exn))
			      (retry)))))
	  result)))
	  result))))

(define (dbfile:brute-force-salvage-db fname)
  (let* ((backupfname (conc fname"-"(current-process-id)".bak"))
	 (cmd (conc "cp "fname" "backupfname";mv "fname" "(conc fname ".delme;")
		    "cp "backupfname" "fname)))
    (dbfile:print-err "WARNING: attempting recovery of file "fname" by running commands:\n"
		      "  "cmd)
1169
1170
1171
1172
1173
1174
1175

1176
1177
1178
1179




1180
1181
1182


1183
1184
1185
1186

1187
1188
1189
1190
1191
1192
1193
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200

1201
1202
1203
1204
1205

1206
1207
1208
1209
1210
1211
1212
1213







+




+
+
+
+


-
+
+



-
+







			#f))
	 (db        (if have-struct                ;; this stuff just allows us to call with a db handle directly
			(dbr:dbdat-dbh dbdat)
			dbstruct))
	 (fname     (if dbdat
			(dbr:dbdat-dbfile dbdat)
			"nofilenameavailable"))
	 (jfile     (conc fname"-journal"))
	 #;(subdb     (if have-struct
			(dbfile:get-subdb dbstruct run-id)
			#f))
	 (use-mutex (> *api-process-request-count* 25))) ;; was 25
    (if (file-exists? jfile)
	(begin
	  (dbfile:print-err "INFO: "jfile" exists, delaying few seconds to reduce database load")
	  (thread-sleep! 2)))
    (if (and use-mutex
	     (common:low-noise-print 120 "over-50-parallel-api-requests"))
	(dbfile:print-err *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access"))
	(dbfile:print-err *api-process-request-count* " parallel api requests being processed in process "
			  (current-process-id) ", throttling access"))
    (condition-case
	(begin
	  (if use-mutex (mutex-lock! *db-with-db-mutex*))
	  (let ((res (apply proc dbdat db params)))
	  (let ((res (apply proc dbdat db params))) ;; the actual call is here.
	    (if use-mutex (mutex-unlock! *db-with-db-mutex*))
	    ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
	    (if dbdat
		(dbfile:add-dbdat dbstruct run-id dbdat))
	    res))
      (exn (io-error)
	   (db:generic-error-printout exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again."))

Added utils/open-files.sh version [61001a23e2].




1
2
3
+
+
+
echo "Database opens:  $(lsof -c mtest|egrep '.*db$'|wc -l)"
echo "Logfile opens:   $(lsof -c mtest|egrep '.*log$'|wc -l)"
echo "TCP connections: $(lsof -c mtest|grep TCP|wc -l)"