Megatest

Check-in [76fac9ef30]
Login
Overview
Comment:Add back and improve waiting for journal plus few other performance helpers
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.70
Files: files | file ages | folders
SHA1: 76fac9ef301954903e31234b7af1e9bc3274e7f8
User & Date: matt on 2022-09-18 18:12:21
Other Links: branch diff | manifest | tags
Context
2022-09-18
19:31
Added brute-force-salvage-db back. It seems to work, at least sometimes. check-in: fb8df28768 user: matt tags: v1.70
18:12
Add back and improve waiting for journal plus few other performance helpers check-in: 76fac9ef30 user: matt tags: v1.70
12:50
Moved close-all-connections! to better spot. check-in: a847a95748 user: matt tags: v1.70
Changes

Modified api.scm from [736048365d] to [3289b67f67].

151
152
153
154
155
156
157
158




159
160
161
162
163
164
165
166
167
168
169
  (db:open-no-sync-db) ;; sets *no-sync-db*
;;   (handle-exceptions
;;    exn
;;    (let ((call-chain (get-call-chain)))
;;      (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn)
;;      (print-call-chain (current-error-port))
;;      (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
;;      (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens




   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f (vector #f "remote must be called with a vector")))
    ((> *api-process-request-count* 200) ;; 20)
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
     (set! *server-overloaded* #t)
     (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
    (else  
     (let* ((cmd-in            (vector-ref dat 0))
            (cmd               (if (symbol? cmd-in)
				   cmd-in







|
>
>
>
>



|







151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
  (db:open-no-sync-db) ;; sets *no-sync-db*
;;   (handle-exceptions
;;    exn
;;    (let ((call-chain (get-call-chain)))
;;      (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn)
;;      (print-call-chain (current-error-port))
;;      (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
  ;;      (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
  (if (> *api-process-request-count* 200)
      (begin
	(debug:print 0 *default-log-port* "WARNING: Over 200 threads, overload, taking a five second nap.")
	(thread-sleep! 5))) ;; take a nap
   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f (vector #f "remote must be called with a vector")))
    #;((> *api-process-request-count* 200) ;; 20)
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
     (set! *server-overloaded* #t)
     (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
    (else  
     (let* ((cmd-in            (vector-ref dat 0))
            (cmd               (if (symbol? cmd-in)
				   cmd-in

Modified db.scm from [974c310e18] to [131d871139].

655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673

674

675
676
677
678
679

680
681

682
683

684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
    (tmp-area       (common:get-db-tmp-area))
    (dbfiles        (glob (conc tmp-area"/.megatest/*.db")))
    (sync-durations (make-hash-table))
    (no-sync-db        (db:open-no-sync-db)))
    (for-each
     (lambda (file)
       (debug:print-info 3 *default-log-port* "file: " file)
       (let* ((fname (conc (pathname-file file) ".db"))
	      (fulln (conc *toppath*"/.megatest/"fname))
	      (time1 (if (file-exists? file)
			 (file-modification-time file)
			 (begin
			   (debug:print-info 2 *default-log-port* "Sync - I do not see file "file)
			   1)))
	      (time2 (if (file-exists? fulln)
			 (file-modification-time fulln)
			 (begin
			   (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln)
			   0)))

	      (changed (> time1 time2))

	      (do-cp (cond
		      ((not (file-exists? fulln)) ;; shouldn't happen, but this might recover
		       (debug:print-info 2 *default-log-port* "File "fulln" not found! Copying "fname" to "fulln)
		       #t)
		      (changed ;; (and changed

		       ;; (> (- (current-seconds) time1) 3)) ;; if file is changed and three seconds have passed.
		       #t)

		      ((and changed *time-to-exit*) ;; last sync
		       #t)

		      (else
		       #f))))
	 (if do-cp
	     (let* ((start-time (current-milliseconds))
              (fname (pathname-file file))
              (runid (if (string= fname "main") #f (string->number fname)))
              )
	       (debug:print-info 3 *default-log-port* "db:all-db-sync: fname: "
				 fname", delta: " (- time1 time2) " seconds")
	       (db:lock-and-delta-sync no-sync-db dbstruct fname runid (db:get-keys dbstruct) db:initialize-main-db)
	       (hash-table-set! sync-durations (conc fname".db")
				(- (current-milliseconds) start-time)))
	     (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date")
         )))
     dbfiles
     )
    (if dbdat (dbfile:add-dbdat dbstruct #f dbdat)))
  #t)

;; options:
;;
;;  'killservers  - kills all servers
;;  'dejunk       - removes junk records







|
|
|
|
|
|

|
|
|
|
|
>
|
>
|
|
|
<
|
>
|
<
>
|
<
>
|
|
|

|
|
<

|





|
<







655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679

680
681
682

683
684

685
686
687
688
689
690
691

692
693
694
695
696
697
698
699

700
701
702
703
704
705
706
    (tmp-area       (common:get-db-tmp-area))
    (dbfiles        (glob (conc tmp-area"/.megatest/*.db")))
    (sync-durations (make-hash-table))
    (no-sync-db        (db:open-no-sync-db)))
    (for-each
     (lambda (file)
       (debug:print-info 3 *default-log-port* "file: " file)
       (let* ((fname       (conc (pathname-file file) ".db"))
	      (fulln       (conc *toppath*"/.megatest/"fname))
	      (time1       (if (file-exists? file)
			       (file-modification-time file)
			       (begin
				 (debug:print-info 2 *default-log-port* "Sync - I do not see file "file)
			   1)))
	      (time2       (if (file-exists? fulln)
			       (file-modification-time fulln)
			       (begin
				 (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln)
				 0)))
	      (changed      (> (- time1 time2) (+ (random 5) 1)))  ;; it has been at some few seconds since last synced
	      (changed10    (> (- time1 time2) 10)) ;; it has been at least ten seconds since sync'd
	      (jfile-exists (file-exists? (conc file"-journal"))) ;; i.e. are we busy?
	      (do-cp        (cond
			     ((not (file-exists? fulln)) ;; shouldn't happen, but this might recover
			      (cons #t (conc "File "fulln" not found! Copying "fname" to "fulln)))

			     ((and (not jfile-exists) changed)
			      (cons #t "not busy, changed")) ;; not busy and changed
			     ((and jfile-exists changed10)

			      (cons #t "busy but not synced in a while")) ;; busy but not sync'd in over 10 seconds
			     ((and changed *time-to-exit*)

			      (cons #t "Time to exit, forced final sync")) ;; last sync
			     (else
			      (cons #f "No sync needed")))))
	 (if (car do-cp)
	     (let* ((start-time (current-milliseconds))
		    (fname (pathname-file file))
		    (runid (if (string= fname "main") #f (string->number fname))))

	       (debug:print-info 3 *default-log-port* "db:all-db-sync: fname: "
				 fname", delta: " (- time1 time2) " seconds, reason: "(cdr do-cp))
	       (db:lock-and-delta-sync no-sync-db dbstruct fname runid (db:get-keys dbstruct) db:initialize-main-db)
	       (hash-table-set! sync-durations (conc fname".db")
				(- (current-milliseconds) start-time)))
	     (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date")
         )))
     dbfiles)

    (if dbdat (dbfile:add-dbdat dbstruct #f dbdat)))
  #t)

;; options:
;;
;;  'killservers  - kills all servers
;;  'dejunk       - removes junk records
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876

1877

1878
1879
1880
1881
1882
1883
1884
	 (contour   (or contour-in ""))  ;; empty string to force no hierarcy and be backwards compatible.
	 (comma     (if (> (length keys) 0) "," ""))
	 (andstr    (if (> (length keys) 0) " AND " ""))
	 (valslots  (keys->valslots keys)) ;; ?,?,? ...
	 (allvals   (append (list runname state status user contour) (map cadr keyvals)))
	 (qryvals   (append (list runname) (map cadr keyvals)))
	 (key=?str  (string-intersperse (map (lambda (k)(conc k "=?")) keys) " AND ")))
  (debug:print 0 *default-log-port* "Got here 0.")
    (debug:print 3 *default-log-port* "keys: " keys " allvals: " allvals " keyvals: " keyvals " key=?str is " key=?str)
    (debug:print 2 *default-log-port* "NOTE: using target " (string-intersperse (map cadr keyvals) "/") " for this run")
    (if (and runname (null? (filter (lambda (x)(not x)) keyvals))) ;; there must be a better way to "apply and"
	(db:with-db
	 dbstruct #f #f
	 (lambda (dbdat db)
  (debug:print 0 *default-log-port* "Got here 1.")
	   (let ((res #f))

	     (apply sqlite3:execute db (conc "INSERT OR IGNORE INTO runs (runname,state,status,owner,event_time,contour" comma keystr ") VALUES (?,?,?,?,strftime('%s','now'),?" comma valslots ");")

		    allvals)
	     (apply sqlite3:for-each-row 
		    (lambda (id)
		      (set! res id))
		    db
		    (let ((qry (conc "SELECT id FROM runs WHERE (runname=? " andstr key=?str ");")))
		      qry)







|






|

>
|
>







1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
	 (contour   (or contour-in ""))  ;; empty string to force no hierarcy and be backwards compatible.
	 (comma     (if (> (length keys) 0) "," ""))
	 (andstr    (if (> (length keys) 0) " AND " ""))
	 (valslots  (keys->valslots keys)) ;; ?,?,? ...
	 (allvals   (append (list runname state status user contour) (map cadr keyvals)))
	 (qryvals   (append (list runname) (map cadr keyvals)))
	 (key=?str  (string-intersperse (map (lambda (k)(conc k "=?")) keys) " AND ")))
    ;; (debug:print 0 *default-log-port* "Got here 0.")
    (debug:print 3 *default-log-port* "keys: " keys " allvals: " allvals " keyvals: " keyvals " key=?str is " key=?str)
    (debug:print 2 *default-log-port* "NOTE: using target " (string-intersperse (map cadr keyvals) "/") " for this run")
    (if (and runname (null? (filter (lambda (x)(not x)) keyvals))) ;; there must be a better way to "apply and"
	(db:with-db
	 dbstruct #f #f
	 (lambda (dbdat db)
	   ;; (debug:print 0 *default-log-port* "Got here 1.")
	   (let ((res #f))
	     (apply sqlite3:execute db
		    (conc "INSERT OR IGNORE INTO runs (runname,state,status,owner,event_time,contour"
			  comma keystr ") VALUES (?,?,?,?,strftime('%s','now'),?" comma valslots ");")
		    allvals)
	     (apply sqlite3:for-each-row 
		    (lambda (id)
		      (set! res id))
		    db
		    (let ((qry (conc "SELECT id FROM runs WHERE (runname=? " andstr key=?str ");")))
		      qry)

Modified dbfile.scm from [769f7dede8] to [3ea5a94d80].

467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483














484
485
486
487
488
489
490
	  (dbfile:print-err "INFO: db:sync-all-tables-list done.")
	  )
	(dbfile:print-err " db, " (dbr:dbdat-dbfile tmpdb) " already exists or fresh enough, not propogating data from\n     " (dbr:dbdat-dbfile mtdb) " mod time delta: " modtimedelta) )
    ;; (db:multi-db-sync subdb 'old2new))  ;; migrate data from megatest.db automatically
    tmpdb))
		

(define (dbfile:cautious-open-database fname init-proc #!optional (tries-left 50))

  (let* ((busy-file  (conc fname"-journal"))
	 (delay-time (* (- 51 tries-left) 1.1))
      	 (write-access (file-write-access? fname))
         (dir-access (file-write-access? (pathname-directory fname)))
         (retry      (lambda ()
		       (thread-sleep! delay-time)
		       (if (> tries-left 0)
			   (dbfile:cautious-open-database fname init-proc (- tries-left 1))))))















	(let* ((result (condition-case
		         (if dir-access
			   (dbfile:with-simple-file-lock
			    (conc fname ".lock")
			    (lambda ()
			      (let* ((db-exists (file-exists? fname))







|









>
>
>
>
>
>
>
>
>
>
>
>
>
>







467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
	  (dbfile:print-err "INFO: db:sync-all-tables-list done.")
	  )
	(dbfile:print-err " db, " (dbr:dbdat-dbfile tmpdb) " already exists or fresh enough, not propogating data from\n     " (dbr:dbdat-dbfile mtdb) " mod time delta: " modtimedelta) )
    ;; (db:multi-db-sync subdb 'old2new))  ;; migrate data from megatest.db automatically
    tmpdb))
		

(define (dbfile:cautious-open-database fname init-proc #!optional (tries-left 500))

  (let* ((busy-file  (conc fname"-journal"))
	 (delay-time (* (- 51 tries-left) 1.1))
      	 (write-access (file-write-access? fname))
         (dir-access (file-write-access? (pathname-directory fname)))
         (retry      (lambda ()
		       (thread-sleep! delay-time)
		       (if (> tries-left 0)
			   (dbfile:cautious-open-database fname init-proc (- tries-left 1))))))
    (assert (>= tries-left 0) (conc "FATAL: too many attempts in dbfile:cautious-open-database of "fname", giving up."))

    (if (and (file-write-access? fname)
	     (file-exists? busy-file))
	(begin
	  (if (common:low-noise-print 120 busy-file)
	      (dbfile:print-err "INFO: dbfile:cautious-open-database: journal file "
				busy-file" exists, trying again in few seconds."))
	  (thread-sleep! 1)
	;; (if (eq? tries-left 2)
	;;     (begin
	;; 	(dbfile:print-err "INFO: forcing journal rollup "busy-file)
	;; 	(dbfile:brute-force-salvage-db fname)))
	  (dbfile:cautious-open-database fname init-proc (- tries-left 1)))

	(let* ((result (condition-case
		         (if dir-access
			   (dbfile:with-simple-file-lock
			    (conc fname ".lock")
			    (lambda ()
			      (let* ((db-exists (file-exists? fname))
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
			      (retry))
			 (exn (permission)(dbfile:print-err exn "ERROR: database " fname " has some permissions problem.")
			      (retry))
			 (exn ()
			      (dbfile:print-err exn "ERROR: Unknown error with database " fname " message: "
						((condition-property-accessor 'exn 'message) exn))
			      (retry)))))
	  result)))

(define (dbfile:brute-force-salvage-db fname)
  (let* ((backupfname (conc fname"-"(current-process-id)".bak"))
	 (cmd (conc "cp "fname" "backupfname";mv "fname" "(conc fname ".delme;")
		    "cp "backupfname" "fname)))
    (dbfile:print-err "WARNING: attempting recovery of file "fname" by running commands:\n"
		      "  "cmd)







|







529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
			      (retry))
			 (exn (permission)(dbfile:print-err exn "ERROR: database " fname " has some permissions problem.")
			      (retry))
			 (exn ()
			      (dbfile:print-err exn "ERROR: Unknown error with database " fname " message: "
						((condition-property-accessor 'exn 'message) exn))
			      (retry)))))
	  result))))

(define (dbfile:brute-force-salvage-db fname)
  (let* ((backupfname (conc fname"-"(current-process-id)".bak"))
	 (cmd (conc "cp "fname" "backupfname";mv "fname" "(conc fname ".delme;")
		    "cp "backupfname" "fname)))
    (dbfile:print-err "WARNING: attempting recovery of file "fname" by running commands:\n"
		      "  "cmd)
1169
1170
1171
1172
1173
1174
1175

1176
1177
1178
1179




1180
1181
1182

1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
			#f))
	 (db        (if have-struct                ;; this stuff just allows us to call with a db handle directly
			(dbr:dbdat-dbh dbdat)
			dbstruct))
	 (fname     (if dbdat
			(dbr:dbdat-dbfile dbdat)
			"nofilenameavailable"))

	 #;(subdb     (if have-struct
			(dbfile:get-subdb dbstruct run-id)
			#f))
	 (use-mutex (> *api-process-request-count* 25))) ;; was 25




    (if (and use-mutex
	     (common:low-noise-print 120 "over-50-parallel-api-requests"))
	(dbfile:print-err *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access"))

    (condition-case
	(begin
	  (if use-mutex (mutex-lock! *db-with-db-mutex*))
	  (let ((res (apply proc dbdat db params)))
	    (if use-mutex (mutex-unlock! *db-with-db-mutex*))
	    ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
	    (if dbdat
		(dbfile:add-dbdat dbstruct run-id dbdat))
	    res))
      (exn (io-error)
	   (db:generic-error-printout exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again."))







>




>
>
>
>


|
>



|







1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
			#f))
	 (db        (if have-struct                ;; this stuff just allows us to call with a db handle directly
			(dbr:dbdat-dbh dbdat)
			dbstruct))
	 (fname     (if dbdat
			(dbr:dbdat-dbfile dbdat)
			"nofilenameavailable"))
	 (jfile     (conc fname"-journal"))
	 #;(subdb     (if have-struct
			(dbfile:get-subdb dbstruct run-id)
			#f))
	 (use-mutex (> *api-process-request-count* 25))) ;; was 25
    (if (file-exists? jfile)
	(begin
	  (dbfile:print-err "INFO: "jfile" exists, delaying few seconds to reduce database load")
	  (thread-sleep! 2)))
    (if (and use-mutex
	     (common:low-noise-print 120 "over-50-parallel-api-requests"))
	(dbfile:print-err *api-process-request-count* " parallel api requests being processed in process "
			  (current-process-id) ", throttling access"))
    (condition-case
	(begin
	  (if use-mutex (mutex-lock! *db-with-db-mutex*))
	  (let ((res (apply proc dbdat db params))) ;; the actual call is here.
	    (if use-mutex (mutex-unlock! *db-with-db-mutex*))
	    ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
	    (if dbdat
		(dbfile:add-dbdat dbstruct run-id dbdat))
	    res))
      (exn (io-error)
	   (db:generic-error-printout exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again."))

Added utils/open-files.sh version [61001a23e2].







>
>
>
1
2
3
echo "Database opens:  $(lsof -c mtest|egrep '.*db$'|wc -l)"
echo "Logfile opens:   $(lsof -c mtest|egrep '.*log$'|wc -l)"
echo "TCP connections: $(lsof -c mtest|grep TCP|wc -l)"