Megatest

Check-in [a66ca8ec4f]
Login
Overview
Comment:removed checking for journal file. Moved setting of busy timeout and PRAGMA synchronous inside cautious-open-database
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.70
Files: files | file ages | folders
SHA1: a66ca8ec4ff1945b9210e1e5f9e4294b5dfe984c
User & Date: mmgraham on 2022-09-16 16:21:26
Other Links: branch diff | manifest | tags
Context
2022-09-18
10:48
Wrap dbfile:open-db call with mutex to minimize over-opening of db's check-in: 53f35f5363 user: matt tags: v1.70
2022-09-16
16:21
removed checking for journal file. Moved setting of busy timeout and PRAGMA synchronous inside cautious-open-database check-in: a66ca8ec4f user: mmgraham tags: v1.70
2022-08-30
15:44
Changed server timeout from 60 to 1200 seconds check-in: 36255e358b user: mmgraham tags: v1.70, v1.7006
Changes

Modified db.scm from [8c707e9257] to [974c310e18].

1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
           #t)))))

(define (db:get-status-from-final-status-file run-dir)
  (let ((infile (conc run-dir "/.final-status")))
    ;; first verify we are able to write the output file
    (if (not (file-read-access? infile))
        (begin 
	  (debug:print 0 *default-log-port* "ERROR: cannot read " infile)
          (debug:print 0 *default-log-port* "ERROR: run-dir is " run-dir)
          #f
          )
        (with-input-from-file infile read-lines)
	)))

;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as







|
|







1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
           #t)))))

(define (db:get-status-from-final-status-file run-dir)
  (let ((infile (conc run-dir "/.final-status")))
    ;; first verify we are able to write the output file
    (if (not (file-read-access? infile))
        (begin 
	  (debug:print 2 *default-log-port* "ERROR: cannot read " infile)
          (debug:print 2 *default-log-port* "ERROR: run-dir is " run-dir)
          #f
          )
        (with-input-from-file infile read-lines)
	)))

;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as

Modified dbfile.scm from [f6dfe9d92f] to [a29708f170].

307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
;;   NOTE: returns a dbdat not a dbstruct!
;;
(define (dbfile:open-sqlite3-db dbpath init-proc)
  (let* ((dbexists     (file-exists? dbpath))
	 (write-access (file-write-access? dbpath))
	 (db           (dbfile:cautious-open-database dbpath init-proc))) #;(sqlite3:open-database dbpath)
    (dbfile:inc-db-open dbpath)
    (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 10000))
    (sqlite3:execute db (conc "PRAGMA synchronous = 0;"))
    ;; (init-proc db)
    (make-dbr:dbdat dbfile: dbpath dbh: db read-only: (not write-access))))

(define (dbfile:print-and-exit . params)
  (with-output-to-port
      (current-error-port)
    (lambda ()







<
<







307
308
309
310
311
312
313


314
315
316
317
318
319
320
;;   NOTE: returns a dbdat not a dbstruct!
;;
(define (dbfile:open-sqlite3-db dbpath init-proc)
  (let* ((dbexists     (file-exists? dbpath))
	 (write-access (file-write-access? dbpath))
	 (db           (dbfile:cautious-open-database dbpath init-proc))) #;(sqlite3:open-database dbpath)
    (dbfile:inc-db-open dbpath)


    ;; (init-proc db)
    (make-dbr:dbdat dbfile: dbpath dbh: db read-only: (not write-access))))

(define (dbfile:print-and-exit . params)
  (with-output-to-port
      (current-error-port)
    (lambda ()
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503


504
505
506
507
508
509
510
	 (delay-time (* (- 51 tries-left) 1.1))
      	 (write-access (file-write-access? fname))
         (dir-access (file-write-access? (pathname-directory fname)))
         (retry      (lambda ()
		       (thread-sleep! delay-time)
		       (if (> tries-left 0)
			   (dbfile:cautious-open-database fname init-proc (- tries-left 1))))))
    (assert (>= tries-left 0) (conc "FATAL: too many attempts in dbfile:cautious-open-database of "fname", giving up."))

    (if (and (file-write-access? fname)
	     (file-exists? busy-file))
	(begin
	  (dbfile:print-err "INFO: dbfile:cautious-open-database: journal file " busy-file " exists, trying again in few seconds.")
	  (thread-sleep! 1)
	  (if (eq? tries-left 2)
	      (begin
		(dbfile:print-err "INFO: forcing journal rollup "busy-file)
		(dbfile:brute-force-salvage-db fname)))
	  (dbfile:cautious-open-database fname init-proc (- tries-left 1)))

	(let* ((result (condition-case
		         (if dir-access
			   (dbfile:with-simple-file-lock
			    (conc fname ".lock")
			    (lambda ()
			      (let* ((db-exists (file-exists? fname))
				     (db        (sqlite3:open-database fname))) ;; creates an empty db if it did not already exist.


				(if (and init-proc (not db-exists))
				    (init-proc db))
				db)))
                            (begin
                               (if (file-exists? fname )
                                   (begin
                                      (sqlite3:open-database fname)







<
<
<
<
<
<
<
<
<
<
<
<








>
>







475
476
477
478
479
480
481












482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
	 (delay-time (* (- 51 tries-left) 1.1))
      	 (write-access (file-write-access? fname))
         (dir-access (file-write-access? (pathname-directory fname)))
         (retry      (lambda ()
		       (thread-sleep! delay-time)
		       (if (> tries-left 0)
			   (dbfile:cautious-open-database fname init-proc (- tries-left 1))))))













	(let* ((result (condition-case
		         (if dir-access
			   (dbfile:with-simple-file-lock
			    (conc fname ".lock")
			    (lambda ()
			      (let* ((db-exists (file-exists? fname))
				     (db        (sqlite3:open-database fname))) ;; creates an empty db if it did not already exist.
                                (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 30000))
                                (sqlite3:execute db (conc "PRAGMA synchronous = 0;"))
				(if (and init-proc (not db-exists))
				    (init-proc db))
				db)))
                            (begin
                               (if (file-exists? fname )
                                   (begin
                                      (sqlite3:open-database fname)
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
			      (retry))
			 (exn (permission)(dbfile:print-err exn "ERROR: database " fname " has some permissions problem.")
			      (retry))
			 (exn ()
			      (dbfile:print-err exn "ERROR: Unknown error with database " fname " message: "
						((condition-property-accessor 'exn 'message) exn))
			      (retry)))))
          #;(if (file-write-access? fname)
	  (dbfile:simple-file-release-lock lock-file))
	  result))))

(define (dbfile:brute-force-salvage-db fname)
  (let* ((backupfname (conc fname"-"(current-process-id)".bak"))
	 (cmd (conc "cp "fname" "backupfname";mv "fname" "(conc fname ".delme;")
		    "cp "backupfname" "fname)))
    (dbfile:print-err "WARNING: attempting recovery of file "fname" by running commands:\n"
		      "  "cmd)







<
<
|







513
514
515
516
517
518
519


520
521
522
523
524
525
526
527
			      (retry))
			 (exn (permission)(dbfile:print-err exn "ERROR: database " fname " has some permissions problem.")
			      (retry))
			 (exn ()
			      (dbfile:print-err exn "ERROR: Unknown error with database " fname " message: "
						((condition-property-accessor 'exn 'message) exn))
			      (retry)))))


	  result)))

(define (dbfile:brute-force-salvage-db fname)
  (let* ((backupfname (conc fname"-"(current-process-id)".bak"))
	 (cmd (conc "cp "fname" "backupfname";mv "fname" "(conc fname ".delme;")
		    "cp "backupfname" "fname)))
    (dbfile:print-err "WARNING: attempting recovery of file "fname" by running commands:\n"
		      "  "cmd)

Modified server.scm from [775e426670] to [a114f4f994].

231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
		  (condition-case
		   (create-directory (conc areapath "/logs") #t)
		   (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
		   (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
		  (directory-exists? (conc areapath "/logs")))
		'()))

        ;; Get the list of server logs. First remove logs for servers that have exited.
	(let* (
               ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers.
               ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'")))
               (server-logs   (glob (conc areapath "/logs/server-*-*.log")))
	       (num-serv-logs (length server-logs)))
	  (if (or (null? server-logs) (= num-serv-logs 0))
              (let ()
                 (debug:print 2  *default-log-port* "There are no servers running at " (common:human-time))
	         '()
              )
	      (let loop ((hed  (string-chomp (car server-logs)))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
				   exn
				   (begin
				     (debug:print 0 *default-log-port* "failed to get modification time on " hed ", exn=" exn)
				     (current-seconds)) ;; 0
				   (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
		       (down-time (- (current-seconds) mod-time))
		       (serv-dat  (if (or (< num-serv-logs 10)
				  	  (< down-time 900)) ;; day-seconds))
				      (server:logf-get-start-info hed)
				      '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at







|
















|







231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
		  (condition-case
		   (create-directory (conc areapath "/logs") #t)
		   (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
		   (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
		  (directory-exists? (conc areapath "/logs")))
		'()))

        ;; Get the list of server logs.
	(let* (
               ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers.
               ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'")))
               (server-logs   (glob (conc areapath "/logs/server-*-*.log")))
	       (num-serv-logs (length server-logs)))
	  (if (or (null? server-logs) (= num-serv-logs 0))
              (let ()
                 (debug:print 2  *default-log-port* "There are no servers running at " (common:human-time))
	         '()
              )
	      (let loop ((hed  (string-chomp (car server-logs)))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
				   exn
				   (begin
				     (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn)
				     (current-seconds)) ;; 0
				   (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
		       (down-time (- (current-seconds) mod-time))
		       (serv-dat  (if (or (< num-serv-logs 10)
				  	  (< down-time 900)) ;; day-seconds))
				      (server:logf-get-start-info hed)
				      '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at