Megatest

Check-in [cfcc13973c]
Login
Overview
Comment:Added switchable support for db on /tmp instead of inmem. Added couple asserts to help find why run-id and servers are not aligned in some cases.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80
Files: files | file ages | folders
SHA1: cfcc13973cae9eeaff6bacca9085fe94289493b2
User & Date: matt on 2023-04-07 08:25:40
Other Links: branch diff | manifest | tags
Context
2023-04-09
13:27
Added host name to messages about server not started check-in: 21f45d51cf user: mmgraham tags: v1.80
2023-04-08
02:45
Removed assert from simple lock. Removed use of statement cache which fixed bind issues check-in: 532f050f93 user: matt tags: v1.80
2023-04-07
08:25
Added switchable support for db on /tmp instead of inmem. Added couple asserts to help find why run-id and servers are not aligned in some cases. check-in: cfcc13973c user: matt tags: v1.80
03:56
For /tmp db cache disambiguate the db with pid check-in: 481acc5191 user: matt tags: v1.80
Changes

Modified dbfile.scm from [50f8ec9e6b] to [cf63c9cd5f].

192
193
194
195
196
197
198

199
200
201
202
203
204
205
206
207
208
209
210
211

212



213

214
215

216
217
218
219
220
221
222
	       #;(db:safely-close-sqlite3-db rdb #f))) ;; stmt-cache))))) ;; (if (sqlite3:database? rdb) (sqlite3:finalize! rdb))))))
	   subdbs)
           #t
          )
          #f
  )
)


(define (dbfile:run-id->path apath run-id)
  (conc apath"/"(dbfile:run-id->dbname run-id)))

(define (db:dbname->path apath dbname)
  (conc apath"/"dbname))

(define (dbfile:run-id->dbnum run-id)
  (cond
   ((number? run-id)
    (modulo run-id (num-run-dbs)))
   ((not run-id) "main")   ;; 0 or main?
   (else run-id)))





;; POTENTIAL BUG: this implementation could produce a db file if run-id is neither #f or a number

(define (dbfile:run-id->dbname run-id)
  (conc ".mtdb/"(dbfile:run-id->dbnum run-id)".db"))


;; Make the dbstruct, setup up auxillary db's and call for main db at least once
;;
;; called in http-transport and replicated in rmt.scm for *local* access. 
;;
(define (dbfile:setup do-sync areapath tmppath)
  (cond







>











|
|
>

>
>
>
|
>

|
>







192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
	       #;(db:safely-close-sqlite3-db rdb #f))) ;; stmt-cache))))) ;; (if (sqlite3:database? rdb) (sqlite3:finalize! rdb))))))
	   subdbs)
           #t
          )
          #f
  )
)


(define (dbfile:run-id->path apath run-id)
  (conc apath"/"(dbfile:run-id->dbname run-id)))

(define (db:dbname->path apath dbname)
  (conc apath"/"dbname))

(define (dbfile:run-id->dbnum run-id)
  (cond
   ((number? run-id)
    (modulo run-id (num-run-dbs)))
   ((not run-id) "main")   ;; 0 or main? No, not 0. 
   (else
    (assert #f "FATAL: run-id is required to be a number or #f"))))

;; just the filename
(define (dbfile:run-id->dbfname run-id)
  (conc (dbfile:run-id->dbnum run-id)".db"))

;; the path in MTRAH with the filename
(define (dbfile:run-id->dbname run-id)
  (conc ".mtdb/"(dbfile:run-id->dbfname run-id)))


;; Make the dbstruct, setup up auxillary db's and call for main db at least once
;;
;; called in http-transport and replicated in rmt.scm for *local* access. 
;;
(define (dbfile:setup do-sync areapath tmppath)
  (cond

Modified dbmod.scm from [722704066d] to [e3fed73fc8].

39
40
41
42
43
44
45


46
47
48
49
50
51
52
	srfi-18
	srfi-69

	commonmod
	dbfile
	debugprint
	)



;; NOTE: This returns only the name "1.db", "main.db", not the path
;;
(define (dbmod:run-id->dbfname run-id)
  (conc (dbfile:run-id->dbnum run-id)".db"))

(define (dbmod:get-dbdir dbstruct)







>
>







39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
	srfi-18
	srfi-69

	commonmod
	dbfile
	debugprint
	)

(define dbcache-mode (make-parameter 'tmp)) ;; 'inmem, 'tmp

;; NOTE: This returns only the name "1.db", "main.db", not the path
;;
(define (dbmod:run-id->dbfname run-id)
  (conc (dbfile:run-id->dbnum run-id)".db"))

(define (dbmod:get-dbdir dbstruct)
163
164
165
166
167
168
169






170
171



172
173
174
175
176
177
178
			    #!key (dbstruct-in #f)
			    (syncdir 'todisk))
  (let* ((dbstruct     (or dbstruct-in (make-dbr:dbstruct areapath: areapath)))
	 (dbfname      (or dbfname-in (dbmod:run-id->dbfname run-id)))
	 (dbpath       (dbmod:get-dbdir dbstruct))             ;; directory where all the .db files are kept
	 (dbfullname   (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id))
	 (dbexists     (file-exists? dbfullname))






	 (inmem        (dbmod:open-inmem-db init-proc
					    (conc "/tmp/"(current-process-id)"-"dbfname) ;; will create /tmp file



					    ))
	 (write-access (file-write-access? dbpath))
	 (db           (dbmod:safely-open-db dbfullname init-proc write-access))
	 (tables       (db:sync-all-tables-list keys)))
    (assert (sqlite3:database? inmem) "FATAL: open-dbmoddb: inmem is not a db")
    (assert (sqlite3:database? db) "FATAL:  open-dbmoddb: db is not a db")
    (dbr:dbstruct-inmem-set!     dbstruct inmem)







>
>
>
>
>
>

<
>
>
>







165
166
167
168
169
170
171
172
173
174
175
176
177
178

179
180
181
182
183
184
185
186
187
188
			    #!key (dbstruct-in #f)
			    (syncdir 'todisk))
  (let* ((dbstruct     (or dbstruct-in (make-dbr:dbstruct areapath: areapath)))
	 (dbfname      (or dbfname-in (dbmod:run-id->dbfname run-id)))
	 (dbpath       (dbmod:get-dbdir dbstruct))             ;; directory where all the .db files are kept
	 (dbfullname   (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id))
	 (dbexists     (file-exists? dbfullname))
	 (tmpdir       (conc "/tmp/"(current-user-name)))
	 (tmpdb        (let* ((fname (conc tmpdir"/"(current-process-id)"-"dbfname)))
			 (if (not (file-exists? tmpdir))(create-directory tmpdir))
			 ;; check if tmpdb already exists, either delete it or
			 ;; add something to the name
			 fname))
	 (inmem        (dbmod:open-inmem-db init-proc

					    (if (eq? (dbcache-mode) 'inmem)
						#f
						tmpdb)
					    ))
	 (write-access (file-write-access? dbpath))
	 (db           (dbmod:safely-open-db dbfullname init-proc write-access))
	 (tables       (db:sync-all-tables-list keys)))
    (assert (sqlite3:database? inmem) "FATAL: open-dbmoddb: inmem is not a db")
    (assert (sqlite3:database? db) "FATAL:  open-dbmoddb: db is not a db")
    (dbr:dbstruct-inmem-set!     dbstruct inmem)

Modified rmt.scm from [759d7a27d6] to [35e00a1063].

126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
  (if (not runremote)
      (let* ((newremote  (make-and-init-remote areapath)))
	(set! *runremote* newremote)
	(set! runremote newremote)))
  (let* ((dbfname (conc (dbfile:run-id->dbnum run-id)".db"))) ;;(dbfile:run-id->path areapath run-id)))
    (tt:handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))
	


(define (rmt:print-db-stats)
  (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
    (debug:print 18 *default-log-port* "DB Stats\n========")
    (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
    (for-each (lambda (cmd)
		(let ((cmd-dat (hash-table-ref *db-stats* cmd)))
		  (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0))))))







<
<







126
127
128
129
130
131
132


133
134
135
136
137
138
139
  (if (not runremote)
      (let* ((newremote  (make-and-init-remote areapath)))
	(set! *runremote* newremote)
	(set! runremote newremote)))
  (let* ((dbfname (conc (dbfile:run-id->dbnum run-id)".db"))) ;;(dbfile:run-id->path areapath run-id)))
    (tt:handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))
	


(define (rmt:print-db-stats)
  (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
    (debug:print 18 *default-log-port* "DB Stats\n========")
    (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
    (for-each (lambda (cmd)
		(let ((cmd-dat (hash-table-ref *db-stats* cmd)))
		  (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0))))))

Modified tcp-transportmod.scm from [09d461be9f] to [a71da4bf27].

113
114
115
116
117
118
119



120
121
122

123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
;; make ttdat visible
(define *server-info* #f)

(define (tt:make-remote areapath)
  (make-tt areapath: areapath))

;; 1 ... or #f



(define (tt:valid-run-id run-id)
  (or (number? run-id)
      (not run-id)))


(tcp-buffer-size 2048)
;; (max-connections 4096) 

;; do all the busy work of finding and setting up conn for
;; connecting to a server
;; 
(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
  (assert (tt:valid-run-id run-id) "FATAL: invalid run-id "run-id)
  (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
	 (server-start-proc (lambda ()
			      (tt:server-process-run
			       (tt-areapath ttdat)
			       testsuite ;; (dbfile:testsuite-name)
			       (common:find-local-megatest)
			       run-id))))







>
>
>
|
|
|
>








|







113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
;; make ttdat visible
(define *server-info* #f)

(define (tt:make-remote areapath)
  (make-tt areapath: areapath))

;; 1 ... or #f
;; and check that dbfname matches. FIXME: the propagation of dbfname and run-id
;; might not make the best sense
;;
(define (tt:valid-run-id run-id dbfname)
  (and (or (number? run-id)
	   (not run-id))
       (equal? (dbfile:run-id->dbfname run-id) dbfname)))

(tcp-buffer-size 2048)
;; (max-connections 4096) 

;; do all the busy work of finding and setting up conn for
;; connecting to a server
;; 
(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
  (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
  (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
	 (server-start-proc (lambda ()
			      (tt:server-process-run
			       (tt-areapath ttdat)
			       testsuite ;; (dbfile:testsuite-name)
			       (common:find-local-megatest)
			       run-id))))
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; really do not want to swamp the machine with servers
		 (begin
		   (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
		   (server-start-proc)
		   (tt-last-serv-start-set! ttdat (current-seconds))))
	     (thread-sleep! 1)
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
    
(define (tt:ping host port server-id)
  (let*  ((res (tt:send-receive-direct host port `(ping #f #f #f)))) ;; please send me your server-id
    ;;
    ;; need two threads, one a 5 second timer
    ;;
    (match res
      ((status errmsg result meta)







|







178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; really do not want to swamp the machine with servers
		 (begin
		   (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
		   (server-start-proc)
		   (tt-last-serv-start-set! ttdat (current-seconds))))
	     (thread-sleep! 1)
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))

(define (tt:ping host port server-id)
  (let*  ((res (tt:send-receive-direct host port `(ping #f #f #f)))) ;; please send me your server-id
    ;;
    ;; need two threads, one a 5 second timer
    ;;
    (match res
      ((status errmsg result meta)
263
264
265
266
267
268
269

270



271
272
273
274
275
276
277
;; returns list of (host port startseconds server-id servinfofile)
;;
(define (tt:get-server-info-sorted ttdat dbfname)
  (let* ((areapath (tt-areapath ttdat))
	 (sfiles   (tt:find-server areapath dbfname))
	 (sdats    (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read
	 (sorted   (sort sdats (lambda (a b)

				 (< (list-ref a 2)(list-ref b 2)))))



	 (count    0))
    (for-each
     (lambda (rec)
       (if (or (> (length sorted) 1)
	       (common:low-noise-print 120 "server info sorted"))
	   (debug:print 0 *default-log-port* "SERVER #"count": "(string-intersperse (map conc sorted) ", ")))
       (set! count (+ count 1)))







>
|
>
>
>







267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
;; returns list of (host port startseconds server-id servinfofile)
;;
(define (tt:get-server-info-sorted ttdat dbfname)
  (let* ((areapath (tt-areapath ttdat))
	 (sfiles   (tt:find-server areapath dbfname))
	 (sdats    (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read
	 (sorted   (sort sdats (lambda (a b)
				 (let* ((starta (list-ref a 2))
					(startb (list-ref b 2)))
				   (if (eq? starta startb)
				       (string>? (list-ref a 3)(list-ref b 3)) ;; if servers started at same time look at server-id
				       (< starta startb))))))
	 (count    0))
    (for-each
     (lambda (rec)
       (if (or (> (length sorted) 1)
	       (common:low-noise-print 120 "server info sorted"))
	   (debug:print 0 *default-log-port* "SERVER #"count": "(string-intersperse (map conc sorted) ", ")))
       (set! count (+ count 1)))
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444

445
446
447
448
449
450
451
					  (> (- (current-seconds)(file-modification-time servinfofile)) 30))
				     (begin
				       ;; can't ping and file has been on disk 15 seconds, go ahead and try to remove it
				       (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile)
				       (delete-file* servinfofile)
				       #t) ;; not the server but the server is not reachable
				     (begin
				       (debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", trying again.")
				       (thread-sleep! 1) ;; just because
				       #t)))))
			    (else ;; should never get here
			     (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv)
			     (assert #f "Bad server record "leadsrv))))))))
	(if ok
	    ;; (if (> *api-process-request-count* 0) ;; have requests in flight
	    ;;	(tt-last-access-set! ttdat (current-seconds)))
	    (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access
	    (begin

	      (cleanup)
	      (exit)))

	(let* ((last-update (dbr:dbstruct-last-update dbstruct))
	       (curr-secs   (current-seconds)))
	  (if (and (eq? (tt-state ttdat) 'running)
		   (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db? maybe this should be refresh the inmem?







|










>







435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
					  (> (- (current-seconds)(file-modification-time servinfofile)) 30))
				     (begin
				       ;; can't ping and file has been on disk 15 seconds, go ahead and try to remove it
				       (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile)
				       (delete-file* servinfofile)
				       #t) ;; not the server but the server is not reachable
				     (begin
				       (debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", will try again.")
				       (thread-sleep! 1) ;; just because
				       #t)))))
			    (else ;; should never get here
			     (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv)
			     (assert #f "Bad server record "leadsrv))))))))
	(if ok
	    ;; (if (> *api-process-request-count* 0) ;; have requests in flight
	    ;;	(tt-last-access-set! ttdat (current-seconds)))
	    (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access
	    (begin
	      (debug:print 0 *default-log-port* "Exiting immediately")
	      (cleanup)
	      (exit)))

	(let* ((last-update (dbr:dbstruct-last-update dbstruct))
	       (curr-secs   (current-seconds)))
	  (if (and (eq? (tt-state ttdat) 'running)
		   (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db? maybe this should be refresh the inmem?