Megatest

Check-in [abb2c73974]
Login
Overview
Comment:wip
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80
Files: files | file ages | folders
SHA1: abb2c7397456dc2a1867d0ebfe4458d8aa1cf6e9
User & Date: mrwellan on 2023-03-03 17:37:11
Other Links: branch diff | manifest | tags
Context
2023-03-03
18:32
no crashes (yet) check-in: 349234dd61 user: mrwellan tags: v1.80
17:37
wip check-in: abb2c73974 user: mrwellan tags: v1.80
02:16
Speculative fix for wrong server access issue check-in: 0f348aa84b user: mrwellan tags: v1.80
Changes

Modified api.scm from [1736f4bcb3] to [9b20a45ffe].

241
242
243
244
245
246
247
248






249
250
251
252
253
254
255
256
257


258
259
260
261
262
263
264
265


266
267

268
269
270


271
272
273
274
275
276
277
241
242
243
244
245
246
247

248
249
250
251
252
253
254
255
256
257
258
259
260


261
262
263
264
265
266
267
268


269
270


271



272
273
274
275
276
277
278
279
280







-
+
+
+
+
+
+







-
-
+
+






-
-
+
+
-
-
+
-
-
-
+
+







  (if (not *server-signature*)
      (set! *server-signature* (tt:mk-signature *toppath*)))
  (lambda ()
    (let* ((indat      (deserialize))
	   (newcount   (+ *api-process-request-count* 1))
	   (delay-wait (if (> newcount 10)
			   (- newcount 10)
			   0)))
			   0))
	   (normal-proc (lambda (cmd run-id params)
			  (case cmd
			    ((ping) *server-signature*)
			    (else
			     (api:dispatch-request dbstruct cmd run-id params))))))
      (set! *api-process-request-count* newcount)
      (set! *db-last-access* (current-seconds))
      (match indat
	((cmd run-id params meta)
	 (let* ((ttdat   *server-info*)
		(server-state (tt-state ttdat))
		(status  (cond
			  ((> newcount 30) 'busy)
			  ((> newcount 15) 'loaded)
			  ;; ((> newcount 600) 'busy)
			  ;; ((> newcount 300) 'loaded)
			  (else 'ok)))
		(errmsg  (case status
			   ((busy)   (conc "Server overloaded, "newcount" threads in flight"))
			   ((loaded) (conc "Server loaded, "newcount" threads in flight"))
			   (else     #f)))
		(result  (case status
			   ((busy)  (- newcount 29))
			   ((loaded) #f)
			   ((busy)  (- newcount 29)) ;; call back in as many seconds
			   ((loaded)
			   (else
			    (case cmd
			    (normal-proc cmd run-id params))
			      ((ping) *server-signature*)
			      (else
			       (api:dispatch-request dbstruct cmd run-id params))))))
			   (else
			    (normal-proc cmd run-id params))))
		(meta   (case cmd
			  ((ping) `((sstate . ,server-state)))
			  (else   `((wait . ,delay-wait)))))
		(payload (list status errmsg result meta)))
	   (set! *api-process-request-count* (- *api-process-request-count* 1))
	   (serialize payload)))
	(else

Modified db.scm from [5ed64563c8] to [5255866c2a].

2852
2853
2854
2855
2856
2857
2858














2859
2860
2861
2862
2863
2864
2865
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879







+
+
+
+
+
+
+
+
+
+
+
+
+
+







	  ;;                 0    1       2      3      4        5       6      7        8     9     10      11          12          13       14
	  (set! res (cons (apply vector a b) res)))
	db
	(conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id in ("
	      (string-intersperse (map conc test-ids) ",") ");"))
       res))))

;; Call proc on params (via apply) until it returns a true value,
;; sleeping one second between attempts. `tries` is the number of
;; retries allowed after the first call, so proc is invoked at most
;; (+ tries 1) times. Returns the first true result, or #f (after
;; logging a message) once the retries are used up.
;;
(define (db:keep-trying-until-true proc params tries)
  (let retry ((remaining tries))
    (or (apply proc params)
	(cond
	 ((> remaining 0)
	  (thread-sleep! 1) ;; fixed one second pause before the next attempt
	  (retry (- remaining 1)))
	 (else
	  (debug:print-info 0 *default-log-port* "proc never returned true, params="params)
	  #f)))))
  
;; Look up the test record for test-name/item-path in run run-id.
;; The query itself is done by db:get-test-info-db, run inside
;; db:with-db so it receives a db handle for this dbstruct/run-id.
;;
(define (db:get-test-info dbstruct run-id test-name item-path)
  (let ((fetch-test-info
	 (lambda (dbdat db)
	   ;; dbdat is supplied by db:with-db but not needed here
	   (db:get-test-info-db db run-id test-name item-path))))
    (db:with-db dbstruct run-id #f fetch-test-info)))
3314
3315
3316
3317
3318
3319
3320


3321


3322
3323
3324
3325
3326
3327
3328
3328
3329
3330
3331
3332
3333
3334
3335
3336

3337
3338
3339
3340
3341
3342
3343
3344
3345







+
+
-
+
+







;; if test-name is an integer work off that as test-id instead of test-name test-path
;;
(define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment)
  ;; establish info on incoming test followed by info on top level test
  ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met
  (let* ((testdat      (if (number? test-name)
			   (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id
			   (db:keep-trying-until-true ;; in our threaded stuff this call could happen before the test is registered (maybe?)
			    db:get-test-info
			   (db:get-test-info       dbstruct run-id test-name item-path)))
			    (list dbstruct run-id test-name item-path)
			    10)))
	 (test-id      (db:test-get-id testdat))
	 (test-name    (if (number? test-name)
			   (db:test-get-testname testdat)
			   test-name))
	 (item-path    (db:test-get-item-path testdat))
         (tl-testdat   (db:get-test-info dbstruct run-id test-name ""))
         (tl-test-id   (if tl-testdat

Modified dbfile.scm from [776ffe336b] to [e6a2908d99].

15
16
17
18
19
20
21
22

23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42

43
44
45
46
47
48
49
15
16
17
18
19
20
21

22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

42
43
44
45
46
47
48
49







-
+



















-
+







;; 
;;     You should have received a copy of the GNU General Public License
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;;======================================================================

(declare (unit dbfile))
;; (declare (uses debugprint))
(declare (uses debugprint))
(declare (uses commonmod))

(module dbfile
	*
	
  (import scheme
	  chicken
	  data-structures
	  extras
	  matchable)
  
(import (prefix sqlite3 sqlite3:)
	posix typed-records srfi-18 srfi-1
	srfi-69
	stack
	files
	ports

	commonmod
	;; debugprint
	debugprint
	)

;; Tunable parameters for the dbfile module. Each is created with
;; make-parameter, so the value is read by calling it with no arguments.
;;
;; NOTE(review): units of keep-age-param are not visible here, and "qif" in
;; the comment below may be a typo - confirm against the code that reads it.
(define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic
(define num-run-dbs    (make-parameter 10))     ;; number of db's in .megatest
;; defaults to #f; presumably set to the testsuite name by the caller - TODO confirm
(define dbfile:testsuite-name (make-parameter #f))
(define dbfile:sync-method (make-parameter 'attach)) ;; 'attach or 'original

405
406
407
408
409
410
411

412
413
414
415
416
417
418
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419







+







    (system cmd)))

;; opens and returns handle and nothing else
;;
(define (dbfile:raw-open-no-sync-db dbpath)
  (if (not (file-exists? dbpath))
      (create-directory dbpath #t))
  (debug:print-info 0 *default-log-port* "Opening "dbpath"/no-sync.db")
  (let* ((dbname    (conc dbpath "/no-sync.db"))
	 (db-exists (file-exists? dbname))
	 (init-proc (lambda (db)
		      (if (not db-exists)
			  (begin
			    (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));"))
			  )))
466
467
468
469
470
471
472


473

474
475
476
477
478
479
480
467
468
469
470
471
472
473
474
475

476
477
478
479
480
481
482
483







+
+
-
+








;; transaction protected lock acquisition
;; either:
;;    fails    (returns (#f . lock-creation-time))
;;    succeeds (returns (#t . lock-creation-time))
;; use (db:no-sync-del! db keyname) to release the lock
;;
;; 
;;
(define (db:no-sync-get-lock db keyname)
(define (db:no-sync-get-lock db keyname . identification)
  (sqlite3:with-transaction
   db
   (lambda ()
     (condition-case
	 (let* ((curr-val (db:no-sync-get/default db keyname #f)))
	   (if curr-val
	       `(#f . ,curr-val)   ;; (sqlite3:first-result db "SELECT val FROM no_sync_metadat WHERE var=?;" keyname))

Modified launch.scm from [6fda936e21] to [b3f7d09843].

1457
1458
1459
1460
1461
1462
1463

1464
1465
1466
1467
1468
1469
1470
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471







+







;; 2. create run dir on disk, path name is meaningful
;; 3. create link from run dir to megatest runs area 
;; 4. remotely run the test on allocated host
;;    - could be ssh to host from hosts table (update regularly with load)
;;    - could be netbatch
;;      (launch-test db (cadr status) test-conf))
(define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params)
  (assert runname "FATAL: launch-test called with no runname")
  (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex
  (let* ( ;; (lock-key        (conc "test-" test-id))
	;; (got-lock        (let loop ((lock        (rmt:no-sync-get-lock lock-key))
	;; 			     (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds
	;; 		    (if (car lock)
	;; 			#t
	;; 			(if (> (current-seconds) expire-time)
1563
1564
1565
1566
1567
1568
1569
1570

1571
1572
1573
1574
1575
1576
1577
1564
1565
1566
1567
1568
1569
1570

1571
1572
1573
1574
1575
1576
1577
1578







-
+







      ;; the following call handles waiver propogation. cannot yet condense into roll-up-pass-fail
      (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED"))
      (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f)
      ;; (pp (hash-table->alist tconfig))
      (set! diskpath (get-best-disk *configdat* tconfig))
      (debug:print 2 *default-log-port* "best disk path = " diskpath)
      (if diskpath
	  (let ((dat  (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat)))
	  (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat)))
	    (set! work-area (car dat))
	    (set! toptest-work-area (cadr dat))
	    (debug:print-info 2 *default-log-port* "Using work area " work-area))
	  (begin
	    (set! work-area (conc test-path "/tmp_run"))
	    (create-directory work-area #t)
	    (debug:print 0 *default-log-port* "WARNING: No disk work area specified - running in the test directory under tmp_run")))

Modified megatest.scm from [a9076c4fb8] to [c6a7ef8a82].

93
94
95
96
97
98
99

100
101
102
103
104
105
106
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107







+







;; NOTE(review): the trailing comment says to change this to #t in ~/.megatestrc,
;; but the value defined here is already #t - confirm which default is intended.
(define *usage-use-seconds* #t) ;; for Epoch seconds in usage logging change this to #t in ~/.megatestrc file

;; set some parameters here - these need to be put in something that can be loaded from other
;; executables such as dashboard and mtutil
;;
(include "transport-mode.scm")
;; presumably registers db:initialize-main-db as the db initialization procedure used by dbfile - TODO confirm
(dbfile:db-init-proc db:initialize-main-db)
;; presumably turns on timestamps in debug output - confirm in debugprint
(debug:enable-timestamp #t) 

;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file
;;
;; NOTE(review): if HOME is unset the path is built from a non-string value - confirm this is acceptable.
(let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc")))
  (if (common:file-exists? debugcontrolf)
      (load debugcontrolf)))

Modified tcp-transportmod.scm from [2182e473e2] to [8018e361e3].

206
207
208
209
210
211
212
213

214
215
216
217

218
219
220
221
222
223
224
206
207
208
209
210
211
212

213
214
215
216

217
218
219
220
221
222
223
224







-
+



-
+







		   (if (and (number? delay-wait)
			    (> delay-wait 0))
		       (begin
			 (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds")
			 (thread-sleep! delay-wait)))))
	     (case status
	       ((busy) ;; result will be how long the server wants you to delay
		(debug:print 0 *default-log-port* "WARNING: server is overloaded, will try again in "result" seconds.")
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is overloaded, will try again in "result" seconds.")
		(thread-sleep! (if (number? result) result 2))
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server is loaded, will try again in a 1/4 second.")
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, will try again in a 1/4 second.")
		(thread-sleep! 0.25)
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else
	     (if (not res)
		 (let* ((host    (tt-conn-host conn))
265
266
267
268
269
270
271
272







273
274
275
276
277
278
279
265
266
267
268
269
270
271

272
273
274
275
276
277
278
279
280
281
282
283
284
285







-
+
+
+
+
+
+
+







;; Returns the list of server records found for dbfname, sorted ascending
;; by element 2 (startseconds), and logs one line per record.
;; Each record is (host port startseconds server-id pid dbfname servinfofile).
;;
(define (tt:get-server-info-sorted ttdat dbfname)
  (let* ((areapath (tt-areapath ttdat))
	 (sfiles   (tt:find-server areapath dbfname))
	 (sdats    (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read
	 (sorted   (sort sdats (lambda (a b)
				 (< (list-ref a 2)(list-ref b 2)))))
	 (count    0)) ;; rank of the record being logged
    (for-each
     (lambda (rec)
       ;; print only this record's fields; previously the entire list was printed on every iteration
       (debug:print 0 *default-log-port* "SERVER #"count": "(string-intersperse (map conc rec) ", "))
       (set! count (+ count 1)))
     sorted)
    sorted))
    
(define (tt:get-current-server-info ttdat dbfname)
  (assert (tt-areapath ttdat) "FATAL: areapath not set in ttdat.")
  ;;
  ;; TODO - replace most of below with tt;get-server-info-sorted
  ;;
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407





















408
409
410
411
412
413
414
415
416







417
418

419
420
421
422
423
424



425
426
427
428
429
430
431
432
433
434
435
436
437

438

439

440
441
442
443
444
445
446
391
392
393
394
395
396
397
















398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423




424
425
426
427
428
429
430
431

432
433
434
435
436


437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453

454
455
456
457
458
459
460
461
462
463







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+





-
-
-
-
+
+
+
+
+
+
+

-
+




-
-
+
+
+













+
-
+

+







    (thread-sleep! 0.05) ;; any real need for delay here?
    (let loop ()
      (let* ((servers (tt:get-server-info-sorted ttdat dbfname))
	     (ok      (cond
		       ((null? servers) #f) ;; not ok
		       ((equal? (list-ref (car servers) 6) ;; compare the servinfofile
				(tt-servinf-file ttdat))
			(debug:print-info 0 *default-log-port* "Keep running, I'm the top server on "(tt-host ttdat)":"(tt-port ttdat))
			(if db-locked-in
			    #t
			    (let* ((lockinfo  (dbfile:with-no-sync-db nosyncdbpath
								      (lambda (db)
									(db:no-sync-get-lock db dbfname))))
				   (success   (car lockinfo)))
			      (if success
				  (begin
				    (tt-state-set! ttdat 'running)
				    (debug:print 0 *default-log-port* "Got server lock for "dbfname)
				    (set! db-locked-in #t)
				    #t)
				  (begin
				    (debug:print 0 *default-log-port* "Failed to get server lock for "dbfname)
				    #f)))))
			(let* ((res (if db-locked-in
					#t
					(let* ((lockinfo  (dbfile:with-no-sync-db
							   nosyncdbpath
							   (lambda (db)
							     (db:no-sync-get-lock db dbfname))))
					       (success   (car lockinfo)))
					  (if success
					      (begin
						(tt-state-set! ttdat 'running)
						(debug:print 0 *default-log-port* "Got server lock for "
							     dbfname)
						(set! db-locked-in #t)
						#t)
					      (begin
						(debug:print 0 *default-log-port* "Failed to get server lock for "dbfname)
						#f))))))
			  (if res
			      (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for "
						dbfname" on "(tt-host ttdat)":"(tt-port ttdat)))
			  res))
		       (else
			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
			(let* ((leadsrv (car servers)))
			  (match leadsrv
			    ((host port startseconds server-id pid dbfname servinfofile)
			     (if (tt:ping host port server-id)
				 #f ;; not the server, but all good, want to exit
				 (if (and (file-exists? servinfofile)
					  (> (- (current-seconds)(file-modification-time servinfofile)) 15))
			     (let* ((res  (tt:ping host port server-id)))
			       (debug:print-info 0 *default-log-port* "Ping to "host":"port", with server-id "server-id
						 ", and file "servinfofile" returned "res)
			       (if res
				   #f ;; not the server, but all good, want to exit
				   (if (and (file-exists? servinfofile)
					  (> (- (current-seconds)(file-modification-time servinfofile)) 30))
				     (begin
				       ;; can't ping and file has been on disk 5 seconds, go ahead and try to remove it
				       ;; can't ping and file has been on disk 15 seconds, go ahead and try to remove it
				       (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile)
				       (delete-file* servinfofile)
				       #t) ;; not the server but the server is not reachable
				     (begin
				       (debug:print 0 *default-log-port* "I'm not the server but will try again since "servinfofile" is fresh")
				       #t))))
				       (debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", trying again.")
				       (thread-sleep! 1) ;; just because
				       #t)))))
			    (else ;; should never get here
			     (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv)
			     (assert #f "Bad server record "leadsrv))))))))
	(if ok
	    ;; (if (> *api-process-request-count* 0) ;; have requests in flight
	    ;;	(tt-last-access-set! ttdat (current-seconds)))
	    (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access
	    (begin
	      (cleanup)
	      (exit)))

	(let* ((last-update (dbr:dbstruct-last-update dbstruct))
	       (curr-secs   (current-seconds)))
	  (if (and (eq? (tt-state ttdat) 'running)
	  (if (> (- curr-secs last-update) 3) ;; every 3-4 seconds update the db? maybe this should be refresh the inmem?
		   (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db? maybe this should be refresh the inmem?
	      (begin
		(set! (file-modification-time (tt-servinf-file ttdat)) (current-seconds))
		((dbr:dbstruct-sync-proc dbstruct) last-update)
		(dbr:dbstruct-last-update-set! dbstruct curr-secs))))
	  
	(if (< (- (current-seconds) (tt-last-access ttdat)) 60)
	    (begin
	      (thread-sleep! 5)
	      (loop)))))