Megatest

Check-in [6c7b8be468]
Login
Overview
Comment:Squashed v1.80-dbperformance into one commit
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80-dbperf
Files: files | file ages | folders
SHA1: 6c7b8be4680f1fd29e13a9f9448cb07e4a3b688b
User & Date: matt on 2023-02-06 19:35:20
Other Links: branch diff | manifest | tags
Context
2023-02-10
09:48
Factor out http-transport aspect of rmt.scm in preparation for adding tcp-transport.scm Closed-Leaf check-in: bce6a00a70 user: matt tags: v1.80-dbperf
2023-02-06
19:35
Squashed v1.80-dbperformance into one commit check-in: 6c7b8be468 user: matt tags: v1.80-dbperf
2023-02-04
20:00
Change default number of db's to 10, couple minor fixes. Closed-Leaf check-in: 7b86032d25 user: matt tags: v1.80-dbperformance
2023-02-02
12:54
Use an actual droop check-in: 19861e6399 user: matt tags: v1.80
Changes

Modified api.scm from [0676a2f9d1] to [f3cc459c57].

37
38
39
40
41
42
43

44
45
46
47
48
49
50
(define api:read-only-queries
  '(get-key-val-pairs
    get-var
    get-keys
    get-key-vals
    test-toplevel-num-items
    get-test-info-by-id

    get-steps-info-by-id
    get-data-info-by-id
    test-get-rundir-from-test-id
    get-count-tests-running-for-testname
    get-count-tests-running
    get-count-tests-running-in-jobgroup
    get-previous-test-run-record







>







37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
(define api:read-only-queries
  '(get-key-val-pairs
    get-var
    get-keys
    get-key-vals
    test-toplevel-num-items
    get-test-info-by-id
    get-test-state-status-by-id
    get-steps-info-by-id
    get-data-info-by-id
    test-get-rundir-from-test-id
    get-count-tests-running-for-testname
    get-count-tests-running
    get-count-tests-running-in-jobgroup
    get-previous-test-run-record
326
327
328
329
330
331
332

333
334
335
336
337
338
339

    ;; ARCHIVES
    ((test-get-archive-block-info)     (apply db:test-get-archive-block-info dbstruct params))
    
    ;; TESTS
    ((test-toplevel-num-items)         (apply db:test-toplevel-num-items dbstruct params))
    ((get-test-info-by-id)	       (apply db:get-test-info-by-id dbstruct params))

    ((test-get-rundir-from-test-id)    (apply db:test-get-rundir-from-test-id dbstruct params))
    ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params))
    ((get-count-tests-running)         (apply db:get-count-tests-running dbstruct params))
    ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params))
    ;; ((delete-test-step-records)        (apply db:delete-test-step-records dbstruct params))
    ;; ((get-previous-test-run-record)    (apply db:get-previous-test-run-record dbstruct params))
    ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params))







>







327
328
329
330
331
332
333
334
335
336
337
338
339
340
341

    ;; ARCHIVES
    ((test-get-archive-block-info)     (apply db:test-get-archive-block-info dbstruct params))
    
    ;; TESTS
    ((test-toplevel-num-items)         (apply db:test-toplevel-num-items dbstruct params))
    ((get-test-info-by-id)	       (apply db:get-test-info-by-id dbstruct params))
    ((get-test-state-status-by-id)     (apply db:get-test-state-status-by-id dbstruct params))
    ((test-get-rundir-from-test-id)    (apply db:test-get-rundir-from-test-id dbstruct params))
    ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params))
    ((get-count-tests-running)         (apply db:get-count-tests-running dbstruct params))
    ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params))
    ;; ((delete-test-step-records)        (apply db:delete-test-step-records dbstruct params))
    ;; ((get-previous-test-run-record)    (apply db:get-previous-test-run-record dbstruct params))
    ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params))
349
350
351
352
353
354
355

356
357
358
359
360
361
362
    ((get-raw-run-stats)               (apply db:get-raw-run-stats dbstruct params))
    ((get-test-times)                  (apply db:get-test-times dbstruct params))

    ;; RUNS
    ((get-run-info)                 (apply db:get-run-info dbstruct params))
    ((get-run-status)               (apply db:get-run-status dbstruct params))
    ((get-run-state)                (apply db:get-run-state dbstruct params))

    ((set-run-status)               (apply db:set-run-status dbstruct params))
    ((set-run-state-status)  			 (apply db:set-run-state-status dbstruct params))
    ((update-tesdata-on-repilcate-db) (apply db:update-tesdata-on-repilcate-db  dbstruct params)) 
    ((get-tests-for-run)            (apply db:get-tests-for-run dbstruct params))
    ((get-tests-for-run-state-status) (apply db:get-tests-for-run-state-status dbstruct params))
    ((get-test-id)                  (apply db:get-test-id dbstruct params))
    ((get-tests-for-run-mindata)    (apply db:get-tests-for-run-mindata dbstruct params))







>







351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
    ((get-raw-run-stats)               (apply db:get-raw-run-stats dbstruct params))
    ((get-test-times)                  (apply db:get-test-times dbstruct params))

    ;; RUNS
    ((get-run-info)                 (apply db:get-run-info dbstruct params))
    ((get-run-status)               (apply db:get-run-status dbstruct params))
    ((get-run-state)                (apply db:get-run-state dbstruct params))
    ((get-run-state-status)         (apply db:get-run-state-status dbstruct params))
    ((set-run-status)               (apply db:set-run-status dbstruct params))
    ((set-run-state-status)  			 (apply db:set-run-state-status dbstruct params))
    ((update-tesdata-on-repilcate-db) (apply db:update-tesdata-on-repilcate-db  dbstruct params)) 
    ((get-tests-for-run)            (apply db:get-tests-for-run dbstruct params))
    ((get-tests-for-run-state-status) (apply db:get-tests-for-run-state-status dbstruct params))
    ((get-test-id)                  (apply db:get-test-id dbstruct params))
    ((get-tests-for-run-mindata)    (apply db:get-tests-for-run-mindata dbstruct params))

Modified db.scm from [66cca5d3c4] to [ffd8a763fc].

466
467
468
469
470
471
472

473

474
475
476
477
478
479
480
				 fname", delta: " (- time1 time2) " seconds, reason: "(cdr do-cp))
	       (db:lock-and-delta-sync no-sync-db dbstruct fname runid (db:get-keys dbstruct) db:initialize-main-db)
	       (hash-table-set! sync-durations (conc fname".db")
				(- (current-milliseconds) start-time)))
	     (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date")
         )))
     dbfiles)

    (if dbdat (dbfile:add-dbdat dbstruct #f dbdat)))

  #t)

;; options:
;;
;;  'killservers  - kills all servers
;;  'dejunk       - removes junk records
;;  'adj-testids  - move test-ids into correct ranges







>
|
>







466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
				 fname", delta: " (- time1 time2) " seconds, reason: "(cdr do-cp))
	       (db:lock-and-delta-sync no-sync-db dbstruct fname runid (db:get-keys dbstruct) db:initialize-main-db)
	       (hash-table-set! sync-durations (conc fname".db")
				(- (current-milliseconds) start-time)))
	     (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date")
         )))
     dbfiles)
    ;; WHY does the dbdat need to be added back?
    (if dbdat (dbfile:add-dbdat dbstruct #f dbdat))
    )
  #t)

;; options:
;;
;;  'killservers  - kills all servers
;;  'dejunk       - removes junk records
;;  'adj-testids  - move test-ids into correct ranges
596
597
598
599
600
601
602

603
604
605
606
607
608
609
     (lambda (subdb)
       (let* ((dbname (db:run-id->dbname run-id))
	      (mtdb   (dbr:subdb-mtdb subdb))
	      (tmpdb  (db:get-subdb dbstruct run-id))
	      (refndb (dbr:subdb-refndb subdb))
	      (newres (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) last-update tmpdb refndb mtdb)))
	 ;; (stack-push! (dbr:subdb-dbstack subdb) tmpdb)

	 (dbfile:add-dbdat dbstruct run-id tmpdb)
	 (set! res (cons newres res))))
     subdbs)
    res))

;;;; run-ids
;;    if #f use *db-local-sync* : or 'local-sync-flags







>







598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
     (lambda (subdb)
       (let* ((dbname (db:run-id->dbname run-id))
	      (mtdb   (dbr:subdb-mtdb subdb))
	      (tmpdb  (db:get-subdb dbstruct run-id))
	      (refndb (dbr:subdb-refndb subdb))
	      (newres (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) last-update tmpdb refndb mtdb)))
	 ;; (stack-push! (dbr:subdb-dbstack subdb) tmpdb)
	 ;; BUG: verify this is really needed
	 (dbfile:add-dbdat dbstruct run-id tmpdb)
	 (set! res (cons newres res))))
     subdbs)
    res))

;;;; run-ids
;;    if #f use *db-local-sync* : or 'local-sync-flags
902
903
904
905
906
907
908

909
910
911
912
913
914
915
	 db 
	 (conc
	  "SELECT d.id,d.archive_area_name,disk_path,last_df,last_df_time FROM archive_disks AS d
             INNER JOIN archive_blocks AS b ON d.id=b.archive_disk_id
             WHERE b.id IN (" (string-intersperse (map conc res) ",") ") AND
         last_df > ?;")
	 dneeded))

    (dbfile:add-dbdat dbstruct #f dbdat)
    blocks))
    
;; returns id of the record, register a disk allocated to archiving and record it's last known
;; available space
;;
(define (db:archive-register-disk dbstruct bdisk-name bdisk-path df)







>







905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
	 db 
	 (conc
	  "SELECT d.id,d.archive_area_name,disk_path,last_df,last_df_time FROM archive_disks AS d
             INNER JOIN archive_blocks AS b ON d.id=b.archive_disk_id
             WHERE b.id IN (" (string-intersperse (map conc res) ",") ") AND
         last_df > ?;")
	 dneeded))
    ;; BUG: Verfify this is really needed
    (dbfile:add-dbdat dbstruct #f dbdat)
    blocks))
    
;; returns id of the record, register a disk allocated to archiving and record it's last known
;; available space
;;
(define (db:archive-register-disk dbstruct bdisk-name bdisk-path df)
2064
2065
2066
2067
2068
2069
2070

2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083

2084
2085














2086
2087
2088
2089
2090
2091
2092
  (let ((res "n/a"))
    (db:with-db
     dbstruct #f #f
     (lambda (dbdat db)
       (sqlite3:for-each-row 
	(lambda (status)
	  (set! res status))

	db
	"SELECT status FROM runs WHERE id=?;" 
	run-id)
       res))))

(define (db:get-run-state dbstruct run-id)
  (let ((res "n/a"))
    (db:with-db
     dbstruct #f #f
     (lambda (dbdat db)
       (sqlite3:for-each-row 
	(lambda (status)
	  (set! res status))

	db
	"SELECT state FROM runs WHERE id=?;" 














	run-id)
       res))))


;;======================================================================
;; K E Y S
;;======================================================================







>
|
|











>
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
  (let ((res "n/a"))
    (db:with-db
     dbstruct #f #f
     (lambda (dbdat db)
       (sqlite3:for-each-row 
	(lambda (status)
	  (set! res status))
	(db:get-cache-stmth
	 dbdat db
	 "SELECT status FROM runs WHERE id=?;" )
	run-id)
       res))))

(define (db:get-run-state dbstruct run-id)
  (let ((res "n/a"))
    (db:with-db
     dbstruct #f #f
     (lambda (dbdat db)
       (sqlite3:for-each-row 
	(lambda (status)
	  (set! res status))
	(db:get-cache-stmth
	 dbdat db
	 "SELECT state FROM runs WHERE id=?;" )
	run-id)
       res))))

(define (db:get-run-state-status dbstruct run-id)
  (let ((res (cons "n/a" "n/a")))
    (db:with-db
     dbstruct #f #f
     (lambda (dbdat db)
       (sqlite3:for-each-row 
	(lambda (state status)
	  (set! res (cons state status)))
	(db:get-cache-stmth
	 dbdat db
	 "SELECT state,status FROM runs WHERE id=?;" )
	run-id)
       res))))


;;======================================================================
;; K E Y S
;;======================================================================
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
















2718
2719
2720
2721
2722
2723
2724
  (let* ((run-ids (db:get-all-run-ids mtdb)))
    (for-each 
     (lambda (run-id)
       (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
	 (db:prep-megatest.db-adj-test-ids (dbr:dbdat-dbh mtdb) run-id testrecs)))
     run-ids)))

;; Get test data using test_id, run-id is not used
;; 
(define (db:get-test-info-by-id dbstruct run-id test-id)
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (dbdat db)
     (let ((res #f))
       (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
	(lambda (id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update)
	  ;;                0    1       2      3      4        5       6      7        8     9     10      11          12          13           14         15          16
	  (set! res (vector id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update)))
	(db:get-cache-stmth dbdat db
			    (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id=?;"))
	test-id)
       res))))

















;; Use db:test-get* to access
;; Get test data using test_ids. NB// Only works within a single run!!
;;
(define (db:get-test-info-by-ids dbstruct run-id test-ids)
  (db:with-db
   dbstruct







|
















>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
  (let* ((run-ids (db:get-all-run-ids mtdb)))
    (for-each 
     (lambda (run-id)
       (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
	 (db:prep-megatest.db-adj-test-ids (dbr:dbdat-dbh mtdb) run-id testrecs)))
     run-ids)))

;; Get test data using test_id
;; 
(define (db:get-test-info-by-id dbstruct run-id test-id)
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (dbdat db)
     (let ((res #f))
       (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
	(lambda (id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update)
	  ;;                0    1       2      3      4        5       6      7        8     9     10      11          12          13           14         15          16
	  (set! res (vector id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update)))
	(db:get-cache-stmth dbdat db
			    (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id=?;"))
	test-id)
       res))))

;; Get test state, status using test_id
;; 
(define (db:get-test-state-status-by-id dbstruct run-id test-id)
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (dbdat db)
     (let ((res (cons #f #f)))
       (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
	(lambda (state status)
	  (cons state status))
	(db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=?;")
	test-id)
       res))))

;; Use db:test-get* to access
;; Get test data using test_ids. NB// Only works within a single run!!
;;
(define (db:get-test-info-by-ids dbstruct run-id test-ids)
  (db:with-db
   dbstruct

Modified dbfile.scm from [25f8271ef2] to [2ad26c948f].

35
36
37
38
39
40
41

42
43
44


45
46
47
48
49
50
51
	posix typed-records srfi-18 srfi-1
	srfi-69
	stack
	files
	ports

	commonmod

	)

;; (import debugprint)



;;======================================================================
;;  R E C O R D S
;;======================================================================

;; a single Megatest area with it's multiple dbs is
;; managed in a dbstruct







>


<
>
>







35
36
37
38
39
40
41
42
43
44

45
46
47
48
49
50
51
52
53
	posix typed-records srfi-18 srfi-1
	srfi-69
	stack
	files
	ports

	commonmod
	;; debugprint
	)


(define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic
(define num-run-dbs (make-parameter 10))     ;; number of db's in .megatest

;;======================================================================
;;  R E C O R D S
;;======================================================================

;; a single Megatest area with it's multiple dbs is
;; managed in a dbstruct
189
190
191
192
193
194
195







196
197
198
199
200
201
202
203
204
205
206
207
208
209
;;   (abandoned the idea of num/db)
;; 
(define (dbfile:run-id->path apath run-id)
  (conc apath"/"(dbfile:run-id->dbname run-id)))

(define (db:dbname->path apath dbname)
  (conc apath"/"dbname))








;; POTENTIAL BUG: this implementation could produce a db file if run-id is neither #f or a number
(define (dbfile:run-id->dbname run-id)
  (cond
   ((number? run-id) (conc ".megatest/" (modulo run-id 100) ".db"))
   ((not run-id)     (conc ".megatest/main.db"))
   (else             run-id)))

;; Make the dbstruct, setup up auxillary db's and call for main db at least once
;;
;; called in http-transport and replicated in rmt.scm for *local* access. 
;;
(define (dbfile:setup do-sync areapath tmppath)
  (cond







>
>
>
>
>
>
>



<
<
|
<







191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207


208

209
210
211
212
213
214
215
;;   (abandoned the idea of num/db)
;; 
(define (dbfile:run-id->path apath run-id)
  (conc apath"/"(dbfile:run-id->dbname run-id)))

(define (db:dbname->path apath dbname)
  (conc apath"/"dbname))

(define (dbfile:run-id->dbnum run-id)
  (cond
   ((number? run-id)
    (modulo run-id (num-run-dbs)))
   ((not run-id) "main")   ;; 0 or main?
   (else run-id)))

;; POTENTIAL BUG: this implementation could produce a db file if run-id is neither #f or a number
(define (dbfile:run-id->dbname run-id)


  (conc ".megatest/"(dbfile:run-id->dbnum run-id)".db"))


;; Make the dbstruct, setup up auxillary db's and call for main db at least once
;;
;; called in http-transport and replicated in rmt.scm for *local* access. 
;;
(define (dbfile:setup do-sync areapath tmppath)
  (cond
239
240
241
242
243
244
245
246
247




248
249
250
251
252
253
254
    (if (stack-empty? (dbr:subdb-dbstack subdb))
	#f
	(begin
	  (stack-pop! (dbr:subdb-dbstack subdb))))))

;; return a previously opened db handle to the stack of available handles
(define (dbfile:add-dbdat dbstruct run-id dbdat)
  (let* ((subdb (dbfile:get-subdb dbstruct run-id)))
    (stack-push! (dbr:subdb-dbstack subdb) dbdat)




    dbdat))

;; set up a subdb
;;
(define (dbfile:init-subdb dbstruct run-id init-proc)
  (let* ((dbname    (dbfile:run-id->dbname run-id))
	 (areapath  (dbr:dbstruct-areapath dbstruct))







|
|
>
>
>
>







245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
    (if (stack-empty? (dbr:subdb-dbstack subdb))
	#f
	(begin
	  (stack-pop! (dbr:subdb-dbstack subdb))))))

;; return a previously opened db handle to the stack of available handles
(define (dbfile:add-dbdat dbstruct run-id dbdat)
  (let* ((subdb (dbfile:get-subdb dbstruct run-id))
	 (dbstk (dbr:subdb-dbstack subdb))
	 (count (stack-count dbstk)))
    (if (> count 15)
	(dbfile:print-err "WARNING: stack for "run-id".db is "count"."))
    (stack-push! dbstk dbdat)
    dbdat))

;; set up a subdb
;;
(define (dbfile:init-subdb dbstruct run-id init-proc)
  (let* ((dbname    (dbfile:run-id->dbname run-id))
	 (areapath  (dbr:dbstruct-areapath dbstruct))
884
885
886
887
888
889
890
891

892
893
894
895
896
897
898
	     (append (list todb) slave-dbs)
           )
          )
        )
	tbls)
       (let* ((runtime      (- (current-milliseconds) start-time))
	      (should-print (or ;; (debug:debug-mode 12)
				(common:low-noise-print 120 "db sync" (> runtime 500))))) ;; low and high sync times treated as separate.

	 (for-each 
	  (lambda (dat)
	    (let ((tblname (car dat))
		  (count   (cdr dat)))
	      (set! tot-count (+ tot-count count))
              )) 
	  (sort (hash-table->alist numrecs)(lambda (a b)(> (cdr a)(cdr b))))))







|
>







894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
	     (append (list todb) slave-dbs)
           )
          )
        )
	tbls)
       (let* ((runtime      (- (current-milliseconds) start-time))
	      (should-print (or ;; (debug:debug-mode 12)
			     (common:low-noise-print 120 "db sync")
			     (> runtime 500)))) ;; low and high sync times treated as separate.
	 (for-each 
	  (lambda (dat)
	    (let ((tblname (car dat))
		  (count   (cdr dat)))
	      (set! tot-count (+ tot-count count))
              )) 
	  (sort (hash-table->alist numrecs)(lambda (a b)(> (cdr a)(cdr b))))))
1002
1003
1004
1005
1006
1007
1008




1009

1010
1011
1012
1013
1014
1015
1016

1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046

1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
  (let* ((dbdat (dbfile:open-db dbstruct run-id dbinit)))
    (set! *db-write-access* (not (dbr:dbdat-read-only dbdat)))
    ;; (mutex-unlock! *db-open-mutex*)
    dbdat))

(define dbfile:db-init-proc (make-parameter #f))





(define keep-age-param (make-parameter 10))

(define qif-slope      (make-parameter 100))

;; create a dropping near the db file in a qif dir
;; use count of such files to gate queries (queries in flight)
;;
(define (dbfile:wait-for-qif fname run-id params)
  (let* ((thedir  (pathname-directory fname))

	 (destdir (conc thedir"/qif-"run-id))
	 (uniqn   (get-area-path-signature (conc (or run-id "main") params)))
	 (crumbn  (conc destdir"/"(current-seconds)"-"uniqn"."(current-process-id))))
    (if (not (file-exists? destdir))(create-directory (conc destdir"/attic") #t))
    (let loop ((count 0))
      (let* ((currlks (glob (conc destdir"/*")))
	     (numqrys (length currlks))
	     (delayval (cond ;; do a droopish curve
			((> numqrys 50)
			 (if (> numqrys 50)
			     (for-each
			      (lambda (f)
				(if (> (- (current-seconds)
					  (handle-exceptions
					      exn
					    (current-seconds) ;; file is likely gone, just fake out
					    (file-modification-time f)))
				       (keep-age-param))
				    (let* ((basedir (pathname-directory f))
					   (filen   (pathname-file f))
					   (destf   (conc basedir"/attic/"filen)))
				      (dbfile:print-err "Moving qif file "f" older than 10 seconds to "destf)
				      ;; (delete-file* f)
				      (handle-exceptions
					  exn
					#t
					(file-move f destf #t)))))
			      currlks))
			 1) ;; 50 and above => 1
			((> numqrys 10) (* numqrys (/ 1 (qif-slope)))) ;; slope of 1/100

			;; ((> numqrys 30) 0.50)
			;; ((> numqrys 25) 0.20)
			;; ((> numqrys 20) 0.10)
			;; ((> numqrys 15) 0.05)
			;; ((> numqrys 10) 0.01)
			(else #f))))
	(if (and delayval
		 (< count 5))
	    (begin
	      (thread-sleep! delayval)
	      (loop (+ count 1))))))
    (with-output-to-file crumbn







>
>
>
>
|
>
|






>
|
|






|
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<
<
>
|
<
<
<
<







1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042

1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060


1061
1062




1063
1064
1065
1066
1067
1068
1069
  (let* ((dbdat (dbfile:open-db dbstruct run-id dbinit)))
    (set! *db-write-access* (not (dbr:dbdat-read-only dbdat)))
    ;; (mutex-unlock! *db-open-mutex*)
    dbdat))

(define dbfile:db-init-proc (make-parameter #f))

;; in xmaxima this gives a curve close to what I want:
;;    plot2d ((exp(x/1.2)-1)/300, [x, 0, 10])$
;;    plot2d ((exp(x/1.5)-1)/40, [x, 0, 10])$
;;    plot2d ((exp(x/5)-1)/40, [x, 0, 20])$
(define (dbfile:droop x)
  (/ (- (exp (/ x 5)) 1) 40))
  ;; (* numqrys (/ 1 (qif-slope))))

;; create a dropping near the db file in a qif dir
;; use count of such files to gate queries (queries in flight)
;;
(define (dbfile:wait-for-qif fname run-id params)
  (let* ((thedir  (pathname-directory fname))
	 (dbnum   (dbfile:run-id->dbnum run-id))
	 (destdir (conc thedir"/qif-"dbnum))
	 (uniqn   (get-area-path-signature (conc dbnum params)))
	 (crumbn  (conc destdir"/"(current-seconds)"-"uniqn"."(current-process-id))))
    (if (not (file-exists? destdir))(create-directory (conc destdir"/attic") #t))
    (let loop ((count 0))
      (let* ((currlks (glob (conc destdir"/*")))
	     (numqrys (length currlks))
	     (delayval (cond ;; do a droopish curve
			((> numqrys 25)

			 (for-each
			  (lambda (f)
			    (if (> (- (current-seconds)
				      (handle-exceptions
					  exn
					(current-seconds) ;; file is likely gone, just fake out
					(file-modification-time f)))
				   (keep-age-param))
				(let* ((basedir (pathname-directory f))
				       (filen   (pathname-file f))
				       (destf   (conc basedir"/attic/"filen)))
				  (dbfile:print-err "Moving qif file "f" older than 10 seconds to "destf)
				  ;; (delete-file* f)
				  (handle-exceptions
				      exn
				    #t
				    (file-move f destf #t)))))
			  currlks)


			 4)
			((> numqrys 0)  (dbfile:droop numqrys)) ;; slope of 1/100




			(else #f))))
	(if (and delayval
		 (< count 5))
	    (begin
	      (thread-sleep! delayval)
	      (loop (+ count 1))))))
    (with-output-to-file crumbn

Modified launch.scm from [34af87f4bd] to [a705b2e0b2].

27
28
29
30
31
32
33

34
35
36
37
38
39
40
41


42
43
44
45
46
47
48

(import (prefix base64 base64:))
(import (prefix sqlite3 sqlite3:))

(declare (unit launch))
(declare (uses subrun))
(declare (uses common))

(declare (uses configf))
(declare (uses db))
(declare (uses ezsteps))

(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "megatest-fossil-hash.scm")



;;======================================================================
;; ezsteps
;;======================================================================

;; ezsteps were going to be coded as
;; stepname[,predstep1,predstep2 ...] [{VAR1=first,second,third}] command to execute







>








>
>







27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

(import (prefix base64 base64:))
(import (prefix sqlite3 sqlite3:))

(declare (unit launch))
(declare (uses subrun))
(declare (uses common))
(declare (uses commonmod))
(declare (uses configf))
(declare (uses db))
(declare (uses ezsteps))

(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "megatest-fossil-hash.scm")

(import commonmod)

;;======================================================================
;; ezsteps
;;======================================================================

;; ezsteps were going to be coded as
;; stepname[,predstep1,predstep2 ...] [{VAR1=first,second,third}] command to execute
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
			(debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))
                        (file-close status-file)
                        )

                        ))))))

(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
  (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30")))
         (start-seconds (current-seconds))
	 (calc-minutes  (lambda ()
			  (inexact->exact 
			   (round 
			    (- 
			     (current-seconds) 
			     start-seconds)))))







|







206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
			(debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))
                        (file-close status-file)
                        )

                        ))))))

(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
  (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "60")))
         (start-seconds (current-seconds))
	 (calc-minutes  (lambda ()
			  (inexact->exact 
			   (round 
			    (- 
			     (current-seconds) 
			     start-seconds)))))
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263

264
265
266
267
268
269
270
271
                                   (delta (abs (- df disk-free))))
                              (if (and (> df 0)
                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                  df
                                  #f)))
             (do-sync       (or new-cpu-load new-disk-free over-time))

             (test-info   (rmt:get-test-info-by-id run-id test-id))
             (state       (db:test-get-state test-info))
             (status      (db:test-get-status test-info))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))
        ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        (cond
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)

        (launch:handle-zombie-tests run-id)
        (when do-sync
          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          ;; (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))
	  )







|
|
|

















>
|







240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
                                   (delta (abs (- df disk-free))))
                              (if (and (> df 0)
                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                  df
                                  #f)))
             (do-sync       (or new-cpu-load new-disk-free over-time))

             (test-info   (rmt:get-test-state-status-by-id run-id test-id))
             (state       (car test-info));; (db:test-get-state test-info))
             (status      (cdr test-info));; (db:test-get-status test-info))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))
        ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        (cond
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty
	    (launch:handle-zombie-tests run-id))
        (when do-sync
          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          ;; (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))
	  )
763
764
765
766
767
768
769
770
771
772
773
774

775
776
777
778






779

780
781
782
783
784
785
786
;; if dead safe to mark the test as killed in the db
;; State/status table
;; new
;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup
;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here

(define (launch:end-of-run-check run-id )
    (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
           (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
           (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))

           (current-state (rmt:get-run-state run-id))
           (current-status (rmt:get-run-status run-id)))
     ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
     (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)                      






     (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)

     (runs:update-junit-test-reporter-xml run-id) 
     (cond 
       ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" ))
                (if (and (equal? (rmt:get-var (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id)))
                (begin
           	(debug:print 4 *default-log-port* "look for  post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var (conc "end-of-run-" run-id)))
                (debug:print 0 *default-log-port* "End of Run Detected.")







|


|

>
|
|
|
|
>
>
>
>
>
>
|
>







767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
;; if dead safe to mark the test as killed in the db
;; State/status table
;; new
;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup
;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here
(define *last-rollup* 0)
(define (launch:end-of-run-check run-id )
    (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
           (running-cnt       (rmt:get-count-tests-running-for-run-id run-id))
           (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
	   (current-state-status (rmt:get-run-state-status run-id))
           (current-state        (car current-state-status))  ;; (rmt:get-run-state run-id))
           (current-status       (cdr current-state-status))) ;; (rmt:get-run-status run-id)))
      ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
      (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)
      ;;
      ;; TODO: add a final rollup when run is done (if there isn't one already)
      ;;
      (if (or (< running-cnt 3)                              ;; have only few running
	      (> (- (current-seconds) *last-rollup*) 10))    ;; or haven't rolled up in past ten seconds
	  (begin
	    (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
	    (set! *last-rollup* (current-seconds))))
     (runs:update-junit-test-reporter-xml run-id) 
     (cond 
       ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" ))
                (if (and (equal? (rmt:get-var (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id)))
                (begin
           	(debug:print 4 *default-log-port* "look for  post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var (conc "end-of-run-" run-id)))
                (debug:print 0 *default-log-port* "End of Run Detected.")

Modified rmt.scm from [771d7d8ec4] to [22216f2b37].

535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551



552
553
554
555
556
557
558
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:general-call 'register-test run-id run-id test-name item-path))

(define (rmt:get-test-id run-id testname item-path)
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))

;; run-id is NOT used
;;
(define (rmt:get-test-info-by-id run-id test-id)
  (if (number? test-id)
      (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
      (begin
	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
	(print-call-chain (current-error-port))
	#f)))




(define (rmt:test-get-rundir-from-test-id run-id test-id)
  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))

(define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f))
  (assert (number? run-id) "FATAL: Run id required.")
  (let* ((test-path (if (string? work-area)
			work-area







<
<








>
>
>







535
536
537
538
539
540
541


542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:general-call 'register-test run-id run-id test-name item-path))

(define (rmt:get-test-id run-id testname item-path)
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))



(define (rmt:get-test-info-by-id run-id test-id)
  (if (number? test-id)
      (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
      (begin
	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
	(print-call-chain (current-error-port))
	#f)))

(define (rmt:get-test-state-status-by-id run-id test-id)
  (rmt:send-receive 'get-test-state-status-by-id run-id (list run-id test-id)))

(define (rmt:test-get-rundir-from-test-id run-id test-id)
  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))

(define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f))
  (assert (number? run-id) "FATAL: Run id required.")
  (let* ((test-path (if (string? work-area)
			work-area
797
798
799
800
801
802
803



804
805
806
807
808
809
810
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'get-run-status #f (list run-id)))

(define (rmt:get-run-state run-id)
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'get-run-state #f (list run-id)))





(define (rmt:set-run-status run-id run-status #!key (msg #f))
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'set-run-status #f (list run-id run-status msg)))

(define (rmt:set-run-state-status run-id state status )
  (assert (number? run-id) "FATAL: Run id required.")







>
>
>







798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'get-run-status #f (list run-id)))

(define (rmt:get-run-state run-id)
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'get-run-state #f (list run-id)))

(define (rmt:get-run-state-status run-id)
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'get-run-state-status #f (list run-id)))

(define (rmt:set-run-status run-id run-status #!key (msg #f))
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'set-run-status #f (list run-id run-status msg)))

(define (rmt:set-run-state-status run-id state status )
  (assert (number? run-id) "FATAL: Run id required.")

Modified runs.scm from [dd48ee309d] to [e0e242ce63].

1861
1862
1863
1864
1865
1866
1867

1868
1869
1870
1871
1872
1873
1874
1875
                                   (my-item-path (item-list->path my-itemdat))

                                   (newtestname (db:test-make-full-name hed my-item-path)))    ;; test names are unique on testname/item-path
                              (tests:testqueue-set-items!     new-test-record #f)
                              (tests:testqueue-set-itemdat!   new-test-record my-itemdat)
                              (tests:testqueue-set-item_path! new-test-record my-item-path)
                              (hash-table-set! test-records newtestname new-test-record)

                              (set! tal (append tal (list newtestname)))))  ;; since these are itemized create new test names testname/itempath
                          items-in-testpatt)))
          
          

	  ;; At this point we have possibly added items to tal but all must be handed off to 
	  ;; INNER COND logic. I think loop without rotating the queue 
	  ;; (loop hed tal reg reruns))







>
|







1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
                                   (my-item-path (item-list->path my-itemdat))

                                   (newtestname (db:test-make-full-name hed my-item-path)))    ;; test names are unique on testname/item-path
                              (tests:testqueue-set-items!     new-test-record #f)
                              (tests:testqueue-set-itemdat!   new-test-record my-itemdat)
                              (tests:testqueue-set-item_path! new-test-record my-item-path)
                              (hash-table-set! test-records newtestname new-test-record)
			      ;; BUG: This next line sucks up a lot of horsepower
			      (set! tal (append tal (list newtestname)))))  ;; since these are itemized create new test names testname/itempath
                          items-in-testpatt)))
          
          

	  ;; At this point we have possibly added items to tal but all must be handed off to 
	  ;; INNER COND logic. I think loop without rotating the queue 
	  ;; (loop hed tal reg reruns))

Modified tests.scm from [5c2006972a] to [b17b9fa8b7].

1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
;;======================================================================
;; test steps
;;======================================================================

;; teststep-set-status! used to be here

(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
  (let* ((testdat   (rmt:get-test-info-by-id run-id test-id)))
    (and testdat
	 (equal? (test:get-state testdat) "KILLREQ"))))

(define (test:tdb-get-rundat-count tdb)
  (if tdb
      (let ((res 0))
	(sqlite3:for-each-row
	 (lambda (count)
	   (set! res count))







|

|







1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
;;======================================================================
;; test steps
;;======================================================================

;; teststep-set-status! used to be here

(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
  (let* ((testdat   (rmt:get-test-state-status-by-id run-id test-id)))
    (and testdat
	 (equal? (car testdat) "KILLREQ"))))

(define (test:tdb-get-rundat-count tdb)
  (if tdb
      (let ((res 0))
	(sqlite3:for-each-row
	 (lambda (count)
	   (set! res count))