Megatest

Changes On Branch 7b86032d25a8b4e5
Login

Changes In Branch v1.80-dbperformance Excluding Merge-Ins

This is equivalent to a diff from 19861e6399 to 7b86032d25

2023-02-10
21:06
Merging pretty good branch v1.80-dbperf to v1.80 check-in: 7170e5f43b user: matt tags: v1.80
20:19
Use debugprint module in dbfile module as stepping stone to replacing old debug:print calls with new. Closed-Leaf check-in: 0e8fa15f1d user: matt tags: v1.80-debugprint
2023-02-06
19:35
Squashed v1.80-dbperformance into one commit check-in: 6c7b8be468 user: matt tags: v1.80-dbperf
2023-02-05
11:47
Minor clean up. There were a couple communication errors in sixtyfivek but they looked likely to be host related. Closed-Leaf check-in: e973b1fb77 user: matt tags: v1.80-cleanup
08:36
wip, close idle db connections check-in: 97a3c4ad11 user: matt tags: v1.80-close-idle-connections
2023-02-04
20:00
Change default number of db's to 10, couple minor fixes. Closed-Leaf check-in: 7b86032d25 user: matt tags: v1.80-dbperformance
2023-02-03
21:51
Reduce zombie cleanup, not clear it helps check-in: c8ec459e79 user: matt tags: v1.80-dbperformance
02:16
Reduce load from get-state-status-and-roll-up-run. check-in: 4e634eb46a user: matt tags: v1.80-dbperformance
2023-02-02
12:54
Use an actual droop check-in: 19861e6399 user: matt tags: v1.80
09:27
Change couple queries to use prepared statements check-in: a2e41e0613 user: matt tags: v1.80

Modified api.scm from [0676a2f9d1] to [f3cc459c57].

37
38
39
40
41
42
43

44
45
46
47
48
49
50
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51







+







(define api:read-only-queries
  '(get-key-val-pairs
    get-var
    get-keys
    get-key-vals
    test-toplevel-num-items
    get-test-info-by-id
    get-test-state-status-by-id
    get-steps-info-by-id
    get-data-info-by-id
    test-get-rundir-from-test-id
    get-count-tests-running-for-testname
    get-count-tests-running
    get-count-tests-running-in-jobgroup
    get-previous-test-run-record
326
327
328
329
330
331
332

333
334
335
336
337
338
339
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341







+








    ;; ARCHIVES
    ((test-get-archive-block-info)     (apply db:test-get-archive-block-info dbstruct params))
    
    ;; TESTS
    ((test-toplevel-num-items)         (apply db:test-toplevel-num-items dbstruct params))
    ((get-test-info-by-id)	       (apply db:get-test-info-by-id dbstruct params))
    ((get-test-state-status-by-id)     (apply db:get-test-state-status-by-id dbstruct params))
    ((test-get-rundir-from-test-id)    (apply db:test-get-rundir-from-test-id dbstruct params))
    ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params))
    ((get-count-tests-running)         (apply db:get-count-tests-running dbstruct params))
    ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params))
    ;; ((delete-test-step-records)        (apply db:delete-test-step-records dbstruct params))
    ;; ((get-previous-test-run-record)    (apply db:get-previous-test-run-record dbstruct params))
    ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params))
349
350
351
352
353
354
355

356
357
358
359
360
361
362
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365







+







    ((get-raw-run-stats)               (apply db:get-raw-run-stats dbstruct params))
    ((get-test-times)                  (apply db:get-test-times dbstruct params))

    ;; RUNS
    ((get-run-info)                 (apply db:get-run-info dbstruct params))
    ((get-run-status)               (apply db:get-run-status dbstruct params))
    ((get-run-state)                (apply db:get-run-state dbstruct params))
    ((get-run-state-status)         (apply db:get-run-state-status dbstruct params))
    ((set-run-status)               (apply db:set-run-status dbstruct params))
    ((set-run-state-status)  			 (apply db:set-run-state-status dbstruct params))
    ((update-tesdata-on-repilcate-db) (apply db:update-tesdata-on-repilcate-db  dbstruct params)) 
    ((get-tests-for-run)            (apply db:get-tests-for-run dbstruct params))
    ((get-tests-for-run-state-status) (apply db:get-tests-for-run-state-status dbstruct params))
    ((get-test-id)                  (apply db:get-test-id dbstruct params))
    ((get-tests-for-run-mindata)    (apply db:get-tests-for-run-mindata dbstruct params))

Modified db.scm from [66cca5d3c4] to [ffd8a763fc].

466
467
468
469
470
471
472

473


474
475
476
477
478
479
480
466
467
468
469
470
471
472
473

474
475
476
477
478
479
480
481
482







+
-
+
+







				 fname", delta: " (- time1 time2) " seconds, reason: "(cdr do-cp))
	       (db:lock-and-delta-sync no-sync-db dbstruct fname runid (db:get-keys dbstruct) db:initialize-main-db)
	       (hash-table-set! sync-durations (conc fname".db")
				(- (current-milliseconds) start-time)))
	     (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date")
         )))
     dbfiles)
    ;; WHY does the dbdat need to be added back?
    (if dbdat (dbfile:add-dbdat dbstruct #f dbdat)))
    (if dbdat (dbfile:add-dbdat dbstruct #f dbdat))
    )
  #t)

;; options:
;;
;;  'killservers  - kills all servers
;;  'dejunk       - removes junk records
;;  'adj-testids  - move test-ids into correct ranges
596
597
598
599
600
601
602

603
604
605
606
607
608
609
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612







+







     (lambda (subdb)
       (let* ((dbname (db:run-id->dbname run-id))
	      (mtdb   (dbr:subdb-mtdb subdb))
	      (tmpdb  (db:get-subdb dbstruct run-id))
	      (refndb (dbr:subdb-refndb subdb))
	      (newres (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) last-update tmpdb refndb mtdb)))
	 ;; (stack-push! (dbr:subdb-dbstack subdb) tmpdb)
	 ;; BUG: verify this is really needed
	 (dbfile:add-dbdat dbstruct run-id tmpdb)
	 (set! res (cons newres res))))
     subdbs)
    res))

;;;; run-ids
;;    if #f use *db-local-sync* : or 'local-sync-flags
902
903
904
905
906
907
908

909
910
911
912
913
914
915
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919







+







	 db 
	 (conc
	  "SELECT d.id,d.archive_area_name,disk_path,last_df,last_df_time FROM archive_disks AS d
             INNER JOIN archive_blocks AS b ON d.id=b.archive_disk_id
             WHERE b.id IN (" (string-intersperse (map conc res) ",") ") AND
         last_df > ?;")
	 dneeded))
    ;; BUG: Verfify this is really needed
    (dbfile:add-dbdat dbstruct #f dbdat)
    blocks))
    
;; returns id of the record, register a disk allocated to archiving and record it's last known
;; available space
;;
(define (db:archive-register-disk dbstruct bdisk-name bdisk-path df)
2064
2065
2066
2067
2068
2069
2070

2071
2072


2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083

2084
2085
















2086
2087
2088
2089
2090
2091
2092
2068
2069
2070
2071
2072
2073
2074
2075


2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089


2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112







+
-
-
+
+











+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







  (let ((res "n/a"))
    (db:with-db
     dbstruct #f #f
     (lambda (dbdat db)
       (sqlite3:for-each-row 
	(lambda (status)
	  (set! res status))
	(db:get-cache-stmth
	db
	"SELECT status FROM runs WHERE id=?;" 
	 dbdat db
	 "SELECT status FROM runs WHERE id=?;" )
	run-id)
       res))))

(define (db:get-run-state dbstruct run-id)
  (let ((res "n/a"))
    (db:with-db
     dbstruct #f #f
     (lambda (dbdat db)
       (sqlite3:for-each-row 
	(lambda (status)
	  (set! res status))
	(db:get-cache-stmth
	db
	"SELECT state FROM runs WHERE id=?;" 
	 dbdat db
	 "SELECT state FROM runs WHERE id=?;" )
	run-id)
       res))))

(define (db:get-run-state-status dbstruct run-id)
  (let ((res (cons "n/a" "n/a")))
    (db:with-db
     dbstruct #f #f
     (lambda (dbdat db)
       (sqlite3:for-each-row 
	(lambda (state status)
	  (set! res (cons state status)))
	(db:get-cache-stmth
	 dbdat db
	 "SELECT state,status FROM runs WHERE id=?;" )
	run-id)
       res))))


;;======================================================================
;; K E Y S
;;======================================================================
2694
2695
2696
2697
2698
2699
2700
2701

2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
















2718
2719
2720
2721
2722
2723
2724
2714
2715
2716
2717
2718
2719
2720

2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760







-
+
















+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







  (let* ((run-ids (db:get-all-run-ids mtdb)))
    (for-each 
     (lambda (run-id)
       (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
	 (db:prep-megatest.db-adj-test-ids (dbr:dbdat-dbh mtdb) run-id testrecs)))
     run-ids)))

;; Get test data using test_id, run-id is not used
;; Get test data using test_id
;; 
(define (db:get-test-info-by-id dbstruct run-id test-id)
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (dbdat db)
     (let ((res #f))
       (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
	(lambda (id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update)
	  ;;                0    1       2      3      4        5       6      7        8     9     10      11          12          13           14         15          16
	  (set! res (vector id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update)))
	(db:get-cache-stmth dbdat db
			    (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id=?;"))
	test-id)
       res))))

;; Get test state, status using test_id
;; 
(define (db:get-test-state-status-by-id dbstruct run-id test-id)
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (dbdat db)
     (let ((res (cons #f #f)))
       (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
	(lambda (state status)
	  (cons state status))
	(db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=?;")
	test-id)
       res))))

;; Use db:test-get* to access
;; Get test data using test_ids. NB// Only works within a single run!!
;;
(define (db:get-test-info-by-ids dbstruct run-id test-ids)
  (db:with-db
   dbstruct

Modified dbfile.scm from [25f8271ef2] to [2ad26c948f].

35
36
37
38
39
40
41

42
43
44


45
46
47
48
49
50
51
35
36
37
38
39
40
41
42
43
44

45
46
47
48
49
50
51
52
53







+


-
+
+







	posix typed-records srfi-18 srfi-1
	srfi-69
	stack
	files
	ports

	commonmod
	;; debugprint
	)

;; (import debugprint)
(define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic
(define num-run-dbs (make-parameter 10))     ;; number of db's in .megatest

;;======================================================================
;;  R E C O R D S
;;======================================================================

;; a single Megatest area with it's multiple dbs is
;; managed in a dbstruct
189
190
191
192
193
194
195







196
197
198
199
200
201

202
203
204
205
206
207
208
209
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207



208

209
210
211
212
213
214
215







+
+
+
+
+
+
+



-
-
-
+
-







;;   (abandoned the idea of num/db)
;; 
(define (dbfile:run-id->path apath run-id)
  (conc apath"/"(dbfile:run-id->dbname run-id)))

(define (db:dbname->path apath dbname)
  (conc apath"/"dbname))

(define (dbfile:run-id->dbnum run-id)
  (cond
   ((number? run-id)
    (modulo run-id (num-run-dbs)))
   ((not run-id) "main")   ;; 0 or main?
   (else run-id)))

;; POTENTIAL BUG: this implementation could produce a db file if run-id is neither #f or a number
(define (dbfile:run-id->dbname run-id)
  (cond
   ((number? run-id) (conc ".megatest/" (modulo run-id 100) ".db"))
   ((not run-id)     (conc ".megatest/main.db"))
  (conc ".megatest/"(dbfile:run-id->dbnum run-id)".db"))
   (else             run-id)))

;; Make the dbstruct, setup up auxillary db's and call for main db at least once
;;
;; called in http-transport and replicated in rmt.scm for *local* access. 
;;
(define (dbfile:setup do-sync areapath tmppath)
  (cond
239
240
241
242
243
244
245
246
247






248
249
250
251
252
253
254
245
246
247
248
249
250
251


252
253
254
255
256
257
258
259
260
261
262
263
264







-
-
+
+
+
+
+
+







    (if (stack-empty? (dbr:subdb-dbstack subdb))
	#f
	(begin
	  (stack-pop! (dbr:subdb-dbstack subdb))))))

;; return a previously opened db handle to the stack of available handles
(define (dbfile:add-dbdat dbstruct run-id dbdat)
  (let* ((subdb (dbfile:get-subdb dbstruct run-id)))
    (stack-push! (dbr:subdb-dbstack subdb) dbdat)
  (let* ((subdb (dbfile:get-subdb dbstruct run-id))
	 (dbstk (dbr:subdb-dbstack subdb))
	 (count (stack-count dbstk)))
    (if (> count 15)
	(dbfile:print-err "WARNING: stack for "run-id".db is "count"."))
    (stack-push! dbstk dbdat)
    dbdat))

;; set up a subdb
;;
(define (dbfile:init-subdb dbstruct run-id init-proc)
  (let* ((dbname    (dbfile:run-id->dbname run-id))
	 (areapath  (dbr:dbstruct-areapath dbstruct))
884
885
886
887
888
889
890
891


892
893
894
895
896
897
898
894
895
896
897
898
899
900

901
902
903
904
905
906
907
908
909







-
+
+







	     (append (list todb) slave-dbs)
           )
          )
        )
	tbls)
       (let* ((runtime      (- (current-milliseconds) start-time))
	      (should-print (or ;; (debug:debug-mode 12)
				(common:low-noise-print 120 "db sync" (> runtime 500))))) ;; low and high sync times treated as separate.
			     (common:low-noise-print 120 "db sync")
			     (> runtime 500)))) ;; low and high sync times treated as separate.
	 (for-each 
	  (lambda (dat)
	    (let ((tblname (car dat))
		  (count   (cdr dat)))
	      (set! tot-count (+ tot-count count))
              )) 
	  (sort (hash-table->alist numrecs)(lambda (a b)(> (cdr a)(cdr b))))))
1002
1003
1004
1005
1006
1007
1008




1009
1010



1011
1012
1013
1014
1015
1016

1017
1018


1019
1020
1021
1022
1023
1024
1025

1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044


















1045
1046
1047


1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023


1024
1025
1026
1027
1028
1029
1030
1031
1032
1033


1034
1035
1036
1037
1038
1039
1040
1041

1042



















1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060



1061
1062




1063
1064
1065
1066
1067
1068
1069







+
+
+
+
-
-
+
+
+






+
-
-
+
+






-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
-
-
-
-







  (let* ((dbdat (dbfile:open-db dbstruct run-id dbinit)))
    (set! *db-write-access* (not (dbr:dbdat-read-only dbdat)))
    ;; (mutex-unlock! *db-open-mutex*)
    dbdat))

(define dbfile:db-init-proc (make-parameter #f))

;; in xmaxima this gives a curve close to what I want:
;;    plot2d ((exp(x/1.2)-1)/300, [x, 0, 10])$
;;    plot2d ((exp(x/1.5)-1)/40, [x, 0, 10])$
;;    plot2d ((exp(x/5)-1)/40, [x, 0, 20])$
(define keep-age-param (make-parameter 10))
(define qif-slope      (make-parameter 100))
(define (dbfile:droop x)
  (/ (- (exp (/ x 5)) 1) 40))
  ;; (* numqrys (/ 1 (qif-slope))))

;; create a dropping near the db file in a qif dir
;; use count of such files to gate queries (queries in flight)
;;
(define (dbfile:wait-for-qif fname run-id params)
  (let* ((thedir  (pathname-directory fname))
	 (dbnum   (dbfile:run-id->dbnum run-id))
	 (destdir (conc thedir"/qif-"run-id))
	 (uniqn   (get-area-path-signature (conc (or run-id "main") params)))
	 (destdir (conc thedir"/qif-"dbnum))
	 (uniqn   (get-area-path-signature (conc dbnum params)))
	 (crumbn  (conc destdir"/"(current-seconds)"-"uniqn"."(current-process-id))))
    (if (not (file-exists? destdir))(create-directory (conc destdir"/attic") #t))
    (let loop ((count 0))
      (let* ((currlks (glob (conc destdir"/*")))
	     (numqrys (length currlks))
	     (delayval (cond ;; do a droopish curve
			((> numqrys 50)
			((> numqrys 25)
			 (if (> numqrys 50)
			     (for-each
			      (lambda (f)
				(if (> (- (current-seconds)
					  (handle-exceptions
					      exn
					    (current-seconds) ;; file is likely gone, just fake out
					    (file-modification-time f)))
				       (keep-age-param))
				    (let* ((basedir (pathname-directory f))
					   (filen   (pathname-file f))
					   (destf   (conc basedir"/attic/"filen)))
				      (dbfile:print-err "Moving qif file "f" older than 10 seconds to "destf)
				      ;; (delete-file* f)
				      (handle-exceptions
					  exn
					#t
					(file-move f destf #t)))))
			      currlks))
			 (for-each
			  (lambda (f)
			    (if (> (- (current-seconds)
				      (handle-exceptions
					  exn
					(current-seconds) ;; file is likely gone, just fake out
					(file-modification-time f)))
				   (keep-age-param))
				(let* ((basedir (pathname-directory f))
				       (filen   (pathname-file f))
				       (destf   (conc basedir"/attic/"filen)))
				  (dbfile:print-err "Moving qif file "f" older than 10 seconds to "destf)
				  ;; (delete-file* f)
				  (handle-exceptions
				      exn
				    #t
				    (file-move f destf #t)))))
			  currlks)
			 1) ;; 50 and above => 1
			((> numqrys 10) (* numqrys (/ 1 (qif-slope)))) ;; slope of 1/100
			;; ((> numqrys 30) 0.50)
			 4)
			((> numqrys 0)  (dbfile:droop numqrys)) ;; slope of 1/100
			;; ((> numqrys 25) 0.20)
			;; ((> numqrys 20) 0.10)
			;; ((> numqrys 15) 0.05)
			;; ((> numqrys 10) 0.01)
			(else #f))))
	(if (and delayval
		 (< count 5))
	    (begin
	      (thread-sleep! delayval)
	      (loop (+ count 1))))))
    (with-output-to-file crumbn

Modified launch.scm from [34af87f4bd] to [a705b2e0b2].

27
28
29
30
31
32
33

34
35
36
37
38
39
40
41


42
43
44
45
46
47
48
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51







+








+
+








(import (prefix base64 base64:))
(import (prefix sqlite3 sqlite3:))

(declare (unit launch))
(declare (uses subrun))
(declare (uses common))
(declare (uses commonmod))
(declare (uses configf))
(declare (uses db))
(declare (uses ezsteps))

(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "megatest-fossil-hash.scm")

(import commonmod)

;;======================================================================
;; ezsteps
;;======================================================================

;; ezsteps were going to be coded as
;; stepname[,predstep1,predstep2 ...] [{VAR1=first,second,third}] command to execute
203
204
205
206
207
208
209
210

211
212
213
214
215
216
217
206
207
208
209
210
211
212

213
214
215
216
217
218
219
220







-
+







			(debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))
                        (file-close status-file)
                        )

                        ))))))

(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
  (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30")))
  (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "60")))
         (start-seconds (current-seconds))
	 (calc-minutes  (lambda ()
			  (inexact->exact 
			   (round 
			    (- 
			     (current-seconds) 
			     start-seconds)))))
237
238
239
240
241
242
243
244
245
246



247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263

264

265
266
267
268
269
270
271
240
241
242
243
244
245
246



247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267

268
269
270
271
272
273
274
275







-
-
-
+
+
+

















+
-
+







                                   (delta (abs (- df disk-free))))
                              (if (and (> df 0)
                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                  df
                                  #f)))
             (do-sync       (or new-cpu-load new-disk-free over-time))

             (test-info   (rmt:get-test-info-by-id run-id test-id))
             (state       (db:test-get-state test-info))
             (status      (db:test-get-status test-info))
             (test-info   (rmt:get-test-state-status-by-id run-id test-id))
             (state       (car test-info));; (db:test-get-state test-info))
             (status      (cdr test-info));; (db:test-get-status test-info))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))
        ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        (cond
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty
        (launch:handle-zombie-tests run-id)
	    (launch:handle-zombie-tests run-id))
        (when do-sync
          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          ;; (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))
	  )
763
764
765
766
767
768
769
770

771
772
773

774

775
776
777
778
779












780
781
782
783
784
785
786
767
768
769
770
771
772
773

774
775
776

777
778
779





780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798







-
+


-
+

+
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+







;; if dead safe to mark the test as killed in the db
;; State/status table
;; new
;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup
;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here

(define *last-rollup* 0)
(define (launch:end-of-run-check run-id )
    (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
           (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
           (running-cnt       (rmt:get-count-tests-running-for-run-id run-id))
           (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
	   (current-state-status (rmt:get-run-state-status run-id))
           (current-state (rmt:get-run-state run-id))
           (current-status (rmt:get-run-status run-id)))
     ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
     (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)                      
     (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
           (current-state        (car current-state-status))  ;; (rmt:get-run-state run-id))
           (current-status       (cdr current-state-status))) ;; (rmt:get-run-status run-id)))
      ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
      (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)
      ;;
      ;; TODO: add a final rollup when run is done (if there isn't one already)
      ;;
      (if (or (< running-cnt 3)                              ;; have only few running
	      (> (- (current-seconds) *last-rollup*) 10))    ;; or haven't rolled up in past ten seconds
	  (begin
	    (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
	    (set! *last-rollup* (current-seconds))))
     (runs:update-junit-test-reporter-xml run-id) 
     (cond 
       ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" ))
                (if (and (equal? (rmt:get-var (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id)))
                (begin
           	(debug:print 4 *default-log-port* "look for  post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var (conc "end-of-run-" run-id)))
                (debug:print 0 *default-log-port* "End of Run Detected.")

Modified rmt.scm from [771d7d8ec4] to [22216f2b37].

535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551



552
553
554
555
556
557
558
535
536
537
538
539
540
541


542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559







-
-








+
+
+







  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:general-call 'register-test run-id run-id test-name item-path))

(define (rmt:get-test-id run-id testname item-path)
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))

;; run-id is NOT used
;;
(define (rmt:get-test-info-by-id run-id test-id)
  (if (number? test-id)
      (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
      (begin
	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
	(print-call-chain (current-error-port))
	#f)))

(define (rmt:get-test-state-status-by-id run-id test-id)
  (rmt:send-receive 'get-test-state-status-by-id run-id (list run-id test-id)))

(define (rmt:test-get-rundir-from-test-id run-id test-id)
  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))

(define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f))
  (assert (number? run-id) "FATAL: Run id required.")
  (let* ((test-path (if (string? work-area)
			work-area
797
798
799
800
801
802
803



804
805
806
807
808
809
810
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814







+
+
+







  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'get-run-status #f (list run-id)))

(define (rmt:get-run-state run-id)
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'get-run-state #f (list run-id)))

(define (rmt:get-run-state-status run-id)
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'get-run-state-status #f (list run-id)))

(define (rmt:set-run-status run-id run-status #!key (msg #f))
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:send-receive 'set-run-status #f (list run-id run-status msg)))

(define (rmt:set-run-state-status run-id state status )
  (assert (number? run-id) "FATAL: Run id required.")

Modified runs.scm from [dd48ee309d] to [e0e242ce63].

1861
1862
1863
1864
1865
1866
1867

1868

1869
1870
1871
1872
1873
1874
1875
1861
1862
1863
1864
1865
1866
1867
1868

1869
1870
1871
1872
1873
1874
1875
1876







+
-
+







                                   (my-item-path (item-list->path my-itemdat))

                                   (newtestname (db:test-make-full-name hed my-item-path)))    ;; test names are unique on testname/item-path
                              (tests:testqueue-set-items!     new-test-record #f)
                              (tests:testqueue-set-itemdat!   new-test-record my-itemdat)
                              (tests:testqueue-set-item_path! new-test-record my-item-path)
                              (hash-table-set! test-records newtestname new-test-record)
			      ;; BUG: This next line sucks up a lot of horsepower
                              (set! tal (append tal (list newtestname)))))  ;; since these are itemized create new test names testname/itempath
			      (set! tal (append tal (list newtestname)))))  ;; since these are itemized create new test names testname/itempath
                          items-in-testpatt)))
          
          

	  ;; At this point we have possibly added items to tal but all must be handed off to 
	  ;; INNER COND logic. I think loop without rotating the queue 
	  ;; (loop hed tal reg reruns))

Modified tests.scm from [5c2006972a] to [b17b9fa8b7].

1964
1965
1966
1967
1968
1969
1970
1971

1972
1973

1974
1975
1976
1977
1978
1979
1980
1964
1965
1966
1967
1968
1969
1970

1971
1972

1973
1974
1975
1976
1977
1978
1979
1980







-
+

-
+







;;======================================================================
;; test steps
;;======================================================================

;; teststep-set-status! used to be here

(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
  (let* ((testdat   (rmt:get-test-info-by-id run-id test-id)))
  (let* ((testdat   (rmt:get-test-state-status-by-id run-id test-id)))
    (and testdat
	 (equal? (test:get-state testdat) "KILLREQ"))))
	 (equal? (car testdat) "KILLREQ"))))

(define (test:tdb-get-rundat-count tdb)
  (if tdb
      (let ((res 0))
	(sqlite3:for-each-row
	 (lambda (count)
	   (set! res count))