Megatest

Check-in [6cb8704b45]
Login
Overview
Comment:Cherrypick 0a9e69 aa2998 63e558 b487e8 e28be4 50237f 64aa934, archive fixes, do not save 0 cpus to cache file, reduce frequency of notification of load checking, generate example fix, reduce from 10 to 5 number of times to try getting test info, use sha sum for log file name when it gets too long, reduce default inter-test-delay from 2 seconds to 0.1
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65 | v1.6548
Files: files | file ages | folders
SHA1: 6cb8704b4542bd97818391a7b9d437c76f5c419c
User & Date: mrwellan on 2020-05-31 21:34:50
Other Links: branch diff | manifest | tags
Context
2020-06-01
12:35
merged in v1.65-junit-xml check-in: ab9f96c331 user: pjhatwal tags: v1.65
2020-05-31
22:25
Merged just to create a common ancestor. No actual changes on v1.65 found (other than irrelevant change in generated html of manual) check-in: cc07d0a6af user: mrwellan tags: v1.66
21:34
Cherrypick 0a9e69 aa2998 63e558 b487e8 e28be4 50237f 64aa934, archive fixes, do not save 0 cpus to cache file, reduce frequency of notification of load checking, generate example fix, reduce from 10 to 5 number of times to try getting test info, use sha sum for log file name when it gets too long, reduce default inter-test-delay from 2 seconds to 0.1 check-in: 6cb8704b45 user: mrwellan tags: v1.65, v1.6548
21:24
Cherrypick of 5de2f 06451 0364b 07ff7 a374d. Support for save and get in archive. check-in: b7937c6a6f user: mrwellan tags: v1.65
Changes

Modified archive.scm from [80337cecee] to [212cc6c596].

88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
	(hash-table-ref blockid-cache key)
	(let* ((pscript     (configf:lookup *configdat* "archive" "pathscript"))
	       (pscript-cmd (conc pscript " " testsuite-name " " target " " run-name " " test-name))
	       (apath       (if pscript
				(handle-exceptions
				 exn
				 (begin
				   (debug:print 0 "ERROR: script \"" pscript-cmd "\" failed to run properly.")
				   (exit 1))
				 (with-input-from-pipe
				  pscript-cmd
				  read-line))
				#f)) ;; this is the user-calculated archive path
	       (adisks    (archive:get-archive-disks))
	       (best-disk (common:get-disk-with-most-free-space adisks dneeded)))







|







88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
	(hash-table-ref blockid-cache key)
	(let* ((pscript     (configf:lookup *configdat* "archive" "pathscript"))
	       (pscript-cmd (conc pscript " " testsuite-name " " target " " run-name " " test-name))
	       (apath       (if pscript
				(handle-exceptions
				 exn
				 (begin
				   (debug:print 0 *default-log-port* "ERROR: script \"" pscript-cmd "\" failed to run properly.")
				   (exit 1))
				 (with-input-from-pipe
				  pscript-cmd
				  read-line))
				#f)) ;; this is the user-calculated archive path
	       (adisks    (archive:get-archive-disks))
	       (best-disk (common:get-disk-with-most-free-space adisks dneeded)))
114
115
116
117
118
119
120


121


122
123
124
125
126
127
128
129
130
		     (archive-path  (conc bdisk-path "/" archive-name))
		     (block-id      (rmt:archive-register-block-name bdisk-id archive-path)))
		;;   (allocation-id (rmt:archive-allocate-testsuite/area-to-block block-id testsuite-name area-key)))
		(if block-id ;; (and block-id allocation-id)
		    (let ((res (cons block-id archive-path)))
		      (hash-table-set! blockid-cache key res)
		      res)


		    #f))


	      #f)) ;; no best disk found
	  )))

;; archive - run bup
;;
;; 1. create the bup dir if not exists
;; 2. start the du of each directory
;; 3. gen index
;; 4. save







>
>
|
>
>
|
<







114
115
116
117
118
119
120
121
122
123
124
125
126

127
128
129
130
131
132
133
		     (archive-path  (conc bdisk-path "/" archive-name))
		     (block-id      (rmt:archive-register-block-name bdisk-id archive-path)))
		;;   (allocation-id (rmt:archive-allocate-testsuite/area-to-block block-id testsuite-name area-key)))
		(if block-id ;; (and block-id allocation-id)
		    (let ((res (cons block-id archive-path)))
		      (hash-table-set! blockid-cache key res)
		      res)
		    (begin
		      (debug:print 0 *default-log-port* "WARNING: no disk found for " target ", " run-name ", " test-name ",  archive-path=" archive-path)
		      #f)))
	      (begin
		(debug:print 0 *default-log-port* "WARNING: no disk found for " target ", " run-name ", " test-name ", block-id=" block-id)
		#f)))))) ;; no best disk found


;; archive - run bup
;;
;; 1. create the bup dir if not exists
;; 2. start the du of each directory
;; 3. gen index
;; 4. save
244
245
246
247
248
249
250
251









252
253
254
255
256
257
258
     (lambda (test-base)
       (let* ((disk-group (hash-table-ref disk-groups test-base))
	      (arch-group (hash-table-ref arch-groups test-base))
	      (arch-info  (car arch-group)) ;; don't know yet how this will work, can I get more than one possibility?
	      (archive-id    (car arch-info))
	      (archive-dir   (cdr arch-info)))
	 (debug:print 0 *default-log-port* "Processing disk-group " test-base)
	 (let* ((test-paths (hash-table-ref disk-groups test-base)))









	   (if (not (common:file-exists? archive-dir))
	       (create-directory archive-dir #t))
	   (case archiver
	     ((bup) ;; Archive using bup
	      (let* ((bup-init-params  (list "-d" archive-dir "init"))
		     (bup-index-params (append (list "-d" archive-dir "index") test-paths))
		     (bup-save-params  (append (list "-d" archive-dir "save" ;; (conc "--strip-path=" linktree)







|
>
>
>
>
>
>
>
>
>







247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
     (lambda (test-base)
       (let* ((disk-group (hash-table-ref disk-groups test-base))
	      (arch-group (hash-table-ref arch-groups test-base))
	      (arch-info  (car arch-group)) ;; don't know yet how this will work, can I get more than one possibility?
	      (archive-id    (car arch-info))
	      (archive-dir   (cdr arch-info)))
	 (debug:print 0 *default-log-port* "Processing disk-group " test-base)
	 (let* ((test-paths-in (hash-table-ref disk-groups test-base))
		(test-paths    (if (args:get-arg "-include")
				   (let ((subpaths (string-split (args:get-arg "-include") ",")))
				     (apply append
					    (map (lambda (p)
						   (map (lambda (subp)
							  (conc p "/" subp))
							subpaths))
						 test-paths-in)))
				   test-paths-in)))
	   (if (not (common:file-exists? archive-dir))
	       (create-directory archive-dir #t))
	   (case archiver
	     ((bup) ;; Archive using bup
	      (let* ((bup-init-params  (list "-d" archive-dir "init"))
		     (bup-index-params (append (list "-d" archive-dir "index") test-paths))
		     (bup-save-params  (append (list "-d" archive-dir "save" ;; (conc "--strip-path=" linktree)
341
342
343
344
345
346
347
348



349
350
351
352
353
354
355
	      (mutex-unlock! rp-mutex)
	      (new-test-physical-path  (conc best-disk "/" test-partial-path))
	      (archive-block-id        (db:test-get-archived test-dat))
	      (archive-block-info      (rmt:test-get-archive-block-info archive-block-id))
	      (archive-path            (if (vector? archive-block-info)
					   (vector-ref archive-block-info 2) ;; look in db.scm for test-get-archive-block-info for the vector record info
					   #f)) ;; no archive found?
	      (archive-internal-path   (conc (common:get-testsuite-name) "-" run-id "/latest/" test-partial-path)))



	 
	 ;; some sanity checks, move an existing path out of the way - iif it is not a toplevel with children
	 ;;
	 (if (and (not toplevel/children)  ;; special handling needed for toplevel with children
		  prev-test-physical-path
		  (common:file-exists? prev-test-physical-path)) ;; what to do? abort or clean up or link it in?
	     (let* ((base (pathname-directory prev-test-physical-path))







|
>
>
>







353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
	      (mutex-unlock! rp-mutex)
	      (new-test-physical-path  (conc best-disk "/" test-partial-path))
	      (archive-block-id        (db:test-get-archived test-dat))
	      (archive-block-info      (rmt:test-get-archive-block-info archive-block-id))
	      (archive-path            (if (vector? archive-block-info)
					   (vector-ref archive-block-info 2) ;; look in db.scm for test-get-archive-block-info for the vector record info
					   #f)) ;; no archive found?
	      (archive-internal-path   (conc (common:get-testsuite-name) "-" run-id "/latest/" test-partial-path))
	      (include-paths           (args:get-arg "-include"))
	      (exclude-pattern         (args:get-arg "-exclude-rx"))
	      (exclude-file            (args:get-arg "-exclude-rx-from")))
	 
	 ;; some sanity checks, move an existing path out of the way - iif it is not a toplevel with children
	 ;;
	 (if (and (not toplevel/children)  ;; special handling needed for toplevel with children
		  prev-test-physical-path
		  (common:file-exists? prev-test-physical-path)) ;; what to do? abort or clean up or link it in?
	     (let* ((base (pathname-directory prev-test-physical-path))
438
439
440
441
442
443
444
445



446
447
448
449
450
451




452
453
454
455
456
457
458
459
		   (test-path         (conc linktree "/" test-partial-path))
		   (archive-block-id        (db:test-get-archived test-dat))
		   (archive-block-info      (rmt:test-get-archive-block-info archive-block-id))
		   (archive-path            (if (vector? archive-block-info)
						(vector-ref archive-block-info 2)
						#f))
		   (archive-internal-path   (conc (common:get-testsuite-name) "-" run-id
						  "/latest/" test-partial-path)))



	      
	      (if (and archive-path ;; no point in proceeding if there is no actual archive
		       (not toplevel/children))
		  (begin
		    (let* ((bup-restore-params  (list "-d" archive-path "restore" "-C" (or destpath "data")
						      ;; " " ;; What is the empty string for?




						      archive-internal-path)))
		      (debug:print-info 0 *default-log-port* "Restoring archived data to " (or destpath "data")
					" from archive in " archive-path " ... " archive-internal-path)
		      (run-n-wait bup-exe params: bup-restore-params print-cmd: #t)))
		  (let ((new-rem-tests (filter (lambda (tdat)
						 (or (not (eq? (db:test-get-id tdat) test-id))
						     (not (eq? (db:test-get-run_id tdat) run-id))))
					       rem-tests) ))







|
>
>
>




|
|
>
>
>
>
|







453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
		   (test-path         (conc linktree "/" test-partial-path))
		   (archive-block-id        (db:test-get-archived test-dat))
		   (archive-block-info      (rmt:test-get-archive-block-info archive-block-id))
		   (archive-path            (if (vector? archive-block-info)
						(vector-ref archive-block-info 2)
						#f))
		   (archive-internal-path   (conc (common:get-testsuite-name) "-" run-id
						  "/latest/" test-partial-path))
		   (include-paths           (args:get-arg "-include"))
		   (exclude-pattern         (args:get-arg "-exclude-rx"))
		   (exclude-file            (args:get-arg "-exclude-rx-from")))
	      
	      (if (and archive-path ;; no point in proceeding if there is no actual archive
		       (not toplevel/children))
		  (begin
		    (let* ((bup-restore-params (append (list "-d" archive-path "restore" "-C" (or destpath "data"))
						       ;; " " ;; What is the empty string for?
						       (if include-paths
							   (map (lambda (p)
								  (conc archive-internal-path "/" p))
								(string-split include-paths ","))
							   (list archive-internal-path)))))
		      (debug:print-info 0 *default-log-port* "Restoring archived data to " (or destpath "data")
					" from archive in " archive-path " ... " archive-internal-path)
		      (run-n-wait bup-exe params: bup-restore-params print-cmd: #t)))
		  (let ((new-rem-tests (filter (lambda (tdat)
						 (or (not (eq? (db:test-get-id tdat) test-id))
						     (not (eq? (db:test-get-run_id tdat) run-id))))
					       rem-tests) ))

Modified common.scm from [4d4a2441c8] to [e0587bf118].

482
483
484
485
486
487
488
489
490
491

492
493
494
495
496
497
498
	(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age
    (if (not (directory-exists? "logs"))(create-directory "logs"))
    (directory-fold 
     (lambda (file rem)
       (handle-exceptions
	exn
	(begin
	  (debug:print-info 0 *default-log-port* "unable to rotate log " file ", probably handled by another process.")
	  (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
	  (print-call-chain (current-error-port)))

	(let* ((fullname  (conc "logs/" file))
	       (mod-time  (file-modification-time fullname))
	       (file-age  (- (current-seconds) mod-time)))
	  (hash-table-set! all-files file mod-time)
	  (if (or (and (string-match "^.*.log" file)
		       (> (file-size fullname) 200000))
		  (and (string-match "^server-.*.log" file)







|
|
|
>







482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
	(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age
    (if (not (directory-exists? "logs"))(create-directory "logs"))
    (directory-fold 
     (lambda (file rem)
       (handle-exceptions
	exn
	(begin
	  (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore.")
	  (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
	  ;; (print-call-chain (current-error-port)) ;; 
	  )
	(let* ((fullname  (conc "logs/" file))
	       (mod-time  (file-modification-time fullname))
	       (file-age  (- (current-seconds) mod-time)))
	  (hash-table-set! all-files file mod-time)
	  (if (or (and (string-match "^.*.log" file)
		       (> (file-size fullname) 200000))
		  (and (string-match "^server-.*.log" file)
1195
1196
1197
1198
1199
1200
1201
1202












1203
1204
1205
1206
1207
1208
1209

;; use bash to expand a glob. Does NOT handle paths with spaces!
;;
(define (common:bash-glob instr)
  (string-split
   (with-input-from-pipe
       (conc "/bin/bash -c \"echo " instr "\"")
     read-line)))












  
;;======================================================================
;; T A R G E T S  ,   S T A T E ,   S T A T U S ,   
;;                    R U N N A M E    A N D   T E S T P A T T
;;======================================================================

;; (map print (map car (hash-table->alist (read-config "runconfigs.config" #f #t))))







|
>
>
>
>
>
>
>
>
>
>
>
>







1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222

;; use bash to expand a glob. Does NOT handle paths with spaces!
;;
(define (common:bash-glob instr)
  (string-split
   (with-input-from-pipe
       (conc "/bin/bash -c \"echo " instr "\"")
       read-line)))

;;======================================================================
;; Some safety net stuff
;;======================================================================

;; return input if it is a list or return null
(define (common:list-or-null inlst #!key (ovrd #f)(message #f))
  (if (list? inlst)
      inlst
      (begin
	(if message (debug:print-error 0 *default-log-port* message))
	(or ovrd '()))))
  
;;======================================================================
;; T A R G E T S  ,   S T A T E ,   S T A T U S ,   
;;                    R U N N A M E    A N D   T E S T P A T T
;;======================================================================

;; (map print (map car (hash-table->alist (read-config "runconfigs.config" #f #t))))
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
;; 			    (set! cpu-load newval))))))
;; 	      (car load-res))
;;     cpu-load))

;; get values from cached info from dropping file in logs dir
;;  e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 5))
  (if *toppath*
      (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))
	(if (and (file-exists? fullpath)
		 (file-read-access? fullpath))
	    (handle-exceptions
	     exn
	     #f
	     (debug:print 2 *default-log-port* "reading file " fullpath)
	     (let ((real-age (- (current-seconds)(file-change-time fullpath)))) 
	       (if (< real-age age)
		   (with-input-from-file fullpath read)
		   (begin
		     (debug:print 1 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
		     #f))))
	    (begin
	      (debug:print 2 *default-log-port* "not reading file " fullpath)
	      #f)))
      #f))
 
(define (common:write-cached-info key dtype dat)







|












|







1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
;; 			    (set! cpu-load newval))))))
;; 	      (car load-res))
;;     cpu-load))

;; get values from cached info from dropping file in logs dir
;;  e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 10))
  (if *toppath*
      (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))
	(if (and (file-exists? fullpath)
		 (file-read-access? fullpath))
	    (handle-exceptions
	     exn
	     #f
	     (debug:print 2 *default-log-port* "reading file " fullpath)
	     (let ((real-age (- (current-seconds)(file-change-time fullpath)))) 
	       (if (< real-age age)
		   (with-input-from-file fullpath read)
		   (begin
		     (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
		     #f))))
	    (begin
	      (debug:print 2 *default-log-port* "not reading file " fullpath)
	      #f)))
      #f))
 
(define (common:write-cached-info key dtype dat)
1979
1980
1981
1982
1983
1984
1985


1986
1987
1988
1989
1990
1991
1992
1993
					 numcpu)
				     (read-line))))))
	       (result (if remote-host
			   (with-input-from-pipe 
			       (conc "ssh " remote-host " cat /proc/cpuinfo")
			     proc)
			   (with-input-from-file "/proc/cpuinfo" proc))))


	  (if (> result 0)(common:write-cached-info actual-host "num-cpus" result))
	  result))))

;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5))
  (let ((num-cpus (common:get-num-cpus remote-host)))
    (if num-cpus







>
>
|







1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
					 numcpu)
				     (read-line))))))
	       (result (if remote-host
			   (with-input-from-pipe 
			       (conc "ssh " remote-host " cat /proc/cpuinfo")
			     proc)
			   (with-input-from-file "/proc/cpuinfo" proc))))
	  (if (and (number? result)
		   (> result 0))
	      (common:write-cached-info actual-host "num-cpus" result))
	  result))))

;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5))
  (let ((num-cpus (common:get-num-cpus remote-host)))
    (if num-cpus
2009
2010
2011
2012
2013
2014
2015


2016
2017
2018
2019
2020
2021
2022
2023
2024
		      maxload-in
		      (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
	 (loadjmp (- first next))
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) ))  )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously


    (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
		      ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)
    (cond
     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)







>
>
|
|







2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
		      maxload-in
		      (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
	 (loadjmp (- first next))
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) ))  )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
    ;; let's let the user know once in a long while that load checking is happening but not constantly report it
    (if (> (random 100) 75) ;; about 25% of the time
	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
			  ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp))
    (cond
     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)

Modified db.scm from [4dafe820e6] to [17795efac6].

1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     db
     "SELECT id FROM archive_blocks WHERE archive_disk_id=? AND disk_path=?;"
     bdisk-id archive-path)
    (if res ;; record exists, update du if applicable and return res
	(begin
	  (if du (sqlite3:execute db "UPDATE archive_blocks SET last_du=?,last_du_time=(strftime('%s','now'))
                                          WHERE archive_disk_id=? AND disk_path=?;"
				   bdisk-id archive-path du))
	  res)
	(begin
	  (sqlite3:execute db "INSERT OR REPLACE INTO archive_blocks (archive_disk_id,disk_path,last_du)
                                                        VALUES (?,?,?);"
			   bdisk-id archive-path (or du 0))
	  (db:archive-register-block-name dbstruct bdisk-id archive-path du: du)))
    (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat)
    res))


;; The "archived" field in tests is overloaded; 0 = not archived, > 0 archived in block with given id
;;
(define (db:test-set-archive-block-id dbstruct run-id test-id archive-block-id)







<
|

|
<




|







1462
1463
1464
1465
1466
1467
1468

1469
1470
1471

1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     db
     "SELECT id FROM archive_blocks WHERE archive_disk_id=? AND disk_path=?;"
     bdisk-id archive-path)
    (if res ;; record exists, update du if applicable and return res

	(if du (sqlite3:execute db "UPDATE archive_blocks SET last_du=?,last_du_time=(strftime('%s','now'))
                                          WHERE archive_disk_id=? AND disk_path=?;"
				bdisk-id archive-path du))

	(begin
	  (sqlite3:execute db "INSERT OR REPLACE INTO archive_blocks (archive_disk_id,disk_path,last_du)
                                                        VALUES (?,?,?);"
			   bdisk-id archive-path (or du 0))
	  (set! res (db:archive-register-block-name dbstruct bdisk-id archive-path du: du))))
    (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat)
    res))


;; The "archived" field in tests is overloaded; 0 = not archived, > 0 archived in block with given id
;;
(define (db:test-set-archive-block-id dbstruct run-id test-id archive-block-id)

Modified genexample.scm from [d3c1b1c11c] to [2597a6cc06].

154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
	(print "[fields]")
	(map (lambda (k)(print k " TEXT")) keys)
	(print "")
	(print "[setup]")
	(print "# Adjust max_concurrent_jobs to limit how much you load your machines")
	(print "max_concurrent_jobs 50\n")
	(print "# This is your link path. Avoid moving it once set.")
	(print "linktree " (common:real-path lntree))
	(print "\n# Job tools are more advanced ways to control how your jobs are launched")
	(print "[jobtools]\nuseshell yes\nlauncher nbfake\nmaxload 1.5\n")
	(print "# You can override environment variables for all your tests here")
	(print "[env-override]\nEXAMPLE_VAR example value\n")
	(print "# As you run more tests you may need to add additional disks, the names are arbitrary but must be unique")
	(print "[disks]\ndisk0 " (common:real-path firstd))))

    (print
     "==================

I'm now creating a runconfigs.config file for you with a default section.
You can use this file to set variables for your tests based on the \"target\" (the combination
of keys).








|





|
|







154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
	(print "[fields]")
	(map (lambda (k)(print k " TEXT")) keys)
	(print "")
	(print "[setup]")
	(print "# Adjust max_concurrent_jobs to limit how much you load your machines")
	(print "max_concurrent_jobs 50\n")
	(print "# This is your link path. Avoid moving it once set.")
	(print "linktree " lntree) ;; (common:real-path lntree))
	(print "\n# Job tools are more advanced ways to control how your jobs are launched")
	(print "[jobtools]\nuseshell yes\nlauncher nbfake\nmaxload 1.5\n")
	(print "# You can override environment variables for all your tests here")
	(print "[env-override]\nEXAMPLE_VAR example value\n")
	(print "# As you run more tests you may need to add additional disks, the names are arbitrary but must be unique")
	(print "[disks]\ndisk0 " firstd))) ;; (common:real-path firstd))))
    
    (print
     "==================

I'm now creating a runconfigs.config file for you with a default section.
You can use this file to set variables for your tests based on the \"target\" (the combination
of keys).

193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
	(print "# Override settings in ../runconfigs.config for user " (current-user-name) " here.")))
    
    ;; Now create a test and logpro file
    (print
     "==================

You now have the basic common files for your megatest setup. Next run
\"megatest -gen-test\" to create a test.

Thank you for using Megatest. 

You can edit your config files and create tests in the " path " directory

")))








|







193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
	(print "# Override settings in ../runconfigs.config for user " (current-user-name) " here.")))
    
    ;; Now create a test and logpro file
    (print
     "==================

You now have the basic common files for your megatest setup. Next run
\"megatest -create-test <testname>\" to create a test.

Thank you for using Megatest. 

You can edit your config files and create tests in the " path " directory

")))

Modified launch.scm from [6b5c8d69de] to [b881cd615c].

621
622
623
624
625
626
627

628







629
630
631
632
633
634
635
	    (set-signal-handler! signal/int sighand)
	    (set-signal-handler! signal/term sighand)
	    ) ;; (set-signal-handler! signal/stop sighand)
	  
	  ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART,
	  ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY*
	  ;;

	  (let* ((test-info (rmt:get-test-info-by-id run-id test-id))







		 (test-host (if test-info
				(db:test-get-host        test-info)
				(begin
				  (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
				  (exit))))
		 (test-pid  (db:test-get-process_id  test-info)))
	    (cond







>
|
>
>
>
>
>
>
>







621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
	    (set-signal-handler! signal/int sighand)
	    (set-signal-handler! signal/term sighand)
	    ) ;; (set-signal-handler! signal/stop sighand)
	  
	  ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART,
	  ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY*
	  ;;
	  (let* ((test-info (let loop ((tries 0))
			      (let ((tinfo (rmt:get-test-info-by-id run-id test-id)))
				(if tinfo
				    tinfo
				    (if (> tries 5)
					#f
					(begin
					  (thread-sleep! (+ 1 (* tries 10)))
					  (loop (+ tries 1))))))))
		 (test-host (if test-info
				(db:test-get-host        test-info)
				(begin
				  (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
				  (exit))))
		 (test-pid  (db:test-get-process_id  test-info)))
	    (cond
1122
1123
1124
1125
1126
1127
1128
1129

1130
1131
1132
1133
1134
1135
1136
		  (set! toppath      *toppath*)
		  (if (not *toppath*)
		      (begin
			(debug:print-error 0 *default-log-port* "you are not in a megatest area!")
			(exit 1)))
		  (setenv "MT_RUN_AREA_HOME" *toppath*)
		  ;; the seed read is done, now read runconfigs, cache it then read megatest.config one more time and cache it
		  (let* ((keys         (rmt:get-keys))

			 (key-vals     (keys:target->keyval keys target))
			 (linktree     (common:get-linktree)) ;; (or (getenv "MT_LINKTREE")(if *configdat* (configf:lookup *configdat* "setup" "linktree") #f)))
					;     (if *configdat*
					; 	   (configf:lookup *configdat* "setup" "linktree")
					; 	   (conc *toppath* "/lt"))))
			 (second-pass  (find-and-read-config
					mtconfig







|
>







1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
		  (set! toppath      *toppath*)
		  (if (not *toppath*)
		      (begin
			(debug:print-error 0 *default-log-port* "you are not in a megatest area!")
			(exit 1)))
		  (setenv "MT_RUN_AREA_HOME" *toppath*)
		  ;; the seed read is done, now read runconfigs, cache it then read megatest.config one more time and cache it
		  (let* ((keys         (common:list-or-null (rmt:get-keys)
							    message: "Failed to retrieve keys in launch.scm. Please report this to the developers."))
			 (key-vals     (keys:target->keyval keys target))
			 (linktree     (common:get-linktree)) ;; (or (getenv "MT_LINKTREE")(if *configdat* (configf:lookup *configdat* "setup" "linktree") #f)))
					;     (if *configdat*
					; 	   (configf:lookup *configdat* "setup" "linktree")
					; 	   (conc *toppath* "/lt"))))
			 (second-pass  (find-and-read-config
					mtconfig

Modified megatest.scm from [c4f77722c1] to [4aeaf761b4].

224
225
226
227
228
229
230
231

232
233
234
235
236
237
238
  -refdb2dat refdb        : convert refdb to sexp or to format specified by s-dumpmode
                            formats: perl, ruby, sqlite3, csv (for csv the -o param
                            will substitute %s for the sheet name in generating 
                            multiple sheets)
  -o                      : output file for refdb2dat (defaults to stdout)
  -archive cmd            : archive runs specified by selectors to one of disks specified
                            in the [archive-disks] section.
                            cmd: keep-html, restore, save, save-remove, get (use -dest to set destination)

  -generate-html          : create a simple html dashboard for browsing your runs
  -generate-html-structure  : create a top level html veiw to list targets/runs and a Run view within each run directory.  
  -list-run-time          : list time requered to complete runs. It supports following switches
                            -run-patt <patt> -target-patt <patt> -dumpmode <csv,json,plain-text>
  -list-test-time	   : list time requered to complete each test in a run. It following following arguments
                            -runname <patt> -target <patt> -dumpmode <csv,json,plain-text>








|
>







224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
  -refdb2dat refdb        : convert refdb to sexp or to format specified by s-dumpmode
                            formats: perl, ruby, sqlite3, csv (for csv the -o param
                            will substitute %s for the sheet name in generating 
                            multiple sheets)
  -o                      : output file for refdb2dat (defaults to stdout)
  -archive cmd            : archive runs specified by selectors to one of disks specified
                            in the [archive-disks] section.
                            cmd: keep-html, restore, save, save-remove, get (use 
                            -dest to set destination), -include path1,path2... to get or save specific files
  -generate-html          : create a simple html dashboard for browsing your runs
  -generate-html-structure  : create a top level html veiw to list targets/runs and a Run view within each run directory.  
  -list-run-time          : list time requered to complete runs. It supports following switches
                            -run-patt <patt> -target-patt <patt> -dumpmode <csv,json,plain-text>
  -list-test-time	   : list time requered to complete each test in a run. It following following arguments
                            -runname <patt> -target <patt> -dumpmode <csv,json,plain-text>

328
329
330
331
332
333
334


335
336
337



338
339
340
341
342
343
344
			"-setvars"
			"-set-state-status"

                        ;; move runs stuff here
                        "-remove-keep"           
			"-set-run-status"
			"-age"


			"-archive"
			"-actions"
			"-precmd"



			
			"-debug" ;; for *verbosity* > 2
			"-create-test"
			"-override-timeout"
			"-test-files"  ;; -test-paths is for listing all
			"-load"        ;; load and exectute a scheme file
			"-section"







>
>



>
>
>







329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
			"-setvars"
			"-set-state-status"

                        ;; move runs stuff here
                        "-remove-keep"           
			"-set-run-status"
			"-age"

			;; archive 
			"-archive"
			"-actions"
			"-precmd"
			"-include"
			"-exclude-rx"
			"-exclude-rx-from"
			
			"-debug" ;; for *verbosity* > 2
			"-create-test"
			"-override-timeout"
			"-test-files"  ;; -test-paths is for listing all
			"-load"        ;; load and exectute a scheme file
			"-section"
504
505
506
507
508
509
510
511
512
513






514
515
516
517
518
519
520
       (start-watchdog (null? no-watchdog-args-vals)))
  ;;(BB> "no-watchdog-args="no-watchdog-args "no-watchdog-args-vals="no-watchdog-args-vals) 
  (if start-watchdog
      (thread-start! *watchdog*)))


;; bracket open-output-file with code to make leading directory if it does not exist and handle exceptions
(define (open-logfile logpath)
  (condition-case
   (let* ((log-dir (or (pathname-directory logpath) ".")))






     (if (not (directory-exists? log-dir))
         (system (conc "mkdir -p " log-dir)))
     (open-output-file logpath))
   (exn ()
        (debug:print-error 0 *default-log-port* "Could not open log file for write: "logpath)
        (define *didsomething* #t)  
        (exit 1))))







|

|
>
>
>
>
>
>







510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
       (start-watchdog (null? no-watchdog-args-vals)))
  ;;(BB> "no-watchdog-args="no-watchdog-args "no-watchdog-args-vals="no-watchdog-args-vals) 
  (if start-watchdog
      (thread-start! *watchdog*)))


;; bracket open-output-file with code to make leading directory if it does not exist and handle exceptions
(define (open-logfile logpath-in)
  (condition-case
   (let* ((log-dir (or (pathname-directory logpath-in) "."))
	  (fname   (pathname-strip-directory logpath-in))
	  (logpath (if (> (string-length fname) 250)
		       (let ((newlogf (conc log-dir "/" (common:get-signature fname) ".log")))
			 (debug:print 0 *default-log-port* "WARNING: log file " logpath-in " path too long, converted to " newlogf)
			 newlogf)
		       logpath-in)))
     (if (not (directory-exists? log-dir))
         (system (conc "mkdir -p " log-dir)))
     (open-output-file logpath))
   (exn ()
        (debug:print-error 0 *default-log-port* "Could not open log file for write: "logpath)
        (define *didsomething* #t)  
        (exit 1))))

Modified runs.scm from [5ad6583ebc] to [af2d0a75d8].

235
236
237
238
239
240
241
242




243
244
245
246
247


248
249
250
251
252
253
254
(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)

  ;; Take advantage of a good place to exit if running the one-pass methodology
  (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
	   (args:get-arg "-one-pass"))
      (exit 0))

  (thread-sleep! (cond ;; BB: check with Matt.  Should this sleep move to cond clauses below where we determine we have too many jobs running rather than each time the and condition above is true (which seems like always)?




        	  ((> (runs:dat-can-run-more-tests-count runsdat) 20)
		   (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
                   (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.1) ;; was 2
		   );; obviously haven't had any work to do for a while
        	  (else 0)))


  
  (let* ((num-running             (rmt:get-count-tests-running run-id))
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
				    (if (string? jobg-count)
					(string->number jobg-count)
					jobg-count))))







|
>
>
>
>
|

<
|
|
>
>







235
236
237
238
239
240
241
242
243
244
245
246
247
248

249
250
251
252
253
254
255
256
257
258
259
(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)

  ;; Take advantage of a good place to exit if running the one-pass methodology
  (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
	   (args:get-arg "-one-pass"))
      (exit 0))

  (thread-sleep! (cond ;; BB: check with Matt.  Should this sleep move
		       ;; to cond clauses below where we determine we
		       ;; have too many jobs running rather than each
		       ;; time the and condition above is true (which
		       ;; seems like always)?
        	  ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
		   (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))

		   10)  ;; obviously haven't had any work to do for a while
		  (else
		   ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
		   (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01))))
  
  (let* ((num-running             (rmt:get-count-tests-running run-id))
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
				    (if (string? jobg-count)
					(string->number jobg-count)
					jobg-count))))
1279
1280
1281
1282
1283
1284
1285

1286
1287
1288
1289
1290
1291
1292
1293







1294
1295
1296
1297
1298
1299
1300
  (let ((curr-sec (current-seconds)))
    (if (> (- curr-sec (runs:gendat-inc-results-last-update *runs:general-data*)) 5) ;; at least five seconds since last update
	(let* ((run-dat  (or (runs:gendat-run-info *runs:general-data*)(rmt:get-run-info run-id)))
	       (runname  (or (runs:gendat-runname *runs:general-data*)
			     (db:get-value-by-header (db:get-rows run-dat)
						     (db:get-header run-dat) "runname")))
	       (target   (or (runs:gendat-target *runs:general-data*)(rmt:get-target run-id)))

	       (testsdat (rmt:get-tests-for-run run-id "%" '() '() ;; run-id testpatt states statuses
						#f #f ;; offset limit
						#f ;; not-in
						#f ;; sort-by
						#f ;; sort-order
						#f ;; get full data (not 'shortlist)
						(runs:gendat-inc-results-last-update *runs:general-data*) ;; last update time
						'dashboard)))







	  (if (not (runs:gendat-run-info *runs:general-data*))
	      (runs:gendat-run-info-set! *runs:general-data* run-dat))
	  (if (not (runs:gendat-runname  *runs:general-data*))
	      (runs:gendat-runname-set! *runs:general-data* runname))
	  (if (not (runs:gendat-target *runs:general-data*))
	      (runs:gendat-target-set! *runs:general-data* target))
	  (for-each







>
|
|
|
|
|
|
|
|
>
>
>
>
>
>
>







1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
  (let ((curr-sec (current-seconds)))
    (if (> (- curr-sec (runs:gendat-inc-results-last-update *runs:general-data*)) 5) ;; at least five seconds since last update
	(let* ((run-dat  (or (runs:gendat-run-info *runs:general-data*)(rmt:get-run-info run-id)))
	       (runname  (or (runs:gendat-runname *runs:general-data*)
			     (db:get-value-by-header (db:get-rows run-dat)
						     (db:get-header run-dat) "runname")))
	       (target   (or (runs:gendat-target *runs:general-data*)(rmt:get-target run-id)))
	       (testsdat (let ((res (rmt:get-tests-for-run
				     run-id "%" '() '() ;; run-id testpatt states statuses
				     #f #f ;; offset limit
				     #f ;; not-in
				     #f ;; sort-by
				     #f ;; sort-order
				     #f ;; get full data (not 'shortlist)
				     (runs:gendat-inc-results-last-update *runs:general-data*) ;; last update time
				     'dashboard)))
			   (if (list? res)
			       res
			       (begin
				 (debug:print-error
				  0 *default-log-port*
				  "FAILED TO GET DATA using rmt:get-tests-for-run. Notify developers if you see this. result: " res)
				 '())))))
	  (if (not (runs:gendat-run-info *runs:general-data*))
	      (runs:gendat-run-info-set! *runs:general-data* run-dat))
	  (if (not (runs:gendat-runname  *runs:general-data*))
	      (runs:gendat-runname-set! *runs:general-data* runname))
	  (if (not (runs:gendat-target *runs:general-data*))
	      (runs:gendat-target-set! *runs:general-data* target))
	  (for-each