Megatest

Check-in [b17e77a3ea]
Login
Overview
Comment:Cross-pollinated v1.65 run-testdat work. This likely still does not pass obfusc-skel.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | v1.65-sidework
Files: files | file ages | folders
SHA1: b17e77a3ea5b2ad32ae3d904f2b11bfcc4726d50
User & Date: matt on 2020-10-05 23:07:30
Other Links: branch diff | manifest | tags
Context
2020-10-05
23:07
Cross-pollinated v1.65 run-testdat work. This likely still does not pass obfusc-skel. Closed-Leaf check-in: b17e77a3ea user: matt tags: v1.65-sidework
22:49
Do not exit on failure to create directory - race conditons on NFS cause false fail scenarios - just keep going and cross your fingers... (cherrypicked from v1.6572) check-in: 05b253a452 user: matt tags: v1.65-sidework
Changes

Modified TODO from [da5eae4898] to [dcd0f52bc7].

14
15
16
17
18
19
20







21
22
23
24
25
26
27
# 
#     You should have received a copy of the GNU General Public License
#     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

TODO
====








WW15
. fill newview matrix with data, filter pipeline gui elements
. improve [script], especially indent handling

WW16
. split db into megatest.db (runs etc.) db/<something>.db
. release basic newview implementation







>
>
>
>
>
>
>







14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# 
#     You should have received a copy of the GNU General Public License
#     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

TODO
====

WW38
. Add test_rundat to no-sync ==> correction, put in <testdir>/.meta/test-run.dat
. Add STATE/STATUS transitions to .meta/test-run.dat or similar
. Swizzle update-test-rundat to operate on no-sync
. Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync
. On state/status change update tests table with duration

WW15
. fill newview matrix with data, filter pipeline gui elements
. improve [script], especially indent handling

WW16
. split db into megatest.db (runs etc.) db/<something>.db
. release basic newview implementation

Modified common.scm from [33c7316880] to [a37bf38bb2].

499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
;;          logs directory you wish to log-rotate.
;;
(define (common:rotate-logs)
  (let* ((all-files (make-hash-table))
	 (stats     (make-hash-table))
	 (inc-stat  (lambda (key)
		      (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1))))
	(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age
    (if (not (directory-exists? "logs"))(create-directory "logs"))
    (directory-fold 
     (lambda (file rem)
       (handle-exceptions
	exn
	(begin
	  (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore. exn=" exn)







|







499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
;;          logs directory you wish to log-rotate.
;;
(define (common:rotate-logs)
  (let* ((all-files (make-hash-table))
	 (stats     (make-hash-table))
	 (inc-stat  (lambda (key)
		      (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1))))
	(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "600")))) ;; name -> age
    (if (not (directory-exists? "logs"))(create-directory "logs"))
    (directory-fold 
     (lambda (file rem)
       (handle-exceptions
	exn
	(begin
	  (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore. exn=" exn)

Modified dashboard.scm from [935bf4d2df] to [d4733659e1].

210
211
212
213
214
215
216


217
218
219
220
221
222
223
;;
(define (dboard:common-set-tabdat! commondat tabnum tabdat)
  (hash-table-set!
   (dboard:commondat-tabdats commondat)
   tabnum
   tabdat))



;; gets and calls updater list based on curr-tab-num
;;
(define (dboard:common-run-curr-updaters commondat #!key (tab-num #f))
  (if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat
      (let* ((tnum     (or tab-num (dboard:commondat-curr-tab-num commondat)))
	     (updaters (hash-table-ref/default (dboard:commondat-updaters commondat)
					       tnum







>
>







210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
;;
(define (dboard:common-set-tabdat! commondat tabnum tabdat)
  (hash-table-set!
   (dboard:commondat-tabdats commondat)
   tabnum
   tabdat))

(define *updater-running* #f) ;; move this into one of the stucts

;; gets and calls updater list based on curr-tab-num
;;
(define (dboard:common-run-curr-updaters commondat #!key (tab-num #f))
  (if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat
      (let* ((tnum     (or tab-num (dboard:commondat-curr-tab-num commondat)))
	     (updaters (hash-table-ref/default (dboard:commondat-updaters commondat)
					       tnum
238
239
240
241
242
243
244
245


246
247
248
249
250
251
252
	 (curr-updaters (hash-table-ref/default (dboard:commondat-updaters commondat) tnum '())))
    (hash-table-set! (dboard:commondat-updaters commondat)
		     tnum
		     (cons updater curr-updaters))))

;; data for each specific tab goes here
;;
(defstruct dboard:tabdat 


  ;; runs
  ((allruns         '())                 : list)        ;; list of dboard:rundat records
  ((allruns-by-id    (make-hash-table))  : hash-table)  ;; hash of run-id -> dboard:rundat records
  ((done-runs       '())                 : list)        ;; list of runs already drawn
  ((not-done-runs   '())                 : list)        ;; list of runs not yet drawn
  (header            #f)                                ;; header for decoding the run records
  (keys              #f)                                ;; keys for this run (i.e. target components)







|
>
>







240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
	 (curr-updaters (hash-table-ref/default (dboard:commondat-updaters commondat) tnum '())))
    (hash-table-set! (dboard:commondat-updaters commondat)
		     tnum
		     (cons updater curr-updaters))))

;; data for each specific tab goes here
;;
(defstruct dboard:tabdat
  ;; flags
  ((already-running #f)                  : boolean)     ;; the updater is already running. skip
  ;; runs
  ((allruns         '())                 : list)        ;; list of dboard:rundat records
  ((allruns-by-id    (make-hash-table))  : hash-table)  ;; hash of run-id -> dboard:rundat records
  ((done-runs       '())                 : list)        ;; list of runs already drawn
  ((not-done-runs   '())                 : list)        ;; list of runs not yet drawn
  (header            #f)                                ;; header for decoding the run records
  (keys              #f)                                ;; keys for this run (i.e. target components)
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
;;
;;    NOTE: Yes, this is used
;;
(define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals)
  (let* ((start-time   (current-seconds))
	 (access-mode  (dboard:tabdat-access-mode tabdat))
         (num-to-get   (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get")
                                           "200")))
	 (states       (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat)))
	 (statuses     (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat)))
         (do-not-use-db-file-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab
         (do-not-use-query-timestamps   #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab
	 (sort-info    (get-curr-sort))
	 (sort-by      (vector-ref sort-info 1))
	 (sort-order   (vector-ref sort-info 2))







|







647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
;;
;;    NOTE: Yes, this is used
;;
(define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals)
  (let* ((start-time   (current-seconds))
	 (access-mode  (dboard:tabdat-access-mode tabdat))
         (num-to-get   (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get")
                                           "50"))) ;; was 200, which is fine in a normal run area.
	 (states       (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat)))
	 (statuses     (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat)))
         (do-not-use-db-file-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab
         (do-not-use-query-timestamps   #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab
	 (sort-info    (get-curr-sort))
	 (sort-by      (vector-ref sort-info 1))
	 (sort-order   (vector-ref sort-info 2))
899
900
901
902
903
904
905
906
907
908
909
910

911
912
913
914
915
916
917
918
919
920
921
922

923
924
925
926
927
928
929
		   (elapsed-time (- (current-seconds) start-time)))
	      (if (null? all-test-ids)
		  (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id)
		  (hash-table-set!    (dboard:tabdat-allruns-by-id tabdat) run-id run-struct))
	      (if (or (null? tal)
		      (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update
		  (begin
		    (when (> elapsed-time 2)   
                      (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
                      (let* ((old-val (iup:attribute *tim* "TIME"))
                             (new-val (number->string (inexact->exact (floor (* 2  (string->number old-val)))))))
                        (if (< (string->number new-val) 5000)

                            ((debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
                            (iup:attribute-set! *tim* "TIME" new-val))))


                      )
		    (dboard:tabdat-allruns-set! tabdat new-res)
		    maxtests)
		  (if (> (dboard:rundat-run-data-offset run-struct) 0)
		      (loop run tal new-res newmaxtests) ;; not done getting data for this run
		      (loop (car tal)(cdr tal) new-res newmaxtests)))))))
    (dboard:tabdat-filters-changed-set! tabdat #f)
    (dboard:update-tree tabdat runs-hash header tb)))


(define *collapsed* (make-hash-table))

(define (toggle-hide lnum uidat) ; fulltestname)
  (let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum))
	 (fulltestname (iup:attribute btn "TITLE"))
	 (parts        (string-split fulltestname "("))







|




>
|
|
<
<
<






|
>







903
904
905
906
907
908
909
910
911
912
913
914
915
916
917



918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
		   (elapsed-time (- (current-seconds) start-time)))
	      (if (null? all-test-ids)
		  (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id)
		  (hash-table-set!    (dboard:tabdat-allruns-by-id tabdat) run-id run-struct))
	      (if (or (null? tal)
		      (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update
		  (begin
		    #;(when (> elapsed-time 2)   
                      (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
                      (let* ((old-val (iup:attribute *tim* "TIME"))
                             (new-val (number->string (inexact->exact (floor (* 2  (string->number old-val)))))))
                        (if (< (string->number new-val) 5000)
                            (begin
			      (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
			      (iup:attribute-set! *tim* "TIME" new-val)))))



		    (dboard:tabdat-allruns-set! tabdat new-res)
		    maxtests)
		  (if (> (dboard:rundat-run-data-offset run-struct) 0)
		      (loop run tal new-res newmaxtests) ;; not done getting data for this run
		      (loop (car tal)(cdr tal) new-res newmaxtests)))))))
    (dboard:tabdat-filters-changed-set! tabdat #f)
    (dboard:update-tree tabdat runs-hash header tb)
    (dboard:tabdat-already-running-set! tabdat #f)))

(define *collapsed* (make-hash-table))

(define (toggle-hide lnum uidat) ; fulltestname)
  (let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum))
	 (fulltestname (iup:attribute btn "TITLE"))
	 (parts        (string-split fulltestname "("))
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
            sort-lb))) 
	)

        ;; insert extra widget here
        (if extra-widget
            extra-widget
            (iup:hbox)) ;; empty widget

        

        
        )))

     (let* ((status-toggles (map (lambda (status)
				   (iup:toggle (conc status)
					       #:fontsize 8 ;; btn-fontsz ;; "10"
					       ;; #:expand "HORIZONTAL"
					       #:action   (lambda (obj val)







<
<
<
<







2506
2507
2508
2509
2510
2511
2512




2513
2514
2515
2516
2517
2518
2519
            sort-lb))) 
	)

        ;; insert extra widget here
        (if extra-widget
            extra-widget
            (iup:hbox)) ;; empty widget




        )))

     (let* ((status-toggles (map (lambda (status)
				   (iup:toggle (conc status)
					       #:fontsize 8 ;; btn-fontsz ;; "10"
					       ;; #:expand "HORIZONTAL"
					       #:action   (lambda (obj val)
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747

3748
3749
3750
3751
3752
3753
3754
   tabdat
   (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%")
   (dboard:tabdat-numruns tabdat)
   (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%")
   ;; generate key patterns from the target stored in tabdat
   (let* ((dbkeys (dboard:tabdat-dbkeys tabdat)))
     (let ((fres   (if (dboard:tabdat-target tabdat)
                       (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
                         (map (lambda (k v)(list k v)) dbkeys ptparts))
                       (let ((res '()))
                         (for-each (lambda (key)
                                     (if (not (equal? key "runname"))
                                         (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
                                           (if val (set! res (cons (list key val) res))))))
                                   dbkeys)
                         res))))
       fres))))


(define (dashboard:runs-tab-updater commondat tab-num)
  (debug:catch-and-dump 
   (lambda ()
     (let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num))
	    (dbkeys (dboard:tabdat-dbkeys tabdat)))
       (dashboard:do-update-rundat tabdat)







|
|
|
|
|
|
|
|
|
|
>







3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
   tabdat
   (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%")
   (dboard:tabdat-numruns tabdat)
   (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%")
   ;; generate key patterns from the target stored in tabdat
   (let* ((dbkeys (dboard:tabdat-dbkeys tabdat)))
     (let ((fres   (if (dboard:tabdat-target tabdat)
		       (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
			 (map (lambda (k v)(list k v)) dbkeys ptparts))
		       (let ((res '()))
			 (for-each (lambda (key)
				     (if (not (equal? key "runname"))
					 (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
					   (if val (set! res (cons (list key val) res))))))
				   dbkeys)
			 res))))
       fres)))
  #;(set! *updater-running* #f))

(define (dashboard:runs-tab-updater commondat tab-num)
  (debug:catch-and-dump 
   (lambda ()
     (let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num))
	    (dbkeys (dboard:tabdat-dbkeys tabdat)))
       (dashboard:do-update-rundat tabdat)
3799
3800
3801
3802
3803
3804
3805
3806

3807
3808
3809
3810
3811
3812
3813

3814

3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
	 commondat 
	 (lambda ()
	   (dashboard:runs-tab-updater commondat 1))
	 tab-num: 2)
	(iup:callback-set! *tim*
			   "ACTION_CB"
			   (lambda (time-obj)
			     (let ((update-is-running #f))

			     (mutex-lock! (dboard:commondat-update-mutex commondat))
			     (set! update-is-running (dboard:commondat-updating commondat))
			     (if (not update-is-running)
			     (dboard:commondat-updating-set! commondat #t))
			     (mutex-unlock! (dboard:commondat-update-mutex commondat))
			     (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
			     (begin

			     (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)

			     (mutex-lock! (dboard:commondat-update-mutex commondat))
			     (dboard:commondat-updating-set! commondat #f)
			     (mutex-unlock! (dboard:commondat-update-mutex commondat)))
				   ))
			     1))))
      
      (let ((th1 (make-thread (lambda ()
				(thread-sleep! 1)
				(dboard:common-run-curr-updaters commondat 0) ;; force update of summary tab 
				) "update buttons once"))







|
>
|
|
|
|
|
|
|
>
|
>
|
|
|







3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
	 commondat 
	 (lambda ()
	   (dashboard:runs-tab-updater commondat 1))
	 tab-num: 2)
	(iup:callback-set! *tim*
			   "ACTION_CB"
			   (lambda (time-obj)
			     (if (not *updater-running*)
				 (begin
				   ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
				   ;; (set! update-is-running (dboard:commondat-updating commondat))
				   ;;(if (not update-is-running)
				   ;;	   (dboard:commondat-updating-set! commondat #t))
				   ;;(mutex-unlock! (dboard:commondat-update-mutex commondat))
				   ;;(if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
				   ;;	   (begin
				   (set! *updater-running* #t)
				   (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
				   (set! *updater-running* #f)
				   ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
				   ;;  (dboard:commondat-updating-set! commondat #f)
				   ;;   (mutex-unlock! (dboard:commondat-update-mutex commondat)))
				   ))
			     1))))
      
      (let ((th1 (make-thread (lambda ()
				(thread-sleep! 1)
				(dboard:common-run-curr-updaters commondat 0) ;; force update of summary tab 
				) "update buttons once"))

Modified db.scm from [fb3a18f52f] to [2c7b396933].

1774
1775
1776
1777
1778
1779
1780


1781
1782
1783
1784
















1785
1786
1787
1788
1789
1790
1791
1792
	  (debug:print 0 *default-log-port* "ERROR: cannot read " infile)
          (debug:print 0 *default-log-port* "ERROR: run-dir is " run-dir)
          #f
          )
        (with-input-from-file infile read-lines)
	)))



;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as
;;                          end_time,strftime('%s','now') as now from tests where state in
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));

















(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
  (let* ((incompleted '())
	 (oldlaunched '())
	 (toplevels   '())
          ;; The default running-deadtime is 720 seconds = 12 minutes.
          ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30))
         (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime")))







>
>
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|







1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
	  (debug:print 0 *default-log-port* "ERROR: cannot read " infile)
          (debug:print 0 *default-log-port* "ERROR: run-dir is " run-dir)
          #f
          )
        (with-input-from-file infile read-lines)
	)))

;; check duration against test-run.dat file if it exists and update the value in
;; the db if necessary
;;
(define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)
  (let* ((datf             (conc run-dir ".mt_data/test-run.dat"))
	 (modt             (if (and (file-exists? datf)
				    (file-read-access? datf))
			       (file-modification-time datf)
			       #f)) ;; (+ event-time run-duration))))
	 (alt-run-duration (if modt
			       (- modt event-time)
			       #f)))
    (if (and alt-run-duration
	     (> alt-run-duration run-duration))
	(begin
	  (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration)
	  (db:with-db
	   dbstruct #f #f
	   (lambda (db)
	     (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id)
	     #t)))
	#f))) ;; #f = we did NOT adjust the time
	      
(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
  (let* ((incompleted '())
	 (oldlaunched '())
	 (toplevels   '())
          ;; The default running-deadtime is 720 seconds = 12 minutes.
          ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30))
         (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime")))
1826
1827
1828
1829
1830
1831
1832

1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848

1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
	 ;; HOWEVER: this code in run:test seems to work fine
	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
	 ;;                     (db:test-get-run_duration testdat)))
	 ;;                    600) 
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)

	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
				    " event-time="event-time" run-duration="run-duration))))
	  stmth1
	  run-id running-deadtime) ;; default time 720 seconds
       
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)

	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
				    " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
				    " run-duration="run-duration)
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
	  stmth2
	  run-id remotehoststart-deadtime) ;; default time 230 seconds
	 
	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
	 ;;
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row







>
|
|
|
|
|
|
|
|
|
|
|


|


>
|
|
|
|
|
|
|
|
|
|
|







1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
	 ;; HOWEVER: this code in run:test seems to work fine
	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
	 ;;                     (db:test-get-run_duration testdat)))
	 ;;                    600) 
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
		(if (and (equal? uname "n/a")
			 (equal? item-path "")) ;; this is a toplevel test
		    ;; what to do with toplevel? call rollup?
		    (begin
		      (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		      (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		    (begin
		      (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
		      (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
					test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
					" event-time="event-time" run-duration="run-duration)))))
	  stmth1
	  run-id running-deadtime) ;; default time 720 seconds
	    
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
		(if (and (equal? uname "n/a")
			 (equal? item-path "")) ;; this is a toplevel test
		    ;; what to do with toplevel? call rollup?
		    (begin
		      (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		      (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		    (begin
		      (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
					" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
					" run-duration="run-duration)
		      (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))))
	  stmth2
	  run-id remotehoststart-deadtime) ;; default time 230 seconds
	 
	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
	 ;;
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
  (let* ((run-ids (db:get-all-run-ids mtdb)))
    (for-each 
     (lambda (run-id)
       (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
	 (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs)))
     run-ids)))

;; Get test data using test_id, run-id is not used
;; 
(define (db:get-test-info-by-id dbstruct run-id test-id)
  (db:with-db
   dbstruct
   #f ;; run-id
   #f
   (lambda (db)







|







3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
  (let* ((run-ids (db:get-all-run-ids mtdb)))
    (for-each 
     (lambda (run-id)
       (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
	 (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs)))
     run-ids)))

;; Get test data using test_id, run-id is not used - but it will be!
;; 
(define (db:get-test-info-by-id dbstruct run-id test-id)
  (db:with-db
   dbstruct
   #f ;; run-id
   #f
   (lambda (db)

Modified launch.scm from [6b92672067] to [3e2d417312].

203
204
205
206
207
208
209
210
211
212
213
214
215
216


217
218
219
220
221
222
223
			   (round 
			    (- 
			     (current-seconds) 
			     start-seconds)))))
	 (kill-tries 0))
    ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)

    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory)))
               (last-sync (current-seconds)))
      (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))


      (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
             (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                   (delta (abs (- load cpu-load))))
                              (if (> delta 0.1) ;; don't bother updating with small changes
                                  load
                                  #f)))
             (new-disk-free (let* ((df    (if over-time ;; only get df every 30 seconds







|





|
>
>







203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
			   (round 
			    (- 
			     (current-seconds) 
			     start-seconds)))))
	 (kill-tries 0))
    ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t)

    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory)))
               (last-sync (current-seconds)))
      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
      ;; top of loop encountered at "(current-seconds)" with
      ;; last-sync="last-sync))
      (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
             (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                   (delta (abs (- load cpu-load))))
                              (if (> delta 0.1) ;; don't bother updating with small changes
                                  load
                                  #f)))
             (new-disk-free (let* ((df    (if over-time ;; only get df every 30 seconds
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
             (do-sync       (or new-cpu-load new-disk-free over-time))

             (test-info   (rmt:get-test-info-by-id run-id test-id))
             (state       (db:test-get-state test-info))
             (status      (db:test-get-status test-info))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))
        (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        (cond
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (when do-sync
          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
        
	(if kill-job? 
	    (begin
              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
	      ;;       between tries?







|








|






|
<
<
<
|
<
<







233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256



257


258
259
260
261
262
263
264
             (do-sync       (or new-cpu-load new-disk-free over-time))

             (test-info   (rmt:get-test-info-by-id run-id test-id))
             (state       (db:test-get-state test-info))
             (status      (db:test-get-status test-info))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))
        #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        (cond
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (if do-sync ;; save meta data about the running of this test



	    (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))


	(if kill-job? 
	    (begin
              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
	      ;;       between tries?
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
	    (begin
	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
		  (loop (calc-minutes)
                        (or new-cpu-load cpu-load)
                        (or new-disk-free disk-free)
                        (if do-sync (current-seconds) last-sync)))))))
    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional


(define (launch:execute encoded-cmd)
  (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
	 (tconfigreg #f))
    (setenv "MT_CMDINFO" encoded-cmd)
    ;;(bb-check-path msg: "launch:execute incoming")







|







307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
	    (begin
	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
		  (loop (calc-minutes)
                        (or new-cpu-load cpu-load)
                        (or new-disk-free disk-free)
                        (if do-sync (current-seconds) last-sync)))))))
    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional


(define (launch:execute encoded-cmd)
  (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
	 (tconfigreg #f))
    (setenv "MT_CMDINFO" encoded-cmd)
    ;;(bb-check-path msg: "launch:execute incoming")
463
464
465
466
467
468
469



470
471
472
473
474
475
476
				(db:test-get-host        test-info)
				(begin
				  (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
				  (exit))))
		 (test-pid  (db:test-get-process_id  test-info)))
	    (cond
             ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.



	     ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
	      (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")

              (rmt:general-call 'set-test-start-time #f test-id)
              (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
	      ) ;; prime it for running







>
>
>







460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
				(db:test-get-host        test-info)
				(begin
				  (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
				  (exit))))
		 (test-pid  (db:test-get-process_id  test-info)))
	    (cond
             ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.
	     ;; ((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
	     ;;  (and (equal? (db:test-get-state test-info) "COMPLETED")                           ;; completed/abort => rerun if asked
	     ;;	       (member (db:test-get-status test-info) '("ABORT"))))
	     ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
	      (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")

              (rmt:general-call 'set-test-start-time #f test-id)
              (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
	      ) ;; prime it for running
769
770
771
772
773
774
775


776
777
778
779
780
781
782
783
                 (item-path (vector-ref running-test 11)))
			       	(debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed")
              (if (not (null? tal))
				  (loop (car tal) (cdr tal)))))))))))        
 
(define (launch:is-test-alive host pid)
  (if (and host pid (not (equal? host "n/a")))


      (let* ((cmd (conc "ssh " host " pstree -A " pid))
	     (output (with-input-from-pipe cmd read-lines)))
	(debug:print 2 *default-log-port* "Running " cmd " received " output)
	(if (eq? (length output) 0)
	   #f
	   #t))
      #t))
 







>
>
|







769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
                 (item-path (vector-ref running-test 11)))
			       	(debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed")
              (if (not (null? tal))
				  (loop (car tal) (cdr tal)))))))))))        
 
(define (launch:is-test-alive host pid)
  (if (and host pid (not (equal? host "n/a")))
      (let* ((is-local (equal? host (get-host-name)))
	     (ssh-cmd   (if is-local " " (conc "ssh " host " ")))
	     (cmd (conc ssh-cmd "pstree -A " pid))
	     (output (with-input-from-pipe cmd read-lines)))
	(debug:print 2 *default-log-port* "Running " cmd " received " output)
	(if (eq? (length output) 0)
	   #f
	   #t))
      #t))
 

Modified rmt.scm from [39d97c528a] to [29d7593e43].

20
21
22
23
24
25
26


27
28
29
30
31
32
33

(use format typed-records) ;; RADT => purpose of json format??

(declare (unit rmt))
(declare (uses api))
(declare (uses http-transport))
(include "common_records.scm")


;; (declare (uses rmtmod))

;; (import rmtmod)

;;
;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!!
;;







>
>







20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

(use format typed-records) ;; RADT => purpose of json format??

(declare (unit rmt))
(declare (uses api))
(declare (uses http-transport))
(include "common_records.scm")
(include "db_records.scm")

;; (declare (uses rmtmod))

;; (import rmtmod)

;;
;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!!
;;
523
524
525
526
527
528
529
530
531
532
533
534









535
536
537
538
539
540
541
;; Just some syntatic sugar
(define (rmt:register-test run-id test-name item-path)
  (rmt:general-call 'register-test run-id run-id test-name item-path))

(define (rmt:get-test-id run-id testname item-path)
  (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))

;; run-id is NOT used
;;
(define (rmt:get-test-info-by-id run-id test-id)
  (if (number? test-id)
      (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))









      (begin
	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
	(print-call-chain (current-error-port))
	#f)))

(define (rmt:test-get-rundir-from-test-id run-id test-id)
  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))







|



|
>
>
>
>
>
>
>
>
>







525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
;; Just some syntatic sugar
(define (rmt:register-test run-id test-name item-path)
  (rmt:general-call 'register-test run-id run-id test-name item-path))

(define (rmt:get-test-id run-id testname item-path)
  (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))

;; run-id is NOT used - but it will be! 
;;
(define (rmt:get-test-info-by-id run-id test-id)
  (if (number? test-id)
      (let* ((testdat  (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)))
	     (trundatf (conc (db:test-get-rundir testdat) "/.mt_data/test-run.dat")))
	;; now we can update a couple fields from the filesystem
	(if (and (db:test-get-rundir testdat)
		 (file-exists? trundatf))
	    (let* ((duration   (db:test-get-run_duration testdat))
		   (event-time (db:test-get-event_time   testdat))
		   (last-touch (file-modification-time trundatf)))
	      (db:test-set-run_duration! testdat (max duration (- last-touch event-time)))))
	testdat)
      (begin
	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
	(print-call-chain (current-error-port))
	#f)))

(define (rmt:test-get-rundir-from-test-id run-id test-id)
  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))

Modified runs.scm from [030b929939] to [497a5137ad].

1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-4")
	  (let ((can-run-more    #f)) ;; (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))
	    (if (not can-run-more) #;(and (list? can-run-more)
		(car can-run-more))
		(let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here
		  (if loop-list
		      (apply loop loop-list)
                      (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)
                      )
                  )
		;; if can't run more just loop with next possible test
		(loop (car newtal)(cdr newtal) reg reruns))))
         
	 ;; this case should not happen, added to help catch any bugs
	 ((and (list? items) itemdat)
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-5")
	  (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this")







|
<
<







1787
1788
1789
1790
1791
1792
1793
1794


1795
1796
1797
1798
1799
1800
1801
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-4")
	  (let ((can-run-more    #f)) ;; (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))
	    (if (not can-run-more) #;(and (list? can-run-more)
		(car can-run-more))
		(let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here
		  (if loop-list
		      (apply loop loop-list)
                      (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)))


		;; if can't run more just loop with next possible test
		(loop (car newtal)(cdr newtal) reg reruns))))
         
	 ;; this case should not happen, added to help catch any bugs
	 ((and (list? items) itemdat)
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-5")
	  (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this")

Modified server.scm from [7b2af2dc7e] to [e17aad21de].

344
345
346
347
348
349
350


















351
352
353
354
355
356
357
		       (equal? server-key res))))
	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe
	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
		(thread-sleep! reftime)
		(server:wait-for-server-start-last-flag areapath)))))))



















;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least 3 seconds old







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
		       (equal? server-key res))))
	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe
	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
		(thread-sleep! reftime)
		(server:wait-for-server-start-last-flag areapath)))))))

;; wait for server=start-last to be three seconds old - this might be what was causing the obfusc-skel fails
;;
#;(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last")))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (reftime  (+ 2 (random 3)))
	       (delta    (- (current-seconds) fmodtime))
	       (all-go   (> delta reftime)))
	  (if all-go
	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe
	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
		(thread-sleep! 5)
		(server:wait-for-server-start-last-flag areapath))))
	#;(system (conc "touch " start-flag)))))

;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least 3 seconds old

Modified tests.scm from [0094b671e6] to [9198f1fbaa].

1942
1943
1944
1945
1946
1947
1948

1949




















1950


1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963

1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
	 (lambda (count)
	   (set! res count))
	 tdb
	 "SELECT count(id) FROM test_rundat;")
	res))
  0)


(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)




















  (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))


  (if (and cpuload diskfree)
      (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
  (if minutes 
      (rmt:general-call 'update-run-duration run-id minutes test-id))
  (if (and uname hostname)
      (rmt:general-call 'update-uname-host run-id uname hostname test-id)))
  
;; This one is for running with no db access (i.e. via rmt: internally)
(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries)
;; (define (tests:set-full-meta-info test-id run-id minutes work-area)
;;  (let ((remtries 10))
  (let* ((cpuload  (get-cpu-load))
	 (diskfree (get-df (current-directory)))

	 (uname    (get-uname "-srvpio"))
	 (hostname (get-host-name)))
    (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)))
    
;; (define (tests:set-partial-meta-info test-id run-id minutes work-area)
#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries)
  (let* ((cpuload  (get-cpu-load))
	 (diskfree (get-df (current-directory)))
	 (remtries 10))
    (handle-exceptions







>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
|
|
|
|
|
|


|




>


|







1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
	 (lambda (count)
	   (set! res count))
	 tdb
	 "SELECT count(id) FROM test_rundat;")
	res))
  0)

;; 
(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)(tmpfree #f))
  (if (get-environment-variable "MT_TEST_RUN_DIR")
      (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data"))
	     (or-dash  (lambda (instr)
			 (cond
			  ((not instr) "") ;; #f -> blank, indicates value unchanged since last measurement taken
			  ((string? instr)(if (string-search " " instr) (conc "\"" instr "\"") instr))
			  (else instr))))
	     (file-new (not (directory-exists? dest-dir))))
	(if file-new (create-directory dest-dir #t))
	(let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append)))
	  (with-output-to-port outp
	    (lambda ()
	      (if file-new
		  (print "epoch_time,run_id,test_id,cpuload,diskfree,tmpfree,run_minutes,hostname,uname"))
	      (print (current-seconds) "," (or-dash run-id)   "," (or-dash test-id)  ","
		     (or-dash cpuload) "," (or-dash diskfree) "," (or-dash tmpfree)  ","
		     (or-dash minutes) "," (or-dash hostname) ","
		     (or-dash uname)))) ;; put uname last as it has spaces in it
	  (close-output-port outp)))
      (begin
	(rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))))
  (if update-db
      (begin
	(if (and cpuload diskfree)
	    (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
	(if minutes 
	    (rmt:general-call 'update-run-duration run-id minutes test-id))
	(if (and uname hostname)
	    (rmt:general-call 'update-uname-host run-id uname hostname test-id)))))
  
;; This one is for running with no db access (i.e. via rmt: internally)
(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f))
;; (define (tests:set-full-meta-info test-id run-id minutes work-area)
;;  (let ((remtries 10))
  (let* ((cpuload  (get-cpu-load))
	 (diskfree (get-df (current-directory)))
	 (tmpfree  (get-df "/tmp"))
	 (uname    (get-uname "-srvpio"))
	 (hostname (get-host-name)))
    (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db tmpfree: tmpfree)))
    
;; (define (tests:set-partial-meta-info test-id run-id minutes work-area)
#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries)
  (let* ((cpuload  (get-cpu-load))
	 (diskfree (get-df (current-directory)))
	 (remtries 10))
    (handle-exceptions
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
	 (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn)))
	   (debug:print-error 0 *default-log-port* "tried for over a minute to update meta info and failed. Giving up")
	   (debug:print 0 *default-log-port* "EXCEPTION: database probably overloaded or unreadable.")
	   (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
	   (debug:print 5 *default-log-port* "exn=" (condition->list exn))
	   (debug:print 0 *default-log-port* " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
	   (print-call-chain (current-error-port))))
     (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes)
  )))
	 
;;======================================================================
;; A R C H I V I N G
;;======================================================================

(define (test:archive db test-id)
  #f)

(define (test:archive-tests db keynames target)
  #f)








|
<











2006
2007
2008
2009
2010
2011
2012
2013

2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
	 (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn)))
	   (debug:print-error 0 *default-log-port* "tried for over a minute to update meta info and failed. Giving up")
	   (debug:print 0 *default-log-port* "EXCEPTION: database probably overloaded or unreadable.")
	   (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
	   (debug:print 5 *default-log-port* "exn=" (condition->list exn))
	   (debug:print 0 *default-log-port* " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
	   (print-call-chain (current-error-port))))
     (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes))))

	 
;;======================================================================
;; A R C H I V I N G
;;======================================================================

(define (test:archive db test-id)
  #f)

(define (test:archive-tests db keynames target)
  #f)