Megatest

Check-in [21849054ca]
Login
Overview
Comment:Merged in last of testdat changes from testdat2 via meld
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65
Files: files | file ages | folders
SHA1: 21849054cacf4bd4d07ebc04019fba05ec6f5fd4
User & Date: matt on 2020-10-27 23:44:55
Other Links: branch diff | manifest | tags
Context
2020-10-28
09:58
Added to manual ezsteps section check-in: 60c23b1e99 user: mrwellan tags: v1.65
2020-10-27
23:44
Merged in last of testdat changes from testdat2 via meld check-in: 21849054ca user: matt tags: v1.65
22:43
Updated ezsteps propagation and documentation check-in: b8cc18503a user: matt tags: v1.65
Changes

Modified db.scm from [7fd742bffc] to [00e75ddfd8].

1849
1850
1851
1852
1853
1854
1855

1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871

1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
	 ;; HOWEVER: this code in run:test seems to work fine
	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
	 ;;                     (db:test-get-run_duration testdat)))
	 ;;                    600) 
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)

	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
				    " event-time="event-time" run-duration="run-duration))))
	  stmth1
	  run-id running-deadtime) ;; default time 720 seconds
	    
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)

	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
				    " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
				    " run-duration="run-duration)
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
	  stmth2
	  run-id remotehoststart-deadtime) ;; default time 230 seconds
	 
	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
	 ;;
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row







>
|
|
|
|
|
|
|
|
|
|
|





>
|
|
|
|
|
|
|
|
|
|
|







1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
	 ;; HOWEVER: this code in run:test seems to work fine
	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
	 ;;                     (db:test-get-run_duration testdat)))
	 ;;                    600) 
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
		(if (and (equal? uname "n/a")
			 (equal? item-path "")) ;; this is a toplevel test
		    ;; what to do with toplevel? call rollup?
		    (begin
		      (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		      (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		    (begin
		      (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
		      (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
					test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
					" event-time="event-time" run-duration="run-duration)))))
	  stmth1
	  run-id running-deadtime) ;; default time 720 seconds
	    
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
		(if (and (equal? uname "n/a")
			 (equal? item-path "")) ;; this is a toplevel test
		    ;; what to do with toplevel? call rollup?
		    (begin
		      (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		      (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		    (begin
		      (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
					" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
					" run-duration="run-duration)
		      (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))))
	  stmth2
	  run-id remotehoststart-deadtime) ;; default time 230 seconds
	 
	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
	 ;;
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row

Modified launch.scm from [0de0e6aa2e] to [b35a784abe].

207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
			   (round 
			    (- 
			     (current-seconds) 
			     start-seconds)))))
	 (kill-tries 0))
    ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)

    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory)))
               (last-sync (current-seconds)))
      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
      ;; top of loop encountered at "(current-seconds)" with







|







207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
			   (round 
			    (- 
			     (current-seconds) 
			     start-seconds)))))
	 (kill-tries 0))
    ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t)

    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory)))
               (last-sync (current-seconds)))
      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
      ;; top of loop encountered at "(current-seconds)" with
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (if do-sync ;; save meta data about the running of this test







|







246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (if do-sync ;; save meta data about the running of this test
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
	    (begin
	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
		  (loop (calc-minutes)
                        (or new-cpu-load cpu-load)
                        (or new-disk-free disk-free)
                        (if do-sync (current-seconds) last-sync)))))))
    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional


(define (launch:execute encoded-cmd)
  (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
	 (tconfigreg #f))
    (setenv "MT_CMDINFO" encoded-cmd)
    ;;(bb-check-path msg: "launch:execute incoming")







|







311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
	    (begin
	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
		  (loop (calc-minutes)
                        (or new-cpu-load cpu-load)
                        (or new-disk-free disk-free)
                        (if do-sync (current-seconds) last-sync)))))))
    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional


(define (launch:execute encoded-cmd)
  (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
	 (tconfigreg #f))
    (setenv "MT_CMDINFO" encoded-cmd)
    ;;(bb-check-path msg: "launch:execute incoming")

Modified rmt.scm from [2716a38de0] to [05738b4bdc].

548
549
550
551
552
553
554
555

















556
557
558
559
560
561
562
(define (rmt:get-test-id run-id testname item-path)
  (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))

;; run-id is NOT used - but it will be! 
;;
(define (rmt:get-test-info-by-id run-id test-id)
  (if (number? test-id)
      (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))

















      (begin
	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
	(print-call-chain (current-error-port))
	#f)))

(define (rmt:test-get-rundir-from-test-id run-id test-id)
  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))







|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
(define (rmt:get-test-id run-id testname item-path)
  (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))

;; run-id is NOT used - but it will be! 
;;
(define (rmt:get-test-info-by-id run-id test-id)
  (if (number? test-id)
      (let* ((testdat  (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)))
             (trundir  (vector-ref testdat 10))
	     (trundatf (conc trundir"/.mt_data/test-run.dat")))
	;; now we can update a couple fields from the filesystem
	(handle-exceptions
	    exn
	    (begin
	      (debug:print-info 0 *default-log-port* "Could not update testdat record from "trundatf", exn=" exn)
	      #f)
	  (if (and trundir
		   (file-exists? trundatf))
	      (let* ((duration     (vector-ref testdat 12)) ;; (db:test-get-run_duration testdat))
		     (event-time   (vector-ref testdat 5))   ;; (db:test-get-event_time   testdat))
		     (last-touch   (file-modification-time trundatf))
		     (new-duration (max duration (- last-touch event-time))))
		(vector-set! testdat 12 new-duration))))
	      #;(db:test-set-run_duration! testdat (max duration (- last-touch event-time)))
	testdat)
      (begin
	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
	(print-call-chain (current-error-port))
	#f)))

(define (rmt:test-get-rundir-from-test-id run-id test-id)
  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))