Megatest

Check-in [fa5f74982b]
Login
Overview
Comment:small bugfix to get-cpu-load and policy change so tests marked dead which are running are moved back to running instead of killed
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65 | v1.6525
Files: files | file ages | folders
SHA1: fa5f74982be4138bda1124183612e353bfe37503
User & Date: bjbarcla on 2019-02-28 14:30:22
Other Links: branch diff | manifest | tags
Context
2019-03-12
18:26
removing chestertons fence check-in: 53ed616f9f user: bjbarcla tags: v1.65-nosleep
18:22
Add kill-runs to actions applicatble to remove-keep check-in: 183f89d345 user: mrwellan tags: v1.65
2019-02-28
14:30
small bugfix to get-cpu-load and policy change so tests marked dead which are running are moved back to running instead of killed check-in: fa5f74982b user: bjbarcla tags: v1.65, v1.6525
2019-02-27
19:47
bumped version to v1.6525 check-in: 3a17917329 user: bjbarcla tags: v1.65
Changes

Modified common.scm from [be82152a65] to [c41ac723cd].

1591
1592
1593
1594
1595
1596
1597



1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
     exn
     #f
     (with-output-to-file fullpath (lambda ()(pp dat))))))

;; get cpu load by reading from /proc/loadavg, return all three values
;;
(define (common:get-cpu-load remote-host)



  (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
    (or (common:get-cached-info actual-hostname "cpu-load")
	(let ((result (if remote-host
			  (map (lambda (res)
				 (if (eof-object? res) 9e99 res))
			       (with-input-from-pipe 
				   (conc "ssh " remote-host " cat /proc/loadavg")
				 (lambda ()(list (read)(read)(read)))))
			  (with-input-from-file "/proc/loadavg" 
			    (lambda ()(list (read)(read)(read)))))))
	  (common:write-cached-info actual-hostname "cpu-load" result)
	  result))))

;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads
;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc.
;;  keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load
;;
(define (common:get-normalized-cpu-load remote-host)
  (let ((res (common:get-normalized-cpu-load-raw remote-host))







>
>
>
|
|
|
|
|
|
|

|
|
|
|







1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
     exn
     #f
     (with-output-to-file fullpath (lambda ()(pp dat))))))

;; get cpu load by reading from /proc/loadavg, return all three values
;;
(define (common:get-cpu-load remote-host)
  (handle-exceptions
   exn
   '(99 99 99)
   (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
     (or (common:get-cached-info actual-hostname "cpu-load")
	 (let ((result (if remote-host
			   (map (lambda (res)
				  (if (eof-object? res) 9e99 res))
			        (with-input-from-pipe 
				 (conc "ssh " remote-host " cat /proc/loadavg")
				 (lambda ()(list (read)(read)(read)))))
			   (with-input-from-file "/proc/loadavg" 
			     (lambda ()(list (read)(read)(read)))))))
	   (common:write-cached-info actual-hostname "cpu-load" result)
	                     result)))))

;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads
;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc.
;;  keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load
;;
(define (common:get-normalized-cpu-load remote-host)
  (let ((res (common:get-normalized-cpu-load-raw remote-host))

Modified launch.scm from [8c6f051622] to [9f824ebdab].

411
412
413
414
415
416
417


418
419
420
421
422
423
424
425
426
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")


          (set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)")
          (set! kill-job? #t)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (when do-sync
          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))







>
>
|
|







411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (when do-sync
          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))