Megatest

Check-in [fa5f74982b]
Login
Overview
Comment:small bugfix to get-cpu-load and policy change so tests marked dead which are running are moved back to running instead of killed
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65 | v1.6525
Files: files | file ages | folders
SHA1: fa5f74982be4138bda1124183612e353bfe37503
User & Date: bjbarcla on 2019-02-28 14:30:22
Other Links: branch diff | manifest | tags
Context
2019-03-12
18:26
removing Chesterton's fence check-in: 53ed616f9f user: bjbarcla tags: v1.65-nosleep
18:22
Add kill-runs to actions applicable to remove-keep check-in: 183f89d345 user: mrwellan tags: v1.65
2019-02-28
14:30
small bugfix to get-cpu-load and policy change so tests marked dead which are running are moved back to running instead of killed check-in: fa5f74982b user: bjbarcla tags: v1.65, v1.6525
2019-02-27
19:47
bumped version to v1.6525 check-in: 3a17917329 user: bjbarcla tags: v1.65
Changes

Modified common.scm from [be82152a65] to [c41ac723cd].

1591
1592
1593
1594
1595
1596
1597



1598
1599
1600
1601
1602
1603
1604







1605
1606
1607
1608
1609




1610
1611
1612
1613
1614
1615
1616
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600







1601
1602
1603
1604
1605
1606
1607
1608




1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619







+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+

-
-
-
-
+
+
+
+







     exn
     #f
     (with-output-to-file fullpath (lambda ()(pp dat))))))

;; get cpu load by reading from /proc/loadavg, return all three values
;;
(define (common:get-cpu-load remote-host)
  (handle-exceptions
   exn
   '(99 99 99)
  (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
    (or (common:get-cached-info actual-hostname "cpu-load")
	(let ((result (if remote-host
			  (map (lambda (res)
				 (if (eof-object? res) 9e99 res))
			       (with-input-from-pipe 
				   (conc "ssh " remote-host " cat /proc/loadavg")
   (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
     (or (common:get-cached-info actual-hostname "cpu-load")
	 (let ((result (if remote-host
			   (map (lambda (res)
				  (if (eof-object? res) 9e99 res))
			        (with-input-from-pipe 
				 (conc "ssh " remote-host " cat /proc/loadavg")
				 (lambda ()(list (read)(read)(read)))))
			  (with-input-from-file "/proc/loadavg" 
			    (lambda ()(list (read)(read)(read)))))))
	  (common:write-cached-info actual-hostname "cpu-load" result)
	  result))))
			   (with-input-from-file "/proc/loadavg" 
			     (lambda ()(list (read)(read)(read)))))))
	   (common:write-cached-info actual-hostname "cpu-load" result)
	                     result)))))

;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo; returns all three load values, the number of real cpus and the number of threads
;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc.
;;  keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load
;;
(define (common:get-normalized-cpu-load remote-host)
  (let ((res (common:get-normalized-cpu-load-raw remote-host))

Modified launch.scm from [8c6f051622] to [9f824ebdab].

411
412
413
414
415
416
417


418
419


420
421
422
423
424
425
426
411
412
413
414
415
416
417
418
419


420
421
422
423
424
425
426
427
428







+
+
-
-
+
+







         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          (set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)")
          (set! kill-job? #t)))
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (when do-sync
          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))