Megatest

Check-in [7b78f70ebe]
Login
Overview
Comment:Updated common to have a little less delay when checking loads since tquery is quite quick to check
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | v2.01-tquery | v2.01-try-1
Files: files | file ages | folders
SHA1: 7b78f70ebe35f93ed572f15d5af5e56619a84ca7
User & Date: jmoon18 on 2018-12-11 15:11:16
Other Links: branch diff | manifest | tags
Context
2018-12-11
15:11
Updated common to have a little less delay when checking loads since tquery is quite quick to check Closed-Leaf check-in: 7b78f70ebe user: jmoon18 tags: v2.01-tquery, v2.01-try-1
2018-12-06
09:52
Initial version of tquery check-in: 06b3f69cfc user: jmoon18 tags: v2.01-tquery, v2.01-try-1
Changes

Modified common.scm from [8a7bc69ba9] to [c80e9730c4].

1504
1505
1506
1507
1508
1509
1510





1511
1512
1513

1514
1515
1516
1517
1518
1519
1520
  (let* ((fullpath (conc *toppath* "/logs/" key "-" dtype ".log")))
    (handle-exceptions
     exn
     #f
     (with-output-to-file fullpath (lambda ()(pp dat))))))

(define (common:get-cpu-load remote-host)





  (let ((al (common:get-normalized-cpu-load remote-host)))
  (list (alist-ref '1m-load al) (alist-ref '5m-load al) (alist-ref '15m-load al)))
  ;;(common:get-cpu-load-original remote-host)

)
;; get cpu load by reading from /proc/loadavg, return all three values
;;
(define (common:get-cpu-load-original remote-host)
  (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
    (or (common:get-cached-info actual-hostname "cpu-load")
	(let ((result (if remote-host







>
>
>
>
>



>







1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
  (let* ((fullpath (conc *toppath* "/logs/" key "-" dtype ".log")))
    (handle-exceptions
     exn
     #f
     (with-output-to-file fullpath (lambda ()(pp dat))))))

(define (common:get-cpu-load remote-host)
  (handle-exceptions 
    exn
  (lambda() 
    (list 50 50 50)
  ) 
  (let ((al (common:get-normalized-cpu-load remote-host)))
  (list (alist-ref '1m-load al) (alist-ref '5m-load al) (alist-ref '15m-load al)))
  ;;(common:get-cpu-load-original remote-host)
  )
)
;; get cpu load by reading from /proc/loadavg, return all three values
;;
(define (common:get-cpu-load-original remote-host)
  (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
    (or (common:get-cached-info actual-hostname "cpu-load")
	(let ((result (if remote-host
1535
1536
1537
1538
1539
1540
1541





1542

1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
(define (common:get-normalized-cpu-load remote-host)
  (if (file-exists? (pathname-expand "~/.megatest/tquery"))
    (begin
    (with-input-from-file (pathname-expand "~/.megatest/tquery")
      (lambda()
        (set! tqfilecontents (read-string))
      ))





    (set! tqfileparts (string-split (string-trim-both tqfilecontents) ":"))

    ;;(print "TQuery host: " (car tqfileparts)) 
    ;;(print "TQuery port " (cadr tqfileparts))
    ;;(print "Getting normalized cpu load for : " remote-host " via " (car tqfileparts) ":" (cadr tqfileparts))
    )
    (begin
      (process-run "nbfake /p/fdk/gwa/jmoon18/fossil/megatest/tquery -server -")
      (sleep 5)
    )
  )
  (handle-exceptions exn 
    (lambda()
      ;;(print "Need to start tquery server here:")
      (process-run "nbfake /p/fdk/gwa/jmoon18/fossil/megatest/tquery -server -")
      (sleep 5)
      (common:get-normalized-cpu-load remote-host)
    )
    (define-values (i o) (tcp-connect (car tqfileparts) (string->number (cadr tqfileparts)))) 
    ;;(define-values (i o) (tcp-connect "plxcas102" 9000)) 
    (write-line (conc "adj-cpuload-full:" (if remote-host remote-host (get-host-name))) o) 
    ;;(write-line "adj-cpuload-full:plxcm5005" o) 
    (with-input-from-string (read-line i) read)
  )

  ;;(define new-al with-input-from-string (read-line i) read)
  ;;new-al
  ;;(set! loadstring (read-string i))
  ;;(with-input-from-string loadstring read)  
  ;;`((adj-proc-load 3.0)
  ;;  (adj-core-load 3.2))
)


(define (common:get-normalized-cpu-load-original remote-host)
  (let ((res (common:get-normalized-cpu-load-raw-original remote-host))
	(default `((adj-proc-load . 2) ;; there is no right answer
		   (adj-core-load . 2)







>
>
>
>
>
|
>






|






|








<
<
<
<
<
<
<







1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576







1577
1578
1579
1580
1581
1582
1583
(define (common:get-normalized-cpu-load remote-host)
  (if (file-exists? (pathname-expand "~/.megatest/tquery"))
    (begin
    (with-input-from-file (pathname-expand "~/.megatest/tquery")
      (lambda()
        (set! tqfilecontents (read-string))
      ))
    (handle-exceptions exn 
      (lambda()
       (sleep 1)
       (common:get-normalized-cpu-load remote-host)
      )
      (set! tqfileparts (string-split (string-trim-both tqfilecontents) ":"))
    )
    ;;(print "TQuery host: " (car tqfileparts)) 
    ;;(print "TQuery port " (cadr tqfileparts))
    ;;(print "Getting normalized cpu load for : " remote-host " via " (car tqfileparts) ":" (cadr tqfileparts))
    )
    (begin
      (process-run "nbfake /p/fdk/gwa/jmoon18/fossil/megatest/tquery -server -")
      (sleep 2)
    )
  )
  (handle-exceptions exn 
    (lambda()
      ;;(print "Need to start tquery server here:")
      (process-run "nbfake /p/fdk/gwa/jmoon18/fossil/megatest/tquery -server -")
      (sleep 2)
      (common:get-normalized-cpu-load remote-host)
    )
    (define-values (i o) (tcp-connect (car tqfileparts) (string->number (cadr tqfileparts)))) 
    ;;(define-values (i o) (tcp-connect "plxcas102" 9000)) 
    (write-line (conc "adj-cpuload-full:" (if remote-host remote-host (get-host-name))) o) 
    ;;(write-line "adj-cpuload-full:plxcm5005" o) 
    (with-input-from-string (read-line i) read)
  )







)


(define (common:get-normalized-cpu-load-original remote-host)
  (let ((res (common:get-normalized-cpu-load-raw-original remote-host))
	(default `((adj-proc-load . 2) ;; there is no right answer
		   (adj-core-load . 2)
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792

1793
1794
1795
1796
1797

1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809




1810

1811
1812
1813
1814
1815
1816
1817
	      (if new-best
		  (begin ;; found a host, return it
		    (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
		    (host-last-used-set! rec curr-time)
		    new-best)
		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))

(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f))
  (let* ((loadavg (common:get-cpu-load remote-host))
	 (numcpus (if (< 1 numcpus-in) ;; not possible
		      (common:get-num-cpus remote-host)
		      numcpus-in))
	 (maxload (if force-maxload
		      maxload-in
		      (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
	 (loadjmp (- first next))
         (adjwait (min (+ 300 (random 10)) (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )  ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
    (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
		      ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)
    (cond
     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))

      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))

      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)))))

(define (common:wait-for-homehost-load maxload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (common:get-homehost)))
         (hh     (if hh-dat (car hh-dat) #f))
         (numcpus (common:get-num-cpus hh)))
    (common:wait-for-normalized-load maxload msg hh)))

(define (common:get-num-cpus remote-host)




  (alist-ref 'core (common:get-normalized-cpu-load remote-host))

)

(define (common:get-num-cpus-orig remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often!
	(let* ((proc   (lambda ()
			 (let loop ((numcpu 0)







|











|

|




>





>












>
>
>
>
|
>







1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
	      (if new-best
		  (begin ;; found a host, return it
		    (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
		    (host-last-used-set! rec curr-time)
		    new-best)
		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))

(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 100) (msg #f)(remote-host #f)(force-maxload #f))
  (let* ((loadavg (common:get-cpu-load remote-host))
	 (numcpus (if (< 1 numcpus-in) ;; not possible
		      (common:get-num-cpus remote-host)
		      numcpus-in))
	 (maxload (if force-maxload
		      maxload-in
		      (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
	 (loadjmp (- first next))
         (adjwait (min (+ 300 (random 10)) (* (+ (random 10)(/ (- 100 count) 10) waitdelay) (- first adjload) )  ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
    (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
		      ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp " ,adjwait: " adjwait " ,numcpus: " numcpus ", loadjmp: " loadjmp)
    (cond
     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
      (debug:print-info 1 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))
      (debug:print-info 1 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)))))

(define (common:wait-for-homehost-load maxload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (common:get-homehost)))
         (hh     (if hh-dat (car hh-dat) #f))
         (numcpus (common:get-num-cpus hh)))
    (common:wait-for-normalized-load maxload msg hh)))

(define (common:get-num-cpus remote-host)
  (handle-exceptions exn 
    (lambda() 
      2 
    )
    (alist-ref 'core (common:get-normalized-cpu-load remote-host))
  )
)

(define (common:get-num-cpus-orig remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often!
	(let* ((proc   (lambda ()
			 (let loop ((numcpu 0)