@@ -226,10 +226,32 @@ (fullpath (realpath this-script))) fullpath)) (define *common:this-exe-fullpath* (common:get-this-exe-fullpath)) (define *common:this-exe-dir* (pathname-directory *common:this-exe-fullpath*)) (define *common:this-exe-name* (pathname-strip-directory *common:this-exe-fullpath*)) + +;; when called from a wrapper I need sometimes to find the calling +;; wrapper, this is for dashboard to find the correct megatest. +;; +(define (common:find-local-megatest #!optional (progname "megatest")) + (let ((res (filter file-exists? + (map (lambda (updir) + (let* ((lm (car (argv))) + (dir (pathname-directory lm)) + (exe (pathname-strip-directory lm))) + (conc (if dir (conc dir "/") "") + (case (string->symbol exe) + ((dboard) (conc updir progname)) + ((mtest) (conc updir progname)) + ((dashboard) progname) + (else exe))))) + '("../../" "../"))))) + (if (null? res) + (begin + (debug:print 0 *current-log-port* "Failed to find this executable! Using what can be found on the path") + progname) + (car res)))) (define *common:logpro-exit-code->status-sym-alist* '( ( 0 . pass ) ( 1 . fail ) ( 2 . warn ) @@ -1758,17 +1780,25 @@ (handle-exceptions exn #f (with-output-to-file fullpath (lambda ()(pp dat))))) #f)) - + +(define (common:raw-get-remote-host-load remote-host) + (handle-exceptions + exn + #f ;; more specific handling of errors needed + (with-input-from-pipe + (conc "ssh " remote-host " cat /proc/loadavg") + (lambda ()(list (read)(read)(read)))))) + ;; get cpu load by reading from /proc/loadavg, return all three values ;; (define (common:get-cpu-load remote-host) (handle-exceptions exn - '(99 99 99) + '(-99 -99 -99) (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) (or (common:get-cached-info actual-hostname "cpu-load") (let ((result (if remote-host (map (lambda (res) (if (eof-object? res) 9e99 res)) @@ -1775,12 +1805,21 @@ (with-input-from-pipe (conc "ssh " remote-host " cat /proc/loadavg") (lambda ()(list (read)(read)(read))))) (with-input-from-file "/proc/loadavg" (lambda ()(list (read)(read)(read))))))) - (common:write-cached-info actual-hostname "cpu-load" result) - result))))) + (match + result + ((l1 l2 l3) + (if (and (number? l1) + (number? l2) + (number? l3)) + (begin + (common:write-cached-info actual-hostname "cpu-load" result) + result) + '(-1 -1 -1))) ;; -1 is bad result + (else '(-2 -2 -2)))))))) ;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads ;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc. ;; keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load ;; @@ -2026,11 +2065,11 @@ (common:wait-for-normalized-load maxload msg remote-host (- rem-tries 1)) #f))))) ;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load ;; -(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)) +(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5)) (let* ((loadavg (common:get-cpu-load remote-host)) (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again (common:get-num-cpus remote-host) numcpus-in)) (maxload (if force-maxload @@ -2038,28 +2077,53 @@ (if (number? maxload-in) (max maxload-in 0.5) 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? (first (car loadavg)) (next (cadr loadavg)) - (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 + (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where + ;; numcpus (or could be + ;; maxload) is zero, + ;; crude fallback is to + ;; at least use 1 (loadjmp (- first next)) - (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously - ;; let's let the user know once in a long while that load checking is happening but not constantly report it + ;; add some randomness to the time to break any alignment + ;; where netbatch dumps many jobs to machines simultaneously + (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10) + (/ (- 1000 count) 10) + waitdelay) + (- first adjmaxload) )) ))) + ;; let's let the user know once in a long while that load checking + ;; is happening but not constantly report it (if (> (random 100) 75) ;; about 25% of the time (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload - ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)) + ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp)) (cond - ((and (> first adjload) + ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable + (> num-tries 0)) + (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.") + (thread-sleep! 10) + (common:wait-for-cpuload maxload-in numcpus-in waitdelay + count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1))) + ((and (> first adjmaxload) (> count 0)) - (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) + (debug:print-info 0 *default-log-port* + "server start delayed " adjwait + " seconds due to load " first + " exceeding max of " adjmaxload + " on server " (or remote-host (get-host-name)) + " (normalized load-limit: " maxload ") " (if msg msg "")) (thread-sleep! adjwait) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) ((and (> loadjmp numcpus) (> count 0)) (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) (thread-sleep! adjwait) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) + (else + (if (> num-tries 0) + (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing.") + (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing.")))))) (define (get-uname . params) (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) (uname #f)) (if (null? (car uname-res))