Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -1750,19 +1750,26 @@ (with-input-from-pipe (conc "/bin/readlink -f " path) (lambda () (read-line))))) +;; returns *effective load* (not normalized) +;; (define (common:get-intercept onemin fivemin) - (let* ((load-change (- onemin fivemin)) - (tchange (- 300 60))) - (max (+ onemin (* 60 (/ load-change tchange)))0)) -) + (if (< onemin fivemin) ;; load is decreasing, just use the onemin load + onemin + (let* ((load-change (- onemin fivemin)) + (tchange (- 300 60))) + (max (+ onemin (* 60 (/ load-change tchange))) 0)))) +;; calculate a delay number based on a droop curve +;; inputs are: +;; - load-in, load as from uptime, NOT normalized +;; - numcpus, number of cpus, ideally use the real cpus, not threads +;; (define (common:get-delay load-in numcpus) - (max (/ (expt 5 (* 4 (/ load-in numcpus))) 10) 0) -) + (max (/ (expt 5 (* 4 (/ load-in numcpus))) 10) 0)) (define (get-cpu-load #!key (remote-host #f)) (car (common:get-cpu-load remote-host))) ;; (let* ((load-res (process:cmd-run->list "uptime")) @@ -2062,16 +2069,16 @@ (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) -(define (common:wait-for-homehost-load maxload msg) +(define (common:wait-for-homehost-load maxnormload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (common:get-homehost))) (hh (if hh-dat (car hh-dat) #f))) - (common:wait-for-normalized-load maxload msg hh))) + (common:wait-for-normalized-load maxnormload msg hh))) (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often! (let* ((proc (lambda () @@ -2095,88 +2102,177 @@ (common:write-cached-info actual-host "num-cpus" result)) result)))) ;; wait for normalized cpu load to drop below maxload ;; -(define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5)) +(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5)) (let ((num-cpus (common:get-num-cpus remote-host))) (if num-cpus - (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host) + (common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host) (begin (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again (if (> rem-tries 0) - (common:wait-for-normalized-load maxload msg remote-host (- rem-tries 1)) + (common:wait-for-normalized-load maxnormload msg remote-host (- rem-tries 1)) #f))))) ;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load +;; count - count down to zero, at some point we'd give up if the load never drops +;; num-tries - count down to zero number tries to get numcpus ;; -(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5)) - (let* ((loadavg (common:get-cpu-load remote-host)) - (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again - (common:get-num-cpus remote-host) - numcpus-in)) - (maxload (if force-maxload - maxload-in - (if (number? maxload-in) - (max maxload-in 0.5) - 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? - (first (car loadavg)) - (next (cadr loadavg)) - (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where - ;; numcpus (or could be - ;; maxload) is zero, - ;; crude fallback is to - ;; at least use 1 - (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next? - 0 - next))) ;; we will force a conservative calculation any time next is large. - (first-next-avg (/ (+ first next) 2)) - ;; add some randomness to the time to break any alignment - ;; where netbatch dumps many jobs to machines simultaneously - (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10) - (/ (- 1000 count) 10) - waitdelay) - (- first adjmaxload) )))) - (load-jump-limit (configf:lookup-number *configdat* "setup" "load-jump-limit"))) - ;; let's let the user know once in a long while that load checking - ;; is happening but not constantly report it - (if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time - (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload - ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp)) +(define (common:wait-for-cpuload maxnormload numcpus-in + #!key (count 1000) + (msg #f)(remote-host #f)(num-tries 5)) + (let* ((loadavg (common:get-cpu-load remote-host)) + ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again + (numcpus (if (<= 1 numcpus-in) + (common:get-num-cpus remote-host) + numcpus-in)) + (first (car loadavg)) + (next (cadr loadavg)) + (adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug + ;; where numcpus + ;; (or could be + ;; maxload) is + ;; zero, crude + ;; fallback is to + ;; at least use 1 + ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit + ;; etc. + (effective-load (common:get-intercept first next)) + (recommended-delay (common:get-delay effective-load numcpus)) + (effective-host (or remote-host "localhost")) + (normalized-effective-load (/ effective-load numcpus)) + (will-wait (> normalized-effective-load maxnormload))) + (if (> recommended-delay 0) + (let* ((actual-delay (min recommended-delay 30))) + (debug:print-info 0 *default-log-port* "Load is high, delaying " actual-delay " seconds.") + (thread-sleep! actual-delay))) + (cond + ;; bad data, try again to get the data + ((not will-wait) + (if (common:low-noise-print 30 (conc (round normalized-effective-load) "-load-acceptable-" effective-host)) + (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing."))) ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable (> num-tries 0)) - (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.") + (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " + first ", we'll sleep 10s and try " num-tries " more times.") (thread-sleep! 10) - (common:wait-for-cpuload maxload-in numcpus-in waitdelay - count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1))) - ((and (> first adjmaxload) + (common:wait-for-cpuload maxnormload numcpus-in + count: count remote-host: remote-host num-tries: (- num-tries 1))) + ;; need to wait for load to drop + ((and will-wait ;; (> first adjmaxload) (> count 0)) (debug:print-info 0 *default-log-port* - "Delaying " adjwait - " seconds due to load " first + "Delaying 15" ;; adjwait + " seconds due to normalized effective load " normalized-effective-load ;; first " exceeding max of " adjmaxload " on server " (or remote-host (get-host-name)) - " (normalized load-limit: " maxload ") " (if msg msg "")) - (thread-sleep! adjwait) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) - ((and (> loadjmp (cond - (load-jump-limit load-jump-limit) - ((> numcpus 8)(/ numcpus 2)) - ((> numcpus 4)(/ numcpus 1.2)) - (else 0.5))) - (> count 0)) - (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". " - (if msg msg "")) - (thread-sleep! adjwait) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) + " (normalized load-limit: " maxnormload ") " (if msg msg "")) + (thread-sleep! 15) ;; adjwait) + (common:wait-for-cpuload maxnormload numcpus count: (- count 1) msg: msg remote-host: remote-host) + ;; put the message here to indicate came out of waiting + (debug:print-info 1 *default-log-port* + "On host: " effective-host + ", effective load: " effective-load + ", numcpus: " numcpus + ", normalized effective load: " normalized-effective-load + )) + ;; overloaded and count expired (i.e. went to zero) (else - (if (> num-tries 0) - (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost"))) - (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing.")) - (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing.")))))) + (if (> num-tries 0) ;; should be "num-tries-left". + (if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host)) + (debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of " + effective-normalized-load " continuing.")) + (debug:print 0 *default-log-port* "Load on " effective-host ", " + first" could not be retrieved. Giving up and continuing.")))))) +;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load +;; +;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5)) +;; (let* ((loadavg (common:get-cpu-load remote-host)) +;; (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again +;; (common:get-num-cpus remote-host) +;; numcpus-in)) +;; (maxload (if force-maxload +;; maxload-in +;; (if (number? maxload-in) +;; (max maxload-in 0.5) +;; 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? +;; (first (car loadavg)) +;; (next (cadr loadavg)) +;; (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where +;; ;; numcpus (or could be +;; ;; maxload) is zero, +;; ;; crude fallback is to +;; ;; at least use 1 +;; (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next? +;; 0 +;; next))) ;; we will force a conservative calculation any time next is large. +;; (first-next-avg (/ (+ first next) 2)) +;; ;; add some randomness to the time to break any alignment +;; ;; where netbatch dumps many jobs to machines simultaneously +;; (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10) +;; (/ (- 1000 count) 10) +;; waitdelay) +;; (- first adjmaxload) )))) +;; (load-jump-limit (configf:lookup-number *configdat* "setup" "load-jump-limit")) +;; ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit +;; ;; etc. +;; (effective-load (common:get-intercept first next)) +;; (effective-host (or remote-host "localhost")) +;; (normalized-effective-load (/ effective-load numcpus)) +;; (will-wait (> normalized-effective-load maxload))) +;; +;; ;; let's let the user know once in a long while that load checking +;; ;; is happening but not constantly report it +;; #;(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time +;; (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload +;; ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp)) +;; +;; (debug:print-info 1 *default-log-port* +;; "On host: " effective-host +;; ", effective load: " effective-load +;; ", numcpus: " numcpus +;; ", normalized effective load: " normalized-effective-load +;; ) +;; +;; (cond +;; ;; bad data, try again to get the data +;; ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable +;; (> num-tries 0)) +;; (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.") +;; (thread-sleep! 10) +;; (common:wait-for-cpuload maxload-in numcpus-in waitdelay +;; count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1))) +;; ;; need to wait for load to drop +;; ((and will-wait ;; (> first adjmaxload) +;; (> count 0)) +;; (debug:print-info 0 *default-log-port* +;; "Delaying " 15 ;; adjwait +;; " seconds due to normalized effective load " normalized-effective-load ;; first +;; " exceeding max of " adjmaxload +;; " on server " (or remote-host (get-host-name)) +;; " (normalized load-limit: " maxload ") " (if msg msg "")) +;; (thread-sleep! 15) ;; adjwait) +;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) +;; ((and (> loadjmp (cond +;; (load-jump-limit load-jump-limit) +;; ((> numcpus 8)(/ numcpus 2)) +;; ((> numcpus 4)(/ numcpus 1.2)) +;; (else 0.5))) +;; (> count 0)) +;; (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". " +;; (if msg msg "")) +;; (thread-sleep! adjwait) +;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) +;; (else +;; (if (> num-tries 0) +;; (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost"))) +;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing.")) +;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing.")))))) +;; (define (get-uname . params) (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) (uname #f)) (if (null? (car uname-res)) "unknown" Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1221,11 +1221,12 @@ (if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues (if maxhomehostload - (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) + (common:wait-for-homehost-load maxhomehostload + (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) @@ -1672,10 +1673,16 @@ (debug:print-info 4 *default-log-port* "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) + + ;; This would be a good place to block on homehost load + + + + (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (let ((loop-list (runs:process-expanded-tests runsdat testdat))) (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -135,11 +135,11 @@ "") ;; " -log " logfile " -m testsuite:" testsuite " " profile-mode )) ;; (conc " >> " logfile " 2>&1 &"))))) - (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) + (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) @@ -155,11 +155,11 @@ (setenv "TARGETHOST" target-host))) (setenv "TARGETHOST_LOGF" logfile) (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever - (common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit)) + #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit)) (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) @@ -317,14 +317,32 @@ (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) *my-client-signature*))) + +;; wait for server=start-last to be three seconds old +;; +(define (server:wait-for-server-start-last-flag areapath) + (let* ((start-flag (conc areapath "/logs/server-start-last"))) + (if (file-exists? start-flag) + (let* ((fmodtime (file-modification-time start-flag))) + (if (> (- (current-seconds) fmodtime) 3) ;; good enough + (system (conc "touch " start-flag)) ;; lazy but safe + (begin + (thread-sleep! 5) + (server:wait-for-server-start-last-flag areapath)))) + (system (conc "touch " start-flag))))) + ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched +;; (define (server:kind-run areapath) + ;; look for $MT_RUN_AREA_HOME/logs/server-start-last + ;; and wait for it to be at least 3 seconds old + (server:wait-for-server-start-last-flag areapath) (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun (call-num (car last-run-dat)) (when-run (cadr last-run-dat)) (run-delay (+ (case call-num @@ -340,10 +358,12 @@ (server:run areapath) (thread-sleep! 2) ;; don't release the lock for at least a few seconds (common:simple-file-release-lock lock-file))) (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) +;; this one seems to be the general entry point +;; (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) (let loop ((server-url (server:check-if-running areapath)) (try-num 0)) (if (or server-url