@@ -817,14 +817,14 @@ (define *common:ended-states* ;; states which indicate the test is stopped and will not proceed '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE" )) (define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked - '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD")) + '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD" "CHECK")) (define *common:well-ended-states* ;; an item's prereq in this state allows item to proceed - '("PASS" "WARN" "CHECK" "WAIVED" "SKIP")) + '("PASS" "WARN" "WAIVED" "SKIP")) ;; BBnote: *common:running-states* used from db:set-state-status-and-roll-up-items (define *common:running-states* ;; test is either running or can be run '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED" "STARTED")) @@ -2208,37 +2208,31 @@ ;; (define (common:wait-for-cpuload maxnormload numcpus-in #!key (count 1000) (msg #f)(remote-host #f)(num-tries 5)) (let* ((loadavg (common:get-cpu-load remote-host)) - ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again - (numcpus (if (<= 1 numcpus-in) - (common:get-num-cpus remote-host) - numcpus-in)) - (first (car loadavg)) - (next (cadr loadavg)) - (adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug - ;; where numcpus - ;; (or could be - ;; maxload) is - ;; zero, crude - ;; fallback is to - ;; at least use 1 - ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit - ;; etc. - (effective-load (common:get-intercept first next)) - (recommended-delay (common:get-delay effective-load numcpus)) - (effective-host (or remote-host "localhost")) - (normalized-effective-load (/ effective-load numcpus)) - (will-wait (> normalized-effective-load maxnormload))) - (if (> recommended-delay 1) - (let* ((actual-delay (min recommended-delay 30))) - (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) - (debug:print-info 0 *default-log-port* "Load control, delaying " + ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again + (numcpus (if (<= 1 numcpus-in) + (common:get-num-cpus remote-host) numcpus-in)) + (first (car loadavg)) + (next (cadr loadavg)) + (adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude + ;; fallback is to at least use 1 + ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit + ;; etc. + (effective-load (common:get-intercept first next)) + (recommended-delay (common:get-delay effective-load numcpus)) + (effective-host (or remote-host "localhost")) + (normalized-effective-load (/ effective-load numcpus)) + (will-wait (> normalized-effective-load maxnormload))) + (if (and will-wait (> recommended-delay 1)) + (let* ((actual-delay (min recommended-delay 30))) + (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) + (debug:print-info 0 *default-log-port* "Load control, delaying " actual-delay " seconds to maintain safe load. current normalized effective load is " normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load)) - (thread-sleep! actual-delay))) + (thread-sleep! actual-delay))) (cond ;; bad data, try again to get the data ((not will-wait) (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))