Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -1919,43 +1919,15 @@ (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) -(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)) - (let* ((loadavg (common:get-cpu-load remote-host)) - (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again - (common:get-num-cpus remote-host) - numcpus-in)) - (maxload (if force-maxload - maxload-in - (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? - (first (car loadavg)) - (next (cadr loadavg)) - (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 - (loadjmp (- first next)) - (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously - (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload - ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp) - (cond - ((and (> first adjload) - (> count 0)) - (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) - (thread-sleep! adjwait) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) - ((and (> loadjmp numcpus) - (> count 0)) - (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) - (thread-sleep! adjwait) - (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) - (define (common:wait-for-homehost-load maxload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (common:get-homehost))) - (hh (if hh-dat (car hh-dat) #f)) - (numcpus (common:get-num-cpus hh))) + (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxload msg hh))) (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often! @@ -1982,10 +1954,39 @@ ;; (define (common:wait-for-normalized-load maxload msg remote-host) (let ((num-cpus (common:get-num-cpus remote-host))) (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host))) +;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load +;; +(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)) + (let* ((loadavg (common:get-cpu-load remote-host)) + (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again + (common:get-num-cpus remote-host) + numcpus-in)) + (maxload (if force-maxload + maxload-in + (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? + (first (car loadavg)) + (next (cadr loadavg)) + (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 + (loadjmp (- first next)) + (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously + (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload + ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp) + (cond + ((and (> first adjload) + (> count 0)) + (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) + (thread-sleep! adjwait) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) + ((and (> loadjmp numcpus) + (> count 0)) + (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) + (thread-sleep! adjwait) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) + (define (get-uname . params) (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) (uname #f)) (if (null? (car uname-res)) "unknown" Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1017,11 +1017,10 @@ (non-completed (filter (lambda (x) ;; remove hed from not completed list, duh, of course it is not completed! (not (equal? x hed))) (runs:calc-not-completed prereqs-not-met))) (loop-list (list hed tal reg reruns)) ;; configure the load runner - (numcpus (common:get-num-cpus #f)) (maxload (string->number (or (configf:lookup *configdat* "jobtools" "maxload") "3.0"))) ;; use a non-number string to disable (maxhomehostload (string->number (or (configf:lookup *configdat* "jobtools" "maxhomehostload") "2.0"))) ;; use a non-number string to disable (waitdelay (string->number (or (configf:lookup *configdat* "jobtools" "waitdelay") "60")))) (debug:print-info 4 *default-log-port* "have-resources: " have-resources " prereqs-not-met: (" (string-intersperse @@ -1119,14 +1118,19 @@ (not (member 'exclusive testmode))))) ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! *max-tries-hash* (make-hash-table)) + ;; well, first lets see if cpu load throttling is enabled. If so wait around until the ;; average cpu load is under the threshold before continuing - (if maxload ;; only gate if maxload is specified - (common:wait-for-cpuload maxload numcpus waitdelay)) + + ;; jobtools maxload is useful for where the full Megatest run is done on one machine + (if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized + (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) + + ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues (if maxhomehostload (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) (runs:incremental-print-results run-id) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -1,6 +1,5 @@ - ;; Copyright 2006-2017, Matthew Welland. ;; ;; This file is part of Megatest. ;; ;; Megatest is free software: you can redistribute it and/or modify @@ -151,11 +150,12 @@ (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) (setenv "TARGETHOST_LOGF" logfile) (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time - (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever + ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever + (common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit)) (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory)))