Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -461,10 +461,13 @@ (map common:to-alist (hash-table->alist dat))) (else (if dat dat "")))) + +(define (common:alist-ref/default key alist default) + (or (alist-ref key alist) default)) (define (common:low-noise-print waitval . keys) (let* ((key (string-intersperse (map conc keys) "-" )) (lasttime (hash-table-ref/default *common:denoise* key 0)) (currtime (current-seconds))) @@ -1363,10 +1366,33 @@ new-rownames new-colnames (if (> curr-rownum rownum) curr-rownum rownum) (if (> curr-colnum colnum) curr-colnum colnum) )))))) + +;; if it looks like a number -> convert it to a number, else return it +;; +(define (common:lazy-convert inval) + (let* ((as-num (if (string? inval)(string->number inval) #f))) + (or as-num inval))) + +;; convert string a=1; b=2; c=a silly thing; d= +;; to '((a . 1)(b . 2)(c . "a silly thing")(d . "")) +;; +(define (common:val->alist val #!key (convert #f)) + (let ((val-list (string-split-fields ";\\s*" val #:infix))) + (if val-list + (map (lambda (x) + (let ((f (string-split-fields "\\s*=\\s*" x #:infix))) + (case (length f) + ((0) `(,#f)) ;; null string case + ((1) `(,(string->symbol (car f)))) + ((2) `(,(string->symbol (car f)) . ,(let ((inval (cadr f))) + (if convert (common:lazy-convert inval) inval)))) + (else f)))) + val-list) + '()))) ;;====================================================================== ;; S Y S T E M S T U F F ;;====================================================================== @@ -1540,16 +1566,16 @@ ;; ideally put all this info into the db, no need to preserve it across moving homehost ;; ;; return list of ;; ( reachable? cpuload update-time ) (define (common:get-host-info hostname) - (let* ((loadinfo (rmt:get-latest-host-load hostname)) - (load (car loadinfo)) - (load-sample-time (cdr loadinfo)) - (load-sample-age (- (current-seconds) load-sample-time)) - (loadinfo-timeout-seconds 20) - (host-last-update-timeout-seconds 10) + (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data + (load (car loadinfo)) + (load-sample-time (cdr loadinfo)) + (load-sample-age (- (current-seconds) load-sample-time)) + (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds + (host-last-update-timeout-seconds 4) (host-rec (hash-table-ref/default *host-loads* hostname #f)) ) (cond ((< load-sample-age loadinfo-timeout-seconds) (list #t @@ -1561,14 +1587,18 @@ (host-last-update host-rec) (host-last-cpuload host-rec ))) ((common:unix-ping hostname) (list #t (current-seconds) - (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) + (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds (else - (list #f 0 -1))))) - + (list #f 0 -1) ;; bad host, don't use! + )))) + +;; see defstruct host at top of file. +;; host: reachable last-update last-used last-cpuload +;; (define (common:update-host-loads-table hosts-raw) (let* ((hosts (filter (lambda (x) (string-match (regexp "^\\S+$") x)) hosts-raw))) (for-each @@ -1586,37 +1616,74 @@ (host-reachable-set! rec is-reachable) (host-last-update-set! rec last-reached-time) (host-last-cpuload-set! rec load))) hosts))) -(define (common:get-least-loaded-host hosts-raw) - (let* ((hosts (filter (lambda (x) - (string-match (regexp "^\\S+$") x)) - hosts-raw)) - (best-host #f) +;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the +;; [host-rules] section. +;; +(define (common:get-least-loaded-host hosts-raw host-type configdat) + (let* ((rdat (configf:lookup configdat "host-rules" host-type)) + (rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate + (maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load + (maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs + (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second + (hosts (filter (lambda (x) + (string-match (regexp "^\\S+$") x)) + hosts-raw)) + ;; (best-host #f) + (get-rec (lambda (hostname) + ;; (print "get-rec hostname=" hostname) + (let ((h (hash-table-ref/default *host-loads* hostname #f))) + (if h + h + (let ((h (make-host))) + (hash-table-set! *host-loads* hostname h) + h))))) (best-load 99999) - (curr-time (current-seconds))) - (common:update-host-loads-table hosts) - (for-each - (lambda (hostname) - (let* ((rec - (let ((h (hash-table-ref/default *host-loads* hostname #f))) - (if h - h - (let ((h (make-host))) - (hash-table-set! *host-loads* hostname h) - h)))) - (reachable (host-reachable rec)) - (load (host-last-cpuload rec))) - (cond - ((not reachable) #f) - ((< (+ load (/ (random 250) 1000)) ;; add a random factor to keep from getting in a rut - (+ best-load (/ (random 250) 1000)) ) - (set! best-load load) - (set! best-host hostname))))) - hosts) - best-host)) + (curr-time (current-seconds)) + (get-hosts-sorted (lambda (hosts) + (sort hosts (lambda (a b) + (let ((a-rec (get-rec a)) + (b-rec (get-rec b))) + ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec)) + ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec)) + (< (host-last-used a-rec) + (host-last-used b-rec)))))))) + (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts)) + (if (null? hosts) + #f ;; no hosts to select from. All done and giving up now. + (let ((hosts-sorted (get-hosts-sorted hosts))) + (common:update-host-loads-table hosts) + (let loop ((hostname (car hosts-sorted)) + (tal (cdr hosts-sorted)) + (best-host #f)) + (let* ((rec (get-rec hostname)) + (reachable (host-reachable rec)) + (load (host-last-cpuload rec)) + (last-used (host-last-used rec)) + (delta (- curr-time last-used)) + (job-rate (if (> delta 0) + (/ 1 delta) + 999)) ;; jobs per second + (new-best + (cond + ((not reachable) + (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.") + best-host) + ((and (< load maxnload) ;; load is acceptable + (< job-rate maxjobrate)) ;; job rate is acceptable + (set! best-load load) + hostname) + (else best-host)))) + (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." ) + (if new-best + (begin ;; found a host, return it + (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) + (host-last-used-set! rec curr-time) + new-best) + (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)) (let* ((loadavg (common:get-cpu-load remote-host)) (numcpus (if (< 1 numcpus-in) ;; not possible (common:get-num-cpus remote-host) @@ -2376,10 +2443,16 @@ ;; ;; [host-types] ;; general #MTLOWESTLOAD #{g hosts allhosts} ;; arm #MTLOWESTLOAD #{g hosts arm} ;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo +;; +;; [host-rules] +;; # maxnload => max normalized load +;; # maxnjobs => max jobs per cpu +;; # maxjobrate => max jobs per second +;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 ;; ;; [launchers] ;; envsetup general ;; xor/%/n 4C16G ;; % nbgeneral @@ -2406,12 +2479,23 @@ (let ((launcher (configf:lookup configdat "host-types" host-type))) (if launcher (let* ((launcher-parts (string-split launcher)) (launcher-exe (car launcher-parts))) (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline - (let ((targ-host (common:get-least-loaded-host (cdr launcher-parts)))) - (conc "remrun " targ-host)) + (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) + (count 100)) + (if targ-host + (conc "remrun " targ-host) + (if (> count 0) + (begin + (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) + (thread-sleep! (- 101 count)) + (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) + (- count 1))) + (begin + (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) + (exit))))) launcher)) (begin (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) (if (null? tal) fallback-launcher Index: mtut.scm ================================================================== --- mtut.scm +++ mtut.scm @@ -265,22 +265,11 @@ (define (megatest-param->mtutil-param param) (let* ((mapping-alist (common:get-param-mapping flavor: 'switch-symbol))) (alist-ref (string->symbol param) mapping-alist eq? param) param)) -(define (val->alist val) - (let ((val-list (string-split-fields ";\\s*" val #:infix))) - (if val-list - (map (lambda (x) - (let ((f (string-split-fields "\\s*=\\s*" x #:infix))) - (case (length f) - ((0) `(,#f)) ;; null string case - ((1) `(,(string->symbol (car f)))) - ((2) `(,(string->symbol (car f)) . ,(cadr f))) - (else f)))) - val-list) - '()))) +(define val->alist common:val->alist) (define (push-run-spec torun contour runkey spec) (configf:section-var-set! torun contour runkey (cons spec (or (configf:lookup torun contour runkey) @@ -547,11 +536,11 @@ ;; (define (create-run-pkt mtconf action area runkey target runname mode-patt tag-expr pktsdir reason contour sched dbdest append-conf runtrans) (let* ((good-val (lambda (inval)(and inval (string? inval)(not (string-null? inval))))) - (area-dat (val->alist (or (configf:lookup mtconf "areas" area) ""))) + (area-dat (common:val->alist (or (configf:lookup mtconf "areas" area) ""))) (area-path (alist-ref 'path area-dat)) ;; (area-xlatr (alist-ref 'targtrans area-dat)) ;; (xlatr-key (if area-xlatr (string->symbol area-xlatr) #f)) (new-runname (let* ((callname (if (string? runtrans)(string->symbol runtrans) #f)) (mapper (if callname (hash-table-ref/default *runname-mappers* callname #f) #f))) @@ -671,11 +660,11 @@ (len-key (length keyparts)) (ruletype (if (> len-key 1)(cadr keyparts) #f)) (action (if (> len-key 2)(caddr keyparts) #f)) (optional (if (> len-key 3)(cadddr keyparts) #f)) ;; (val-list (string-split-fields ";\\s*" val #:infix)) ;; (string-split val)) ;; runname-rule params - (val-alist (val->alist val)) + (val-alist (common:val->alist val)) (runname (make-runname "" "")) (runtrans (alist-ref 'runtrans val-alist)) ;; these may or may not be defined and not all are used in each handler type in the case below (run-name (alist-ref 'run-name val-alist)) @@ -916,11 +905,11 @@ ;; now have to run populated (for-each (lambda (contour) (let* ((cval (or (configf:lookup mtconf "contours" contour) "")) - (cval-alist (val->alist cval)) ;; BEWARE ... NOT the same val-alist as above! + (cval-alist (common:val->alist cval)) ;; BEWARE ... NOT the same val-alist as above! (areas (val-alist->areas cval-alist)) (selector (alist-ref 'selector cval-alist)) (mode-tag (and selector (string-split-fields "/" selector #:infix))) (mode-patt (and mode-tag (if (eq? (length mode-tag) 2)(cadr mode-tag) #f))) (tag-expr (and mode-tag (if (null? mode-tag) #f (car mode-tag))))) @@ -934,11 +923,11 @@ (lambda (runkeydat) (for-each (lambda (area) (if (area-allowed? area areas runkey contour mode-patt) ;; is this area to be handled (from areas=a,b,c OR using areafn=abcfn and *area-checks* ...) (let* ((aval (or (configf:lookup mtconf "areas" area) "")) - (aval-alist (val->alist aval)) + (aval-alist (common:val->alist aval)) (runname (alist-ref 'runname runkeydat)) (runtrans (alist-ref 'runtrans runkeydat)) (reason (alist-ref 'message runkeydat)) (sched (alist-ref 'sched runkeydat)) @@ -1113,11 +1102,11 @@ ((run remove rerun set-ss archive kill list) (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir"))) (mtconf (car mtconfdat)) (area (args:get-arg "-area")) ;; look up the area to dispatch to from [areas] section (areasec (if area (configf:lookup mtconf "areas" area) #f)) - (areadat (if areasec (val->alist areasec) #f)) + (areadat (if areasec (common:val->alist areasec) #f)) (area-path (if areadat (alist-ref 'path areadat) #f)) (pktsdirs (configf:lookup mtconf "setup" "pktsdirs")) (pktsdir (if pktsdirs (car (string-split pktsdirs " ")) #f)) (adjargs (hash-table-copy args:arg-hash)) (new-ss (args:get-arg "-new")))