ADDED mtserve.scm
Index: mtserve.scm
==================================================================
--- /dev/null
+++ mtserve.scm
@@ -0,0 +1,266 @@
+;; Copyright 2006-2017, Matthew Welland.
+;;
+;; This file is part of Megatest.
+;;
+;; Megatest is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+;;
+;; Megatest is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with Megatest. If not, see .
+;;
+
+;; (include "common.scm")
+;; (include "megatest-version.scm")
+
+;; fake out readline usage of toplevel-command
+(define (toplevel-command . a) #f)
+
+(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (prefix base64 base64:)
+ readline apropos json directory-utils typed-records
+ srfi-18 extras format (prefix pkts pkts:))
+
+(declare (uses common))
+(declare (uses megatest-version))
+(declare (uses margs))
+(declare (uses server))
+;; (declare (uses daemon))
+
+(declare (uses db))
+(import db)
+
+(declare (uses portlogger))
+(import portlogger)
+
+(declare (uses nmsg-transport))
+(import (prefix nmsg-transport nmsg:))
+
+(include "common_records.scm")
+(include "key_records.scm")
+(include "db_records.scm")
+(include "run_records.scm")
+(include "megatest-fossil-hash.scm")
+
+(define *usage-log-file* #f) ;; put path to file for logging usage in this var in the ~/.megatestrc file
+(define *usage-use-seconds* #t) ;; for Epoc seconds in usage logging change this to #t in ~/.megatestrc file
+
+;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file
+;;
+(let ((debugcontrolf (conc (get-environment-variable "HOME") "/.mtserverrc")))
+ (if (file-exists? debugcontrolf)
+ (load debugcontrolf)))
+
+;; usage logging, careful with this, it is not designed to deal with all real world challenges!
+;;
+(if (and *usage-log-file*
+ (file-write-access? *usage-log-file*))
+ (with-output-to-file
+ *usage-log-file*
+ (lambda ()
+ (print
+ (if *usage-use-seconds*
+ (current-seconds)
+ (time->string
+ (seconds->local-time (current-seconds))
+ "%Yww%V.%w %H:%M:%S"))
+ " "
+ (current-user-name) " "
+ (current-directory) " "
+ "\"" (string-intersperse (argv) " ") "\""))
+ #:append))
+
+;; Disabled help items
+;; -rollup : (currently disabled) fill run (set by :runname) with latest test(s)
+;; from prior runs with same keys
+;; -daemonize : fork into background and disconnect from stdin/out
+
+(define help (conc "
+Megatest, documentation at http://www.kiatoa.com/fossils/megatest
+ version " megatest-version "
+ license GPL, Copyright Matt Welland 2006-2017
+
+Usage: mtserver [options]
+ -h : this help
+ -manual : show the Megatest user manual
+ -version : print megatest version (currently " megatest-version ")
+
+Launching and managing runs
+ -run : run all tests or as specified by -testpatt
+ -server -|hostname : start the server (reduces contention on megatest.db), use
+ - to automatically figure out hostname
+ -log logfile : send stdout and stderr to logfile
+ -list-servers : list the servers
+ -kill-servers : kill all servers
+ -repl : start a repl (useful for extending megatest)
+ -ping run-id|host:port : ping server, exit with 0 if found
+ -debug N|N,M,O... : enable debug 0-N or N and M and O ...
+
+Examples
+
+# Get test path, use '.' to get a single path or a specific path/file pattern
+megatest -test-files 'logs/*.log' -target ubuntu/n%/no% -runname w49% -testpatt test_mt%
+
+Called as " (string-intersperse (argv) " ") "
+Version " megatest-version ", built from " megatest-fossil-hash ))
+
+;; -gui : start a gui interface
+;; -config fname : override the runconfigs file with fname
+
+;; process args
+(define remargs (args:get-args
+ (argv)
+ (list "-start-dir"
+ "-server"
+ "-port"
+ "-log"
+ )
+ (list "-h" "-help" "--help"
+ "-manual"
+ "-version"
+ "-list-servers"
+ "-kill-servers"
+ "-v" ;; verbose 2, more than normal (normal is 1)
+ "-q" ;; quiet 0, errors/warnings only
+
+ "-diff-rep"
+ )
+ args:arg-hash
+ 0))
+
+;; Add args that use remargs here
+;;
+(if (and (not (null? remargs))
+ (not (or
+ (args:get-arg "-runstep")
+ (args:get-arg "-envcap")
+ (args:get-arg "-envdelta")
+ )
+ ))
+ (debug:print-error 0 *default-log-port* "Unrecognised arguments: " (string-intersperse (if (list? remargs) remargs (argv)) " ")))
+
+;; before doing anything else change to the start-dir if provided
+;;
+(if (args:get-arg "-start-dir")
+ (if (file-exists? (args:get-arg "-start-dir"))
+ (let ((fullpath (common:real-path (args:get-arg "-start-dir"))))
+ (setenv "PWD" fullpath)
+ (change-directory fullpath))
+ (begin
+ (debug:print-error 0 *default-log-port* "non-existant start dir " (args:get-arg "-start-dir") " specified, exiting.")
+ (exit 1))))
+
+;; immediately set MT_TARGET if -reqtarg or -target are available
+;;
+(let ((targ (or (args:get-arg "-reqtarg")(args:get-arg "-target"))))
+ (if targ (setenv "MT_TARGET" targ)))
+
+;; The watchdog is to keep an eye on things like db sync etc.
+;;
+
+;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
+(define *watchdog* (make-thread
+ (lambda ()
+ (handle-exceptions
+ exn
+ (begin
+ (print-call-chain)
+ (print " message: " ((condition-property-accessor 'exn 'message) exn)))
+ (common:watchdog)))
+ "Watchdog thread"))
+;;======================================================================
+;; Strive for clean exit handling
+;;======================================================================
+
+(define (server-exit-procedure)
+ (on-exit (lambda ()
+ ;; close the databases, ensure the pkt is removed!
+ 0)))
+
+;; Copied from the SDL2 examples.
+;;
+;; Schedule quit! to be automatically called when your program exits normally.
+(on-exit server-exit-procedure)
+
+;; Install a custom exception handler that will call quit! and then
+;; call the original exception handler. This ensures that quit! will
+;; be called even if an unhandled exception reaches the top level.
+(current-exception-handler
+ (let ((original-handler (current-exception-handler)))
+ (lambda (exception)
+ (server-exit-procedure)
+ (original-handler exception))))
+
+;;(if (not (args:get-arg "-server"))
+;; (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog
+(let* ((no-watchdog-args
+ '("-list-runs"
+ "-testdata-csv"
+ "-list-servers"
+ "-server"
+ "-list-disks"
+ "-list-targets"
+ "-show-runconfig"
+ ;;"-list-db-targets"
+ "-show-runconfig"
+ "-show-config"
+ "-show-cmdinfo"
+ "-cleanup-db"))
+ (no-watchdog-args-vals (filter (lambda (x) x)
+ (map args:get-arg no-watchdog-args)))
+ (start-watchdog (null? no-watchdog-args-vals)))
+ ;;(BB> "no-watchdog-args="no-watchdog-args "no-watchdog-args-vals="no-watchdog-args-vals)
+ (if start-watchdog
+ (thread-start! *watchdog*)))
+
+
+;; bracket open-output-file with code to make leading directory if it does not exist and handle exceptions
+(define (open-logfile logpath)
+ (condition-case
+ (let* ((log-dir (or (pathname-directory logpath) ".")))
+ (if (not (directory-exists? log-dir))
+ (system (conc "mkdir -p " log-dir)))
+ (open-output-file logpath))
+ (exn ()
+ (debug:print-error 0 *default-log-port* "Could not open log file for write: "logpath)
+ (define *didsomething* #t)
+ (exit 1))))
+
+;; this segment will run launch:setup only if -log is not set. This is fairly safe as servers are not
+;; manually started and thus should never be started in a non-megatest area. Thus no need to handle situation
+;; where (launch:setup) returns #f?
+;;
+(if (or (args:get-arg "-log")(args:get-arg "-server")) ;; redirect the log always when a server
+ (handle-exceptions
+ exn
+ (begin
+ (print "ERROR: Failed to switch to log output. " ((condition-property-accessor 'exn 'message) exn))
+ )
+ (let* ((tl (or (args:get-arg "-log")(launch:setup))) ;; run launch:setup if -server, ensure we do NOT run launch:setup if -log specified
+ (logf (or (args:get-arg "-log") ;; use -log unless we are a server, then craft a logfile name
+ (conc tl "/logs/server-" (current-process-id) "-" (get-host-name) ".log")))
+ (oup (open-logfile logf)))
+ (if (not (args:get-arg "-log"))
+ (hash-table-set! args:arg-hash "-log" logf)) ;; fake out future queries of -log
+ (debug:print-info 0 *default-log-port* "Sending log output to " logf)
+ (set! *default-log-port* oup))))
+
+(if (or (args:get-arg "-h")
+ (args:get-arg "-help")
+ (args:get-arg "--help"))
+ (begin
+ (print help)
+ (exit)))
+
+(if (args:get-arg "-version")
+ (begin
+ (print (common:version-signature)) ;; (print megatest-version)
+ (exit)))
+
+(define *didsomething* #f)
+
ADDED util.scm
Index: util.scm
==================================================================
--- /dev/null
+++ util.scm
@@ -0,0 +1,423 @@
+;;======================================================================
+;; Copyright 2018, Matthew Welland.
+;;
+;; This file is part of Megatest.
+;;
+;; Megatest is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+;;
+;; Megatest is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with Megatest. If not, see .
+;;
+;;======================================================================
+
+;;======================================================================
+;; Tests
+;;======================================================================
+
+(declare (unit util))
+
+(declare (uses common))
+
+(module util
+ *
+
+(import chicken scheme data-structures extras srfi-13 ports )
+
+
+;;======================================================================
+;; L O C K I N G M E C H A N I S M S
+;;======================================================================
+
+;; faux-lock is deprecated. Please use simple-lock below
+;;
+(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t))
+ (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count
+ (if (> wait-time 0)
+ (begin
+ (thread-sleep! 1)
+ (if (eq? wait-time 1) ;; only one second left, steal the lock
+ (begin
+ (debug:print-info 0 *default-log-port* "stealing lock for " keyname)
+ (common:faux-unlock keyname force: #t)))
+ (common:faux-lock keyname wait-time: (- wait-time 1)))
+ #f)
+ (begin
+ (rmt:no-sync-set keyname (conc (current-process-id)))
+ (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))))
+
+(define (common:faux-unlock keyname #!key (force #f))
+ (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))
+ (begin
+ (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname))
+ #t)
+ #f))
+
+;; simple lock. improve and converge on this one.
+;;
+(define (common:simple-lock keyname)
+ (rmt:no-sync-get-lock keyname))
+
+(define (common:run-a-command cmd #!key (with-vars #f))
+ (let* ((pre-cmd (dtests:get-pre-command))
+ (post-cmd (dtests:get-post-command))
+ (fullcmd (if (or pre-cmd post-cmd)
+ (conc pre-cmd cmd post-cmd)
+ (conc "viewscreen " cmd))))
+ (debug:print-info 02 *default-log-port* "Running command: " fullcmd)
+ (if with-vars
+ (common:without-vars cmd)
+ (common:without-vars fullcmd "MT_.*"))))
+
+;; ideally put all this info into the db, no need to preserve it across moving homehost
+;;
+;; return list of
+;; ( reachable? cpuload update-time )
+(define (common:get-host-info hostname)
+ (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data
+ (load (car loadinfo))
+ (load-sample-time (cdr loadinfo))
+ (load-sample-age (- (current-seconds) load-sample-time))
+ (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds
+ (host-last-update-timeout-seconds 4)
+ (host-rec (hash-table-ref/default *host-loads* hostname #f))
+ )
+ (cond
+ ((< load-sample-age loadinfo-timeout-seconds)
+ (list #t
+ load-sample-time
+ load))
+ ((and host-rec
+ (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds)))
+ (list #t
+ (host-last-update host-rec)
+ (host-last-cpuload host-rec )))
+ ((common:unix-ping hostname)
+ (list #t
+ (current-seconds)
+ (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds
+ (else
+ (list #f 0 -1) ;; bad host, don't use!
+ ))))
+
+(define (get-uname . params)
+ (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
+ (uname #f))
+ (if (null? (car uname-res))
+ "unknown"
+ (caar uname-res))))
+
+(define (get-unix-df path)
+ (let* ((df-results (process:cmd-run->list (conc "df " path)))
+ (space-rx (regexp "([0-9]+)\\s+([0-9]+)%"))
+ (freespc #f))
+ ;; (write df-results)
+ (for-each (lambda (l)
+ (let ((match (string-search space-rx l)))
+ (if match
+ (let ((newval (string->number (cadr match))))
+ (if (number? newval)
+ (set! freespc newval))))))
+ (car df-results))
+ freespc))
+
+;; given path get free space, allows override in [setup]
+;; with free-space-script /path/to/some/script.sh
+;;
+(define (get-df path)
+ (if (configf:lookup *configdat* "setup" "free-space-script")
+ (with-input-from-pipe
+ (conc (configf:lookup *configdat* "setup" "free-space-script") " " path)
+ (lambda ()
+ (let ((res (read-line)))
+ (if (string? res)
+ (string->number res)))))
+ (get-unix-df path)))
+
+(define (std-exit-procedure)
+ (on-exit (lambda () 0))
+ ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
+ (let ((no-hurry (if *time-to-exit* ;; hurry up
+ #f
+ (begin
+ (set! *time-to-exit* #t)
+ #t))))
+ (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
+ (if (and no-hurry (debug:debug-mode 18))
+ (rmt:print-db-stats))
+ (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds
+ (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated
+ (if *task-db*
+ (let ((db (cdr *task-db*)))
+ (if (sqlite3:database? db)
+ (begin
+ (sqlite3:interrupt! db)
+ (sqlite3:finalize! db #t)
+ ;; (vector-set! *task-db* 0 #f)
+ (set! *task-db* #f)))))
+ (http-client#close-all-connections!)
+ ;; (if (and *runremote*
+ ;; (remote-conndat *runremote*))
+ ;; (begin
+ ;; (http-client#close-all-connections!))) ;; for http-client
+ (if (not (eq? *default-log-port* (current-error-port)))
+ (close-output-port *default-log-port*))
+ (set! *default-log-port* (current-error-port))) "Cleanup db exit thread"))
+ (th2 (make-thread (lambda ()
+ (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...")
+ (if no-hurry
+ (begin
+ (thread-sleep! 5)) ;; give the clean up few seconds to do it's stuff
+ (begin
+ (thread-sleep! 2)))
+ (debug:print 4 *default-log-port* " ... done")
+ )
+ "clean exit")))
+ (thread-start! th1)
+ (thread-start! th2)
+ (thread-join! th1)
+ )
+ )
+
+ 0)
+
+;; from metadat lookup MEGATEST_VERSION
+;;
+(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB
+ (rmt:get-var "MEGATEST_VERSION"))
+
+(define (common:set-last-run-version)
+ (rmt:set-var "MEGATEST_VERSION" (common:version-signature)))
+
+;;======================================================================
+;; D I S K S P A C E
+;;======================================================================
+
+(define (common:get-disk-space-used fpath)
+ (with-input-from-pipe (conc "/usr/bin/du -s " fpath) read))
+
+(define (common:check-space-in-dir dirpath required)
+ (let* ((dbspace (if (directory? dirpath)
+ (get-df dirpath)
+ 0)))
+ (list (> dbspace required)
+ dbspace
+ required
+ dirpath)))
+
+;; check space in dbdir and in megatest dir
+;; returns: ok/not dbspace required-space
+;;
+(define (common:check-db-dir-space)
+ (let* ((required (string->number
+ (or (configf:lookup *configdat* "setup" "dbdir-space-required")
+ "100000")))
+ (dbdir (common:get-db-tmp-area)) ;; (db:get-dbdir))
+ (tdbspace (common:check-space-in-dir dbdir required))
+ (mdbspace (common:check-space-in-dir *toppath* required)))
+ (sort (list tdbspace mdbspace) (lambda (a b)
+ (< (cadr a)(cadr b))))))
+
+;; check available space in dbdir, exit if insufficient
+;;
+(define (common:check-db-dir-and-exit-if-insufficient)
+ (let* ((spacedat (car (common:check-db-dir-space))) ;; look only at worst for now
+ (is-ok (car spacedat))
+ (dbspace (cadr spacedat))
+ (required (caddr spacedat))
+ (dbdir (cadddr spacedat)))
+ (if (not is-ok)
+ (begin
+ (debug:print-error 0 *default-log-port* "Insufficient space in " dbdir ", require " required ", have " dbspace ", exiting now.")
+ (exit 1)))))
+
+;; paths is list of lists ((name path) ... )
+;;
+(define (common:get-disk-with-most-free-space disks minsize)
+ (let ((best #f)
+ (bestsize 0))
+ (for-each
+ (lambda (disk-num)
+ (let* ((dirpath (cadr (assoc disk-num disks)))
+ (freespc (cond
+ ((not (directory? dirpath))
+ (if (common:low-noise-print 300 "disks not a dir " disk-num)
+ (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it."))
+ -1)
+ ((not (file-write-access? dirpath))
+ (if (common:low-noise-print 300 "disks not writeable " disk-num)
+ (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it."))
+ -1)
+ ((not (eq? (string-ref dirpath 0) #\/))
+ (if (common:low-noise-print 300 "disks not a proper path " disk-num)
+ (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it."))
+ -1)
+ (else
+ (get-df dirpath)))))
+ (if (> freespc bestsize)
+ (begin
+ (set! best (cons disk-num dirpath))
+ (set! bestsize freespc)))))
+ (map car disks))
+ (if (and best (> bestsize minsize))
+ best
+ #f))) ;; #f means no disk candidate found
+
+;; see defstruct host at top of file.
+;; host: reachable last-update last-used last-cpuload
+;;
+(define (common:update-host-loads-table hosts-raw)
+ (let* ((hosts (filter (lambda (x)
+ (string-match (regexp "^\\S+$") x))
+ hosts-raw)))
+ (for-each
+ (lambda (hostname)
+ (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f)))
+ (if h
+ h
+ (let ((h (make-host)))
+ (hash-table-set! *host-loads* hostname h)
+ h))))
+ (host-info (common:get-host-info hostname))
+ (is-reachable (car host-info))
+ (last-reached-time (cadr host-info))
+ (load (caddr host-info)))
+ (host-reachable-set! rec is-reachable)
+ (host-last-update-set! rec last-reached-time)
+ (host-last-cpuload-set! rec load)))
+ hosts)))
+
+;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the
+;; [host-rules] section.
+;;
+(define (common:get-least-loaded-host hosts-raw host-type configdat)
+ (let* ((rdat (configf:lookup configdat "host-rules" host-type))
+ (rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate
+ (maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load
+ (maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs
+ (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second
+ (hosts (filter (lambda (x)
+ (string-match (regexp "^\\S+$") x))
+ hosts-raw))
+ ;; (best-host #f)
+ (get-rec (lambda (hostname)
+ ;; (print "get-rec hostname=" hostname)
+ (let ((h (hash-table-ref/default *host-loads* hostname #f)))
+ (if h
+ h
+ (let ((h (make-host)))
+ (hash-table-set! *host-loads* hostname h)
+ h)))))
+ (best-load 99999)
+ (curr-time (current-seconds))
+ (get-hosts-sorted (lambda (hosts)
+ (sort hosts (lambda (a b)
+ (let ((a-rec (get-rec a))
+ (b-rec (get-rec b)))
+ ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec))
+ ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec))
+ (< (host-last-used a-rec)
+ (host-last-used b-rec))))))))
+ (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts))
+ (if (null? hosts)
+ #f ;; no hosts to select from. All done and giving up now.
+ (let ((hosts-sorted (get-hosts-sorted hosts)))
+ (common:update-host-loads-table hosts)
+ (let loop ((hostname (car hosts-sorted))
+ (tal (cdr hosts-sorted))
+ (best-host #f))
+ (let* ((rec (get-rec hostname))
+ (reachable (host-reachable rec))
+ (load (host-last-cpuload rec))
+ (last-used (host-last-used rec))
+ (delta (- curr-time last-used))
+ (job-rate (if (> delta 0)
+ (/ 1 delta)
+ 999)) ;; jobs per second
+ (new-best
+ (cond
+ ((not reachable)
+ (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.")
+ best-host)
+ ((and (< load maxnload) ;; load is acceptable
+ (< job-rate maxjobrate)) ;; job rate is acceptable
+ (set! best-load load)
+ hostname)
+ (else best-host))))
+ (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." )
+ (if new-best
+ (begin ;; found a host, return it
+ (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
+ (host-last-used-set! rec curr-time)
+ new-best)
+ (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))
+
+(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f))
+ (let* ((loadavg (common:get-cpu-load remote-host))
+ (numcpus (if (< 1 numcpus-in) ;; not possible
+ (common:get-num-cpus remote-host)
+ numcpus-in))
+ (maxload (max maxload-in 0.5)) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
+ (first (car loadavg))
+ (next (cadr loadavg))
+ (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
+ (loadjmp (- first next))
+ (adjwait (+ (random 10)(/ (- 1000 count) 10) waitdelay))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
+ (cond
+ ((and (> first adjload)
+ (> count 0))
+ (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
+ (thread-sleep! adjwait)
+ (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
+ ((and (> loadjmp numcpus)
+ (> count 0))
+ (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))
+ (thread-sleep! adjwait)
+ (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)))))
+
+(define (common:wait-for-homehost-load maxload msg)
+ (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
+ #f
+ (common:get-homehost)))
+ (hh (if hh-dat (car hh-dat) #f))
+ (numcpus (common:get-num-cpus hh)))
+ (common:wait-for-normalized-load maxload msg hh)))
+
+(define (common:get-num-cpus remote-host)
+ (let* ((actual-host (or remote-host (get-host-name))))
+ (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often!
+ (let* ((proc (lambda ()
+ (let loop ((numcpu 0)
+ (inl (read-line)))
+ (if (eof-object? inl)
+ (begin
+ (common:write-cached-info remote-host "num-cpus" numcpu)
+ numcpu)
+ (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
+ (+ numcpu 1)
+ numcpu)
+ (read-line))))))
+ (result (if remote-host
+ (with-input-from-pipe
+ (conc "ssh " remote-host " cat /proc/cpuinfo")
+ proc)
+ (with-input-from-file "/proc/cpuinfo" proc))))
+ (common:write-cached-info actual-host "num-cpus" result)
+ result))))
+
+;; wait for normalized cpu load to drop below maxload
+;;
+(define (common:wait-for-normalized-load maxload msg remote-host)
+ (let ((num-cpus (common:get-num-cpus remote-host)))
+ (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host)))
+
+
+)
+