Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -37,11 +37,11 @@ diff-report.scm cgisetup/models/pgdb.scm # module source files MSRCFILES = dbfile.scm debugprint.scm mtargs.scm commonmod.scm dbmod.scm \ tcp-transportmod.scm rmtmod.scm portlogger.scm apimod.scm \ - configfmod.scm processmod.scm + configfmod.scm processmod.scm servermod.scm transport-mode.scm : transport-mode.scm.template cp transport-mode.scm.template transport-mode.scm dashboard-transport-mode.scm : dashboard-transport-mode.scm.template @@ -53,10 +53,11 @@ # dbmod.import.o is just a hack here mofiles/portlogger.o : mofiles/dbmod.o process.o : mofiles/processmod.o mofiles/configfmod.o : mofiles/processmod.o mofiles/processmod.o : mofiles/commonmod.o +mofiles/servermod.o : mofiles/commonmod.o mofiles/dbfile.o : \ mofiles/debugprint.o mofiles/commonmod.o mofiles/apimod.o : mofiles/commonmod.o mofiles/tcp-transportmod.o mofiles/configfmod.o mofiles/dbmod.o : mofiles/dbfile.o Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -107,12 +107,10 @@ (normal-proc cmd run-id params) ;; numthreads must be greater than 5 for busy (* 0.1 (- numthreads maxthreads)) ;; was 15 - return a number for the remote to delay )) ;; (- numthreads 29)) ;; call back in as many seconds ((loaded) - ;; (if (eq? (rmt:transport-mode) 'tcp) - ;; (thread-sleep! 0.5)) (normal-proc cmd run-id params)) (else (normal-proc cmd run-id params)))) (meta (case cmd ((ping) `((sstate . ,server-state))) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -45,2938 +45,13 @@ rmtmod (prefix mtargs args:)) (include "common_records.scm") -(define (remove-files filespec) - (let ((files (glob filespec))) - (for-each delete-file files))) - -(define (stop-the-train) - (thread-start! (make-thread (lambda () - (let loop () - (if (and *toppath* - (file-exists? 
(conc *toppath*"/stop-the-train"))) - (let* ((msg (conc "ERROR: found file "*toppath*"/stop-the-train, exiting immediately"))) - ;; yes, print to current-output-port AND *default-log-port*, annoying but necessary I think - (print msg) - (debug:print 0 *default-log-port* msg) - (remove-files (conc *toppath* "/logs/server*")) - (remove-files (conc *toppath* "/.servinfo/*")) - (remove-files (conc *toppath* "/.mtdb/*lock")) - (exit 1))) - (thread-sleep! 5) - (loop)))))) - -;; execute thunk, return value. If exception thrown, trap exception, return #f, and emit nonfatal condition note to *default-log-port* . -;; arguments - thunk, message -(define (common:fail-safe thunk warning-message-on-exception) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "notable but nonfatal condition - "warning-message-on-exception", exn=" exn) - (debug:print-info 0 *default-log-port* - (string-substitute "\n?Error:" "nonfatal condition:" - (with-output-to-string - (lambda () - (print-error-message exn) )))) - (debug:print-info 0 *default-log-port* " -- continuing after nonfatal condition...") - #f) - (thunk))) - - -;; returns list of fd count, socket count -(define (get-file-descriptor-count #!key (pid (current-process-id ))) - (list - (length (glob (conc "/proc/" pid "/fd/*"))) - (length (filter identity (map socket? (glob (conc "/proc/" pid "/fd/*"))))) - ) -) - - - -;; GLOBALS - -;; CONTEXTS -(defstruct cxt - (taskdb #f) - (cmutex (make-mutex))) -;; (define *contexts* (make-hash-table)) -;; (define *context-mutex* (make-mutex)) - -;; ;; safe method for accessing a context given a toppath -;; ;; -;; (define (common:with-cxt toppath proc) -;; (mutex-lock! *context-mutex*) -;; (let ((cxt (hash-table-ref/default *contexts* toppath #f))) -;; (if (not cxt) -;; (set! cxt (let ((x (make-cxt)))(hash-table-set! *contexts* toppath x) x))) -;; (let ((cxt-mutex (cxt-mutex cxt))) -;; (mutex-unlock! *context-mutex*) -;; (mutex-lock! 
cxt-mutex) -;; (let ((res (proc cxt))) -;; (mutex-unlock! cxt-mutex) -;; res)))) - -;; A hash table that can be accessed by #{scheme ...} calls in -;; config files. Allows communicating between confgs -;; -(define *user-hash-data* (make-hash-table)) - -(define *db-keys* #f) - -(define *pkts-info* (make-hash-table)) ;; store stuff like the last parent here -(define *configinfo* #f) ;; raw results from setup, includes toppath and table from megatest.config -(define *runconfigdat* #f) ;; run configs data -(define *configdat* #f) ;; megatest.config data -(define *configstatus* #f) ;; status of data; 'fulldata : all processing done, #f : no data yet, 'partialdata : partial read done -;; (define *toppath* #f) ;; moved to commonmod -(define *already-seen-runconfig-info* #f) - -(define *test-meta-updated* (make-hash-table)) -(define *globalexitstatus* 0) ;; attempt to work around possible thread issues -(define *passnum* 0) ;; when running track calls to run-tests or similar -;; (define *alt-log-file* #f) ;; used by -log -;; (define *common:denoise* (make-hash-table)) ;; for low noise printing -(define *default-log-port* (current-error-port)) -(define *time-zero* (current-seconds)) ;; for the watchdog -(define *on-exit-procs* '()) ;; add procs to this list to be executed on exit -(define *default-area-tag* "local") - -;; DATABASE -;; db access -(define *db-last-access* (current-seconds)) ;; last db access, used in server -;; (define *db-write-access* #t) -;; db sync -;; (define *db-last-sync* 0) ;; last time the sync to megatest.db happened -(define *db-sync-in-progress* #f) ;; if there is a sync in progress do not try to start another -;; (define *db-multi-sync-mutex* (make-mutex)) ;; protect access to *db-sync-in-progress*, *db-last-sync* -;; task db -(define *task-db* #f) ;; (vector db path-to-db) -(define *db-access-allowed* #t) ;; flag to allow access -;; (define *db-access-mutex* (make-mutex)) ;; moved to dbfile -;; (define *db-transaction-mutex* (make-mutex)) 
-(define *db-cache-path* #f) -;; (define *db-with-db-mutex* (make-mutex)) -(define *db-api-call-time* (make-hash-table)) ;; hash of command => (list of times) - -;; SERVER -(define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg -(define *runremote* #f) ;; if set up for server communication this will hold -;; (define *max-cache-size* 0) -(define *logged-in-clients* (make-hash-table)) -(define *server-id* #f) -;; (define *server-info* #f) ;; good candidate for easily convert to non-global -(define *time-to-exit* #f) -(define *run-id* #f) -(define *server-kind-run* (make-hash-table)) -(define *home-host* #f) -;; (define *total-non-write-delay* 0) -(define *heartbeat-mutex* (make-mutex)) -;; (define *api-process-request-count* 0) -;; (define *max-api-process-requests* 0) -(define *server-overloaded* #f) - -;; client -(define *rmt-mutex* (make-mutex)) ;; remote access calls mutex - -;; RPC transport -(define *rpc:listener* #f) - -;; KEY info -(define *target* (make-hash-table)) ;; cache the target here; target is keyval1/keyval2/.../keyvalN -(define *keys* (make-hash-table)) ;; cache the keys here -(define *keyvals* (make-hash-table)) -(define *toptest-paths* (make-hash-table)) ;; cache toptest path settings here -(define *test-paths* (make-hash-table)) ;; cache test-id to test run paths here -(define *test-ids* (make-hash-table)) ;; cache run-id, testname, and item-path => test-id -(define *test-info* (make-hash-table)) ;; cache the test info records, update the state, status, run_duration etc. 
from testdat.db - -(define *run-info-cache* (make-hash-table)) ;; run info is stable, no need to reget -(define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set -(define *homehost-mutex* (make-mutex)) - -;; Miscellaneous -(define *triggers-mutex* (make-mutex)) ;; block overlapping processing of triggers -(define *numcpus-cache* (make-hash-table)) - -;; this plugs a hole in posix-extras in recent chicken versions > 4.9) -(let-values (( (chicken-release-number chicken-major-version) - (apply values - (map string->number - (take - (string-split (chicken-version) ".") - 2))))) - (let ((resolve-pathname-broken? - (or (> chicken-release-number 4) - (and (eq? 4 chicken-release-number) (> chicken-major-version 9))))) - (if resolve-pathname-broken? - (define ##sys#expand-home-path pathname-expand)))) - -(define (realpath x) (resolve-pathname (pathname-expand (or x "/dev/null")) )) - -(define (common:get-this-exe-fullpath #!key (argv (argv))) - (let* ((this-script - (cond - ((and (> (length argv) 2) - (string-match "^(.*/csi|csi)$" (car argv)) - (string-match "^-(s|ss|sx|script)$" (cadr argv))) - (caddr argv)) - (else (car argv)))) - (fullpath (realpath this-script))) - fullpath)) - -;;====================================================================== - -(define *common:this-exe-fullpath* (common:get-this-exe-fullpath)) -(define *common:this-exe-dir* (pathname-directory *common:this-exe-fullpath*)) -(define *common:this-exe-name* (pathname-strip-directory *common:this-exe-fullpath*)) - -(define (common:get-sync-lock-filepath) - (let* ((tmp-area (common:make-tmpdir-name *toppath* "")) - (lockfile (conc tmp-area "/megatest.db.lock"))) - lockfile)) - -(define *common:logpro-exit-code->status-sym-alist* - '( ( 0 . pass ) - ( 1 . fail ) - ( 2 . warn ) - ( 3 . check ) - ( 4 . waived ) - ( 5 . abort ) - ( 6 . 
skip ))) - -(define (common:logpro-exit-code->status-sym exit-code) - (or (alist-ref exit-code *common:logpro-exit-code->status-sym-alist*) 'fail)) - -(define (common:worse-status-sym ss1 ss2) - (let loop ((status-syms-remaining '(abort fail check skip warn waived pass))) - (cond - ((null? status-syms-remaining) - 'fail) - ((eq? (car status-syms-remaining) ss1) - ss1) - ((eq? (car status-syms-remaining) ss2) - ss2) - (else - (loop (cdr status-syms-remaining)))))) - -(define (common:steps-can-proceed-given-status-sym status-sym) - (if (member status-sym '(warn waived pass)) - #t - #f)) - -(define (status-sym->string status-sym) - (case status-sym - ((pass) "PASS") - ((fail) "FAIL") - ((warn) "WARN") - ((check) "CHECK") - ((waived) "WAIVED") - ((abort) "ABORT") - ((skip) "SKIP") - (else "FAIL"))) - -(define (common:logpro-exit-code->test-status exit-code) - (status-sym->string (common:logpro-exit-code->status-sym exit-code))) - -;; -(defstruct remote - - ;; transport to be used - ;; http - use http-transport - ;; http-read-cached - use http-transport for writes but in-mem cached for reads - (rmode 'http) - (hh-dat (let ((res (or (server:choose-server *toppath* 'homehost) - (cons #f #f)))) - (assert (pair? 
res)(conc "FATAL: hh-dat should be a pair, got "res)) - res)) - (server-url #f) ;; (server:check-if-running *toppath*) #f)) - (server-id #f) - (server-info #f) ;; (if *toppath* (server:check-if-running *toppath*) #f)) - (last-server-check 0) ;; last time we checked to see if the server was alive - (connect-time (current-seconds)) ;; when we first connected - (last-access (current-seconds)) ;; last time we talked to server - ;; (conndat #f) ;; iface port api-uri api-url api-req seconds server-id - (server-timeout (server:expiration-timeout)) - (force-server #f) - (ro-mode #f) - (ro-mode-checked #f) ;; flag that indicates we have checked for ro-mode - - ;; conndat stuff - (iface #f) ;; TODO: Consolidate this data with server-url and server-info above - (port #f) - (api-url #f) - (api-uri #f) - (api-req #f)) - -;; launching and hosts -(defstruct host - (reachable #f) - (last-update 0) - (last-used 0) - (last-cpuload 1)) - -(define *host-loads* (make-hash-table)) - -;; cache environment vars for each run here -(define *env-vars-by-run-id* (make-hash-table)) - -;; Testconfig and runconfig caches. -(define *testconfigs* (make-hash-table)) ;; test-name => testconfig -(define *runconfigs* (make-hash-table)) ;; target => runconfig - -;; This is a cache of pre-reqs met, don't re-calc in cases where called with same params less than -;; five seconds ago -(define *pre-reqs-met-cache* (make-hash-table)) - -;; cache of verbosity given string -;; -(define *verbosity-cache* (make-hash-table)) - -(define (common:clear-caches) - (set! *target* (make-hash-table)) - (set! *keys* (make-hash-table)) - (set! *keyvals* (make-hash-table)) - (set! *toptest-paths* (make-hash-table)) - (set! *test-paths* (make-hash-table)) - (set! *test-ids* (make-hash-table)) - (set! *test-info* (make-hash-table)) - (set! *run-info-cache* (make-hash-table)) - (set! *env-vars-by-run-id* (make-hash-table)) - (set! 
*test-id-cache* (make-hash-table))) - -;; Generic string database -(define sdb:qry #f) ;; (make-sdb:qry)) ;; 'init #f) -;; Generic path database -(define *fdb* #f) - -(define *last-launch* (current-seconds)) ;; use for throttling the launch rate. Would be better to use the db and last time of a test in LAUNCHED state. - -;;====================================================================== -;; V E R S I O N -;;====================================================================== - -(define (common:get-full-version) - (conc megatest-version "-" megatest-fossil-hash)) - -(define (common:version-signature) - (conc megatest-version "-" (substring megatest-fossil-hash 0 4))) - -;;====================================================================== -;; from metadat lookup MEGATEST_VERSION -;; -(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB - (rmt:get-var "MEGATEST_VERSION")) - -(define (common:get-last-run-version-number) - (string->number - (substring (common:get-last-run-version) 0 6))) - -(define (common:set-last-run-version) - (rmt:set-var "MEGATEST_VERSION" (common:version-signature))) - -;;====================================================================== -;; postive number if megatest version > db version -;; negative number if megatest version < db version -(define (common:version-db-delta) - (- megatest-version (common:get-last-run-version-number))) - -(define (common:version-changed?) - (not (equal? (common:get-last-run-version) - (common:version-signature)))) - - -;; From 1.70 to 1.80, db's are compatible. - -(define (common:api-changed?) - (let* ( - (megatest-major-version (substring (->string megatest-version) 0 4)) - (run-major-version (substring (conc (common:get-last-run-version)) 0 4)) - ) - (and (not (equal? megatest-major-version "1.80")) - (not (equal? 
megatest-major-version megatest-run-version))) - ) -) - -;;====================================================================== -;; Move me elsewhere ... -;; RADT => Why do we meed the version check here, this is called only if version misma -;; -(define (common:cleanup-db dbstruct #!key (full #f)) - (case (rmt:transport-mode) - ((http) - (apply db:multi-db-sync - dbstruct - 'schema - 'killservers - 'adj-target - 'new2old - '(dejunk) - )) - ((tcp nfs) - (apply db:multi-db-sync - dbstruct - 'schema - 'killservers - 'adj-target - 'new2old - '(dejunk) - ))) - (if (common:api-changed?) - (common:set-last-run-version))) - -(define (common:snapshot-file filepath #!key (subdir ".") ) - (if (file-exists? filepath) - (let* ((age-sec (lambda (file) - (if (file-exists? file) - (- (current-seconds) (file-modification-time file)) - 1000000000))) ;; return really old value if file doesn't exist. we want to clobber it if old or not exist. - (ok-flag #t) - (age-mins (lambda (file) (/ (age-sec file) 60))) - (age-hrs (lambda (file) (/ (age-mins file) 60))) - (age-days (lambda (file) (/ (age-hrs file) 24))) - (age-wks (lambda (file) (/ (age-days file) 7))) - (docmd (lambda (cmd) - (cond - (ok-flag - (let ((res (system cmd))) - (cond - ((eq? 0 res) - #t) - (else - (set! ok-flag #f) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Command failed with exit code " - (if (< res 0) - res - (/ res 8)) " ["cmd"]" ) - #f)))) - (else - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Not runnining command due to prior error. 
["cmd"]") - #f)))) - (copy (lambda (src dest) (docmd (conc "/bin/cp '"src"' '"dest"'")))) - (copy+zip (lambda (src dest) (docmd (conc "gzip -c - < '"src"' > '"dest"'")))) - (fullpath (realpath filepath)) - (basedir (pathname-directory fullpath)) - (basefile (pathname-strip-directory fullpath)) - ;;(prevfile (conc filepath ".prev.gz")) - (minsfile (conc basedir "/" subdir "/" basefile ".mins.gz")) - (hrsfile (conc basedir "/" subdir "/" basefile ".hrs.gz")) - (daysfile (conc basedir "/" subdir "/" basefile ".days.gz")) - (wksfile (conc basedir "/" subdir "/" basefile ".weeks.gz"))) - - ;; create subdir it not exists - (if (not (directory-exists? (conc basedir "/" subdir))) - (docmd (conc "/bin/mkdir -p '"(conc basedir "/" subdir)"'"))) - - ;; copy&zip to .mins if not exists - (if (not (file-exists? minsfile)) - (copy+zip filepath minsfile)) - ;; copy .mins to .hrs if not exists - (if (not (file-exists? hrsfile)) - (copy minsfile hrsfile)) - ;; copy .hrs to .days if not exists - (if (not (file-exists? daysfile)) - (copy hrsfile daysfile)) - ;; copy .days to .weeks if not exists - (if (not (file-exists? 
wksfile)) - (copy daysfile wksfile)) - - - ;; if age(.mins.gz) >= 1h: - ;; copy .mins.gz .hrs.gz - ;; copy .mins.gz - (when (>= (age-mins minsfile) 1) - (copy minsfile hrsfile) - (copy+zip filepath minsfile)) - - ;; if age(.hrs.gz) >= 1d: - ;; copy .hrs.gz .days.gz - ;; copy .mins.gz .hrs.gz - (when (>= (age-days hrsfile) 1) - (copy hrsfile daysfile) - (copy minsfile hrsfile)) - - ;; if age(.days.gz) >= 1w: - ;; copy .days.gz .weeks.gz - ;; copy .hrs.gz .days.gz - (when (>= (age-wks daysfile) 1) - (copy daysfile wksfile) - (copy hrsfile daysfile)) - #t) - #f)) - -;;====================================================================== -;; Rotate logs, logic: -;; if > 500k and older than 1 week: -;; remove previous compressed log and compress this log -;; WARNING: This proc operates assuming that it is in the directory above the -;; logs directory you wish to log-rotate. -;; -(define (common:rotate-logs) - (let* ((all-files (make-hash-table)) - (stats (make-hash-table)) - (inc-stat (lambda (key) - (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1)))) - (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "600")))) ;; name -> age - (if (not (directory-exists? "logs"))(create-directory "logs")) - (directory-fold - (lambda (file rem) - (handle-exceptions - exn - (begin - (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore. exn=" exn) - (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - ;; (print-call-chain (current-error-port)) ;; - ) - (let* ((fullname (conc "logs/" file)) - (mod-time (file-modification-time fullname)) - (file-age (- (current-seconds) mod-time)) - (file-old (> file-age (* 48 60 60))) - (file-big (> (file-size fullname) 200000))) - (hash-table-set! 
all-files file mod-time) - (if (or (and (string-match "^.*.log" file) - file-old - file-big) - (and (string-match "^server-.*.log" file) - file-old)) - (let ((gzfile (conc fullname ".gz"))) - (if (common:file-exists? gzfile) - (begin - (debug:print-info 0 *default-log-port* "removing " gzfile) - (delete-file* gzfile) - (hash-table-delete! all-files gzfile) ;; needed? - )) - (debug:print-info 0 *default-log-port* "compressing " file) - (system (conc "gzip " fullname)) - (inc-stat "gzipped") - (hash-table-set! all-files (conc file ".gz") file-age) ;; add the .gz file and remove the base file - (hash-table-delete! all-files file) - ) - (if (and (> file-age (* (string->number (or (configf:lookup *configdat* "setup" "log-expire-days") "30")) 24 3600)) - (file-exists? fullname)) ;; just in case it was gzipped - will get it next time - (handle-exceptions - exn - #f - (if (directory? fullname) - (begin - (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") - (inc-stat "directories")) - (begin - (delete-file* fullname) - (inc-stat "deleted"))) - (hash-table-delete! all-files file))))))) - '() - "logs") - (for-each - (lambda (category) - (let ((quant (hash-table-ref/default stats category 0))) - (if (> quant 0) - (debug:print-info 0 *default-log-port* category " log files: " quant)))) - `("deleted" "gzipped" "directories")) - (let ((num-logs (hash-table-size all-files))) - (if (> num-logs max-allowed) ;; because NFS => don't let number of logs exceed 300 - (let ((files (take (sort (hash-table-keys all-files) - (lambda (a b) - (< (hash-table-ref all-files a)(hash-table-ref all-files b)))) - (- num-logs max-allowed)))) - (for-each - (lambda (file) - (let* ((fullname (conc "logs/" file))) - (if (directory? fullname) - (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! 
Cannot rotate it, it is best to not put subdirectories in the logs dir.") - (handle-exceptions - exn - (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn) - (delete-file* fullname))))) - files) - (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files.")))))) - -;;====================================================================== -;; Force a megatest cleanup-db if version is changed and skip-version-check not specified -;; Do NOT check if not on homehost! -;; -(define (common:exit-on-version-changed) - (if (and *toppath* ;; do nothing if *toppath* not yet provided - (common:on-homehost?)) - (if (common:api-changed?) - (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) - (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") ".mtdb/main.db")) - (read-only (not (file-write-access? dbfile))) - (dbstruct (db:setup))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;; #t))) - (debug:print 0 *default-log-port* - "WARNING: Version mismatch!\n" - " expected: " (common:version-signature) "\n" - " got: " (common:get-last-run-version)) - (cond - ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t) - ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only) - (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db - (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (print-call-chain (current-error-port)) - (exit 1)) - (common:cleanup-db dbstruct))) - ((not (common:file-exists? mtconf)) - (debug:print 0 *default-log-port* " megatest.config does not exist in this area. 
Cannot proceed with megatest version migration.") - (exit 1)) - ((not (common:file-exists? dbfile)) - (debug:print 0 *default-log-port* " .mtdb/main.db does not exist in this area. Cannot proceed with megatest version migration.") - (exit 1)) - ((not (eq? (current-user-id)(file-owner mtconf))) - (debug:print 0 *default-log-port* " You do not own .mtdb/main.db in this area. Cannot proceed with megatest version migration.") - (exit 1)) - (read-only - (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") - (exit 1)) - (else - (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"") - (exit 1))))))) -;;====================================================================== -;; (begin -;; (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") -;; (exit 1)))) - -;;====================================================================== -;; S P A R S E A R R A Y S -;;====================================================================== - -(define (make-sparse-array) - (let ((a (make-sparse-vector))) - (sparse-vector-set! a 0 (make-sparse-vector)) - a)) - -(define (sparse-array? a) - (and (sparse-vector? a) - (sparse-vector? (sparse-vector-ref a 0)))) - -(define (sparse-array-ref a x y) - (let ((row (sparse-vector-ref a x))) - (if row - (sparse-vector-ref row y) - #f))) - -(define (sparse-array-set! a x y val) - (let ((row (sparse-vector-ref a x))) - (if row - (sparse-vector-set! row y val) - (let ((new-row (make-sparse-vector))) - (sparse-vector-set! a x new-row) - (sparse-vector-set! new-row y val))))) - -;;====================================================================== -;; L O C K E R S A N D B L O C K E R S -;;====================================================================== - -;; block further accesses to databases. Call this before shutting db down -(define (common:db-block-further-queries) - (mutex-lock! 
*db-access-mutex*) - (set! *db-access-allowed* #f) - (mutex-unlock! *db-access-mutex*)) - -(define (common:db-access-allowed?) - (let ((val (begin - (mutex-lock! *db-access-mutex*) - *db-access-allowed* - (mutex-unlock! *db-access-mutex*)))) - val)) - -;;====================================================================== -;; U S E F U L S T U F F -;;====================================================================== - -;; convert things to an alist or assoc list, #f gets converted to "" -;; -(define (common:to-alist dat) - (cond - ((list? dat) (map common:to-alist dat)) - ((vector? dat) - (map common:to-alist (vector->list dat))) - ((pair? dat) - (cons (common:to-alist (car dat)) - (common:to-alist (cdr dat)))) - ((hash-table? dat) - (map common:to-alist (hash-table->alist dat))) - (else - (if dat - dat - "")))) - -(define (common:alist-ref/default key alist default) - (or (alist-ref key alist) default)) - -;; moved into commonmod -;; -;; (define (common:low-noise-print waitval . keys) -;; (let* ((key (string-intersperse (map conc keys) "-" )) -;; (lasttime (hash-table-ref/default *common:denoise* key 0)) -;; (currtime (current-seconds))) -;; (if (> (- currtime lasttime) waitval) -;; (begin -;; (hash-table-set! 
*common:denoise* key currtime) -;; #t) -;; #f))) - -(define (common:read-encoded-string instr) - (handle-exceptions - exn - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* "received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) - (print-call-chain (current-error-port)) - #f) - (read (open-input-string (base64:base64-decode instr)))) - (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) - - -;;====================================================================== -;; S T A T E S A N D S T A T U S E S -;;====================================================================== - -;; BBnote: *common:std-states* - dashboard filter control and test control state buttons defined here; used in set-fields-panel and dboard:make-controls -(define *common:std-states* ;; for toggle buttons in dashboard - '( - (0 "ARCHIVED") - (1 "STUCK") - (2 "KILLREQ") - (3 "KILLED") - (4 "NOT_STARTED") - (5 "COMPLETED") - (6 "LAUNCHED") - (7 "REMOTEHOSTSTART") - (8 "RUNNING") - )) - -(define *common:dont-roll-up-states* - '("DELETED" - "REMOVING" - "CLEANING" - "ARCHIVE_REMOVING" - )) - -;;====================================================================== -;; BBnote: *common:std-statuses* dashboard filter control and test control status buttons defined here; used in set-fields-panel and dboard:make-controls -;; note these statuses are sorted from better to worse. 
-;; This sort order is important to dcommon:status-compare3 and db:set-state-status-and-roll-up-items -(define *common:std-statuses* - '(;; (0 "DELETED") - (1 "n/a") - (2 "PASS") - (3 "SKIP") - (4 "WARN") - (5 "WAIVED") - (6 "CHECK") - (7 "STUCK/DEAD") - (8 "DEAD") - (9 "FAIL") - (10 "PREQ_FAIL") - (11 "PREQ_DISCARDED") - (12 "ABORT"))) - -(define *common:ended-states* ;; states which indicate the test is stopped and will not proceed - '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE" )) - -(define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked - '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD" "CHECK")) - -(define *common:well-ended-states* ;; an item's prereq in this state allows item to proceed - '("PASS" "WARN" "WAIVED" "SKIP")) - -;; BBnote: *common:running-states* used from db:set-state-status-and-roll-up-items -(define *common:running-states* ;; test is either running or can be run - '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED" "STARTED")) - -(define *common:cant-run-states* ;; These are stopping conditions that prevent a test from being run - '("COMPLETED" "KILLED" "UNKNOWN" "INCOMPLETE" "ARCHIVED")) - -(define *common:not-started-ok-statuses* ;; if not one of these statuses when in not_started state treat as dead - '("n/a" "na" "PASS" "FAIL" "WARN" "CHECK" "WAIVED" "DEAD" "SKIP")) - -;;====================================================================== -;; group tests into buckets corresponding to rollup -;;; Running, completed-pass, completed-non-pass + worst status, not started. 
-;; filter out -;(define (common:categorize-items-for-rollup in-tests) -; ( - -(define (common:special-sort items order comp) - (let ((items-order (map reverse order)) - (acomp (or comp >))) - (sort items - (lambda (a b) - (let ((a-num (cadr (or (assoc a items-order) '(0 0)))) - (b-num (cadr (or (assoc b items-order) '(0 0))))) - (acomp a-num b-num)))))) - -;;====================================================================== -;; ;; given a toplevel with currstate, currstatus apply state and status -;; ;; => (newstate . newstatus) -;; (define (common:apply-state-status currstate currstatus state status) -;; (let* ((cstate (string->symbol (string-downcase currstate))) -;; (cstatus (string->symbol (string-downcase currstatus))) -;; (sstate (string->symbol (string-downcase state))) -;; (sstatus (string->symbol (string-downcase status))) -;; (nstate #f) -;; (nstatus #f)) -;; (set! nstate -;; (case cstate -;; ((completed not_started killed killreq stuck archived) -;; (case sstate ;; completed -> sstate -;; ((completed killed killreq stuck archived) completed) -;; ((running remotehoststart launched) running) -;; (else unknown-error-1))) -;; ((running remotehoststart launched) -;; (case sstate -;; ((completed killed killreq stuck archived) #f) ;; need to look at all items -;; ((running remotehoststart launched) running) -;; (else unknown-error-2))) -;; (else unknown-error-3))) -;; (set! 
nstatus -;; (case sstatus -;; ((pass) -;; (case nstate -;; ((pass n/a deleted) pass) -;; ((warn) warn) -;; ((fail) fail) -;; ((check) check) -;; ((waived) waived) -;; ((skip) skip) -;; ((stuck/dead) stuck) -;; ((abort) abort) -;; (else unknown-error-4))) -;; ((warn) -;; (case nstate -;; ((pass warn n/a skip deleted) warn) -;; ((fail) fail) -;; ((check) check) -;; ((waived) waived) -;; ((stuck/dead) stuck) -;; (else unknown-error-5))) -;; ((fail) -;; (case nstate -;; ((pass warn fail check n/a waived skip deleted stuck/dead stuck) fail) -;; ((abort) abort) -;; (else unknown-error-6))) -;; (else unknown-error-7))) -;; (cons -;; (if nstate (symbol->string nstate) nstate) -;; (if nstatus (symbol->string nstatus) nstatus)))) - -;;====================================================================== -;; D E B U G G I N G S T U F F -;;====================================================================== - -(define *verbosity* 1) -(define *logging* #f) - -(define (get-with-default val default) - (let ((val (args:get-arg val))) - (if val val default))) - -(define (assoc/default key lst . default) - (let ((res (assoc key lst))) - (if res (cadr res)(if (null? default) #f (car default))))) - -(define (common:get-testsuite-name) - (or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description. - (configf:lookup *configdat* "setup" "testsuite" ) - (getenv "MT_TESTSUITE_NAME") - (pathname-file (or (if (string? *toppath* ) - (pathname-file *toppath*) - #f) - (common:get-toppath #f))) - "please-set-setup-area-name")) ;; (pathname-file (current-directory))))) - -;;====================================================================== -;; safe getting of toppath -(define (common:get-toppath areapath) - (or *toppath* - (if areapath - (begin - (set! *toppath* areapath) - (setenv "MT_RUN_AREA_HOME" areapath) - areapath) - #f) - (if (getenv "MT_RUN_AREA_HOME") - (begin - (set! 
*toppath* (getenv "MT_RUN_AREA_HOME")) - *toppath*) - #f) - ;; last resort, look for megatest.config - (let loop ((thepath (realpath "."))) - (if (file-exists? (conc thepath "/megatest.config")) - thepath - (if (equal? thepath "/") - (begin - (debug:print-error 0 *default-log-port* "Unable to find megatest home directory.") - #f) - (loop (pathname-directory thepath))))) - )) - - -;;====================================================================== -;; redefine for future cleanup (converge on area-name, the more generic -;; -(define common:get-area-name common:get-testsuite-name) - -(define (common:get-db-tmp-area . junk) - (if *db-cache-path* - *db-cache-path* - (if *toppath* ;; common:get-create-writeable-dir - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path* ", exn=" exn) - (exit 1)) - (let* ((toppath (common:real-path *toppath*)) - (tsname (common:get-testsuite-name)) - (dbpath (common:get-create-writeable-dir - (list (conc "/tmp/" (current-user-name) - "/megatest_localdb/" - tsname "/" - (string-translate toppath "/" ".")) - (conc "/tmp/" (current-process-id) ;; just in case we have an issue with the dir by own user name - "/"(current-user-name) "/megatest_localdb/" - tsname - (string-translate toppath "/" ".")) - )))) - (set! *db-cache-path* dbpath) - ;; ensure megatest area has .mtdb - (let ((dbarea (conc *toppath* "/.mtdb"))) - (if (not (file-exists? dbarea)) - (create-directory dbarea))) - ;; ensure tmp area has .mtdb - (let ((dbarea (conc dbpath "/.mtdb"))) - (if (not (file-exists? dbarea)) - (create-directory dbarea))) - dbpath)) - #f))) - -(define (common:get-area-path-signature) - (message-digest-string (md5-primitive) *toppath*)) - -;;====================================================================== -;; E X I T H A N D L I N G -;;====================================================================== - -(define (common:run-sync?) 
- (and *toppath* ;; gate if called before *toppath* is set - (common:on-homehost?) - (args:get-arg "-server"))) - - -(define (std-signal-handler signum) - ;; (signal-mask! signum) - (set! *time-to-exit* #t) - ;;(debug:print-info 13 *default-log-port* "got signal "signum) - (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly") - ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway - (exit)) - -(define (special-signal-handler signum) - ;; (signal-mask! signum) - (set! *time-to-exit* #t) - ;;(debug:print-info 13 *default-log-port* "got signal "signum) - (debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting!!") - ;;TODO send email to notify admin contact listed in the config that the lisner got killed - ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway - (exit)) - - -(set-signal-handler! signal/int std-signal-handler) ;; ^C -(set-signal-handler! signal/term std-signal-handler) - -;; (set-signal-handler! signal/stop std-signal-handler) ;; ^Z NO, do NOT handle ^Z! - -;;====================================================================== -;; M I S C U T I L S -;;====================================================================== - -;;====================================================================== -;; convert stuff to a number if possible -(define (any->number val) - (cond - ((number? val) val) - ((string? val) (string->number val)) - ((symbol? val) (any->number (symbol->string val))) - (else #f))) - -(define (any->number-if-possible val) - (let ((num (any->number val))) - (if num num val))) - -(define (patt-list-match item patts) - (debug:print-info 8 *default-log-port* "patt-list-match item=" item " patts=" patts) - (if (and item patts) ;; here we are filtering for matches with item patterns - (let ((res #f)) ;; look through all the item-patts if defined, format is patt1,patt2,patt3 ... 
wildcard is % - (for-each - (lambda (patt) - (let ((modpatt (string-substitute "%" ".*" patt #t))) - (debug:print-info 10 *default-log-port* "patt " patt " modpatt " modpatt) - (if (string-match (regexp modpatt) item) - (set! res #t)))) - (string-split patts ",")) - res) - #t)) - -;;====================================================================== -;; '(print (string-intersperse (map cadr (hash-table-ref/default (read-config "megatest.config" \#f \#t) "disks" '"'"'("none" ""))) "\n"))' -(define (common:get-disks #!key (configf #f)) - (hash-table-ref/default - (or configf (read-config "megatest.config" #f #t)) - "disks" '("none" ""))) - -(define (common:get-install-area) - (let ((exe-path (car (argv)))) - (if (common:file-exists? exe-path) - (handle-exceptions - exn - #f - (pathname-directory - (pathname-directory - (pathname-directory exe-path)))) - #f))) - -;;====================================================================== -;; return first path that can be created or already exists and is writable -;; -(define (common:get-create-writeable-dir dirs) - (if (null? dirs) - #f - (let loop ((hed (car dirs)) - (tal (cdr dirs))) - (let ((res (or (and (directory? hed) - (file-write-access? hed) - hed) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "could not create " hed - ", this might cause problems down the road. exn=" exn) - #f) - (create-directory hed #t))))) - (if (and (string? res) - (directory? res)) - res - (if (null? tal) - #f - (loop (car tal)(cdr tal)))))))) - -;;====================================================================== -;; return the youngest timestamp . 
filename -;; -(define (common:get-youngest glob-list) - (let ((all-files (apply append - (map (lambda (patt) - (handle-exceptions - exn - '() - (glob patt))) - glob-list)))) - (fold (lambda (fname res) - (let ((last-mod (car res)) - (curmod (handle-exceptions - exn - 0 - (file-modification-time fname)))) - (if (> curmod last-mod) - (list curmod fname) - res))) - '(0 "n/a") - all-files))) - -;;====================================================================== -;; use bash to expand a glob. Does NOT handle paths with spaces! -;; -(define (common:bash-glob instr) - (string-split - (with-input-from-pipe - (conc "/bin/bash -c \"echo " instr "\"") - read-line))) - -;;====================================================================== -;; Some safety net stuff -;;====================================================================== - -;;====================================================================== -;; return input if it is a list or return null -(define (common:list-or-null inlst #!key (ovrd #f)(message #f)) - (if (list? inlst) - inlst - (begin - (if message (debug:print-error 0 *default-log-port* message)) - (or ovrd '())))) - -;;====================================================================== -;; T A R G E T S , S T A T E , S T A T U S , -;; R U N N A M E A N D T E S T P A T T -;;====================================================================== - -;;====================================================================== -;; (map print (map car (hash-table->alist (read-config "runconfigs.config" #f #t)))) -;; -(define (common:get-runconfig-targets #!key (configf #f)) - (let ((targs (sort (map car (hash-table->alist - (or configf ;; NOTE: There is no value in using runconfig:read here. 
- (read-config (conc *toppath* "/runconfigs.config") - #f #t) - (make-hash-table)))) - stringsymbol force-setting) #f)) - (force-result (case force-type - ((#f) #f) - ((always) #t) - ((test) (if (args:get-arg "-execute") ;; we are in a test - #t - #f)) - (else - (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.") - #t)))) ;; default to requiring server - (if force-result - (begin - (debug:print-info 0 *default-log-port* "ATTENTION! Forcing use of server, force setting is \"" force-setting "\".") - #t) - #f))) - -;;====================================================================== -;; M I S C L I S T S -;;====================================================================== - -;;====================================================================== -;; items in lista are matched value and position in listb -;; return the remaining items in listb or #f -;; -(define (common:list-is-sublist lista listb) - (if (null? lista) - listb ;; all items in listb are "remaining" - (if (> (length lista)(length listb)) - #f - (let loop ((heda (car lista)) - (tala (cdr lista)) - (hedb (car listb)) - (talb (cdr listb))) - (if (equal? heda hedb) - (if (null? tala) ;; we are done - talb - (loop (car tala) - (cdr tala) - (car talb) - - (cdr talb))) - #f))))) - -;;====================================================================== -;; Needed for long lists to be sorted where (apply max ... ) dies -;; -(define (common:max inlst) - (let loop ((max-val (car inlst)) - (hed (car inlst)) - (tal (cdr inlst))) - (if (not (null? tal)) - (loop (max hed max-val) - (car tal) - (cdr tal)) - (max hed max-val)))) - -;;====================================================================== -;; get min or max, use > for max and < for min, this works around the limits on apply -;; -(define (common:min-max comp lst) - (if (null? 
lst) - #f ;; better than an exception for my needs - (fold (lambda (a b) - (if (comp a b) a b)) - (car lst) - lst))) - -;;====================================================================== -;; get min or max, use > for max and < for min, this works around the limits on apply -;; -(define (common:sum lst) - (if (null? lst) - 0 - (fold (lambda (a b) - (+ a b)) - (car lst) - lst))) - -;;====================================================================== -;; path list to hash-table tree -;; ((a b c)(a b d)(e b c)) => ((a (b (d) (c))) (e (b (c)))) -;; -(define (common:list->htree lst) - (let ((resh (make-hash-table))) - (for-each - (lambda (inlst) - (let loop ((ht resh) - (hed (car inlst)) - (tal (cdr inlst))) - (if (hash-table-ref/default ht hed #f) - (if (not (null? tal)) - (loop (hash-table-ref ht hed) - (car tal) - (cdr tal))) - (begin - (hash-table-set! ht hed (make-hash-table)) - (loop ht hed tal))))) - lst) - resh)) - -;;====================================================================== -;; hash-table tree to html list tree -;; -;; tipfunc takes two parameters: y the tip value and path the path to that point -;; -(define (common:htree->html ht path tipfunc) - (let ((datlist (sort (hash-table->alist ht) - (lambda (a b) - (string< (car a)(car b)))))) - (if (null? datlist) - (tipfunc #f path) ;; really shouldn't get here - (s:ul - (map (lambda (x) - (let* ((levelname (car x)) - (y (cdr x)) - (newpath (append path (list levelname))) - (leaf (or (not (hash-table? y)) - (null? (hash-table-keys y))))) - (if leaf - (s:li (tipfunc y newpath)) - (s:li - (list - levelname - (common:htree->html y newpath tipfunc)))))) - datlist))))) - -;;====================================================================== -;; hash-table tree to alist tree -;; -(define (common:htree->atree ht) - (map (lambda (x) - (cons (car x) - (let ((y (cdr x))) - (if (hash-table? 
y) - (common:htree->atree y) - y)))) - (hash-table->alist ht))) - -;;====================================================================== -;; M U N G E D A T A I N T O N I C E F O R M S -;;====================================================================== - -;;====================================================================== -;; Generate an index for a sparse list of key values -;; ( (rowname1 colname1 val1)(rowname2 colname2 val2) ) -;; -;; => -;; -;; ( (rowname1 0)(rowname2 1)) ;; rownames -> num -;; (colname1 0)(colname2 1)) ) ;; colnames -> num -;; -;; optional apply proc to rownum colnum value -(define (common:sparse-list-generate-index data #!key (proc #f)) - (if (null? data) - (list '() '()) - (let loop ((hed (car data)) - (tal (cdr data)) - (rownames '()) - (colnames '()) - (rownum 0) - (colnum 0)) - (let* ((rowkey (car hed)) - (colkey (cadr hed)) - (value (caddr hed)) - (existing-rowdat (assoc rowkey rownames)) - (existing-coldat (assoc colkey colnames)) - (curr-rownum (if existing-rowdat rownum (+ rownum 1))) - (curr-colnum (if existing-coldat colnum (+ colnum 1))) - (new-rownames (if existing-rowdat rownames (cons (list rowkey curr-rownum) rownames))) - (new-colnames (if existing-coldat colnames (cons (list colkey curr-colnum) colnames)))) - ;; (debug:print-info 0 *default-log-port* "Processing record: " hed ) - (if proc (proc curr-rownum curr-colnum rowkey colkey value)) - (if (null? tal) - (list new-rownames new-colnames) - (loop (car tal) - (cdr tal) - new-rownames - new-colnames - (if (> curr-rownum rownum) curr-rownum rownum) - (if (> curr-colnum colnum) curr-colnum colnum) - )))))) - -;;====================================================================== -;; if it looks like a number -> convert it to a number, else return it -;; -(define (common:lazy-convert inval) - (let* ((as-num (if (string? 
inval)(string->number inval) #f))) - (or as-num inval))) - -;;====================================================================== -;; convert string a=1; b=2; c=a silly thing; d= -;; to '((a . 1)(b . 2)(c . "a silly thing")(d . "")) -;; -(define (common:val->alist val #!key (convert #f)) - (let ((val-list (string-split-fields ";\\s*" val #:infix))) - (if val-list - (map (lambda (x) - (let ((f (string-split-fields "\\s*=\\s*" x #:infix))) - (case (length f) - ((0) `(,#f)) ;; null string case - ((1) `(,(string->symbol (car f)))) - ((2) `(,(string->symbol (car f)) . ,(let ((inval (cadr f))) - (if convert (common:lazy-convert inval) inval)))) - (else f)))) - val-list) - '()))) - -;;====================================================================== -;; S Y S T E M S T U F F -;;====================================================================== - -;;====================================================================== -;; lazy-safe get file mod time. on any error (file not existing etc.) return 0 -;; -(define (common:lazy-modification-time fpath) - (handle-exceptions - exn - (begin - (debug:print 2 *default-log-port* "Failed to get modification time for " fpath ", treating it as zero. exn=" exn) - 0) - (if (file-exists? fpath) - (file-modification-time fpath) - 0))) - -;;====================================================================== -;; find timestamp of newest file associated with a sqlite db file -(define (common:lazy-sqlite-db-modification-time fpath) - (let* ((glob-list (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "Failed to glob " fpath "*, exn=" exn) - `(,(conc "/no/such/file, message: " ((condition-property-accessor 'exn 'message) exn)))) - (glob (conc fpath "*")))) - (file-list (if (eq? 
0 (length glob-list)) - '("/no/such/file") - glob-list))) - (apply max - (map - common:lazy-modification-time - file-list)))) - -;;====================================================================== -;; make "nice-path" available in config files and the repl -(define nice-path common:nice-path) - -;;====================================================================== -;; returns *effective load* (not normalized) -;; -(define (common:get-intercept onemin fivemin) - (if (< onemin fivemin) ;; load is decreasing, just use the onemin load - onemin - (let* ((load-change (- onemin fivemin)) - (tchange (- 300 60))) - (max (+ onemin (* 60 (/ load-change tchange))) 0)))) - -;;====================================================================== -;; calculate a delay number based on a droop curve -;; inputs are: -;; - load-in, load as from uptime, NOT normalized -;; - numcpus, number of cpus, ideally use the real cpus, not threads -;; -(define (common:get-delay load-in numcpus) - (let* ((ratio (/ load-in numcpus)) - (new-option (configf:lookup *configdat* "load" "new-load-method")) - (paramstr (or (configf:lookup *configdat* "load" "exp-params") - "15 12 1281453987.9543 0.75")) ;; 5 4 10 1")) - (paramlst (map string->number (string-split paramstr)))) - (if new-option - (begin - (cond ((and (>= ratio 0) (< ratio .5)) - 0) - ((and (>= ratio 0.5) (<= ratio .9)) - (* ratio (/ 5 .9))) - ((and (> ratio .9) (<= ratio 1.1)) - (+ 5 (* (- ratio .9) (/ 55 .2)))) - ((> ratio 1.1) - 60))) - (match paramlst - ((r1 r2 s1 s2) - (debug:print 3 *default-log-port* "Using params r1=" r1 " r2=" r2 " s1=" s1 " s2=" s2) - (min (max (/ (expt r1 (* r2 s2 ratio)) s1) 0) 30)) - (else - (debug:print 0 *default-log-port* "BAD exp-params, should be \"r1 r2 s1 s2\" but got " paramstr) - 30))))) - -;; -mrw- this appears to not be used -;; -;; (define (common:print-delay-table) -;; (let loop ((x 0)) -;; (print x "," (common:get-delay x 1)) -;; (if (< x 2) -;; (loop (+ x 0.1))))) - -;; (define 
(get-cpu-load #!key (remote-host #f)) -;; (car (common:get-cpu-load remote-host))) - -;;====================================================================== -;; (let* ((load-res (process:cmd-run->list "uptime")) -;; (load-rx (regexp "load average:\\s+(\\d+)")) -;; (cpu-load #f)) -;; (for-each (lambda (l) -;; (let ((match (string-search load-rx l))) -;; (if match -;; (let ((newval (string->number (cadr match)))) -;; (if (number? newval) -;; (set! cpu-load newval)))))) -;; (car load-res)) -;; cpu-load)) - -;;====================================================================== -;; get values from cached info from dropping file in .sysdata dir -;; e.g. key is host and dtype is normalized-load -;; -(define (common:get-cached-info key dtype #!key (age 10)) - (if *toppath* - (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")) - (delfile (lambda (exn) - (debug:print-info 2 *default-log-port* " removing bad file " fullpath ", exn=" exn) - (delete-file* fullpath) - #f))) - (if (and (file-exists? fullpath) - (file-read-access? fullpath)) - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "failed to get cached info from " fullpath ", exn=" exn) - #f) - (debug:print 2 *default-log-port* "reading file " fullpath) - (let ((real-age (- (current-seconds) - (handle-exceptions - exn - (begin - (debug:print 1 *default-log-port* "Failed to read mod time on file " - fullpath ", using 0, exn=" exn) - 0) - (file-change-time fullpath))))) - (if (< real-age age) - (handle-exceptions - exn - (delfile exn) - (let* ((res (with-input-from-file fullpath read))) - (if (eof-object? 
res) - (begin - (delfile "n/a") - #f) - res))) - (begin - (debug:print-info 2 *default-log-port* "file " fullpath - " is too old (" real-age" seconds) to trust, skipping reading it") - #f)))) - (begin - (debug:print 2 *default-log-port* "not reading file " fullpath) - #f))) - #f)) - -(define (common:write-cached-info key dtype dat) - (if *toppath* - (let* ((fulldir (conc *toppath* "/.sysdata")) - (fullpath (conc fulldir "/" key "-" dtype ".log"))) - (if (not (file-exists? fulldir))(create-directory fulldir #t)) - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "failed to write file " fullpath ", exn=" exn) - #f) - (with-output-to-file fullpath (lambda ()(pp dat))))) - #f)) - -(define (common:raw-get-remote-host-load remote-host) - (let* ((inp #f)) - (handle-exceptions - exn - (begin - (close-input-pipe inp) - (debug:print 0 *default-log-port* "failed to ssh to " remote-host " and get loadavg. exn=" exn) - #f) ;; more specific handling of errors needed - (set! inp (open-input-pipe (conc "ssh " remote-host " cat /proc/loadavg"))) - (let ((res (list (read inp)(read inp)(read inp)))) - (close-input-pipe inp) - res)))) - -;;====================================================================== -;; get cpu load by reading from /proc/loadavg, return all three values -;; -(define (common:get-cpu-load remote-host) - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "failed to ssh or read loadavg from host " remote-host ", exn=" exn) - '(-99 -99 -99)) - (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) - (or (common:get-cached-info actual-hostname "cpu-load") - (let ((result (if (and remote-host - (not (equal? remote-host (get-host-name)))) - (map (lambda (res) - (if (eof-object? res) 9e99 res)) - (common:raw-get-remote-host-load remote-host)) - (with-input-from-file "/proc/loadavg" - (lambda ()(list (read)(read)(read))))))) - (match - result - ((l1 l2 l3) - (if (and (number? l1) - (number? l2) - (number? 
l3)) - (begin - (common:write-cached-info actual-hostname "cpu-load" result) - result) - '(-1 -1 -1))) ;; -1 is bad result - (else '(-2 -2 -2)))))))) - -;;====================================================================== -;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads -;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc. -;; keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load -;; -(define (common:get-normalized-cpu-load remote-host) - (let ((res (common:get-normalized-cpu-load-raw remote-host)) - (default `((adj-proc-load . 2) ;; there is no right answer - (adj-core-load . 2) - (1m-load . 2) - (5m-load . 0) ;; causes a large delta - thus causing default of throttling if stuff goes wrong - (15m-load . 0) - (proc . 1) - (core . 1) - (phys . 1) - (error . #t)))) - (cond - ((and (list? res) - (> (length res) 2)) - res) - ((eq? res #f) default) ;; add messages? - ((eq? 
res #f) default) ;; this would be the #eof - (else default)))) - -(define (common:ssh-get-loadavg remote-host) - (let ((inp (open-input-pipe (conc "ssh " remote-host " \"cat /proc/loadavg;cat /proc/cpuinfo;echo end\"")))) - (let* ((res (read-lines inp))) - (close-input-pipe inp) - res))) - -(define (common:get-normalized-cpu-load-raw remote-host) - (let* ((actual-host (or remote-host (get-host-name)))) ;; #f is localhost - (or (common:get-cached-info actual-host "normalized-load") - (let ((data (if remote-host - (common:ssh-get-loadavg remote-host) - (append - (with-input-from-file "/proc/loadavg" - read-lines) - (with-input-from-file "/proc/cpuinfo" - read-lines) - (list "end")))) - (load-rx (regexp "^([\\d\\.]+)\\s+([\\d\\.]+)\\s+([\\d\\.]+)\\s+.*$")) - (proc-rx (regexp "^processor\\s+:\\s+(\\d+)\\s*$")) - (core-rx (regexp "^core id\\s+:\\s+(\\d+)\\s*$")) - (phys-rx (regexp "^physical id\\s+:\\s+(\\d+)\\s*$")) - (max-num (lambda (p n)(max (string->number p) n)))) - ;; (print "data=" data) - (if (null? data) ;; something went wrong - #f - (let loop ((hed (car data)) - (tal (cdr data)) - (loads #f) - (proc-num 0) ;; processor includes threads - (phys-num 0) ;; physical chip on motherboard - (core-num 0)) ;; core - ;;; (print hed ", " loads ", " proc-num ", " phys-num ", " core-num) - (if (null? 
tal) ;; have all our data, calculate normalized load and return result - (let* ((act-proc (+ proc-num 1)) - (act-phys (+ phys-num 1)) - (act-core (+ core-num 1)) - (adj-proc-load (/ (car loads) act-proc)) - (adj-core-load (/ (car loads) act-core)) - (result - (append (list (cons 'adj-proc-load adj-proc-load) - (cons 'adj-core-load adj-core-load)) - (list (cons '1m-load (car loads)) - (cons '5m-load (cadr loads)) - (cons '15m-load (caddr loads))) - (list (cons 'proc act-proc) - (cons 'core act-core) - (cons 'phys act-phys))))) - (common:write-cached-info actual-host "normalized-load" result) - result) - (regex-case - hed - (load-rx ( x l1 l5 l15 ) (loop (car tal)(cdr tal)(map string->number (list l1 l5 l15)) proc-num phys-num core-num)) - (proc-rx ( x p ) (loop (car tal)(cdr tal) loads (max-num p proc-num) phys-num core-num)) - (phys-rx ( x p ) (loop (car tal)(cdr tal) loads proc-num (max-num p phys-num) core-num)) - (core-rx ( x c ) (loop (car tal)(cdr tal) loads proc-num phys-num (max-num c core-num))) - (else - (begin - ;; (print "NO MATCH: " hed) - (loop (car tal)(cdr tal) loads proc-num phys-num core-num))))))))))) - -(define (common:unix-ping hostname) - (let ((res (system (conc "ping -c 1 " hostname " > /dev/null")))) - (eq? res 0))) - -;;====================================================================== -;; ideally put all this info into the db, no need to preserve it across moving homehost -;; -;; return list of -;; ( reachable? cpuload update-time ) -(define (common:get-host-info hostname) - (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data - (load (car loadinfo)) - (load-sample-time (cdr loadinfo)) - (load-sample-age (- (current-seconds) load-sample-time)) - (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. 
Switch to 6 seconds - (host-last-update-timeout-seconds 4) - (host-rec (hash-table-ref/default *host-loads* hostname #f)) - ) - (cond - ((< load-sample-age loadinfo-timeout-seconds) - (list #t - load-sample-time - load)) - ((and host-rec - (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds))) - (list #t - (host-last-update host-rec) - (host-last-cpuload host-rec ))) - ((common:unix-ping hostname) - (list #t - (current-seconds) - (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds - (else - (list #f 0 -1) ;; bad host, don't use! - )))) - -;;====================================================================== -;; see defstruct host at top of file. -;; host: reachable last-update last-used last-cpuload -;; -(define (common:update-host-loads-table hosts-raw) - (let* ((hosts (filter (lambda (x) - (string-match (regexp "^\\S+$") x)) - hosts-raw))) - (for-each - (lambda (hostname) - (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) - (if h - h - (let ((h (make-host))) - (hash-table-set! *host-loads* hostname h) - h)))) - (host-info (common:get-host-info hostname)) - (is-reachable (car host-info)) - (last-reached-time (cadr host-info)) - (load (caddr host-info))) - (host-reachable-set! rec is-reachable) - (host-last-update-set! rec last-reached-time) - (host-last-cpuload-set! rec load))) - hosts))) - -;;====================================================================== -;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the -;; [host-rules] section. 
-;; -(define (common:get-least-loaded-host hosts-raw host-type configdat) - (let* ((rdat (configf:lookup configdat "host-rules" host-type)) - (rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate - (maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load - (maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs - (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second - (hosts (filter (lambda (x) - (string-match (regexp "^\\S+$") x)) - hosts-raw)) - ;; (best-host #f) - (get-rec (lambda (hostname) - ;; (print "get-rec hostname=" hostname) - (let ((h (hash-table-ref/default *host-loads* hostname #f))) - (if h - h - (let ((h (make-host))) - (hash-table-set! *host-loads* hostname h) - h))))) - (best-load 99999) - (curr-time (current-seconds)) - (get-hosts-sorted (lambda (hosts) - (sort hosts (lambda (a b) - (let ((a-rec (get-rec a)) - (b-rec (get-rec b))) - ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec)) - ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec)) - (< (host-last-used a-rec) - (host-last-used b-rec)))))))) - (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts)) - (if (null? hosts) - #f ;; no hosts to select from. All done and giving up now. 
- (let ((hosts-sorted (get-hosts-sorted hosts))) - (common:update-host-loads-table hosts) - (let loop ((hostname (car hosts-sorted)) - (tal (cdr hosts-sorted)) - (best-host #f)) - (let* ((rec (get-rec hostname)) - (reachable (host-reachable rec)) - (load (host-last-cpuload rec)) - (last-used (host-last-used rec)) - (delta (- curr-time last-used)) - (job-rate (if (> delta 0) - (/ 1 delta) - 999)) ;; jobs per second - (new-best - (cond - ((not reachable) - (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.") - best-host) - ((and (< load maxnload) ;; load is acceptable - (< job-rate maxjobrate)) ;; job rate is acceptable - (set! best-load load) - hostname) - (else best-host)))) - (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." ) - (if new-best - (begin ;; found a host, return it - (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) - (host-last-used-set! rec curr-time) - new-best) - (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) - -(define (common:wait-for-homehost-load maxnormload msg) - (let loop ((start-time (current-seconds))) ;; we saw some instances of this being called before *toppath* was set. This might be an early setup race. This delay should help but it is impossible to test... - (if (not *toppath*) - (begin - (debug:print 0 *default-log-port* "ERROR: common:wait-for-homehost-load called before *toppath* set.") - (thread-sleep! 30) - (if (< (- (current-seconds) start-time) 300) - (loop start-time))))) - (case (rmt:transport-mode) - ((http) - (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. 
- #f - (server:choose-server *toppath* 'homehost))) - (hh (if hh-dat (car hh-dat) #f))) - (common:wait-for-normalized-load maxnormload msg hh))) - (else - (common:wait-for-normalized-load maxnormload msg (get-host-name))))) - -(define (common:get-num-cpus remote-host) - (let* ((actual-host (or remote-host (get-host-name)))) - ;; hosts had better not be changing the number of cpus too often! - (or (hash-table-ref/default *numcpus-cache* actual-host #f) - (let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) - (let* ((proc (lambda () - (let loop ((numcpu 0) - (inl (read-line))) - (if (eof-object? inl) - (if (> numcpu 0) - numcpu - #f) ;; if zero return #f so caller knows that things are not working - (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl) - (+ numcpu 1) - numcpu) - (read-line)))))) - (result (if (and remote-host - (not (equal? remote-host (get-host-name)))) - (common:generic-ssh - (conc "ssh " remote-host " cat /proc/cpuinfo") - proc -1) - (with-input-from-file "/proc/cpuinfo" proc)))) - (if (and (number? result) - (> result 0)) - (common:write-cached-info actual-host "num-cpus" result)) - result)))) - (hash-table-set! *numcpus-cache* actual-host numcpus) - numcpus)))) - -;;====================================================================== -;; wait for normalized cpu load to drop below maxload -;; -(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5)) - (let ((num-cpus (common:get-num-cpus remote-host))) - (if num-cpus - (common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host) - (begin - (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again - (if (> rem-tries 0) - (common:wait-for-normalized-load maxnormload msg remote-host (- rem-tries 1)) - #f))))) - -;;====================================================================== -;; DO NOT CALL THIS DIRECTLY. 
It is called from common:wait-for-normalized-load -;; count - count down to zero, at some point we'd give up if the load never drops -;; num-tries - count down to zero number tries to get numcpus -;; -(define (common:wait-for-cpuload maxnormload numcpus-in - #!key (count 1000) - (msg #f)(remote-host #f)(num-tries 5)) - (let* ((loadavg (common:get-cpu-load remote-host)) - ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again - (numcpus (if (<= 1 numcpus-in) - (common:get-num-cpus remote-host) numcpus-in)) - (first (car loadavg)) - (next (cadr loadavg)) - (adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude - ;; fallback is to at least use 1 - ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit - ;; etc. - (effective-load (common:get-intercept first next)) - (recommended-delay (common:get-delay effective-load numcpus)) - (effective-host (or remote-host "localhost")) - (normalized-effective-load (/ effective-load numcpus)) - (will-wait (> normalized-effective-load maxnormload))) - (if (and will-wait (> recommended-delay 1)) - (let* ((actual-delay (min recommended-delay 30))) - (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) - (debug:print-info 0 *default-log-port* "Load control, delaying " - actual-delay " seconds to maintain safe load. current normalized effective load is " - normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load)) - (thread-sleep! 
actual-delay))) - - (cond - ;; bad data, try again to get the data - ((not will-wait) - (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host)) - (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing."))) - - ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable - (> num-tries 0)) - (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " - first ", we'll sleep 10s and try " num-tries " more times.") - (thread-sleep! 10) - (common:wait-for-cpuload maxnormload numcpus-in - count: count remote-host: remote-host num-tries: (- num-tries 1))) - - ;; need to wait for load to drop - ((and will-wait ;; (> first adjmaxload) - (> count 0)) - (debug:print-info 0 *default-log-port* - "Delaying 15" ;; adjwait - " seconds due to normalized effective load " normalized-effective-load ;; first - " exceeding max of " adjmaxload - " on server " (or remote-host (get-host-name)) - " (normalized load-limit: " maxnormload ") " (if msg msg "")) - (thread-sleep! 15) ;; adjwait) - (common:wait-for-cpuload maxnormload numcpus count: (- count 1) msg: msg remote-host: remote-host) - ;; put the message here to indicate came out of waiting - (debug:print-info 1 *default-log-port* - "On host: " effective-host - ", effective load: " effective-load - ", numcpus: " numcpus - ", normalized effective load: " normalized-effective-load - )) - ;; overloaded and count expired (i.e. went to zero) - (else - (if (> num-tries 0) ;; should be "num-tries-left". - (if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host)) - (debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of " - effective-normalized-load " continuing.")) - (debug:print 0 *default-log-port* "Load on " effective-host ", " - first" could not be retrieved. 
Giving up and continuing.")))))) - -;;====================================================================== -;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load -;; -;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5)) -;; (let* ((loadavg (common:get-cpu-load remote-host)) -;; (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again -;; (common:get-num-cpus remote-host) -;; numcpus-in)) -;; (maxload (if force-maxload -;; maxload-in -;; (if (number? maxload-in) -;; (max maxload-in 0.5) -;; 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? -;; (first (car loadavg)) -;; (next (cadr loadavg)) -;; (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where -;; ;; numcpus (or could be -;; ;; maxload) is zero, -;; ;; crude fallback is to -;; ;; at least use 1 -;; (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next? -;; 0 -;; next))) ;; we will force a conservative calculation any time next is large. -;; (first-next-avg (/ (+ first next) 2)) -;; ;; add some randomness to the time to break any alignment -;; ;; where netbatch dumps many jobs to machines simultaneously -;; (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10) -;; (/ (- 1000 count) 10) -;; waitdelay) -;; (- first adjmaxload) )))) -;; (load-jump-limit (configf:lookup-number *configdat* "setup" "load-jump-limit")) -;; ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit -;; ;; etc. 
-;; (effective-load (common:get-intercept first next)) -;; (effective-host (or remote-host "localhost")) -;; (normalized-effective-load (/ effective-load numcpus)) -;; (will-wait (> normalized-effective-load maxload))) -;; -;; ;; let's let the user know once in a long while that load checking -;; ;; is happening but not constantly report it -;; #;(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time -;; (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload -;; ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp)) -;; -;; (debug:print-info 1 *default-log-port* -;; "On host: " effective-host -;; ", effective load: " effective-load -;; ", numcpus: " numcpus -;; ", normalized effective load: " normalized-effective-load -;; ) -;; -;; (cond -;; ;; bad data, try again to get the data -;; ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable -;; (> num-tries 0)) -;; (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.") -;; (thread-sleep! 10) -;; (common:wait-for-cpuload maxload-in numcpus-in waitdelay -;; count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1))) -;; ;; need to wait for load to drop -;; ((and will-wait ;; (> first adjmaxload) -;; (> count 0)) -;; (debug:print-info 0 *default-log-port* -;; "Delaying " 15 ;; adjwait -;; " seconds due to normalized effective load " normalized-effective-load ;; first -;; " exceeding max of " adjmaxload -;; " on server " (or remote-host (get-host-name)) -;; " (normalized load-limit: " maxload ") " (if msg msg "")) -;; (thread-sleep! 
15) ;; adjwait) -;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) -;; ((and (> loadjmp (cond -;; (load-jump-limit load-jump-limit) -;; ((> numcpus 8)(/ numcpus 2)) -;; ((> numcpus 4)(/ numcpus 1.2)) -;; (else 0.5))) -;; (> count 0)) -;; (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". " -;; (if msg msg "")) -;; (thread-sleep! adjwait) -;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) -;; (else -;; (if (> num-tries 0) -;; (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost"))) -;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing.")) -;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing.")))))) -;; -(define (get-uname . params) - (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) - (uname #f)) - (if (null? (car uname-res)) - "unknown" - (caar uname-res)))) - -;;====================================================================== -;; D I S K S P A C E -;;====================================================================== - -(define (common:get-disk-space-used fpath) - (with-input-from-pipe (conc "/usr/bin/du -s " fpath) read)) - -;;====================================================================== -;; given path get free space, allows override in [setup] -;; with free-space-script /path/to/some/script.sh -;; -(define (get-df path) - (if (configf:lookup *configdat* "setup" "free-space-script") - (with-input-from-pipe - (conc (configf:lookup *configdat* "setup" "free-space-script") " " path) - (lambda () - (let ((res (read-line))) - (if (string? 
res) - (string->number res))))) - (get-unix-df path))) - -(define (get-free-inodes path) - (if (configf:lookup *configdat* "setup" "free-inodes-script") - (with-input-from-pipe - (conc (configf:lookup *configdat* "setup" "free-inodes-script") " " path) - (lambda () - (let ((res (read-line))) - (if (string? res) - (string->number res))))) - (get-unix-inodes path))) - -(define (get-unix-df path) - (let* ((df-results (process:cmd-run->list (conc "df " path))) - (space-rx (regexp "([0-9]+)\\s+([0-9]+)%")) - (freespc #f)) - ;; (write df-results) - (for-each (lambda (l) - (let ((match (string-search space-rx l))) - (if match - (let ((newval (string->number (cadr match)))) - (if (number? newval) - (set! freespc newval)))))) - (car df-results)) - freespc)) - -(define (get-unix-inodes path) - (let* ((df-results (process:cmd-run->list (conc "df -i " path))) - (space-rx (regexp "([0-9]+)\\s+([0-9]+)%")) - (freenodes 0)) ;; 0 is a better failsafe than #f here. - ;; (write df-results) - (for-each (lambda (l) - (let ((match (string-search space-rx l))) - (if match - (let ((newval (string->number (cadr match)))) - (if (number? newval) - (set! freenodes newval)))))) - (car df-results)) - freenodes)) - -(define (common:check-space-in-dir dirpath required) - (let* ((dbspace (if (directory? dirpath) - (get-df dirpath) - 0))) - (list (> dbspace required) - dbspace - required - dirpath))) - -;;====================================================================== -;; check space in dbdir and in megatest dir -;; returns: ok/not dbspace required-space -;; -(define (common:check-db-dir-space) - (let* ((required (string->number - ;; default is 1GB (or actually a billion bytes) This is the number of 1 kB blocks. 
- (or (configf:lookup *configdat* "setup" "dbdir-space-required") - "1000000"))) - (dbdir (common:make-tmpdir-name *toppath* "")) ;; (db:get-dbdir)) - (tdbspace (common:check-space-in-dir dbdir required)) - (mdbspace (common:check-space-in-dir *toppath* required))) - (sort (list tdbspace mdbspace) (lambda (a b) - (< (cadr a)(cadr b)))))) - -;;====================================================================== -;; check available space in dbdir, exit if insufficient -;; -(define (common:check-db-dir-and-exit-if-insufficient) - (let* ((spacedat (car (common:check-db-dir-space))) ;; look only at worst for now - (is-ok (car spacedat)) - (dbspace (cadr spacedat)) - (required (caddr spacedat)) - (dbdir (cadddr spacedat))) - (if (not is-ok) - (begin - (debug:print-error 0 *default-log-port* "Insufficient space in " dbdir ", require " required ", have " dbspace ", exiting now.") - (exit 1))))) - -;;====================================================================== -;; paths is list of lists ((name path) ... ) -;; -(define (common:get-disk-with-most-free-space disks minsize) - (let* ((best #f) - (bestsize 0) - (default-min-inodes-string "1000000") - (default-min-inodes (string->number default-min-inodes-string)) - (min-inodes (or (string->number (if (configf:lookup *configdat* "setup" "min_inodes") (configf:lookup *configdat* "setup" "min_inodes") default-min-inodes-string)) default-min-inodes))) - - (for-each - (lambda (disk-num) - (let* ((dirpath (cadr (assoc disk-num disks))) - (freespc (cond - ((not (directory? dirpath)) - (if (common:low-noise-print 300 "disks not a dir " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it.")) - -1) - ((not (file-write-access? dirpath)) - (if (common:low-noise-print 300 "disks not writeable " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it.")) - -1) - ((not (eq? 
(string-ref dirpath 0) #\/)) - (if (common:low-noise-print 300 "disks not a proper path " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it.")) - -1) - (else - (get-df dirpath)))) - (free-inodes (cond - ((not (directory? dirpath)) - (if (common:low-noise-print 300 "disks not a dir " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it.")) - -1) - ((not (file-write-access? dirpath)) - (if (common:low-noise-print 300 "disks not writeable " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it.")) - -1) - ((not (eq? (string-ref dirpath 0) #\/)) - (if (common:low-noise-print 300 "disks not a proper path " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it.")) - -1) - (else - (get-free-inodes dirpath)))) - ;;(free-inodes (get-free-inodes dirpath)) - ) - (debug:print 2 *default-log-port* "INFO: disk " disk-num " path " dirpath " free space " freespc " free inodes " free-inodes) - (if (and (> freespc bestsize)(> free-inodes min-inodes )) - (begin - (set! best (cons disk-num dirpath)) - (set! 
bestsize freespc))) - ;;(print "Processing: " disk-num " bestsize: " bestsize " best: " best " freespc: " freespc " min-inodes: " min-inodes " free-inodes: " free-inodes) - )) - (map car disks)) - (if (and best (> bestsize minsize)) - best - #f))) ;; #f means no disk candidate found - -;;====================================================================== -;; convert a spec string to a list of vectors #( rx action rx-string ) -(define (common:spec-string->list-of-specs spec-string actions) - (let ((spec-strings (string-split-fields "\\s*;\\s*" spec-string #:infix)) - (actions-regex (regexp (conc "^(.*)\\s+(" (string-intersperse (map conc actions) "|") ")")))) - (filter - (lambda (x) x) - (map (lambda (s) - (let ((m (string-match actions-regex s))) - (if m - (vector (regexp (cadr m))(string->symbol (caddr m))(cadr m)) - (begin - (debug:print 0 *default-log-port* "WARNING: Unrecognised rule \"" s "\" in clean-up specification.") - #f)))) - spec-strings)))) - -;;====================================================================== -;; given a list of specs rx . rule and a file return the first matching rule -;; -(define (common:file-find-rule fname rules) ;; rule is vector #( rx action rx-string) - (let loop ((rule (car rules)) - (tail (cdr rules))) - (let ((rx (vector-ref rule 0)) - (rn (vector-ref rule 1))) ;; rule name - (if (string-match rx fname) - rule ;; return the whole rule so regex can be printed etc. - (if (null? tail) - #f - (loop (car tail)(cdr tail))))))) - -;;====================================================================== -;; given a spec apply some rules to a directory -;; -;; WARNING: This function will REMOVE files - be sure your spec and path is correct! -;; -;; spec format: -;; file-regex1 action; file-regex2 action; ... -;; e.g. 
-;; .*\.log$ keep; .* remove -;; --> keep all .log files, remove everything else -;; limitations: -;; cannot have a rule with ; as part of the spec -;; not very flexible, would be nice to return binned file names? -;; supported rules: -;; keep - keep this file -;; remove - remove this file -;; compress - compress this file -;; -(define (common:dir-clean-up path spec-string #!key (compress "gzip")(actions '(keep remove compress))(remove-empty #f)) - (let* ((specs (common:spec-string->list-of-specs spec-string actions)) - (keepers (make-hash-table)) - (directories (make-hash-table))) - (find-files - path - action: (lambda (p res) - (let ((rule (common:file-find-rule p specs))) - (cond - ((directory? p)(hash-table-set! directories p #t)) - (else - (case (vector-ref rule 1) - ((keep)(hash-table-set! keepers p rule)) - ((remove) - (debug:print 0 *default-log-port* "Removing file " p) - (delete-file p)) - ((compress) - (debug:print 0 *default-log-port* "Compressing file " p) - (system (conc compress " " p))) - (else - (debug:print 0 *default-log-port* "No match for file " p)))))))) - (if remove-empty - (for-each - (lambda (d) - (if (null? (glob (conc d "/.*")(conc d "/*"))) - (begin - (debug:print 0 *default-log-port* "Removing empty directory " d) - (delete-directory d)))) - (sort (hash-table-keys directories) (lambda (a b)(> (string-length a)(string-length b)))))) - )) - -;;====================================================================== -;; E N V I R O N M E N T V A R S -;;====================================================================== - -(define (save-environment-as-files fname #!key (ignorevars (list "USER" "HOME" "DISPLAY" "LS_COLORS" "XKEYSYMDB" "EDITOR" "MAKEFLAGS" "MAKEF" "MAKEOVERRIDES" "HOSTNAME"))) - ;;(bb-check-path msg: "save-environment-as-files entry") - (let ((envvars (get-environment-variables)) - (whitesp (regexp "[^a-zA-Z0-9_\\-:,\\.\\/%$]")) - (mungeval (lambda (val) - (cond - ((eq? val #t) "") ;; convert #t to empty string - ((eq? 
val #f) #f) ;; convert #f to itself (still thinking about this one - (else val))))) - (with-output-to-file (conc fname ".csh") - (lambda () - (for-each (lambda (keyval) - (let* ((key (car keyval)) - (val (cdr keyval)) - (delim (if (and (string-search whitesp val) - (not (string-search "^\".*\"$" val)) - (not (string-search "^'.*'$" val))) - "\"" - ""))) - - (print (if (or (member key ignorevars) - (string-search whitesp key)) - "# setenv " - "setenv ") - key " " delim (mungeval val) delim))) - envvars))) - (with-output-to-file (conc fname ".sh") - (lambda () - (for-each (lambda (keyval) - (let* ((key (car keyval)) - (val (cdr keyval)) - (delim (if (and (string-search whitesp val) - (not (string-search "^\".*\"$" val)) - (not (string-search "^'.*'$" val))) - "\"" - ""))) - (print (if (or (member key ignorevars) - (string-search whitesp key) - (string-search ":" key)) ;; internal only values to be skipped. - "# export " - "export ") - key "=" delim (mungeval val) delim))) - envvars))))) - -(define (common:get-param-mapping #!key (flavor #f)) - "returns alist mapping string keys in testconfig/subrun to megatest command line switches; if flavor is switch-symbol, maps tcmt symbolic switches to megatest switches" - (let ((default '(("tag-expr" . "-tagexpr") - ("mode-patt" . "-modepatt") - ("run-name" . "-runname") - ("contour" . "-contour") - ("target" . "-target") - ("test-patt" . "-testpatt") - ("msg" . "-m") - ("log" . "-log") - ("start-dir" . "-start-dir") - ("new" . "-set-state-status")))) - (if (eq? flavor 'switch-symbol) - (map (lambda (x) - (cons (string->symbol (conc "-" (car x))) (cdr x))) - default) - default))) - -;;====================================================================== -;; set some env vars from an alist, return an alist with original values -;; (("VAR" "value") ...) -;; a value of #f means "unset this var" -;; -(define (alist->env-vars lst) - (if (list? 
lst) - (let ((res '())) - (for-each (lambda (p) - (let* ((var (car p)) - (val (cadr p)) - (prv (get-environment-variable var))) - (set! res (cons (list var prv) res)) - (if val - (safe-setenv var (->string val)) - (unsetenv var)))) - lst) - res) - '())) - -;;====================================================================== -;; clear vars matching pattern, run proc, set vars back -;; if proc is a string run that string as a command with -;; system. -;; -(define *common:orig-env* - (let ((envvars (get-environment-variables))) - (if (get-environment-variable "MT_ORIG_ENV") - (with-input-from-string - (z3:decode-buffer (base64:base64-decode (get-environment-variable "MT_ORIG_ENV"))) - read) - (filter-map (lambda (x) - (if (string-match "^MT_.*" (car x)) - #f - x)) - envvars)))) - -(define (common:with-orig-env proc) - (let ((current-env (get-environment-variables))) - (for-each (lambda (x) (unsetenv (car x))) current-env) - (for-each (lambda (x) (setenv (car x) (cdr x))) *common:orig-env*) - (let ((rv (cond - ((string? proc)(system proc)) - (proc (proc))))) - (for-each (lambda (x) (unsetenv (car x))) *common:orig-env*) - (for-each (lambda (x) (setenv (car x) (cdr x))) current-env) - rv))) - -(define (common:without-vars proc . var-patts) - (let ((vars (make-hash-table))) - (for-each - (lambda (vardat) ;; each env var - (for-each - (lambda (var-patt) - (if (string-match var-patt (car vardat)) - (let ((var (car vardat)) - (val (cdr vardat))) - (hash-table-set! vars var val) - (unsetenv var)))) - var-patts)) - (get-environment-variables)) - (cond - ((string? 
proc)(system proc)) - (proc (proc))) - (hash-table-for-each - vars - (lambda (var val) - (setenv var val))) - vars)) - -(define (common:run-a-command cmd #!key (with-vars #f) (with-orig-env #f)) - (let* ((pre-cmd (dtests:get-pre-command)) - (post-cmd (dtests:get-post-command)) - (fullcmd (if (or pre-cmd post-cmd) - (conc pre-cmd cmd post-cmd) - (conc "viewscreen " cmd)))) - (debug:print-info 02 *default-log-port* "Running command: " fullcmd) - (cond - (with-vars (common:without-vars fullcmd)) - (with-orig-env (common:with-orig-env fullcmd)) - (else (common:without-vars fullcmd "MT_.*"))))) - -;;====================================================================== -;; C O L O R S -;;====================================================================== - -(define (common:name->iup-color name) - (case (string->symbol (string-downcase name)) - ((red) "223 33 49") - ((grey) "192 192 192") - ((orange) "255 172 13") - ((purple) "This is unfinished ..."))) - -;;====================================================================== -;; (define (common:get-color-for-state-status state status) -;; (case (string->symbol state) -;; ((COMPLETED) -;; (case (string->symbol status) -;; ((PASS) "70 249 73") -;; ((WARN WAIVED) "255 172 13") -;; ((SKIP) "230 230 0") -;; (else "223 33 49"))) -;; ((LAUNCHED) "101 123 142") -;; ((CHECK) "255 100 50") -;; ((REMOTEHOSTSTART) "50 130 195") -;; ((RUNNING) "9 131 232") -;; ((KILLREQ) "39 82 206") -;; ((KILLED) "234 101 17") -;; ((NOT_STARTED) "240 240 240") -;; (else "192 192 192"))) - -(define (common:iup-color->rgb-hex instr) - (string-intersperse - (map (lambda (x) - (number->string x 16)) - (map string->number - (string-split instr))) - "/")) - -;;====================================================================== -;; L O C K I N G M E C H A N I S M S -;;====================================================================== - -;;====================================================================== -;; faux-lock is deprecated. 
Please use simple-lock below -;; -(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t)) - (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count - (if (> wait-time 0) - (begin - (thread-sleep! 1) - (if (eq? wait-time 1) ;; only one second left, steal the lock - (begin - (debug:print-info 0 *default-log-port* "stealing lock for " keyname) - (common:faux-unlock keyname force: #t))) - (common:faux-lock keyname wait-time: (- wait-time 1))) - #f) - (begin - (rmt:no-sync-set keyname (conc (current-process-id))) - (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))))) - -(define (common:faux-unlock keyname #!key (force #f)) - (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))) - (begin - (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname)) - #t) - #f)) - -;;====================================================================== -;; simple lock. improve and converge on this one. -;; -(define (common:simple-lock keyname) - (rmt:no-sync-get-lock keyname)) - -(define (common:simple-unlock keyname #!key (force #f)) - (rmt:no-sync-del! keyname)) - -;;====================================================================== -;; -;;====================================================================== - -(define (common:in-running-test?) - (and (args:get-arg "-execute") (get-environment-variable "MT_CMDINFO"))) - -(define (common:get-color-from-status status) - (cond - ((equal? status "PASS") "green") - ((equal? status "FAIL") "red") - ((equal? status "WARN") "orange") - ((equal? status "KILLED") "orange") - ((equal? status "KILLREQ") "purple") - ((equal? status "RUNNING") "blue") - ((equal? 
status "ABORT") "brown") - (else "black"))) - -;;====================================================================== -;; N A N O M S G C L I E N T -;;====================================================================== -;; -;; -;; -;; (define (common:send-dboard-main-changed) -;; (let* ((dashboard-ips (mddb:get-dashboards))) -;; (for-each -;; (lambda (ipadr) -;; (let* ((soc (common:open-nm-req (conc "tcp://" ipadr))) -;; (msg (conc "main " *toppath*)) -;; (res (common:nm-send-receive-timeout soc msg))) -;; (if (not res) ;; couldn't reach that dashboard - remove it from db -;; (print "ERROR: couldn't reach dashboard " ipadr)) -;; res)) -;; dashboard-ips))) -;; -;; -;; ;;====================================================================== -;; ;; D A S H B O A R D D B -;; ;;====================================================================== -;; -;; (define (mddb:open-db) -;; (let* ((db (open-database (conc (get-environment-variable "HOME") "/.dashboard.db")))) -;; (set-busy-handler! 
db (busy-timeout 10000)) -;; (for-each -;; (lambda (qry) -;; (exec (sql db qry))) -;; (list -;; "CREATE TABLE IF NOT EXISTS vars (id INTEGER PRIMARY KEY,key TEXT, val TEXT, CONSTRAINT varsconstraint UNIQUE (key));" -;; "CREATE TABLE IF NOT EXISTS dashboards ( -;; id INTEGER PRIMARY KEY, -;; pid INTEGER, -;; username TEXT, -;; hostname TEXT, -;; ipaddr TEXT, -;; portnum INTEGER, -;; start_time TIMESTAMP DEFAULT (strftime('%s','now')), -;; CONSTRAINT hostport UNIQUE (hostname,portnum) -;; );" -;; )) -;; db)) -;; -;; ;; register a dashboard -;; ;; -;; (define (mddb:register-dashboard port) -;; (let* ((pid (current-process-id)) -;; (hostname (get-host-name)) -;; (ipaddr (server:get-best-guess-address hostname)) -;; (username (current-user-name)) ;; (car userinfo))) -;; (db (mddb:open-db))) -;; (print "Register monitor, pid: " pid ", hostname: " hostname ", port: " port ", username: " username) -;; (exec (sql db "INSERT OR REPLACE INTO dashboards (pid,username,hostname,ipaddr,portnum) VALUES (?,?,?,?,?);") -;; pid username hostname ipaddr port) -;; (close-database db))) -;; -;; ;; unregister a monitor -;; ;; -;; (define (mddb:unregister-dashboard host port) -;; (let* ((db (mddb:open-db))) -;; (print "Register unregister monitor, host:port=" host ":" port) -;; (exec (sql db "DELETE FROM dashboards WHERE hostname=? 
AND portnum=?;") host port) -;; (close-database db))) -;; -;; ;; get registered dashboards -;; ;; -;; (define (mddb:get-dashboards) -;; (let ((db (mddb:open-db))) -;; (query fetch-column -;; (sql db "SELECT ipaddr || ':' || portnum FROM dashboards;")))) - -;;====================================================================== -;; T E S T L A U N C H I N G P E R I T E M W I T H H O S T T Y P E S -;;====================================================================== -;; -;; [hosts] -;; arm cubie01 cubie02 -;; x86_64 zeus xena myth01 -;; allhosts #{g hosts arm} #{g hosts x86_64} -;; -;; [host-types] -;; general #MTLOWESTLOAD #{g hosts allhosts} -;; arm #MTLOWESTLOAD #{g hosts arm} -;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo -;; -;; [host-rules] -;; # maxnload => max normalized load -;; # maxnjobs => max jobs per cpu -;; # maxjobrate => max jobs per second -;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 -;; -;; [launchers] -;; envsetup general -;; xor/%/n 4C16G -;; % nbgeneral -;; -;; [jobtools] -;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. -;; flexi-launcher yes -;; launcher nbfake -;; -(define (common:get-launcher configdat testname itempath) - (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) - (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher - (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no"))) - (let* ((launchers (hash-table-ref/default configdat "launchers" '()))) - (if (null? 
launchers) - fallback-launcher - (let loop ((hed (car launchers)) - (tal (cdr launchers))) - (let ((patt (car hed)) - (host-type (cadr hed))) - (if (tests:match patt testname itempath) - (begin - (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) - (let ((launcher (configf:lookup configdat "host-types" host-type))) - (if launcher - (let* ((launcher-parts (string-split launcher)) - (launcher-exe (car launcher-parts))) - (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline - (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) - (count 100)) - (if targ-host - (conc "remrun " targ-host) - (if (> count 0) - (begin - (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) - (thread-sleep! (- 101 count)) - (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) - (- count 1))) - (begin - (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) - (exit))))) - launcher)) - (begin - (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) - (if (null? tal) - fallback-launcher - (loop (car tal)(cdr tal))))))) - ;; no match, try again - (if (null? 
tal) - fallback-launcher - (loop (car tal)(cdr tal)))))))) - fallback-launcher))) - -;;====================================================================== -;; D A S H B O A R D U S E R V I E W S -;;====================================================================== - -;;====================================================================== -;; first read ~/views.config if it exists, then read $MTRAH/views.config if it exists -;; -(define (common:load-views-config) - (let* ((view-cfgdat (make-hash-table)) - (home-cfgfile (conc (get-environment-variable "HOME") "/.mtviews.config")) - (mthome-cfgfile (conc *toppath* "/.mtviews.config"))) - (if (common:file-exists? mthome-cfgfile) - (read-config mthome-cfgfile view-cfgdat #t)) - ;; we load the home dir file AFTER the MTRAH file so the user can clobber settings when running the dashboard in read-only areas - (if (common:file-exists? home-cfgfile) - (read-config home-cfgfile view-cfgdat #t)) - view-cfgdat)) - -;;====================================================================== -;; H I E R A R C H I C A L H A S H T A B L E S -;;====================================================================== -;; -;; Every element including top element is a vector: -;; - -(define (hh:make-hh #!key (ht #f)(value #f)) - (vector (or ht (make-hash-table)) value)) - -;;====================================================================== -;; used internally -(define-inline (hh:set-ht! hh ht) (vector-set! hh 0 ht)) -(define-inline (hh:get-ht hh) (vector-ref hh 0)) -(define-inline (hh:set-value! hh value) (vector-set! hh 1 value)) -(define-inline (hh:get-value hh value) (vector-ref hh 1)) - -;;====================================================================== -;; given a hierarchial hash and some keys look up the value ... -;; -(define (hh:get hh . keys) - (if (null? 
keys) - (vector-ref hh 1) ;; we have reached the end of the line, return the value sought - (let ((sub-ht (hh:get-ht hh))) - (if sub-ht ;; yes, there is more hierarchy - (let ((sub-hh (hash-table-ref/default sub-ht (car keys) #f))) - (if sub-hh - (apply hh:get sub-hh (cdr keys)) - #f)) - #f)))) - -;;====================================================================== -;; given a hierarchial hash, a value and some keys, add needed hierarcy and insert the value -;; -(define (hh:set! hh value . keys) - (if (null? keys) - (hh:set-value! hh value) ;; we have reached the end of the line, store the value - (let ((sub-ht (hh:get-ht hh))) - (if sub-ht ;; yes, there is more hierarchy - (let ((sub-hh (hash-table-ref/default sub-ht (car keys) #f))) - (if (not sub-hh) ;; we'll need to add the next level of hierarchy - (let ((new-sub-hh (hh:make-hh))) - (hash-table-set! sub-ht (car keys) new-sub-hh) - (apply hh:set! new-sub-hh value (cdr keys))) - (apply hh:set! sub-hh value (cdr keys)))) ;; call the sub-hierhash with remaining keys - (begin - (hh:set-ht! hh (make-hash-table)) - (apply hh:set! hh value keys)))))) - -;;====================================================================== -;; Manage pkts, used in servers, tests and likely other contexts so put -;; in common -;;====================================================================== - -(define common:pkts-spec - '((default . ((parent . P) - (action . a) - (filename . f))) - (configf . ((parent . P) - (action . a) - (filename . f))) - (server . ((action . a) - (pid . d) - (ipaddr . i) - (port . p) - (parent . P))) - - (test . ((cpuuse . c) - (diskuse . d) - (item-path . i) - (runname . r) - (state . s) - (target . t) - (status . u) - (parent . 
P))))) - -(define (common:get-pkts-dirs mtconf use-lt) - (let* ((pktsdirs-str (or (configf:lookup mtconf "setup" "pktsdirs") - (and use-lt - (conc (or *toppath* - (current-directory)) - "/lt/.pkts")))) - (pktsdirs (if pktsdirs-str - (string-split pktsdirs-str " ") - #f))) - pktsdirs)) - -;;====================================================================== -;; use-lt is use linktree "lt" link to find pkts dir -(define (common:save-pkt pktalist-in mtconf use-lt #!key (add-only #f)) ;; add-only saves the pkt only if there is a parent already - (if (or (not add-only) - (hash-table-exists? *pkts-info* 'last-parent)) - (let* ((parent (hash-table-ref/default *pkts-info* 'last-parent #f)) - (pktalist (if parent - (cons `(parent . ,parent) - pktalist-in) - pktalist-in))) - (let-values (((uuid pkt) - (alist->pkt pktalist common:pkts-spec))) - (hash-table-set! *pkts-info* 'last-parent uuid) - (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f) - (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) - (pktsdir (car pktsdirs))) ;; assume it is there - (hash-table-set! *pkts-info* 'pkts-dir pktsdir) - pktsdir)))) - (debug:print 0 *default-log-port* "pktsdir: "pktsdir) - (handle-exceptions - exn - (debug:print-info 0 "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!! - (if (not (file-exists? 
pktsdir)) - (create-directory pktsdir #t)) - (with-output-to-file - (conc pktsdir "/" uuid ".pkt") - (lambda () - (print pkt))))))))) - -(define (common:with-queue-db mtconf proc #!key (use-lt #f)(toppath-in #f)) - (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) - (pktsdir (if pktsdirs (car pktsdirs) #f)) - (toppath (or (configf:lookup mtconf "scratchdat" "toppath") - toppath-in)) - (pdbpath (or (configf:lookup mtconf "setup" "pdbpath") pktsdir))) - (cond - ((not (and pktsdir toppath pdbpath)) - (debug:print 0 *default-log-port* "ERROR: settings are missing in your megatest.config for area management.") - (debug:print 0 *default-log-port* " you need to have pktsdirs in the [setup] section.")) - ((not (common:file-exists? pktsdir)) - (debug:print 0 *default-log-port* "ERROR: pkts directory not found " pktsdir)) - ((not (equal? (file-owner pktsdir)(current-effective-user-id))) - (debug:print 0 *default-log-port* "ERROR: directory " pktsdir " is not owned by " (current-effective-user-name))) - (else - (let* ((pdb (open-queue-db pdbpath "pkts.db" - schema: '("CREATE TABLE groups (id INTEGER PRIMARY KEY,groupname TEXT, CONSTRAINT group_constraint UNIQUE (groupname));")))) - (proc pktsdirs pktsdir pdb) - (dbi:close pdb)))))) - -(define (common:load-pkts-to-db mtconf #!key (use-lt #f)) - (common:with-queue-db - mtconf - (lambda (pktsdirs pktsdir pdb) - (for-each - (lambda (pktsdir) ;; look at all - (cond - ((not (common:file-exists? pktsdir)) - (debug:print 0 *default-log-port* "ERROR: packets directory " pktsdir " does not exist.")) - ((not (directory? pktsdir)) - (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not a directory.")) - ((not (file-read-access? 
pktsdir)) - (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not readable.")) - (else - (debug:print-info 0 *default-log-port* "Loading packets found in " pktsdir) - (let ((pkts (glob (conc pktsdir "/*.pkt"))) - (sqdb (dbi:db-conn pdb)) - ) - ;; Put this in a transaction to avoid issues overloading the db - (sqlite3:with-transaction - sqdb - (lambda () - (for-each - (lambda (pkt) - (let* ((uuid (cadr (string-match ".*/([0-9a-f]+).pkt" pkt))) - (exists (lookup-by-uuid pdb uuid #f))) - (if (not exists) - (let* ((pktdat (string-intersperse - (with-input-from-file pkt read-lines) - "\n")) - (apkt (pkt->alist pktdat)) - (ptype (alist-ref 'T apkt))) - (add-to-queue pdb pktdat uuid (or ptype 'cmd) #f 0) - (debug:print 4 *default-log-port* "Added " uuid " of type " ptype " to queue")) - (debug:print 4 *default-log-port* "pkt: " uuid " exists, skipping...") - ))) - pkts))))))) - pktsdirs)) - use-lt: use-lt)) - -(define (common:get-pkt-alists pkts) - (map (lambda (x) - (alist-ref 'apkt x)) ;; 'pkta pulls out the alist from the read pkt - pkts)) - -;;====================================================================== -;; given list of pkts (alist mode) return list of D cards as Unix epoch, sorted descending -;; also delete duplicates by target i.e. (car pkt) -;; -(define (common:get-pkt-times pkts) - (delete-duplicates - (sort - (map (lambda (x) - `(,(alist-ref 't x) . ,(string->number (alist-ref 'D x)))) - pkts) - (lambda (a b)(> (cdr a)(cdr b)))) ;; sort descending - (lambda (a b)(equal? (car a)(car b))))) ;; remove duplicates by target - - -(define *common:thread-punchlist* (make-hash-table)) -(define (common:send-thunk-to-background-thread thunk #!key (name #f)) - ;;(BB> "launched thread " name) - ;; we need a unique name for the thread. 
- (let* ((realname (if name - (if (not (hash-table-ref/default *common:thread-punchlist* name #f)) - name - (conc name"-" (symbol->string (gensym)))) - (conc "anonymous-"(symbol->string (gensym))))) - (realthunk (lambda () - (let ((res (thunk))) - (hash-table-delete! *common:thread-punchlist* realname) - res))) - (thread (make-thread realthunk realname))) - (hash-table-set! *common:thread-punchlist* realname thread) - (thread-start! thread) - )) - -(define (common:join-backgrounded-threads) - ;; may need to trap and ignore exceptions -- dunno how atomic threads are... - (for-each - (lambda (thread-name) - (let* ((thread (hash-table-ref/default *common:thread-punchlist* thread-name #f))) - (if thread - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "joining threads failed. exn=" exn) - #t) ;; just ignore it, it might have died in the meantime so joining it will throw an exception - (thread-join! thread)) - ))) - (hash-table-keys *common:thread-punchlist*))) + + + ;;====================================================================== ;; (define *common:telemetry-log-state* 'startup) ;; (define *common:telemetry-log-socket* #f) ;; Index: commonmod.scm ================================================================== --- commonmod.scm +++ commonmod.scm @@ -896,8 +896,2938 @@ (debug:print 2 *default-log-port* "Running " cmd " received " output) (if (eq? (length output) 0) #f #t)) #t))) ;; assuming bad query is about a live test is likely not the right thing to do? + +(define (remove-files filespec) + (let ((files (glob filespec))) + (for-each delete-file files))) + +(define (stop-the-train) + (thread-start! (make-thread (lambda () + (let loop () + (if (and *toppath* + (file-exists? 
(conc *toppath*"/stop-the-train"))) + (let* ((msg (conc "ERROR: found file "*toppath*"/stop-the-train, exiting immediately"))) + ;; yes, print to current-output-port AND *default-log-port*, annoying but necessary I think + (print msg) + (debug:print 0 *default-log-port* msg) + (remove-files (conc *toppath* "/logs/server*")) + (remove-files (conc *toppath* "/.servinfo/*")) + (remove-files (conc *toppath* "/.mtdb/*lock")) + (exit 1))) + (thread-sleep! 5) + (loop)))))) + +;; execute thunk, return value. If exception thrown, trap exception, return #f, and emit nonfatal condition note to *default-log-port* . +;; arguments - thunk, message +(define (common:fail-safe thunk warning-message-on-exception) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "notable but nonfatal condition - "warning-message-on-exception", exn=" exn) + (debug:print-info 0 *default-log-port* + (string-substitute "\n?Error:" "nonfatal condition:" + (with-output-to-string + (lambda () + (print-error-message exn) )))) + (debug:print-info 0 *default-log-port* " -- continuing after nonfatal condition...") + #f) + (thunk))) + + +;; returns list of fd count, socket count +(define (get-file-descriptor-count #!key (pid (current-process-id ))) + (list + (length (glob (conc "/proc/" pid "/fd/*"))) + (length (filter identity (map socket? (glob (conc "/proc/" pid "/fd/*"))))) + ) +) + + + +;; GLOBALS + +;; CONTEXTS +(defstruct cxt + (taskdb #f) + (cmutex (make-mutex))) +;; (define *contexts* (make-hash-table)) +;; (define *context-mutex* (make-mutex)) + +;; ;; safe method for accessing a context given a toppath +;; ;; +;; (define (common:with-cxt toppath proc) +;; (mutex-lock! *context-mutex*) +;; (let ((cxt (hash-table-ref/default *contexts* toppath #f))) +;; (if (not cxt) +;; (set! cxt (let ((x (make-cxt)))(hash-table-set! *contexts* toppath x) x))) +;; (let ((cxt-mutex (cxt-mutex cxt))) +;; (mutex-unlock! *context-mutex*) +;; (mutex-lock! 
cxt-mutex) +;; (let ((res (proc cxt))) +;; (mutex-unlock! cxt-mutex) +;; res)))) + +;; A hash table that can be accessed by #{scheme ...} calls in +;; config files. Allows communicating between confgs +;; +(define *user-hash-data* (make-hash-table)) + +(define *db-keys* #f) + +(define *pkts-info* (make-hash-table)) ;; store stuff like the last parent here +(define *configinfo* #f) ;; raw results from setup, includes toppath and table from megatest.config +(define *runconfigdat* #f) ;; run configs data +(define *configdat* #f) ;; megatest.config data +(define *configstatus* #f) ;; status of data; 'fulldata : all processing done, #f : no data yet, 'partialdata : partial read done +;; (define *toppath* #f) ;; moved to commonmod +(define *already-seen-runconfig-info* #f) + +(define *test-meta-updated* (make-hash-table)) +(define *globalexitstatus* 0) ;; attempt to work around possible thread issues +(define *passnum* 0) ;; when running track calls to run-tests or similar +;; (define *alt-log-file* #f) ;; used by -log +;; (define *common:denoise* (make-hash-table)) ;; for low noise printing +(define *default-log-port* (current-error-port)) +(define *time-zero* (current-seconds)) ;; for the watchdog +(define *on-exit-procs* '()) ;; add procs to this list to be executed on exit +(define *default-area-tag* "local") + +;; DATABASE +;; db access +(define *db-last-access* (current-seconds)) ;; last db access, used in server +;; (define *db-write-access* #t) +;; db sync +;; (define *db-last-sync* 0) ;; last time the sync to megatest.db happened +(define *db-sync-in-progress* #f) ;; if there is a sync in progress do not try to start another +;; (define *db-multi-sync-mutex* (make-mutex)) ;; protect access to *db-sync-in-progress*, *db-last-sync* +;; task db +(define *task-db* #f) ;; (vector db path-to-db) +(define *db-access-allowed* #t) ;; flag to allow access +;; (define *db-access-mutex* (make-mutex)) ;; moved to dbfile +;; (define *db-transaction-mutex* (make-mutex)) 
+(define *db-cache-path* #f) +;; (define *db-with-db-mutex* (make-mutex)) +(define *db-api-call-time* (make-hash-table)) ;; hash of command => (list of times) + +;; SERVER +(define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg +(define *runremote* #f) ;; if set up for server communication this will hold +;; (define *max-cache-size* 0) +(define *logged-in-clients* (make-hash-table)) +(define *server-id* #f) +;; (define *server-info* #f) ;; good candidate for easily convert to non-global +(define *time-to-exit* #f) +(define *run-id* #f) +(define *server-kind-run* (make-hash-table)) +(define *home-host* #f) +;; (define *total-non-write-delay* 0) +(define *heartbeat-mutex* (make-mutex)) +;; (define *api-process-request-count* 0) +;; (define *max-api-process-requests* 0) +(define *server-overloaded* #f) + +;; client +(define *rmt-mutex* (make-mutex)) ;; remote access calls mutex + +;; RPC transport +(define *rpc:listener* #f) + +;; KEY info +(define *target* (make-hash-table)) ;; cache the target here; target is keyval1/keyval2/.../keyvalN +(define *keys* (make-hash-table)) ;; cache the keys here +(define *keyvals* (make-hash-table)) +(define *toptest-paths* (make-hash-table)) ;; cache toptest path settings here +(define *test-paths* (make-hash-table)) ;; cache test-id to test run paths here +(define *test-ids* (make-hash-table)) ;; cache run-id, testname, and item-path => test-id +(define *test-info* (make-hash-table)) ;; cache the test info records, update the state, status, run_duration etc. 
from testdat.db + +(define *run-info-cache* (make-hash-table)) ;; run info is stable, no need to reget +(define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set +(define *homehost-mutex* (make-mutex)) + +;; Miscellaneous +(define *triggers-mutex* (make-mutex)) ;; block overlapping processing of triggers +(define *numcpus-cache* (make-hash-table)) + +;; this plugs a hole in posix-extras in recent chicken versions > 4.9) +(let-values (( (chicken-release-number chicken-major-version) + (apply values + (map string->number + (take + (string-split (chicken-version) ".") + 2))))) + (let ((resolve-pathname-broken? + (or (> chicken-release-number 4) + (and (eq? 4 chicken-release-number) (> chicken-major-version 9))))) + (if resolve-pathname-broken? + (define ##sys#expand-home-path pathname-expand)))) + +(define (realpath x) (resolve-pathname (pathname-expand (or x "/dev/null")) )) + +(define (common:get-this-exe-fullpath #!key (argv (argv))) + (let* ((this-script + (cond + ((and (> (length argv) 2) + (string-match "^(.*/csi|csi)$" (car argv)) + (string-match "^-(s|ss|sx|script)$" (cadr argv))) + (caddr argv)) + (else (car argv)))) + (fullpath (realpath this-script))) + fullpath)) + +;;====================================================================== + +(define *common:this-exe-fullpath* (common:get-this-exe-fullpath)) +(define *common:this-exe-dir* (pathname-directory *common:this-exe-fullpath*)) +(define *common:this-exe-name* (pathname-strip-directory *common:this-exe-fullpath*)) + +(define (common:get-sync-lock-filepath) + (let* ((tmp-area (common:make-tmpdir-name *toppath* "")) + (lockfile (conc tmp-area "/megatest.db.lock"))) + lockfile)) + +(define *common:logpro-exit-code->status-sym-alist* + '( ( 0 . pass ) + ( 1 . fail ) + ( 2 . warn ) + ( 3 . check ) + ( 4 . waived ) + ( 5 . abort ) + ( 6 . 
skip ))) + +(define (common:logpro-exit-code->status-sym exit-code) + (or (alist-ref exit-code *common:logpro-exit-code->status-sym-alist*) 'fail)) + +(define (common:worse-status-sym ss1 ss2) + (let loop ((status-syms-remaining '(abort fail check skip warn waived pass))) + (cond + ((null? status-syms-remaining) + 'fail) + ((eq? (car status-syms-remaining) ss1) + ss1) + ((eq? (car status-syms-remaining) ss2) + ss2) + (else + (loop (cdr status-syms-remaining)))))) + +(define (common:steps-can-proceed-given-status-sym status-sym) + (if (member status-sym '(warn waived pass)) + #t + #f)) + +(define (status-sym->string status-sym) + (case status-sym + ((pass) "PASS") + ((fail) "FAIL") + ((warn) "WARN") + ((check) "CHECK") + ((waived) "WAIVED") + ((abort) "ABORT") + ((skip) "SKIP") + (else "FAIL"))) + +(define (common:logpro-exit-code->test-status exit-code) + (status-sym->string (common:logpro-exit-code->status-sym exit-code))) + +;; +(defstruct remote + + ;; transport to be used + ;; http - use http-transport + ;; http-read-cached - use http-transport for writes but in-mem cached for reads + (rmode 'http) + (hh-dat (let ((res (or (server:choose-server *toppath* 'homehost) + (cons #f #f)))) + (assert (pair? 
res)(conc "FATAL: hh-dat should be a pair, got "res)) + res)) + (server-url #f) ;; (server:check-if-running *toppath*) #f)) + (server-id #f) + (server-info #f) ;; (if *toppath* (server:check-if-running *toppath*) #f)) + (last-server-check 0) ;; last time we checked to see if the server was alive + (connect-time (current-seconds)) ;; when we first connected + (last-access (current-seconds)) ;; last time we talked to server + ;; (conndat #f) ;; iface port api-uri api-url api-req seconds server-id + (server-timeout (server:expiration-timeout)) + (force-server #f) + (ro-mode #f) + (ro-mode-checked #f) ;; flag that indicates we have checked for ro-mode + + ;; conndat stuff + (iface #f) ;; TODO: Consolidate this data with server-url and server-info above + (port #f) + (api-url #f) + (api-uri #f) + (api-req #f)) + +;; launching and hosts +(defstruct host + (reachable #f) + (last-update 0) + (last-used 0) + (last-cpuload 1)) + +(define *host-loads* (make-hash-table)) + +;; cache environment vars for each run here +(define *env-vars-by-run-id* (make-hash-table)) + +;; Testconfig and runconfig caches. +(define *testconfigs* (make-hash-table)) ;; test-name => testconfig +(define *runconfigs* (make-hash-table)) ;; target => runconfig + +;; This is a cache of pre-reqs met, don't re-calc in cases where called with same params less than +;; five seconds ago +(define *pre-reqs-met-cache* (make-hash-table)) + +;; cache of verbosity given string +;; +(define *verbosity-cache* (make-hash-table)) + +(define (common:clear-caches) + (set! *target* (make-hash-table)) + (set! *keys* (make-hash-table)) + (set! *keyvals* (make-hash-table)) + (set! *toptest-paths* (make-hash-table)) + (set! *test-paths* (make-hash-table)) + (set! *test-ids* (make-hash-table)) + (set! *test-info* (make-hash-table)) + (set! *run-info-cache* (make-hash-table)) + (set! *env-vars-by-run-id* (make-hash-table)) + (set! 
*test-id-cache* (make-hash-table)))
+
+;; Generic string database
+(define sdb:qry #f) ;; (make-sdb:qry)) ;; 'init #f)
+;; Generic path database
+(define *fdb* #f)
+
+(define *last-launch* (current-seconds)) ;; use for throttling the launch rate. Would be better to use the db and last time of a test in LAUNCHED state.
+
+;;======================================================================
+;; V E R S I O N
+;;======================================================================
+
+;; full version string: megatest-version and the complete fossil hash joined by "-"
+(define (common:get-full-version)
+  (conc megatest-version "-" megatest-fossil-hash))
+
+;; version signature: megatest-version and the first four characters of the fossil hash joined by "-"
+(define (common:version-signature)
+  (conc megatest-version "-" (substring megatest-fossil-hash 0 4)))
+
+;;======================================================================
+;; from metadat lookup MEGATEST_VERSION
+;;
+(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB
+  (rmt:get-var "MEGATEST_VERSION"))
+
+;; the first six characters of the stored MEGATEST_VERSION value converted to a number
+(define (common:get-last-run-version-number)
+  (string->number
+   (substring (common:get-last-run-version) 0 6)))
+
+;; store the current version signature as MEGATEST_VERSION
+(define (common:set-last-run-version)
+  (rmt:set-var "MEGATEST_VERSION" (common:version-signature)))
+
+;;======================================================================
+;; positive number if megatest version > db version
+;; negative number if megatest version < db version
+(define (common:version-db-delta)
+  (- megatest-version (common:get-last-run-version-number)))
+
+;; #t when the stored MEGATEST_VERSION differs from the current version signature
+(define (common:version-changed?)
+  (not (equal? (common:get-last-run-version)
+               (common:version-signature))))
+
+
+;; From 1.70 to 1.80, db's are compatible.
+;;
+;; Returns #t when the running megatest's major version (the first four
+;; characters of the version string) is not "1.80" AND differs from the
+;; major version recorded by the last run; otherwise returns #f.
+
+(define (common:api-changed?)
+  (let* (
+         (megatest-major-version (substring (->string megatest-version) 0 4))
+         (run-major-version      (substring (conc (common:get-last-run-version)) 0 4))
+         )
+    (and (not (equal? megatest-major-version "1.80"))
+         ;; compare against the major version computed above from the last run;
+         ;; this previously named megatest-run-version, which is not bound here and
+         ;; was only reached (and would fail) when the running version is not "1.80"
+         (not (equal? megatest-major-version run-major-version)))
+    )
+)
+
+;;======================================================================
+;; Move me elsewhere ...
+;; RADT => Why do we meed the version check here, this is called only if version misma
+;;
+;; NOTE: both transport branches below issue the same db:multi-db-sync call and
+;; the `full` keyword is accepted but not used in this body.
+(define (common:cleanup-db dbstruct #!key (full #f))
+  (case (rmt:transport-mode)
+    ((http)
+     (apply db:multi-db-sync
+            dbstruct
+            'schema
+            'killservers
+            'adj-target
+            'new2old
+            '(dejunk)
+            ))
+    ((tcp nfs)
+     (apply db:multi-db-sync
+            dbstruct
+            'schema
+            'killservers
+            'adj-target
+            'new2old
+            '(dejunk)
+            )))
+  (if (common:api-changed?)
+      (common:set-last-run-version)))
+
+(define (common:snapshot-file filepath #!key (subdir ".") )
+  (if (file-exists? filepath)
+      (let* ((age-sec (lambda (file)
+                        (if (file-exists? file)
+                            (- (current-seconds) (file-modification-time file))
+                            1000000000))) ;; return really old value if file doesn't exist. we want to clobber it if old or not exist.
+             (ok-flag #t)
+             (age-mins (lambda (file) (/ (age-sec file) 60)))
+             (age-hrs (lambda (file) (/ (age-mins file) 60)))
+             (age-days (lambda (file) (/ (age-hrs file) 24)))
+             (age-wks (lambda (file) (/ (age-days file) 7)))
+             ;; docmd runs a shell command only while no earlier command has failed
+             (docmd (lambda (cmd)
+                      (cond
+                       (ok-flag
+                        (let ((res (system cmd)))
+                          (cond
+                           ((eq? 0 res)
+                            #t)
+                           (else
+                            (set! ok-flag #f)
+                            (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Command failed with exit code "
+                                         (if (< res 0)
+                                             res
+                                             (/ res 8)) " ["cmd"]" )
+                            #f))))
+                       (else
+                        (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Not runnining command due to prior error. 
["cmd"]") + #f)))) + (copy (lambda (src dest) (docmd (conc "/bin/cp '"src"' '"dest"'")))) + (copy+zip (lambda (src dest) (docmd (conc "gzip -c - < '"src"' > '"dest"'")))) + (fullpath (realpath filepath)) + (basedir (pathname-directory fullpath)) + (basefile (pathname-strip-directory fullpath)) + ;;(prevfile (conc filepath ".prev.gz")) + (minsfile (conc basedir "/" subdir "/" basefile ".mins.gz")) + (hrsfile (conc basedir "/" subdir "/" basefile ".hrs.gz")) + (daysfile (conc basedir "/" subdir "/" basefile ".days.gz")) + (wksfile (conc basedir "/" subdir "/" basefile ".weeks.gz"))) + + ;; create subdir it not exists + (if (not (directory-exists? (conc basedir "/" subdir))) + (docmd (conc "/bin/mkdir -p '"(conc basedir "/" subdir)"'"))) + + ;; copy&zip to .mins if not exists + (if (not (file-exists? minsfile)) + (copy+zip filepath minsfile)) + ;; copy .mins to .hrs if not exists + (if (not (file-exists? hrsfile)) + (copy minsfile hrsfile)) + ;; copy .hrs to .days if not exists + (if (not (file-exists? daysfile)) + (copy hrsfile daysfile)) + ;; copy .days to .weeks if not exists + (if (not (file-exists? 
wksfile)) + (copy daysfile wksfile)) + + + ;; if age(.mins.gz) >= 1h: + ;; copy .mins.gz .hrs.gz + ;; copy .mins.gz + (when (>= (age-mins minsfile) 1) + (copy minsfile hrsfile) + (copy+zip filepath minsfile)) + + ;; if age(.hrs.gz) >= 1d: + ;; copy .hrs.gz .days.gz + ;; copy .mins.gz .hrs.gz + (when (>= (age-days hrsfile) 1) + (copy hrsfile daysfile) + (copy minsfile hrsfile)) + + ;; if age(.days.gz) >= 1w: + ;; copy .days.gz .weeks.gz + ;; copy .hrs.gz .days.gz + (when (>= (age-wks daysfile) 1) + (copy daysfile wksfile) + (copy hrsfile daysfile)) + #t) + #f)) + +;;====================================================================== +;; Rotate logs, logic: +;; if > 500k and older than 1 week: +;; remove previous compressed log and compress this log +;; WARNING: This proc operates assuming that it is in the directory above the +;; logs directory you wish to log-rotate. +;; +(define (common:rotate-logs) + (let* ((all-files (make-hash-table)) + (stats (make-hash-table)) + (inc-stat (lambda (key) + (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1)))) + (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "600")))) ;; name -> age + (if (not (directory-exists? "logs"))(create-directory "logs")) + (directory-fold + (lambda (file rem) + (handle-exceptions + exn + (begin + (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore. exn=" exn) + (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + ;; (print-call-chain (current-error-port)) ;; + ) + (let* ((fullname (conc "logs/" file)) + (mod-time (file-modification-time fullname)) + (file-age (- (current-seconds) mod-time)) + (file-old (> file-age (* 48 60 60))) + (file-big (> (file-size fullname) 200000))) + (hash-table-set! 
all-files file mod-time)
+               ;; compress logs that are old and big, and server logs that are merely old
+               (if (or (and (string-match "^.*.log" file)
+                            file-old
+                            file-big)
+                       (and (string-match "^server-.*.log" file)
+                            file-old))
+                   (let ((gzfile (conc fullname ".gz")))
+                     (if (common:file-exists? gzfile)
+                         (begin
+                           (debug:print-info 0 *default-log-port* "removing " gzfile)
+                           (delete-file* gzfile)
+                           (hash-table-delete! all-files gzfile) ;; needed?
+                           ))
+                     (debug:print-info 0 *default-log-port* "compressing " file)
+                     (system (conc "gzip " fullname))
+                     (inc-stat "gzipped")
+                     ;; all-files maps name -> modification time and is later sorted ascending
+                     ;; to pick the oldest files for removal, so record mod-time here too
+                     ;; (this previously stored file-age, which sorted new archives as oldest)
+                     (hash-table-set! all-files (conc file ".gz") mod-time) ;; add the .gz file and remove the base file
+                     (hash-table-delete! all-files file)
+                     )
+                   (if (and (> file-age (* (string->number (or (configf:lookup *configdat* "setup" "log-expire-days") "30")) 24 3600))
+                            (file-exists? fullname)) ;; just in case it was gzipped - will get it next time
+                       (handle-exceptions
+                        exn
+                        #f
+                        (if (directory? fullname)
+                            (begin
+                              (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
+                              (inc-stat "directories"))
+                            (begin
+                              (delete-file* fullname)
+                              (inc-stat "deleted")))
+                        (hash-table-delete! all-files file)))))))
+     '()
+     "logs")
+    ;; report the non-zero counters gathered above
+    (for-each
+     (lambda (category)
+       (let ((quant (hash-table-ref/default stats category 0)))
+         (if (> quant 0)
+             (debug:print-info 0 *default-log-port* category " log files: " quant))))
+     `("deleted" "gzipped" "directories"))
+    (let ((num-logs (hash-table-size all-files)))
+      (if (> num-logs max-allowed) ;; because NFS => don't let number of logs exceed 300
+          ;; smallest recorded value first; take just enough entries to get back down to max-allowed
+          (let ((files (take (sort (hash-table-keys all-files)
+                                   (lambda (a b)
+                                     (< (hash-table-ref all-files a)(hash-table-ref all-files b))))
+                             (- num-logs max-allowed))))
+            (for-each
+             (lambda (file)
+               (let* ((fullname (conc "logs/" file)))
+                 (if (directory? fullname)
+                     (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! 
Cannot rotate it, it is best to not put subdirectories in the logs dir.")
+                     (handle-exceptions
+                      exn
+                      (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn)
+                      (delete-file* fullname)))))
+             files)
+            (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files."))))))
+
+;;======================================================================
+;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
+;; Do NOT check if not on homehost!
+;;
+;; When *toppath* is set, we are on the homehost and common:api-changed? is true:
+;; logs the mismatch, then either does nothing (MT_SKIP_DB_MIGRATE set), runs
+;; common:cleanup-db (config and db exist, db writable, caller owns megatest.config),
+;; or prints the reason migration is impossible and exits with status 1.
+;;
+(define (common:exit-on-version-changed)
+  (if (and *toppath* ;; do nothing if *toppath* not yet provided
+           (common:on-homehost?))
+      (if (common:api-changed?)
+          (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config"))
+                 ;; the "/" separator was missing here, so the path never named the real
+                 ;; db file and the cleanup branch below could not be selected
+                 (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") "/.mtdb/main.db"))
+                 (read-only (not (file-write-access? dbfile)))
+                 (dbstruct (db:setup))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;; #t)))
+            (debug:print 0 *default-log-port*
+                         "WARNING: Version mismatch!\n"
+                         " expected: " (common:version-signature) "\n"
+                         " got: " (common:get-last-run-version))
+            (cond
+             ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t)
+             ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only)
+                   (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db
+              (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version")
+              (handle-exceptions
+               exn
+               (begin
+                 (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn)
+                 (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
+                 (print-call-chain (current-error-port))
+                 (exit 1))
+               (common:cleanup-db dbstruct)))
+             ((not (common:file-exists? mtconf))
+              (debug:print 0 *default-log-port* " megatest.config does not exist in this area. 
Cannot proceed with megatest version migration.") + (exit 1)) + ((not (common:file-exists? dbfile)) + (debug:print 0 *default-log-port* " .mtdb/main.db does not exist in this area. Cannot proceed with megatest version migration.") + (exit 1)) + ((not (eq? (current-user-id)(file-owner mtconf))) + (debug:print 0 *default-log-port* " You do not own .mtdb/main.db in this area. Cannot proceed with megatest version migration.") + (exit 1)) + (read-only + (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") + (exit 1)) + (else + (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"") + (exit 1))))))) +;;====================================================================== +;; (begin +;; (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") +;; (exit 1)))) + +;;====================================================================== +;; S P A R S E A R R A Y S +;;====================================================================== + +(define (make-sparse-array) + (let ((a (make-sparse-vector))) + (sparse-vector-set! a 0 (make-sparse-vector)) + a)) + +(define (sparse-array? a) + (and (sparse-vector? a) + (sparse-vector? (sparse-vector-ref a 0)))) + +(define (sparse-array-ref a x y) + (let ((row (sparse-vector-ref a x))) + (if row + (sparse-vector-ref row y) + #f))) + +(define (sparse-array-set! a x y val) + (let ((row (sparse-vector-ref a x))) + (if row + (sparse-vector-set! row y val) + (let ((new-row (make-sparse-vector))) + (sparse-vector-set! a x new-row) + (sparse-vector-set! new-row y val))))) + +;;====================================================================== +;; L O C K E R S A N D B L O C K E R S +;;====================================================================== + +;; block further accesses to databases. Call this before shutting db down +(define (common:db-block-further-queries) + (mutex-lock! 
*db-access-mutex*)
+  (set! *db-access-allowed* #f)
+  (mutex-unlock! *db-access-mutex*))
+
+;; Return the current value of *db-access-allowed*, read while holding
+;; *db-access-mutex*.
+;;
+;; The flag is captured before the mutex is released; the earlier form ended
+;; its begin with the mutex-unlock! call and so returned that call's result
+;; instead of the flag.
+(define (common:db-access-allowed?)
+  (mutex-lock! *db-access-mutex*)
+  (let ((val *db-access-allowed*))
+    (mutex-unlock! *db-access-mutex*)
+    val))
+
+;;======================================================================
+;; U S E F U L S T U F F
+;;======================================================================
+
+;; convert things to an alist or assoc list, #f gets converted to ""
+;;
+;; lists, vectors and hash tables are converted element by element, other
+;; pairs are converted car and cdr, any other true value is returned as is
+;;
+(define (common:to-alist dat)
+  (cond
+   ((list? dat) (map common:to-alist dat))
+   ((vector? dat)
+    (map common:to-alist (vector->list dat)))
+   ((pair? dat)
+    (cons (common:to-alist (car dat))
+          (common:to-alist (cdr dat))))
+   ((hash-table? dat)
+    (map common:to-alist (hash-table->alist dat)))
+   (else
+    (if dat
+        dat
+        ""))))
+
+;; alist-ref with a fallback: default is returned whenever alist-ref yields #f
+(define (common:alist-ref/default key alist default)
+  (or (alist-ref key alist) default))
+
+;; moved into commonmod
+;;
+;; (define (common:low-noise-print waitval . keys)
+;; (let* ((key (string-intersperse (map conc keys) "-" ))
+;; (lasttime (hash-table-ref/default *common:denoise* key 0))
+;; (currtime (current-seconds)))
+;; (if (> (- currtime lasttime) waitval)
+;; (begin
+;; (hash-table-set! 
*common:denoise* key currtime)
+;; #t)
+;; #f)))
+
+;; Decode instr and read one datum from the decoded text.
+;;
+;; First attempt: base64 decode, then z3:decode-buffer, then read.
+;; If that raises, second attempt: base64 decode, then read.
+;; If the second attempt also raises, the error is logged, the call chain is
+;; printed to the current error port and #f is returned.
+(define (common:read-encoded-string instr)
+  (handle-exceptions
+   exn
+   ;; handler for the first attempt: retry without z3:decode-buffer
+   (handle-exceptions
+    exn
+    (begin
+      (debug:print-error 0 *default-log-port* "received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)
+      (print-call-chain (current-error-port))
+      #f)
+    (read (open-input-string (base64:base64-decode instr))))
+   (read (open-input-string (z3:decode-buffer (base64:base64-decode instr))))))
+
+
+;;======================================================================
+;; S T A T E S A N D S T A T U S E S
+;;======================================================================
+
+;; BBnote: *common:std-states* - dashboard filter control and test control state buttons defined here; used in set-fields-panel and dboard:make-controls
+;; each entry is a two element list: a number followed by the state name
+(define *common:std-states* ;; for toggle buttons in dashboard
+  '(
+    (0 "ARCHIVED")
+    (1 "STUCK")
+    (2 "KILLREQ")
+    (3 "KILLED")
+    (4 "NOT_STARTED")
+    (5 "COMPLETED")
+    (6 "LAUNCHED")
+    (7 "REMOTEHOSTSTART")
+    (8 "RUNNING")
+    ))
+
+;; list of state names; presumably states excluded from roll up, going by the name - TODO confirm against callers
+(define *common:dont-roll-up-states*
+  '("DELETED"
+    "REMOVING"
+    "CLEANING"
+    "ARCHIVE_REMOVING"
+    ))
+
+;;======================================================================
+;; BBnote: *common:std-statuses* dashboard filter control and test control status buttons defined here; used in set-fields-panel and dboard:make-controls
+;; note these statuses are sorted from better to worse. 
;; This sort order is important to dcommon:status-compare3 and db:set-state-status-and-roll-up-items
;; Each entry is (rank name); a (0 "DELETED") entry is intentionally left out.
(define *common:std-statuses*
  '((1  "n/a")
    (2  "PASS")
    (3  "SKIP")
    (4  "WARN")
    (5  "WAIVED")
    (6  "CHECK")
    (7  "STUCK/DEAD")
    (8  "DEAD")
    (9  "FAIL")
    (10 "PREQ_FAIL")
    (11 "PREQ_DISCARDED")
    (12 "ABORT")))

;; states which indicate the test is stopped and will not proceed
(define *common:ended-states*
  '("COMPLETED"
    "ARCHIVED"
    "KILLED"
    "KILLREQ"
    "STUCK"
    "INCOMPLETE"))

;; these roll up as CHECK, i.e. results need to be checked
(define *common:badly-ended-states*
  '("KILLED"
    "KILLREQ"
    "STUCK"
    "INCOMPLETE"
    "DEAD"
    "CHECK"))

;; an item's prereq in this state allows item to proceed
(define *common:well-ended-states*
  '("PASS"
    "WARN"
    "WAIVED"
    "SKIP"))

;; BBnote: *common:running-states* used from db:set-state-status-and-roll-up-items
;; test is either running or can be run
(define *common:running-states*
  '("RUNNING"
    "REMOTEHOSTSTART"
    "LAUNCHED"
    "STARTED"))

;; These are stopping conditions that prevent a test from being run
(define *common:cant-run-states*
  '("COMPLETED"
    "KILLED"
    "UNKNOWN"
    "INCOMPLETE"
    "ARCHIVED"))

;; if not one of these statuses when in not_started state treat as dead
(define *common:not-started-ok-statuses*
  '("n/a"
    "na"
    "PASS"
    "FAIL"
    "WARN"
    "CHECK"
    "WAIVED"
    "DEAD"
    "SKIP"))

;;======================================================================
;; group tests into buckets corresponding to rollup
;;; Running, completed-pass, completed-non-pass + worst status, not started.
;; filter out
;(define (common:categorize-items-for-rollup in-tests)
;  (

;; Sort items by the rank each item is given in order.
;;
;; order - list of (rank item) entries
;; comp  - two-argument comparison applied to the ranks; > is used when
;;         comp is #f
;; Items that do not appear in order get rank 0.
;;
(define (common:special-sort items order comp)
  (let* ((item->rank (map reverse order)) ;; (item rank) so assoc can key on the item
         (compare    (if comp comp >))
         (rank-of    (lambda (item)
                       (let ((hit (assoc item item->rank)))
                         (cadr (if hit hit '(0 0)))))))
    (sort items
          (lambda (a b)
            (compare (rank-of a) (rank-of b))))))

;;======================================================================
;; ;; given a toplevel with currstate, currstatus apply state and status
;; ;;  => (newstate . newstatus)
;; (define (common:apply-state-status currstate currstatus state status)
;;   (let* ((cstate  (string->symbol (string-downcase currstate)))
;;          (cstatus (string->symbol (string-downcase currstatus)))
;;          (sstate  (string->symbol (string-downcase state)))
;;          (sstatus (string->symbol (string-downcase status)))
;;          (nstate  #f)
;;          (nstatus #f))
;;     (set! nstate
;;           (case cstate
;;             ((completed not_started killed killreq stuck archived)
;;              (case sstate ;; completed -> sstate
;;                ((completed killed killreq stuck archived) completed)
;;                ((running remotehoststart launched) running)
;;                (else unknown-error-1)))
;;             ((running remotehoststart launched)
;;              (case sstate
;;                ((completed killed killreq stuck archived) #f) ;; need to look at all items
;;                ((running remotehoststart launched) running)
;;                (else unknown-error-2)))
;;             (else unknown-error-3)))
;;     (set!
;;           nstatus
;;           (case sstatus
;;             ((pass)
;;              (case nstate
;;                ((pass n/a deleted) pass)
;;                ((warn) warn)
;;                ((fail) fail)
;;                ((check) check)
;;                ((waived) waived)
;;                ((skip) skip)
;;                ((stuck/dead) stuck)
;;                ((abort) abort)
;;                (else unknown-error-4)))
;;             ((warn)
;;              (case nstate
;;                ((pass warn n/a skip deleted) warn)
;;                ((fail) fail)
;;                ((check) check)
;;                ((waived) waived)
;;                ((stuck/dead) stuck)
;;                (else unknown-error-5)))
;;             ((fail)
;;              (case nstate
;;                ((pass warn fail check n/a waived skip deleted stuck/dead stuck) fail)
;;                ((abort) abort)
;;                (else unknown-error-6)))
;;             (else unknown-error-7)))
;;     (cons
;;      (if nstate (symbol->string nstate) nstate)
;;      (if nstatus (symbol->string nstatus) nstatus))))

;;======================================================================
;; D E B U G G I N G S T U F F
;;======================================================================

(define *verbosity* 1)
(define *logging* #f)

;; Return the command line argument named val (via args:get-arg), or
;; default when that lookup yields #f.
;;
(define (get-with-default val default)
  (let ((val (args:get-arg val)))
    (if val val default)))

;; Look up key in lst, a list of (key value ...) entries, and return the
;; value (second element). When the key is missing return the optional
;; default, or #f when no default was supplied.
;;
(define (assoc/default key lst . default)
  (let ((res (assoc key lst)))
    (if res (cadr res)(if (null? default) #f (car default)))))

;; Name of the area/testsuite. Tried in order:
;;   [setup] area-name, [setup] testsuite, $MT_TESTSUITE_NAME,
;;   the last path component of the toppath, and finally a placeholder.
;;
;; The toppath-derived name is only computed when a path string is actually
;; available; previously a failed toppath lookup passed #f to pathname-file,
;; which raised instead of falling through to the placeholder.
;;
(define (common:get-testsuite-name)
  (or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description.
      (configf:lookup *configdat* "setup" "testsuite" )
      (getenv "MT_TESTSUITE_NAME")
      (let ((area (or (if (string? *toppath* )
                          (pathname-file *toppath*)
                          #f)
                      (common:get-toppath #f))))
        (and (string? area)
             (pathname-file area)))
      "please-set-setup-area-name")) ;; (pathname-file (current-directory)))))

;;======================================================================
;; safe getting of toppath
;;
;; Returns *toppath* when set. Otherwise, in order:
;;  - areapath, when given (also stored in *toppath* and $MT_RUN_AREA_HOME)
;;  - $MT_RUN_AREA_HOME (also stored in *toppath*)
;;  - the nearest directory at or above "." containing megatest.config
;;    (NOT stored in *toppath*)
;; Returns #f, after logging an error, when nothing is found.
;;
(define (common:get-toppath areapath)
  (or *toppath*
      (if areapath
          (begin
            (set! *toppath* areapath)
            (setenv "MT_RUN_AREA_HOME" areapath)
            areapath)
          #f)
      (if (getenv "MT_RUN_AREA_HOME")
          (begin
            (set! *toppath* (getenv "MT_RUN_AREA_HOME"))
            *toppath*)
          #f)
      ;; last resort, look for megatest.config
      (let loop ((thepath (realpath ".")))
        (if (file-exists? (conc thepath "/megatest.config"))
            thepath
            (if (equal? thepath "/")
                (begin
                  (debug:print-error 0 *default-log-port* "Unable to find megatest home directory.")
                  #f)
                (loop (pathname-directory thepath)))))
      ))


;;======================================================================
;; redefine for future cleanup (converge on area-name, the more generic
;;
(define common:get-area-name common:get-testsuite-name)

;; Return the local db cache directory, creating it on first use.
;;
;; Returns *db-cache-path* when already set, #f when *toppath* is not set.
;; Otherwise picks the first usable directory from two candidates under
;; /tmp, stores it in *db-cache-path* and makes sure both the megatest area
;; and the cache directory contain a .mtdb directory. Any exception is
;; logged and the process exits with status 1. Arguments are ignored.
;;
;; NOTE(review): the second candidate path has no "/" between tsname and
;; the translated toppath, unlike the first - confirm whether intended.
;;
(define (common:get-db-tmp-area . junk)
  (if *db-cache-path*
      *db-cache-path*
      (if *toppath* ;; common:get-create-writeable-dir
          (handle-exceptions
           exn
           (begin
             (debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path* ", exn=" exn)
             (exit 1))
           (let* ((toppath (common:real-path *toppath*))
                  (tsname  (common:get-testsuite-name))
                  (dbpath  (common:get-create-writeable-dir
                            (list (conc "/tmp/" (current-user-name)
                                        "/megatest_localdb/"
                                        tsname "/"
                                        (string-translate toppath "/" "."))
                                  (conc "/tmp/" (current-process-id) ;; just in case we have an issue with the dir by own user name
                                        "/"(current-user-name) "/megatest_localdb/"
                                        tsname
                                        (string-translate toppath "/" "."))
                                  ))))
             (set! *db-cache-path* dbpath)
             ;; ensure megatest area has .mtdb
             (let ((dbarea (conc *toppath* "/.mtdb")))
               (if (not (file-exists? dbarea))
                   (create-directory dbarea)))
             ;; ensure tmp area has .mtdb
             (let ((dbarea (conc dbpath "/.mtdb")))
               (if (not (file-exists? dbarea))
                   (create-directory dbarea)))
             dbpath))
          #f)))

;; md5 digest string of *toppath*
(define (common:get-area-path-signature)
  (message-digest-string (md5-primitive) *toppath*))

;;======================================================================
;; E X I T H A N D L I N G
;;======================================================================

(define (common:run-sync?)
  (and *toppath* ;; gate if called before *toppath* is set
       (common:on-homehost?)
       (args:get-arg "-server")))


;; Signal handler: flag *time-to-exit*, log the signal and exit.
(define (std-signal-handler signum)
  ;; (signal-mask! signum)
  (set! *time-to-exit* #t)
  ;;(debug:print-info 13 *default-log-port* "got signal "signum)
  (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly")
  ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway
  (exit))

;; Same as std-signal-handler apart from the log message; the email
;; notification it mentions is not implemented (see TODO).
(define (special-signal-handler signum)
  ;; (signal-mask! signum)
  (set! *time-to-exit* #t)
  ;;(debug:print-info 13 *default-log-port* "got signal "signum)
  (debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting!!")
  ;;TODO send email to notify admin contact listed in the config that the listener got killed
  ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway
  (exit))


(set-signal-handler! signal/int std-signal-handler) ;; ^C
(set-signal-handler! signal/term std-signal-handler)

;; (set-signal-handler! signal/stop std-signal-handler) ;; ^Z NO, do NOT handle ^Z!

;;======================================================================
;; M I S C U T I L S
;;======================================================================

;;======================================================================
;; convert stuff to a number if possible
;; numbers pass through, strings and symbols are parsed with
;; string->number, anything else (or an unparsable string) gives #f
(define (any->number val)
  (cond
   ((number? val) val)
   ((string? val) (string->number val))
   ((symbol? val) (any->number (symbol->string val)))
   (else #f)))

;; like any->number but returns val itself when it cannot be converted
(define (any->number-if-possible val)
  (let ((num (any->number val)))
    (if num num val)))

;; Does item match any of the comma separated patterns in patts?
;; "%" in a pattern is a wildcard (translated to the regex ".*").
;; Returns #t when item or patts is #f (nothing to filter on).
;;
(define (patt-list-match item patts)
  (debug:print-info 8 *default-log-port* "patt-list-match item=" item " patts=" patts)
  (if (and item patts) ;; here we are filtering for matches with item patterns
      (let ((res #f)) ;; look through all the item-patts if defined, format is patt1,patt2,patt3 ... wildcard is %
        (for-each
         (lambda (patt)
           (let ((modpatt (string-substitute "%" ".*" patt #t)))
             (debug:print-info 10 *default-log-port* "patt " patt " modpatt " modpatt)
             (if (string-match (regexp modpatt) item)
                 (set! res #t))))
         (string-split patts ","))
        res)
      #t))

;;======================================================================
;; '(print (string-intersperse (map cadr (hash-table-ref/default (read-config "megatest.config" \#f \#t) "disks" '"'"'("none" ""))) "\n"))'
;; Return the "disks" entry of configf (or of megatest.config read from
;; the current directory when configf is #f); '("none" "") if absent.
(define (common:get-disks #!key (configf #f))
  (hash-table-ref/default
   (or configf (read-config "megatest.config" #f #t))
   "disks" '("none" "")))

;; Directory three levels above the executable path (car (argv)), or #f
;; when that path does not exist or an exception is raised.
(define (common:get-install-area)
  (let ((exe-path (car (argv))))
    (if (common:file-exists? exe-path)
        (handle-exceptions
         exn
         #f
         (pathname-directory
          (pathname-directory
           (pathname-directory exe-path))))
        #f)))

;;======================================================================
;; return first path that can be created or already exists and is writable
;; returns #f when dirs is empty or no candidate works
;;
(define (common:get-create-writeable-dir dirs)
  (if (null? dirs)
      #f
      (let loop ((hed (car dirs))
                 (tal (cdr dirs)))
        (let ((res (or (and (directory? hed)
                            (file-write-access? hed)
                            hed)
                       (handle-exceptions
                        exn
                        (begin
                          (debug:print-info 0 *default-log-port* "could not create " hed
                                            ", this might cause problems down the road. exn=" exn)
                          #f)
                        (create-directory hed #t)))))
          (if (and (string? res)
                   (directory? res))
              res
              (if (null? tal)
                  #f
                  (loop (car tal)(cdr tal))))))))

;;======================================================================
;; return the youngest timestamp .
;; filename
;;
;; glob-list is a list of glob patterns; a pattern whose glob raises is
;; treated as matching nothing, a file whose timestamp cannot be read is
;; treated as time 0. Returns (timestamp filename), or (0 "n/a") when no
;; file has a timestamp greater than 0.
;;
(define (common:get-youngest glob-list)
  (let ((all-files (apply append
                          (map (lambda (patt)
                                 (handle-exceptions
                                  exn
                                  '()
                                  (glob patt)))
                               glob-list))))
    (fold (lambda (fname res)
            (let ((last-mod (car res))
                  (curmod   (handle-exceptions
                             exn
                             0
                             (file-modification-time fname))))
              (if (> curmod last-mod)
                  (list curmod fname)
                  res)))
          '(0 "n/a")
          all-files)))

;;======================================================================
;; use bash to expand a glob. Does NOT handle paths with spaces!
;;
;; NOTE(review): instr is spliced unescaped into a shell command line; only
;; call this with trusted input.
;;
(define (common:bash-glob instr)
  (string-split
   (with-input-from-pipe
    (conc "/bin/bash -c \"echo " instr "\"")
    read-line)))

;;======================================================================
;; Some safety net stuff
;;======================================================================

;;======================================================================
;; return input if it is a list or return null
;; when inlst is not a list: log message (if given) and return ovrd, or '()
;; when ovrd is #f
(define (common:list-or-null inlst #!key (ovrd #f)(message #f))
  (if (list? inlst)
      inlst
      (begin
        (if message (debug:print-error 0 *default-log-port* message))
        (or ovrd '()))))

;;======================================================================
;; T A R G E T S , S T A T E , S T A T U S ,
;; R U N N A M E A N D T E S T P A T T
;;======================================================================

;;======================================================================
;; (map print (map car (hash-table->alist (read-config "runconfigs.config" #f #t))))
;;
;; NOTE(review): text was lost from this file between the `sort` call in
;; common:get-runconfig-targets and the `(force-result ...` binding below
;; (only the residue "stringsymbol force-setting) #f))" remains). The rest
;; of common:get-runconfig-targets, any definitions that followed it, and
;; the head of the procedure that owns force-result are missing. The
;; surviving text is kept exactly as found and must be restored from the
;; upstream source before this region can be loaded.
;;
(define (common:get-runconfig-targets #!key (configf #f))
  (let ((targs (sort (map car (hash-table->alist
                               (or configf ;; NOTE: There is no value in using runconfig:read here.
                                   (read-config (conc *toppath* "/runconfigs.config")
                                                #f #t)
                                   (make-hash-table))))
                     stringsymbol force-setting) #f))
         (force-result (case force-type
                         ((#f) #f)
                         ((always) #t)
                         ((test) (if (args:get-arg "-execute") ;; we are in a test
                                     #t
                                     #f))
                         (else
                          (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.")
                          #t)))) ;; default to requiring server
    (if force-result
        (begin
          (debug:print-info 0 *default-log-port* "ATTENTION! Forcing use of server, force setting is \"" force-setting "\".")
          #t)
        #f)))

;;======================================================================
;; M I S C L I S T S
;;======================================================================

;;======================================================================
;; items in lista are matched value and position in listb
;; return the remaining items in listb or #f
;;
(define (common:list-is-sublist lista listb)
  (if (null? lista)
      listb ;; all items in listb are "remaining"
      (if (> (length lista)(length listb))
          #f
          (let loop ((heda (car lista))
                     (tala (cdr lista))
                     (hedb (car listb))
                     (talb (cdr listb)))
            (if (equal? heda hedb)
                (if (null? tala) ;; we are done
                    talb
                    (loop (car tala)
                          (cdr tala)
                          (car talb)
                          (cdr talb)))
                #f)))))

;;======================================================================
;; Needed for long lists to be sorted where (apply max ... ) dies
;; inlst must be non-empty
;;
(define (common:max inlst)
  (let loop ((max-val (car inlst))
             (hed     (car inlst))
             (tal     (cdr inlst)))
    (if (not (null? tal))
        (loop (max hed max-val)
              (car tal)
              (cdr tal))
        (max hed max-val))))

;;======================================================================
;; get min or max, use > for max and < for min, this works around the limits on apply
;;
(define (common:min-max comp lst)
  (if (null? lst)
      #f ;; better than an exception for my needs
      (fold (lambda (a b)
              (if (comp a b) a b))
            (car lst)
            lst)))

;;======================================================================
;; sum of the numbers in lst (0 for the empty list); works around the
;; limits on apply
;;
;; The fold is seeded with 0. It used to be seeded with (car lst) while
;; still folding over the whole list, which counted the first element twice.
;;
(define (common:sum lst)
  (fold + 0 lst))

;;======================================================================
;; path list to hash-table tree
;;  ((a b c)(a b d)(e b c)) => ((a (b (d) (c))) (e (b (c))))
;;
;; Each path element becomes a key whose value is the hash table for the
;; next level. Empty paths are skipped.
;;
(define (common:list->htree lst)
  (let ((resh (make-hash-table)))
    (for-each
     (lambda (inlst)
       (if (pair? inlst) ;; nothing to insert for an empty path
           (let loop ((ht  resh)
                      (hed (car inlst))
                      (tal (cdr inlst)))
             (if (hash-table-ref/default ht hed #f)
                 (if (not (null? tal))
                     (loop (hash-table-ref ht hed)
                           (car tal)
                           (cdr tal)))
                 (begin
                   (hash-table-set! ht hed (make-hash-table))
                   (loop ht hed tal))))))
     lst)
    resh))

;;======================================================================
;; hash-table tree to html list tree
;;
;;   tipfunc takes two parameters: y the tip value and path the path to that point
;;
(define (common:htree->html ht path tipfunc)
  (let ((datlist (sort (hash-table->alist ht)
                       (lambda (a b)
                         (string< (car a)(car b))))))
    (if (null? datlist)
        (tipfunc #f path) ;; really shouldn't get here
        (s:ul
         (map (lambda (x)
                (let* ((levelname (car x))
                       (y         (cdr x))
                       (newpath   (append path (list levelname)))
                       (leaf      (or (not (hash-table? y))
                                      (null? (hash-table-keys y)))))
                  (if leaf
                      (s:li (tipfunc y newpath))
                      (s:li
                       (list
                        levelname
                        (common:htree->html y newpath tipfunc))))))
              datlist)))))

;;======================================================================
;; hash-table tree to alist tree
;;
(define (common:htree->atree ht)
  (map (lambda (x)
         (cons (car x)
               (let ((y (cdr x)))
                 (if (hash-table? y)
                     (common:htree->atree y)
                     y))))
       (hash-table->alist ht)))

;;======================================================================
;; M U N G E D A T A I N T O N I C E F O R M S
;;======================================================================

;;======================================================================
;; Generate an index for a sparse list of key values
;;   ( (rowname1 colname1 val1)(rowname2 colname2 val2) )
;;
;; =>
;;
;; ( ((rowname1 1)(rowname2 2))   ;; rownames -> num
;;   ((colname1 1)(colname2 2)) ) ;; colnames -> num
;;
;; Numbering starts at 1 and follows first appearance; the returned lists
;; are in reverse order of first appearance.
;;
;; optional apply proc to rownum colnum rowkey colkey value
;;
;; A row/column that was seen before is reported with the number stored for
;; it (previously the highest number handed out so far was used, which is
;; wrong whenever the key is not the most recently added one).
;;
(define (common:sparse-list-generate-index data #!key (proc #f))
  (if (null? data)
      (list '() '())
      (let loop ((hed      (car data))
                 (tal      (cdr data))
                 (rownames '())
                 (colnames '())
                 (rownum   0)
                 (colnum   0))
        (let* ((rowkey          (car hed))
               (colkey          (cadr hed))
               (value           (caddr hed))
               (existing-rowdat (assoc rowkey rownames))
               (existing-coldat (assoc colkey colnames))
               (curr-rownum     (if existing-rowdat (cadr existing-rowdat) (+ rownum 1)))
               (curr-colnum     (if existing-coldat (cadr existing-coldat) (+ colnum 1)))
               (new-rownames    (if existing-rowdat rownames (cons (list rowkey curr-rownum) rownames)))
               (new-colnames    (if existing-coldat colnames (cons (list colkey curr-colnum) colnames))))
          ;; (debug:print-info 0 *default-log-port* "Processing record: " hed )
          (if proc (proc curr-rownum curr-colnum rowkey colkey value))
          (if (null? tal)
              (list new-rownames new-colnames)
              (loop (car tal)
                    (cdr tal)
                    new-rownames
                    new-colnames
                    ;; rownum/colnum track the highest number handed out so far
                    (if (> curr-rownum rownum) curr-rownum rownum)
                    (if (> curr-colnum colnum) curr-colnum colnum)
                    ))))))

;;======================================================================
;; if it looks like a number -> convert it to a number, else return it
;;
(define (common:lazy-convert inval)
  (let* ((as-num (if (string? inval)(string->number inval) #f)))
    (or as-num inval)))

;;======================================================================
;; convert string a=1; b=2; c=a silly thing; d=
;; to '((a . 1)(b . 2)(c . "a silly thing")(d . ""))
;; values are run through common:lazy-convert when convert is true
;;
(define (common:val->alist val #!key (convert #f))
  (let ((val-list (string-split-fields ";\\s*" val #:infix)))
    (if val-list
        (map (lambda (x)
               (let ((f (string-split-fields "\\s*=\\s*" x #:infix)))
                 (case (length f)
                   ((0) `(,#f)) ;; null string case
                   ((1) `(,(string->symbol (car f))))
                   ((2) `(,(string->symbol (car f)) . ,(let ((inval (cadr f)))
                                                         (if convert (common:lazy-convert inval) inval))))
                   (else f))))
             val-list)
        '())))

;;======================================================================
;; S Y S T E M S T U F F
;;======================================================================

;;======================================================================
;; lazy-safe get file mod time. on any error (file not existing etc.) return 0
;;
(define (common:lazy-modification-time fpath)
  (handle-exceptions
   exn
   (begin
     (debug:print 2 *default-log-port* "Failed to get modification time for " fpath ", treating it as zero. exn=" exn)
     0)
   (if (file-exists? fpath)
       (file-modification-time fpath)
       0)))

;;======================================================================
;; find timestamp of newest file associated with a sqlite db file
;; (newest among all files matching fpath*; 0 when there are none)
(define (common:lazy-sqlite-db-modification-time fpath)
  (let* ((glob-list (handle-exceptions
                     exn
                     (begin
                       (debug:print 0 *default-log-port* "Failed to glob " fpath "*, exn=" exn)
                       ;; a single non-existent name, so the result below is 0
                       `(,(conc "/no/such/file, message: " ((condition-property-accessor 'exn 'message) exn))))
                     (glob (conc fpath "*"))))
         (file-list (if (eq? 0 (length glob-list))
                        '("/no/such/file")
                        glob-list)))
    (apply max
           (map
            common:lazy-modification-time
            file-list))))

;;======================================================================
;; make "nice-path" available in config files and the repl
(define nice-path common:nice-path)

;;======================================================================
;; returns *effective load* (not normalized)
;;
(define (common:get-intercept onemin fivemin)
  (if (< onemin fivemin) ;; load is decreasing, just use the onemin load
      onemin
      (let* ((load-change (- onemin fivemin))
             (tchange     (- 300 60)))
        (max (+ onemin (* 60 (/ load-change tchange))) 0))))

;;======================================================================
;; calculate a delay number based on a droop curve
;; inputs are:
;;  - load-in, load as from uptime, NOT normalized
;;  - numcpus, number of cpus, ideally use the real cpus, not threads
;;
;; With [load] new-load-method set, a piecewise curve over load-in/numcpus
;; is used (0 below 0.5, rising to 60 above 1.1). Otherwise the delay is
;; computed from the four numbers in [load] exp-params and clamped to 0..30;
;; 30 is returned when exp-params does not hold exactly four values.
;;
(define (common:get-delay load-in numcpus)
  (let* ((ratio      (/ load-in numcpus))
         (new-option (configf:lookup *configdat* "load" "new-load-method"))
         (paramstr   (or (configf:lookup *configdat* "load" "exp-params")
                         "15 12 1281453987.9543 0.75")) ;; 5 4 10 1"))
         (paramlst   (map string->number (string-split paramstr))))
    (if new-option
        (begin
          (cond ((and (>= ratio 0) (< ratio .5))
                 0)
                ((and (>= ratio 0.5) (<= ratio .9))
                 (* ratio (/ 5 .9)))
                ((and (> ratio .9) (<= ratio 1.1))
                 (+ 5 (* (- ratio .9) (/ 55 .2))))
                ((> ratio 1.1)
                 60)))
        (match paramlst
          ((r1 r2 s1 s2)
           (debug:print 3 *default-log-port* "Using params r1=" r1 " r2=" r2 " s1=" s1 " s2=" s2)
           (min (max (/ (expt r1 (* r2 s2 ratio)) s1) 0) 30))
          (else
           (debug:print 0 *default-log-port* "BAD exp-params, should be \"r1 r2 s1 s2\" but got " paramstr)
           30)))))

;; -mrw- this appears to not be used
;;
;; (define (common:print-delay-table)
;;   (let loop ((x 0))
;;     (print x "," (common:get-delay x 1))
;;     (if (< x 2)
;;         (loop (+ x 0.1)))))

;; (define
;;         (get-cpu-load #!key (remote-host #f))
;;   (car (common:get-cpu-load remote-host)))

;;======================================================================
;;   (let* ((load-res (process:cmd-run->list "uptime"))
;;          (load-rx  (regexp "load average:\\s+(\\d+)"))
;;          (cpu-load #f))
;;     (for-each (lambda (l)
;;                 (let ((match (string-search load-rx l)))
;;                   (if match
;;                       (let ((newval (string->number (cadr match))))
;;                         (if (number? newval)
;;                             (set! cpu-load newval))))))
;;               (car load-res))
;;     cpu-load))

;;======================================================================
;; get values from cached info from dropping file in .sysdata dir
;; e.g. key is host and dtype is normalized-load
;;
;; Reads one datum from <toppath>/.sysdata/<key>-<dtype>.log when that file
;; is readable and younger than age seconds (judged by file-change-time).
;; Returns #f when *toppath* is unset, the file is missing/unreadable/too
;; old, or reading fails; an unreadable or empty file is deleted.
;;
(define (common:get-cached-info key dtype #!key (age 10))
  (if *toppath*
      (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))
             (delfile  (lambda (exn)
                         (debug:print-info 2 *default-log-port* " removing bad file " fullpath ", exn=" exn)
                         (delete-file* fullpath)
                         #f)))
        (if (and (file-exists? fullpath)
                 (file-read-access? fullpath))
            (handle-exceptions
             exn
             (begin
               (debug:print 0 *default-log-port* "failed to get cached info from " fullpath ", exn=" exn)
               #f)
             (debug:print 2 *default-log-port* "reading file " fullpath)
             (let ((real-age (- (current-seconds)
                                (handle-exceptions
                                 exn
                                 (begin
                                   (debug:print 1 *default-log-port* "Failed to read mod time on file "
                                                fullpath ", using 0, exn=" exn)
                                   0)
                                 (file-change-time fullpath)))))
               (if (< real-age age)
                   (handle-exceptions
                    exn
                    (delfile exn)
                    (let* ((res (with-input-from-file fullpath read)))
                      (if (eof-object? res)
                          (begin
                            (delfile "n/a")
                            #f)
                          res)))
                   (begin
                     (debug:print-info 2 *default-log-port* "file " fullpath
                                       " is too old (" real-age" seconds) to trust, skipping reading it")
                     #f))))
            (begin
              (debug:print 2 *default-log-port* "not reading file " fullpath)
              #f)))
      #f))

;; Write dat (pretty printed) to <toppath>/.sysdata/<key>-<dtype>.log,
;; creating the .sysdata directory if needed. Returns #f when *toppath* is
;; unset or the write raises (the failure is logged).
;;
(define (common:write-cached-info key dtype dat)
  (if *toppath*
      (let* ((fulldir  (conc *toppath* "/.sysdata"))
             (fullpath (conc fulldir "/" key "-" dtype ".log")))
        (if (not (file-exists? fulldir))(create-directory fulldir #t))
        (handle-exceptions
         exn
         (begin
           (debug:print 0 *default-log-port* "failed to write file " fullpath ", exn=" exn)
           #f)
         (with-output-to-file fullpath (lambda ()(pp dat)))))
      #f))

;; Read the first three data from /proc/loadavg on remote-host over ssh.
;; Returns a list of three values (eof objects if the output was short),
;; or #f when an exception is raised.
;;
;; The pipe is only closed in the handler when it was actually opened;
;; closing #f would raise a second error from inside the handler.
;;
(define (common:raw-get-remote-host-load remote-host)
  (let* ((inp #f))
    (handle-exceptions
     exn
     (begin
       (if inp (close-input-pipe inp))
       (debug:print 0 *default-log-port* "failed to ssh to " remote-host " and get loadavg. exn=" exn)
       #f) ;; more specific handling of errors needed
     (set! inp (open-input-pipe (conc "ssh " remote-host " cat /proc/loadavg")))
     (let ((res (list (read inp)(read inp)(read inp))))
       (close-input-pipe inp)
       res))))

;;======================================================================
;; get cpu load by reading from /proc/loadavg, return all three values
;;
;; A cached "cpu-load" entry is used when available. Error results:
;;   (-99 -99 -99) an exception was raised
;;   (-1 -1 -1)    three values were read but not all are numbers
;;   (-2 -2 -2)    the result did not have exactly three values
;;
(define (common:get-cpu-load remote-host)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 *default-log-port* "failed to ssh or read loadavg from host " remote-host ", exn=" exn)
     '(-99 -99 -99))
   (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
     (or (common:get-cached-info actual-hostname "cpu-load")
         (let ((result (if (and remote-host
                                (not (equal? remote-host (get-host-name))))
                           (map (lambda (res)
                                  (if (eof-object? res) 9e99 res))
                                (common:raw-get-remote-host-load remote-host))
                           (with-input-from-file "/proc/loadavg"
                             (lambda ()(list (read)(read)(read)))))))
           (match
            result
            ((l1 l2 l3)
             (if (and (number? l1)
                      (number? l2)
                      (number? l3))
                 (begin
                   (common:write-cached-info actual-hostname "cpu-load" result)
                   result)
                 '(-1 -1 -1))) ;; -1 is bad result
            (else '(-2 -2 -2))))))))

;;======================================================================
;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads
;; returns alist '((adj-proc-load . normalized-proc-load) ... etc.
;;  keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load, proc, core, phys
;;
;; When the raw lookup does not give a list with more than two entries, a
;; default alist (which additionally carries (error . #t)) is returned.
;;
(define (common:get-normalized-cpu-load remote-host)
  (let ((res     (common:get-normalized-cpu-load-raw remote-host))
        (default `((adj-proc-load . 2) ;; there is no right answer
                   (adj-core-load . 2)
                   (1m-load       . 2)
                   (5m-load       . 0) ;; causes a large delta - thus causing default of throttling if stuff goes wrong
                   (15m-load      . 0)
                   (proc          . 1)
                   (core          . 1)
                   (phys          . 1)
                   (error         . #t))))
    (if (and (list? res)
             (> (length res) 2))
        res
        default))) ;; #f, eof or anything else unexpected

;; Return the lines of /proc/loadavg followed by /proc/cpuinfo and a final
;; "end" line, as produced on remote-host over ssh.
(define (common:ssh-get-loadavg remote-host)
  (let ((inp (open-input-pipe (conc "ssh " remote-host " \"cat /proc/loadavg;cat /proc/cpuinfo;echo end\""))))
    (let* ((res (read-lines inp)))
      (close-input-pipe inp)
      res)))

;; Build the normalized-load alist for remote-host (#f means the local
;; host), using a cached "normalized-load" entry when available.
;;
;; The loadavg/cpuinfo lines are scanned for the load averages and for the
;; highest processor, physical id and core id numbers; the load is divided
;; by (highest + 1). The final line (the "end" marker) only terminates the
;; scan. Returns #f when no data was obtained or when no load average line
;; was recognised.
;;
(define (common:get-normalized-cpu-load-raw remote-host)
  (let* ((actual-host (or remote-host (get-host-name)))) ;; #f is localhost
    (or (common:get-cached-info actual-host "normalized-load")
        (let ((data    (if remote-host
                           (common:ssh-get-loadavg remote-host)
                           (append
                            (with-input-from-file "/proc/loadavg"
                              read-lines)
                            (with-input-from-file "/proc/cpuinfo"
                              read-lines)
                            (list "end"))))
              (load-rx (regexp "^([\\d\\.]+)\\s+([\\d\\.]+)\\s+([\\d\\.]+)\\s+.*$"))
              (proc-rx (regexp "^processor\\s+:\\s+(\\d+)\\s*$"))
              (core-rx (regexp "^core id\\s+:\\s+(\\d+)\\s*$"))
              (phys-rx (regexp "^physical id\\s+:\\s+(\\d+)\\s*$"))
              (max-num (lambda (p n)(max (string->number p) n))))
          ;; (print "data=" data)
          (if (null? data) ;; something went wrong
              #f
              (let loop ((hed      (car data))
                         (tal      (cdr data))
                         (loads    #f)
                         (proc-num 0)  ;; processor includes threads
                         (phys-num 0)  ;; physical chip on motherboard
                         (core-num 0)) ;; core
                ;;; (print hed ", " loads ", " proc-num ", " phys-num ", " core-num)
                (if (null? tal) ;; have all our data, calculate normalized load and return result
                    (if (not loads)
                        #f ;; never saw a load average line, nothing to normalize
                        (let* ((act-proc      (+ proc-num 1))
                               (act-phys      (+ phys-num 1))
                               (act-core      (+ core-num 1))
                               (adj-proc-load (/ (car loads) act-proc))
                               (adj-core-load (/ (car loads) act-core))
                               (result
                                (append (list (cons 'adj-proc-load adj-proc-load)
                                              (cons 'adj-core-load adj-core-load))
                                        (list (cons '1m-load (car loads))
                                              (cons '5m-load (cadr loads))
                                              (cons '15m-load (caddr loads)))
                                        (list (cons 'proc act-proc)
                                              (cons 'core act-core)
                                              (cons 'phys act-phys)))))
                          (common:write-cached-info actual-host "normalized-load" result)
                          result))
                    (regex-case
                     hed
                     (load-rx ( x l1 l5 l15 ) (loop (car tal)(cdr tal)(map string->number (list l1 l5 l15)) proc-num phys-num core-num))
                     (proc-rx ( x p ) (loop (car tal)(cdr tal) loads (max-num p proc-num) phys-num core-num))
                     (phys-rx ( x p ) (loop (car tal)(cdr tal) loads proc-num (max-num p phys-num) core-num))
                     (core-rx ( x c ) (loop (car tal)(cdr tal) loads proc-num phys-num (max-num c core-num)))
                     (else
                      (begin
                        ;; (print "NO MATCH: " hed)
                        (loop (car tal)(cdr tal) loads proc-num phys-num core-num)))))))))))

;; #t when a single ping to hostname exits with status 0
;; NOTE(review): hostname is spliced unescaped into a shell command line.
(define (common:unix-ping hostname)
  (let ((res (system (conc "ping -c 1 " hostname " > /dev/null"))))
    (eq? res 0)))

;;======================================================================
;; ideally put all this info into the db, no need to preserve it across moving homehost
;;
;; return list of
;;  ( reachable? update-time cpuload )
;;
;; Sources, in order of preference: a recent load sample from
;; rmt:get-latest-host-load, a recently updated entry in *host-loads*, or a
;; fresh ping plus normalized load lookup. (#f 0 -1) means unreachable.
;;
(define (common:get-host-info hostname)
  (let* ((loadinfo                 (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data
         (load                     (car loadinfo))
         (load-sample-time         (cdr loadinfo))
         (load-sample-age          (- (current-seconds) load-sample-time))
         (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds
         (host-last-update-timeout-seconds 4)
         (host-rec                 (hash-table-ref/default *host-loads* hostname #f))
         )
    (cond
     ((< load-sample-age loadinfo-timeout-seconds)
      (list #t
            load-sample-time
            load))
     ((and host-rec
           (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds)))
      (list #t
            (host-last-update host-rec)
            (host-last-cpuload host-rec )))
     ((common:unix-ping hostname)
      (list #t
            (current-seconds)
            (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load results are cached (see common:get-cached-info)
     (else
      (list #f 0 -1) ;; bad host, don't use!
      ))))

;;======================================================================
;; see defstruct host at top of file.
;; host: reachable last-update last-used last-cpuload
;;
;; Refresh reachable, last-update and last-cpuload in *host-loads* for each
;; entry of hosts-raw that contains no whitespace, creating records as
;; needed.
;;
(define (common:update-host-loads-table hosts-raw)
  (let* ((hosts (filter (lambda (x)
                          (string-match (regexp "^\\S+$") x))
                        hosts-raw)))
    (for-each
     (lambda (hostname)
       (let* ((rec               (let ((h (hash-table-ref/default *host-loads* hostname #f)))
                                   (if h
                                       h
                                       (let ((h (make-host)))
                                         (hash-table-set! *host-loads* hostname h)
                                         h))))
              (host-info         (common:get-host-info hostname))
              (is-reachable      (car host-info))
              (last-reached-time (cadr host-info))
              (load              (caddr host-info)))
         (host-reachable-set! rec is-reachable)
         (host-last-update-set! rec last-reached-time)
         (host-last-cpuload-set! rec load)))
     hosts)))

;;======================================================================
;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteria from the
;; [host-rules] section.
;;
;; hosts-raw : list of candidate host names (entries containing whitespace
;;             or empty strings are dropped)
;; host-type : key into the [host-rules] section of configdat
;; configdat : configuration data used for the rules lookup
;;
;; Returns the chosen host name, or #f if no host qualifies. The chosen
;; host's last-used time is set to the current time.
;;
(define (common:get-least-loaded-host hosts-raw host-type configdat)
  (let* ((rdat       (configf:lookup configdat "host-rules" host-type))
         (rules      (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate
         (maxnload   (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load
         ;; NOTE(review): maxnjobs is read but not used in the selection below
         (maxnjobs   (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs
         (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second
         (hosts      (filter (lambda (x)
                               (string-match (regexp "^\\S+$") x))
                             hosts-raw))
         ;; fetch the record for a host from *host-loads*, creating it if needed
         (get-rec    (lambda (hostname)
                       (let ((h (hash-table-ref/default *host-loads* hostname #f)))
                         (if h
                             h
                             (let ((h (make-host)))
                               (hash-table-set! *host-loads* hostname h)
                               h)))))
         (curr-time  (current-seconds))
         ;; least recently used first
         (get-hosts-sorted (lambda (hosts)
                             (sort hosts (lambda (a b)
                                           (let ((a-rec (get-rec a))
                                                 (b-rec (get-rec b)))
                                             (< (host-last-used a-rec)
                                                (host-last-used b-rec))))))))
    (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts))
    (if (null? hosts)
        #f ;; no hosts to select from. All done and giving up now.
        (let ((hosts-sorted (get-hosts-sorted hosts)))
          (common:update-host-loads-table hosts)
          (let loop ((hostname  (car hosts-sorted))
                     (tal       (cdr hosts-sorted))
                     (best-host #f))
            (let* ((rec       (get-rec hostname))
                   (reachable (host-reachable rec))
                   (load      (host-last-cpuload rec))
                   (last-used (host-last-used rec))
                   (delta     (- curr-time last-used))
                   (job-rate  (if (> delta 0)
                                  (/ 1 delta)
                                  999)) ;; jobs per second
                   (new-best
                    (cond
                     ((not reachable)
                      (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.")
                      best-host)
                     ((and (< load maxnload)         ;; load is acceptable
                           (< job-rate maxjobrate))  ;; job rate is acceptable
                      hostname)
                     (else best-host))))
              (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." )
              (if new-best
                  (begin ;; found a host, return it
                    (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
                    (host-last-used-set! rec curr-time)
                    new-best)
                  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))

;; Wait until the load on the relevant host is at or below maxnormload.
;; In http transport mode the host comes from server:choose-server (or #f,
;; meaning local, when already on the homehost); otherwise the local host
;; name is used.
;;
(define (common:wait-for-homehost-load maxnormload msg)
  ;; we saw some instances of this being called before *toppath* was set.
  ;; This might be an early setup race. This delay should help but it is
  ;; impossible to test...  Retries every 30 seconds for up to 300 seconds.
  (let loop ((start-time (current-seconds)))
    (if (not *toppath*)
        (begin
          (debug:print 0 *default-log-port* "ERROR: common:wait-for-homehost-load called before *toppath* set.")
          (thread-sleep! 30)
          (if (< (- (current-seconds) start-time) 300)
              (loop start-time)))))
  (case (rmt:transport-mode)
    ((http)
     (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                        #f
                        (server:choose-server *toppath* 'homehost)))
            (hh     (if hh-dat (car hh-dat) #f)))
       (common:wait-for-normalized-load maxnormload msg hh)))
    (else
     (common:wait-for-normalized-load maxnormload msg (get-host-name)))))

;; Return the number of cpus on remote-host (or on this host when
;; remote-host is #f), or #f if it could not be determined. The answer is
;; taken from *numcpus-cache*, then from the cached-info store, and only
;; then from /proc/cpuinfo (read locally, or via ssh for another host).
;;
(define (common:get-num-cpus remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    ;; hosts had better not be changing the number of cpus too often!
    (or (hash-table-ref/default *numcpus-cache* actual-host #f)
        (let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600)))
                            (let* ((proc (lambda ()
                                           ;; count the "processor : N" lines on current input
                                           (let loop ((numcpu 0)
                                                      (inl    (read-line)))
                                             (if (eof-object? inl)
                                                 (if (> numcpu 0)
                                                     numcpu
                                                     #f) ;; if zero return #f so caller knows that things are not working
                                                 (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
                                                           (+ numcpu 1)
                                                           numcpu)
                                                       (read-line))))))
                                   (result (if (and remote-host
                                                    (not (equal? remote-host (get-host-name))))
                                               (common:generic-ssh
                                                (conc "ssh " remote-host " cat /proc/cpuinfo")
                                                proc -1)
                                               (with-input-from-file "/proc/cpuinfo" proc))))
                              (if (and (number? result)
                                       (> result 0))
                                  (common:write-cached-info actual-host "num-cpus" result))
                              result))))
          (hash-table-set! *numcpus-cache* actual-host numcpus)
          numcpus))))

;;======================================================================
;; wait for normalized cpu load to drop below maxload
;;
;; If the cpu count cannot be determined, sleep a random 0-59 seconds and
;; retry, at most rem-tries more times; returns #f when retries run out.
;;
(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5))
  (let ((num-cpus (common:get-num-cpus remote-host)))
    (if num-cpus
        ;; common:wait-for-cpuload takes exactly two positional arguments;
        ;; an extra positional value here would misalign the keyword list
        ;; so that msg: and remote-host: are not seen by the callee.
        (common:wait-for-cpuload maxnormload num-cpus msg: msg remote-host: remote-host)
        (begin
          (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again
          (if (> rem-tries 0)
              (common:wait-for-normalized-load maxnormload msg remote-host (- rem-tries 1))
              #f)))))

;;======================================================================
;; DO NOT CALL THIS DIRECTLY.
;; It is called from common:wait-for-normalized-load
;; count - count down to zero, at some point we'd give up if the load never drops
;; num-tries - count down to zero number tries to get numcpus
;;
;; maxnormload : maximum acceptable load per cpu
;; numcpus-in  : cpu count as determined by the caller
;; msg         : optional text appended to the "Delaying" message
;; remote-host : host to check, #f for the local host
;;
(define (common:wait-for-cpuload maxnormload numcpus-in
          #!key (count 1000)
                (msg #f)(remote-host #f)(num-tries 5))
  (let* ((loadavg           (common:get-cpu-load remote-host))
         ;; not possible to have zero. If we get 1, it's possible that we got
         ;; the previous default, and we should check again. Keep numcpus-in
         ;; if the re-check fails, and never go below 1 so the normalization
         ;; below cannot divide by zero.
         (numcpus           (max 1 (if (<= numcpus-in 1)
                                       (or (common:get-num-cpus remote-host) numcpus-in)
                                       numcpus-in)))
         (first             (car loadavg))
         (next              (cadr loadavg))
         (adjmaxload        (* maxnormload numcpus))
         ;; effective load accounts for load jumps, this should eliminate all the first-next-avg, adjwait, load-jump-limit
         ;; etc.
         (effective-load    (common:get-intercept first next))
         (recommended-delay (common:get-delay effective-load numcpus))
         (effective-host    (or remote-host "localhost"))
         (normalized-effective-load (/ effective-load numcpus))
         (will-wait         (> normalized-effective-load maxnormload)))
    ;; throttle first: sleep for the recommended delay, capped at 30 seconds
    (if (and will-wait (> recommended-delay 1))
        (let* ((actual-delay (min recommended-delay 30)))
          (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
              (debug:print-info 0 *default-log-port* "Load control, delaying "
                                actual-delay " seconds to maintain safe load. current normalized effective load is "
                                normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load))
          (thread-sleep! actual-delay)))

    (cond
     ;; load is acceptable, nothing more to do
     ;; NOTE(review): this clause is tested before the bad-data clause, so a
     ;; negative load that yields will-wait = #f is reported as acceptable and
     ;; never retried - confirm whether the bad-data clause should come first.
     ((not will-wait)
      (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))
          (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing.")))

     ;; bad data, try again to get the data
     ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
           (> num-tries 0))
      (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load "
                   first ", we'll sleep 10s and try " num-tries " more times.")
      (thread-sleep! 10)
      (common:wait-for-cpuload maxnormload numcpus-in
                               count: count msg: msg remote-host: remote-host num-tries: (- num-tries 1)))

     ;; need to wait for load to drop
     ((and will-wait ;; (> first adjmaxload)
           (> count 0))
      (debug:print-info 0 *default-log-port*
                        "Delaying 15" ;; adjwait
                        " seconds due to normalized effective load " normalized-effective-load ;; first
                        " exceeding max of " adjmaxload
                        " on server " (or remote-host (get-host-name))
                        " (normalized load-limit: " maxnormload ") " (if msg msg ""))
      (thread-sleep! 15) ;; adjwait)
      (common:wait-for-cpuload maxnormload numcpus count: (- count 1) msg: msg remote-host: remote-host)
      ;; put the message here to indicate came out of waiting
      (debug:print-info 1 *default-log-port*
                        "On host: " effective-host
                        ", effective load: " effective-load
                        ", numcpus: " numcpus
                        ", normalized effective load: " normalized-effective-load
                        ))
     ;; overloaded and count expired (i.e. went to zero)
     (else
      (if (> num-tries 0) ;; should be "num-tries-left".
          (if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host))
              (debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of "
                           normalized-effective-load " continuing."))
          (debug:print 0 *default-log-port* "Load on " effective-host ", "
                       first" could not be retrieved. Giving up and continuing."))))))

;;======================================================================
;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5))
;;   (let* ((loadavg (common:get-cpu-load remote-host))
;;          (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again
;;                       (common:get-num-cpus remote-host)
;;                       numcpus-in))
;;          (maxload (if force-maxload
;;                       maxload-in
;;                       (if (number? maxload-in)
;;                           (max maxload-in 0.5)
;;                           0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
;;          (first (car loadavg))
;;          (next (cadr loadavg))
;;          (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where
;;          ;; numcpus (or could be
;;          ;; maxload) is zero,
;;          ;; crude fallback is to
;;          ;; at least use 1
;;          (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next?
;;                                0
;;                                next))) ;; we will force a conservative calculation any time next is large.
;;          (first-next-avg (/ (+ first next) 2))
;;          ;; add some randomness to the time to break any alignment
;;          ;; where netbatch dumps many jobs to machines simultaneously
;;          (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)
;;                                                       (/ (- 1000 count) 10)
;;                                                       waitdelay)
;;                                                    (- first adjmaxload) ))))
;;          (load-jump-limit (configf:lookup-number *configdat* "setup" "load-jump-limit"))
;;          ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
;;          ;; etc.
+;; (effective-load (common:get-intercept first next)) +;; (effective-host (or remote-host "localhost")) +;; (normalized-effective-load (/ effective-load numcpus)) +;; (will-wait (> normalized-effective-load maxload))) +;; +;; ;; let's let the user know once in a long while that load checking +;; ;; is happening but not constantly report it +;; #;(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time +;; (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload +;; ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp)) +;; +;; (debug:print-info 1 *default-log-port* +;; "On host: " effective-host +;; ", effective load: " effective-load +;; ", numcpus: " numcpus +;; ", normalized effective load: " normalized-effective-load +;; ) +;; +;; (cond +;; ;; bad data, try again to get the data +;; ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable +;; (> num-tries 0)) +;; (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.") +;; (thread-sleep! 10) +;; (common:wait-for-cpuload maxload-in numcpus-in waitdelay +;; count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1))) +;; ;; need to wait for load to drop +;; ((and will-wait ;; (> first adjmaxload) +;; (> count 0)) +;; (debug:print-info 0 *default-log-port* +;; "Delaying " 15 ;; adjwait +;; " seconds due to normalized effective load " normalized-effective-load ;; first +;; " exceeding max of " adjmaxload +;; " on server " (or remote-host (get-host-name)) +;; " (normalized load-limit: " maxload ") " (if msg msg "")) +;; (thread-sleep! 
;;        15) ;; adjwait)
;;       (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
;;      ((and (> loadjmp (cond
;;                        (load-jump-limit load-jump-limit)
;;                        ((> numcpus 8)(/ numcpus 2))
;;                        ((> numcpus 4)(/ numcpus 1.2))
;;                        (else 0.5)))
;;            (> count 0))
;;       (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". "
;;                         (if msg msg ""))
;;       (thread-sleep! adjwait)
;;       (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
;;      (else
;;       (if (> num-tries 0)
;;           (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost")))
;;               (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing."))
;;           (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing."))))))
;;

;; Run "uname" with the first of params as its switch ("-a" when none is
;; given) and return the first line of output, or "unknown" if there was
;; no output.
;;
(define (get-uname . params)
  (let ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))))
    (if (null? (car uname-res))
        "unknown"
        (caar uname-res))))

;;======================================================================
;; D I S K S P A C E
;;======================================================================

;; Return the first datum read from the output of "du -s" on fpath.
;;
(define (common:get-disk-space-used fpath)
  (with-input-from-pipe (conc "/usr/bin/du -s " fpath) read))

;;======================================================================
;; given path get free space, allows override in [setup]
;; with free-space-script /path/to/some/script.sh
;;
;; Returns a number, or #f if the script printed nothing numeric.
;;
(define (get-df path)
  (let ((script (configf:lookup *configdat* "setup" "free-space-script")))
    (if script
        (with-input-from-pipe
         (conc script " " path)
         (lambda ()
           (let ((res (read-line)))
             (and (string? res)
                  (string->number res)))))
        (get-unix-df path))))

;; As get-df but for free inodes; the override in [setup] is
;; free-inodes-script.
;;
(define (get-free-inodes path)
  (let ((script (configf:lookup *configdat* "setup" "free-inodes-script")))
    (if script
        (with-input-from-pipe
         (conc script " " path)
         (lambda ()
           (let ((res (read-line)))
             (and (string? res)
                  (string->number res)))))
        (get-unix-inodes path))))

;; Parse the output of "df path": the number in front of the "NN%" column
;; on the last matching line is returned, #f if no line matches.
;;
(define (get-unix-df path)
  (let* ((df-results (process:cmd-run->list (conc "df " path)))
         (space-rx   (regexp "([0-9]+)\\s+([0-9]+)%"))
         (freespc    #f))
    ;; (write df-results)
    (for-each (lambda (l)
                (let ((match (string-search space-rx l)))
                  (if match
                      (let ((newval (string->number (cadr match))))
                        (if (number? newval)
                            (set! freespc newval))))))
              (car df-results))
    freespc))

;; Same parsing as get-unix-df but on the output of "df -i path".
;;
(define (get-unix-inodes path)
  (let* ((df-results (process:cmd-run->list (conc "df -i " path)))
         (space-rx   (regexp "([0-9]+)\\s+([0-9]+)%"))
         (freenodes  0)) ;; 0 is a better failsafe than #f here.
    ;; (write df-results)
    (for-each (lambda (l)
                (let ((match (string-search space-rx l)))
                  (if match
                      (let ((newval (string->number (cadr match))))
                        (if (number? newval)
                            (set! freenodes newval))))))
              (car df-results))
    freenodes))

;; Returns (enough? available required dirpath). A missing directory, or a
;; free-space lookup that yields no number, counts as 0 available.
;;
(define (common:check-space-in-dir dirpath required)
  (let* ((dbspace (or (and (directory? dirpath)
                           (get-df dirpath))
                      0)))
    (list (> dbspace required)
          dbspace
          required
          dirpath)))

;;======================================================================
;; check space in dbdir and in megatest dir
;; returns: ok/not dbspace required-space
;;
;; The result is a list of two such entries, the one with the least
;; available space first.
;;
(define (common:check-db-dir-space)
  (let* ((required (string->number
                    ;; default is 1GB (or actually a billion bytes) This is the number of 1 kB blocks.
                    (or (configf:lookup *configdat* "setup" "dbdir-space-required")
                        "1000000")))
         (dbdir    (common:make-tmpdir-name *toppath* "")) ;; (db:get-dbdir))
         (tdbspace (common:check-space-in-dir dbdir required))
         (mdbspace (common:check-space-in-dir *toppath* required)))
    (sort (list tdbspace mdbspace) (lambda (a b)
                                     (< (cadr a)(cadr b))))))

;;======================================================================
;; check available space in dbdir, exit if insufficient
;;
(define (common:check-db-dir-and-exit-if-insufficient)
  (let* ((spacedat (car (common:check-db-dir-space))) ;; look only at worst for now
         (is-ok    (car spacedat))
         (dbspace  (cadr spacedat))
         (required (caddr spacedat))
         (dbdir    (cadddr spacedat)))
    (if (not is-ok)
        (begin
          (debug:print-error 0 *default-log-port* "Insufficient space in " dbdir ", require " required ", have " dbspace ", exiting now.")
          (exit 1)))))

;;======================================================================
;; paths is list of lists ((name path) ... )
;;
;; Returns (disk-name . path) for the usable disk with the most free space,
;; provided it has more than min_inodes free inodes ([setup] min_inodes,
;; default 1000000) and more than minsize free space. #f means no disk
;; candidate found.
;;
(define (common:get-disk-with-most-free-space disks minsize)
  (let* ((best                      #f)
         (bestsize                  0)
         (default-min-inodes-string "1000000")
         (default-min-inodes        (string->number default-min-inodes-string))
         (min-inodes                (or (string->number (or (configf:lookup *configdat* "setup" "min_inodes")
                                                            default-min-inodes-string))
                                        default-min-inodes))
         ;; a disk is usable if it is a writeable directory given as a fully
         ;; qualified path; otherwise warn (rate limited) and reject it
         (usable?                   (lambda (disk-num dirpath)
                                      (cond
                                       ((not (directory? dirpath))
                                        (if (common:low-noise-print 300 "disks not a dir " disk-num)
                                            (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it."))
                                        #f)
                                       ((not (file-write-access? dirpath))
                                        (if (common:low-noise-print 300 "disks not writeable " disk-num)
                                            (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it."))
                                        #f)
                                       ((not (eq? (string-ref dirpath 0) #\/))
                                        (if (common:low-noise-print 300 "disks not a proper path " disk-num)
                                            (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it."))
                                        #f)
                                       (else #t))))
         ;; anything that is not a number is treated as "nothing available"
         (number-or-neg1            (lambda (v) (if (number? v) v -1))))
    (for-each
     (lambda (disk-num)
       (let* ((dirpath     (cadr (assoc disk-num disks)))
              (ok          (usable? disk-num dirpath))
              (freespc     (if ok (number-or-neg1 (get-df dirpath)) -1))
              (free-inodes (if ok (number-or-neg1 (get-free-inodes dirpath)) -1)))
         (debug:print 2 *default-log-port* "INFO: disk " disk-num " path " dirpath " free space " freespc " free inodes " free-inodes)
         (if (and (> freespc bestsize)(> free-inodes min-inodes))
             (begin
               (set! best     (cons disk-num dirpath))
               (set! bestsize freespc)))
         ;;(print "Processing: " disk-num " bestsize: " bestsize " best: " best " freespc: " freespc " min-inodes: " min-inodes " free-inodes: " free-inodes)
         ))
     (map car disks))
    (if (and best (> bestsize minsize))
        best
        #f))) ;; #f means no disk candidate found

;;======================================================================
;; convert a spec string to a list of vectors #( rx action rx-string )
;;
;; Rules are separated by ";". A rule that does not end in one of the
;; given actions is reported and dropped.
;;
(define (common:spec-string->list-of-specs spec-string actions)
  (let ((spec-strings  (string-split-fields "\\s*;\\s*" spec-string #:infix))
        (actions-regex (regexp (conc "^(.*)\\s+(" (string-intersperse (map conc actions) "|") ")"))))
    (filter
     (lambda (x) x)
     (map (lambda (s)
            (let ((m (string-match actions-regex s)))
              (if m
                  (vector (regexp (cadr m))(string->symbol (caddr m))(cadr m))
                  (begin
                    (debug:print 0 *default-log-port* "WARNING: Unrecognised rule \"" s "\" in clean-up specification.")
                    #f))))
          spec-strings))))

;;======================================================================
;; given a list of specs rx . rule and a file return the first matching rule
;;
;; Returns #f when nothing matches, including when rules is empty.
;;
(define (common:file-find-rule fname rules) ;; rule is vector #( rx action rx-string)
  (let loop ((remaining rules))
    (cond
     ((null? remaining) #f)
     ((string-match (vector-ref (car remaining) 0) fname)
      (car remaining)) ;; return the whole rule so regex can be printed etc.
     (else (loop (cdr remaining))))))

;;======================================================================
;; given a spec apply some rules to a directory
;;
;; WARNING: This function will REMOVE files - be sure your spec and path is correct!
;;
;; spec format:
;;    file-regex1 action; file-regex2 action; ...
;; e.g.
;;    .*\.log$ keep; .* remove
;;       --> keep all .log files, remove everything else
;; limitations:
;;    cannot have a rule with ; as part of the spec
;;    not very flexible, would be nice to return binned file names?
;; supported rules:
;;    keep     - keep this file
;;    remove   - remove this file
;;    compress - compress this file
;;
;; A file that matches no rule is reported and left alone. With
;; remove-empty set, directories found to be empty afterwards are deleted,
;; longest path first.
;;
(define (common:dir-clean-up path spec-string #!key (compress "gzip")(actions '(keep remove compress))(remove-empty #f))
  (let* ((specs       (common:spec-string->list-of-specs spec-string actions))
         (keepers     (make-hash-table))
         (directories (make-hash-table)))
    (find-files
     path
     action: (lambda (p res)
               (if (directory? p)
                   (hash-table-set! directories p #t)
                   ;; rule is #f when nothing matches (or there are no specs
                   ;; at all); action is then #f and falls to the else clause
                   (let* ((rule   (and (pair? specs)
                                       (common:file-find-rule p specs)))
                          (action (and rule (vector-ref rule 1))))
                     (case action
                       ((keep)(hash-table-set! keepers p rule))
                       ((remove)
                        (debug:print 0 *default-log-port* "Removing file " p)
                        (delete-file p))
                       ((compress)
                        (debug:print 0 *default-log-port* "Compressing file " p)
                        ;; NOTE(review): p is passed to the shell unquoted
                        (system (conc compress " " p)))
                       (else
                        (debug:print 0 *default-log-port* "No match for file " p)))))))
    (if remove-empty
        (for-each
         (lambda (d)
           (if (null? (glob (conc d "/.*")(conc d "/*")))
               (begin
                 (debug:print 0 *default-log-port* "Removing empty directory " d)
                 (delete-directory d))))
         (sort (hash-table-keys directories) (lambda (a b)(> (string-length a)(string-length b))))))
    ))

;;======================================================================
;; E N V I R O N M E N T V A R S
;;======================================================================

;; Write the current environment to fname.csh (setenv lines) and fname.sh
;; (export lines). Variables listed in ignorevars, or whose name contains
;; characters outside the accepted set, are written commented out; the .sh
;; file additionally comments out names containing ":". Values containing
;; such characters are wrapped in double quotes unless already quoted.
;;
(define (save-environment-as-files fname #!key (ignorevars (list "USER" "HOME" "DISPLAY" "LS_COLORS" "XKEYSYMDB" "EDITOR" "MAKEFLAGS" "MAKEF" "MAKEOVERRIDES" "HOSTNAME")))
  ;;(bb-check-path msg: "save-environment-as-files entry")
  (let* ((envvars   (get-environment-variables))
         (whitesp   (regexp "[^a-zA-Z0-9_\\-:,\\.\\/%$]"))
         (mungeval  (lambda (val)
                      (cond
                       ((eq? val #t) "") ;; convert #t to empty string
                       ((eq? val #f) #f) ;; convert #f to itself (still thinking about this one
                       (else val))))
         ;; quote character to put around a value, "" when none is needed
         (delim-for (lambda (val)
                      (if (and (string-search whitesp val)
                               (not (string-search "^\".*\"$" val))
                               (not (string-search "^'.*'$" val)))
                          "\""
                          "")))
         ;; write one file; skip-key? decides which variables are commented out
         (write-env (lambda (extension live-prefix skipped-prefix separator skip-key?)
                      (with-output-to-file (conc fname extension)
                        (lambda ()
                          (for-each (lambda (keyval)
                                      (let* ((key   (car keyval))
                                             (val   (cdr keyval))
                                             (delim (delim-for val)))
                                        (print (if (skip-key? key)
                                                   skipped-prefix
                                                   live-prefix)
                                               key separator delim (mungeval val) delim)))
                                    envvars))))))
    (write-env ".csh" "setenv " "# setenv " " "
               (lambda (key)
                 (or (member key ignorevars)
                     (string-search whitesp key))))
    (write-env ".sh" "export " "# export " "="
               (lambda (key)
                 (or (member key ignorevars)
                     (string-search whitesp key)
                     (string-search ":" key)))))) ;; internal only values to be skipped.

(define (common:get-param-mapping #!key (flavor #f))
  "returns alist mapping string keys in testconfig/subrun to megatest command line switches; if flavor is switch-symbol, maps tcmt symbolic switches to megatest switches"
  (let ((default '(("tag-expr"  . "-tagexpr")
                   ("mode-patt" . "-modepatt")
                   ("run-name"  . "-runname")
                   ("contour"   . "-contour")
                   ("target"    . "-target")
                   ("test-patt" . "-testpatt")
                   ("msg"       . "-m")
                   ("log"       . "-log")
                   ("start-dir" . "-start-dir")
                   ("new"       . "-set-state-status"))))
    (if (eq? flavor 'switch-symbol)
        (map (lambda (x)
               (cons (string->symbol (conc "-" (car x))) (cdr x)))
             default)
        default)))

;;======================================================================
;; set some env vars from an alist, return an alist with original values
;; (("VAR" "value") ...)
;; a value of #f means "unset this var"
;;
;; Returns '() when lst is not a list.
;;
(define (alist->env-vars lst)
  (if (list? lst)
      (let ((res '()))
        (for-each (lambda (p)
                    (let* ((var (car p))
                           (val (cadr p))
                           (prv (get-environment-variable var)))
                      (set! res (cons (list var prv) res))
                      (if val
                          (safe-setenv var (->string val))
                          (unsetenv var))))
                  lst)
        res)
      '()))

;;======================================================================
;; clear vars matching pattern, run proc, set vars back
;; if proc is a string run that string as a command with
;; system.
;;

;; The environment to restore in common:with-orig-env: decoded from
;; MT_ORIG_ENV when that variable is set, otherwise the current environment
;; minus all MT_* variables.
(define *common:orig-env*
  (let ((envvars (get-environment-variables)))
    (if (get-environment-variable "MT_ORIG_ENV")
        (with-input-from-string
            (z3:decode-buffer (base64:base64-decode (get-environment-variable "MT_ORIG_ENV")))
          read)
        (filter-map (lambda (x)
                      (if (string-match "^MT_.*" (car x))
                          #f
                          x))
                    envvars))))

;; Replace the whole environment with *common:orig-env*, run proc (a thunk,
;; or a string handed to system), put the previous environment back and
;; return what proc returned.
;;
(define (common:with-orig-env proc)
  (let ((current-env (get-environment-variables)))
    (for-each (lambda (x) (unsetenv (car x))) current-env)
    (for-each (lambda (x) (setenv (car x) (cdr x))) *common:orig-env*)
    (let ((rv (cond
               ((string? proc)(system proc))
               (proc          (proc)))))
      (for-each (lambda (x) (unsetenv (car x))) *common:orig-env*)
      (for-each (lambda (x) (setenv (car x) (cdr x))) current-env)
      rv)))

;; Unset every variable whose name matches one of var-patts, run proc (a
;; thunk, or a string handed to system), then set the variables back.
;; Returns the hash table of the variables that were removed, not the
;; result of proc.
;;
(define (common:without-vars proc . var-patts)
  (let ((vars (make-hash-table)))
    (for-each
     (lambda (vardat) ;; each env var
       (for-each
        (lambda (var-patt)
          (if (string-match var-patt (car vardat))
              (let ((var (car vardat))
                    (val (cdr vardat)))
                (hash-table-set! vars var val)
                (unsetenv var))))
        var-patts))
     (get-environment-variables))
    (cond
     ((string? proc)(system proc))
     (proc          (proc)))
    (hash-table-for-each
     vars
     (lambda (var val)
       (setenv var val)))
    vars))

;; Run cmd through system, wrapped in the dtests pre/post commands when
;; either is defined and prefixed with "viewscreen " otherwise.
;;   with-vars     - run with the environment untouched
;;   with-orig-env - run in the original (pre-megatest) environment
;;   neither       - run with all MT_* variables removed
;;
(define (common:run-a-command cmd #!key (with-vars #f) (with-orig-env #f))
  (let* ((pre-cmd  (dtests:get-pre-command))
         (post-cmd (dtests:get-post-command))
         (fullcmd  (if (or pre-cmd post-cmd)
                       ;; a missing pre or post command contributes nothing;
                       ;; conc would otherwise render #f as the text "#f"
                       (conc (or pre-cmd "") cmd (or post-cmd ""))
                       (conc "viewscreen " cmd))))
    (debug:print-info 2 *default-log-port* "Running command: " fullcmd)
    (cond
     (with-vars     (common:without-vars fullcmd))
     (with-orig-env (common:with-orig-env fullcmd))
     (else          (common:without-vars fullcmd "MT_.*")))))

;;======================================================================
;; C O L O R S
;;======================================================================

;; Map a color name (case insensitive) to an "R G B" string. Only the names
;; listed are handled; the purple entry is a placeholder.
;;
(define (common:name->iup-color name)
  (case (string->symbol (string-downcase name))
    ((red)    "223 33 49")
    ((grey)   "192 192 192")
    ((orange) "255 172 13")
    ((purple) "This is unfinished ...")))

;;======================================================================
;; (define (common:get-color-for-state-status state status)
;;   (case (string->symbol state)
;;     ((COMPLETED)
;;      (case (string->symbol status)
;;        ((PASS) "70 249 73")
;;        ((WARN WAIVED) "255 172 13")
;;        ((SKIP) "230 230 0")
;;        (else "223 33 49")))
;;     ((LAUNCHED) "101 123 142")
;;     ((CHECK) "255 100 50")
;;     ((REMOTEHOSTSTART) "50 130 195")
;;     ((RUNNING) "9 131 232")
;;     ((KILLREQ) "39 82 206")
;;     ((KILLED) "234 101 17")
;;     ((NOT_STARTED) "240 240 240")
;;     (else "192 192 192")))

;; Convert an "R G B" decimal string to "r/g/b" with each part in hex.
;;
(define (common:iup-color->rgb-hex instr)
  (string-intersperse
   (map (lambda (x)
          (number->string x 16))
        (map string->number
             (string-split instr)))
   "/"))

;;======================================================================
;; L O C K I N G M E C H A N I S M S
;;======================================================================

;;======================================================================
;; faux-lock is deprecated.
;; Please use simple-lock below
;;
;; Try to take the lock keyname, retrying once a second for up to wait-time
;; seconds. When one second is left and allow-lock-steal is true the
;; existing lock is forcibly removed first. Returns #t if the lock ends up
;; held by this process, #f otherwise.
;;
(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t))
  (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count
      (if (> wait-time 0)
          (begin
            (thread-sleep! 1)
            (if (and allow-lock-steal
                     (= wait-time 1)) ;; only one second left, steal the lock
                (begin
                  (debug:print-info 0 *default-log-port* "stealing lock for " keyname)
                  (common:faux-unlock keyname force: #t)))
            (common:faux-lock keyname
                              wait-time:        (- wait-time 1)
                              allow-lock-steal: allow-lock-steal))
          #f)
      (begin
        (rmt:no-sync-set keyname (conc (current-process-id)))
        ;; read the value back to confirm that this process won
        (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))))

;; Release the lock keyname if it is held by this process, or
;; unconditionally when force is true. Returns #t if the release was
;; attempted, #f if the lock belongs to someone else.
;;
(define (common:faux-unlock keyname #!key (force #f))
  (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))
      (begin
        (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname))
        #t)
      #f))

;;======================================================================
;; simple lock. improve and converge on this one.
;;
(define (common:simple-lock keyname)
  (rmt:no-sync-get-lock keyname))

;; force is accepted but not used; the key is always deleted.
(define (common:simple-unlock keyname #!key (force #f))
  (rmt:no-sync-del! keyname))

;;======================================================================
;;
;;======================================================================

;; True when invoked with -execute and MT_CMDINFO is set in the environment.
;;
(define (common:in-running-test?)
  (and (args:get-arg "-execute") (get-environment-variable "MT_CMDINFO")))

;; Map a status string to a color name; unknown statuses give "black".
;;
(define (common:get-color-from-status status)
  (cond
   ((equal? status "PASS")    "green")
   ((equal? status "FAIL")    "red")
   ((equal? status "WARN")    "orange")
   ((equal? status "KILLED")  "orange")
   ((equal? status "KILLREQ") "purple")
   ((equal? status "RUNNING") "blue")
   ((equal? status "ABORT")   "brown")
   (else "black")))

;;======================================================================
;; N A N O M S G C L I E N T
;;======================================================================
;;
;;
;;
;; (define (common:send-dboard-main-changed)
;;   (let* ((dashboard-ips (mddb:get-dashboards)))
;;     (for-each
;;      (lambda (ipadr)
;;        (let* ((soc (common:open-nm-req (conc "tcp://" ipadr)))
;;               (msg (conc "main " *toppath*))
;;               (res (common:nm-send-receive-timeout soc msg)))
;;          (if (not res) ;; couldn't reach that dashboard - remove it from db
;;              (print "ERROR: couldn't reach dashboard " ipadr))
;;          res))
;;      dashboard-ips)))
;;
;;
;; ;;======================================================================
;; ;; D A S H B O A R D D B
;; ;;======================================================================
;;
;; (define (mddb:open-db)
;;   (let* ((db (open-database (conc (get-environment-variable "HOME") "/.dashboard.db"))))
;;     (set-busy-handler!
db (busy-timeout 10000)) +;; (for-each +;; (lambda (qry) +;; (exec (sql db qry))) +;; (list +;; "CREATE TABLE IF NOT EXISTS vars (id INTEGER PRIMARY KEY,key TEXT, val TEXT, CONSTRAINT varsconstraint UNIQUE (key));" +;; "CREATE TABLE IF NOT EXISTS dashboards ( +;; id INTEGER PRIMARY KEY, +;; pid INTEGER, +;; username TEXT, +;; hostname TEXT, +;; ipaddr TEXT, +;; portnum INTEGER, +;; start_time TIMESTAMP DEFAULT (strftime('%s','now')), +;; CONSTRAINT hostport UNIQUE (hostname,portnum) +;; );" +;; )) +;; db)) +;; +;; ;; register a dashboard +;; ;; +;; (define (mddb:register-dashboard port) +;; (let* ((pid (current-process-id)) +;; (hostname (get-host-name)) +;; (ipaddr (server:get-best-guess-address hostname)) +;; (username (current-user-name)) ;; (car userinfo))) +;; (db (mddb:open-db))) +;; (print "Register monitor, pid: " pid ", hostname: " hostname ", port: " port ", username: " username) +;; (exec (sql db "INSERT OR REPLACE INTO dashboards (pid,username,hostname,ipaddr,portnum) VALUES (?,?,?,?,?);") +;; pid username hostname ipaddr port) +;; (close-database db))) +;; +;; ;; unregister a monitor +;; ;; +;; (define (mddb:unregister-dashboard host port) +;; (let* ((db (mddb:open-db))) +;; (print "Register unregister monitor, host:port=" host ":" port) +;; (exec (sql db "DELETE FROM dashboards WHERE hostname=? 
AND portnum=?;") host port)
+;; (close-database db)))
+;;
+;; ;; get registered dashboards
+;; ;;
+;; (define (mddb:get-dashboards)
+;; (let ((db (mddb:open-db)))
+;; (query fetch-column
+;; (sql db "SELECT ipaddr || ':' || portnum FROM dashboards;"))))
+
+;;======================================================================
+;; T E S T L A U N C H I N G P E R I T E M W I T H H O S T T Y P E S
+;;======================================================================
+;;
+;; [hosts]
+;; arm cubie01 cubie02
+;; x86_64 zeus xena myth01
+;; allhosts #{g hosts arm} #{g hosts x86_64}
+;;
+;; [host-types]
+;; general #MTLOWESTLOAD #{g hosts allhosts}
+;; arm #MTLOWESTLOAD #{g hosts arm}
+;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo
+;;
+;; [host-rules]
+;; # maxnload => max normalized load
+;; # maxnjobs => max jobs per cpu
+;; # maxjobrate => max jobs per second
+;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1
+;;
+;; [launchers]
+;; envsetup general
+;; xor/%/n 4C16G
+;; % nbgeneral
+;;
+;; [jobtools]
+;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match.
+;; flexi-launcher yes
+;; launcher nbfake
+;;
+;; Choose the launcher command for one test item.
+;;
+;;   configdat - config hash table; the "jobtools", "launchers" and "host-types" sections are read
+;;   testname  - test name, matched together with itempath against each [launchers] pattern
+;;   itempath  - item path of the test
+;;
+;; Returns the [jobtools] launcher unless flexi-launcher is set to something other than "no".
+;; In that case the [launchers] entries are tried in order; the first pattern that matches and
+;; whose host-type has a [host-types] definition decides the result. A definition starting with
+;; #MTLOWESTLOAD yields "remrun <host>" for the host returned by common:get-least-loaded-host;
+;; any other definition is returned as is. If nothing usable matches, the [jobtools] launcher
+;; is returned. When no host can be found after 100 retries the process exits with status 1.
+;;
+(define (common:get-launcher configdat testname itempath)
+  (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher")))
+    (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher
+             (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no")))
+        (let* ((launchers (hash-table-ref/default configdat "launchers" '())))
+          (if (null? launchers)
+              fallback-launcher
+              (let loop ((hed (car launchers))
+                         (tal (cdr launchers)))
+                (let ((patt      (car hed))
+                      (host-type (cadr hed)))
+                  (if (tests:match patt testname itempath)
+                      (begin
+                        (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type)
+                        (let ((launcher (configf:lookup configdat "host-types" host-type)))
+                          (if launcher
+                              (let* ((launcher-parts (string-split launcher))
+                                     (launcher-exe   (car launcher-parts)))
+                                (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline
+                                    (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat))
+                                                    (count     100))
+                                      (if targ-host
+                                          (conc "remrun " targ-host)
+                                          (if (> count 0)
+                                              (begin
+                                                (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type)
+                                                (thread-sleep! (- 101 count)) ;; wait one second longer on each successive retry
+                                                (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)
+                                                           (- count 1)))
+                                              (begin
+                                                (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type)
+                                                (exit 1))))) ;; non-zero status: a bare (exit) would report success
+                                    launcher))
+                              (begin
+                                (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type)
+                                (if (null? tal)
+                                    fallback-launcher
+                                    (loop (car tal)(cdr tal)))))))
+                      ;; no match, try again
+                      (if (null? tal)
+                          fallback-launcher
+                          (loop (car tal)(cdr tal))))))))
+        fallback-launcher)))
+
+;;======================================================================
+;; D A S H B O A R D U S E R V I E W S
+;;======================================================================
+
+;;======================================================================
+;; Read *toppath*/.mtviews.config if it exists, then $HOME/.mtviews.config if it exists.
+;; Both files are read into the same hash table, which is returned.
+;;
+(define (common:load-views-config)
+  (let* ((view-cfgdat    (make-hash-table))
+         (home-cfgfile   (conc (get-environment-variable "HOME") "/.mtviews.config"))
+         (mthome-cfgfile (conc *toppath* "/.mtviews.config")))
+    (if (common:file-exists? mthome-cfgfile)
+        (read-config mthome-cfgfile view-cfgdat #t))
+    ;; we load the home dir file AFTER the MTRAH file so the user can clobber settings when running the dashboard in read-only areas
+    (if (common:file-exists? home-cfgfile)
+        (read-config home-cfgfile view-cfgdat #t))
+    view-cfgdat))
+
+;;======================================================================
+;; H I E R A R C H I C A L H A S H T A B L E S
+;;======================================================================
+;;
+;; Every element including top element is a vector:
+;;   slot 0 - hash table of sub-elements, slot 1 - the value stored at this level
+;;
+
+;; Create a new element. ht defaults to a fresh hash table, value defaults to #f.
+(define (hh:make-hh #!key (ht #f)(value #f))
+  (vector (or ht (make-hash-table)) value))
+
+;;======================================================================
+;; used internally
+(define-inline (hh:set-ht! hh ht) (vector-set! hh 0 ht))
+(define-inline (hh:get-ht hh) (vector-ref hh 0))
+(define-inline (hh:set-value! hh value) (vector-set! hh 1 value))
+;; NOTE(review): the second parameter is never used, yet callers must still supply it;
+;; hh:get below reads slot 1 directly instead. Confirm whether it can be dropped.
+(define-inline (hh:get-value hh value) (vector-ref hh 1))
+
+;;======================================================================
+;; given a hierarchical hash and some keys look up the value ...
+;; returns #f when any key along the path is missing
+;;
+(define (hh:get hh . keys)
+  (if (null? keys)
+      (vector-ref hh 1) ;; we have reached the end of the line, return the value sought
+      (let ((sub-ht (hh:get-ht hh)))
+        (if sub-ht ;; yes, there is more hierarchy
+            (let ((sub-hh (hash-table-ref/default sub-ht (car keys) #f)))
+              (if sub-hh
+                  (apply hh:get sub-hh (cdr keys))
+                  #f))
+            #f))))
+
+;;======================================================================
+;; given a hierarchical hash, a value and some keys, add needed hierarchy and insert the value
+;;
+(define (hh:set! hh value . keys)
+  (if (null? keys)
+      (hh:set-value! hh value) ;; we have reached the end of the line, store the value
+      (let ((sub-ht (hh:get-ht hh)))
+        (if sub-ht ;; yes, there is more hierarchy
+            (let ((sub-hh (hash-table-ref/default sub-ht (car keys) #f)))
+              (if (not sub-hh) ;; we'll need to add the next level of hierarchy
+                  (let ((new-sub-hh (hh:make-hh)))
+                    (hash-table-set! sub-ht (car keys) new-sub-hh)
+                    (apply hh:set! new-sub-hh value (cdr keys)))
+                  (apply hh:set! sub-hh value (cdr keys)))) ;; call the sub-hierhash with remaining keys
+            (begin
+              ;; this element has no table yet: give it one and retry with the same keys
+              (hh:set-ht! hh (make-hash-table))
+              (apply hh:set! hh value keys))))))
+
+;;======================================================================
+;; Manage pkts, used in servers, tests and likely other contexts so put
+;; in common
+;;======================================================================
+
+(define common:pkts-spec
+  '((default . ((parent . P)
+                (action . a)
+                (filename . f)))
+    (configf . ((parent . P)
+                (action . a)
+                (filename . f)))
+    (server . ((action . a)
+               (pid . d)
+               (ipaddr . i)
+               (port . p)
+               (parent . P)))
+
+    (test . ((cpuuse . c)
+             (diskuse . d)
+             (item-path . i)
+             (runname . r)
+             (state . s)
+             (target . t)
+             (status . u)
+             (parent . 
P)))))
+
+;; Return the list of pkts directories, or #f when none can be determined.
+;; The [setup] pktsdirs setting (space separated) wins; otherwise, when use-lt is true,
+;; "<*toppath* or current directory>/lt/.pkts" is used.
+;;
+(define (common:get-pkts-dirs mtconf use-lt)
+  (let* ((pktsdirs-str (or (configf:lookup mtconf "setup" "pktsdirs")
+                           (and use-lt
+                                (conc (or *toppath*
+                                          (current-directory))
+                                      "/lt/.pkts"))))
+         (pktsdirs     (if pktsdirs-str
+                           (string-split pktsdirs-str " ")
+                           #f)))
+    pktsdirs))
+
+;;======================================================================
+;; Write one pkt built from pktalist-in to <pktsdir>/<uuid>.pkt.
+;;
+;; use-lt is use linktree "lt" link to find pkts dir
+;;
+;; The uuid of the previously saved pkt (kept in *pkts-info* under 'last-parent) is added as
+;; the parent of the new pkt, and the new uuid then becomes 'last-parent. The directory used is
+;; cached in *pkts-info* under 'pkts-dir. A failure while writing is logged, not raised.
+;;
+(define (common:save-pkt pktalist-in mtconf use-lt #!key (add-only #f)) ;; add-only saves the pkt only if there is a parent already
+  (if (or (not add-only)
+          (hash-table-exists? *pkts-info* 'last-parent))
+      (let* ((parent   (hash-table-ref/default *pkts-info* 'last-parent #f))
+             (pktalist (if parent
+                           (cons `(parent . ,parent)
+                                 pktalist-in)
+                           pktalist-in)))
+        (let-values (((uuid pkt)
+                      (alist->pkt pktalist common:pkts-spec)))
+          (hash-table-set! *pkts-info* 'last-parent uuid)
+          (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f)
+                             (let ((pktsdirs (common:get-pkts-dirs mtconf use-lt)))
+                               ;; common:get-pkts-dirs returns #f when nothing is configured, so do not take the car blindly
+                               (and (pair? pktsdirs)
+                                    (let ((pktsdir (car pktsdirs)))
+                                      (hash-table-set! *pkts-info* 'pkts-dir pktsdir)
+                                      pktsdir))))))
+            (if (not pktsdir)
+                (debug:print 0 *default-log-port* "ERROR: no pkts directory available, pkt " uuid " not saved.")
+                (begin
+                  (debug:print 0 *default-log-port* "pktsdir: "pktsdir)
+                  (handle-exceptions
+                      exn
+                      ;; the port argument was missing here, which made the handler itself fail
+                      (debug:print-info 0 *default-log-port* "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!!
+                    (if (not (file-exists? pktsdir))
+                        (create-directory pktsdir #t))
+                    (with-output-to-file
+                        (conc pktsdir "/" uuid ".pkt")
+                      (lambda ()
+                        (print pkt)))))))))))
+
+;; Open the pkts queue db ("pkts.db" under [setup] pdbpath, or under the first pkts dir),
+;; call (proc pktsdirs pktsdir pdb) and close the db afterwards.
+;;
+;; Nothing is opened, and an error is logged instead, when the pkts dir, toppath or db path
+;; cannot be determined, when the pkts dir does not exist, or when it is not owned by the
+;; current effective user.
+;;
+;; NOTE(review): the db is not closed if proc raises, and the value of proc is discarded
+;; (the result of dbi:close is what comes back) - confirm this is intended.
+;;
+(define (common:with-queue-db mtconf proc #!key (use-lt #f)(toppath-in #f))
+  (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt))
+         (pktsdir  (if pktsdirs (car pktsdirs) #f))
+         (toppath  (or (configf:lookup mtconf "scratchdat" "toppath")
+                       toppath-in))
+         (pdbpath  (or (configf:lookup mtconf "setup" "pdbpath") pktsdir)))
+    (cond
+     ((not (and pktsdir toppath pdbpath))
+      (debug:print 0 *default-log-port* "ERROR: settings are missing in your megatest.config for area management.")
+      (debug:print 0 *default-log-port* " you need to have pktsdirs in the [setup] section."))
+     ((not (common:file-exists? pktsdir))
+      (debug:print 0 *default-log-port* "ERROR: pkts directory not found " pktsdir))
+     ((not (equal? (file-owner pktsdir)(current-effective-user-id)))
+      (debug:print 0 *default-log-port* "ERROR: directory " pktsdir " is not owned by " (current-effective-user-name)))
+     (else
+      (let* ((pdb (open-queue-db pdbpath "pkts.db"
+                                 schema: '("CREATE TABLE groups (id INTEGER PRIMARY KEY,groupname TEXT, CONSTRAINT group_constraint UNIQUE (groupname));"))))
+        (proc pktsdirs pktsdir pdb)
+        (dbi:close pdb))))))
+
+;; Load every *.pkt file found in the configured pkts directories into the queue db,
+;; skipping pkts whose uuid is already present.
+;;
+(define (common:load-pkts-to-db mtconf #!key (use-lt #f))
+  (common:with-queue-db
+   mtconf
+   (lambda (pktsdirs pktsdir pdb)
+     (for-each
+      (lambda (pktsdir) ;; look at all
+        (cond
+         ((not (common:file-exists? pktsdir))
+          (debug:print 0 *default-log-port* "ERROR: packets directory " pktsdir " does not exist."))
+         ((not (directory? pktsdir))
+          (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not a directory."))
+         ((not (file-read-access? 
pktsdir))
+          (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not readable."))
+         (else
+          (debug:print-info 0 *default-log-port* "Loading packets found in " pktsdir)
+          (let ((pkts (glob (conc pktsdir "/*.pkt")))
+                (sqdb (dbi:db-conn pdb))
+                )
+            ;; Put this in a transaction to avoid issues overloading the db
+            (sqlite3:with-transaction
+             sqdb
+             (lambda ()
+               (for-each
+                (lambda (pkt)
+                  ;; string-match gives #f for a file name that is not <hex digits>.pkt; taking the
+                  ;; cadr of that would raise and abort the whole transaction, so skip such files
+                  (let* ((name-match (string-match ".*/([0-9a-f]+).pkt" pkt))
+                         (uuid       (and name-match (cadr name-match)))
+                         (exists     (and uuid (lookup-by-uuid pdb uuid #f))))
+                    (cond
+                     ((not uuid)
+                      (debug:print 0 *default-log-port* "WARNING: skipping pkt file with unexpected name " pkt))
+                     ((not exists)
+                      (let* ((pktdat (string-intersperse
+                                      (with-input-from-file pkt read-lines)
+                                      "\n"))
+                             (apkt   (pkt->alist pktdat))
+                             (ptype  (alist-ref 'T apkt)))
+                        (add-to-queue pdb pktdat uuid (or ptype 'cmd) #f 0)
+                        (debug:print 4 *default-log-port* "Added " uuid " of type " ptype " to queue")))
+                     (else
+                      (debug:print 4 *default-log-port* "pkt: " uuid " exists, skipping...")))))
+                pkts)))))))
+      pktsdirs))
+   use-lt: use-lt))
+
+;; Return, for each entry of pkts, the value stored under its 'apkt key.
+;;
+(define (common:get-pkt-alists pkts)
+  (map (lambda (x)
+         (alist-ref 'apkt x)) ;; 'apkt pulls out the alist from the read pkt
+       pkts))
+
+;;======================================================================
+;; given list of pkts (alist mode) return list of D cards as Unix epoch, sorted descending
+;; also delete duplicates by target i.e. (car pkt)
+;;
+;; Each result is a pair (target . time). Because the list is sorted before duplicates are
+;; removed, the entry kept for a target is the one with the largest time.
+;; NOTE(review): every pkt is presumably expected to carry a D card; string->number is
+;; applied to it unchecked - confirm against the callers.
+;;
+(define (common:get-pkt-times pkts)
+  (delete-duplicates
+   (sort
+    (map (lambda (x)
+           `(,(alist-ref 't x) . ,(string->number (alist-ref 'D x))))
+         pkts)
+    (lambda (a b)(> (cdr a)(cdr b))))      ;; sort descending
+   (lambda (a b)(equal? (car a)(car b))))) ;; remove duplicates by target
+
+
+;; thread name -> thread, for threads started by common:send-thunk-to-background-thread
+;; whose thunk has not yet returned
+(define *common:thread-punchlist* (make-hash-table))
+
+;; Run thunk in a new thread and record that thread in *common:thread-punchlist*.
+;; The entry is removed when thunk returns. name is used as the thread name unless it is
+;; already on the punchlist, in which case a gensym suffix is appended; without a name an
+;; "anonymous-<gensym>" name is generated.
+;;
+(define (common:send-thunk-to-background-thread thunk #!key (name #f))
+  ;;(BB> "launched thread " name)
+  ;; we need a unique name for the thread.
+  (let* ((realname  (if name
+                        (if (not (hash-table-ref/default *common:thread-punchlist* name #f))
+                            name
+                            (conc name"-" (symbol->string (gensym))))
+                        (conc "anonymous-"(symbol->string (gensym)))))
+         (realthunk (lambda ()
+                      (let ((res (thunk)))
+                        (hash-table-delete! *common:thread-punchlist* realname)
+                        res)))
+         (thread    (make-thread realthunk realname)))
+    (hash-table-set! *common:thread-punchlist* realname thread)
+    (thread-start! thread)
+    ))
+
+;; Join every thread still listed in *common:thread-punchlist*. A failing join is logged
+;; and otherwise ignored.
+;;
+(define (common:join-backgrounded-threads)
+  ;; may need to trap and ignore exceptions -- dunno how atomic threads are...
+  (for-each
+   (lambda (thread-name)
+     (let* ((thread (hash-table-ref/default *common:thread-punchlist* thread-name #f)))
+       (if thread
+           (handle-exceptions
+               exn
+               (begin
+                 (debug:print 0 *default-log-port* "joining threads failed. exn=" exn)
+                 #t) ;; just ignore it, it might have died in the meantime so joining it will throw an exception
+             (thread-join! thread))
+           )))
+   (hash-table-keys *common:thread-punchlist*)))
+ )
Index: mtut.scm
==================================================================
--- mtut.scm
+++ mtut.scm
@@ -19,16 +19,17 @@
 (declare (uses common))
 (declare (uses mtargs))
 (declare (uses mtargs.import))
 (declare (uses debugprint))
 (declare (uses debugprint.import))
+(declare (uses commonmod))
+(declare (uses commonmod.import))
 (declare (uses configfmod))
 (declare (uses configfmod.import))
 (declare (uses configf))
-;; (declare (uses rmt))
-(declare (uses commonmod))
-(declare (uses commonmod.import))
+(declare (uses rmtmod))
+(declare (uses rmtmod.import))
 (include "megatest-version.scm")
 ;; fake out readline usage of toplevel-command
 (define (toplevel-command . 
a) #f) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -48,750 +48,5 @@ ;; ;; generate entries for ~/.megatestrc with the following ;; ;; grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u - -;;====================================================================== -;; S U P P O R T F U N C T I O N S -;;====================================================================== - -(define (rmt:on-homehost? runremote) - (let* ((hh-dat (remote-hh-dat runremote))) - (if (pair? hh-dat) - (cdr hh-dat) - (begin - (debug:print-info 0 *default-log-port* "hh-dat="hh-dat) - #f)))) - -(define (make-and-init-remote areapath) - (case (rmt:transport-mode) - ((http)(make-remote)) - ((tcp) (tt:make-remote areapath)) - (else #f))) - -;;====================================================================== - -(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id -(define *ttdat* #f) -;; how to make area-dat -(define (rmt:set-ttdat areapath ttdat) - (if ttdat - ttdat - (if *ttdat* - *ttdat* - (begin - (debug:print-info 2 *default-log-port* "rmt:set-ttdat: Initialize new ttdat") - (let* ((newremote (make-and-init-remote areapath))) - (set! *ttdat* newremote) - newremote - ) - ) - ) - ) -) - -;; NB// area-dat replaced by ttdat -;; -(define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f)) - (assert (or (not run-id) (number? 
run-id)) "FATAL: run-id is required to be a number or #f") - (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.") - (let* ((areapath *toppath*) ;; TODO - resolve from dbstruct to be compatible with multiple areas - (readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*)) - (testsuite (common:get-testsuite-name))) - (case (rmt:transport-mode) - ((tcp) - (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value - (attemptnum (+ 1 attemptnum)) - (mtexe (common:find-local-megatest)) - (dbfname (conc (dbfile:run-id->dbnum run-id)".db")) - (ttdat (rmt:set-ttdat areapath ttdat)) - (conn (tt:get-conn ttdat dbfname)) - (is-main (equal? dbfname "main.db")) ;; why not (not run-id) ? - (server-start-proc (if is-main - #f - (lambda () - ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname) - (rmt:start-server ;; tt:server-process-run - areapath - testsuite ;; (dbfile:testsuite-name) - mtexe - run-id))))) - ;; here we look at ttdat, if dbfname is NOT main.db we check that a conn exists for it - ;; and if there is no conn we first send a request to the main.db server to start a - ;; server for the dbfname. - #;(if (and (not is-main)(not conn)) ;; no existing connection to non-main server, call in a start up request - (begin - (server-start-proc) - (thread-sleep! 
1))) - (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc))) - ((nfs) - (nfs-transport-handler cmd run-id params attemptnum areapath readonly-mode testsuite)) - (else - (debug:print-info 0 *default-log-port* "rmt:transport-mode is "(rmt:transport-mode)) - (assert #f "FATAL: rmt:transport-mode set to invalid value."))))) - -(define (nfs-transport-handler cmd run-id params attemptnum areapath readonly-mode testsuite) - (let* ((keys (common:get-fields *configdat*)) - (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath))) - (api:dispatch-request dbstruct cmd run-id params))) - -(define (rmt:get-max-query-average run-id) - (mutex-lock! *db-stats-mutex*) - (let* ((runkey (conc "run-id=" run-id " ")) - (cmds (filter (lambda (x) - (substring-index runkey x)) - (hash-table-keys *db-stats*))) - (res (if (null? cmds) - (cons 'none 0) - (let loop ((cmd (car cmds)) - (tal (cdr cmds)) - (max-cmd (car cmds)) - (res 0)) - (let* ((cmd-dat (hash-table-ref *db-stats* cmd)) - (tot (vector-ref cmd-dat 0)) - (curravg (/ (vector-ref cmd-dat 1) (vector-ref cmd-dat 0))) ;; count is never zero by construction - (currmax (max res curravg)) - (newmax-cmd (if (> curravg res) cmd max-cmd))) - (if (null? tal) - (if (> tot 10) - (cons newmax-cmd currmax) - (cons 'none 0)) - (loop (car tal)(cdr tal) newmax-cmd currmax))))))) - (mutex-unlock! *db-stats-mutex*) - res)) - -(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5)) - (let* ((qry-is-write (not (member cmd api:read-only-queries))) - (db-file-path (common:make-tmpdir-name *toppath* "")) ;; 0)) - (dbstructs-local (db:setup)) - (read-only (not (file-write-access? 
db-file-path))) - (start (current-milliseconds)) - (resdat (if (not (and read-only qry-is-write)) - (let ((v (api:execute-requests dbstructs-local (vector (symbol->string cmd) params)))) - ;; (handle-exceptions ;; there has been a long history of receiving strange errors from values returned by the client when things go wrong.. - ;; exn ;; This is an attempt to detect that situation and recover gracefully - ;; (begin - ;; (debug:print 0 *default-log-port* "ERROR: bad data from server " v " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) - ;; (vector #t '())) ;; should always get a vector but if something goes wrong return a dummy - (if (and (vector? v) - (> (vector-length v) 1)) - (let ((newvec (vector (vector-ref v 0)(vector-ref v 1)))) - newvec) ;; by copying the vector while inside the error handler we should force the detection of a corrupted record - (vector #t '()))) ;; ) ;; we could also check that the returned types are valid - (vector #t '()))) - (success (vector-ref resdat 0)) - (res (vector-ref resdat 1)) - (duration (- (current-milliseconds) start))) - (if (and read-only qry-is-write) - (debug:print 0 *default-log-port* "ERROR: attempt to write to read-only database ignored. cmd=" cmd)) - (if (not success) - (if (> remretries 0) - (begin - (debug:print-error 0 *default-log-port* "local query failed. Trying again.") - (thread-sleep! (/ (random 5000) 1000)) ;; some random delay - (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1))) - (begin - (debug:print-error 0 *default-log-port* "too many retries in rmt:open-qry-close-locally, giving up") - #f)) - (begin - ;; (rmt:update-db-stats run-id cmd params duration) - ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it - (if qry-is-write - (let ((start-time (current-seconds))) - (mutex-lock! *db-multi-sync-mutex*) - (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) - (mutex-unlock! 
*db-multi-sync-mutex*))))) - res)) - -;;====================================================================== -;; -;; A C T U A L A P I C A L L S -;; -;;====================================================================== - -;;====================================================================== -;; S E R V E R -;;====================================================================== - -(define (rmt:kill-server run-id) - (rmt:send-receive 'kill-server run-id (list run-id))) - -(define (rmt:start-server areapath testsuite mtexe run-id) ;; run on main.db server - (rmt:send-receive 'start-server #f (list areapath testsuite mtexe run-id))) - -;;====================================================================== -;; M I S C -;;====================================================================== - -(define (rmt:login run-id) - (rmt:send-receive 'login run-id (list *toppath* megatest-version (client:get-signature)))) - -;; This login does no retries under the hood - it acts a bit like a ping. -;; Deprecated for nmsg-transport. 
-;; -;; (define (rmt:login-no-auto-client-setup runremote) -;; (rmt:send-receive-no-auto-client-setup runremote 'login #f (list *toppath* megatest-version (client:get-signature)))) - - -;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host -(define (rmt:get-latest-host-load hostname) - (rmt:send-receive 'get-latest-host-load #f (list hostname))) - -(define (rmt:sdb-qry qry val run-id) - ;; add caching if qry is 'getid or 'getstr - (rmt:send-receive 'sdb-qry run-id (list qry val))) - -;; NOT COMPLETED -(define (rmt:runtests user run-id testpatt params) - (rmt:send-receive 'runtests run-id testpatt)) - -(define (rmt:get-run-record-ids target run keynames ) - (rmt:send-receive 'get-run-record-ids #f (list target run keynames ))) - -(define (rmt:get-changed-record-ids since-time) - (rmt:send-receive 'get-changed-record-ids #f (list since-time)) ) - -(define (rmt:get-all-runids) - (rmt:send-receive 'get-all-run-ids #f '()) ) - -(define (rmt:get-changed-record-run-ids since-time) - (rmt:send-receive 'get-changed-record-run-ids #f (list since-time))) - -(define (rmt:get-changed-record-test-ids run-id since-time) - (rmt:send-receive 'get-changed-record-test-ids run-id (list since-time run-id))) - - - -(define (rmt:drop-all-triggers) - (rmt:send-receive 'drop-all-triggers #f '())) - -(define (rmt:create-all-triggers) - (rmt:send-receive 'create-all-triggers #f '())) - -;;====================================================================== -;; T E S T M E T A -;;====================================================================== - -(define (rmt:get-tests-tags) - (rmt:send-receive 'get-tests-tags #f '())) - -;;====================================================================== -;; K E Y S -;;====================================================================== - -;; These require run-id because the values come from the run! 
-;; however the query must still apply to main.db -;; -(define (rmt:get-key-val-pairs run-id) - (rmt:send-receive 'get-key-val-pairs #f (list run-id))) - -(define (rmt:get-keys) - (if *db-keys* *db-keys* - (let ((res (rmt:send-receive 'get-keys #f '()))) - (set! *db-keys* res) - res))) - -(define (rmt:get-keys-write) ;; dummy query to force server start - (let ((res (rmt:send-receive 'get-keys-write #f '()))) - (set! *db-keys* res) - res)) - -;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe -;; to cache the resuls in a hash -;; -(define (rmt:get-key-vals run-id) - (or (hash-table-ref/default *keyvals* run-id #f) - (let ((res (rmt:send-receive 'get-key-vals #f (list run-id)))) - (hash-table-set! *keyvals* run-id res) - res))) - -(define (rmt:get-targets) - (rmt:send-receive 'get-targets #f '())) - -(define (rmt:get-target run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-target #f (list run-id))) - -(define (rmt:get-run-times runpatt targetpatt) - (rmt:send-receive 'get-run-times #f (list runpatt targetpatt ))) - - -;;====================================================================== -;; T E S T S -;;====================================================================== - -;; IDEA: Threadify these - they spend a lot of time waiting ... -;; -(define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) - (let ((multi-run-mutex (make-mutex)) - (run-id-list (if run-ids - run-ids - (rmt:get-all-run-ids))) - (result '())) - (if (null? run-id-list) - '() - (let loop ((hed (car run-id-list)) - (tal (cdr run-id-list)) - (threads '())) - (if (> (length threads) 5) - (loop hed tal (filter (lambda (th)(not (member (thread-state th) '(terminated dead)))) threads)) - (let* ((newthread (make-thread - (lambda () - (let ((res (rmt:send-receive 'get-tests-for-run-mindata hed (list hed testpatt states status not-in)))) - (if (list? res) - (begin - (mutex-lock! multi-run-mutex) - (set! 
result (append result res)) - (mutex-unlock! multi-run-mutex)) - (debug:print-error 0 *default-log-port* "get-tests-for-run-mindata failed for run-id " hed ", testpatt " testpatt ", states " states ", status " status ", not-in " not-in)))) - (conc "multi-run-thread for run-id " hed))) - (newthreads (cons newthread threads))) - (thread-start! newthread) - (thread-sleep! 0.05) ;; give that thread some time to start - (if (null? tal) - newthreads - (loop (car tal)(cdr tal) newthreads)))))) - result)) - -;; ;; IDEA: Threadify these - they spend a lot of time waiting ... -;; ;; -;; (define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) -;; (let ((run-id-list (if run-ids -;; run-ids -;; (rmt:get-all-run-ids)))) -;; (apply append (map (lambda (run-id) -;; (rmt:send-receive 'get-tests-for-run-mindata run-id (list run-ids testpatt states status not-in))) -;; run-id-list)))) - -(define (rmt:delete-test-records run-id test-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'delete-test-records run-id (list run-id test-id))) - -(define (rmt:test-set-state-status run-id test-id state status msg) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-set-state-status run-id (list run-id test-id state status msg))) - -(define (rmt:test-toplevel-num-items run-id test-name) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-toplevel-num-items run-id (list run-id test-name))) - -;; (define (rmt:get-previous-test-run-record run-id test-name item-path) -;; (rmt:send-receive 'get-previous-test-run-record run-id (list run-id test-name item-path))) - -(define (rmt:get-matching-previous-test-run-records run-id test-name item-path) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-matching-previous-test-run-records run-id (list run-id test-name item-path))) - -(define (rmt:test-get-logfile-info run-id test-name) - (assert (number? 
run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-get-logfile-info run-id (list run-id test-name))) - -(define (rmt:test-get-records-for-index-file run-id test-name) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-get-records-for-index-file run-id (list run-id test-name))) - -(define (rmt:get-testinfo-state-status run-id test-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-testinfo-state-status run-id (list run-id test-id))) - -(define (rmt:test-set-log! run-id test-id logf) - (assert (number? run-id) "FATAL: Run id required.") - (if (string? logf)(rmt:general-call 'test-set-log run-id logf test-id))) - -(define (rmt:test-set-top-process-pid run-id test-id pid) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-set-top-process-pid run-id (list run-id test-id pid))) - -(define (rmt:test-get-top-process-pid run-id test-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-get-top-process-pid run-id (list run-id test-id))) - -(define (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt) - (rmt:send-receive 'get-run-ids-matching-target #f (list keynames target res runname testpatt statepatt statuspatt))) - -;; NOTE: This will open and access ALL run databases. -;; -(define (rmt:test-get-paths-matching-keynames-target-new keynames target res testpatt statepatt statuspatt runname) - (let ((run-ids (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt))) - (apply append - (map (lambda (run-id) - (rmt:send-receive 'test-get-paths-matching-keynames-target-new run-id (list run-id keynames target res testpatt statepatt statuspatt runname))) - run-ids)))) - - - -(define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f)) - (assert (number? 
run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps))) - -(define (rmt:get-count-tests-running-for-run-id run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id))) - -(define (rmt:get-not-completed-cnt run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-not-completed-cnt run-id (list run-id))) - - -;; Statistical queries - -(define (rmt:get-count-tests-running run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-count-tests-running run-id (list run-id))) - -(define (rmt:get-count-tests-running-for-testname run-id testname) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-count-tests-running-for-testname run-id (list run-id testname))) - -(define (rmt:get-count-tests-running-in-jobgroup run-id jobgroup) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-count-tests-running-in-jobgroup run-id (list run-id jobgroup))) - -(define (rmt:set-state-status-and-roll-up-run run-id state status) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id state status))) - - -(define (rmt:update-pass-fail-counts run-id test-name) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:general-call 'update-pass-fail-counts run-id test-name test-name test-name)) - -(define (rmt:top-test-set-per-pf-counts run-id test-name) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'top-test-set-per-pf-counts run-id (list run-id test-name))) - -(define (rmt:get-raw-run-stats run-id) - (assert (number? 
run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-raw-run-stats run-id (list run-id))) - -(define (rmt:get-test-times runname target) - (rmt:send-receive 'get-test-times #f (list runname target ))) - -;;====================================================================== -;; R U N S -;;====================================================================== - -;; BUG - LOOK AT HOW THIS WORKS!!! -;; -(define (rmt:get-run-info run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-run-info #f (list run-id))) - -(define (rmt:get-num-runs runpatt) - (rmt:send-receive 'get-num-runs #f (list runpatt))) - -(define (rmt:get-runs-cnt-by-patt runpatt targetpatt keys) - (rmt:send-receive 'get-runs-cnt-by-patt #f (list runpatt targetpatt keys))) - -;; Use the special run-id == #f scenario here since there is no run yet -(define (rmt:register-run keyvals runname state status user contour) - (rmt:send-receive 'register-run #f (list keyvals runname state status user contour))) - -(define (rmt:get-run-name-from-id run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-run-name-from-id #f (list run-id))) - -(define (rmt:delete-run run-id) - (rmt:send-receive 'delete-run #f (list run-id))) - -(define (rmt:update-run-stats run-id stats) - (rmt:send-receive 'update-run-stats #f (list run-id stats))) - -(define (rmt:delete-old-deleted-test-records run-id) - (rmt:send-receive 'delete-old-deleted-test-records run-id (list run-id))) - -(define (rmt:get-runs runpatt count offset keypatts) - (rmt:send-receive 'get-runs #f (list runpatt count offset keypatts))) - -(define (rmt:simple-get-runs runpatt count offset target last-update) - (rmt:send-receive 'simple-get-runs #f (list runpatt count offset target last-update))) - -(define (rmt:get-all-run-ids) - (rmt:send-receive 'get-all-run-ids #f '())) - -(define (rmt:get-prev-run-ids run-id) - (assert (number? 
run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-prev-run-ids #f (list run-id))) - -(define (rmt:lock/unlock-run run-id lock unlock user) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'lock/unlock-run #f (list run-id lock unlock user))) - -;; set/get status -(define (rmt:get-run-status run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-run-status #f (list run-id))) - -(define (rmt:get-run-state run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-run-state #f (list run-id))) - -(define (rmt:get-run-state-status run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-run-state-status #f (list run-id))) - -(define (rmt:set-run-status run-id run-status #!key (msg #f)) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) - -(define (rmt:set-run-state-status run-id state status ) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'set-run-state-status #f (list run-id state status))) - -(define (rmt:update-tesdata-on-repilcate-db old-lt new-lt) -(rmt:send-receive 'update-tesdata-on-repilcate-db #f (list old-lt new-lt))) - -(define (rmt:update-run-event_time run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'update-run-event_time #f (list run-id))) - -(define (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update #!key (sort-order "asc")) ;; fields of #f uses default - (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields last-runs-update sort-order))) - -(define (rmt:find-and-mark-incomplete run-id ovr-deadtime) - (assert (number? run-id) "FATAL: Run id required.") - ;; (if (rmt:send-receive 'have-incompletes? 
run-id (list run-id ovr-deadtime)) - (rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime))) ;; ) - -(define (rmt:get-main-run-stats run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-main-run-stats #f (list run-id))) - -(define (rmt:get-var varname) - (rmt:send-receive 'get-var #f (list varname))) - -(define (rmt:del-var varname) - (rmt:send-receive 'del-var #f (list varname))) - -(define (rmt:set-var varname value) - (rmt:send-receive 'set-var #f (list varname value))) - -(define (rmt:inc-var varname) - (rmt:send-receive 'inc-var #f (list varname))) - -(define (rmt:dec-var varname) - (rmt:send-receive 'dec-var #f (list varname))) - -(define (rmt:add-var varname value) - (rmt:send-receive 'add-var #f (list varname value))) - -;;====================================================================== -;; M U L T I R U N Q U E R I E S -;;====================================================================== - -;; Need to move this to multi-run section and make associated changes -(define (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f)) - (let ((run-ids (rmt:get-all-run-ids))) - (for-each (lambda (run-id) - (rmt:find-and-mark-incomplete run-id ovr-deadtime)) - run-ids))) - -;; get the previous record for when this test was run where all keys match but runname -;; returns #f if no such test found, returns a single test record if found -;; -;; Run this at the client end since we have to connect to multiple run-id dbs -;; -(define (rmt:get-previous-test-run-record run-id test-name item-path) - (let* ((keyvals (rmt:get-key-val-pairs run-id)) - (keys (rmt:get-keys)) - (selstr (string-intersperse keys ",")) - (qrystr (string-intersperse (map (lambda (x)(conc x "=?")) keys) " AND "))) - (if (not keyvals) - #f - (let ((prev-run-ids (rmt:get-prev-run-ids run-id))) - ;; for each run starting with the most recent look to see if there is a matching test - ;; if found then return that matching test record - 
(debug:print 4 *default-log-port* "selstr: " selstr ", qrystr: " qrystr ", keyvals: " keyvals ", previous run ids found: " prev-run-ids) - (if (null? prev-run-ids) #f - (let loop ((hed (car prev-run-ids)) - (tal (cdr prev-run-ids))) - (let ((results (rmt:get-tests-for-run hed (conc test-name "/" item-path) '() '() ;; run-id testpatt states statuses - #f #f #f ;; offset limit not-in hide/not-hide - #f #f #f #f 'normal))) ;; sort-by sort-order qryvals last-update mode - (debug:print 4 *default-log-port* "Got tests for run-id " run-id ", test-name " test-name ", item-path " item-path ": " results) - (if (and (null? results) - (not (null? tal))) - (loop (car tal)(cdr tal)) - (if (null? results) #f - (car results)))))))))) - -(define (rmt:get-run-stats) - (rmt:send-receive 'get-run-stats #f '())) - -;;====================================================================== -;; S T E P S -;;====================================================================== - -;; Getting steps is more complicated. -;; -;; If given work area -;; 1. Find the testdat.db file -;; 2. Open the testdat.db file and do the query -;; If not given the work area -;; 1. Do a remote call to get the test path -;; 2. Continue as above -;; -;;(define (rmt:get-steps-for-test run-id test-id) -;; (rmt:send-receive 'get-steps-data run-id (list test-id))) - -(define (rmt:teststep-set-status! run-id test-id teststep-name state-in status-in comment logfile) - (assert (number? run-id) "FATAL: Run id required.") - (let* ((state (items:check-valid-items "state" state-in)) - (status (items:check-valid-items "status" status-in))) - (if (or (not state)(not status)) - (debug:print 3 *default-log-port* "WARNING: Invalid " (if status "status" "state") - " value \"" (if status state-in status-in) "\", update your validvalues section in megatest.config")) - (rmt:send-receive 'teststep-set-status! run-id (list run-id test-id teststep-name state-in status-in comment logfile)))) - - -(define (rmt:delete-steps-for-test! 
run-id test-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'delete-steps-for-test! run-id (list run-id test-id))) - -(define (rmt:get-steps-for-test run-id test-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-steps-for-test run-id (list run-id test-id))) - -(define (rmt:get-steps-info-by-id run-id test-step-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-steps-info-by-id #f (list run-id test-step-id))) - -;;====================================================================== -;; T E S T D A T A -;;====================================================================== - -(define (rmt:read-test-data run-id test-id categorypatt #!key (work-area #f)) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'read-test-data run-id (list run-id test-id categorypatt))) - -(define (rmt:read-test-data-varpatt run-id test-id categorypatt varpatt #!key (work-area #f)) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'read-test-data-varpatt run-id (list run-id test-id categorypatt varpatt))) - -(define (rmt:get-data-info-by-id run-id test-data-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-data-info-by-id #f (list run-id test-data-id))) - -(define (rmt:testmeta-add-record testname) - (rmt:send-receive 'testmeta-add-record #f (list testname))) - -(define (rmt:testmeta-get-record testname) - (rmt:send-receive 'testmeta-get-record #f (list testname))) - -(define (rmt:testmeta-update-field test-name fld val) - (rmt:send-receive 'testmeta-update-field #f (list test-name fld val))) - -(define (rmt:test-data-rollup run-id test-id status) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-data-rollup run-id (list run-id test-id status))) - -(define (rmt:csv->test-data run-id test-id csvdata) - (assert (number? 
run-id) "FATAL: Run id required.") - (rmt:send-receive 'csv->test-data run-id (list run-id test-id csvdata))) - -;;====================================================================== -;; T A S K S -;;====================================================================== - -(define (rmt:tasks-find-task-queue-records target run-name test-patt state-patt action-patt) - (rmt:send-receive 'find-task-queue-records #f (list target run-name test-patt state-patt action-patt))) - -(define (rmt:tasks-add action owner target runname testpatt params) - (rmt:send-receive 'tasks-add #f (list action owner target runname testpatt params))) - -(define (rmt:tasks-set-state-given-param-key param-key new-state) - (rmt:send-receive 'tasks-set-state-given-param-key #f (list param-key new-state))) - -(define (rmt:tasks-get-last target runname) - (rmt:send-receive 'tasks-get-last #f (list target runname))) - -;;====================================================================== -;; N O S Y N C D B -;;====================================================================== - -(define (rmt:no-sync-set var val) - (rmt:send-receive 'no-sync-set #f `(,var ,val))) - -(define (rmt:no-sync-get/default var default) - (rmt:send-receive 'no-sync-get/default #f `(,var ,default))) - -(define (rmt:no-sync-del! var) - (rmt:send-receive 'no-sync-del! 
#f `(,var))) - -(define (rmt:no-sync-get-lock keyname) - (rmt:send-receive 'no-sync-get-lock #f `(,keyname))) - -;; process registration - -(define (rmt:register-process host port pid starttime status purpose dbname mtversion) - (rmt:send-receive 'register-process #f (list host port pid starttime status purpose dbname mtversion))) - -(define (rmt:set-process-done host pid reason) - (rmt:send-receive 'set-process-done #f (list host pid reason))) - -(define (rmt:set-process-status host pid newstatus) - (rmt:send-receive 'set-process-status #f (list host pid newstatus))) - -(define (rmt:get-process-options purpose dbname) - (rmt:get-process-options 'get-process-options #f (list purpose dbname))) - -;;====================================================================== -;; A R C H I V E S -;;====================================================================== - -(define (rmt:archive-get-allocations testname itempath dneeded) - (rmt:send-receive 'archive-get-allocations #f (list testname itempath dneeded))) - -(define (rmt:archive-register-block-name bdisk-id archive-path) - (rmt:send-receive 'archive-register-block-name #f (list bdisk-id archive-path))) - -(define (rmt:archive-allocate-testsuite/area-to-block block-id testsuite-name areakey) - (rmt:send-receive 'archive-allocate-test-to-block #f (list block-id testsuite-name areakey))) - -(define (rmt:archive-register-disk bdisk-name bdisk-path df) - (rmt:send-receive 'archive-register-disk #f (list bdisk-name bdisk-path df))) - -(define (rmt:test-set-archive-block-id run-id test-id archive-block-id) - (assert (number? 
run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-set-archive-block-id run-id (list run-id test-id archive-block-id))) - -(define (rmt:test-get-archive-block-info archive-block-id) - (rmt:send-receive 'test-get-archive-block-info #f (list archive-block-id))) - -(define (rmtmod:calc-ro-mode runremote *toppath*) - (case (rmt:transport-mode) - ((http) - (if (and runremote - (remote-ro-mode-checked runremote)) - (remote-ro-mode runremote) - (let* ((mtcfgfile (conc *toppath* "/megatest.config")) - (ro-mode (not (file-write-access? mtcfgfile)))) ;; TODO: use dbstruct or runremote to figure this out in future - (if runremote - (begin - (remote-ro-mode-set! runremote ro-mode) - (remote-ro-mode-checked-set! runremote #t) - ro-mode) - ro-mode)))) - ((tcp) - (if (and runremote - (tt-ro-mode-checked runremote)) - (tt-ro-mode runremote) - (let* ((mtcfgfile (conc *toppath* "/megatest.config")) - (ro-mode (not (file-write-access? mtcfgfile)))) ;; TODO: use dbstruct or runremote to figure this out in future - (if runremote - (begin - (tt-ro-mode-set! runremote ro-mode) - (tt-ro-mode-checked-set! runremote #t) - ro-mode) - ro-mode)))))) - -;;====================================================================== -;; Maintenance -;;====================================================================== - -(define (rmt:find-and-mark-incomplete run-id ovr-deadtime) - (let* ((cfg-deadtime (configf:lookup-number *configdat* "setup" "deadtime")) - (test-stats-update-period (configf:lookup-number *configdat* "setup" "test-stats-update-period"))) - (rmt:find-and-mark-incomplete-engine run-id ovr-deadtime cfg-deadtime test-stats-update-period) - ;;call end of eud of run detection for posthook - (launch:end-of-run-check run-id))) Index: rmtmod.scm ================================================================== --- rmtmod.scm +++ rmtmod.scm @@ -294,7 +294,753 @@ ;; call end of eud of run detection for posthook - from merge, is it needed? 
;; (launch:end-of-run-check run-id) all-ids) ))))) +;;====================================================================== + +(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id +(define *ttdat* #f) + +;; NB// area-dat replaced by ttdat +;; +(define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f)) + (assert (or (not run-id) (number? run-id)) "FATAL: run-id is required to be a number or #f") + (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.") + (let* ((areapath *toppath*) ;; TODO - resolve from dbstruct to be compatible with multiple areas + (readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*)) + (testsuite (common:get-testsuite-name))) + (case (rmt:transport-mode) + ((tcp) + (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value + (attemptnum (+ 1 attemptnum)) + (mtexe (common:find-local-megatest)) + (dbfname (conc (dbfile:run-id->dbnum run-id)".db")) + (ttdat (rmt:set-ttdat areapath ttdat)) + (conn (tt:get-conn ttdat dbfname)) + (is-main (equal? dbfname "main.db")) ;; why not (not run-id) ? + (server-start-proc (if is-main + #f + (lambda () + ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname) + (rmt:start-server ;; tt:server-process-run + areapath + testsuite ;; (dbfile:testsuite-name) + mtexe + run-id))))) + ;; here we look at ttdat, if dbfname is NOT main.db we check that a conn exists for it + ;; and if there is no conn we first send a request to the main.db server to start a + ;; server for the dbfname. + #;(if (and (not is-main)(not conn)) ;; no existing connection to non-main server, call in a start up request + (begin + (server-start-proc) + (thread-sleep! 
1))) + (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc))) + ((nfs) + (nfs-transport-handler cmd run-id params attemptnum areapath readonly-mode testsuite)) + (else + (debug:print-info 0 *default-log-port* "rmt:transport-mode is "(rmt:transport-mode)) + (assert #f "FATAL: rmt:transport-mode set to invalid value."))))) + +(define (nfs-transport-handler cmd run-id params attemptnum areapath readonly-mode testsuite) + (let* ((keys (common:get-fields *configdat*)) + (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath))) + (api:dispatch-request dbstruct cmd run-id params))) + +(define (rmt:get-max-query-average run-id) + (mutex-lock! *db-stats-mutex*) + (let* ((runkey (conc "run-id=" run-id " ")) + (cmds (filter (lambda (x) + (substring-index runkey x)) + (hash-table-keys *db-stats*))) + (res (if (null? cmds) + (cons 'none 0) + (let loop ((cmd (car cmds)) + (tal (cdr cmds)) + (max-cmd (car cmds)) + (res 0)) + (let* ((cmd-dat (hash-table-ref *db-stats* cmd)) + (tot (vector-ref cmd-dat 0)) + (curravg (/ (vector-ref cmd-dat 1) (vector-ref cmd-dat 0))) ;; count is never zero by construction + (currmax (max res curravg)) + (newmax-cmd (if (> curravg res) cmd max-cmd))) + (if (null? tal) + (if (> tot 10) + (cons newmax-cmd currmax) + (cons 'none 0)) + (loop (car tal)(cdr tal) newmax-cmd currmax))))))) + (mutex-unlock! *db-stats-mutex*) + res)) + +(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5)) + (let* ((qry-is-write (not (member cmd api:read-only-queries))) + (db-file-path (common:make-tmpdir-name *toppath* "")) ;; 0)) + (dbstructs-local (db:setup)) + (read-only (not (file-write-access? 
db-file-path))) + (start (current-milliseconds)) + (resdat (if (not (and read-only qry-is-write)) + (let ((v (api:execute-requests dbstructs-local (vector (symbol->string cmd) params)))) + ;; (handle-exceptions ;; there has been a long history of receiving strange errors from values returned by the client when things go wrong.. + ;; exn ;; This is an attempt to detect that situation and recover gracefully + ;; (begin + ;; (debug:print 0 *default-log-port* "ERROR: bad data from server " v " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) + ;; (vector #t '())) ;; should always get a vector but if something goes wrong return a dummy + (if (and (vector? v) + (> (vector-length v) 1)) + (let ((newvec (vector (vector-ref v 0)(vector-ref v 1)))) + newvec) ;; by copying the vector while inside the error handler we should force the detection of a corrupted record + (vector #t '()))) ;; ) ;; we could also check that the returned types are valid + (vector #t '()))) + (success (vector-ref resdat 0)) + (res (vector-ref resdat 1)) + (duration (- (current-milliseconds) start))) + (if (and read-only qry-is-write) + (debug:print 0 *default-log-port* "ERROR: attempt to write to read-only database ignored. cmd=" cmd)) + (if (not success) + (if (> remretries 0) + (begin + (debug:print-error 0 *default-log-port* "local query failed. Trying again.") + (thread-sleep! (/ (random 5000) 1000)) ;; some random delay + (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1))) + (begin + (debug:print-error 0 *default-log-port* "too many retries in rmt:open-qry-close-locally, giving up") + #f)) + (begin + ;; (rmt:update-db-stats run-id cmd params duration) + ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it + (if qry-is-write + (let ((start-time (current-seconds))) + (mutex-lock! *db-multi-sync-mutex*) + (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) + (mutex-unlock! 
*db-multi-sync-mutex*))))) + res)) + +;;====================================================================== +;; +;; A C T U A L A P I C A L L S +;; +;;====================================================================== + +;;====================================================================== +;; S E R V E R +;;====================================================================== + +(define (rmt:kill-server run-id) + (rmt:send-receive 'kill-server run-id (list run-id))) + +(define (rmt:start-server areapath testsuite mtexe run-id) ;; run on main.db server + (rmt:send-receive 'start-server #f (list areapath testsuite mtexe run-id))) + +;;====================================================================== +;; M I S C +;;====================================================================== + +(define (rmt:login run-id) + (rmt:send-receive 'login run-id (list *toppath* megatest-version (client:get-signature)))) + +;; This login does no retries under the hood - it acts a bit like a ping. +;; Deprecated for nmsg-transport. 
+;;
+;; (define (rmt:login-no-auto-client-setup runremote)
+;;   (rmt:send-receive-no-auto-client-setup runremote 'login #f (list *toppath* megatest-version (client:get-signature))))
+
+
+;; Ask for the latest load information recorded for `hostname`.
+;; Per the original note, the reply is a pair of cpu load and update time
+;; representing the latest intelligence from tests running on that host.
+;; Sent with run-id #f and a one-element parameter list.
+(define (rmt:get-latest-host-load hostname)
+  (rmt:send-receive 'get-latest-host-load #f `(,hostname)))
+
+;; Forward an sdb query. `run-id` is passed as the routing argument of
+;; rmt:send-receive and is not part of the parameter list, which is (qry val).
+(define (rmt:sdb-qry qry val run-id)
+  ;; TODO: add caching if qry is 'getid or 'getstr
+  (let ((payload `(,qry ,val)))
+    (rmt:send-receive 'sdb-qry run-id payload)))
+
+;; NOT COMPLETED
+;; NOTE(review): `user` and `params` are accepted but never used, and
+;; `testpatt` is handed over bare whereas every other wrapper in this group
+;; passes a list - confirm the intended payload before relying on this call.
+(define (rmt:runtests user run-id testpatt params)
+  (rmt:send-receive 'runtests run-id testpatt))
+
+;; Send 'get-run-record-ids with parameters (target run keynames), run-id #f.
+(define (rmt:get-run-record-ids target run keynames)
+  (rmt:send-receive 'get-run-record-ids #f `(,target ,run ,keynames)))
+
+;; Send 'get-changed-record-ids with the single parameter `since-time`, run-id #f.
+(define (rmt:get-changed-record-ids since-time)
+  (rmt:send-receive 'get-changed-record-ids #f `(,since-time)))
+
+;; Send 'get-all-run-ids with an empty parameter list, run-id #f.
+;; Issues the same command as rmt:get-all-run-ids defined elsewhere in this file.
+(define (rmt:get-all-runids)
+  (rmt:send-receive 'get-all-run-ids #f (list)))
+
+;; Send 'get-changed-record-run-ids with the single parameter `since-time`, run-id #f.
+(define (rmt:get-changed-record-run-ids since-time)
+  (rmt:send-receive 'get-changed-record-run-ids #f `(,since-time)))
+
+;; Send 'get-changed-record-test-ids for `run-id`.
+;; The parameter list is ordered (since-time run-id), i.e. run-id comes last
+;; here - NOTE(review): confirm this order against the server-side handler.
+(define (rmt:get-changed-record-test-ids run-id since-time)
+  (let ((payload `(,since-time ,run-id)))
+    (rmt:send-receive 'get-changed-record-test-ids run-id payload)))
+
+
+
+;; Send 'drop-all-triggers with no parameters, run-id #f.
+(define (rmt:drop-all-triggers)
+  (rmt:send-receive 'drop-all-triggers #f (list)))
+
+;; Send 'create-all-triggers with no parameters, run-id #f.
+(define (rmt:create-all-triggers)
+  (rmt:send-receive 'create-all-triggers #f (list)))
+
+;;======================================================================
+;; T E S T   M E T A
+;;======================================================================
+
+;; Send 'get-tests-tags with no parameters, run-id #f.
+(define (rmt:get-tests-tags)
+  (rmt:send-receive 'get-tests-tags #f (list)))
+
+;;======================================================================
+;;  K E Y S
+;;======================================================================
+
+;; These require run-id because the values come from the run!
+;; however the query must still apply to main.db +;; +(define (rmt:get-key-val-pairs run-id) + (rmt:send-receive 'get-key-val-pairs #f (list run-id))) + +(define (rmt:get-keys) + (if *db-keys* *db-keys* + (let ((res (rmt:send-receive 'get-keys #f '()))) + (set! *db-keys* res) + res))) + +(define (rmt:get-keys-write) ;; dummy query to force server start + (let ((res (rmt:send-receive 'get-keys-write #f '()))) + (set! *db-keys* res) + res)) + +;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe +;; to cache the resuls in a hash +;; +(define (rmt:get-key-vals run-id) + (or (hash-table-ref/default *keyvals* run-id #f) + (let ((res (rmt:send-receive 'get-key-vals #f (list run-id)))) + (hash-table-set! *keyvals* run-id res) + res))) + +(define (rmt:get-targets) + (rmt:send-receive 'get-targets #f '())) + +(define (rmt:get-target run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-target #f (list run-id))) + +(define (rmt:get-run-times runpatt targetpatt) + (rmt:send-receive 'get-run-times #f (list runpatt targetpatt ))) + + +;;====================================================================== +;; T E S T S +;;====================================================================== + +;; IDEA: Threadify these - they spend a lot of time waiting ... +;; +(define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) + (let ((multi-run-mutex (make-mutex)) + (run-id-list (if run-ids + run-ids + (rmt:get-all-run-ids))) + (result '())) + (if (null? run-id-list) + '() + (let loop ((hed (car run-id-list)) + (tal (cdr run-id-list)) + (threads '())) + (if (> (length threads) 5) + (loop hed tal (filter (lambda (th)(not (member (thread-state th) '(terminated dead)))) threads)) + (let* ((newthread (make-thread + (lambda () + (let ((res (rmt:send-receive 'get-tests-for-run-mindata hed (list hed testpatt states status not-in)))) + (if (list? res) + (begin + (mutex-lock! multi-run-mutex) + (set! 
result (append result res)) + (mutex-unlock! multi-run-mutex)) + (debug:print-error 0 *default-log-port* "get-tests-for-run-mindata failed for run-id " hed ", testpatt " testpatt ", states " states ", status " status ", not-in " not-in)))) + (conc "multi-run-thread for run-id " hed))) + (newthreads (cons newthread threads))) + (thread-start! newthread) + (thread-sleep! 0.05) ;; give that thread some time to start + (if (null? tal) + newthreads + (loop (car tal)(cdr tal) newthreads)))))) + result)) + +;; ;; IDEA: Threadify these - they spend a lot of time waiting ... +;; ;; +;; (define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) +;; (let ((run-id-list (if run-ids +;; run-ids +;; (rmt:get-all-run-ids)))) +;; (apply append (map (lambda (run-id) +;; (rmt:send-receive 'get-tests-for-run-mindata run-id (list run-ids testpatt states status not-in))) +;; run-id-list)))) + +(define (rmt:delete-test-records run-id test-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'delete-test-records run-id (list run-id test-id))) + +(define (rmt:test-set-state-status run-id test-id state status msg) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'test-set-state-status run-id (list run-id test-id state status msg))) + +(define (rmt:test-toplevel-num-items run-id test-name) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'test-toplevel-num-items run-id (list run-id test-name))) + +;; (define (rmt:get-previous-test-run-record run-id test-name item-path) +;; (rmt:send-receive 'get-previous-test-run-record run-id (list run-id test-name item-path))) + +(define (rmt:get-matching-previous-test-run-records run-id test-name item-path) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-matching-previous-test-run-records run-id (list run-id test-name item-path))) + +(define (rmt:test-get-logfile-info run-id test-name) + (assert (number? 
run-id) "FATAL: Run id required.") + (rmt:send-receive 'test-get-logfile-info run-id (list run-id test-name))) + +(define (rmt:test-get-records-for-index-file run-id test-name) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'test-get-records-for-index-file run-id (list run-id test-name))) + +(define (rmt:get-testinfo-state-status run-id test-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-testinfo-state-status run-id (list run-id test-id))) + +(define (rmt:test-set-log! run-id test-id logf) + (assert (number? run-id) "FATAL: Run id required.") + (if (string? logf)(rmt:general-call 'test-set-log run-id logf test-id))) + +(define (rmt:test-set-top-process-pid run-id test-id pid) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'test-set-top-process-pid run-id (list run-id test-id pid))) + +(define (rmt:test-get-top-process-pid run-id test-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'test-get-top-process-pid run-id (list run-id test-id))) + +(define (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt) + (rmt:send-receive 'get-run-ids-matching-target #f (list keynames target res runname testpatt statepatt statuspatt))) + +;; NOTE: This will open and access ALL run databases. +;; +(define (rmt:test-get-paths-matching-keynames-target-new keynames target res testpatt statepatt statuspatt runname) + (let ((run-ids (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt))) + (apply append + (map (lambda (run-id) + (rmt:send-receive 'test-get-paths-matching-keynames-target-new run-id (list run-id keynames target res testpatt statepatt statuspatt runname))) + run-ids)))) + + + +(define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f)) + (assert (number? 
run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps))) + +(define (rmt:get-count-tests-running-for-run-id run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id))) + +(define (rmt:get-not-completed-cnt run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-not-completed-cnt run-id (list run-id))) + + +;; Statistical queries + +(define (rmt:get-count-tests-running run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-count-tests-running run-id (list run-id))) + +(define (rmt:get-count-tests-running-for-testname run-id testname) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-count-tests-running-for-testname run-id (list run-id testname))) + +(define (rmt:get-count-tests-running-in-jobgroup run-id jobgroup) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-count-tests-running-in-jobgroup run-id (list run-id jobgroup))) + +(define (rmt:set-state-status-and-roll-up-run run-id state status) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id state status))) + + +(define (rmt:update-pass-fail-counts run-id test-name) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:general-call 'update-pass-fail-counts run-id test-name test-name test-name)) + +(define (rmt:top-test-set-per-pf-counts run-id test-name) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'top-test-set-per-pf-counts run-id (list run-id test-name))) + +(define (rmt:get-raw-run-stats run-id) + (assert (number? 
run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-raw-run-stats run-id (list run-id))) + +(define (rmt:get-test-times runname target) + (rmt:send-receive 'get-test-times #f (list runname target ))) + +;;====================================================================== +;; R U N S +;;====================================================================== + +;; BUG - LOOK AT HOW THIS WORKS!!! +;; +(define (rmt:get-run-info run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-run-info #f (list run-id))) + +(define (rmt:get-num-runs runpatt) + (rmt:send-receive 'get-num-runs #f (list runpatt))) + +(define (rmt:get-runs-cnt-by-patt runpatt targetpatt keys) + (rmt:send-receive 'get-runs-cnt-by-patt #f (list runpatt targetpatt keys))) + +;; Use the special run-id == #f scenario here since there is no run yet +(define (rmt:register-run keyvals runname state status user contour) + (rmt:send-receive 'register-run #f (list keyvals runname state status user contour))) + +(define (rmt:get-run-name-from-id run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-run-name-from-id #f (list run-id))) + +(define (rmt:delete-run run-id) + (rmt:send-receive 'delete-run #f (list run-id))) + +(define (rmt:update-run-stats run-id stats) + (rmt:send-receive 'update-run-stats #f (list run-id stats))) + +(define (rmt:delete-old-deleted-test-records run-id) + (rmt:send-receive 'delete-old-deleted-test-records run-id (list run-id))) + +(define (rmt:get-runs runpatt count offset keypatts) + (rmt:send-receive 'get-runs #f (list runpatt count offset keypatts))) + +(define (rmt:simple-get-runs runpatt count offset target last-update) + (rmt:send-receive 'simple-get-runs #f (list runpatt count offset target last-update))) + +(define (rmt:get-all-run-ids) + (rmt:send-receive 'get-all-run-ids #f '())) + +(define (rmt:get-prev-run-ids run-id) + (assert (number? 
run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-prev-run-ids #f (list run-id))) + +(define (rmt:lock/unlock-run run-id lock unlock user) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'lock/unlock-run #f (list run-id lock unlock user))) + +;; set/get status +(define (rmt:get-run-status run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-run-status #f (list run-id))) + +(define (rmt:get-run-state run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-run-state #f (list run-id))) + +(define (rmt:get-run-state-status run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-run-state-status #f (list run-id))) + +(define (rmt:set-run-status run-id run-status #!key (msg #f)) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) + +(define (rmt:set-run-state-status run-id state status ) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'set-run-state-status #f (list run-id state status))) + +(define (rmt:update-tesdata-on-repilcate-db old-lt new-lt) +(rmt:send-receive 'update-tesdata-on-repilcate-db #f (list old-lt new-lt))) + +(define (rmt:update-run-event_time run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'update-run-event_time #f (list run-id))) + +(define (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update #!key (sort-order "asc")) ;; fields of #f uses default + (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields last-runs-update sort-order))) + +(define (rmt:find-and-mark-incomplete run-id ovr-deadtime) + (assert (number? run-id) "FATAL: Run id required.") + ;; (if (rmt:send-receive 'have-incompletes? 
run-id (list run-id ovr-deadtime)) + (rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime))) ;; ) + +(define (rmt:get-main-run-stats run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-main-run-stats #f (list run-id))) + +(define (rmt:get-var varname) + (rmt:send-receive 'get-var #f (list varname))) + +(define (rmt:del-var varname) + (rmt:send-receive 'del-var #f (list varname))) + +(define (rmt:set-var varname value) + (rmt:send-receive 'set-var #f (list varname value))) + +(define (rmt:inc-var varname) + (rmt:send-receive 'inc-var #f (list varname))) + +(define (rmt:dec-var varname) + (rmt:send-receive 'dec-var #f (list varname))) + +(define (rmt:add-var varname value) + (rmt:send-receive 'add-var #f (list varname value))) + +;;====================================================================== +;; M U L T I R U N Q U E R I E S +;;====================================================================== + +;; Need to move this to multi-run section and make associated changes +(define (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f)) + (let ((run-ids (rmt:get-all-run-ids))) + (for-each (lambda (run-id) + (rmt:find-and-mark-incomplete run-id ovr-deadtime)) + run-ids))) + +;; get the previous record for when this test was run where all keys match but runname +;; returns #f if no such test found, returns a single test record if found +;; +;; Run this at the client end since we have to connect to multiple run-id dbs +;; +(define (rmt:get-previous-test-run-record run-id test-name item-path) + (let* ((keyvals (rmt:get-key-val-pairs run-id)) + (keys (rmt:get-keys)) + (selstr (string-intersperse keys ",")) + (qrystr (string-intersperse (map (lambda (x)(conc x "=?")) keys) " AND "))) + (if (not keyvals) + #f + (let ((prev-run-ids (rmt:get-prev-run-ids run-id))) + ;; for each run starting with the most recent look to see if there is a matching test + ;; if found then return that matching test record + 
(debug:print 4 *default-log-port* "selstr: " selstr ", qrystr: " qrystr ", keyvals: " keyvals ", previous run ids found: " prev-run-ids) + (if (null? prev-run-ids) #f + (let loop ((hed (car prev-run-ids)) + (tal (cdr prev-run-ids))) + (let ((results (rmt:get-tests-for-run hed (conc test-name "/" item-path) '() '() ;; run-id testpatt states statuses + #f #f #f ;; offset limit not-in hide/not-hide + #f #f #f #f 'normal))) ;; sort-by sort-order qryvals last-update mode + (debug:print 4 *default-log-port* "Got tests for run-id " run-id ", test-name " test-name ", item-path " item-path ": " results) + (if (and (null? results) + (not (null? tal))) + (loop (car tal)(cdr tal)) + (if (null? results) #f + (car results)))))))))) + +(define (rmt:get-run-stats) + (rmt:send-receive 'get-run-stats #f '())) + +;;====================================================================== +;; S T E P S +;;====================================================================== + +;; Getting steps is more complicated. +;; +;; If given work area +;; 1. Find the testdat.db file +;; 2. Open the testdat.db file and do the query +;; If not given the work area +;; 1. Do a remote call to get the test path +;; 2. Continue as above +;; +;;(define (rmt:get-steps-for-test run-id test-id) +;; (rmt:send-receive 'get-steps-data run-id (list test-id))) + +(define (rmt:teststep-set-status! run-id test-id teststep-name state-in status-in comment logfile) ;; validates state-in/status-in, warns on an invalid value, then forwards the raw values to the server + (assert (number? run-id) "FATAL: Run id required.") + (let* ((state (items:check-valid-items "state" state-in)) + (status (items:check-valid-items "status" status-in))) + (if (or (not state)(not status)) + (debug:print 3 *default-log-port* "WARNING: Invalid " (if status "state" "status") ;; a valid status means state is the rejected field; label now matches the value printed next + " value \"" (if status state-in status-in) "\", update your validvalues section in megatest.config")) + (rmt:send-receive 'teststep-set-status! run-id (list run-id test-id teststep-name state-in status-in comment logfile)))) + + +(define (rmt:delete-steps-for-test! 
run-id test-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'delete-steps-for-test! run-id (list run-id test-id))) + +(define (rmt:get-steps-for-test run-id test-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-steps-for-test run-id (list run-id test-id))) + +(define (rmt:get-steps-info-by-id run-id test-step-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-steps-info-by-id #f (list run-id test-step-id))) + +;;====================================================================== +;; T E S T D A T A +;;====================================================================== + +(define (rmt:read-test-data run-id test-id categorypatt #!key (work-area #f)) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'read-test-data run-id (list run-id test-id categorypatt))) + +(define (rmt:read-test-data-varpatt run-id test-id categorypatt varpatt #!key (work-area #f)) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'read-test-data-varpatt run-id (list run-id test-id categorypatt varpatt))) + +(define (rmt:get-data-info-by-id run-id test-data-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-data-info-by-id #f (list run-id test-data-id))) + +(define (rmt:testmeta-add-record testname) + (rmt:send-receive 'testmeta-add-record #f (list testname))) + +(define (rmt:testmeta-get-record testname) + (rmt:send-receive 'testmeta-get-record #f (list testname))) + +(define (rmt:testmeta-update-field test-name fld val) + (rmt:send-receive 'testmeta-update-field #f (list test-name fld val))) + +(define (rmt:test-data-rollup run-id test-id status) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'test-data-rollup run-id (list run-id test-id status))) + +(define (rmt:csv->test-data run-id test-id csvdata) + (assert (number? 
run-id) "FATAL: Run id required.") + (rmt:send-receive 'csv->test-data run-id (list run-id test-id csvdata))) + +;;====================================================================== +;; T A S K S +;;====================================================================== + +(define (rmt:tasks-find-task-queue-records target run-name test-patt state-patt action-patt) + (rmt:send-receive 'find-task-queue-records #f (list target run-name test-patt state-patt action-patt))) + +(define (rmt:tasks-add action owner target runname testpatt params) + (rmt:send-receive 'tasks-add #f (list action owner target runname testpatt params))) + +(define (rmt:tasks-set-state-given-param-key param-key new-state) + (rmt:send-receive 'tasks-set-state-given-param-key #f (list param-key new-state))) + +(define (rmt:tasks-get-last target runname) + (rmt:send-receive 'tasks-get-last #f (list target runname))) + +;;====================================================================== +;; N O S Y N C D B +;;====================================================================== + +(define (rmt:no-sync-set var val) + (rmt:send-receive 'no-sync-set #f `(,var ,val))) + +(define (rmt:no-sync-get/default var default) + (rmt:send-receive 'no-sync-get/default #f `(,var ,default))) + +(define (rmt:no-sync-del! var) + (rmt:send-receive 'no-sync-del! 
#f `(,var))) + +(define (rmt:no-sync-get-lock keyname) + (rmt:send-receive 'no-sync-get-lock #f `(,keyname))) + +;; process registration + +(define (rmt:register-process host port pid starttime status purpose dbname mtversion) + (rmt:send-receive 'register-process #f (list host port pid starttime status purpose dbname mtversion))) + +(define (rmt:set-process-done host pid reason) + (rmt:send-receive 'set-process-done #f (list host pid reason))) + +(define (rmt:set-process-status host pid newstatus) + (rmt:send-receive 'set-process-status #f (list host pid newstatus))) + +(define (rmt:get-process-options purpose dbname) ;; sends the 'get-process-options request with (purpose dbname); previously called itself with three arguments + (rmt:send-receive 'get-process-options #f (list purpose dbname))) + +;;====================================================================== +;; A R C H I V E S +;;====================================================================== + +(define (rmt:archive-get-allocations testname itempath dneeded) + (rmt:send-receive 'archive-get-allocations #f (list testname itempath dneeded))) + +(define (rmt:archive-register-block-name bdisk-id archive-path) + (rmt:send-receive 'archive-register-block-name #f (list bdisk-id archive-path))) + +(define (rmt:archive-allocate-testsuite/area-to-block block-id testsuite-name areakey) + (rmt:send-receive 'archive-allocate-test-to-block #f (list block-id testsuite-name areakey))) + +(define (rmt:archive-register-disk bdisk-name bdisk-path df) + (rmt:send-receive 'archive-register-disk #f (list bdisk-name bdisk-path df))) + +(define (rmt:test-set-archive-block-id run-id test-id archive-block-id) + (assert (number? 
run-id) "FATAL: Run id required.") + (rmt:send-receive 'test-set-archive-block-id run-id (list run-id test-id archive-block-id))) + +(define (rmt:test-get-archive-block-info archive-block-id) + (rmt:send-receive 'test-get-archive-block-info #f (list archive-block-id))) + +(define (rmtmod:calc-ro-mode runremote *toppath*) + (case (rmt:transport-mode) + ((http) + (if (and runremote + (remote-ro-mode-checked runremote)) + (remote-ro-mode runremote) + (let* ((mtcfgfile (conc *toppath* "/megatest.config")) + (ro-mode (not (file-write-access? mtcfgfile)))) ;; TODO: use dbstruct or runremote to figure this out in future + (if runremote + (begin + (remote-ro-mode-set! runremote ro-mode) + (remote-ro-mode-checked-set! runremote #t) + ro-mode) + ro-mode)))) + ((tcp) + (if (and runremote + (tt-ro-mode-checked runremote)) + (tt-ro-mode runremote) + (let* ((mtcfgfile (conc *toppath* "/megatest.config")) + (ro-mode (not (file-write-access? mtcfgfile)))) ;; TODO: use dbstruct or runremote to figure this out in future + (if runremote + (begin + (tt-ro-mode-set! runremote ro-mode) + (tt-ro-mode-checked-set! runremote #t) + ro-mode) + ro-mode)))))) + + +;;====================================================================== +;; Maintenance +;;====================================================================== + +(define (rmt:find-and-mark-incomplete run-id ovr-deadtime) + (let* ((cfg-deadtime (configf:lookup-number *configdat* "setup" "deadtime")) + (test-stats-update-period (configf:lookup-number *configdat* "setup" "test-stats-update-period"))) + (rmt:find-and-mark-incomplete-engine run-id ovr-deadtime cfg-deadtime test-stats-update-period) + ;;call end of eud of run detection for posthook + (launch:end-of-run-check run-id))) + +;;====================================================================== +;; S U P P O R T F U N C T I O N S +;;====================================================================== + +(define (rmt:on-homehost? 
runremote) + (let* ((hh-dat (remote-hh-dat runremote))) + (if (pair? hh-dat) + (cdr hh-dat) + (begin + (debug:print-info 0 *default-log-port* "hh-dat="hh-dat) + #f)))) + +(define (make-and-init-remote areapath) + (case (rmt:transport-mode) + ((http)(make-remote)) + ((tcp) (tt:make-remote areapath)) + (else #f))) + +;; how to make area-dat +(define (rmt:set-ttdat areapath ttdat) + (if ttdat + ttdat + (if *ttdat* + *ttdat* + (begin + (debug:print-info 2 *default-log-port* "rmt:set-ttdat: Initialize new ttdat") + (let* ((newremote (make-and-init-remote areapath))) + (set! *ttdat* newremote) + newremote + ) + ) + ) + ) +) ) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -16,20 +16,18 @@ ;; along with Megatest. If not, see . ;; (declare (unit server)) -(declare (uses commonmod)) -(declare (uses configfmod)) -(declare (uses debugprint)) (declare (uses common)) (declare (uses db)) (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. -;; (declare (uses synchash)) -;;(declare (uses rpc-transport)) +(declare (uses debugprint)) +(declare (uses commonmod)) +(declare (uses configfmod)) +(declare (uses rmtmod)) (declare (uses launch)) -;; (declare (uses daemon)) (declare (uses mtargs)) (use (srfi 18) extras s11n) (use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) (use directory-utils posix-extras matchable utils) @@ -41,727 +39,5 @@ (prefix mtargs args:)) (include "common_records.scm") (include "db_records.scm") -(define (server:make-server-url hostport) - (if (not hostport) - #f - (conc "http://" (car hostport) ":" (cadr hostport)))) - -(define *server-loop-heart-beat* (current-seconds)) - -;;====================================================================== -;; P K T S S T U F F -;;====================================================================== - -;; ??? 
- -;;====================================================================== -;; P K T S S T U F F -;;====================================================================== - -;; ??? - -;;====================================================================== -;; S E R V E R -;;====================================================================== - -;; Call this to start the actual server -;; - -;;====================================================================== -;; S E R V E R U T I L I T I E S -;;====================================================================== - -;; Get the transport -(define (server:get-transport) - (if *transport-type* - *transport-type* - (let ((ttype (string->symbol - (or (args:get-arg "-transport") - (configf:lookup *configdat* "server" "transport") - "rpc")))) - (set! *transport-type* ttype) - ttype))) - -;; Generate a unique signature for this server -(define (server:mk-signature) - (message-digest-string (md5-primitive) - (with-output-to-string - (lambda () - (write (list (current-directory) - (current-process-id) - (argv))))))) - -(define (server:get-client-signature) - (if *my-client-signature* *my-client-signature* - (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic - (set! *my-client-signature* sig) - *my-client-signature*))) - -(define (server:get-server-id) - (if *server-id* *server-id* - (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic - (set! 
*server-id* sig) - *server-id*))) - -;; ;; When using zmq this would send the message back (two step process) -;; ;; with spiffy or rpc this simply returns the return data to be returned -;; ;; -;; (define (server:reply return-addr query-sig success/fail result) -;; (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result) -;; ;; (send-message pubsock target send-more: #t) -;; ;; (send-message pubsock -;; (case (server:get-transport) -;; ((rpc) (db:obj->string (vector success/fail query-sig result))) -;; ((http) (db:obj->string (vector success/fail query-sig result))) -;; ((fs) result) -;; (else -;; (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) -;; result))) - -;; Given an area path, start a server process ### NOTE ### > file 2>&1 -;; if the target-host is set -;; try running on that host -;; incidental: rotate logs in logs/ dir. -;; -(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area - (let* ((testsuite (common:get-testsuite-name)) - (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) - (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") - "")) - (cmdln (conc (common:get-megatest-exe) - " -server - ";; (or target-host "-") - (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") - " -daemonize " - "") - ;; " -log " logfile - " -m testsuite:" testsuite - " " profile-mode - )) ;; (conc " >> " logfile " 2>&1 &"))))) - (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? - (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) - ;; we want the remote server to start in *toppath* so push there - (push-directory areapath) - (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") - (thread-start! 
log-rotate) - - ;; host.domain.tld match host? - ;; (if (and target-host - ;; ;; look at target host, is it host.domain.tld or ip address and does it - ;; ;; match current ip or hostname - ;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) - ;; (not (equal? curr-ip target-host))) - ;; (begin - ;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) - ;; (setenv "TARGETHOST" target-host))) - ;; - (setenv "TARGETHOST_LOGF" logfile) - (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time - (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) - (system (conc "nbfake " cmdln)) - (unsetenv "TARGETHOST_LOGF") - ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) - (thread-join! log-rotate) - (pop-directory))) - -;; given a path to a server log return: host port startseconds server-id -;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let -;; example of what it's looking for in the log file: -;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 - -(define (server:logf-get-start-info logf) - (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id - (dbprep-rx (regexp "^SERVER: dbprep")) - (dbprep-found 0) - (bad-dat (list #f #f #f #f #f))) - (handle-exceptions - exn - (begin - ;; WARNING: this is potentially dangerous to blanket ignore the errors - (if (file-exists? 
logf) - (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn)) - bad-dat) ;; no idea what went wrong, call it a bad server - (with-input-from-file - logf - (lambda () - (let loop ((inl (read-line)) - (lnum 0)) - (if (not (eof-object? inl)) - (let ((mlst (string-match server-rx inl)) - (dbprep (string-match dbprep-rx inl))) - (if dbprep (set! dbprep-found 1)) - (if (not mlst) - (if (< lnum 500) ;; give up if more than 500 lines of server log read - (loop (read-line)(+ lnum 1)) - (begin - (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) - bad-dat)) - (match mlst - ((_ host port start server-id pid) - (list host - (string->number port) - (string->number start) - server-id - (string->number pid))) - (else - (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) - bad-dat)))) - (begin - (if dbprep-found - (begin - (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) - (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) - bad-dat)))))))) - -;; ;; get a list of servers from the log files, with all relevant data -;; ;; ( mod-time host port start-time pid ) -;; ;; -;; (define (server:get-list areapath #!key (limit #f)) -;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) -;; (day-seconds (* 24 60 60))) -;; ;; if the directory exists continue to get the list -;; ;; otherwise attempt to create the logs dir and then -;; ;; continue -;; (if (if (directory-exists? (conc areapath "/logs")) -;; '() -;; (if (file-write-access? 
areapath) -;; (begin -;; (condition-case -;; (create-directory (conc areapath "/logs") #t) -;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) -;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) -;; (directory-exists? (conc areapath "/logs"))) -;; '())) -;; -;; ;; Get the list of server logs. -;; (let* ( -;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. -;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) -;; (server-logs (glob (conc areapath "/logs/server-*-*.log"))) -;; (num-serv-logs (length server-logs))) -;; (if (or (null? server-logs) (= num-serv-logs 0)) -;; (let () -;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) -;; '() -;; ) -;; (let loop ((hed (string-chomp (car server-logs))) -;; (tal (cdr server-logs)) -;; (res '())) -;; (let* ((mod-time (handle-exceptions -;; exn -;; (begin -;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) -;; (current-seconds)) ;; 0 -;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted -;; (down-time (- (current-seconds) mod-time)) -;; (serv-dat (if (or (< num-serv-logs 10) -;; (< down-time 900)) ;; day-seconds)) -;; (server:logf-get-start-info hed) -;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at -;; (serv-rec (cons mod-time serv-dat)) -;; (fmatch (string-match fname-rx hed)) -;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) -;; (new-res (if (null? 
serv-dat) -;; res -;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let -;; (if (null? tal) -;; (if (and limit -;; (> (length new-res) limit)) -;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work -;; new-res) -;; (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) - -#;(define (server:get-num-alive srvlst) - (let ((num-alive 0)) - (for-each - (lambda (server) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn)) - (match-let (((mod-time host port start-time server-id pid) - server)) - (let* ((uptime (- (current-seconds) mod-time)) - (runtime (if start-time - (- mod-time start-time) - 0))) - (if (< uptime 5)(set! num-alive (+ num-alive 1))))))) - srvlst) - num-alive)) - -;; ;; given a list of servers get a list of valid servers, i.e. at least -;; ;; 10 seconds old, has started and is less than 1 hour old and is -;; ;; active (i.e. mod-time < 10 seconds -;; ;; -;; ;; mod-time host port start-time pid -;; ;; -;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off -;; ;; and servers should stick around for about two hours or so. -;; ;; -;; (define (server:get-best srvlst) -;; (let* ((nums (server:get-num-servers)) -;; (now (current-seconds)) -;; (slst (sort -;; (filter (lambda (rec) -;; (if (and (list? 
rec) -;; (> (length rec) 2)) -;; (let ((start-time (list-ref rec 3)) -;; (mod-time (list-ref rec 0))) -;; ;; (print "start-time: " start-time " mod-time: " mod-time) -;; (and start-time mod-time -;; (> (- now start-time) 0) ;; been running at least 0 seconds -;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds -;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set -;; (< (- now start-time) -;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) -;; 180) -;; (random 360)))) ;; under one hour running time +/- 180 -;; )) -;; #f)) -;; srvlst) -;; (lambda (a b) -;; (< (list-ref a 3) -;; (list-ref b 3)))))) -;; (if (> (length slst) nums) -;; (take slst nums) -;; slst))) - -;; ;; switch from server:get-list to server:get-servers-info -;; ;; -;; (define (server:get-first-best areapath) -;; (let ((srvrs (server:get-best (server:get-list areapath)))) -;; (if (and srvrs -;; (not (null? srvrs))) -;; (car srvrs) -;; #f))) -;; -;; (define (server:get-rand-best areapath) -;; (let ((srvrs (server:get-best (server:get-list areapath)))) -;; (if (and (list? srvrs) -;; (not (null? srvrs))) -;; (let* ((len (length srvrs)) -;; (idx (random len))) -;; (list-ref srvrs idx)) -;; #f))) - -(define (server:record->id servr) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn) - #f) - (match-let (((host port start-time server-id pid) - servr)) - (if server-id - server-id - #f)))) - -(define (server:record->url servr) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn) - #f) - (match-let (((host port start-time server-id pid) - servr)) - (if (and host port) - (conc host ":" port) - #f)))) - - -;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough. -;; if it is old enough, overwrite it and wait 0.25 seconds. 
-;; if it then has the wrong server key, wait + 1 and call this function recursively. -;; -#;(define (server:wait-for-server-start-last-flag areapath) - (let* ((start-flag (conc areapath "/logs/server-start-last")) - ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) - (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) - (server-key (conc (get-host-name) "-" (current-process-id)))) - (if (file-exists? start-flag) - (let* ((fmodtime (file-modification-time start-flag)) - (delta (- (current-seconds) fmodtime)) - (old-enough (> delta idletime)) - (new-server-key "")) - ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than seconds, and the new file still has the same server key as you just wrote, return #t. - ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process. - (if (and old-enough - (begin - (debug:print-info 2 *default-log-port* "Writing " start-flag) - (with-output-to-file start-flag (lambda () (print server-key))) - (thread-sleep! 0.25) - (set! new-server-key (with-input-from-file start-flag (lambda () (read-line)))) - (equal? server-key new-server-key))) - #t - ;; If either of the above conditions is not true, print a "Gating server start" message, wait + 1, then call this function recursively. - (begin - (debug:print-info 0 *default-log-port* "Gating server start, last start: " - (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) - - (thread-sleep! 
( + 1 idletime)) - (server:wait-for-server-start-last-flag areapath))))))) - -;; oldest server alive determines host then choose random of youngest -;; five servers on that host -;; -(define (server:get-servers-info areapath) - ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") - (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo"))) - (if (not (file-exists? servinfodir)) - (create-directory servinfodir)) - (let* ((allfiles (glob (conc servinfodir"/*"))) - (res (make-hash-table))) - (for-each - (lambda (f) - (let* ((hostport (pathname-strip-directory f)) - (serverdat (server:logf-get-start-info f))) - (match serverdat - ((host port start server-id pid) - (if (and host port start server-id pid) - (hash-table-set! res hostport serverdat) - (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))) - (else - (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))))) - allfiles) - res))) - -;; check the .servinfo directory, are there other servers running on this -;; or another host? 
-;; -;; returns #t => ok to start another server -;; #f => not ok to start another server -;; -(define (server:minimal-check areapath) - (server:clean-up-old areapath) - (let* ((srvdir (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo")) - (servrs (glob (conc srvdir"/*"))) - (thishostip (server:get-best-guess-address (get-host-name))) - (thisservrs (glob (conc srvdir"/"thishostip":*"))) - (homehostinf (server:choose-server areapath 'homehost)) - (havehome (car homehostinf)) - (wearehome (cdr homehostinf))) - (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome - ", numservers: "(length thisservrs)) - (cond - ((not havehome) #t) ;; no homehost yet, go for it - ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another - ((and havehome (not wearehome)) #f) ;; we are not the home host - ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running - (else - (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs) - #t)))) - - -(define server-last-start 0) - - -;; oldest server alive determines host then choose random of youngest -;; five servers on that host -;; -;; mode: -;; best - get best server (random of newest five) -;; home - get home host based on oldest server -;; info - print info -(define (server:choose-server areapath #!optional (mode 'best)) - ;; age is current-starttime - ;; find oldest alive - ;; 1. sort by age ascending and ping until good - ;; find alive rand from youngest - ;; 1. sort by age descending - ;; 2. take five - ;; 3. check alive, discard if not and repeat - ;; first we clean up old server files - (assert (eq? 
(rmt:transport-mode) 'http) "FATAL: server:run called with rmt:transport-mode="(rmt:transport-mode)) - (server:clean-up-old areapath) - (let* ((since-last (- (current-seconds) server-last-start)) - (server-start-delay 10)) - (if ( < (- (current-seconds) server-last-start) 10 ) - (begin - (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) - (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") - (thread-sleep! server-start-delay) - ) - (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) - ) - ) - (let* ((serversdat (server:get-servers-info areapath)) - (servkeys (hash-table-keys serversdat)) - (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last - (sort servkeys ;; list of "host:port" - (lambda (a b) - (>= (list-ref (hash-table-ref serversdat a) 2) - (list-ref (hash-table-ref serversdat b) 2)))) - '()))) - (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat) - (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys) - (if (not (null? by-time-asc)) - (let* ((oldest (last by-time-asc)) - (oldest-dat (hash-table-ref serversdat oldest)) - (host (list-ref oldest-dat 0)) - (all-valid (filter (lambda (x) - (equal? host (list-ref (hash-table-ref serversdat x) 0))) - by-time-asc)) - (best-ten (lambda () - (if (> (length all-valid) 11) - (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out - (if (> (length all-valid) 8) - (drop-right all-valid 1) - all-valid)))) - (names->dats (lambda (names) - (map (lambda (x) - (hash-table-ref serversdat x)) - names))) - (am-home? (lambda () - (let* ((currhost (get-host-name)) - (bestadrs (server:get-best-guess-address currhost))) - (or (equal? host currhost) - (equal? 
host bestadrs)))))) - (case mode - ((info) - (debug:print 0 *default-log-port* "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) - (debug:print 0 *default-log-port* "youngest: "(hash-table-ref serversdat (car all-valid)))) - ((home) host) - ((homehost) (cons host (am-home?))) ;; shut up old code - ((home?) (am-home?)) - ((best-ten)(names->dats (best-ten))) - ((all-valid)(names->dats all-valid)) - ((best) (let* ((best-ten (best-ten)) - (len (length best-ten))) - (hash-table-ref serversdat (list-ref best-ten (random len))))) - ((count)(length all-valid)) - (else - (debug:print 0 *default-log-port* "ERROR: invalid command "mode) - #f))) - (begin - (server:run areapath) - (set! server-last-start (current-seconds)) - ;; (thread-sleep! 3) - (case mode - ((homehost) (cons #f #f)) - (else #f)))))) - -(define (server:get-servinfo-dir areapath) - (let* ((spath (conc areapath"/.servinfo"))) - (if (not (file-exists? spath)) - (create-directory spath #t)) - spath)) - -(define (server:clean-up-old areapath) - ;; any server file that has not been touched in ten minutes is effectively dead - (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*")))) - (for-each - (lambda (sfile) - (let* ((modtime (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile) - (current-seconds)) - (file-modification-time sfile)))) - (if (and (number? modtime) - (> (- (current-seconds) modtime) - 600)) - (begin - (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.") - (handle-exceptions - exn - (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile) - (delete-file sfile)))))) - sfiles))) - -;; would like to eventually get rid of this -;; -(define (common:on-homehost?) - (if (eq? (rmt:transport-mode) 'http) - (server:choose-server *toppath* 'home?) 
- #t)) ;; there is no homehost for tcp and nfs is always on home so #t should work - -;; kind start up of server, wait before allowing another server for a given -;; area to be launched -;; -(define (server:kind-run areapath) - ;; look for $MT_RUN_AREA_HOME/logs/server-start-last - ;; and wait for it to be at least seconds old - ;; (server:wait-for-server-start-last-flag areapath) - (let loop () - (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2) - (begin - (if (common:low-noise-print 30 "our-host-load") - (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server.")) - (loop)))) - (if (< (server:choose-server areapath 'count) 20) - (server:run areapath)) - #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? - (let* ((lock-file (conc areapath "/logs/server-start.lock"))) - (let* ((start-flag (conc areapath "/logs/server-start-last"))) - (common:simple-file-lock-and-wait lock-file expire-time: 25) - (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) - (system (conc "touch " start-flag)) ;; lazy but safe - (server:run areapath) - (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". - (common:simple-file-release-lock lock-file))) - (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another."))) - -;; this one seems to be the general entry point -;; -(define (server:start-and-wait areapath #!key (timeout 60)) - (let ((give-up-time (+ (current-seconds) timeout))) - (let loop ((server-info (server:check-if-running areapath)) - (try-num 0)) - (if (or server-info - (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. 
- (server:record->url server-info) - (let* ( (servers (server:choose-server areapath 'all-valid)) - (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0))) - (if (and (> try-num 0) ;; first time through simply wait a little while then try again - (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one - (server:run areapath)) - (thread-sleep! 5) - (loop (server:check-if-running areapath) - (+ try-num 1))))))) - -(define (server:get-num-servers #!key (numservers 2)) - (let ((ns (string->number - (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) - (or ns numservers))) - -;; no longer care if multiple servers are started by accident. older servers will drop off in time. -;; -(define (server:check-if-running areapath) ;; #!key (numservers "2")) - (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed - (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath)))) - (if (or (and servers - (null? servers)) - (not servers)) - ;; (and (list? servers) - ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers - #f - (let loop ((hed (car servers)) - (tal (cdr servers))) - (let ((res (server:check-server hed))) - (if res - hed - (if (null? 
tal) - #f - (loop (car tal)(cdr tal))))))))) - -;; ping the given server -;; -(define (server:check-server server-record) - (let* ((server-url (server:record->url server-record)) - (server-id (server:record->id server-record)) - (res (server:ping server-url server-id))) - (if res - server-url - #f))) - -(define (server:kill servr) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn) - #f) - (match-let (((hostname port start-time server-id pid) - servr)) - (tasks:kill-server hostname pid)))) - -;; ;; called in megatest.scm, host-port is string hostname:port -;; ;; -;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running -;; ;; in the same process as the server. -;; ;; -;; (define (server:ping host:port server-id #!key (do-exit #f)) -;; (let* ((host-port (cond -;; ((string? host:port) -;; (let ((slst (string-split host:port ":"))) -;; (if (eq? (length slst) 2) -;; (list (car slst)(string->number (cadr slst))) -;; #f))) -;; (else -;; #f)))) -;; (cond -;; ((and (list? host-port) -;; (eq? (length host-port) 2)) -;; (let* ((myrunremote (make-and-init-remote *toppath*)) -;; (iface (car host-port)) -;; (port (cadr host-port)) -;; (server-dat (client:connect iface port server-id myrunremote)) -;; (login-res (rmt:login-no-auto-client-setup myrunremote))) -;; (http-transport:close-connections myrunremote) -;; (if (and (list? 
login-res) -;; (car login-res)) -;; (begin -;; ;; (print "LOGIN_OK") -;; (if do-exit (exit 0)) -;; #t) -;; (begin -;; ;; (print "LOGIN_FAILED") -;; (if do-exit (exit 1)) -;; #f)))) -;; (else -;; (if host:port -;; (debug:print 0 *default-log-port* "ERROR: bad host:port "host:port)) -;; (if do-exit -;; (exit 1) -;; #f))))) -;; -;; ;; run ping in separate process, safest way in some cases -;; ;; -;; (define (server:ping-server ifaceport) -;; (with-input-from-pipe -;; (conc (common:get-megatest-exe) " -ping " ifaceport) -;; (lambda () -;; (let loop ((inl (read-line)) -;; (res "NOREPLY")) -;; (if (eof-object? inl) -;; (case (string->symbol res) -;; ((NOREPLY) #f) -;; ((LOGIN_OK) #t) -;; (else #f)) -;; (loop (read-line) inl)))))) -;; -;; ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). -;; ;; -;; (define (server:login toppath) -;; (lambda (toppath) -;; (set! *db-last-access* (current-seconds)) ;; might not be needed. -;; (if (equal? *toppath* toppath) -;; #t -;; #f))) - -;; timeout is hms string: 1h 5m 3s, default is 1 minute -;; This is currently broken. Just use the number of hours with no unit. -;; Default is 600 seconds. -;; -(define (server:expiration-timeout) - (let* ((tmo (configf:lookup *configdat* "server" "timeout"))) - (if (string? tmo) - (let* ((num (string->number tmo))) - (if num - (* 3600 num) - (common:hms-string->seconds tmo))) - 600 ;; this is the default - ))) - -(define (server:get-best-guess-address hostname) - (let ((res #f)) - (for-each - (lambda (adr) - (if (not (eq? (u8vector-ref adr 0) 127)) - (set! res adr))) - ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME - (vector->list (hostinfo-addresses (hostname->hostinfo hostname)))) - (string-intersperse - (map number->string - (u8vector->list - (if res res (hostname->ip hostname)))) "."))) - -;; moving this here as it needs access to db and cannot be in common. 
-;; - -(define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f)) - (debug:print "WARNING: bruteforce-syncer is called but has been disabled!") - (lambda () - (debug:print "WARNING: bruteforce-syncer is called but has been disabled!"))) -