Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -37,11 +37,12 @@ diff-report.scm cgisetup/models/pgdb.scm # module source files MSRCFILES = dbfile.scm debugprint.scm mtargs.scm commonmod.scm dbmod.scm \ tcp-transportmod.scm rmtmod.scm portlogger.scm apimod.scm \ - configfmod.scm processmod.scm servermod.scm + configfmod.scm processmod.scm servermod.scm megatestmod.scm \ + stml2.scm transport-mode.scm : transport-mode.scm.template cp transport-mode.scm.template transport-mode.scm dashboard-transport-mode.scm : dashboard-transport-mode.scm.template @@ -56,15 +57,15 @@ mofiles/configfmod.o : mofiles/processmod.o mofiles/processmod.o : mofiles/commonmod.o mofiles/servermod.o : mofiles/commonmod.o mofiles/dbfile.o : \ - mofiles/debugprint.o mofiles/commonmod.o + mofiles/debugprint.o mofiles/commonmod.o mofiles/configfmod.o mofiles/apimod.o : mofiles/commonmod.o mofiles/tcp-transportmod.o mofiles/configfmod.o mofiles/dbmod.o : mofiles/dbfile.o mofiles/api.o : mofiles/apimod.o -mofiles/commonmod.o : mofiles/debugprint.o +mofiles/commonmod.o : mofiles/debugprint.o mofiles/stml2.o configf.o : commonmod.import.o mofiles/dbfile.o : mofiles/debugprint.o mofiles/rmtmod.o mofiles/dbmod.o : mofiles/dbfile.o mofiles/commonmod.o mofiles/debugprint.o db.o : mofiles/dbmod.o mofiles/dbfile.o mofiles/debugprint.o : mofiles/mtargs.o Index: apimod.scm ================================================================== --- apimod.scm +++ apimod.scm @@ -281,11 +281,12 @@ (loop))))) (let loop ((thnum 0)) (thread-start! (make-thread thproc (conc "queue-thread-" thnum))) (thread-sleep! 0.05) (if (< thnum 20) - (loop (+ thnum 1)))))) + (loop (+ thnum 1)) + (debug:print 0 *default-log-port* "Started "thnum" api threads"))))) (define (api:tcp-dispatch-request-make-handler-core dbstruct api:dispatch-request) (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.") (if (not *server-signature*) (set! *server-signature* (tt:mk-signature *toppath*))) Index: commonmod.scm ================================================================== --- commonmod.scm +++ commonmod.scm @@ -18,10 +18,12 @@ ;;====================================================================== (declare (unit commonmod)) (declare (uses debugprint)) +(declare (uses mtargs)) +(declare (uses stml2)) (use srfi-69) (module commonmod * @@ -30,11 +32,12 @@ (cond-expand (chicken-4 (import chicken ports - + (prefix base64 base64:) + (prefix sqlite3 sqlite3:) data-structures extras files matchable @@ -43,16 +46,21 @@ pathname-expand posix posix-extras regex regex-case + sparse-vectors srfi-1 + srfi-13 srfi-18 srfi-69 typed-records - + z3 + debugprint + stml2 + (prefix mtargs args:) ) (use srfi-69)) (chicken-5 (import (prefix sqlite3 sqlite3:) ;; data-structures @@ -124,10 +132,230 @@ (begin (hash-table-set! *common:denoise* key currtime) #t) #f))) +;; GLOBALS + +;; CONTEXTS +(defstruct cxt + (taskdb #f) + (cmutex (make-mutex))) +;; (define *contexts* (make-hash-table)) +;; (define *context-mutex* (make-mutex)) + +;; ;; safe method for accessing a context given a toppath +;; ;; +;; (define (common:with-cxt toppath proc) +;; (mutex-lock! *context-mutex*) +;; (let ((cxt (hash-table-ref/default *contexts* toppath #f))) +;; (if (not cxt) +;; (set! cxt (let ((x (make-cxt)))(hash-table-set! *contexts* toppath x) x))) +;; (let ((cxt-mutex (cxt-mutex cxt))) +;; (mutex-unlock! *context-mutex*) +;; (mutex-lock! cxt-mutex) +;; (let ((res (proc cxt))) +;; (mutex-unlock! cxt-mutex) +;; res)))) + +;; A hash table that can be accessed by #{scheme ...} calls in +;; config files. Allows communicating between confgs +;; +(define *user-hash-data* (make-hash-table)) + +(define *db-keys* #f) + +(define *pkts-info* (make-hash-table)) ;; store stuff like the last parent here +(define *configinfo* #f) ;; raw results from setup, includes toppath and table from megatest.config +(define *runconfigdat* #f) ;; run configs data +(define *configdat* #f) ;; megatest.config data +(define *configstatus* #f) ;; status of data; 'fulldata : all processing done, #f : no data yet, 'partialdata : partial read done +;; (define *toppath* #f) ;; moved to commonmod +(define *already-seen-runconfig-info* #f) + +(define *test-meta-updated* (make-hash-table)) +(define *globalexitstatus* 0) ;; attempt to work around possible thread issues +(define *passnum* 0) ;; when running track calls to run-tests or similar +;; (define *alt-log-file* #f) ;; used by -log +;; (define *common:denoise* (make-hash-table)) ;; for low noise printing +(define *time-zero* (current-seconds)) ;; for the watchdog +(define *on-exit-procs* '()) ;; add procs to this list to be executed on exit +(define *default-area-tag* "local") + +;; DATABASE +;; task db +(define *task-db* #f) ;; (vector db path-to-db) +(define *db-access-allowed* #t) ;; flag to allow access +;; (define *db-access-mutex* (make-mutex)) ;; moved to dbfile +;; (define *db-transaction-mutex* (make-mutex)) +(define *db-cache-path* #f) +;; (define *db-with-db-mutex* (make-mutex)) +(define *db-api-call-time* (make-hash-table)) ;; hash of command => (list of times) + +;; SERVER +(define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg +(define *runremote* #f) ;; if set up for server communication this will hold +;; (define *max-cache-size* 0) +(define *logged-in-clients* (make-hash-table)) +(define *server-id* #f) +;; (define *server-info* #f) ;; good candidate for easily convert to non-global +(define *time-to-exit* #f) +(define *run-id* #f) +(define *server-kind-run* (make-hash-table)) +(define *home-host* #f) +;; (define *total-non-write-delay* 0) +(define *heartbeat-mutex* (make-mutex)) +;; (define *api-process-request-count* 0) +;; (define *max-api-process-requests* 0) +(define *server-overloaded* #f) + +;; client +(define *rmt-mutex* (make-mutex)) ;; remote access calls mutex + +;; RPC transport +(define *rpc:listener* #f) + +;; KEY info +(define *target* (make-hash-table)) ;; cache the target here; target is keyval1/keyval2/.../keyvalN +(define *keys* (make-hash-table)) ;; cache the keys here +(define *keyvals* (make-hash-table)) +(define *toptest-paths* (make-hash-table)) ;; cache toptest path settings here +(define *test-paths* (make-hash-table)) ;; cache test-id to test run paths here +(define *test-ids* (make-hash-table)) ;; cache run-id, testname, and item-path => test-id +(define *test-info* (make-hash-table)) ;; cache the test info records, update the state, status, run_duration etc. from testdat.db + +(define *run-info-cache* (make-hash-table)) ;; run info is stable, no need to reget +(define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set +(define *homehost-mutex* (make-mutex)) + +;; Miscellaneous +(define *triggers-mutex* (make-mutex)) ;; block overlapping processing of triggers +(define *numcpus-cache* (make-hash-table)) + +;; this plugs a hole in posix-extras in recent chicken versions > 4.9) +(let-values (( (chicken-release-number chicken-major-version) + (apply values + (map string->number + (take + (string-split (chicken-version) ".") + 2))))) + (let ((resolve-pathname-broken? + (or (> chicken-release-number 4) + (and (eq? 4 chicken-release-number) (> chicken-major-version 9))))) + (if resolve-pathname-broken? + (define ##sys#expand-home-path pathname-expand)))) + +(define (realpath x) (resolve-pathname (pathname-expand (or x "/dev/null")) )) + +(define (common:get-this-exe-fullpath #!key (argv (argv))) + (let* ((this-script + (cond + ((and (> (length argv) 2) + (string-match "^(.*/csi|csi)$" (car argv)) + (string-match "^-(s|ss|sx|script)$" (cadr argv))) + (caddr argv)) + (else (car argv)))) + (fullpath (realpath this-script))) + fullpath)) + +;;====================================================================== + +(define *common:this-exe-fullpath* (common:get-this-exe-fullpath)) +(define *common:this-exe-dir* (pathname-directory *common:this-exe-fullpath*)) +(define *common:this-exe-name* (pathname-strip-directory *common:this-exe-fullpath*)) + +(define (common:get-sync-lock-filepath) + (let* ((tmp-area (common:make-tmpdir-name *toppath* "")) + (lockfile (conc tmp-area "/megatest.db.lock"))) + lockfile)) + +(define *common:logpro-exit-code->status-sym-alist* + '( ( 0 . pass ) + ( 1 . fail ) + ( 2 . warn ) + ( 3 . check ) + ( 4 . waived ) + ( 5 . abort ) + ( 6 . skip ))) + +(define (common:logpro-exit-code->status-sym exit-code) + (or (alist-ref exit-code *common:logpro-exit-code->status-sym-alist*) 'fail)) + +(define (common:worse-status-sym ss1 ss2) + (let loop ((status-syms-remaining '(abort fail check skip warn waived pass))) + (cond + ((null? status-syms-remaining) + 'fail) + ((eq? (car status-syms-remaining) ss1) + ss1) + ((eq? (car status-syms-remaining) ss2) + ss2) + (else + (loop (cdr status-syms-remaining)))))) + +(define (common:steps-can-proceed-given-status-sym status-sym) + (if (member status-sym '(warn waived pass)) + #t + #f)) + +(define (status-sym->string status-sym) + (case status-sym + ((pass) "PASS") + ((fail) "FAIL") + ((warn) "WARN") + ((check) "CHECK") + ((waived) "WAIVED") + ((abort) "ABORT") + ((skip) "SKIP") + (else "FAIL"))) + +(define (common:logpro-exit-code->test-status exit-code) + (status-sym->string (common:logpro-exit-code->status-sym exit-code))) + +;; launching and hosts +(defstruct host + (reachable #f) + (last-update 0) + (last-used 0) + (last-cpuload 1)) + +(define *host-loads* (make-hash-table)) + +;; cache environment vars for each run here +(define *env-vars-by-run-id* (make-hash-table)) + +;; Testconfig and runconfig caches. +(define *testconfigs* (make-hash-table)) ;; test-name => testconfig +(define *runconfigs* (make-hash-table)) ;; target => runconfig + +;; This is a cache of pre-reqs met, don't re-calc in cases where called with same params less than +;; five seconds ago +(define *pre-reqs-met-cache* (make-hash-table)) + +;; cache of verbosity given string +;; +(define *verbosity-cache* (make-hash-table)) + +(define (common:clear-caches) + (set! *target* (make-hash-table)) + (set! *keys* (make-hash-table)) + (set! *keyvals* (make-hash-table)) + (set! *toptest-paths* (make-hash-table)) + (set! *test-paths* (make-hash-table)) + (set! *test-ids* (make-hash-table)) + (set! *test-info* (make-hash-table)) + (set! *run-info-cache* (make-hash-table)) + (set! *env-vars-by-run-id* (make-hash-table)) + (set! *test-id-cache* (make-hash-table))) + +;; Generic string database +(define sdb:qry #f) ;; (make-sdb:qry)) ;; 'init #f) +;; Generic path database +(define *fdb* #f) + +(define *last-launch* (current-seconds)) ;; use for throttling the launch rate. Would be better to use the db and last time of a test in LAUNCHED state. + ;; environment vars handy stuff from common.scm ;; (define getenv get-environment-variable) (define (safe-setenv key val) (if (or (substring-index "!" key) @@ -471,58 +699,10 @@ ;; get the normalized (i.e. load / numcpus) for *this* host ;; (define (get-normalized-cpu-load) (/ (commonmod:get-cpu-load)(get-current-host-cores))) -;;====================================================================== -;; testsuite and area utilites -;;====================================================================== - -(define (get-testsuite-name toppath configdat) - (or (lookup configdat "setup" "area-name") - (lookup configdat "setup" "testsuite") - (get-environment-variable "MT_TESTSUITE_NAME") - (if (string? toppath) - (pathname-file toppath) - #f))) - -(define (get-area-path-signature toppath #!optional (short #f)) - (let ((res (message-digest-string (md5-primitive) toppath))) - (if short - (substring res 0 4) - res))) - -(define (get-area-name configdat toppath #!optional (short #f)) - ;; look up my area name in areas table (future) - ;; generate auto name - (conc (get-area-path-signature toppath short) - "-" - (get-testsuite-name toppath configdat))) - -;; need generic find-record-with-var-nmatching-val -;; -(define (path->area-record cfgdat path) - (let* ((areadat (get-cfg-areas cfgdat)) - (all (filter (lambda (x) - (let* ((keyvals (cdr x)) - (pth (alist-ref 'path keyvals))) - (equal? path pth))) - areadat))) - (if (null? all) - #f - (car all)))) ;; return first match - -;; given a config return an alist of alists -;; area-name => data -;; -(define (get-cfg-areas cfgdat) - (let ((adat (get-section cfgdat "areas"))) - (map (lambda (entry) - `(,(car entry) . - ,(val->alist (cadr entry)))) - adat))) - ;;====================================================================== ;; time utils ;;====================================================================== (define (common:human-time) @@ -944,341 +1124,10 @@ ) ) -;; GLOBALS - -;; CONTEXTS -(defstruct cxt - (taskdb #f) - (cmutex (make-mutex))) -;; (define *contexts* (make-hash-table)) -;; (define *context-mutex* (make-mutex)) - -;; ;; safe method for accessing a context given a toppath -;; ;; -;; (define (common:with-cxt toppath proc) -;; (mutex-lock! *context-mutex*) -;; (let ((cxt (hash-table-ref/default *contexts* toppath #f))) -;; (if (not cxt) -;; (set! cxt (let ((x (make-cxt)))(hash-table-set! *contexts* toppath x) x))) -;; (let ((cxt-mutex (cxt-mutex cxt))) -;; (mutex-unlock! *context-mutex*) -;; (mutex-lock! cxt-mutex) -;; (let ((res (proc cxt))) -;; (mutex-unlock! cxt-mutex) -;; res)))) - -;; A hash table that can be accessed by #{scheme ...} calls in -;; config files. Allows communicating between confgs -;; -(define *user-hash-data* (make-hash-table)) - -(define *db-keys* #f) - -(define *pkts-info* (make-hash-table)) ;; store stuff like the last parent here -(define *configinfo* #f) ;; raw results from setup, includes toppath and table from megatest.config -(define *runconfigdat* #f) ;; run configs data -(define *configdat* #f) ;; megatest.config data -(define *configstatus* #f) ;; status of data; 'fulldata : all processing done, #f : no data yet, 'partialdata : partial read done -;; (define *toppath* #f) ;; moved to commonmod -(define *already-seen-runconfig-info* #f) - -(define *test-meta-updated* (make-hash-table)) -(define *globalexitstatus* 0) ;; attempt to work around possible thread issues -(define *passnum* 0) ;; when running track calls to run-tests or similar -;; (define *alt-log-file* #f) ;; used by -log -;; (define *common:denoise* (make-hash-table)) ;; for low noise printing -(define *default-log-port* (current-error-port)) -(define *time-zero* (current-seconds)) ;; for the watchdog -(define *on-exit-procs* '()) ;; add procs to this list to be executed on exit -(define *default-area-tag* "local") - -;; DATABASE -;; db access -(define *db-last-access* (current-seconds)) ;; last db access, used in server -;; (define *db-write-access* #t) -;; db sync -;; (define *db-last-sync* 0) ;; last time the sync to megatest.db happened -(define *db-sync-in-progress* #f) ;; if there is a sync in progress do not try to start another -;; (define *db-multi-sync-mutex* (make-mutex)) ;; protect access to *db-sync-in-progress*, *db-last-sync* -;; task db -(define *task-db* #f) ;; (vector db path-to-db) -(define *db-access-allowed* #t) ;; flag to allow access -;; (define *db-access-mutex* (make-mutex)) ;; moved to dbfile -;; (define *db-transaction-mutex* (make-mutex)) -(define *db-cache-path* #f) -;; (define *db-with-db-mutex* (make-mutex)) -(define *db-api-call-time* (make-hash-table)) ;; hash of command => (list of times) - -;; SERVER -(define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg -(define *runremote* #f) ;; if set up for server communication this will hold -;; (define *max-cache-size* 0) -(define *logged-in-clients* (make-hash-table)) -(define *server-id* #f) -;; (define *server-info* #f) ;; good candidate for easily convert to non-global -(define *time-to-exit* #f) -(define *run-id* #f) -(define *server-kind-run* (make-hash-table)) -(define *home-host* #f) -;; (define *total-non-write-delay* 0) -(define *heartbeat-mutex* (make-mutex)) -;; (define *api-process-request-count* 0) -;; (define *max-api-process-requests* 0) -(define *server-overloaded* #f) - -;; client -(define *rmt-mutex* (make-mutex)) ;; remote access calls mutex - -;; RPC transport -(define *rpc:listener* #f) - -;; KEY info -(define *target* (make-hash-table)) ;; cache the target here; target is keyval1/keyval2/.../keyvalN -(define *keys* (make-hash-table)) ;; cache the keys here -(define *keyvals* (make-hash-table)) -(define *toptest-paths* (make-hash-table)) ;; cache toptest path settings here -(define *test-paths* (make-hash-table)) ;; cache test-id to test run paths here -(define *test-ids* (make-hash-table)) ;; cache run-id, testname, and item-path => test-id -(define *test-info* (make-hash-table)) ;; cache the test info records, update the state, status, run_duration etc. from testdat.db - -(define *run-info-cache* (make-hash-table)) ;; run info is stable, no need to reget -(define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set -(define *homehost-mutex* (make-mutex)) - -;; Miscellaneous -(define *triggers-mutex* (make-mutex)) ;; block overlapping processing of triggers -(define *numcpus-cache* (make-hash-table)) - -;; this plugs a hole in posix-extras in recent chicken versions > 4.9) -(let-values (( (chicken-release-number chicken-major-version) - (apply values - (map string->number - (take - (string-split (chicken-version) ".") - 2))))) - (let ((resolve-pathname-broken? - (or (> chicken-release-number 4) - (and (eq? 4 chicken-release-number) (> chicken-major-version 9))))) - (if resolve-pathname-broken? - (define ##sys#expand-home-path pathname-expand)))) - -(define (realpath x) (resolve-pathname (pathname-expand (or x "/dev/null")) )) - -(define (common:get-this-exe-fullpath #!key (argv (argv))) - (let* ((this-script - (cond - ((and (> (length argv) 2) - (string-match "^(.*/csi|csi)$" (car argv)) - (string-match "^-(s|ss|sx|script)$" (cadr argv))) - (caddr argv)) - (else (car argv)))) - (fullpath (realpath this-script))) - fullpath)) - -;;====================================================================== - -(define *common:this-exe-fullpath* (common:get-this-exe-fullpath)) -(define *common:this-exe-dir* (pathname-directory *common:this-exe-fullpath*)) -(define *common:this-exe-name* (pathname-strip-directory *common:this-exe-fullpath*)) - -(define (common:get-sync-lock-filepath) - (let* ((tmp-area (common:make-tmpdir-name *toppath* "")) - (lockfile (conc tmp-area "/megatest.db.lock"))) - lockfile)) - -(define *common:logpro-exit-code->status-sym-alist* - '( ( 0 . pass ) - ( 1 . fail ) - ( 2 . warn ) - ( 3 . check ) - ( 4 . waived ) - ( 5 . abort ) - ( 6 . skip ))) - -(define (common:logpro-exit-code->status-sym exit-code) - (or (alist-ref exit-code *common:logpro-exit-code->status-sym-alist*) 'fail)) - -(define (common:worse-status-sym ss1 ss2) - (let loop ((status-syms-remaining '(abort fail check skip warn waived pass))) - (cond - ((null? status-syms-remaining) - 'fail) - ((eq? (car status-syms-remaining) ss1) - ss1) - ((eq? (car status-syms-remaining) ss2) - ss2) - (else - (loop (cdr status-syms-remaining)))))) - -(define (common:steps-can-proceed-given-status-sym status-sym) - (if (member status-sym '(warn waived pass)) - #t - #f)) - -(define (status-sym->string status-sym) - (case status-sym - ((pass) "PASS") - ((fail) "FAIL") - ((warn) "WARN") - ((check) "CHECK") - ((waived) "WAIVED") - ((abort) "ABORT") - ((skip) "SKIP") - (else "FAIL"))) - -(define (common:logpro-exit-code->test-status exit-code) - (status-sym->string (common:logpro-exit-code->status-sym exit-code))) - -;; -(defstruct remote - - ;; transport to be used - ;; http - use http-transport - ;; http-read-cached - use http-transport for writes but in-mem cached for reads - (rmode 'http) - (hh-dat (let ((res (or (server:choose-server *toppath* 'homehost) - (cons #f #f)))) - (assert (pair? res)(conc "FATAL: hh-dat should be a pair, got "res)) - res)) - (server-url #f) ;; (server:check-if-running *toppath*) #f)) - (server-id #f) - (server-info #f) ;; (if *toppath* (server:check-if-running *toppath*) #f)) - (last-server-check 0) ;; last time we checked to see if the server was alive - (connect-time (current-seconds)) ;; when we first connected - (last-access (current-seconds)) ;; last time we talked to server - ;; (conndat #f) ;; iface port api-uri api-url api-req seconds server-id - (server-timeout (server:expiration-timeout)) - (force-server #f) - (ro-mode #f) - (ro-mode-checked #f) ;; flag that indicates we have checked for ro-mode - - ;; conndat stuff - (iface #f) ;; TODO: Consolidate this data with server-url and server-info above - (port #f) - (api-url #f) - (api-uri #f) - (api-req #f)) - -;; launching and hosts -(defstruct host - (reachable #f) - (last-update 0) - (last-used 0) - (last-cpuload 1)) - -(define *host-loads* (make-hash-table)) - -;; cache environment vars for each run here -(define *env-vars-by-run-id* (make-hash-table)) - -;; Testconfig and runconfig caches. -(define *testconfigs* (make-hash-table)) ;; test-name => testconfig -(define *runconfigs* (make-hash-table)) ;; target => runconfig - -;; This is a cache of pre-reqs met, don't re-calc in cases where called with same params less than -;; five seconds ago -(define *pre-reqs-met-cache* (make-hash-table)) - -;; cache of verbosity given string -;; -(define *verbosity-cache* (make-hash-table)) - -(define (common:clear-caches) - (set! *target* (make-hash-table)) - (set! *keys* (make-hash-table)) - (set! *keyvals* (make-hash-table)) - (set! *toptest-paths* (make-hash-table)) - (set! *test-paths* (make-hash-table)) - (set! *test-ids* (make-hash-table)) - (set! *test-info* (make-hash-table)) - (set! *run-info-cache* (make-hash-table)) - (set! *env-vars-by-run-id* (make-hash-table)) - (set! *test-id-cache* (make-hash-table))) - -;; Generic string database -(define sdb:qry #f) ;; (make-sdb:qry)) ;; 'init #f) -;; Generic path database -(define *fdb* #f) - -(define *last-launch* (current-seconds)) ;; use for throttling the launch rate. Would be better to use the db and last time of a test in LAUNCHED state. - -;;====================================================================== -;; V E R S I O N -;;====================================================================== - -(define (common:get-full-version) - (conc megatest-version "-" megatest-fossil-hash)) - -(define (common:version-signature) - (conc megatest-version "-" (substring megatest-fossil-hash 0 4))) - -;;====================================================================== -;; from metadat lookup MEGATEST_VERSION -;; -(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB - (rmt:get-var "MEGATEST_VERSION")) - -(define (common:get-last-run-version-number) - (string->number - (substring (common:get-last-run-version) 0 6))) - -(define (common:set-last-run-version) - (rmt:set-var "MEGATEST_VERSION" (common:version-signature))) - -;;====================================================================== -;; postive number if megatest version > db version -;; negative number if megatest version < db version -(define (common:version-db-delta) - (- megatest-version (common:get-last-run-version-number))) - -(define (common:version-changed?) - (not (equal? (common:get-last-run-version) - (common:version-signature)))) - - -;; From 1.70 to 1.80, db's are compatible. - -(define (common:api-changed?) - (let* ( - (megatest-major-version (substring (->string megatest-version) 0 4)) - (run-major-version (substring (conc (common:get-last-run-version)) 0 4)) - ) - (and (not (equal? megatest-major-version "1.80")) - (not (equal? megatest-major-version megatest-run-version))) - ) -) - -;;====================================================================== -;; Move me elsewhere ... -;; RADT => Why do we meed the version check here, this is called only if version misma -;; -(define (common:cleanup-db dbstruct #!key (full #f)) - (case (rmt:transport-mode) - ((http) - (apply db:multi-db-sync - dbstruct - 'schema - 'killservers - 'adj-target - 'new2old - '(dejunk) - )) - ((tcp nfs) - (apply db:multi-db-sync - dbstruct - 'schema - 'killservers - 'adj-target - 'new2old - '(dejunk) - ))) - (if (common:api-changed?) - (common:set-last-run-version))) - (define (common:snapshot-file filepath #!key (subdir ".") ) (if (file-exists? filepath) (let* ((age-sec (lambda (file) (if (file-exists? file) (- (current-seconds) (file-modification-time file)) @@ -1355,144 +1204,10 @@ (copy daysfile wksfile) (copy hrsfile daysfile)) #t) #f)) -;;====================================================================== -;; Rotate logs, logic: -;; if > 500k and older than 1 week: -;; remove previous compressed log and compress this log -;; WARNING: This proc operates assuming that it is in the directory above the -;; logs directory you wish to log-rotate. -;; -(define (common:rotate-logs) - (let* ((all-files (make-hash-table)) - (stats (make-hash-table)) - (inc-stat (lambda (key) - (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1)))) - (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "600")))) ;; name -> age - (if (not (directory-exists? "logs"))(create-directory "logs")) - (directory-fold - (lambda (file rem) - (handle-exceptions - exn - (begin - (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore. exn=" exn) - (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - ;; (print-call-chain (current-error-port)) ;; - ) - (let* ((fullname (conc "logs/" file)) - (mod-time (file-modification-time fullname)) - (file-age (- (current-seconds) mod-time)) - (file-old (> file-age (* 48 60 60))) - (file-big (> (file-size fullname) 200000))) - (hash-table-set! all-files file mod-time) - (if (or (and (string-match "^.*.log" file) - file-old - file-big) - (and (string-match "^server-.*.log" file) - file-old)) - (let ((gzfile (conc fullname ".gz"))) - (if (common:file-exists? gzfile) - (begin - (debug:print-info 0 *default-log-port* "removing " gzfile) - (delete-file* gzfile) - (hash-table-delete! all-files gzfile) ;; needed? - )) - (debug:print-info 0 *default-log-port* "compressing " file) - (system (conc "gzip " fullname)) - (inc-stat "gzipped") - (hash-table-set! all-files (conc file ".gz") file-age) ;; add the .gz file and remove the base file - (hash-table-delete! all-files file) - ) - (if (and (> file-age (* (string->number (or (configf:lookup *configdat* "setup" "log-expire-days") "30")) 24 3600)) - (file-exists? fullname)) ;; just in case it was gzipped - will get it next time - (handle-exceptions - exn - #f - (if (directory? fullname) - (begin - (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") - (inc-stat "directories")) - (begin - (delete-file* fullname) - (inc-stat "deleted"))) - (hash-table-delete! all-files file))))))) - '() - "logs") - (for-each - (lambda (category) - (let ((quant (hash-table-ref/default stats category 0))) - (if (> quant 0) - (debug:print-info 0 *default-log-port* category " log files: " quant)))) - `("deleted" "gzipped" "directories")) - (let ((num-logs (hash-table-size all-files))) - (if (> num-logs max-allowed) ;; because NFS => don't let number of logs exceed 300 - (let ((files (take (sort (hash-table-keys all-files) - (lambda (a b) - (< (hash-table-ref all-files a)(hash-table-ref all-files b)))) - (- num-logs max-allowed)))) - (for-each - (lambda (file) - (let* ((fullname (conc "logs/" file))) - (if (directory? fullname) - (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") - (handle-exceptions - exn - (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn) - (delete-file* fullname))))) - files) - (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files.")))))) - -;;====================================================================== -;; Force a megatest cleanup-db if version is changed and skip-version-check not specified -;; Do NOT check if not on homehost! -;; -(define (common:exit-on-version-changed) - (if (and *toppath* ;; do nothing if *toppath* not yet provided - (common:on-homehost?)) - (if (common:api-changed?) - (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) - (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") ".mtdb/main.db")) - (read-only (not (file-write-access? dbfile))) - (dbstruct (db:setup))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;; #t))) - (debug:print 0 *default-log-port* - "WARNING: Version mismatch!\n" - " expected: " (common:version-signature) "\n" - " got: " (common:get-last-run-version)) - (cond - ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t) - ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only) - (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db - (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") - (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (print-call-chain (current-error-port)) - (exit 1)) - (common:cleanup-db dbstruct))) - ((not (common:file-exists? mtconf)) - (debug:print 0 *default-log-port* " megatest.config does not exist in this area. Cannot proceed with megatest version migration.") - (exit 1)) - ((not (common:file-exists? dbfile)) - (debug:print 0 *default-log-port* " .mtdb/main.db does not exist in this area. Cannot proceed with megatest version migration.") - (exit 1)) - ((not (eq? (current-user-id)(file-owner mtconf))) - (debug:print 0 *default-log-port* " You do not own .mtdb/main.db in this area. Cannot proceed with megatest version migration.") - (exit 1)) - (read-only - (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") - (exit 1)) - (else - (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"") - (exit 1))))))) -;;====================================================================== -;; (begin -;; (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") -;; (exit 1)))) ;;====================================================================== ;; S P A R S E A R R A Y S ;;====================================================================== @@ -1517,27 +1232,10 @@ (sparse-vector-set! row y val) (let ((new-row (make-sparse-vector))) (sparse-vector-set! a x new-row) (sparse-vector-set! new-row y val))))) -;;====================================================================== -;; L O C K E R S A N D B L O C K E R S -;;====================================================================== - -;; block further accesses to databases. Call this before shutting db down -(define (common:db-block-further-queries) - (mutex-lock! *db-access-mutex*) - (set! *db-access-allowed* #f) - (mutex-unlock! *db-access-mutex*)) - -(define (common:db-access-allowed?) - (let ((val (begin - (mutex-lock! *db-access-mutex*) - *db-access-allowed* - (mutex-unlock! *db-access-mutex*)))) - val)) - ;;====================================================================== ;; U S E F U L S T U F F ;;====================================================================== ;; convert things to an alist or assoc list, #f gets converted to "" @@ -1731,20 +1429,10 @@ (define (assoc/default key lst . default) (let ((res (assoc key lst))) (if res (cadr res)(if (null? default) #f (car default))))) -(define (common:get-testsuite-name) - (or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description. - (configf:lookup *configdat* "setup" "testsuite" ) - (getenv "MT_TESTSUITE_NAME") - (pathname-file (or (if (string? *toppath* ) - (pathname-file *toppath*) - #f) - (common:get-toppath #f))) - "please-set-setup-area-name")) ;; (pathname-file (current-directory))))) - ;;====================================================================== ;; safe getting of toppath (define (common:get-toppath areapath) (or *toppath* (if areapath @@ -1768,82 +1456,20 @@ #f) (loop (pathname-directory thepath))))) )) -;;====================================================================== -;; redefine for future cleanup (converge on area-name, the more generic -;; -(define common:get-area-name common:get-testsuite-name) - -(define (common:get-db-tmp-area . junk) - (if *db-cache-path* - *db-cache-path* - (if *toppath* ;; common:get-create-writeable-dir - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path* ", exn=" exn) - (exit 1)) - (let* ((toppath (common:real-path *toppath*)) - (tsname (common:get-testsuite-name)) - (dbpath (common:get-create-writeable-dir - (list (conc "/tmp/" (current-user-name) - "/megatest_localdb/" - tsname "/" - (string-translate toppath "/" ".")) - (conc "/tmp/" (current-process-id) ;; just in case we have an issue with the dir by own user name - "/"(current-user-name) "/megatest_localdb/" - tsname - (string-translate toppath "/" ".")) - )))) - (set! *db-cache-path* dbpath) - ;; ensure megatest area has .mtdb - (let ((dbarea (conc *toppath* "/.mtdb"))) - (if (not (file-exists? dbarea)) - (create-directory dbarea))) - ;; ensure tmp area has .mtdb - (let ((dbarea (conc dbpath "/.mtdb"))) - (if (not (file-exists? dbarea)) - (create-directory dbarea))) - dbpath)) - #f))) +(define (get-area-path-signature toppath #!optional (short #f)) + (let ((res (message-digest-string (md5-primitive) toppath))) + (if short + (substring res 0 4) + res))) + (define (common:get-area-path-signature) (message-digest-string (md5-primitive) *toppath*)) -;;====================================================================== -;; E X I T H A N D L I N G -;;====================================================================== - -(define (common:run-sync?) - (and *toppath* ;; gate if called before *toppath* is set - (common:on-homehost?) - (args:get-arg "-server"))) - - -(define (std-signal-handler signum) - ;; (signal-mask! signum) - (set! *time-to-exit* #t) - ;;(debug:print-info 13 *default-log-port* "got signal "signum) - (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly") - ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway - (exit)) - -(define (special-signal-handler signum) - ;; (signal-mask! signum) - (set! *time-to-exit* #t) - ;;(debug:print-info 13 *default-log-port* "got signal "signum) - (debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting!!") - ;;TODO send email to notify admin contact listed in the config that the lisner got killed - ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway - (exit)) - - -(set-signal-handler! signal/int std-signal-handler) ;; ^C -(set-signal-handler! signal/term std-signal-handler) - ;; (set-signal-handler! signal/stop std-signal-handler) ;; ^Z NO, do NOT handle ^Z! ;;====================================================================== ;; M I S C U T I L S ;;====================================================================== @@ -1873,28 +1499,10 @@ (set! res #t)))) (string-split patts ",")) res) #t)) -;;====================================================================== -;; '(print (string-intersperse (map cadr (hash-table-ref/default (read-config "megatest.config" \#f \#t) "disks" '"'"'("none" ""))) "\n"))' -(define (common:get-disks #!key (configf #f)) - (hash-table-ref/default - (or configf (read-config "megatest.config" #f #t)) - "disks" '("none" ""))) - -(define (common:get-install-area) - (let ((exe-path (car (argv)))) - (if (common:file-exists? exe-path) - (handle-exceptions - exn - #f - (pathname-directory - (pathname-directory - (pathname-directory exe-path)))) - #f))) - ;;====================================================================== ;; return first path that can be created or already exists and is writable ;; (define (common:get-create-writeable-dir dirs) (if (null? dirs) @@ -1961,177 +1569,10 @@ inlst (begin (if message (debug:print-error 0 *default-log-port* message)) (or ovrd '())))) -;;====================================================================== -;; T A R G E T S , S T A T E , S T A T U S , -;; R U N N A M E A N D T E S T P A T T -;;====================================================================== - -;;====================================================================== -;; (map print (map car (hash-table->alist (read-config "runconfigs.config" #f #t)))) -;; -(define (common:get-runconfig-targets #!key (configf #f)) - (let ((targs (sort (map car (hash-table->alist - (or configf ;; NOTE: There is no value in using runconfig:read here. - (read-config (conc *toppath* "/runconfigs.config") - #f #t) - (make-hash-table)))) - stringsymbol force-setting) #f)) - (force-result (case force-type - ((#f) #f) - ((always) #t) - ((test) (if (args:get-arg "-execute") ;; we are in a test - #t - #f)) - (else - (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.") - #t)))) ;; default to requiring server - (if force-result - (begin - (debug:print-info 0 *default-log-port* "ATTENTION! Forcing use of server, force setting is \"" force-setting "\".") - #t) - #f))) ;;====================================================================== ;; M I S C L I S T S ;;====================================================================== @@ -2368,64 +1809,10 @@ onemin (let* ((load-change (- onemin fivemin)) (tchange (- 300 60))) (max (+ onemin (* 60 (/ load-change tchange))) 0)))) -;;====================================================================== -;; calculate a delay number based on a droop curve -;; inputs are: -;; - load-in, load as from uptime, NOT normalized -;; - numcpus, number of cpus, ideally use the real cpus, not threads -;; -(define (common:get-delay load-in numcpus) - (let* ((ratio (/ load-in numcpus)) - (new-option (configf:lookup *configdat* "load" "new-load-method")) - (paramstr (or (configf:lookup *configdat* "load" "exp-params") - "15 12 1281453987.9543 0.75")) ;; 5 4 10 1")) - (paramlst (map string->number (string-split paramstr)))) - (if new-option - (begin - (cond ((and (>= ratio 0) (< ratio .5)) - 0) - ((and (>= ratio 0.5) (<= ratio .9)) - (* ratio (/ 5 .9))) - ((and (> ratio .9) (<= ratio 1.1)) - (+ 5 (* (- ratio .9) (/ 55 .2)))) - ((> ratio 1.1) - 60))) - (match paramlst - ((r1 r2 s1 s2) - (debug:print 3 *default-log-port* "Using params r1=" r1 " r2=" r2 " s1=" s1 " s2=" s2) - (min (max (/ (expt r1 (* r2 s2 ratio)) s1) 0) 30)) - (else - (debug:print 0 *default-log-port* "BAD exp-params, should be \"r1 r2 s1 s2\" but got " paramstr) - 30))))) - -;; -mrw- this appears to not be used -;; -;; (define (common:print-delay-table) -;; (let loop ((x 0)) -;; (print x "," (common:get-delay x 1)) -;; (if (< x 2) -;; (loop (+ x 0.1))))) - -;; (define (get-cpu-load #!key (remote-host #f)) -;; (car (common:get-cpu-load remote-host))) - -;;====================================================================== -;; (let* ((load-res (process:cmd-run->list "uptime")) -;; (load-rx (regexp "load average:\\s+(\\d+)")) -;; (cpu-load #f)) -;; (for-each (lambda (l) -;; (let ((match (string-search load-rx l))) -;; (if match -;; (let ((newval (string->number (cadr match)))) -;; (if (number? newval) -;; (set! cpu-load newval)))))) -;; (car load-res)) -;; cpu-load)) - ;;====================================================================== ;; get values from cached info from dropping file in .sysdata dir ;; e.g. key is host and dtype is normalized-load ;; (define (common:get-cached-info key dtype #!key (age 10)) @@ -2612,151 +1999,10 @@ (define (common:unix-ping hostname) (let ((res (system (conc "ping -c 1 " hostname " > /dev/null")))) (eq? res 0))) -;;====================================================================== -;; ideally put all this info into the db, no need to preserve it across moving homehost -;; -;; return list of -;; ( reachable? cpuload update-time ) -(define (common:get-host-info hostname) - (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data - (load (car loadinfo)) - (load-sample-time (cdr loadinfo)) - (load-sample-age (- (current-seconds) load-sample-time)) - (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds - (host-last-update-timeout-seconds 4) - (host-rec (hash-table-ref/default *host-loads* hostname #f)) - ) - (cond - ((< load-sample-age loadinfo-timeout-seconds) - (list #t - load-sample-time - load)) - ((and host-rec - (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds))) - (list #t - (host-last-update host-rec) - (host-last-cpuload host-rec ))) - ((common:unix-ping hostname) - (list #t - (current-seconds) - (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds - (else - (list #f 0 -1) ;; bad host, don't use! - )))) - -;;====================================================================== -;; see defstruct host at top of file. -;; host: reachable last-update last-used last-cpuload -;; -(define (common:update-host-loads-table hosts-raw) - (let* ((hosts (filter (lambda (x) - (string-match (regexp "^\\S+$") x)) - hosts-raw))) - (for-each - (lambda (hostname) - (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) - (if h - h - (let ((h (make-host))) - (hash-table-set! *host-loads* hostname h) - h)))) - (host-info (common:get-host-info hostname)) - (is-reachable (car host-info)) - (last-reached-time (cadr host-info)) - (load (caddr host-info))) - (host-reachable-set! rec is-reachable) - (host-last-update-set! rec last-reached-time) - (host-last-cpuload-set! rec load))) - hosts))) - -;;====================================================================== -;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the -;; [host-rules] section. -;; -(define (common:get-least-loaded-host hosts-raw host-type configdat) - (let* ((rdat (configf:lookup configdat "host-rules" host-type)) - (rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate - (maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load - (maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs - (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second - (hosts (filter (lambda (x) - (string-match (regexp "^\\S+$") x)) - hosts-raw)) - ;; (best-host #f) - (get-rec (lambda (hostname) - ;; (print "get-rec hostname=" hostname) - (let ((h (hash-table-ref/default *host-loads* hostname #f))) - (if h - h - (let ((h (make-host))) - (hash-table-set! *host-loads* hostname h) - h))))) - (best-load 99999) - (curr-time (current-seconds)) - (get-hosts-sorted (lambda (hosts) - (sort hosts (lambda (a b) - (let ((a-rec (get-rec a)) - (b-rec (get-rec b))) - ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec)) - ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec)) - (< (host-last-used a-rec) - (host-last-used b-rec)))))))) - (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts)) - (if (null? hosts) - #f ;; no hosts to select from. All done and giving up now. - (let ((hosts-sorted (get-hosts-sorted hosts))) - (common:update-host-loads-table hosts) - (let loop ((hostname (car hosts-sorted)) - (tal (cdr hosts-sorted)) - (best-host #f)) - (let* ((rec (get-rec hostname)) - (reachable (host-reachable rec)) - (load (host-last-cpuload rec)) - (last-used (host-last-used rec)) - (delta (- curr-time last-used)) - (job-rate (if (> delta 0) - (/ 1 delta) - 999)) ;; jobs per second - (new-best - (cond - ((not reachable) - (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.") - best-host) - ((and (< load maxnload) ;; load is acceptable - (< job-rate maxjobrate)) ;; job rate is acceptable - (set! best-load load) - hostname) - (else best-host)))) - (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." ) - (if new-best - (begin ;; found a host, return it - (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) - (host-last-used-set! rec curr-time) - new-best) - (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) - -(define (common:wait-for-homehost-load maxnormload msg) - (let loop ((start-time (current-seconds))) ;; we saw some instances of this being called before *toppath* was set. This might be an early setup race. This delay should help but it is impossible to test... - (if (not *toppath*) - (begin - (debug:print 0 *default-log-port* "ERROR: common:wait-for-homehost-load called before *toppath* set.") - (thread-sleep! 30) - (if (< (- (current-seconds) start-time) 300) - (loop start-time))))) - (case (rmt:transport-mode) - ((http) - (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. - #f - (server:choose-server *toppath* 'homehost))) - (hh (if hh-dat (car hh-dat) #f))) - (common:wait-for-normalized-load maxnormload msg hh))) - (else - (common:wait-for-normalized-load maxnormload msg (get-host-name))))) - (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) ;; hosts had better not be changing the number of cpus too often! (or (hash-table-ref/default *numcpus-cache* actual-host #f) (let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) @@ -2782,342 +2028,10 @@ (common:write-cached-info actual-host "num-cpus" result)) result)))) (hash-table-set! *numcpus-cache* actual-host numcpus) numcpus)))) -;;====================================================================== -;; wait for normalized cpu load to drop below maxload -;; -(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5)) - (let ((num-cpus (common:get-num-cpus remote-host))) - (if num-cpus - (common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host) - (begin - (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again - (if (> rem-tries 0) - (common:wait-for-normalized-load maxnormload msg remote-host (- rem-tries 1)) - #f))))) - -;;====================================================================== -;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load -;; count - count down to zero, at some point we'd give up if the load never drops -;; num-tries - count down to zero number tries to get numcpus -;; -(define (common:wait-for-cpuload maxnormload numcpus-in - #!key (count 1000) - (msg #f)(remote-host #f)(num-tries 5)) - (let* ((loadavg (common:get-cpu-load remote-host)) - ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again - (numcpus (if (<= 1 numcpus-in) - (common:get-num-cpus remote-host) numcpus-in)) - (first (car loadavg)) - (next (cadr loadavg)) - (adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude - ;; fallback is to at least use 1 - ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit - ;; etc. - (effective-load (common:get-intercept first next)) - (recommended-delay (common:get-delay effective-load numcpus)) - (effective-host (or remote-host "localhost")) - (normalized-effective-load (/ effective-load numcpus)) - (will-wait (> normalized-effective-load maxnormload))) - (if (and will-wait (> recommended-delay 1)) - (let* ((actual-delay (min recommended-delay 30))) - (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) - (debug:print-info 0 *default-log-port* "Load control, delaying " - actual-delay " seconds to maintain safe load. current normalized effective load is " - normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load)) - (thread-sleep! actual-delay))) - - (cond - ;; bad data, try again to get the data - ((not will-wait) - (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host)) - (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing."))) - - ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable - (> num-tries 0)) - (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " - first ", we'll sleep 10s and try " num-tries " more times.") - (thread-sleep! 10) - (common:wait-for-cpuload maxnormload numcpus-in - count: count remote-host: remote-host num-tries: (- num-tries 1))) - - ;; need to wait for load to drop - ((and will-wait ;; (> first adjmaxload) - (> count 0)) - (debug:print-info 0 *default-log-port* - "Delaying 15" ;; adjwait - " seconds due to normalized effective load " normalized-effective-load ;; first - " exceeding max of " adjmaxload - " on server " (or remote-host (get-host-name)) - " (normalized load-limit: " maxnormload ") " (if msg msg "")) - (thread-sleep! 15) ;; adjwait) - (common:wait-for-cpuload maxnormload numcpus count: (- count 1) msg: msg remote-host: remote-host) - ;; put the message here to indicate came out of waiting - (debug:print-info 1 *default-log-port* - "On host: " effective-host - ", effective load: " effective-load - ", numcpus: " numcpus - ", normalized effective load: " normalized-effective-load - )) - ;; overloaded and count expired (i.e. went to zero) - (else - (if (> num-tries 0) ;; should be "num-tries-left". - (if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host)) - (debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of " - effective-normalized-load " continuing.")) - (debug:print 0 *default-log-port* "Load on " effective-host ", " - first" could not be retrieved. Giving up and continuing.")))))) - -;;====================================================================== -;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load -;; -;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5)) -;; (let* ((loadavg (common:get-cpu-load remote-host)) -;; (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again -;; (common:get-num-cpus remote-host) -;; numcpus-in)) -;; (maxload (if force-maxload -;; maxload-in -;; (if (number? maxload-in) -;; (max maxload-in 0.5) -;; 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? -;; (first (car loadavg)) -;; (next (cadr loadavg)) -;; (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where -;; ;; numcpus (or could be -;; ;; maxload) is zero, -;; ;; crude fallback is to -;; ;; at least use 1 -;; (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next? -;; 0 -;; next))) ;; we will force a conservative calculation any time next is large. -;; (first-next-avg (/ (+ first next) 2)) -;; ;; add some randomness to the time to break any alignment -;; ;; where netbatch dumps many jobs to machines simultaneously -;; (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10) -;; (/ (- 1000 count) 10) -;; waitdelay) -;; (- first adjmaxload) )))) -;; (load-jump-limit (configf:lookup-number *configdat* "setup" "load-jump-limit")) -;; ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit -;; ;; etc. -;; (effective-load (common:get-intercept first next)) -;; (effective-host (or remote-host "localhost")) -;; (normalized-effective-load (/ effective-load numcpus)) -;; (will-wait (> normalized-effective-load maxload))) -;; -;; ;; let's let the user know once in a long while that load checking -;; ;; is happening but not constantly report it -;; #;(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time -;; (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload -;; ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp)) -;; -;; (debug:print-info 1 *default-log-port* -;; "On host: " effective-host -;; ", effective load: " effective-load -;; ", numcpus: " numcpus -;; ", normalized effective load: " normalized-effective-load -;; ) -;; -;; (cond -;; ;; bad data, try again to get the data -;; ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable -;; (> num-tries 0)) -;; (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.") -;; (thread-sleep! 10) -;; (common:wait-for-cpuload maxload-in numcpus-in waitdelay -;; count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1))) -;; ;; need to wait for load to drop -;; ((and will-wait ;; (> first adjmaxload) -;; (> count 0)) -;; (debug:print-info 0 *default-log-port* -;; "Delaying " 15 ;; adjwait -;; " seconds due to normalized effective load " normalized-effective-load ;; first -;; " exceeding max of " adjmaxload -;; " on server " (or remote-host (get-host-name)) -;; " (normalized load-limit: " maxload ") " (if msg msg "")) -;; (thread-sleep! 15) ;; adjwait) -;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) -;; ((and (> loadjmp (cond -;; (load-jump-limit load-jump-limit) -;; ((> numcpus 8)(/ numcpus 2)) -;; ((> numcpus 4)(/ numcpus 1.2)) -;; (else 0.5))) -;; (> count 0)) -;; (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". " -;; (if msg msg "")) -;; (thread-sleep! adjwait) -;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) -;; (else -;; (if (> num-tries 0) -;; (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost"))) -;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing.")) -;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing.")))))) -;; -(define (get-uname . params) - (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) - (uname #f)) - (if (null? (car uname-res)) - "unknown" - (caar uname-res)))) - -;;====================================================================== -;; D I S K S P A C E -;;====================================================================== - -(define (common:get-disk-space-used fpath) - (with-input-from-pipe (conc "/usr/bin/du -s " fpath) read)) - -;;====================================================================== -;; given path get free space, allows override in [setup] -;; with free-space-script /path/to/some/script.sh -;; -(define (get-df path) - (if (configf:lookup *configdat* "setup" "free-space-script") - (with-input-from-pipe - (conc (configf:lookup *configdat* "setup" "free-space-script") " " path) - (lambda () - (let ((res (read-line))) - (if (string? res) - (string->number res))))) - (get-unix-df path))) - -(define (get-free-inodes path) - (if (configf:lookup *configdat* "setup" "free-inodes-script") - (with-input-from-pipe - (conc (configf:lookup *configdat* "setup" "free-inodes-script") " " path) - (lambda () - (let ((res (read-line))) - (if (string? res) - (string->number res))))) - (get-unix-inodes path))) - -(define (get-unix-df path) - (let* ((df-results (process:cmd-run->list (conc "df " path))) - (space-rx (regexp "([0-9]+)\\s+([0-9]+)%")) - (freespc #f)) - ;; (write df-results) - (for-each (lambda (l) - (let ((match (string-search space-rx l))) - (if match - (let ((newval (string->number (cadr match)))) - (if (number? newval) - (set! freespc newval)))))) - (car df-results)) - freespc)) - -(define (get-unix-inodes path) - (let* ((df-results (process:cmd-run->list (conc "df -i " path))) - (space-rx (regexp "([0-9]+)\\s+([0-9]+)%")) - (freenodes 0)) ;; 0 is a better failsafe than #f here. - ;; (write df-results) - (for-each (lambda (l) - (let ((match (string-search space-rx l))) - (if match - (let ((newval (string->number (cadr match)))) - (if (number? newval) - (set! freenodes newval)))))) - (car df-results)) - freenodes)) - -(define (common:check-space-in-dir dirpath required) - (let* ((dbspace (if (directory? dirpath) - (get-df dirpath) - 0))) - (list (> dbspace required) - dbspace - required - dirpath))) - -;;====================================================================== -;; check space in dbdir and in megatest dir -;; returns: ok/not dbspace required-space -;; -(define (common:check-db-dir-space) - (let* ((required (string->number - ;; default is 1GB (or actually a billion bytes) This is the number of 1 kB blocks. - (or (configf:lookup *configdat* "setup" "dbdir-space-required") - "1000000"))) - (dbdir (common:make-tmpdir-name *toppath* "")) ;; (db:get-dbdir)) - (tdbspace (common:check-space-in-dir dbdir required)) - (mdbspace (common:check-space-in-dir *toppath* required))) - (sort (list tdbspace mdbspace) (lambda (a b) - (< (cadr a)(cadr b)))))) - -;;====================================================================== -;; check available space in dbdir, exit if insufficient -;; -(define (common:check-db-dir-and-exit-if-insufficient) - (let* ((spacedat (car (common:check-db-dir-space))) ;; look only at worst for now - (is-ok (car spacedat)) - (dbspace (cadr spacedat)) - (required (caddr spacedat)) - (dbdir (cadddr spacedat))) - (if (not is-ok) - (begin - (debug:print-error 0 *default-log-port* "Insufficient space in " dbdir ", require " required ", have " dbspace ", exiting now.") - (exit 1))))) - -;;====================================================================== -;; paths is list of lists ((name path) ... ) -;; -(define (common:get-disk-with-most-free-space disks minsize) - (let* ((best #f) - (bestsize 0) - (default-min-inodes-string "1000000") - (default-min-inodes (string->number default-min-inodes-string)) - (min-inodes (or (string->number (if (configf:lookup *configdat* "setup" "min_inodes") (configf:lookup *configdat* "setup" "min_inodes") default-min-inodes-string)) default-min-inodes))) - - (for-each - (lambda (disk-num) - (let* ((dirpath (cadr (assoc disk-num disks))) - (freespc (cond - ((not (directory? dirpath)) - (if (common:low-noise-print 300 "disks not a dir " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it.")) - -1) - ((not (file-write-access? dirpath)) - (if (common:low-noise-print 300 "disks not writeable " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it.")) - -1) - ((not (eq? (string-ref dirpath 0) #\/)) - (if (common:low-noise-print 300 "disks not a proper path " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it.")) - -1) - (else - (get-df dirpath)))) - (free-inodes (cond - ((not (directory? dirpath)) - (if (common:low-noise-print 300 "disks not a dir " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it.")) - -1) - ((not (file-write-access? dirpath)) - (if (common:low-noise-print 300 "disks not writeable " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it.")) - -1) - ((not (eq? (string-ref dirpath 0) #\/)) - (if (common:low-noise-print 300 "disks not a proper path " disk-num) - (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it.")) - -1) - (else - (get-free-inodes dirpath)))) - ;;(free-inodes (get-free-inodes dirpath)) - ) - (debug:print 2 *default-log-port* "INFO: disk " disk-num " path " dirpath " free space " freespc " free inodes " free-inodes) - (if (and (> freespc bestsize)(> free-inodes min-inodes )) - (begin - (set! best (cons disk-num dirpath)) - (set! bestsize freespc))) - ;;(print "Processing: " disk-num " bestsize: " bestsize " best: " best " freespc: " freespc " min-inodes: " min-inodes " free-inodes: " free-inodes) - )) - (map car disks)) - (if (and best (> bestsize minsize)) - best - #f))) ;; #f means no disk candidate found - ;;====================================================================== ;; convert a spec string to a list of vectors #( rx action rx-string ) (define (common:spec-string->list-of-specs spec-string actions) (let ((spec-strings (string-split-fields "\\s*;\\s*" spec-string #:infix)) (actions-regex (regexp (conc "^(.*)\\s+(" (string-intersperse (map conc actions) "|") ")")))) @@ -3329,22 +2243,10 @@ vars (lambda (var val) (setenv var val))) vars)) -(define (common:run-a-command cmd #!key (with-vars #f) (with-orig-env #f)) - (let* ((pre-cmd (dtests:get-pre-command)) - (post-cmd (dtests:get-post-command)) - (fullcmd (if (or pre-cmd post-cmd) - (conc pre-cmd cmd post-cmd) - (conc "viewscreen " cmd)))) - (debug:print-info 02 *default-log-port* "Running command: " fullcmd) - (cond - (with-vars (common:without-vars fullcmd)) - (with-orig-env (common:with-orig-env fullcmd)) - (else (common:without-vars fullcmd "MT_.*"))))) - ;;====================================================================== ;; C O L O R S ;;====================================================================== (define (common:name->iup-color name) @@ -3381,45 +2283,10 @@ "/")) ;;====================================================================== ;; L O C K I N G M E C H A N I S M S ;;====================================================================== - -;;====================================================================== -;; faux-lock is deprecated. Please use simple-lock below -;; -(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t)) - (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count - (if (> wait-time 0) - (begin - (thread-sleep! 1) - (if (eq? wait-time 1) ;; only one second left, steal the lock - (begin - (debug:print-info 0 *default-log-port* "stealing lock for " keyname) - (common:faux-unlock keyname force: #t))) - (common:faux-lock keyname wait-time: (- wait-time 1))) - #f) - (begin - (rmt:no-sync-set keyname (conc (current-process-id))) - (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))))) - -(define (common:faux-unlock keyname #!key (force #f)) - (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))) - (begin - (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname)) - #t) - #f)) - -;;====================================================================== -;; simple lock. improve and converge on this one. -;; -(define (common:simple-lock keyname) - (rmt:no-sync-get-lock keyname)) - -(define (common:simple-unlock keyname #!key (force #f)) - (rmt:no-sync-del! keyname)) - ;;====================================================================== ;; ;;====================================================================== (define (common:in-running-test?) @@ -3506,102 +2373,10 @@ ;; (define (mddb:get-dashboards) ;; (let ((db (mddb:open-db))) ;; (query fetch-column ;; (sql db "SELECT ipaddr || ':' || portnum FROM dashboards;")))) -;;====================================================================== -;; T E S T L A U N C H I N G P E R I T E M W I T H H O S T T Y P E S -;;====================================================================== -;; -;; [hosts] -;; arm cubie01 cubie02 -;; x86_64 zeus xena myth01 -;; allhosts #{g hosts arm} #{g hosts x86_64} -;; -;; [host-types] -;; general #MTLOWESTLOAD #{g hosts allhosts} -;; arm #MTLOWESTLOAD #{g hosts arm} -;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo -;; -;; [host-rules] -;; # maxnload => max normalized load -;; # maxnjobs => max jobs per cpu -;; # maxjobrate => max jobs per second -;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 -;; -;; [launchers] -;; envsetup general -;; xor/%/n 4C16G -;; % nbgeneral -;; -;; [jobtools] -;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. -;; flexi-launcher yes -;; launcher nbfake -;; -(define (common:get-launcher configdat testname itempath) - (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) - (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher - (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no"))) - (let* ((launchers (hash-table-ref/default configdat "launchers" '()))) - (if (null? launchers) - fallback-launcher - (let loop ((hed (car launchers)) - (tal (cdr launchers))) - (let ((patt (car hed)) - (host-type (cadr hed))) - (if (tests:match patt testname itempath) - (begin - (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) - (let ((launcher (configf:lookup configdat "host-types" host-type))) - (if launcher - (let* ((launcher-parts (string-split launcher)) - (launcher-exe (car launcher-parts))) - (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline - (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) - (count 100)) - (if targ-host - (conc "remrun " targ-host) - (if (> count 0) - (begin - (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) - (thread-sleep! (- 101 count)) - (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) - (- count 1))) - (begin - (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) - (exit))))) - launcher)) - (begin - (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) - (if (null? tal) - fallback-launcher - (loop (car tal)(cdr tal))))))) - ;; no match, try again - (if (null? tal) - fallback-launcher - (loop (car tal)(cdr tal)))))))) - fallback-launcher))) - -;;====================================================================== -;; D A S H B O A R D U S E R V I E W S -;;====================================================================== - -;;====================================================================== -;; first read ~/views.config if it exists, then read $MTRAH/views.config if it exists -;; -(define (common:load-views-config) - (let* ((view-cfgdat (make-hash-table)) - (home-cfgfile (conc (get-environment-variable "HOME") "/.mtviews.config")) - (mthome-cfgfile (conc *toppath* "/.mtviews.config"))) - (if (common:file-exists? mthome-cfgfile) - (read-config mthome-cfgfile view-cfgdat #t)) - ;; we load the home dir file AFTER the MTRAH file so the user can clobber settings when running the dashboard in read-only areas - (if (common:file-exists? home-cfgfile) - (read-config home-cfgfile view-cfgdat #t)) - view-cfgdat)) - ;;====================================================================== ;; H I E R A R C H I C A L H A S H T A B L E S ;;====================================================================== ;; ;; Every element including top element is a vector: @@ -3674,110 +2449,10 @@ (state . s) (target . t) (status . u) (parent . P))))) -(define (common:get-pkts-dirs mtconf use-lt) - (let* ((pktsdirs-str (or (configf:lookup mtconf "setup" "pktsdirs") - (and use-lt - (conc (or *toppath* - (current-directory)) - "/lt/.pkts")))) - (pktsdirs (if pktsdirs-str - (string-split pktsdirs-str " ") - #f))) - pktsdirs)) - -;;====================================================================== -;; use-lt is use linktree "lt" link to find pkts dir -(define (common:save-pkt pktalist-in mtconf use-lt #!key (add-only #f)) ;; add-only saves the pkt only if there is a parent already - (if (or (not add-only) - (hash-table-exists? *pkts-info* 'last-parent)) - (let* ((parent (hash-table-ref/default *pkts-info* 'last-parent #f)) - (pktalist (if parent - (cons `(parent . ,parent) - pktalist-in) - pktalist-in))) - (let-values (((uuid pkt) - (alist->pkt pktalist common:pkts-spec))) - (hash-table-set! *pkts-info* 'last-parent uuid) - (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f) - (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) - (pktsdir (car pktsdirs))) ;; assume it is there - (hash-table-set! *pkts-info* 'pkts-dir pktsdir) - pktsdir)))) - (debug:print 0 *default-log-port* "pktsdir: "pktsdir) - (handle-exceptions - exn - (debug:print-info 0 "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!! - (if (not (file-exists? pktsdir)) - (create-directory pktsdir #t)) - (with-output-to-file - (conc pktsdir "/" uuid ".pkt") - (lambda () - (print pkt))))))))) - -(define (common:with-queue-db mtconf proc #!key (use-lt #f)(toppath-in #f)) - (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) - (pktsdir (if pktsdirs (car pktsdirs) #f)) - (toppath (or (configf:lookup mtconf "scratchdat" "toppath") - toppath-in)) - (pdbpath (or (configf:lookup mtconf "setup" "pdbpath") pktsdir))) - (cond - ((not (and pktsdir toppath pdbpath)) - (debug:print 0 *default-log-port* "ERROR: settings are missing in your megatest.config for area management.") - (debug:print 0 *default-log-port* " you need to have pktsdirs in the [setup] section.")) - ((not (common:file-exists? pktsdir)) - (debug:print 0 *default-log-port* "ERROR: pkts directory not found " pktsdir)) - ((not (equal? (file-owner pktsdir)(current-effective-user-id))) - (debug:print 0 *default-log-port* "ERROR: directory " pktsdir " is not owned by " (current-effective-user-name))) - (else - (let* ((pdb (open-queue-db pdbpath "pkts.db" - schema: '("CREATE TABLE groups (id INTEGER PRIMARY KEY,groupname TEXT, CONSTRAINT group_constraint UNIQUE (groupname));")))) - (proc pktsdirs pktsdir pdb) - (dbi:close pdb)))))) - -(define (common:load-pkts-to-db mtconf #!key (use-lt #f)) - (common:with-queue-db - mtconf - (lambda (pktsdirs pktsdir pdb) - (for-each - (lambda (pktsdir) ;; look at all - (cond - ((not (common:file-exists? pktsdir)) - (debug:print 0 *default-log-port* "ERROR: packets directory " pktsdir " does not exist.")) - ((not (directory? pktsdir)) - (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not a directory.")) - ((not (file-read-access? pktsdir)) - (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not readable.")) - (else - (debug:print-info 0 *default-log-port* "Loading packets found in " pktsdir) - (let ((pkts (glob (conc pktsdir "/*.pkt"))) - (sqdb (dbi:db-conn pdb)) - ) - ;; Put this in a transaction to avoid issues overloading the db - (sqlite3:with-transaction - sqdb - (lambda () - (for-each - (lambda (pkt) - (let* ((uuid (cadr (string-match ".*/([0-9a-f]+).pkt" pkt))) - (exists (lookup-by-uuid pdb uuid #f))) - (if (not exists) - (let* ((pktdat (string-intersperse - (with-input-from-file pkt read-lines) - "\n")) - (apkt (pkt->alist pktdat)) - (ptype (alist-ref 'T apkt))) - (add-to-queue pdb pktdat uuid (or ptype 'cmd) #f 0) - (debug:print 4 *default-log-port* "Added " uuid " of type " ptype " to queue")) - (debug:print 4 *default-log-port* "pkt: " uuid " exists, skipping...") - ))) - pkts))))))) - pktsdirs)) - use-lt: use-lt)) - (define (common:get-pkt-alists pkts) (map (lambda (x) (alist-ref 'apkt x)) ;; 'pkta pulls out the alist from the read pkt pkts)) Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -950,5 +950,17 @@ (curr-updaters (hash-table-ref/default (dboard:commondat-updaters commondat) tnum '()))) (hash-table-set! (dboard:commondat-updaters commondat) tnum (cons updater curr-updaters)))) +(define (common:run-a-command cmd #!key (with-vars #f) (with-orig-env #f)) + (let* ((pre-cmd (dtests:get-pre-command)) + (post-cmd (dtests:get-post-command)) + (fullcmd (if (or pre-cmd post-cmd) + (conc pre-cmd cmd post-cmd) + (conc "viewscreen " cmd)))) + (debug:print-info 02 *default-log-port* "Running command: " fullcmd) + (cond + (with-vars (common:without-vars fullcmd)) + (with-orig-env (common:with-orig-env fullcmd)) + (else (common:without-vars fullcmd "MT_.*"))))) + Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -1648,7 +1648,24 @@ ;; (define (db:with-mutex-for-stmth proc) ;; (mutex-lock! *mutex-stmth-call*) ;; (let* ((res (proc))) ;; (mutex-unlock! *mutex-stmth-call*) ;; res)) +;;====================================================================== +;; L O C K E R S A N D B L O C K E R S +;;====================================================================== + +;; block further accesses to databases. Call this before shutting db down +(define (common:db-block-further-queries) + (mutex-lock! *db-access-mutex*) + (set! *db-access-allowed* #f) + (mutex-unlock! *db-access-mutex*)) + +(define (common:db-access-allowed?) + (let ((val (begin + (mutex-lock! *db-access-mutex*) + *db-access-allowed* + (mutex-unlock! *db-access-mutex*)))) + val)) + ) Index: debugprint.scm ================================================================== --- debugprint.scm +++ debugprint.scm @@ -32,10 +32,11 @@ srfi-1 (prefix mtargs args:)) (define setenv set-environment-variable!) + (define unsetenv unset-environment-variable!) )) ;;====================================================================== ;; debug stuff ;;====================================================================== ADDED megatestmod.scm Index: megatestmod.scm ================================================================== --- /dev/null +++ megatestmod.scm @@ -0,0 +1,878 @@ +;;====================================================================== +;; Copyright 2017, Matthew Welland. +;; +;; This file is part of Megatest. +;; +;; Megatest is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. +;; +;; Megatest is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with Megatest. If not, see . + +;;====================================================================== + +;;====================================================================== +;; Megatestmod: +;; +;; Put things here don't fit anywhere else +;;====================================================================== + +(declare (unit megatestmod)) +(declare (uses debugprint)) +(declare (uses mtargs)) + +(use srfi-69) + +(module megatestmod + * + +(import scheme) +(cond-expand + (chicken-4 + + (import chicken + ports + (prefix base64 base64:) + + (prefix sqlite3 sqlite3:) + data-structures + extras + files + matchable + md5 + message-digest + pathname-expand + posix + posix-extras + regex + regex-case + sparse-vectors + srfi-1 + srfi-18 + srfi-69 + typed-records + z3 + + debugprint + (prefix mtargs args:) + ) + (use srfi-69)) + (chicken-5 + (import (prefix sqlite3 sqlite3:) + ;; data-structures + ;; extras + ;; files + ;; posix + ;; posix-extras + chicken.base + chicken.condition + chicken.file + chicken.file.posix + chicken.io + chicken.pathname + chicken.port + chicken.process + chicken.process-context + chicken.process-context.posix + chicken.sort + chicken.string + chicken.time + chicken.time.posix + + matchable + md5 + message-digest + pathname-expand + regex + regex-case + srfi-1 + srfi-18 + srfi-69 + typed-records + system-information + + debugprint + ))) + +;;====================================================================== +;; '(print (string-intersperse (map cadr (hash-table-ref/default (read-config "megatest.config" \#f \#t) "disks" '"'"'("none" ""))) "\n"))' +(define (common:get-disks #!key (configf #f)) + (hash-table-ref/default + (or configf (read-config "megatest.config" #f #t)) + "disks" '("none" ""))) + +(define (common:get-install-area) + (let ((exe-path (car (argv)))) + (if (common:file-exists? exe-path) + (handle-exceptions + exn + #f + (pathname-directory + (pathname-directory + (pathname-directory exe-path)))) + #f))) + +;;====================================================================== +;; testsuite and area utilites +;;====================================================================== + +(define (get-testsuite-name toppath configdat) + (or (lookup configdat "setup" "area-name") + (lookup configdat "setup" "testsuite") + (get-environment-variable "MT_TESTSUITE_NAME") + (if (string? toppath) + (pathname-file toppath) + #f))) + +;; need generic find-record-with-var-nmatching-val +;; +(define (path->area-record cfgdat path) + (let* ((areadat (get-cfg-areas cfgdat)) + (all (filter (lambda (x) + (let* ((keyvals (cdr x)) + (pth (alist-ref 'path keyvals))) + (equal? path pth))) + areadat))) + (if (null? all) + #f + (car all)))) ;; return first match + +(define (get-area-name configdat toppath #!optional (short #f)) + ;; look up my area name in areas table (future) + ;; generate auto name + (conc (get-area-path-signature toppath short) + "-" + (get-testsuite-name toppath configdat))) + +;; given a config return an alist of alists +;; area-name => data +;; +(define (get-cfg-areas cfgdat) + (let ((adat (get-section cfgdat "areas"))) + (map (lambda (entry) + `(,(car entry) . + ,(val->alist (cadr entry)))) + adat))) + +;;====================================================================== +;; redefine for future cleanup (converge on area-name, the more generic +;; +(define common:get-area-name common:get-testsuite-name) + +(define (common:get-db-tmp-area . junk) + (if *db-cache-path* + *db-cache-path* + (if *toppath* ;; common:get-create-writeable-dir + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path* ", exn=" exn) + (exit 1)) + (let* ((toppath (common:real-path *toppath*)) + (tsname (common:get-testsuite-name)) + (dbpath (common:get-create-writeable-dir + (list (conc "/tmp/" (current-user-name) + "/megatest_localdb/" + tsname "/" + (string-translate toppath "/" ".")) + (conc "/tmp/" (current-process-id) ;; just in case we have an issue with the dir by own user name + "/"(current-user-name) "/megatest_localdb/" + tsname + (string-translate toppath "/" ".")) + )))) + (set! *db-cache-path* dbpath) + ;; ensure megatest area has .mtdb + (let ((dbarea (conc *toppath* "/.mtdb"))) + (if (not (file-exists? dbarea)) + (create-directory dbarea))) + ;; ensure tmp area has .mtdb + (let ((dbarea (conc dbpath "/.mtdb"))) + (if (not (file-exists? dbarea)) + (create-directory dbarea))) + dbpath)) + #f))) +;;====================================================================== +;; T A R G E T S , S T A T E , S T A T U S , +;; R U N N A M E A N D T E S T P A T T +;;====================================================================== + +;;====================================================================== +;; (map print (map car (hash-table->alist (read-config "runconfigs.config" #f #t)))) +;; +(define (common:get-runconfig-targets #!key (configf #f)) + (let ((targs (sort (map car (hash-table->alist + (or configf ;; NOTE: There is no value in using runconfig:read here. + (read-config (conc *toppath* "/runconfigs.config") + #f #t) + (make-hash-table)))) + stringsymbol force-setting) #f)) + (force-result (case force-type + ((#f) #f) + ((always) #t) + ((test) (if (args:get-arg "-execute") ;; we are in a test + #t + #f)) + (else + (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.") + #t)))) ;; default to requiring server + (if force-result + (begin + (debug:print-info 0 *default-log-port* "ATTENTION! Forcing use of server, force setting is \"" force-setting "\".") + #t) + #f))) + +;;====================================================================== +;; calculate a delay number based on a droop curve +;; inputs are: +;; - load-in, load as from uptime, NOT normalized +;; - numcpus, number of cpus, ideally use the real cpus, not threads +;; +(define (common:get-delay load-in numcpus) + (let* ((ratio (/ load-in numcpus)) + (new-option (configf:lookup *configdat* "load" "new-load-method")) + (paramstr (or (configf:lookup *configdat* "load" "exp-params") + "15 12 1281453987.9543 0.75")) ;; 5 4 10 1")) + (paramlst (map string->number (string-split paramstr)))) + (if new-option + (begin + (cond ((and (>= ratio 0) (< ratio .5)) + 0) + ((and (>= ratio 0.5) (<= ratio .9)) + (* ratio (/ 5 .9))) + ((and (> ratio .9) (<= ratio 1.1)) + (+ 5 (* (- ratio .9) (/ 55 .2)))) + ((> ratio 1.1) + 60))) + (match paramlst + ((r1 r2 s1 s2) + (debug:print 3 *default-log-port* "Using params r1=" r1 " r2=" r2 " s1=" s1 " s2=" s2) + (min (max (/ (expt r1 (* r2 s2 ratio)) s1) 0) 30)) + (else + (debug:print 0 *default-log-port* "BAD exp-params, should be \"r1 r2 s1 s2\" but got " paramstr) + 30))))) + +;; -mrw- this appears to not be used +;; +;; (define (common:print-delay-table) +;; (let loop ((x 0)) +;; (print x "," (common:get-delay x 1)) +;; (if (< x 2) +;; (loop (+ x 0.1))))) + +;; (define (get-cpu-load #!key (remote-host #f)) +;; (car (common:get-cpu-load remote-host))) + +;;====================================================================== +;; (let* ((load-res (process:cmd-run->list "uptime")) +;; (load-rx (regexp "load average:\\s+(\\d+)")) +;; (cpu-load #f)) +;; (for-each (lambda (l) +;; (let ((match (string-search load-rx l))) +;; (if match +;; (let ((newval (string->number (cadr match)))) +;; (if (number? newval) +;; (set! cpu-load newval)))))) +;; (car load-res)) +;; cpu-load)) + +;;====================================================================== +;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the +;; [host-rules] section. +;; +(define (common:get-least-loaded-host hosts-raw host-type configdat) + (let* ((rdat (configf:lookup configdat "host-rules" host-type)) + (rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate + (maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load + (maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs + (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second + (hosts (filter (lambda (x) + (string-match (regexp "^\\S+$") x)) + hosts-raw)) + ;; (best-host #f) + (get-rec (lambda (hostname) + ;; (print "get-rec hostname=" hostname) + (let ((h (hash-table-ref/default *host-loads* hostname #f))) + (if h + h + (let ((h (make-host))) + (hash-table-set! *host-loads* hostname h) + h))))) + (best-load 99999) + (curr-time (current-seconds)) + (get-hosts-sorted (lambda (hosts) + (sort hosts (lambda (a b) + (let ((a-rec (get-rec a)) + (b-rec (get-rec b))) + ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec)) + ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec)) + (< (host-last-used a-rec) + (host-last-used b-rec)))))))) + (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts)) + (if (null? hosts) + #f ;; no hosts to select from. All done and giving up now. + (let ((hosts-sorted (get-hosts-sorted hosts))) + (common:update-host-loads-table hosts) + (let loop ((hostname (car hosts-sorted)) + (tal (cdr hosts-sorted)) + (best-host #f)) + (let* ((rec (get-rec hostname)) + (reachable (host-reachable rec)) + (load (host-last-cpuload rec)) + (last-used (host-last-used rec)) + (delta (- curr-time last-used)) + (job-rate (if (> delta 0) + (/ 1 delta) + 999)) ;; jobs per second + (new-best + (cond + ((not reachable) + (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.") + best-host) + ((and (< load maxnload) ;; load is acceptable + (< job-rate maxjobrate)) ;; job rate is acceptable + (set! best-load load) + hostname) + (else best-host)))) + (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." ) + (if new-best + (begin ;; found a host, return it + (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) + (host-last-used-set! rec curr-time) + new-best) + (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) + +;;====================================================================== +;; given path get free space, allows override in [setup] +;; with free-space-script /path/to/some/script.sh +;; +(define (get-df path) + (if (configf:lookup *configdat* "setup" "free-space-script") + (with-input-from-pipe + (conc (configf:lookup *configdat* "setup" "free-space-script") " " path) + (lambda () + (let ((res (read-line))) + (if (string? res) + (string->number res))))) + (get-unix-df path))) + +(define (get-free-inodes path) + (if (configf:lookup *configdat* "setup" "free-inodes-script") + (with-input-from-pipe + (conc (configf:lookup *configdat* "setup" "free-inodes-script") " " path) + (lambda () + (let ((res (read-line))) + (if (string? res) + (string->number res))))) + (get-unix-inodes path))) + +;;====================================================================== +;; check space in dbdir and in megatest dir +;; returns: ok/not dbspace required-space +;; +(define (common:check-db-dir-space) + (let* ((required (string->number + ;; default is 1GB (or actually a billion bytes) This is the number of 1 kB blocks. + (or (configf:lookup *configdat* "setup" "dbdir-space-required") + "1000000"))) + (dbdir (common:make-tmpdir-name *toppath* "")) ;; (db:get-dbdir)) + (tdbspace (common:check-space-in-dir dbdir required)) + (mdbspace (common:check-space-in-dir *toppath* required))) + (sort (list tdbspace mdbspace) (lambda (a b) + (< (cadr a)(cadr b)))))) + +;;====================================================================== +;; check available space in dbdir, exit if insufficient +;; +(define (common:check-db-dir-and-exit-if-insufficient) + (let* ((spacedat (car (common:check-db-dir-space))) ;; look only at worst for now + (is-ok (car spacedat)) + (dbspace (cadr spacedat)) + (required (caddr spacedat)) + (dbdir (cadddr spacedat))) + (if (not is-ok) + (begin + (debug:print-error 0 *default-log-port* "Insufficient space in " dbdir ", require " required ", have " dbspace ", exiting now.") + (exit 1))))) + +;;====================================================================== +;; paths is list of lists ((name path) ... ) +;; +(define (common:get-disk-with-most-free-space disks minsize) + (let* ((best #f) + (bestsize 0) + (default-min-inodes-string "1000000") + (default-min-inodes (string->number default-min-inodes-string)) + (min-inodes (or (string->number (if (configf:lookup *configdat* "setup" "min_inodes") (configf:lookup *configdat* "setup" "min_inodes") default-min-inodes-string)) default-min-inodes))) + + (for-each + (lambda (disk-num) + (let* ((dirpath (cadr (assoc disk-num disks))) + (freespc (cond + ((not (directory? dirpath)) + (if (common:low-noise-print 300 "disks not a dir " disk-num) + (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it.")) + -1) + ((not (file-write-access? dirpath)) + (if (common:low-noise-print 300 "disks not writeable " disk-num) + (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it.")) + -1) + ((not (eq? (string-ref dirpath 0) #\/)) + (if (common:low-noise-print 300 "disks not a proper path " disk-num) + (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it.")) + -1) + (else + (get-df dirpath)))) + (free-inodes (cond + ((not (directory? dirpath)) + (if (common:low-noise-print 300 "disks not a dir " disk-num) + (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a directory - ignoring it.")) + -1) + ((not (file-write-access? dirpath)) + (if (common:low-noise-print 300 "disks not writeable " disk-num) + (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not writeable - ignoring it.")) + -1) + ((not (eq? (string-ref dirpath 0) #\/)) + (if (common:low-noise-print 300 "disks not a proper path " disk-num) + (debug:print 0 *default-log-port* "WARNING: disk " disk-num " at path \"" dirpath "\" is not a fully qualified path - ignoring it.")) + -1) + (else + (get-free-inodes dirpath)))) + ;;(free-inodes (get-free-inodes dirpath)) + ) + (debug:print 2 *default-log-port* "INFO: disk " disk-num " path " dirpath " free space " freespc " free inodes " free-inodes) + (if (and (> freespc bestsize)(> free-inodes min-inodes )) + (begin + (set! best (cons disk-num dirpath)) + (set! bestsize freespc))) + ;;(print "Processing: " disk-num " bestsize: " bestsize " best: " best " freespc: " freespc " min-inodes: " min-inodes " free-inodes: " free-inodes) + )) + (map car disks)) + (if (and best (> bestsize minsize)) + best + #f))) ;; #f means no disk candidate found + +;;====================================================================== +;; T E S T L A U N C H I N G P E R I T E M W I T H H O S T T Y P E S +;;====================================================================== +;; +;; [hosts] +;; arm cubie01 cubie02 +;; x86_64 zeus xena myth01 +;; allhosts #{g hosts arm} #{g hosts x86_64} +;; +;; [host-types] +;; general #MTLOWESTLOAD #{g hosts allhosts} +;; arm #MTLOWESTLOAD #{g hosts arm} +;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo +;; +;; [host-rules] +;; # maxnload => max normalized load +;; # maxnjobs => max jobs per cpu +;; # maxjobrate => max jobs per second +;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 +;; +;; [launchers] +;; envsetup general +;; xor/%/n 4C16G +;; % nbgeneral +;; +;; [jobtools] +;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. +;; flexi-launcher yes +;; launcher nbfake +;; +(define (common:get-launcher configdat testname itempath) + (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) + (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher + (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no"))) + (let* ((launchers (hash-table-ref/default configdat "launchers" '()))) + (if (null? launchers) + fallback-launcher + (let loop ((hed (car launchers)) + (tal (cdr launchers))) + (let ((patt (car hed)) + (host-type (cadr hed))) + (if (tests:match patt testname itempath) + (begin + (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) + (let ((launcher (configf:lookup configdat "host-types" host-type))) + (if launcher + (let* ((launcher-parts (string-split launcher)) + (launcher-exe (car launcher-parts))) + (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline + (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) + (count 100)) + (if targ-host + (conc "remrun " targ-host) + (if (> count 0) + (begin + (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) + (thread-sleep! (- 101 count)) + (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) + (- count 1))) + (begin + (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) + (exit))))) + launcher)) + (begin + (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) + (if (null? tal) + fallback-launcher + (loop (car tal)(cdr tal))))))) + ;; no match, try again + (if (null? tal) + fallback-launcher + (loop (car tal)(cdr tal)))))))) + fallback-launcher))) + +(define (common:get-pkts-dirs mtconf use-lt) + (let* ((pktsdirs-str (or (configf:lookup mtconf "setup" "pktsdirs") + (and use-lt + (conc (or *toppath* + (current-directory)) + "/lt/.pkts")))) + (pktsdirs (if pktsdirs-str + (string-split pktsdirs-str " ") + #f))) + pktsdirs)) + +(define (common:with-queue-db mtconf proc #!key (use-lt #f)(toppath-in #f)) + (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) + (pktsdir (if pktsdirs (car pktsdirs) #f)) + (toppath (or (configf:lookup mtconf "scratchdat" "toppath") + toppath-in)) + (pdbpath (or (configf:lookup mtconf "setup" "pdbpath") pktsdir))) + (cond + ((not (and pktsdir toppath pdbpath)) + (debug:print 0 *default-log-port* "ERROR: settings are missing in your megatest.config for area management.") + (debug:print 0 *default-log-port* " you need to have pktsdirs in the [setup] section.")) + ((not (common:file-exists? pktsdir)) + (debug:print 0 *default-log-port* "ERROR: pkts directory not found " pktsdir)) + ((not (equal? (file-owner pktsdir)(current-effective-user-id))) + (debug:print 0 *default-log-port* "ERROR: directory " pktsdir " is not owned by " (current-effective-user-name))) + (else + (let* ((pdb (open-queue-db pdbpath "pkts.db" + schema: '("CREATE TABLE groups (id INTEGER PRIMARY KEY,groupname TEXT, CONSTRAINT group_constraint UNIQUE (groupname));")))) + (proc pktsdirs pktsdir pdb) + (dbi:close pdb)))))) + +(define (common:load-pkts-to-db mtconf #!key (use-lt #f)) + (common:with-queue-db + mtconf + (lambda (pktsdirs pktsdir pdb) + (for-each + (lambda (pktsdir) ;; look at all + (cond + ((not (common:file-exists? pktsdir)) + (debug:print 0 *default-log-port* "ERROR: packets directory " pktsdir " does not exist.")) + ((not (directory? pktsdir)) + (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not a directory.")) + ((not (file-read-access? pktsdir)) + (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not readable.")) + (else + (debug:print-info 0 *default-log-port* "Loading packets found in " pktsdir) + (let ((pkts (glob (conc pktsdir "/*.pkt"))) + (sqdb (dbi:db-conn pdb)) + ) + ;; Put this in a transaction to avoid issues overloading the db + (sqlite3:with-transaction + sqdb + (lambda () + (for-each + (lambda (pkt) + (let* ((uuid (cadr (string-match ".*/([0-9a-f]+).pkt" pkt))) + (exists (lookup-by-uuid pdb uuid #f))) + (if (not exists) + (let* ((pktdat (string-intersperse + (with-input-from-file pkt read-lines) + "\n")) + (apkt (pkt->alist pktdat)) + (ptype (alist-ref 'T apkt))) + (add-to-queue pdb pktdat uuid (or ptype 'cmd) #f 0) + (debug:print 4 *default-log-port* "Added " uuid " of type " ptype " to queue")) + (debug:print 4 *default-log-port* "pkt: " uuid " exists, skipping...") + ))) + pkts))))))) + pktsdirs)) + use-lt: use-lt)) + +;;====================================================================== +;; D I S K S P A C E +;;====================================================================== + +(define (common:get-disk-space-used fpath) + (with-input-from-pipe (conc "/usr/bin/du -s " fpath) read)) + +(define (get-unix-df path) + (let* ((df-results (process:cmd-run->list (conc "df " path))) + (space-rx (regexp "([0-9]+)\\s+([0-9]+)%")) + (freespc #f)) + ;; (write df-results) + (for-each (lambda (l) + (let ((match (string-search space-rx l))) + (if match + (let ((newval (string->number (cadr match)))) + (if (number? newval) + (set! freespc newval)))))) + (car df-results)) + freespc)) + +(define (get-unix-inodes path) + (let* ((df-results (process:cmd-run->list (conc "df -i " path))) + (space-rx (regexp "([0-9]+)\\s+([0-9]+)%")) + (freenodes 0)) ;; 0 is a better failsafe than #f here. + ;; (write df-results) + (for-each (lambda (l) + (let ((match (string-search space-rx l))) + (if match + (let ((newval (string->number (cadr match)))) + (if (number? newval) + (set! freenodes newval)))))) + (car df-results)) + freenodes)) + +(define (common:check-space-in-dir dirpath required) + (let* ((dbspace (if (directory? dirpath) + (get-df dirpath) + 0))) + (list (> dbspace required) + dbspace + required + dirpath))) + +(define (get-uname . params) + (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) + (uname #f)) + (if (null? (car uname-res)) + "unknown" + (caar uname-res)))) + + +;;====================================================================== +;; faux-lock is deprecated. Please use simple-lock below +;; +(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t)) + (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count + (if (> wait-time 0) + (begin + (thread-sleep! 1) + (if (eq? wait-time 1) ;; only one second left, steal the lock + (begin + (debug:print-info 0 *default-log-port* "stealing lock for " keyname) + (common:faux-unlock keyname force: #t))) + (common:faux-lock keyname wait-time: (- wait-time 1))) + #f) + (begin + (rmt:no-sync-set keyname (conc (current-process-id))) + (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))))) + +(define (common:faux-unlock keyname #!key (force #f)) + (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))) + (begin + (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname)) + #t) + #f)) + +;;====================================================================== +;; simple lock. improve and converge on this one. +;; +(define (common:simple-lock keyname) + (rmt:no-sync-get-lock keyname)) + +(define (common:simple-unlock keyname #!key (force #f)) + (rmt:no-sync-del! keyname)) + +;;====================================================================== +;; use-lt is use linktree "lt" link to find pkts dir +(define (common:save-pkt pktalist-in mtconf use-lt #!key (add-only #f)) ;; add-only saves the pkt only if there is a parent already + (if (or (not add-only) + (hash-table-exists? *pkts-info* 'last-parent)) + (let* ((parent (hash-table-ref/default *pkts-info* 'last-parent #f)) + (pktalist (if parent + (cons `(parent . ,parent) + pktalist-in) + pktalist-in))) + (let-values (((uuid pkt) + (alist->pkt pktalist common:pkts-spec))) + (hash-table-set! *pkts-info* 'last-parent uuid) + (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f) + (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) + (pktsdir (car pktsdirs))) ;; assume it is there + (hash-table-set! *pkts-info* 'pkts-dir pktsdir) + pktsdir)))) + (debug:print 0 *default-log-port* "pktsdir: "pktsdir) + (handle-exceptions + exn + (debug:print-info 0 "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!! + (if (not (file-exists? pktsdir)) + (create-directory pktsdir #t)) + (with-output-to-file + (conc pktsdir "/" uuid ".pkt") + (lambda () + (print pkt))))))))) + +;;====================================================================== +;; Lookup a value in runconfigs based on -reqtarg or -target +;; +(define (runconfigs-get config var) + (let ((targ (common:args-get-target))) ;; (or (args:get-arg "-reqtarg")(args:get-arg "-target")(getenv "MT_TARGET")))) + (if targ + (or (configf:lookup config targ var) + (configf:lookup config "default" var)) + (configf:lookup config "default" var)))) + + +) Index: mt.scm ================================================================== --- mt.scm +++ mt.scm @@ -212,23 +212,26 @@ (state (if newstate newstate (db:test-get-state test-dat))) (status (if newstatus newstatus (db:test-get-status test-dat))) (target (getenv "MT_TARGET")) (runname (getenv "MT_RUNNAME"))) ;; (mutex-lock! *triggers-mutex*) - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* " Exception in mt:process-triggers for run-id="run-id" test-id="test-id" newstate="newstate" newstatus="newstatus - "\n error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn - "\n test-rundir="test-rundir - "\n test-name="test-name - "\n item-path="item-path - "\n state="state - "\n status="status - "\n") - (print-call-chain (current-error-port)) - #f) + ;;;;;; (handle-exceptions + ;;;;;; exn + ;;;;;; (begin + ;;;;;; (debug:print-error 0 *default-log-port* " Exception in mt:process-triggers for run-id="run-id" test-id="test-id" newstate="newstate" newstatus="newstatus + ;;;;;; "\n error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn + ;;;;;; "\n test-rundir="test-rundir + ;;;;;; "\n test-name="test-name + ;;;;;; "\n item-path="item-path + ;;;;;; "\n state="state + ;;;;;; "\n status="status + ;;;;;; "\n") + ;;;;;; (print-call-chain (current-error-port)) + ;;;;;; (with-output-to-port *default-log-port* + ;;;;;; (lambda () + ;;;;;; (print (condition->list exn)))) + ;;;;;; #f) (if (and test-name test-rundir) ;; #f means no dir set yet ;; (common:file-exists? test-rundir) ;; (directory? test-rundir)) (call-with-environment-variables @@ -252,11 +255,11 @@ (list (conc state "/" status) (conc state "/") (conc "/" status))) (pop-directory)) - ))) + )) ;; ) ;; (mutex-unlock! *triggers-mutex*) ))))) ;;====================================================================== ;; S T A T E A N D S T A T U S F O R T E S T S Index: rmtmod.scm ================================================================== --- rmtmod.scm +++ rmtmod.scm @@ -1040,7 +1040,83 @@ ) ) ) ) ) + +;;====================================================================== +;; from metadat lookup MEGATEST_VERSION +;; +(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB + (rmt:get-var "MEGATEST_VERSION")) + +(define (common:get-last-run-version-number) + (string->number + (substring (common:get-last-run-version) 0 6))) + +(define (common:set-last-run-version) + (rmt:set-var "MEGATEST_VERSION" (common:version-signature))) + +;;====================================================================== +;; V E R S I O N +;;====================================================================== + +(define (common:get-full-version) + (conc megatest-version "-" megatest-fossil-hash)) + +(define (common:version-signature) + (conc megatest-version "-" (substring megatest-fossil-hash 0 4))) + +;;====================================================================== +;; postive number if megatest version > db version +;; negative number if megatest version < db version +(define (common:version-db-delta) + (- megatest-version (common:get-last-run-version-number))) + +(define (common:version-changed?) + (not (equal? (common:get-last-run-version) + (common:version-signature)))) + + +;; From 1.70 to 1.80, db's are compatible. + +(define (common:api-changed?) + (let* ( + (megatest-major-version (substring (->string megatest-version) 0 4)) + (run-major-version (substring (conc (common:get-last-run-version)) 0 4)) + ) + (and (not (equal? megatest-major-version "1.80")) + (not (equal? megatest-major-version megatest-run-version))) + ) +) + +;;====================================================================== +;; Move me elsewhere ... +;; RADT => Why do we meed the version check here, this is called only if version misma +;; +(define (common:cleanup-db dbstruct #!key (full #f)) + (case (rmt:transport-mode) + ((http) + (apply db:multi-db-sync + dbstruct + 'schema + 'killservers + 'adj-target + 'new2old + '(dejunk) + )) + ((tcp nfs) + (apply db:multi-db-sync + dbstruct + 'schema + 'killservers + 'adj-target + 'new2old + '(dejunk) + ))) + (if (common:api-changed?) + (common:set-last-run-version))) + + + ) ADDED servermod.scm Index: servermod.scm ================================================================== --- /dev/null +++ servermod.scm @@ -0,0 +1,1217 @@ +;; Copyright 2006-2017, Matthew Welland. +;; +;; This file is part of Megatest. +;; +;; Megatest is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. +;; +;; Megatest is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with Megatest. If not, see . +;; + +(declare (unit servermod)) + +(declare (uses commonmod)) +(declare (uses configfmod)) +(declare (uses debugprint)) +(declare (uses mtargs)) + +(module servermod + * + +(use (srfi 18) extras s11n) +(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) +(use directory-utils posix-extras matchable utils) +(use spiffy uri-common intarweb http-client spiffy-request-vars) + +(import commonmod + configfmod + debugprint + (prefix mtargs args:)) + +(include "common_records.scm") +(include "db_records.scm") + +(define (server:make-server-url hostport) + (if (not hostport) + #f + (conc "http://" (car hostport) ":" (cadr hostport)))) + +(define *server-loop-heart-beat* (current-seconds)) + +;;====================================================================== +;; P K T S S T U F F +;;====================================================================== + +;; ??? + +;;====================================================================== +;; P K T S S T U F F +;;====================================================================== + +;; ??? + +;;====================================================================== +;; S E R V E R +;;====================================================================== + +;; Call this to start the actual server +;; + +;;====================================================================== +;; S E R V E R U T I L I T I E S +;;====================================================================== + +;; Get the transport +(define (server:get-transport) + (if *transport-type* + *transport-type* + (let ((ttype (string->symbol + (or (args:get-arg "-transport") + (configf:lookup *configdat* "server" "transport") + "rpc")))) + (set! *transport-type* ttype) + ttype))) + +;; Generate a unique signature for this server +(define (server:mk-signature) + (message-digest-string (md5-primitive) + (with-output-to-string + (lambda () + (write (list (current-directory) + (current-process-id) + (argv))))))) + +(define (server:get-client-signature) + (if *my-client-signature* *my-client-signature* + (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic + (set! *my-client-signature* sig) + *my-client-signature*))) + +(define (server:get-server-id) + (if *server-id* *server-id* + (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic + (set! *server-id* sig) + *server-id*))) + +;; ;; When using zmq this would send the message back (two step process) +;; ;; with spiffy or rpc this simply returns the return data to be returned +;; ;; +;; (define (server:reply return-addr query-sig success/fail result) +;; (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result) +;; ;; (send-message pubsock target send-more: #t) +;; ;; (send-message pubsock +;; (case (server:get-transport) +;; ((rpc) (db:obj->string (vector success/fail query-sig result))) +;; ((http) (db:obj->string (vector success/fail query-sig result))) +;; ((fs) result) +;; (else +;; (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) +;; result))) + +;; Given an area path, start a server process ### NOTE ### > file 2>&1 +;; if the target-host is set +;; try running on that host +;; incidental: rotate logs in logs/ dir. +;; +(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area + (let* ((testsuite (common:get-testsuite-name)) + (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) + (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") + "")) + (cmdln (conc (common:get-megatest-exe) + " -server - ";; (or target-host "-") + (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") + " -daemonize " + "") + ;; " -log " logfile + " -m testsuite:" testsuite + " " profile-mode + )) ;; (conc " >> " logfile " 2>&1 &"))))) + (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? + (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) + ;; we want the remote server to start in *toppath* so push there + (push-directory areapath) + (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") + (thread-start! log-rotate) + + ;; host.domain.tld match host? + ;; (if (and target-host + ;; ;; look at target host, is it host.domain.tld or ip address and does it + ;; ;; match current ip or hostname + ;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) + ;; (not (equal? curr-ip target-host))) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) + ;; (setenv "TARGETHOST" target-host))) + ;; + (setenv "TARGETHOST_LOGF" logfile) + (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time + (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) + (system (conc "nbfake " cmdln)) + (unsetenv "TARGETHOST_LOGF") + ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) + (thread-join! log-rotate) + (pop-directory))) + +;; given a path to a server log return: host port startseconds server-id +;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let +;; example of what it's looking for in the log file: +;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 + +(define (server:logf-get-start-info logf) + (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id + (dbprep-rx (regexp "^SERVER: dbprep")) + (dbprep-found 0) + (bad-dat (list #f #f #f #f #f))) + (handle-exceptions + exn + (begin + ;; WARNING: this is potentially dangerous to blanket ignore the errors + (if (file-exists? logf) + (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn)) + bad-dat) ;; no idea what went wrong, call it a bad server + (with-input-from-file + logf + (lambda () + (let loop ((inl (read-line)) + (lnum 0)) + (if (not (eof-object? inl)) + (let ((mlst (string-match server-rx inl)) + (dbprep (string-match dbprep-rx inl))) + (if dbprep (set! dbprep-found 1)) + (if (not mlst) + (if (< lnum 500) ;; give up if more than 500 lines of server log read + (loop (read-line)(+ lnum 1)) + (begin + (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) + bad-dat)) + (match mlst + ((_ host port start server-id pid) + (list host + (string->number port) + (string->number start) + server-id + (string->number pid))) + (else + (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) + bad-dat)))) + (begin + (if dbprep-found + (begin + (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) + (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? + (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) + bad-dat)))))))) + +;; ;; get a list of servers from the log files, with all relevant data +;; ;; ( mod-time host port start-time pid ) +;; ;; +;; (define (server:get-list areapath #!key (limit #f)) +;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) +;; (day-seconds (* 24 60 60))) +;; ;; if the directory exists continue to get the list +;; ;; otherwise attempt to create the logs dir and then +;; ;; continue +;; (if (if (directory-exists? (conc areapath "/logs")) +;; '() +;; (if (file-write-access? areapath) +;; (begin +;; (condition-case +;; (create-directory (conc areapath "/logs") #t) +;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) +;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) +;; (directory-exists? (conc areapath "/logs"))) +;; '())) +;; +;; ;; Get the list of server logs. +;; (let* ( +;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. +;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) +;; (server-logs (glob (conc areapath "/logs/server-*-*.log"))) +;; (num-serv-logs (length server-logs))) +;; (if (or (null? server-logs) (= num-serv-logs 0)) +;; (let () +;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) +;; '() +;; ) +;; (let loop ((hed (string-chomp (car server-logs))) +;; (tal (cdr server-logs)) +;; (res '())) +;; (let* ((mod-time (handle-exceptions +;; exn +;; (begin +;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) +;; (current-seconds)) ;; 0 +;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted +;; (down-time (- (current-seconds) mod-time)) +;; (serv-dat (if (or (< num-serv-logs 10) +;; (< down-time 900)) ;; day-seconds)) +;; (server:logf-get-start-info hed) +;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at +;; (serv-rec (cons mod-time serv-dat)) +;; (fmatch (string-match fname-rx hed)) +;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) +;; (new-res (if (null? serv-dat) +;; res +;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let +;; (if (null? tal) +;; (if (and limit +;; (> (length new-res) limit)) +;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work +;; new-res) +;; (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) + +#;(define (server:get-num-alive srvlst) + (let ((num-alive 0)) + (for-each + (lambda (server) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn)) + (match-let (((mod-time host port start-time server-id pid) + server)) + (let* ((uptime (- (current-seconds) mod-time)) + (runtime (if start-time + (- mod-time start-time) + 0))) + (if (< uptime 5)(set! num-alive (+ num-alive 1))))))) + srvlst) + num-alive)) + +;; ;; given a list of servers get a list of valid servers, i.e. at least +;; ;; 10 seconds old, has started and is less than 1 hour old and is +;; ;; active (i.e. mod-time < 10 seconds +;; ;; +;; ;; mod-time host port start-time pid +;; ;; +;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off +;; ;; and servers should stick around for about two hours or so. +;; ;; +;; (define (server:get-best srvlst) +;; (let* ((nums (server:get-num-servers)) +;; (now (current-seconds)) +;; (slst (sort +;; (filter (lambda (rec) +;; (if (and (list? rec) +;; (> (length rec) 2)) +;; (let ((start-time (list-ref rec 3)) +;; (mod-time (list-ref rec 0))) +;; ;; (print "start-time: " start-time " mod-time: " mod-time) +;; (and start-time mod-time +;; (> (- now start-time) 0) ;; been running at least 0 seconds +;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds +;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set +;; (< (- now start-time) +;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) +;; 180) +;; (random 360)))) ;; under one hour running time +/- 180 +;; )) +;; #f)) +;; srvlst) +;; (lambda (a b) +;; (< (list-ref a 3) +;; (list-ref b 3)))))) +;; (if (> (length slst) nums) +;; (take slst nums) +;; slst))) + +;; ;; switch from server:get-list to server:get-servers-info +;; ;; +;; (define (server:get-first-best areapath) +;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; (if (and srvrs +;; (not (null? srvrs))) +;; (car srvrs) +;; #f))) +;; +;; (define (server:get-rand-best areapath) +;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; (if (and (list? srvrs) +;; (not (null? srvrs))) +;; (let* ((len (length srvrs)) +;; (idx (random len))) +;; (list-ref srvrs idx)) +;; #f))) + +(define (server:record->id servr) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn) + #f) + (match-let (((host port start-time server-id pid) + servr)) + (if server-id + server-id + #f)))) + +(define (server:record->url servr) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn) + #f) + (match-let (((host port start-time server-id pid) + servr)) + (if (and host port) + (conc host ":" port) + #f)))) + + +;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough. +;; if it is old enough, overwrite it and wait 0.25 seconds. +;; if it then has the wrong server key, wait + 1 and call this function recursively. +;; +#;(define (server:wait-for-server-start-last-flag areapath) + (let* ((start-flag (conc areapath "/logs/server-start-last")) + ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) + (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) + (server-key (conc (get-host-name) "-" (current-process-id)))) + (if (file-exists? start-flag) + (let* ((fmodtime (file-modification-time start-flag)) + (delta (- (current-seconds) fmodtime)) + (old-enough (> delta idletime)) + (new-server-key "")) + ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than seconds, and the new file still has the same server key as you just wrote, return #t. + ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process. + (if (and old-enough + (begin + (debug:print-info 2 *default-log-port* "Writing " start-flag) + (with-output-to-file start-flag (lambda () (print server-key))) + (thread-sleep! 0.25) + (set! new-server-key (with-input-from-file start-flag (lambda () (read-line)))) + (equal? server-key new-server-key))) + #t + ;; If either of the above conditions is not true, print a "Gating server start" message, wait + 1, then call this function recursively. + (begin + (debug:print-info 0 *default-log-port* "Gating server start, last start: " + (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) + + (thread-sleep! ( + 1 idletime)) + (server:wait-for-server-start-last-flag areapath))))))) + +;; oldest server alive determines host then choose random of youngest +;; five servers on that host +;; +(define (server:get-servers-info areapath) + ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") + (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo"))) + (if (not (file-exists? servinfodir)) + (create-directory servinfodir)) + (let* ((allfiles (glob (conc servinfodir"/*"))) + (res (make-hash-table))) + (for-each + (lambda (f) + (let* ((hostport (pathname-strip-directory f)) + (serverdat (server:logf-get-start-info f))) + (match serverdat + ((host port start server-id pid) + (if (and host port start server-id pid) + (hash-table-set! res hostport serverdat) + (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))) + (else + (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))))) + allfiles) + res))) + +;; check the .servinfo directory, are there other servers running on this +;; or another host? +;; +;; returns #t => ok to start another server +;; #f => not ok to start another server +;; +(define (server:minimal-check areapath) + (server:clean-up-old areapath) + (let* ((srvdir (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo")) + (servrs (glob (conc srvdir"/*"))) + (thishostip (server:get-best-guess-address (get-host-name))) + (thisservrs (glob (conc srvdir"/"thishostip":*"))) + (homehostinf (server:choose-server areapath 'homehost)) + (havehome (car homehostinf)) + (wearehome (cdr homehostinf))) + (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome + ", numservers: "(length thisservrs)) + (cond + ((not havehome) #t) ;; no homehost yet, go for it + ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another + ((and havehome (not wearehome)) #f) ;; we are not the home host + ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running + (else + (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs) + #t)))) + + +(define server-last-start 0) + + +;; oldest server alive determines host then choose random of youngest +;; five servers on that host +;; +;; mode: +;; best - get best server (random of newest five) +;; home - get home host based on oldest server +;; info - print info +(define (server:choose-server areapath #!optional (mode 'best)) + ;; age is current-starttime + ;; find oldest alive + ;; 1. sort by age ascending and ping until good + ;; find alive rand from youngest + ;; 1. sort by age descending + ;; 2. take five + ;; 3. check alive, discard if not and repeat + ;; first we clean up old server files + (assert (eq? (rmt:transport-mode) 'http) "FATAL: server:run called with rmt:transport-mode="(rmt:transport-mode)) + (server:clean-up-old areapath) + (let* ((since-last (- (current-seconds) server-last-start)) + (server-start-delay 10)) + (if ( < (- (current-seconds) server-last-start) 10 ) + (begin + (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) + (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") + (thread-sleep! server-start-delay) + ) + (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) + ) + ) + (let* ((serversdat (server:get-servers-info areapath)) + (servkeys (hash-table-keys serversdat)) + (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last + (sort servkeys ;; list of "host:port" + (lambda (a b) + (>= (list-ref (hash-table-ref serversdat a) 2) + (list-ref (hash-table-ref serversdat b) 2)))) + '()))) + (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat) + (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys) + (if (not (null? by-time-asc)) + (let* ((oldest (last by-time-asc)) + (oldest-dat (hash-table-ref serversdat oldest)) + (host (list-ref oldest-dat 0)) + (all-valid (filter (lambda (x) + (equal? host (list-ref (hash-table-ref serversdat x) 0))) + by-time-asc)) + (best-ten (lambda () + (if (> (length all-valid) 11) + (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out + (if (> (length all-valid) 8) + (drop-right all-valid 1) + all-valid)))) + (names->dats (lambda (names) + (map (lambda (x) + (hash-table-ref serversdat x)) + names))) + (am-home? (lambda () + (let* ((currhost (get-host-name)) + (bestadrs (server:get-best-guess-address currhost))) + (or (equal? host currhost) + (equal? host bestadrs)))))) + (case mode + ((info) + (debug:print 0 *default-log-port* "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) + (debug:print 0 *default-log-port* "youngest: "(hash-table-ref serversdat (car all-valid)))) + ((home) host) + ((homehost) (cons host (am-home?))) ;; shut up old code + ((home?) (am-home?)) + ((best-ten)(names->dats (best-ten))) + ((all-valid)(names->dats all-valid)) + ((best) (let* ((best-ten (best-ten)) + (len (length best-ten))) + (hash-table-ref serversdat (list-ref best-ten (random len))))) + ((count)(length all-valid)) + (else + (debug:print 0 *default-log-port* "ERROR: invalid command "mode) + #f))) + (begin + (server:run areapath) + (set! server-last-start (current-seconds)) + ;; (thread-sleep! 3) + (case mode + ((homehost) (cons #f #f)) + (else #f)))))) + +(define (server:get-servinfo-dir areapath) + (let* ((spath (conc areapath"/.servinfo"))) + (if (not (file-exists? spath)) + (create-directory spath #t)) + spath)) + +(define (server:clean-up-old areapath) + ;; any server file that has not been touched in ten minutes is effectively dead + (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*")))) + (for-each + (lambda (sfile) + (let* ((modtime (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile) + (current-seconds)) + (file-modification-time sfile)))) + (if (and (number? modtime) + (> (- (current-seconds) modtime) + 600)) + (begin + (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.") + (handle-exceptions + exn + (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile) + (delete-file sfile)))))) + sfiles))) + +;; would like to eventually get rid of this +;; +(define (common:on-homehost?) + (if (eq? (rmt:transport-mode) 'http) + (server:choose-server *toppath* 'home?) + #t)) ;; there is no homehost for tcp and nfs is always on home so #t should work + +;; kind start up of server, wait before allowing another server for a given +;; area to be launched +;; +(define (server:kind-run areapath) + ;; look for $MT_RUN_AREA_HOME/logs/server-start-last + ;; and wait for it to be at least seconds old + ;; (server:wait-for-server-start-last-flag areapath) + (let loop () + (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2) + (begin + (if (common:low-noise-print 30 "our-host-load") + (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server.")) + (loop)))) + (if (< (server:choose-server areapath 'count) 20) + (server:run areapath)) + #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? + (let* ((lock-file (conc areapath "/logs/server-start.lock"))) + (let* ((start-flag (conc areapath "/logs/server-start-last"))) + (common:simple-file-lock-and-wait lock-file expire-time: 25) + (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) + (system (conc "touch " start-flag)) ;; lazy but safe + (server:run areapath) + (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". + (common:simple-file-release-lock lock-file))) + (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another."))) + +;; this one seems to be the general entry point +;; +(define (server:start-and-wait areapath #!key (timeout 60)) + (let ((give-up-time (+ (current-seconds) timeout))) + (let loop ((server-info (server:check-if-running areapath)) + (try-num 0)) + (if (or server-info + (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. + (server:record->url server-info) + (let* ( (servers (server:choose-server areapath 'all-valid)) + (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0))) + (if (and (> try-num 0) ;; first time through simply wait a little while then try again + (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one + (server:run areapath)) + (thread-sleep! 5) + (loop (server:check-if-running areapath) + (+ try-num 1))))))) + +(define (server:get-num-servers #!key (numservers 2)) + (let ((ns (string->number + (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) + (or ns numservers))) + +;; no longer care if multiple servers are started by accident. older servers will drop off in time. +;; +(define (server:check-if-running areapath) ;; #!key (numservers "2")) + (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed + (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath)))) + (if (or (and servers + (null? servers)) + (not servers)) + ;; (and (list? servers) + ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers + #f + (let loop ((hed (car servers)) + (tal (cdr servers))) + (let ((res (server:check-server hed))) + (if res + hed + (if (null? tal) + #f + (loop (car tal)(cdr tal))))))))) + +;; ping the given server +;; +(define (server:check-server server-record) + (let* ((server-url (server:record->url server-record)) + (server-id (server:record->id server-record)) + (res (server:ping server-url server-id))) + (if res + server-url + #f))) + +(define (server:kill servr) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn) + #f) + (match-let (((hostname port start-time server-id pid) + servr)) + (tasks:kill-server hostname pid)))) + +;; ;; called in megatest.scm, host-port is string hostname:port +;; ;; +;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running +;; ;; in the same process as the server. +;; ;; +;; (define (server:ping host:port server-id #!key (do-exit #f)) +;; (let* ((host-port (cond +;; ((string? host:port) +;; (let ((slst (string-split host:port ":"))) +;; (if (eq? (length slst) 2) +;; (list (car slst)(string->number (cadr slst))) +;; #f))) +;; (else +;; #f)))) +;; (cond +;; ((and (list? host-port) +;; (eq? (length host-port) 2)) +;; (let* ((myrunremote (make-and-init-remote *toppath*)) +;; (iface (car host-port)) +;; (port (cadr host-port)) +;; (server-dat (client:connect iface port server-id myrunremote)) +;; (login-res (rmt:login-no-auto-client-setup myrunremote))) +;; (http-transport:close-connections myrunremote) +;; (if (and (list? login-res) +;; (car login-res)) +;; (begin +;; ;; (print "LOGIN_OK") +;; (if do-exit (exit 0)) +;; #t) +;; (begin +;; ;; (print "LOGIN_FAILED") +;; (if do-exit (exit 1)) +;; #f)))) +;; (else +;; (if host:port +;; (debug:print 0 *default-log-port* "ERROR: bad host:port "host:port)) +;; (if do-exit +;; (exit 1) +;; #f))))) +;; +;; ;; run ping in separate process, safest way in some cases +;; ;; +;; (define (server:ping-server ifaceport) +;; (with-input-from-pipe +;; (conc (common:get-megatest-exe) " -ping " ifaceport) +;; (lambda () +;; (let loop ((inl (read-line)) +;; (res "NOREPLY")) +;; (if (eof-object? inl) +;; (case (string->symbol res) +;; ((NOREPLY) #f) +;; ((LOGIN_OK) #t) +;; (else #f)) +;; (loop (read-line) inl)))))) +;; +;; ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). +;; ;; +;; (define (server:login toppath) +;; (lambda (toppath) +;; (set! *db-last-access* (current-seconds)) ;; might not be needed. +;; (if (equal? *toppath* toppath) +;; #t +;; #f))) + +;; timeout is hms string: 1h 5m 3s, default is 1 minute +;; This is currently broken. Just use the number of hours with no unit. +;; Default is 600 seconds. +;; +(define (server:expiration-timeout) + (let* ((tmo (configf:lookup *configdat* "server" "timeout"))) + (if (string? tmo) + (let* ((num (string->number tmo))) + (if num + (* 3600 num) + (common:hms-string->seconds tmo))) + 600 ;; this is the default + ))) + +(define (server:get-best-guess-address hostname) + (let ((res #f)) + (for-each + (lambda (adr) + (if (not (eq? (u8vector-ref adr 0) 127)) + (set! res adr))) + ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME + (vector->list (hostinfo-addresses (hostname->hostinfo hostname)))) + (string-intersperse + (map number->string + (u8vector->list + (if res res (hostname->ip hostname)))) "."))) + +;; moving this here as it needs access to db and cannot be in common. +;; + +(define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f)) + (debug:print "WARNING: bruteforce-syncer is called but has been disabled!") + (lambda () + (debug:print "WARNING: bruteforce-syncer is called but has been disabled!"))) + +;; +(defstruct remote + + ;; transport to be used + ;; http - use http-transport + ;; http-read-cached - use http-transport for writes but in-mem cached for reads + (rmode 'http) + (hh-dat (let ((res (or (server:choose-server *toppath* 'homehost) + (cons #f #f)))) + (assert (pair? res)(conc "FATAL: hh-dat should be a pair, got "res)) + res)) + (server-url #f) ;; (server:check-if-running *toppath*) #f)) + (server-id #f) + (server-info #f) ;; (if *toppath* (server:check-if-running *toppath*) #f)) + (last-server-check 0) ;; last time we checked to see if the server was alive + (connect-time (current-seconds)) ;; when we first connected + (last-access (current-seconds)) ;; last time we talked to server + ;; (conndat #f) ;; iface port api-uri api-url api-req seconds server-id + (server-timeout (server:expiration-timeout)) + (force-server #f) + (ro-mode #f) + (ro-mode-checked #f) ;; flag that indicates we have checked for ro-mode + + ;; conndat stuff + (iface #f) ;; TODO: Consolidate this data with server-url and server-info above + (port #f) + (api-url #f) + (api-uri #f) + (api-req #f)) + +;;====================================================================== +;; Rotate logs, logic: +;; if > 500k and older than 1 week: +;; remove previous compressed log and compress this log +;; WARNING: This proc operates assuming that it is in the directory above the +;; logs directory you wish to log-rotate. +;; +(define (common:rotate-logs) + (let* ((all-files (make-hash-table)) + (stats (make-hash-table)) + (inc-stat (lambda (key) + (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1)))) + (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "600")))) ;; name -> age + (if (not (directory-exists? "logs"))(create-directory "logs")) + (directory-fold + (lambda (file rem) + (handle-exceptions + exn + (begin + (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore. exn=" exn) + (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + ;; (print-call-chain (current-error-port)) ;; + ) + (let* ((fullname (conc "logs/" file)) + (mod-time (file-modification-time fullname)) + (file-age (- (current-seconds) mod-time)) + (file-old (> file-age (* 48 60 60))) + (file-big (> (file-size fullname) 200000))) + (hash-table-set! all-files file mod-time) + (if (or (and (string-match "^.*.log" file) + file-old + file-big) + (and (string-match "^server-.*.log" file) + file-old)) + (let ((gzfile (conc fullname ".gz"))) + (if (common:file-exists? gzfile) + (begin + (debug:print-info 0 *default-log-port* "removing " gzfile) + (delete-file* gzfile) + (hash-table-delete! all-files gzfile) ;; needed? + )) + (debug:print-info 0 *default-log-port* "compressing " file) + (system (conc "gzip " fullname)) + (inc-stat "gzipped") + (hash-table-set! all-files (conc file ".gz") file-age) ;; add the .gz file and remove the base file + (hash-table-delete! all-files file) + ) + (if (and (> file-age (* (string->number (or (configf:lookup *configdat* "setup" "log-expire-days") "30")) 24 3600)) + (file-exists? fullname)) ;; just in case it was gzipped - will get it next time + (handle-exceptions + exn + #f + (if (directory? fullname) + (begin + (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") + (inc-stat "directories")) + (begin + (delete-file* fullname) + (inc-stat "deleted"))) + (hash-table-delete! all-files file))))))) + '() + "logs") + (for-each + (lambda (category) + (let ((quant (hash-table-ref/default stats category 0))) + (if (> quant 0) + (debug:print-info 0 *default-log-port* category " log files: " quant)))) + `("deleted" "gzipped" "directories")) + (let ((num-logs (hash-table-size all-files))) + (if (> num-logs max-allowed) ;; because NFS => don't let number of logs exceed 300 + (let ((files (take (sort (hash-table-keys all-files) + (lambda (a b) + (< (hash-table-ref all-files a)(hash-table-ref all-files b)))) + (- num-logs max-allowed)))) + (for-each + (lambda (file) + (let* ((fullname (conc "logs/" file))) + (if (directory? fullname) + (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") + (handle-exceptions + exn + (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn) + (delete-file* fullname))))) + files) + (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files.")))))) + +(define (common:get-testsuite-name) + (or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description. + (configf:lookup *configdat* "setup" "testsuite" ) + (getenv "MT_TESTSUITE_NAME") + (pathname-file (or (if (string? *toppath* ) + (pathname-file *toppath*) + #f) + (common:get-toppath #f))) + "please-set-setup-area-name")) ;; (pathname-file (current-directory))))) + +(define (common:wait-for-homehost-load maxnormload msg) + (let loop ((start-time (current-seconds))) ;; we saw some instances of this being called before *toppath* was set. This might be an early setup race. This delay should help but it is impossible to test... + (if (not *toppath*) + (begin + (debug:print 0 *default-log-port* "ERROR: common:wait-for-homehost-load called before *toppath* set.") + (thread-sleep! 30) + (if (< (- (current-seconds) start-time) 300) + (loop start-time))))) + (case (rmt:transport-mode) + ((http) + (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. + #f + (server:choose-server *toppath* 'homehost))) + (hh (if hh-dat (car hh-dat) #f))) + (common:wait-for-normalized-load maxnormload msg hh))) + (else + (common:wait-for-normalized-load maxnormload msg (get-host-name))))) + +;;====================================================================== +;; Force a megatest cleanup-db if version is changed and skip-version-check not specified +;; Do NOT check if not on homehost! +;; +(define (common:exit-on-version-changed) + (if (and *toppath* ;; do nothing if *toppath* not yet provided + (common:on-homehost?)) + (if (common:api-changed?) + (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) + (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") ".mtdb/main.db")) + (read-only (not (file-write-access? dbfile))) + (dbstruct (db:setup))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;; #t))) + (debug:print 0 *default-log-port* + "WARNING: Version mismatch!\n" + " expected: " (common:version-signature) "\n" + " got: " (common:get-last-run-version)) + (cond + ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t) + ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only) + (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db + (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (print-call-chain (current-error-port)) + (exit 1)) + (common:cleanup-db dbstruct))) + ((not (common:file-exists? mtconf)) + (debug:print 0 *default-log-port* " megatest.config does not exist in this area. Cannot proceed with megatest version migration.") + (exit 1)) + ((not (common:file-exists? dbfile)) + (debug:print 0 *default-log-port* " .mtdb/main.db does not exist in this area. Cannot proceed with megatest version migration.") + (exit 1)) + ((not (eq? (current-user-id)(file-owner mtconf))) + (debug:print 0 *default-log-port* " You do not own .mtdb/main.db in this area. Cannot proceed with megatest version migration.") + (exit 1)) + (read-only + (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") + (exit 1)) + (else + (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"") + (exit 1))))))) +;;====================================================================== +;; (begin +;; (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") +;; (exit 1)))) + +;;====================================================================== +;; E X I T H A N D L I N G +;;====================================================================== + +(define (common:run-sync?) + (and *toppath* ;; gate if called before *toppath* is set + (common:on-homehost?) + (args:get-arg "-server"))) + + +(define (std-signal-handler signum) + ;; (signal-mask! signum) + (set! *time-to-exit* #t) + ;;(debug:print-info 13 *default-log-port* "got signal "signum) + (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly") + ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway + (exit)) + +(define (special-signal-handler signum) + ;; (signal-mask! signum) + (set! *time-to-exit* #t) + ;;(debug:print-info 13 *default-log-port* "got signal "signum) + (debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting!!") + ;;TODO send email to notify admin contact listed in the config that the lisner got killed + ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway + (exit)) + + +(set-signal-handler! signal/int std-signal-handler) ;; ^C +(set-signal-handler! signal/term std-signal-handler) +;;====================================================================== +;; ideally put all this info into the db, no need to preserve it across moving homehost +;; +;; return list of +;; ( reachable? cpuload update-time ) +(define (common:get-host-info hostname) + (let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data + (load (car loadinfo)) + (load-sample-time (cdr loadinfo)) + (load-sample-age (- (current-seconds) load-sample-time)) + (loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds + (host-last-update-timeout-seconds 4) + (host-rec (hash-table-ref/default *host-loads* hostname #f)) + ) + (cond + ((< load-sample-age loadinfo-timeout-seconds) + (list #t + load-sample-time + load)) + ((and host-rec + (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds))) + (list #t + (host-last-update host-rec) + (host-last-cpuload host-rec ))) + ((common:unix-ping hostname) + (list #t + (current-seconds) + (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds + (else + (list #f 0 -1) ;; bad host, don't use! + )))) + +;;====================================================================== +;; see defstruct host at top of file. +;; host: reachable last-update last-used last-cpuload +;; +(define (common:update-host-loads-table hosts-raw) + (let* ((hosts (filter (lambda (x) + (string-match (regexp "^\\S+$") x)) + hosts-raw))) + (for-each + (lambda (hostname) + (let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f))) + (if h + h + (let ((h (make-host))) + (hash-table-set! *host-loads* hostname h) + h)))) + (host-info (common:get-host-info hostname)) + (is-reachable (car host-info)) + (last-reached-time (cadr host-info)) + (load (caddr host-info))) + (host-reachable-set! rec is-reachable) + (host-last-update-set! rec last-reached-time) + (host-last-cpuload-set! rec load))) + hosts))) + +;;====================================================================== +;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load +;; count - count down to zero, at some point we'd give up if the load never drops +;; num-tries - count down to zero number tries to get numcpus +;; +(define (common:wait-for-cpuload maxnormload numcpus-in + #!key (count 1000) + (msg #f)(remote-host #f)(num-tries 5)) + (let* ((loadavg (common:get-cpu-load remote-host)) + ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again + (numcpus (if (<= 1 numcpus-in) + (common:get-num-cpus remote-host) numcpus-in)) + (first (car loadavg)) + (next (cadr loadavg)) + (adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude + ;; fallback is to at least use 1 + ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit + ;; etc. + (effective-load (common:get-intercept first next)) + (recommended-delay (common:get-delay effective-load numcpus)) + (effective-host (or remote-host "localhost")) + (normalized-effective-load (/ effective-load numcpus)) + (will-wait (> normalized-effective-load maxnormload))) + (if (and will-wait (> recommended-delay 1)) + (let* ((actual-delay (min recommended-delay 30))) + (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) + (debug:print-info 0 *default-log-port* "Load control, delaying " + actual-delay " seconds to maintain safe load. current normalized effective load is " + normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load)) + (thread-sleep! actual-delay))) + + (cond + ;; bad data, try again to get the data + ((not will-wait) + (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host)) + (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing."))) + + ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable + (> num-tries 0)) + (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " + first ", we'll sleep 10s and try " num-tries " more times.") + (thread-sleep! 10) + (common:wait-for-cpuload maxnormload numcpus-in + count: count remote-host: remote-host num-tries: (- num-tries 1))) + + ;; need to wait for load to drop + ((and will-wait ;; (> first adjmaxload) + (> count 0)) + (debug:print-info 0 *default-log-port* + "Delaying 15" ;; adjwait + " seconds due to normalized effective load " normalized-effective-load ;; first + " exceeding max of " adjmaxload + " on server " (or remote-host (get-host-name)) + " (normalized load-limit: " maxnormload ") " (if msg msg "")) + (thread-sleep! 15) ;; adjwait) + (common:wait-for-cpuload maxnormload numcpus count: (- count 1) msg: msg remote-host: remote-host) + ;; put the message here to indicate came out of waiting + (debug:print-info 1 *default-log-port* + "On host: " effective-host + ", effective load: " effective-load + ", numcpus: " numcpus + ", normalized effective load: " normalized-effective-load + )) + ;; overloaded and count expired (i.e. went to zero) + (else + (if (> num-tries 0) ;; should be "num-tries-left". + (if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host)) + (debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of " + effective-normalized-load " continuing.")) + (debug:print 0 *default-log-port* "Load on " effective-host ", " + first" could not be retrieved. Giving up and continuing.")))))) + +;;====================================================================== +;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load +;; +;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5)) +;; (let* ((loadavg (common:get-cpu-load remote-host)) +;; (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again +;; (common:get-num-cpus remote-host) +;; numcpus-in)) +;; (maxload (if force-maxload +;; maxload-in +;; (if (number? maxload-in) +;; (max maxload-in 0.5) +;; 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? +;; (first (car loadavg)) +;; (next (cadr loadavg)) +;; (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where +;; ;; numcpus (or could be +;; ;; maxload) is zero, +;; ;; crude fallback is to +;; ;; at least use 1 +;; (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next? +;; 0 +;; next))) ;; we will force a conservative calculation any time next is large. +;; (first-next-avg (/ (+ first next) 2)) +;; ;; add some randomness to the time to break any alignment +;; ;; where netbatch dumps many jobs to machines simultaneously +;; (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10) +;; (/ (- 1000 count) 10) +;; waitdelay) +;; (- first adjmaxload) )))) +;; (load-jump-limit (configf:lookup-number *configdat* "setup" "load-jump-limit")) +;; ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit +;; ;; etc. +;; (effective-load (common:get-intercept first next)) +;; (effective-host (or remote-host "localhost")) +;; (normalized-effective-load (/ effective-load numcpus)) +;; (will-wait (> normalized-effective-load maxload))) +;; +;; ;; let's let the user know once in a long while that load checking +;; ;; is happening but not constantly report it +;; #;(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time +;; (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload +;; ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp)) +;; +;; (debug:print-info 1 *default-log-port* +;; "On host: " effective-host +;; ", effective load: " effective-load +;; ", numcpus: " numcpus +;; ", normalized effective load: " normalized-effective-load +;; ) +;; +;; (cond +;; ;; bad data, try again to get the data +;; ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable +;; (> num-tries 0)) +;; (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.") +;; (thread-sleep! 10) +;; (common:wait-for-cpuload maxload-in numcpus-in waitdelay +;; count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1))) +;; ;; need to wait for load to drop +;; ((and will-wait ;; (> first adjmaxload) +;; (> count 0)) +;; (debug:print-info 0 *default-log-port* +;; "Delaying " 15 ;; adjwait +;; " seconds due to normalized effective load " normalized-effective-load ;; first +;; " exceeding max of " adjmaxload +;; " on server " (or remote-host (get-host-name)) +;; " (normalized load-limit: " maxload ") " (if msg msg "")) +;; (thread-sleep! 15) ;; adjwait) +;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) +;; ((and (> loadjmp (cond +;; (load-jump-limit load-jump-limit) +;; ((> numcpus 8)(/ numcpus 2)) +;; ((> numcpus 4)(/ numcpus 1.2)) +;; (else 0.5))) +;; (> count 0)) +;; (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". " +;; (if msg msg "")) +;; (thread-sleep! adjwait) +;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) +;; (else +;; (if (> num-tries 0) +;; (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost"))) +;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing.")) +;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing.")))))) +;; + +;;====================================================================== +;; wait for normalized cpu load to drop below maxload +;; +(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5)) + (let ((num-cpus (common:get-num-cpus remote-host))) + (if num-cpus + (common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host) + (begin + (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again + (if (> rem-tries 0) + (common:wait-for-normalized-load maxnormload msg remote-host (- rem-tries 1)) + #f))))) + + +)