Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -32,10 +32,13 @@ archive.scm env.scm diff-report.scm cgisetup/models/pgdb.scm # module source files MSRCFILES = ftail.scm portlogger.scm nmsg-transport.scm +# files needed for mtserve +MTSERVEFILES = common.scm megatest-version.scm margs.scm + # Eggs to install (straightforward ones) EGGS=matchable readline apropos base64 regex-literals format regex-case test coops trace csv \ dot-locking posix-utils posix-extras directory-utils hostinfo tcp-server rpc csv-xml fmt \ json md5 awful http-client spiffy uri-common intarweb spiffy-request-vars \ spiffy-directory-listing ssax sxml-serializer sxml-modifications iup canvas-draw sqlite3 @@ -42,10 +45,11 @@ GUISRCF = dashboard-context-menu.scm dashboard-tests.scm dashboard-guimonitor.scm gutils.scm dcommon.scm tree.scm vg.scm OFILES = $(SRCFILES:%.scm=%.o) GOFILES = $(GUISRCF:%.scm=%.o) +MTSERVEOFILES = $(MTSERVEFILES:%.scm=%.o) MOFILES = $(addprefix mofiles/,$(MSRCFILES:%.scm=%.o)) mofiles/%.o : %.scm mkdir -p mofiles @@ -69,15 +73,18 @@ # ARCHSTR=$(shell bash -c "echo \$$MACHTYPE") PNGFILES = $(shell cd docs/manual;ls *png) #all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut ndboard -all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut +all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut mtserve mtest: $(OFILES) readline-fix.scm megatest.o $(MOFILES) csc $(CSCOPTS) $(OFILES) $(MOFILES) megatest.o -o mtest +mtserve: $(MTSERVEOFILES) readline-fix.scm mtserve.o $(MOFILES) + csc $(CSCOPTS) $(MTSERVEOFILES) $(MOFILES) mtserve.o -o mtserve + showmtesthash: @echo $(MTESTHASH) dboard : $(OFILES) $(GOFILES) dashboard.scm $(MOFILES) csc $(CSCOPTS) $(OFILES) dashboard.scm $(GOFILES) $(MOFILES) -o dboard @@ -181,10 +188,16 @@ $(PREFIX)/bin/.$(ARCHSTR)/mtest : mtest utils/mk_wrapper @echo Installing to PREFIX=$(PREFIX) $(INSTALL) mtest $(PREFIX)/bin/.$(ARCHSTR)/mtest utils/mk_wrapper $(PREFIX) mtest $(PREFIX)/bin/megatest chmod a+x $(PREFIX)/bin/megatest + +$(PREFIX)/bin/.$(ARCHSTR)/mtserve : mtserve utils/mk_wrapper + @echo Installing to PREFIX=$(PREFIX) + $(INSTALL) mtserve $(PREFIX)/bin/.$(ARCHSTR)/mtserve + utils/mk_wrapper $(PREFIX) mtserve $(PREFIX)/bin/mtserver + chmod a+x $(PREFIX)/bin/mtserver $(PREFIX)/bin/.$(ARCHSTR)/ndboard : ndboard $(INSTALL) ndboard $(PREFIX)/bin/.$(ARCHSTR)/ndboard $(PREFIX)/bin/newdashboard : $(PREFIX)/bin/.$(ARCHSTR)/ndboard utils/mk_wrapper Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -16,355 +16,20 @@ ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see . ;; ;;====================================================================== -(use srfi-69 posix) - -(declare (unit api)) -(declare (uses rmt)) -(declare (uses db)) -(declare (uses tasks)) - -;; allow these queries through without starting a server -;; -(define api:read-only-queries - '(get-key-val-pairs - get-var - get-keys - get-key-vals - test-toplevel-num-items - get-test-info-by-id - get-steps-info-by-id - get-data-info-by-id - test-get-rundir-from-test-id - get-count-tests-running-for-testname - get-count-tests-running - get-count-tests-running-in-jobgroup - get-previous-test-run-record - get-matching-previous-test-run-records - test-get-logfile-info - test-get-records-for-index-file - get-testinfo-state-status - test-get-top-process-pid - test-get-paths-matching-keynames-target-new - get-prereqs-not-met - get-count-tests-running-for-run-id - get-run-info - get-run-status - get-run-stats - get-run-times - get-targets - get-target - ;; register-run - get-tests-tags - get-test-times - get-tests-for-run - get-test-id - get-tests-for-runs-mindata - get-tests-for-run-mindata - get-run-name-from-id - get-runs - simple-get-runs - get-num-runs - get-runs-cnt-by-patt - get-all-run-ids - get-prev-run-ids - get-run-ids-matching-target - get-runs-by-patt - get-steps-data - get-steps-for-test - read-test-data - read-test-data* - login - tasks-get-last - testmeta-get-record - have-incompletes? - synchash-get - get-changed-record-ids - get-run-record-ids - )) - -(define api:write-queries - '( - get-keys-write ;; dummy "write" query to force server start - - ;; SERVERS - start-server - kill-server - - ;; TESTS - test-set-state-status-by-id - delete-test-records - delete-old-deleted-test-records - test-set-state-status - test-set-top-process-pid - set-state-status-and-roll-up-items - update-pass-fail-counts - top-test-set-per-pf-counts ;; (db:top-test-set-per-pf-counts (db:get-db *db* 5) 5 "runfirst") - - ;; RUNS - register-run - set-tests-state-status - delete-run - lock/unlock-run - update-run-event_time - mark-incomplete - - ;; STEPS - teststep-set-status! - - ;; TEST DATA - test-data-rollup - csv->test-data - - ;; MISC - sync-inmem->db - - ;; TESTMETA - testmeta-add-record - testmeta-update-field - - ;; TASKS - tasks-add - tasks-set-state-given-param-key - )) - -;; These are called by the server on recipt of /api calls -;; - keep it simple, only return the actual result of the call, i.e. no meta info here -;; -;; - returns #( flag result ) -;; -(define (api:execute-requests dbstruct dat) - (handle-exceptions - exn - (let ((call-chain (get-call-chain))) - (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat) - (print-call-chain (current-error-port)) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens - (cond - ((not (vector? dat)) ;; it is an error to not receive a vector - (vector #f (vector #f "remote must be called with a vector"))) - ((> *api-process-request-count* 20) ;; 20) - (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") - (set! *server-overloaded* #t) - (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! - (else - (let* ((cmd-in (vector-ref dat 0)) - (cmd (if (symbol? cmd-in) - cmd-in - (string->symbol cmd-in))) - (params (vector-ref dat 1)) - (start-t (current-milliseconds)) - (readonly-mode (dbr:dbstruct-read-only dbstruct)) - (readonly-command (member cmd api:read-only-queries)) - (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) - (res - (if writecmd-in-readonly-mode - (conc "attempt to run write command "cmd" on a read-only database") - (case cmd - ;;=============================================== - ;; READ/WRITE QUERIES - ;;=============================================== - - ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl - - ;; SERVERS - ((start-server) (apply server:kind-run params)) - ((kill-server) (set! *server-run* #f)) - - ;; TESTS - - ;;((test-set-state-status-by-id) (apply mt:test-set-state-status-by-id dbstruct params)) - ;;BB - commented out above because it was calling below, eventually, incorrectly (dbstruct passed to mt:test-set-state-status-by-id, which previosly did more, but now only passes thru to db:set-state-status-and-roll-up-items. - ((test-set-state-status-by-id) - - ;; (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) - (db:set-state-status-and-roll-up-items - dbstruct - (list-ref params 0) ; run-id - (list-ref params 1) ; test-name - #f ; item-path - (list-ref params 2) ; state - (list-ref params 3) ; status - (list-ref params 4) ; comment - )) - - ((delete-test-records) (apply db:delete-test-records dbstruct params)) - ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params)) - ((test-set-state-status) (apply db:test-set-state-status dbstruct params)) - ((test-set-top-process-pid) (apply db:test-set-top-process-pid dbstruct params)) - ((set-state-status-and-roll-up-items) (apply db:set-state-status-and-roll-up-items dbstruct params)) - ((top-test-set-per-pf-counts) (apply db:top-test-set-per-pf-counts dbstruct params)) - ((test-set-archive-block-id) (apply db:test-set-archive-block-id dbstruct params)) - - ;; RUNS - ((register-run) (apply db:register-run dbstruct params)) - ((set-tests-state-status) (apply db:set-tests-state-status dbstruct params)) - ((delete-run) (apply db:delete-run dbstruct params)) - ((lock/unlock-run) (apply db:lock/unlock-run dbstruct params)) - ((update-run-event_time) (apply db:update-run-event_time dbstruct params)) - ((update-run-stats) (apply db:update-run-stats dbstruct params)) - ((set-var) (apply db:set-var dbstruct params)) - ((del-var) (apply db:del-var dbstruct params)) - - ;; STEPS - ((teststep-set-status!) (apply db:teststep-set-status! dbstruct params)) - - ;; TEST DATA - ((test-data-rollup) (apply db:test-data-rollup dbstruct params)) - ((csv->test-data) (apply db:csv->test-data dbstruct params)) - - ;; MISC - ((sync-inmem->db) (let ((run-id (car params))) - (db:sync-touched dbstruct run-id force-sync: #t))) - ((mark-incomplete) (apply db:find-and-mark-incomplete dbstruct params)) - - ;; TESTMETA - ((testmeta-add-record) (apply db:testmeta-add-record dbstruct params)) - ((testmeta-update-field) (apply db:testmeta-update-field dbstruct params)) - ((get-tests-tags) (db:get-tests-tags dbstruct)) - - ;; TASKS - ((tasks-add) (apply tasks:add dbstruct params)) - ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params)) - ((tasks-get-last) (apply tasks:get-last dbstruct params)) - - ;; NO SYNC DB - ((no-sync-set) (apply db:no-sync-set *no-sync-db* params)) - ((no-sync-get/default) (apply db:no-sync-get/default *no-sync-db* params)) - ((no-sync-del!) (apply db:no-sync-del! *no-sync-db* params)) - ((no-sync-get-lock) (apply db:no-sync-get-lock *no-sync-db* params)) - - ;; ARCHIVES - ;; ((archive-get-allocations) - ((archive-register-disk) (apply db:archive-register-disk dbstruct params)) - ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params)) - ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey)) - - ;;====================================================================== - ;; READ ONLY QUERIES - ;;====================================================================== - - ;; KEYS - ((get-key-val-pairs) (apply db:get-key-val-pairs dbstruct params)) - ((get-keys) (db:get-keys dbstruct)) - ((get-key-vals) (apply db:get-key-vals dbstruct params)) - ((get-target) (apply db:get-target dbstruct params)) - ((get-targets) (db:get-targets dbstruct)) - - ;; ARCHIVES - ((test-get-archive-block-info) (apply db:test-get-archive-block-info dbstruct params)) - - ;; TESTS - ((test-toplevel-num-items) (apply db:test-toplevel-num-items dbstruct params)) - ((get-test-info-by-id) (apply db:get-test-info-by-id dbstruct params)) - ((test-get-rundir-from-test-id) (apply db:test-get-rundir-from-test-id dbstruct params)) - ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params)) - ((get-count-tests-running) (apply db:get-count-tests-running dbstruct params)) - ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params)) - ;; ((delete-test-step-records) (apply db:delete-test-step-records dbstruct params)) - ((get-previous-test-run-record) (apply db:get-previous-test-run-record dbstruct params)) - ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params)) - ((test-get-logfile-info) (apply db:test-get-logfile-info dbstruct params)) - ((test-get-records-for-index-file) (apply db:test-get-records-for-index-file dbstruct params)) - ((get-testinfo-state-status) (apply db:get-testinfo-state-status dbstruct params)) - ((test-get-top-process-pid) (apply db:test-get-top-process-pid dbstruct params)) - ((test-get-paths-matching-keynames-target-new) (apply db:test-get-paths-matching-keynames-target-new dbstruct params)) - ((get-prereqs-not-met) (apply db:get-prereqs-not-met dbstruct params)) - ((get-count-tests-running-for-run-id) (apply db:get-count-tests-running-for-run-id dbstruct params)) - ((synchash-get) (apply synchash:server-get dbstruct params)) - ((get-raw-run-stats) (apply db:get-raw-run-stats dbstruct params)) - ((get-test-times) (apply db:get-test-times dbstruct params)) - - ;; RUNS - ((get-run-info) (apply db:get-run-info dbstruct params)) - ((get-run-status) (apply db:get-run-status dbstruct params)) - ((set-run-status) (apply db:set-run-status dbstruct params)) - ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) - ((get-test-id) (apply db:get-test-id dbstruct params)) - ((get-tests-for-run-mindata) (apply db:get-tests-for-run-mindata dbstruct params)) - ((get-tests-for-runs-mindata) (apply db:get-tests-for-runs-mindata dbstruct params)) - ((get-runs) (apply db:get-runs dbstruct params)) - ((simple-get-runs) (apply db:simple-get-runs dbstruct params)) - ((get-num-runs) (apply db:get-num-runs dbstruct params)) - ((get-runs-cnt-by-patt) (apply db:get-runs-cnt-by-patt dbstruct params)) - ((get-all-run-ids) (db:get-all-run-ids dbstruct)) - ((get-prev-run-ids) (apply db:get-prev-run-ids dbstruct params)) - ((get-run-ids-matching-target) (apply db:get-run-ids-matching-target dbstruct params)) - ((get-runs-by-patt) (apply db:get-runs-by-patt dbstruct params)) - ((get-run-name-from-id) (apply db:get-run-name-from-id dbstruct params)) - ((get-main-run-stats) (apply db:get-main-run-stats dbstruct params)) - ((get-var) (apply db:get-var dbstruct params)) - ((get-run-stats) (apply db:get-run-stats dbstruct params)) - ((get-run-times) (apply db:get-run-times dbstruct params)) - - ;; STEPS - ((get-steps-data) (apply db:get-steps-data dbstruct params)) - ((get-steps-for-test) (apply db:get-steps-for-test dbstruct params)) - ((get-steps-info-by-id) (apply db:get-steps-info-by-id dbstruct params)) - - ;; TEST DATA - ((read-test-data) (apply db:read-test-data dbstruct params)) - ((read-test-data*) (apply db:read-test-data* dbstruct params)) - ((get-data-info-by-id) (apply db:get-data-info-by-id dbstruct params)) - - ;; MISC - ((get-latest-host-load) (apply db:get-latest-host-load dbstruct params)) - ((have-incompletes?) (apply db:have-incompletes? dbstruct params)) - ((login) (apply db:login dbstruct params)) - ((general-call) (let ((stmtname (car params)) - (run-id (cadr params)) - (realparams (cddr params))) - (db:general-call dbstruct stmtname realparams))) - ((sdb-qry) (apply sdb:qry params)) - ((ping) (current-process-id)) - ((get-changed-record-ids) (apply db:get-changed-record-ids dbstruct params)) - ((get-run-record-ids) (apply db:get-run-record-ids dbstruct params)) - ;; TESTMETA - ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) - - ;; TASKS - ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) - (else - (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) - (conc "ERROR: BAD api call " cmd)))))) - - ;; save all stats - (let ((delta-t (- (current-milliseconds) - start-t))) - (hash-table-set! *db-api-call-time* cmd - (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) - (if writecmd-in-readonly-mode - (vector #f res) - (vector #t res))))))) - -;; http-server send-response -;; api:process-request -;; db:* -;; -;; NB// Runs on the server as part of the server loop -;; -(define (api:process-request dbstruct $) ;; the $ is the request vars proc - (set! *api-process-request-count* (+ *api-process-request-count* 1)) - (let* ((cmd ($ 'cmd)) - (paramsj ($ 'params)) - (params (db:string->obj paramsj transport: 'http)) ;; incoming data from the POST (or is it a GET?) - (resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) - (success (vector-ref resdat 0)) - (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) - (if (not success) - (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) - (if (> *api-process-request-count* *max-api-process-requests*) - (set! *max-api-process-requests* *api-process-request-count*)) - (set! *api-process-request-count* (- *api-process-request-count* 1)) - ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds - ;; (rmt:dat->json-str - ;; (if (or (string? res) - ;; (list? res) - ;; (number? res) - ;; (boolean? res)) - ;; res - ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) - (db:obj->string res transport: 'http))) - +(declare (unit api)) + +(module + api + ( + * + ) + +(import scheme posix chicken data-structures ports) + + +(define (api:execute-requests . args) #t) +(define (api:process-request . args) #t) + +) Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -31,91 +31,86 @@ (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. (include "common_records.scm") (include "db_records.scm") -;; client:get-signature -(define (client:get-signature) - (if *my-client-signature* *my-client-signature* - (let ((sig (conc (get-host-name) " " (current-process-id)))) - (set! *my-client-signature* sig) - *my-client-signature*))) - -;; Not currently used! But, I think it *should* be used!!! -(define (client:logout serverdat) - (let ((ok (and (socket? serverdat) - (cdb:logout serverdat *toppath* (client:get-signature))))) - ok)) - -(define (client:connect iface port) - (case (server:get-transport) - ((rpc) (rpc:client-connect iface port)) - ((http) (http:client-connect iface port)) - ((zmq) (zmq:client-connect iface port)) - (else (rpc:client-connect iface port)))) - -(define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0)) - (case (server:get-transport) - ((rpc) (rpc-transport:client-setup remaining-tries: remaining-tries failed-connects: failed-connects)) ;;(client:setup-rpc run-id)) - ((http)(client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects)) - (else (rpc-transport:client-setup remaining-tries: remaining-tries failed-connects: failed-connects)))) ;; (client:setup-rpc run-id)))) - -;; Do all the connection work, look up the transport type and set up the -;; connection if required. -;; -;; There are two scenarios. -;; 1. We are a test manager and we received *transport-type* and *runremote* via cmdline -;; 2. We are a run tests, list runs or other interactive process and we must figure out -;; *transport-type* and *runremote* from the monitor.db -;; -;; client:setup -;; -;; lookup_server, need to remove *runremote* stuff -;; - -(define (client:setup-http areapath #!key (remaining-tries 100) (failed-connects 0)(area-dat #f)) - (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) - (server:start-and-wait areapath) - (if (<= remaining-tries 0) - (begin - (debug:print-error 0 *default-log-port* "failed to start or connect to server") - (exit 1)) - ;; - ;; Alternatively here, we can get the list of candidate servers and work our way - ;; through them searching for a good one. - ;; - (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) - (runremote (or area-dat *runremote*))) - (if (not server-dat) ;; no server found - (client:setup-http areapath remaining-tries: (- remaining-tries 1)) - (let ((host (cadr server-dat)) - (port (caddr server-dat))) - (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if (and (not area-dat) - (not *runremote*)) - (set! *runremote* (make-remote))) - (if (and host port) - (let* ((start-res (case *transport-type* - ((http)(http-transport:client-connect host port)))) - (ping-res (case *transport-type* - ((http)(rmt:login-no-auto-client-setup start-res))))) - (if (and start-res - ping-res) - (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago - (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res) - (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) - start-res) - (begin ;; login failed but have a server record, clean out the record and try again - (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 - (case *transport-type* - ((http)(http-transport:close-connections))) - (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) - (thread-sleep! 1) - (client:setup-http areapath remaining-tries: (- remaining-tries 1)) - ))) - (begin ;; no server registered - ;; (server:kind-run areapath) - (server:start-and-wait areapath) - (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) - (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. - (client:setup-http areapath remaining-tries: (- remaining-tries 1))))))))) - +;; ;; Not currently used! But, I think it *should* be used!!! +;; (define (client:logout serverdat) +;; (let ((ok (and (socket? serverdat) +;; (cdb:logout serverdat *toppath* (client:get-signature))))) +;; ok)) +;; +;; (define (client:connect iface port) +;; (case (server:get-transport) +;; ((http) (http:client-connect iface port)) +;; ((zmq) (zmq:client-connect iface port)) +;; (else (begin +;; (debug:print 0 *default-log-port* "ERROR: no such transport " (server:get-transport) ", exiting now.") +;; (exit 1))))) +;; +;; (define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0)) +;; (case (server:get-transport) +;; ((http)(client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects)) +;; (else (begin +;; (debug:print 0 *default-log-port* "ERROR: no such transport " (server:get-transport) ", exiting now.") +;; (exit 1))))) +;; +;; ;; Do all the connection work, look up the transport type and set up the +;; ;; connection if required. +;; ;; +;; ;; There are two scenarios. +;; ;; 1. We are a test manager and we received *transport-type* and *runremote* via cmdline +;; ;; 2. We are a run tests, list runs or other interactive process and we must figure out +;; ;; *transport-type* and *runremote* from the monitor.db +;; ;; +;; ;; client:setup +;; ;; +;; ;; lookup_server, need to remove *runremote* stuff +;; ;; +;; +;; (define (client:setup-http areapath #!key (remaining-tries 100) (failed-connects 0)(area-dat #f)) +;; (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) +;; (server:start-and-wait areapath) +;; (if (<= remaining-tries 0) +;; (begin +;; (debug:print-error 0 *default-log-port* "failed to start or connect to server") +;; (exit 1)) +;; ;; +;; ;; Alternatively here, we can get the list of candidate servers and work our way +;; ;; through them searching for a good one. +;; ;; +;; (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) +;; (runremote (or area-dat *runremote*))) +;; (if (not server-dat) ;; no server found +;; (client:setup-http areapath remaining-tries: (- remaining-tries 1)) +;; (let ((host (cadr server-dat)) +;; (port (caddr server-dat))) +;; (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) +;; (if (and (not area-dat) +;; (not *runremote*)) +;; (set! *runremote* (make-remote))) +;; (if (and host port) +;; (let* ((start-res (case *transport-type* +;; ((http)(http-transport:client-connect host port)))) +;; (ping-res (case *transport-type* +;; ((http)(rmt:login-no-auto-client-setup start-res))))) +;; (if (and start-res +;; ping-res) +;; (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago +;; (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res) +;; (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) +;; start-res) +;; (begin ;; login failed but have a server record, clean out the record and try again +;; (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 +;; (case *transport-type* +;; ((http)(http-transport:close-connections))) +;; (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) +;; (thread-sleep! 1) +;; (client:setup-http areapath remaining-tries: (- remaining-tries 1)) +;; ))) +;; (begin ;; no server registered +;; ;; (server:kind-run areapath) +;; (server:start-and-wait areapath) +;; (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) +;; (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. +;; (client:setup-http areapath remaining-tries: (- remaining-tries 1))))))))) +;; Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -1135,11 +1135,11 @@ ;; logic for getting homehost. Returns (host . at-home) ;; IF *toppath* is not set, wait up to five seconds trying every two seconds ;; (this is to accomodate the watchdog) ;; -(define (common:get-homehost #!key (trynum 5)) +#;(define (common:get-homehost #!key (trynum 5)) ;; called often especially at start up. use mutex to eliminate collisions (mutex-lock! *homehost-mutex*) (cond (*home-host* (mutex-unlock! *homehost-mutex*) @@ -1187,11 +1187,11 @@ (mutex-unlock! *homehost-mutex*) *home-host*)))) ;; am I on the homehost? ;; -(define (common:on-homehost?) +#;(define (common:on-homehost?) (let ((hh (common:get-homehost))) (if hh (cdr hh) #f))) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -585,11 +585,11 @@ (if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun") (debug:print 0 *default-log-port* (string-intersperse (argv) " "))) ;; some switches imply homehost. Exit here if not on homehost ;; -(let ((homehost-required (list "-cleanup-db" "-server"))) +#;(let ((homehost-required (list "-cleanup-db" "-server"))) (if (apply args:any? homehost-required) (if (not (common:on-homehost?)) (for-each (lambda (switch) (if (args:get-arg switch) @@ -2196,11 +2196,11 @@ (if (or (getenv "MT_RUNSCRIPT") (args:get-arg "-repl") (args:get-arg "-load")) (let* ((toppath (launch:setup)) (dbstruct (if (and toppath - (common:on-homehost?)) + #;(common:on-homehost?)) (db:setup #t) #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f))) (if *toppath* (cond ((getenv "MT_RUNSCRIPT") Index: nmsg-transport.scm ================================================================== --- nmsg-transport.scm +++ nmsg-transport.scm @@ -14,19 +14,24 @@ ;; GNU General Public License for more details. ;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see . +;;====================================================================== +;; Support routines for nmsg usage. +;; This should be reusable, non-megatest specific stuff +;;====================================================================== + (declare (unit nmsg-transport)) (module - nmsg-transport - ( - nmsg:start-server - nmsg:open-send-close - nmsg:open-send-receive - ) + nmsg-transport + ( + nmsg:start-server + nmsg:open-send-close + nmsg:open-send-receive + ) (import scheme posix chicken data-structures ports) (use pkts) (use nanomsg srfi-18) @@ -114,6 +119,10 @@ (thread-start! th1) (thread-start! th2) (thread-join! th1) res)))) +;; get a signature for identifing this process +(define (nmsg:get-process-signature) + (conc (get-host-name) " " (current-process-id))) + ) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -20,970 +20,130 @@ (use format typed-records) ;; RADT => purpose of json format?? (declare (unit rmt)) (declare (uses api)) -(declare (uses http-transport)) (include "common_records.scm") -(declare (uses portlogger)) -(import portlogger) -(declare (uses nmsg-transport)) -(import nmsg-transport) - (use (prefix pkts pkts:) srfi-18) -;; -;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! -;; - -;; generate entries for ~/.megatestrc with the following -;; -;; grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u - -;;====================================================================== -;; N A N O M S G S E R V E R -;;====================================================================== - -(defstruct nmsg - (conn #f) - (hosts (make-hash-table)) - pkt - pktspec - (mutex (make-mutex)) - ) - -;; make it a global -(define *nmsg-conndat* (make-nmsg)) -(nmsg-pktspec-set! *nmsg-conndat* - `((server (hostname . h) - (port . p) - (pid . i) - ))) -;; get a port -;; start the nmsg server -;; look for other servers -;; contact other servers and compile list of servers -;; there are two types of server -;; main servers - dashboards, runners and dedicated servers - need pkt -;; passive servers - test executers, step calls, list-runs - no pkt -;; -(define (rmt:start-nmsg #!key (force-server-type #f)) - (mutex-lock! (nmsg-mutex *nmsg-conndat*)) - (let* ((server-type (or force-server-type - (if (args:any? "-run" "-server") - 'main - 'passive))) - (port-num (portlogger:open-run-close portlogger:find-port)) - (nmsg-conn (nmsg:start-server port-num)) - (pktspec (nmsg-pktspec *nmsg-conndat*)) - (pktdir (conc (get-environment-variable "MT_RUN_AREA_HOME") - "/.server-pkts"))) - (if (not (directory? pktdir))(create-directory pktdir)) - ;; server is started, now create pkt if needed - (if (eq? server-type 'main) - (nmsg-pkt-set! *nmsg-conndat* - (pkts:write-alist->pkt - pktdir - `((hostname . ,(get-host-name)) - (port . ,port-num) - (pid . ,(current-process-id))) - pktspec: pktspec - ptype: 'server))) - (nmsg-conn-set! *nmsg-conndat* nmsg-conn) - (mutex-unlock! (nmsg-mutex *nmsg-conndat*)) - )) - -;;====================================================================== -;; S U P P O R T F U N C T I O N S -;;====================================================================== - -;; if a server is either running or in the process of starting call client:setup -;; else return #f to let the calling proc know that there is no server available -;; -(define (rmt:get-connection-info areapath #!key (area-dat #f)) ;; TODO: push areapath down. - (let* ((runremote (or area-dat *runremote*)) - (cinfo (if (remote? runremote) - (remote-conndat runremote) - #f))) - (if cinfo - cinfo - (if (server:check-if-running areapath) - (client:setup areapath) - #f)))) - -(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id - -;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) -;; -(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected - - ;;DOT digraph megatest_state_status { - ;;DOT ranksep=0; - ;;DOT // rankdir=LR; - ;;DOT node [shape="box"]; - ;;DOT "rmt:send-receive" -> MUTEXLOCK; - ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; } - ;; do all the prep locked under the rmt-mutex - (mutex-lock! *rmt-mutex*) - - ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote - ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. - ;; 3. do the query, if on homehost use local access - ;; - (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value - (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas - (runremote (or area-dat - *runremote*)) - (readonly-mode (if (and runremote - (remote-ro-mode-checked runremote)) - (remote-ro-mode runremote) - (let* ((dbfile (conc *toppath* "/megatest.db")) - (ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future - (if runremote - (begin - (remote-ro-mode-set! runremote ro-mode) - (remote-ro-mode-checked-set! runremote #t) - ro-mode) - ro-mode))))) - - ;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity - ;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"]; - ;; DOT INIT_RUNREMOTE -> MUTEXLOCK; - ;; ensure we have a record for our connection for given area - (if (not runremote) ;; can remove this one. should never get here. - (begin - (set! *runremote* (make-remote)) - (set! runremote *runremote*))) ;; new runremote will come from this on next iteration - - ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity - ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; - ;; DOT SET_HOMEHOST -> MUTEXLOCK; - ;; ensure we have a homehost record - (if (not (pair? (remote-hh-dat runremote))) ;; not on homehost - (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little - (remote-hh-dat-set! runremote (common:get-homehost))) - - ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) - (cond - ;;DOT EXIT; - ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" } - ;; give up if more than 15 attempts - ((> attemptnum 15) - (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.") - (exit 1)) - - ;;DOT CASE2 [label="local\nreadonly\nquery"]; - ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2} - ;;DOT CASE2 -> "rmt:open-qry-close-locally"; - ;; readonly mode, read request- handle it - case 2 - ((and readonly-mode - (member cmd api:read-only-queries)) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") - (rmt:open-qry-close-locally cmd 0 params) - ) - - ;;DOT CASE3 [label="write in\nread-only mode"]; - ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3} - ;;DOT CASE3 -> "#f"; - ;; readonly mode, write request. Do nothing, return #f - (readonly-mode - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") - (debug:print 0 *default-log-port* "WARNING: write transaction requested on a readonly area. cmd="cmd" params="params) - #f) - - ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. - ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. - ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout) - ;; - ;;DOT CASE4 [label="reset\nconnection"]; - ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} - ;;DOT CASE4 -> "rmt:send-receive"; - ;; reset the connection if it has been unused too long - ((and runremote - (remote-conndat runremote) - (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on - (+ (http-transport:server-dat-get-last-access (remote-conndat runremote)) - (remote-server-timeout runremote)))) - (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") - (http-transport:close-connections area-dat: runremote) - (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. - (mutex-unlock! *rmt-mutex*) - (rmt:send-receive cmd rid params attemptnum: attemptnum)) - - ;;DOT CASE5 [label="local\nread"]; - ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; - ;;DOT CASE5 -> "rmt:open-qry-close-locally"; - - ;; on homehost and this is a read - ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required - (cdr (remote-hh-dat runremote)) ;; on homehost - (member cmd api:read-only-queries)) ;; this is a read - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") - (rmt:open-qry-close-locally cmd 0 params)) - - ;;DOT CASE6 [label="init\nremote"]; - ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6}; - ;;DOT CASE6 -> "rmt:send-receive"; - ;; on homehost and this is a write, we already have a server, but server has died - ((and (cdr (remote-hh-dat runremote)) ;; on homehost - (not (member cmd api:read-only-queries)) ;; this is a write - (remote-server-url runremote) ;; have a server - (not (server:ping (remote-server-url runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. - (set! *runremote* (make-remote)) - (remote-force-server-set! runremote (common:force-server?)) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") - (rmt:send-receive cmd rid params attemptnum: attemptnum)) - - ;;DOT CASE7 [label="homehost\nwrite"]; - ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7}; - ;;DOT CASE7 -> "rmt:open-qry-close-locally"; - ;; on homehost and this is a write, we already have a server - ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required - (cdr (remote-hh-dat runremote)) ;; on homehost - (not (member cmd api:read-only-queries)) ;; this is a write - (remote-server-url runremote)) ;; have a server - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") - (rmt:open-qry-close-locally cmd 0 params)) - - ;;DOT CASE8 [label="force\nserver"]; - ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8}; - ;;DOT CASE8 -> "rmt:open-qry-close-locally"; - ;; on homehost, no server contact made and this is a write, passively start a server - ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required - (cdr (remote-hh-dat runremote)) ;; have homehost - (not (remote-server-url runremote)) ;; no connection yet - (not (member cmd api:read-only-queries))) ;; not a read-only query - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") - (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call - (if server-url - (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed - (if (common:force-server?) - (server:start-and-wait *toppath*) - (server:kind-run *toppath*)))) - (remote-force-server-set! runremote (common:force-server?)) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8.1") - (rmt:open-qry-close-locally cmd 0 params)) - - ;;DOT CASE9 [label="force server\nnot on homehost"]; - ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9}; - ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive"; - ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one - (not (remote-conndat runremote))) - (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost - (not (remote-conndat runremote)))) ;; and no connection - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) - (mutex-unlock! *rmt-mutex*) - (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? - (server:start-and-wait *toppath*)) - (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http - (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as - - ;;DOT CASE10 [label="on homehost"]; - ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10}; - ;;DOT CASE10 -> "rmt:open-qry-close-locally"; - ;; all set up if get this far, dispatch the query - ((and (not (remote-force-server runremote)) - (cdr (remote-hh-dat runremote))) ;; we are on homehost - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10") - (rmt:open-qry-close-locally cmd (if rid rid 0) params)) - - ;;DOT CASE11 [label="send_receive"]; - ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11}; - ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"]; - ;;DOT CASE11 -> "RESULT" [label="call succeeded"]; - ;; not on homehost, do server query - (else - ;; (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") - ;; (mutex-lock! *rmt-mutex*) - (let* ((conninfo (remote-conndat runremote)) - (dat (case (remote-transport runremote) - ((http) (condition-case ;; handling here has caused a lot of problems. However it is needed to deal with attemtped communication to servers that have gone away - (http-transport:client-api-send-receive 0 conninfo cmd params) - ((commfail)(vector #f "communications fail")) - ((exn)(vector #f "other fail" (print-call-chain))))) - (else - (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") - (exit)))) - (success (if (vector? dat) (vector-ref dat 0) #f)) - (res (if (vector? dat) (vector-ref dat 1) #f))) - (if (and (vector? conninfo) (< 5 (vector-length conninfo))) - (http-transport:server-dat-update-last-access conninfo) ;; refresh access time - (begin - (debug:print 0 *default-log-port* "INFO: Should not get here! conninfo=" conninfo) - (set! conninfo #f) - (remote-conndat-set! *runremote* #f) - (http-transport:close-connections area-dat: runremote))) - ;; (mutex-unlock! *rmt-mutex*) - (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) - (mutex-unlock! *rmt-mutex*) - (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end - (if (and (vector? res) - (eq? (vector-length res) 2) - (eq? (vector-ref res 1) 'overloaded)) ;; since we are looking at the data to carry the error we'll use a fairly obtuse combo to minimise the chances of some sort of collision. - ;; this is the case where the returned data is bad or the server is overloaded and we want - ;; to ease off the queries - (let ((wait-delay (+ attemptnum (* attemptnum 10)))) - (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") - (mutex-lock! *rmt-mutex*) - (http-transport:close-connections area-dat: runremote) - (set! *runremote* #f) ;; force starting over - (mutex-unlock! *rmt-mutex*) - (thread-sleep! wait-delay) - (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) - res) ;; All good, return res - (begin - (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) - (mutex-lock! *rmt-mutex*) - (remote-conndat-set! runremote #f) - (http-transport:close-connections area-dat: runremote) - (remote-server-url-set! runremote #f) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") - ;; (if (not (server:check-if-running *toppath*)) - ;; (server:start-and-wait *toppath*)) - (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) - - ;;DOT } - -;; (define (rmt:update-db-stats run-id rawcmd params duration) -;; (mutex-lock! *db-stats-mutex*) -;; (handle-exceptions -;; exn -;; (begin -;; (debug:print 0 *default-log-port* "WARNING: stats collection failed in update-db-stats") -;; (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) -;; (print "exn=" (condition->list exn)) -;; #f) ;; if this fails we don't care, it is just stats -;; (let* ((cmd (conc "run-id=" run-id " " (if (eq? rawcmd 'general-call) (car params) rawcmd))) -;; (stat-vec (hash-table-ref/default *db-stats* cmd #f))) -;; (if (not (vector? stat-vec)) -;; (let ((newvec (vector 0 0))) -;; (hash-table-set! *db-stats* cmd newvec) -;; (set! stat-vec newvec))) -;; (vector-set! stat-vec 0 (+ (vector-ref stat-vec 0) 1)) -;; (vector-set! stat-vec 1 (+ (vector-ref stat-vec 1) duration)))) -;; (mutex-unlock! *db-stats-mutex*)) - -(define (rmt:print-db-stats) - (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f" - (debug:print 18 *default-log-port* "DB Stats\n========") - (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg")) - (for-each (lambda (cmd) - (let ((cmd-dat (hash-table-ref *db-stats* cmd))) - (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0)))))) - (sort (hash-table-keys *db-stats*) - (lambda (a b) - (> (vector-ref (hash-table-ref *db-stats* a) 0) - (vector-ref (hash-table-ref *db-stats* b) 0))))))) - -(define (rmt:get-max-query-average run-id) - (mutex-lock! *db-stats-mutex*) - (let* ((runkey (conc "run-id=" run-id " ")) - (cmds (filter (lambda (x) - (substring-index runkey x)) - (hash-table-keys *db-stats*))) - (res (if (null? cmds) - (cons 'none 0) - (let loop ((cmd (car cmds)) - (tal (cdr cmds)) - (max-cmd (car cmds)) - (res 0)) - (let* ((cmd-dat (hash-table-ref *db-stats* cmd)) - (tot (vector-ref cmd-dat 0)) - (curravg (/ (vector-ref cmd-dat 1) (vector-ref cmd-dat 0))) ;; count is never zero by construction - (currmax (max res curravg)) - (newmax-cmd (if (> curravg res) cmd max-cmd))) - (if (null? tal) - (if (> tot 10) - (cons newmax-cmd currmax) - (cons 'none 0)) - (loop (car tal)(cdr tal) newmax-cmd currmax))))))) - (mutex-unlock! *db-stats-mutex*) - res)) - -(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5)) - (let* ((qry-is-write (not (member cmd api:read-only-queries))) - (db-file-path (db:dbfile-path)) ;; 0)) - (dbstruct-local (db:setup #t)) ;; make-dbr:dbstruct path: dbdir local: #t))) - (read-only (not (file-write-access? db-file-path))) - (start (current-milliseconds)) - (resdat (if (not (and read-only qry-is-write)) - (let ((v (api:execute-requests dbstruct-local (vector (symbol->string cmd) params)))) - (handle-exceptions ;; there has been a long history of receiving strange errors from values returned by the client when things go wrong.. - exn ;; This is an attempt to detect that situation and recover gracefully - (begin - (debug:print0 *default-log-port* "ERROR: bad data from server " v " message: " ((condition-property-accessor 'exn 'message) exn)) - (vector #t '())) ;; should always get a vector but if something goes wrong return a dummy - (if (and (vector? v) - (> (vector-length v) 1)) - (let ((newvec (vector (vector-ref v 0)(vector-ref v 1)))) - newvec) ;; by copying the vector while inside the error handler we should force the detection of a corrupted record - (vector #t '())))) ;; we could also check that the returned types are valid - (vector #t '()))) - (success (vector-ref resdat 0)) - (res (vector-ref resdat 1)) - (duration (- (current-milliseconds) start))) - (if (and read-only qry-is-write) - (debug:print 0 *default-log-port* "ERROR: attempt to write to read-only database ignored. cmd=" cmd)) - (if (not success) - (if (> remretries 0) - (begin - (debug:print-error 0 *default-log-port* "local query failed. Trying again.") - (thread-sleep! (/ (random 5000) 1000)) ;; some random delay - (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1))) - (begin - (debug:print-error 0 *default-log-port* "too many retries in rmt:open-qry-close-locally, giving up") - #f)) - (begin - ;; (rmt:update-db-stats run-id cmd params duration) - ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it - (if qry-is-write - (let ((start-time (current-seconds))) - (mutex-lock! *db-multi-sync-mutex*) -/ (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) - (mutex-unlock! *db-multi-sync-mutex*))))) - res)) - -(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) - (let* ((run-id (if run-id run-id 0)) - (res (handle-exceptions - exn - #f - (http-transport:client-api-send-receive run-id connection-info cmd params)))) - (if (and res (vector-ref res 0)) - (vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!! - #f))) - -;; ;; Wrap json library for strings (why the ports crap in the first place?) -;; (define (rmt:dat->json-str dat) -;; (with-output-to-string -;; (lambda () -;; (json-write dat)))) -;; -;; (define (rmt:json-str->dat json-str) -;; (with-input-from-string json-str -;; (lambda () -;; (json-read)))) - -;;====================================================================== -;; -;; A C T U A L A P I C A L L S -;; -;;====================================================================== - -;;====================================================================== -;; S E R V E R -;;====================================================================== - -(define (rmt:kill-server run-id) - (rmt:send-receive 'kill-server run-id (list run-id))) - -(define (rmt:start-server run-id) - (rmt:send-receive 'start-server 0 (list run-id))) - -;;====================================================================== -;; M I S C -;;====================================================================== - -(define (rmt:login run-id) - (rmt:send-receive 'login run-id (list *toppath* megatest-version *my-client-signature*))) - -;; This login does no retries under the hood - it acts a bit like a ping. -;; Deprecated for nmsg-transport. -;; -(define (rmt:login-no-auto-client-setup connection-info) - (case *transport-type* ;; run-id of 0 is just a placeholder - ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login 0 (list *toppath* megatest-version *my-client-signature*))) - ;;((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*))) - )) - -;; hand off a call to one of the db:queries statements -;; added run-id to make looking up the correct db possible -;; -(define (rmt:general-call stmtname run-id . params) - (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params))) - - -;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host -(define (rmt:get-latest-host-load hostname) - (rmt:send-receive 'get-latest-host-load 0 (list hostname))) - -;; (define (rmt:sync-inmem->db run-id) -;; (rmt:send-receive 'sync-inmem->db run-id '())) - -(define (rmt:sdb-qry qry val run-id) - ;; add caching if qry is 'getid or 'getstr - (rmt:send-receive 'sdb-qry run-id (list qry val))) - -;; NOT COMPLETED -(define (rmt:runtests user run-id testpatt params) - (rmt:send-receive 'runtests run-id testpatt)) - -(define (rmt:get-run-record-ids target run keynames test-patt) - (rmt:send-receive 'get-run-record-ids #f (list target run keynames test-patt))) - -(define (rmt:get-changed-record-ids since-time) - (rmt:send-receive 'get-changed-record-ids #f (list since-time)) ) - -;;====================================================================== -;; T E S T M E T A -;;====================================================================== - -(define (rmt:get-tests-tags) - (rmt:send-receive 'get-tests-tags #f '())) - -;;====================================================================== -;; K E Y S -;;====================================================================== - -;; These require run-id because the values come from the run! -;; -(define (rmt:get-key-val-pairs run-id) - (rmt:send-receive 'get-key-val-pairs run-id (list run-id))) - -(define (rmt:get-keys) - (if *db-keys* *db-keys* - (let ((res (rmt:send-receive 'get-keys #f '()))) - (set! *db-keys* res) - res))) - -(define (rmt:get-keys-write) ;; dummy query to force server start - (let ((res (rmt:send-receive 'get-keys-write #f '()))) - (set! *db-keys* res) - res)) - -;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe -;; to cache the resuls in a hash -;; -(define (rmt:get-key-vals run-id) - (or (hash-table-ref/default *keyvals* run-id #f) - (let ((res (rmt:send-receive 'get-key-vals #f (list run-id)))) - (hash-table-set! *keyvals* run-id res) - res))) - -(define (rmt:get-targets) - (rmt:send-receive 'get-targets #f '())) - -(define (rmt:get-target run-id) - (rmt:send-receive 'get-target run-id (list run-id))) - -(define (rmt:get-run-times runpatt targetpatt) - (rmt:send-receive 'get-run-times #f (list runpatt targetpatt ))) - - -;;====================================================================== -;; T E S T S -;;====================================================================== - -;; Just some syntatic sugar -(define (rmt:register-test run-id test-name item-path) - (rmt:general-call 'register-test run-id run-id test-name item-path)) - -(define (rmt:get-test-id run-id testname item-path) - (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) - -;; run-id is NOT used -;; -(define (rmt:get-test-info-by-id run-id test-id) - (if (number? test-id) - (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)) - (begin - (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) - (print-call-chain (current-error-port)) - #f))) - -(define (rmt:test-get-rundir-from-test-id run-id test-id) - (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id))) - -(define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f)) - (let* ((test-path (if (string? work-area) - work-area - (rmt:test-get-rundir-from-test-id run-id test-id)))) - (debug:print 3 *default-log-port* "TEST PATH: " test-path) - (open-test-db test-path))) - -;; WARNING: This currently bypasses the transaction wrapped writes system -(define (rmt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment) - (rmt:send-receive 'test-set-state-status-by-id run-id (list run-id test-id newstate newstatus newcomment))) - -(define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus) - (rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus))) - -(define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode) - ;; (if (number? run-id) - (rmt:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode))) - ;; (begin - ;; (debug:print-error 0 *default-log-port* "rmt:get-tests-for-run called with bad run-id=" run-id) - ;; (print-call-chain (current-error-port)) - ;; '()))) - -;; get stuff via synchash -(define (rmt:synchash-get run-id proc synckey keynum params) - (rmt:send-receive 'synchash-get run-id (list run-id proc synckey keynum params))) - -(define (rmt:get-tests-for-run-mindata run-id testpatt states status not-in) - (rmt:send-receive 'get-tests-for-run-mindata run-id (list run-id testpatt states status not-in))) - -;; IDEA: Threadify these - they spend a lot of time waiting ... -;; -(define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) - (let ((multi-run-mutex (make-mutex)) - (run-id-list (if run-ids - run-ids - (rmt:get-all-run-ids))) - (result '())) - (if (null? run-id-list) - '() - (let loop ((hed (car run-id-list)) - (tal (cdr run-id-list)) - (threads '())) - (if (> (length threads) 5) - (loop hed tal (filter (lambda (th)(not (member (thread-state th) '(terminated dead)))) threads)) - (let* ((newthread (make-thread - (lambda () - (let ((res (rmt:send-receive 'get-tests-for-run-mindata hed (list hed testpatt states status not-in)))) - (if (list? res) - (begin - (mutex-lock! multi-run-mutex) - (set! result (append result res)) - (mutex-unlock! multi-run-mutex)) - (debug:print-error 0 *default-log-port* "get-tests-for-run-mindata failed for run-id " hed ", testpatt " testpatt ", states " states ", status " status ", not-in " not-in)))) - (conc "multi-run-thread for run-id " hed))) - (newthreads (cons newthread threads))) - (thread-start! newthread) - (thread-sleep! 0.05) ;; give that thread some time to start - (if (null? tal) - newthreads - (loop (car tal)(cdr tal) newthreads)))))) - result)) - -;; ;; IDEA: Threadify these - they spend a lot of time waiting ... -;; ;; -;; (define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) -;; (let ((run-id-list (if run-ids -;; run-ids -;; (rmt:get-all-run-ids)))) -;; (apply append (map (lambda (run-id) -;; (rmt:send-receive 'get-tests-for-run-mindata run-id (list run-ids testpatt states status not-in))) -;; run-id-list)))) - -(define (rmt:delete-test-records run-id test-id) - (rmt:send-receive 'delete-test-records run-id (list run-id test-id))) - -;; This is not needed as test steps are deleted on test delete call -;; -;; (define (rmt:delete-test-step-records run-id test-id) -;; (rmt:send-receive 'delete-test-step-records run-id (list run-id test-id))) - -(define (rmt:test-set-state-status run-id test-id state status msg) - (rmt:send-receive 'test-set-state-status run-id (list run-id test-id state status msg))) - -(define (rmt:test-toplevel-num-items run-id test-name) - (rmt:send-receive 'test-toplevel-num-items run-id (list run-id test-name))) - -;; (define (rmt:get-previous-test-run-record run-id test-name item-path) -;; (rmt:send-receive 'get-previous-test-run-record run-id (list run-id test-name item-path))) - -(define (rmt:get-matching-previous-test-run-records run-id test-name item-path) - (rmt:send-receive 'get-matching-previous-test-run-records run-id (list run-id test-name item-path))) - -(define (rmt:test-get-logfile-info run-id test-name) - (rmt:send-receive 'test-get-logfile-info run-id (list run-id test-name))) - -(define (rmt:test-get-records-for-index-file run-id test-name) - (rmt:send-receive 'test-get-records-for-index-file run-id (list run-id test-name))) - -(define (rmt:get-testinfo-state-status run-id test-id) - (rmt:send-receive 'get-testinfo-state-status run-id (list run-id test-id))) - -(define (rmt:test-set-log! run-id test-id logf) - (if (string? logf)(rmt:general-call 'test-set-log run-id logf test-id))) - -(define (rmt:test-set-top-process-pid run-id test-id pid) - (rmt:send-receive 'test-set-top-process-pid run-id (list run-id test-id pid))) - -(define (rmt:test-get-top-process-pid run-id test-id) - (rmt:send-receive 'test-get-top-process-pid run-id (list run-id test-id))) - -(define (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt) - (rmt:send-receive 'get-run-ids-matching-target #f (list keynames target res runname testpatt statepatt statuspatt))) - -;; NOTE: This will open and access ALL run databases. -;; -(define (rmt:test-get-paths-matching-keynames-target-new keynames target res testpatt statepatt statuspatt runname) - (let ((run-ids (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt))) - (apply append - (map (lambda (run-id) - (rmt:send-receive 'test-get-paths-matching-keynames-target-new run-id (list run-id keynames target res testpatt statepatt statuspatt runname))) - run-ids)))) - -;; (define (rmt:get-run-ids-matching keynames target res) -;; (rmt:send-receive #f 'get-run-ids-matching (list keynames target res))) - -(define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f)) - (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps))) - -(define (rmt:get-count-tests-running-for-run-id run-id) - (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id))) - -;; Statistical queries - -(define (rmt:get-count-tests-running run-id) - (rmt:send-receive 'get-count-tests-running run-id (list run-id))) - -(define (rmt:get-count-tests-running-for-testname run-id testname) - (rmt:send-receive 'get-count-tests-running-for-testname run-id (list run-id testname))) - -(define (rmt:get-count-tests-running-in-jobgroup run-id jobgroup) - (rmt:send-receive 'get-count-tests-running-in-jobgroup run-id (list run-id jobgroup))) - -;; state and status are extra hints not usually used in the calculation -;; -(define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment) - (rmt:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment))) - -(define (rmt:update-pass-fail-counts run-id test-name) - (rmt:general-call 'update-pass-fail-counts run-id test-name test-name test-name)) - -(define (rmt:top-test-set-per-pf-counts run-id test-name) - (rmt:send-receive 'top-test-set-per-pf-counts run-id (list run-id test-name))) - -(define (rmt:get-raw-run-stats run-id) - (rmt:send-receive 'get-raw-run-stats run-id (list run-id))) - -(define (rmt:get-test-times runname target) - (rmt:send-receive 'get-test-times #f (list runname target ))) - -;;====================================================================== -;; R U N S -;;====================================================================== - -(define (rmt:get-run-info run-id) - (rmt:send-receive 'get-run-info run-id (list run-id))) - -(define (rmt:get-num-runs runpatt) - (rmt:send-receive 'get-num-runs #f (list runpatt))) - -(define (rmt:get-runs-cnt-by-patt runpatt targetpatt keys) - (rmt:send-receive 'get-runs-cnt-by-patt #f (list runpatt targetpatt keys))) - -;; Use the special run-id == #f scenario here since there is no run yet -(define (rmt:register-run keyvals runname state status user contour) - (rmt:send-receive 'register-run #f (list keyvals runname state status user contour))) - -(define (rmt:get-run-name-from-id run-id) - (rmt:send-receive 'get-run-name-from-id run-id (list run-id))) - -(define (rmt:delete-run run-id) - (rmt:send-receive 'delete-run run-id (list run-id))) - -(define (rmt:update-run-stats run-id stats) - (rmt:send-receive 'update-run-stats #f (list run-id stats))) - -(define (rmt:delete-old-deleted-test-records) - (rmt:send-receive 'delete-old-deleted-test-records #f '())) - -(define (rmt:get-runs runpatt count offset keypatts) - (rmt:send-receive 'get-runs #f (list runpatt count offset keypatts))) - -(define (rmt:simple-get-runs runpatt count offset target) - (rmt:send-receive 'simple-get-runs #f (list runpatt count offset target))) - -(define (rmt:get-all-run-ids) - (rmt:send-receive 'get-all-run-ids #f '())) - -(define (rmt:get-prev-run-ids run-id) - (rmt:send-receive 'get-prev-run-ids #f (list run-id))) - -(define (rmt:lock/unlock-run run-id lock unlock user) - (rmt:send-receive 'lock/unlock-run #f (list run-id lock unlock user))) - -;; set/get status -(define (rmt:get-run-status run-id) - (rmt:send-receive 'get-run-status #f (list run-id))) - -(define (rmt:set-run-status run-id run-status #!key (msg #f)) - (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) - -(define (rmt:update-run-event_time run-id) - (rmt:send-receive 'update-run-event_time #f (list run-id))) - -(define (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update #!key (sort-order "asc")) ;; fields of #f uses default - (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields last-runs-update sort-order))) - -(define (rmt:find-and-mark-incomplete run-id ovr-deadtime) - ;; (if (rmt:send-receive 'have-incompletes? run-id (list run-id ovr-deadtime)) - (rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime))) ;; ) - -(define (rmt:get-main-run-stats run-id) - (rmt:send-receive 'get-main-run-stats #f (list run-id))) - -(define (rmt:get-var varname) - (rmt:send-receive 'get-var #f (list varname))) - -(define (rmt:del-var varname) - (rmt:send-receive 'del-var #f (list varname))) - -(define (rmt:set-var varname value) - (rmt:send-receive 'set-var #f (list varname value))) - -;;====================================================================== -;; M U L T I R U N Q U E R I E S -;;====================================================================== - -;; Need to move this to multi-run section and make associated changes -(define (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f)) - (let ((run-ids (rmt:get-all-run-ids))) - (for-each (lambda (run-id) - (rmt:find-and-mark-incomplete run-id ovr-deadtime)) - run-ids))) - -;; get the previous record for when this test was run where all keys match but runname -;; returns #f if no such test found, returns a single test record if found -;; -;; Run this at the client end since we have to connect to multiple run-id dbs -;; -(define (rmt:get-previous-test-run-record run-id test-name item-path) - (let* ((keyvals (rmt:get-key-val-pairs run-id)) - (keys (rmt:get-keys)) - (selstr (string-intersperse keys ",")) - (qrystr (string-intersperse (map (lambda (x)(conc x "=?")) keys) " AND "))) - (if (not keyvals) - #f - (let ((prev-run-ids (rmt:get-prev-run-ids run-id))) - ;; for each run starting with the most recent look to see if there is a matching test - ;; if found then return that matching test record - (debug:print 4 *default-log-port* "selstr: " selstr ", qrystr: " qrystr ", keyvals: " keyvals ", previous run ids found: " prev-run-ids) - (if (null? prev-run-ids) #f - (let loop ((hed (car prev-run-ids)) - (tal (cdr prev-run-ids))) - (let ((results (rmt:get-tests-for-run hed (conc test-name "/" item-path) '() '() ;; run-id testpatt states statuses - #f #f #f ;; offset limit not-in hide/not-hide - #f #f #f #f 'normal))) ;; sort-by sort-order qryvals last-update mode - (debug:print 4 *default-log-port* "Got tests for run-id " run-id ", test-name " test-name ", item-path " item-path ": " results) - (if (and (null? results) - (not (null? tal))) - (loop (car tal)(cdr tal)) - (if (null? results) #f - (car results)))))))))) - -(define (rmt:get-run-stats) - (rmt:send-receive 'get-run-stats #f '())) - -;;====================================================================== -;; S T E P S -;;====================================================================== - -;; Getting steps is more complicated. -;; -;; If given work area -;; 1. Find the testdat.db file -;; 2. Open the testdat.db file and do the query -;; If not given the work area -;; 1. Do a remote call to get the test path -;; 2. Continue as above -;; -;;(define (rmt:get-steps-for-test run-id test-id) -;; (rmt:send-receive 'get-steps-data run-id (list test-id))) - -(define (rmt:teststep-set-status! run-id test-id teststep-name state-in status-in comment logfile) - (let* ((state (items:check-valid-items "state" state-in)) - (status (items:check-valid-items "status" status-in))) - (if (or (not state)(not status)) - (debug:print 3 *default-log-port* "WARNING: Invalid " (if status "status" "state") - " value \"" (if status state-in status-in) "\", update your validvalues section in megatest.config")) - (rmt:send-receive 'teststep-set-status! run-id (list run-id test-id teststep-name state-in status-in comment logfile)))) - -(define (rmt:get-steps-for-test run-id test-id) - (rmt:send-receive 'get-steps-for-test run-id (list run-id test-id))) - -(define (rmt:get-steps-info-by-id test-step-id) - (rmt:send-receive 'get-steps-info-by-id #f (list test-step-id))) - -;;====================================================================== -;; T E S T D A T A -;;====================================================================== - -(define (rmt:read-test-data run-id test-id categorypatt #!key (work-area #f)) - (rmt:send-receive 'read-test-data run-id (list run-id test-id categorypatt))) -(define (rmt:read-test-data* run-id test-id categorypatt varpatt #!key (work-area #f)) - (rmt:send-receive 'read-test-data* run-id (list run-id test-id categorypatt varpatt))) - -(define (rmt:get-data-info-by-id test-data-id) - (rmt:send-receive 'get-data-info-by-id #f (list test-data-id))) - -(define (rmt:testmeta-add-record testname) - (rmt:send-receive 'testmeta-add-record #f (list testname))) - -(define (rmt:testmeta-get-record testname) - (rmt:send-receive 'testmeta-get-record #f (list testname))) - -(define (rmt:testmeta-update-field test-name fld val) - (rmt:send-receive 'testmeta-update-field #f (list test-name fld val))) - -(define (rmt:test-data-rollup run-id test-id status) - (rmt:send-receive 'test-data-rollup run-id (list run-id test-id status))) - -(define (rmt:csv->test-data run-id test-id csvdata) - (rmt:send-receive 'csv->test-data run-id (list run-id test-id csvdata))) - -;;====================================================================== -;; T A S K S -;;====================================================================== - -(define (rmt:tasks-find-task-queue-records target run-name test-patt state-patt action-patt) - (rmt:send-receive 'find-task-queue-records #f (list target run-name test-patt state-patt action-patt))) - -(define (rmt:tasks-add action owner target runname testpatt params) - (rmt:send-receive 'tasks-add #f (list action owner target runname testpatt params))) - -(define (rmt:tasks-set-state-given-param-key param-key new-state) - (rmt:send-receive 'tasks-set-state-given-param-key #f (list param-key new-state))) - -(define (rmt:tasks-get-last target runname) - (rmt:send-receive 'tasks-get-last #f (list target runname))) - -;;====================================================================== -;; N O S Y N C D B -;;====================================================================== - -(define (rmt:no-sync-set var val) - (rmt:send-receive 'no-sync-set #f `(,var ,val))) - -(define (rmt:no-sync-get/default var default) - (rmt:send-receive 'no-sync-get/default #f `(,var ,default))) - -(define (rmt:no-sync-del! var) - (rmt:send-receive 'no-sync-del! #f `(,var))) - -(define (rmt:no-sync-get-lock keyname) - (rmt:send-receive 'no-sync-get-lock #f `(,keyname))) - -;;====================================================================== -;; A R C H I V E S -;;====================================================================== - -(define (rmt:archive-get-allocations testname itempath dneeded) - (rmt:send-receive 'archive-get-allocations #f (list testname itempath dneeded))) - -(define (rmt:archive-register-block-name bdisk-id archive-path) - (rmt:send-receive 'archive-register-block-name #f (list bdisk-id archive-path))) - -(define (rmt:archive-allocate-testsuite/area-to-block block-id testsuite-name areakey) - (rmt:send-receive 'archive-allocate-test-to-block #f (list block-id testsuite-name areakey))) - -(define (rmt:archive-register-disk bdisk-name bdisk-path df) - (rmt:send-receive 'archive-register-disk #f (list bdisk-name bdisk-path df))) - -(define (rmt:test-set-archive-block-id run-id test-id archive-block-id) - (rmt:send-receive 'test-set-archive-block-id run-id (list run-id test-id archive-block-id))) - -(define (rmt:test-get-archive-block-info archive-block-id) - (rmt:send-receive 'test-get-archive-block-info #f (list archive-block-id))) + +(defstruct cmdrec + cmd + (host #f) + (run-ids #f) + params) + +;; call cmd on remote host (#f for any host) +;; +;; example: (rmt:run 'get-runs target run-name test-patt state status) +;; +(define (rmt:run cmd . params) + (let ((server (rmt:get-server cmdrec))) ;; look up server + #f)) + +(define (rmt:get-connection-info . args) #t) +(define (rmt:send-receive . args) #t) +(define (rmt:print-db-stats . args) #t) +(define (rmt:get-max-query-average . args) #t) +(define (rmt:open-qry-close-locally . args) #t) +(define (rmt:send-receive-no-auto-client-setup . args) #t) +(define (rmt:kill-server . args) #t) +(define (rmt:start-server . args) #t) +(define (rmt:login . args) #t) +(define (rmt:login-no-auto-client-setup . args) #t) +(define (rmt:general-call . args) #t) +(define (rmt:get-latest-host-load . args) #t) +(define (rmt:sdb-qry . args) #t) +(define (rmt:runtests . args) #t) +(define (rmt:get-run-record-ids . args) #t) +(define (rmt:get-changed-record-ids . args) #t) +(define (rmt:get-tests-tags . args) #t) +(define (rmt:get-key-val-pairs . args) #t) +(define (rmt:get-keys . args) #t) +(define (rmt:get-keys-write . args) #t) +(define (rmt:get-key-vals . args) #t) +(define (rmt:get-targets . args) #t) +(define (rmt:get-target . args) #t) +(define (rmt:get-run-times . args) #t) +(define (rmt:register-test . args) #t) +(define (rmt:get-test-id . args) #t) +(define (rmt:get-test-info-by-id . args) #t) +(define (rmt:test-get-rundir-from-test-id . args) #t) +(define (rmt:open-test-db-by-test-id . args) #t) +(define (rmt:test-set-state-status-by-id . args) #t) +(define (rmt:set-tests-state-status . args) #t) +(define (rmt:get-tests-for-run . args) #t) +(define (rmt:synchash-get . args) #t) +(define (rmt:get-tests-for-run-mindata . args) #t) +(define (rmt:get-tests-for-runs-mindata . args) #t) +(define (rmt:delete-test-records . args) #t) +(define (rmt:test-set-state-status . args) #t) +(define (rmt:test-toplevel-num-items . args) #t) +(define (rmt:get-matching-previous-test-run-records . args) #t) +(define (rmt:test-get-logfile-info . args) #t) +(define (rmt:test-get-records-for-index-file . args) #t) +(define (rmt:get-testinfo-state-status . args) #t) +(define (rmt:test-set-log! . args) #t) +(define (rmt:test-set-top-process-pid . args) #t) +(define (rmt:test-get-top-process-pid . args) #t) +(define (rmt:get-run-ids-matching-target . args) #t) +(define (rmt:test-get-paths-matching-keynames-target-new . args) #t) +(define (rmt:get-prereqs-not-met . args) #t) +(define (rmt:get-count-tests-running-for-run-id . args) #t) +(define (rmt:get-count-tests-running . args) #t) +(define (rmt:get-count-tests-running-for-testname . args) #t) +(define (rmt:get-count-tests-running-in-jobgroup . args) #t) +(define (rmt:set-state-status-and-roll-up-items . args) #t) +(define (rmt:update-pass-fail-counts . args) #t) +(define (rmt:top-test-set-per-pf-counts . args) #t) +(define (rmt:get-raw-run-stats . args) #t) +(define (rmt:get-test-times . args) #t) +(define (rmt:get-run-info . args) #t) +(define (rmt:get-num-runs . args) #t) +(define (rmt:get-runs-cnt-by-patt . args) #t) +(define (rmt:register-run . args) #t) +(define (rmt:get-run-name-from-id . args) #t) +(define (rmt:delete-run . args) #t) +(define (rmt:update-run-stats . args) #t) +(define (rmt:delete-old-deleted-test-records . args) #t) +(define (rmt:get-runs . args) #t) +(define (rmt:simple-get-runs . args) #t) +(define (rmt:get-all-run-ids . args) #t) +(define (rmt:get-prev-run-ids . args) #t) +(define (rmt:lock/unlock-run . args) #t) +(define (rmt:get-run-status . args) #t) +(define (rmt:set-run-status . args) #t) +(define (rmt:update-run-event_time . args) #t) +(define (rmt:get-runs-by-patt . args) #t) +(define (rmt:find-and-mark-incomplete . args) #t) +(define (rmt:get-main-run-stats . args) #t) +(define (rmt:get-var . args) #t) +(define (rmt:del-var . args) #t) +(define (rmt:set-var . args) #t) +(define (rmt:find-and-mark-incomplete-all-runs . args) #t) +(define (rmt:get-previous-test-run-record . args) #t) +(define (rmt:get-run-stats . args) #t) +(define (rmt:teststep-set-status! . args) #t) +(define (rmt:get-steps-for-test . args) #t) +(define (rmt:get-steps-info-by-id . args) #t) +(define (rmt:read-test-data . args) #t) +(define (rmt:read-test-data* . args) #t) +(define (rmt:get-data-info-by-id . args) #t) +(define (rmt:testmeta-add-record . args) #t) +(define (rmt:testmeta-get-record . args) #t) +(define (rmt:testmeta-update-field . args) #t) +(define (rmt:test-data-rollup . args) #t) +(define (rmt:csv->test-data . args) #t) +(define (rmt:tasks-find-task-queue-records . args) #t) +(define (rmt:tasks-add . args) #t) +(define (rmt:tasks-set-state-given-param-key . args) #t) +(define (rmt:tasks-get-last . args) #t) +(define (rmt:no-sync-set . args) #t) +(define (rmt:no-sync-get/default . args) #t) +(define (rmt:no-sync-del! . args) #t) +(define (rmt:no-sync-get-lock . args) #t) +(define (rmt:archive-get-allocations . args) #t) +(define (rmt:archive-register-block-name . args) #t) +(define (rmt:archive-allocate-testsuite/area-to-block . args) #t) +(define (rmt:archive-register-disk . args) #t) +(define (rmt:test-set-archive-block-id . args) #t) +(define (rmt:test-get-archive-block-info . args) #t) DELETED rpc-transport.scm Index: rpc-transport.scm ================================================================== --- rpc-transport.scm +++ /dev/null @@ -1,237 +0,0 @@ - -;; Copyright 2006-2012, Matthew Welland. -;; -;; This file is part of Megatest. -;; -;; Megatest is free software: you can redistribute it and/or modify -;; it under the terms of the GNU General Public License as published by -;; the Free Software Foundation, either version 3 of the License, or -;; (at your option) any later version. -;; -;; Megatest is distributed in the hope that it will be useful, -;; but WITHOUT ANY WARRANTY; without even the implied warranty of -;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -;; GNU General Public License for more details. -;; -;; You should have received a copy of the GNU General Public License -;; along with Megatest. If not, see . -;; - -(require-extension (srfi 18) extras tcp s11n rpc) -(import (prefix rpc rpc:)) - -(use sqlite3 srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) -(import (prefix sqlite3 sqlite3:)) - -(declare (unit rpc-transport)) - -(declare (uses common)) -(declare (uses db)) -(declare (uses tests)) -(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. - -(include "common_records.scm") -(include "db_records.scm") - -;; procstr is the name of the procedure to be called as a string -(define (rpc-transport:autoremote procstr params) - (handle-exceptions - exn - (begin - (debug:print 1 *default-log-port* "Remote failed for " proc " " params) - (apply (eval (string->symbol procstr)) params)) - ;; (if *runremote* - ;; (apply (eval (string->symbol (conc "remote:" procstr))) params) - (apply (eval (string->symbol procstr)) params))) - -;; all routes though here end in exit ... -;; -;; start_server? -;; -(define (rpc-transport:launch run-id) - (let* ((tdbdat (tasks:open-db))) - (BB> "rpc-transport:launch fired for run-id="run-id) - (set! *run-id* run-id) - (if (args:get-arg "-daemonize") - (daemon:ize)) - (if (server:check-if-running run-id) - (begin - (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") - (exit 0))) - (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)) - (remtries 4)) - (if (not server-id) - (if (> remtries 0) - (begin - (thread-sleep! 2) - (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id) - (- remtries 1))) - (begin - ;; since we didn't get the server lock we are going to clean up and bail out - (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") - (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " rpc-transport:launch"))) - (begin - (rpc-transport:run (if (args:get-arg "-server")(args:get-arg "-server") "-") run-id server-id) - (exit)))))) - -(define (rpc-transport:run hostn run-id server-id) - (debug:print 2 *default-log-port* "Attempting to start the rpc server ...") - ;; (trace rpc:publish-procedure!) - - (rpc:publish-procedure! 'server:login server:login) - (rpc:publish-procedure! 'testing (lambda () "Just testing")) - - (let* ((db #f) - (hostname (get-host-name)) - (ipaddrstr (let ((ipstr (if (string=? "-" hostn) - ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") - (server:get-best-guess-address hostname) - #f))) - (if ipstr ipstr hostn))) ;; hostname))) - (start-port (open-run-close tasks:server-get-next-port tasks:open-db)) - (link-tree-path (configf:lookup *configdat* "setup" "linktree")) - (rpc:listener (rpc-transport:find-free-port-and-open (rpc:default-server-port))) - (th1 (make-thread - (lambda () - ((rpc:make-server rpc:listener) #t)) - "rpc:server")) - ;; (cute (rpc:make-server rpc:listener) "rpc:server") - ;; 'rpc:server)) - (hostname (if (string=? "-" hostn) - (get-host-name) - hostn)) - (ipaddrstr (if (string=? "-" hostn) - (server:get-best-guess-address hostname) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") - #f)) - (portnum (rpc:default-server-port)) - (host:port (conc (if ipaddrstr ipaddrstr hostname) ":" portnum)) - (tdb (tasks:open-db))) - (thread-start! th1) - (set! db *dbstruct-db*) - (open-run-close tasks:server-set-interface-port - tasks:open-db - server-id - ipaddrstr portnum) - (debug:print 0 *default-log-port* "Server started on " host:port) - - ;; (trace rpc:publish-procedure!) - ;; (rpc:publish-procedure! 'server:login server:login) - ;; (rpc:publish-procedure! 'testing (lambda () "Just testing")) - - ;;====================================================================== - ;; ;; end of publish-procedure section - ;;====================================================================== - ;; - (on-exit (lambda () - (open-run-close tasks:server-set-state! tasks:open-db server-id "stopped"))) - - (set! *rpc:listener* rpc:listener) - (tasks:server-set-state! tdb server-id "running") - (set! *dbstruct-db* (db:setup run-id)) - ;; if none running or if > 20 seconds since - ;; server last used then start shutdown - (let loop ((count 0)) - (thread-sleep! 5) ;; no need to do this very often - (let ((numrunning -1)) ;; (db:get-count-tests-running db))) - (if (or (> numrunning 0) - (> (+ *db-last-access* 60)(current-seconds))) - (begin - (debug:print-info 0 *default-log-port* "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *db-last-access*)) - (loop (+ 1 count))) - (begin - (debug:print-info 0 *default-log-port* "Starting to shutdown the server side") - (open-run-close tasks:server-delete-record tasks:open-db server-id " rpc-transport:try-start-server stop") - (thread-sleep! 10) - (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*) - (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") - )))))) - -(define (rpc-transport:find-free-port-and-open port) - (handle-exceptions - exn - (begin - (print "Failed to bind to port " (rpc:default-server-port) ", trying next port") - (rpc-transport:find-free-port-and-open (+ port 1))) - (rpc:default-server-port port) - (tcp-read-timeout 240000) - (tcp-listen (rpc:default-server-port) 10000))) - -(define (rpc-transport:ping run-id host port) - (handle-exceptions - exn - (begin - (print "SERVER_NOT_FOUND") - (exit 1)) - (let ((login-res ((rpc:procedure 'server:login host port) *toppath*))) - (if (and (list? login-res) - (car login-res)) - (begin - (print "LOGIN_OK") - (exit 0)) - (begin - (print "LOGIN_FAILED") - (exit 1)))))) - -(define (rpc-transport:client-setup run-id #!key (remtries 10)) - (if *runremote* - (begin - (debug:print-error 0 *default-log-port* "Attempt to connect to server but already connected") - #f) - (let* ((host-info (hash-table-ref/default *runremote* run-id #f))) ;; (open-run-close db:get-var #f "SERVER")) - (if host-info - (let ((iface (car host-info)) - (port (cadr host-info)) - (ping-res ((rpc:procedure 'server:login host port) *toppath*))) - (if ping-res - (let ((server-dat (list iface port #f #f #f))) - (hash-table-set! *runremote* run-id server-dat) - server-dat) - (begin - (server:try-running *toppath*) - (thread-sleep! 2) - (rpc-transport:client-setup run-id (- remtries 1))))) - (let* ((server-db-info (open-run-close tasks:get-server tasks:open-db run-id))) - (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if server-db-info - (let* ((iface (tasks:hostinfo-get-interface server-db-info)) - (port (tasks:hostinfo-get-port server-db-info)) - (server-dat (list iface port #f #f #f)) - (ping-res ((rpc:procedure 'server:login host port) *toppath*))) - (if start-res - (begin - (hash-table-set! *runremote* run-id server-dat) - server-dat) - (begin - (server:try-running *toppath*) - (thread-sleep! 2) - (rpc-transport:client-setup run-id (- remtries 1))))) - (begin - (server:try-running *toppath*) - (thread-sleep! 2) - (rpc-transport:client-setup run-id (- remtries 1))))))))) -;; -;; (port (if (and hostinfo (> (length hostdat) 1))(cadr hostdat) #f))) -;; (if (and port -;; (string->number port)) -;; (let ((portn (string->number port))) -;; (debug:print-info 2 *default-log-port* "Setting up to connect to host " host ":" port) -;; (handle-exceptions -;; exn -;; (begin -;; (debug:print-error 0 *default-log-port* "Failed to open a connection to the server at host: " host " port: " port) -;; (debug:print 0 *default-log-port* " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn)) -;; ;; (open-run-close -;; ;; (lambda (db . param) -;; ;; (sqlite3:execute db "DELETE FROM metadat WHERE var='SERVER'")) -;; ;; #f) -;; (set! *runremote* #f)) -;; (if (and (not (args:get-arg "-server")) ;; no point in the server using the server using the server -;; ((rpc:procedure 'server:login host portn) *toppath*)) -;; (begin -;; (debug:print-info 2 *default-log-port* "Logged in and connected to " host ":" port) -;; (set! *runremote* (vector host portn))) -;; (begin -;; (debug:print-info 2 *default-log-port* "Failed to login or connect to " host ":" port) -;; (set! *runremote* #f))))) -;; (debug:print-info 2 *default-log-port* "no server available"))))) - Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -15,606 +15,658 @@ ;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see . ;; +;;====================================================================== +;; +;; This is the Megatest specific stuff for starting and maintaining a +;; server. Stuff that talks to the server should go in client.scm. +;; General nanomsg stuff (not Megatest specific) should go in the +;; nmsg-transport.scm file. +;; +;;====================================================================== + (require-extension (srfi 18) extras tcp s11n) (use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest - directory-utils posix-extras matchable) + directory-utils posix-extras matchable typed-records) (use spiffy uri-common intarweb http-client spiffy-request-vars) (declare (unit server)) (declare (uses common)) (declare (uses db)) -(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. -;; (declare (uses synchash)) -(declare (uses http-transport)) -;;(declare (uses rpc-transport)) -(declare (uses launch)) -;; (declare (uses daemon)) - -(include "common_records.scm") -(include "db_records.scm") - -(define (server:make-server-url hostport) - (if (not hostport) - #f - (conc "http://" (car hostport) ":" (cadr hostport)))) - -(define *server-loop-heart-beat* (current-seconds)) - -;;====================================================================== -;; P K T S S T U F F -;;====================================================================== - -;; ??? - -;;====================================================================== -;; P K T S S T U F F -;;====================================================================== - -;; ??? - -;;====================================================================== -;; S E R V E R -;;====================================================================== - -;; Call this to start the actual server -;; - -;; all routes though here end in exit ... -;; -;; start_server -;; -(define (server:launch run-id transport-type) - (case transport-type - ((http)(http-transport:launch)) - ;;((nmsg)(nmsg-transport:launch run-id)) - ;;((rpc) (rpc-transport:launch run-id)) - (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type)))) - -;;====================================================================== -;; S E R V E R U T I L I T I E S -;;====================================================================== - -;; Get the transport -(define (server:get-transport) - (if *transport-type* - *transport-type* - (let ((ttype (string->symbol - (or (args:get-arg "-transport") - (configf:lookup *configdat* "server" "transport") - "rpc")))) - (set! *transport-type* ttype) - ttype))) - -;; Generate a unique signature for this server -(define (server:mk-signature) - (message-digest-string (md5-primitive) - (with-output-to-string - (lambda () - (write (list (current-directory) - (argv))))))) - -;; When using zmq this would send the message back (two step process) -;; with spiffy or rpc this simply returns the return data to be returned -;; -(define (server:reply return-addr query-sig success/fail result) - (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result) - ;; (send-message pubsock target send-more: #t) - ;; (send-message pubsock - (case (server:get-transport) - ((rpc) (db:obj->string (vector success/fail query-sig result))) - ((http) (db:obj->string (vector success/fail query-sig result))) - ((fs) result) - (else - (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) - result))) - -;; Given a run id start a server process ### NOTE ### > file 2>&1 -;; if the run-id is zero and the target-host is set -;; try running on that host -;; incidental: rotate logs in logs/ dir. -;; -(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area - (let* ((curr-host (get-host-name)) - ;; (attempt-in-progress (server:start-attempted? areapath)) - ;; (dot-server-url (server:check-if-running areapath)) - (curr-ip (server:get-best-guess-address curr-host)) - (curr-pid (current-process-id)) - (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) - (target-host (car homehost)) - (testsuite (common:get-testsuite-name)) - (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) - (cmdln (conc (common:get-megatest-exe) - " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") - " -daemonize " - "") - ;; " -log " logfile - " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) - (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) - (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) - ;; we want the remote server to start in *toppath* so push there - (push-directory areapath) - (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") - (thread-start! log-rotate) - - ;; host.domain.tld match host? - (if (and target-host - ;; look at target host, is it host.domain.tld or ip address and does it - ;; match current ip or hostname - (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) - (not (equal? curr-ip target-host))) - (begin - (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) - (setenv "TARGETHOST" target-host))) - - (setenv "TARGETHOST_LOGF" logfile) - (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time - (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever - (system (conc "nbfake " cmdln)) - (unsetenv "TARGETHOST_LOGF") - (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) - (thread-join! log-rotate) - (pop-directory))) - -;; given a path to a server log return: host port startseconds -;; -(define (server:logf-get-start-info logf) - (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs - (handle-exceptions - exn - (list #f #f #f) ;; no idea what went wrong, call it a bad server - (with-input-from-file - logf - (lambda () - (let loop ((inl (read-line)) - (lnum 0)) - (if (not (eof-object? inl)) - (let ((mlst (string-match rx inl))) - (if (not mlst) - (if (< lnum 500) ;; give up if more than 500 lines of server log read - (loop (read-line)(+ lnum 1)) - (list #f #f #f)) - (let ((dat (cdr mlst))) - (list (car dat) ;; host - (string->number (cadr dat)) ;; port - (string->number (caddr dat)))))) - (list #f #f #f)))))))) - -;; get a list of servers with all relevant data -;; ( mod-time host port start-time pid ) -;; -(define (server:get-list areapath #!key (limit #f)) - (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) - (day-seconds (* 24 60 60))) - ;; if the directory exists continue to get the list - ;; otherwise attempt to create the logs dir and then - ;; continue - (if (if (directory-exists? (conc areapath "/logs")) - '() - (if (file-write-access? areapath) - (begin - (condition-case - (create-directory (conc areapath "/logs") #t) - (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) - (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list."))) - (directory-exists? (conc areapath "/logs"))) - '())) - (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) - (num-serv-logs (length server-logs))) - (if (null? server-logs) - '() - (let loop ((hed (car server-logs)) - (tal (cdr server-logs)) - (res '())) - (let* ((mod-time (handle-exceptions - exn - (current-seconds) ;; 0 - (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted - (down-time (- (current-seconds) mod-time)) - (serv-dat (if (or (< num-serv-logs 10) - (< down-time 900)) ;; day-seconds)) - (server:logf-get-start-info hed) - '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at - (serv-rec (cons mod-time serv-dat)) - (fmatch (string-match fname-rx hed)) - (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) - (new-res (if (null? serv-dat) - res - (cons (append serv-rec (list pid)) res)))) - (if (null? tal) - (if (and limit - (> (length new-res) limit)) - new-res ;; (take new-res limit) <= need intelligent sorting before this will work - new-res) - (loop (car tal)(cdr tal) new-res))))))))) - -(define (server:get-num-alive srvlst) - (let ((num-alive 0)) - (for-each - (lambda (server) - (match-let (((mod-time host port start-time pid) - server)) - (let* ((uptime (- (current-seconds) mod-time)) - (runtime (if start-time - (- mod-time start-time) - 0))) - (if (< uptime 5)(set! num-alive (+ num-alive 1)))))) - srvlst) - num-alive)) - -;; given a list of servers get a list of valid servers, i.e. at least -;; 10 seconds old, has started and is less than 1 hour old and is -;; active (i.e. mod-time < 10 seconds -;; -;; mod-time host port start-time pid -;; -;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off -;; and servers should stick around for about two hours or so. -;; -(define (server:get-best srvlst) - (let* ((nums (server:get-num-servers)) - (now (current-seconds)) - (slst (sort - (filter (lambda (rec) - (if (and (list? rec) - (> (length rec) 2)) - (let ((start-time (list-ref rec 3)) - (mod-time (list-ref rec 0))) - ;; (print "start-time: " start-time " mod-time: " mod-time) - (and start-time mod-time - (> (- now start-time) 0) ;; been running at least 0 seconds - (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds - (< (- now start-time) - (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) - 180) - (random 360))) ;; under one hour running time +/- 180 - )) - #f)) - srvlst) - (lambda (a b) - (< (list-ref a 3) - (list-ref b 3)))))) - (if (> (length slst) nums) - (take slst nums) - slst))) - -(define (server:get-first-best areapath) - (let ((srvrs (server:get-best (server:get-list areapath)))) - (if (and srvrs - (not (null? srvrs))) - (car srvrs) - #f))) - -(define (server:get-rand-best areapath) - (let ((srvrs (server:get-best (server:get-list areapath)))) - (if (and (list? srvrs) - (not (null? srvrs))) - (let* ((len (length srvrs)) - (idx (random len))) - (list-ref srvrs idx)) - #f))) - - -(define (server:record->url servr) - (match-let (((mod-time host port start-time pid) - servr)) - (if (and host port) - (conc host ":" port) - #f))) - -(define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. - (if *my-client-signature* *my-client-signature* - (let ((sig (server:mk-signature))) - (set! *my-client-signature* sig) - *my-client-signature*))) - -;; kind start up of servers, wait 40 seconds before allowing another server for a given -;; run-id to be launched -(define (server:kind-run areapath) - (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? - (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun - (call-num (car last-run-dat)) - (when-run (cadr last-run-dat)) - (run-delay (+ (case call-num - ((0) 0) - ((1) 20) - ((2) 300) - (else 600)) - (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously - (lock-file (conc areapath "/logs/server-start.lock"))) - (if (> (- (current-seconds) when-run) run-delay) - (begin - (common:simple-file-lock-and-wait lock-file expire-time: 15) - (server:run areapath) - (thread-sleep! 5) ;; don't release the lock for at least a few seconds - (common:simple-file-release-lock lock-file))) - (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) - -(define (server:start-and-wait areapath #!key (timeout 60)) - (let ((give-up-time (+ (current-seconds) timeout))) - (let loop ((server-url (server:check-if-running areapath)) - (try-num 0)) - (if (or server-url - (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. - server-url - (let ((num-ok (length (server:get-best (server:get-list areapath))))) - (if (and (> try-num 0) ;; first time through simply wait a little while then try again - (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one - (server:kind-run areapath)) - (thread-sleep! 5) - (loop (server:check-if-running areapath) - (+ try-num 1))))))) - -(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. - -(define (server:get-num-servers #!key (numservers 2)) - (let ((ns (string->number - (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) - (or ns numservers))) - -;; no longer care if multiple servers are started by accident. older servers will drop off in time. -;; -(define (server:check-if-running areapath) ;; #!key (numservers "2")) - (let* ((ns (server:get-num-servers)) - (servers (server:get-best (server:get-list areapath)))) - ;; (print "servers: " servers " ns: " ns) - (if (or (and servers - (null? servers)) - (not servers) - (and (list? servers) - (< (length servers) (random ns)))) ;; somewhere between 0 and numservers - #f - (let loop ((hed (car servers)) - (tal (cdr servers))) - (let ((res (server:check-server hed))) - (if res - res - (if (null? tal) - #f - (loop (car tal)(cdr tal))))))))) - -;; ping the given server -;; -(define (server:check-server server-record) - (let* ((server-url (server:record->url server-record)) - (res (case *transport-type* - ((http)(server:ping server-url)) - ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) - ))) - (if res - server-url - #f))) - -(define (server:kill servr) - (match-let (((mod-time hostname port start-time pid) - servr)) - (tasks:kill-server hostname pid))) - -;; called in megatest.scm, host-port is string hostname:port -;; -;; NOTE: This is NOT called directly from clients as not all transports support a client running -;; in the same process as the server. -;; -(define (server:ping host-port-in #!key (do-exit #f)) - (let ((host:port (if (not host-port-in) ;; use read-dotserver to find - #f ;; (server:check-if-running *toppath*) - ;; (if (number? host-port-in) ;; we were handed a server-id - ;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) - ;; ;; (print "srec: " srec " host-port-in: " host-port-in) - ;; (if srec - ;; (conc (vector-ref srec 3) ":" (vector-ref srec 4)) - ;; (conc "no such server-id " host-port-in))) - host-port-in))) ;; ) - (let* ((host-port (if host:port - (let ((slst (string-split host:port ":"))) - (if (eq? (length slst) 2) - (list (car slst)(string->number (cadr slst))) - #f)) - #f))) -;; (toppath (launch:setup))) - ;; (print "host-port=" host-port) - (if (not host-port) - (begin - (if host-port-in - (debug:print 0 *default-log-port* "ERROR: bad host:port")) - (if do-exit (exit 1)) - #f) - (let* ((iface (car host-port)) - (port (cadr host-port)) - (server-dat (http-transport:client-connect iface port)) - (login-res (rmt:login-no-auto-client-setup server-dat))) - (if (and (list? login-res) - (car login-res)) - (begin - ;; (print "LOGIN_OK") - (if do-exit (exit 0)) - #t) - (begin - ;; (print "LOGIN_FAILED") - (if do-exit (exit 1)) - #f))))))) - -;; run ping in separate process, safest way in some cases -;; -(define (server:ping-server ifaceport) - (with-input-from-pipe - (conc (common:get-megatest-exe) " -ping " ifaceport) - (lambda () - (let loop ((inl (read-line)) - (res "NOREPLY")) - (if (eof-object? inl) - (case (string->symbol res) - ((NOREPLY) #f) - ((LOGIN_OK) #t) - (else #f)) - (loop (read-line) inl)))))) - -;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). -;; -(define (server:login toppath) - (lambda (toppath) - (set! *db-last-access* (current-seconds)) ;; might not be needed. - (if (equal? *toppath* toppath) - #t - #f))) - -;; timeout is hms string: 1h 5m 3s, default is 1 minute -;; -(define (server:expiration-timeout) - (let ((tmo (configf:lookup *configdat* "server" "timeout"))) - (if (and (string? tmo) - (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below - (* 3600 (string->number tmo)) - 60))) - -(define (server:get-best-guess-address hostname) - (let ((res #f)) - (for-each - (lambda (adr) - (if (not (eq? (u8vector-ref adr 0) 127)) - (set! res adr))) - ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME - (vector->list (hostinfo-addresses (hostname->hostinfo hostname)))) - (string-intersperse - (map number->string - (u8vector->list - (if res res (hostname->ip hostname)))) "."))) - -;; moving this here as it needs access to db and cannot be in common. -;; -(define (server:writable-watchdog dbstruct) - (thread-sleep! 0.05) ;; delay for startup - (let ((legacy-sync (common:run-sync?)) - (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300)) - (debug-mode (debug:debug-mode 1)) - (last-time (current-seconds)) - (no-sync-db (db:open-no-sync-db)) - (sync-duration 0) ;; run time of the sync in milliseconds - ;;(this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))) - ) - (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls - (debug:print-info 2 *default-log-port* "Periodic sync thread started.") - (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id) );; " this-wd-num="this-wd-num) - (if (and legacy-sync (not *time-to-exit*)) - (let* (;;(dbstruct (db:setup)) - (mtdb (dbr:dbstruct-mtdb dbstruct)) - (mtpath (db:dbdat-get-path mtdb)) - (tmp-area (common:get-db-tmp-area)) - (start-file (conc tmp-area "/.start-sync")) - (end-file (conc tmp-area "/.end-sync"))) - (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") - (let loop () - ;; sync for filesystem local db writes - ;; - (mutex-lock! *db-multi-sync-mutex*) - (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write - (sync-in-progress *db-sync-in-progress*) - (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 5)) - (should-sync (and (not *time-to-exit*) - (> (- (current-seconds) *db-last-sync*) min-intersync-delay))) ;; sync every five seconds minimum, deprecated logic, can probably be removed - (start-time (current-seconds)) - (cpu-load-adj (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f))) - (mt-mod-time (file-modification-time mtpath)) - (last-sync-start (if (common:file-exists? start-file) - (file-modification-time start-file) - 0)) - (last-sync-end (if (common:file-exists? end-file) - (file-modification-time end-file) - 10)) - (sync-period (+ 3 (* cpu-load-adj 30))) ;; as adjusted load increases increase the sync period - (recently-synced (and (< (- start-time mt-mod-time) sync-period) ;; not useful if sync didn't modify megatest.db! - (< mt-mod-time last-sync-start))) - (sync-done (<= last-sync-start last-sync-end)) - (sync-stale (> start-time (+ last-sync-start sync-stale-seconds))) - (will-sync (and (not *time-to-exit*) ;; do not start a sync if we are in the process of exiting - (or need-sync should-sync) - (or sync-done sync-stale) - (not sync-in-progress) - (not recently-synced)))) - (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop. need-sync="need-sync" sync-in-progress=" sync-in-progress - " should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync - " sync-done=" sync-done " sync-period=" sync-period) - (if (and (> sync-period 5) - (common:low-noise-print 30 "sync-period")) - (debug:print-info 0 *default-log-port* "Increased sync period due to long sync times, sync took: " sync-period " seconds.")) - ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) - ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) - (if will-sync (set! *db-sync-in-progress* #t)) - (mutex-unlock! *db-multi-sync-mutex*) - (if will-sync - (let (;; (max-sync-duration (configf:lookup-number *configdat* "server" "max-sync-duration")) ;; KEEPING THIS AVAILABLE BUT SHOULD NOT USE, I'M PRETTY SURE IT DOES NOT WORK! - (sync-start (current-milliseconds))) - (with-output-to-file start-file (lambda ()(print (current-process-id)))) - - ;; put lock here - - ;; (if (or (not max-sync-duration) - ;; (< sync-duration max-sync-duration)) ;; NOTE: db:sync-to-megatest.db keeps track of time of last sync and syncs incrementally - (let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive - (set! sync-duration (- (current-milliseconds) sync-start)) - (if (> res 0) ;; some records were transferred, keep the db alive - (begin - (mutex-lock! *heartbeat-mutex*) - (set! *db-last-access* (current-seconds)) - (mutex-unlock! *heartbeat-mutex*) - (debug:print-info 0 *default-log-port* "sync called, " res " records transferred.")) - (debug:print-info 2 *default-log-port* "sync called but zero records transferred"))))) -;; ;; TODO: factor this next routine out into a function -;; (with-input-from-pipe ;; this should not block other threads but need to verify this -;; (conc "megatest -sync-to-megatest.db -m testsuite:" (common:get-area-name) ":" *toppath*) -;; (lambda () -;; (let loop ((inl (read-line)) -;; (res #f)) -;; (if (eof-object? inl) -;; (begin -;; (set! sync-duration (- (current-milliseconds) sync-start)) -;; (cond -;; ((not res) -;; (debug:print 0 *default-log-port* "ERROR: sync from /tmp db to megatest.db appears to have failed. Recommended that you stop your runs and run \"megatest -cleanup-db\"")) -;; ((> res 0) -;; (mutex-lock! *heartbeat-mutex*) -;; (set! *db-last-access* (current-seconds)) -;; (mutex-unlock! *heartbeat-mutex*)))) -;; (let ((num-synced (let ((matches (string-match "^Synced (\\d+).*$" inl))) -;; (if matches -;; (string->number (cadr matches)) -;; #f)))) -;; (loop (read-line) -;; (or num-synced res)))))))))) - (if will-sync - (begin - (mutex-lock! *db-multi-sync-mutex*) - (set! *db-sync-in-progress* #f) - (set! *db-last-sync* start-time) - (with-output-to-file end-file (lambda ()(print (current-process-id)))) - - ;; release lock here - - (mutex-unlock! *db-multi-sync-mutex*))) - (if (and debug-mode - (> (- start-time last-time) 60)) - (begin - (set! last-time start-time) - (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*)))))) - - ;; keep going unless time to exit - ;; - (if (not *time-to-exit*) - (let delay-loop ((count 0)) - ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) - - (if (and (not *time-to-exit*) - (< count 6)) ;; was 11, changing to 4. - (begin - (thread-sleep! 1) - (delay-loop (+ count 1)))) - (if (not *time-to-exit*) (loop)))) - ;; time to exit, close the no-sync db here - (db:no-sync-close-db no-sync-db) - (if (common:low-noise-print 30) - (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))))) ;;" this-wd-num="this-wd-num))))))) - + +;; Basic stuff for safely kicking off a server +(declare (uses portlogger)) +(import portlogger) +(declare (uses nmsg-transport)) +(import nmsg-transport) + +;; Might want to bring the daemonizing back +;; (declare (uses daemon)) + +(include "common_records.scm") +(include "db_records.scm") + +;;====================================================================== +;; P K T S S T U F F +;;====================================================================== + +;;====================================================================== +;; N A N O M S G B A S E D S E R V E R +;;====================================================================== + +(defstruct nmsg + (conn #f) + (hosts (make-hash-table)) + pkt + pktspec + (mutex (make-mutex)) + ) + +;; make it a global? Well, it is local to nmsg module + +(define *nmsg-conndat* (make-nmsg)) +(nmsg-pktspec-set! *nmsg-conndat* + `((server (hostname . h) + (port . p) + (pid . i) + ))) +;; get a port +;; start the nmsg server +;; look for other servers +;; contact other servers and compile list of servers +;; there are two types of server +;; main servers - dashboards, runners and dedicated servers - need pkt +;; passive servers - test executers, step calls, list-runs - no pkt +;; +(define (server:start-nmsg #!key (force-server-type #f)) + (mutex-lock! (nmsg-mutex *nmsg-conndat*)) + (let* ((server-type (or force-server-type + (if (args:any? "-run" "-server") + 'main + 'passive))) + (port-num (portlogger:open-run-close portlogger:find-port)) + (nmsg-conn (nmsg:start-server port-num)) + (pktspec (nmsg-pktspec *nmsg-conndat*)) + (pktdir (conc (get-environment-variable "MT_RUN_AREA_HOME") + "/.server-pkts"))) + (if (not (directory? pktdir))(create-directory pktdir)) + ;; server is started, now create pkt if needed + (if (eq? server-type 'main) + (nmsg-pkt-set! *nmsg-conndat* + (pkts:write-alist->pkt + pktdir + `((hostname . ,(get-host-name)) + (port . ,port-num) + (pid . ,(current-process-id))) + pktspec: pktspec + ptype: 'server))) + (nmsg-conn-set! *nmsg-conndat* nmsg-conn) + (mutex-unlock! (nmsg-mutex *nmsg-conndat*)) + )) + +;; +;; +;; ;; Call this to start the actual server +;; ;; +;; +;; ;; all routes though here end in exit ... +;; ;; +;; ;; start_server +;; ;; +;; (define (server:launch run-id transport-type) +;; (case transport-type +;; ((http)(http-transport:launch)) +;; ;;((nmsg)(nmsg-transport:launch run-id)) +;; ;;((rpc) (rpc-transport:launch run-id)) +;; (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type)))) +;; +;; ;;====================================================================== +;; ;; S E R V E R U T I L I T I E S +;; ;;====================================================================== +;; +;; ;; Get the transport +;; (define (server:get-transport) +;; (if *transport-type* +;; *transport-type* +;; (let ((ttype (string->symbol +;; (or (args:get-arg "-transport") +;; (configf:lookup *configdat* "server" "transport") +;; "rpc")))) +;; (set! *transport-type* ttype) +;; ttype))) +;; +;; ;; Generate a unique signature for this server +;; (define (server:mk-signature) +;; (message-digest-string (md5-primitive) +;; (with-output-to-string +;; (lambda () +;; (write (list (current-directory) +;; (argv))))))) +;; +;; ;; When using zmq this would send the message back (two step process) +;; ;; with spiffy or rpc this simply returns the return data to be returned +;; ;; +;; (define (server:reply return-addr query-sig success/fail result) +;; (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result) +;; ;; (send-message pubsock target send-more: #t) +;; ;; (send-message pubsock +;; (case (server:get-transport) +;; ((rpc) (db:obj->string (vector success/fail query-sig result))) +;; ((http) (db:obj->string (vector success/fail query-sig result))) +;; ((fs) result) +;; (else +;; (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) +;; result))) +;; +;; ;; Given a run id start a server process ### NOTE ### > file 2>&1 +;; ;; if the run-id is zero and the target-host is set +;; ;; try running on that host +;; ;; incidental: rotate logs in logs/ dir. +;; ;; +;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area +;; (let* ((curr-host (get-host-name)) +;; ;; (attempt-in-progress (server:start-attempted? areapath)) +;; ;; (dot-server-url (server:check-if-running areapath)) +;; (curr-ip (server:get-best-guess-address curr-host)) +;; (curr-pid (current-process-id)) +;; (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) +;; (target-host (car homehost)) +;; (testsuite (common:get-testsuite-name)) +;; (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) +;; (cmdln (conc (common:get-megatest-exe) +;; " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") +;; " -daemonize " +;; "") +;; ;; " -log " logfile +;; " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) +;; (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) +;; (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) +;; ;; we want the remote server to start in *toppath* so push there +;; (push-directory areapath) +;; (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") +;; (thread-start! log-rotate) +;; +;; ;; host.domain.tld match host? +;; (if (and target-host +;; ;; look at target host, is it host.domain.tld or ip address and does it +;; ;; match current ip or hostname +;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) +;; (not (equal? curr-ip target-host))) +;; (begin +;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) +;; (setenv "TARGETHOST" target-host))) +;; +;; (setenv "TARGETHOST_LOGF" logfile) +;; (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time +;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever +;; (system (conc "nbfake " cmdln)) +;; (unsetenv "TARGETHOST_LOGF") +;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) +;; (thread-join! log-rotate) +;; (pop-directory))) +;; +;; ;; given a path to a server log return: host port startseconds +;; ;; +;; (define (server:logf-get-start-info logf) +;; (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs +;; (handle-exceptions +;; exn +;; (list #f #f #f) ;; no idea what went wrong, call it a bad server +;; (with-input-from-file +;; logf +;; (lambda () +;; (let loop ((inl (read-line)) +;; (lnum 0)) +;; (if (not (eof-object? inl)) +;; (let ((mlst (string-match rx inl))) +;; (if (not mlst) +;; (if (< lnum 500) ;; give up if more than 500 lines of server log read +;; (loop (read-line)(+ lnum 1)) +;; (list #f #f #f)) +;; (let ((dat (cdr mlst))) +;; (list (car dat) ;; host +;; (string->number (cadr dat)) ;; port +;; (string->number (caddr dat)))))) +;; (list #f #f #f)))))))) +;; +;; ;; get a list of servers with all relevant data +;; ;; ( mod-time host port start-time pid ) +;; ;; +;; (define (server:get-list areapath #!key (limit #f)) +;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) +;; (day-seconds (* 24 60 60))) +;; ;; if the directory exists continue to get the list +;; ;; otherwise attempt to create the logs dir and then +;; ;; continue +;; (if (if (directory-exists? (conc areapath "/logs")) +;; '() +;; (if (file-write-access? areapath) +;; (begin +;; (condition-case +;; (create-directory (conc areapath "/logs") #t) +;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) +;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list."))) +;; (directory-exists? (conc areapath "/logs"))) +;; '())) +;; (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) +;; (num-serv-logs (length server-logs))) +;; (if (null? server-logs) +;; '() +;; (let loop ((hed (car server-logs)) +;; (tal (cdr server-logs)) +;; (res '())) +;; (let* ((mod-time (handle-exceptions +;; exn +;; (current-seconds) ;; 0 +;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted +;; (down-time (- (current-seconds) mod-time)) +;; (serv-dat (if (or (< num-serv-logs 10) +;; (< down-time 900)) ;; day-seconds)) +;; (server:logf-get-start-info hed) +;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at +;; (serv-rec (cons mod-time serv-dat)) +;; (fmatch (string-match fname-rx hed)) +;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) +;; (new-res (if (null? serv-dat) +;; res +;; (cons (append serv-rec (list pid)) res)))) +;; (if (null? tal) +;; (if (and limit +;; (> (length new-res) limit)) +;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work +;; new-res) +;; (loop (car tal)(cdr tal) new-res))))))))) +;; +;; (define (server:get-num-alive srvlst) +;; (let ((num-alive 0)) +;; (for-each +;; (lambda (server) +;; (match-let (((mod-time host port start-time pid) +;; server)) +;; (let* ((uptime (- (current-seconds) mod-time)) +;; (runtime (if start-time +;; (- mod-time start-time) +;; 0))) +;; (if (< uptime 5)(set! num-alive (+ num-alive 1)))))) +;; srvlst) +;; num-alive)) +;; +;; ;; given a list of servers get a list of valid servers, i.e. at least +;; ;; 10 seconds old, has started and is less than 1 hour old and is +;; ;; active (i.e. mod-time < 10 seconds +;; ;; +;; ;; mod-time host port start-time pid +;; ;; +;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off +;; ;; and servers should stick around for about two hours or so. +;; ;; +;; (define (server:get-best srvlst) +;; (let* ((nums (server:get-num-servers)) +;; (now (current-seconds)) +;; (slst (sort +;; (filter (lambda (rec) +;; (if (and (list? rec) +;; (> (length rec) 2)) +;; (let ((start-time (list-ref rec 3)) +;; (mod-time (list-ref rec 0))) +;; ;; (print "start-time: " start-time " mod-time: " mod-time) +;; (and start-time mod-time +;; (> (- now start-time) 0) ;; been running at least 0 seconds +;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds +;; (< (- now start-time) +;; (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) +;; 180) +;; (random 360))) ;; under one hour running time +/- 180 +;; )) +;; #f)) +;; srvlst) +;; (lambda (a b) +;; (< (list-ref a 3) +;; (list-ref b 3)))))) +;; (if (> (length slst) nums) +;; (take slst nums) +;; slst))) +;; +;; (define (server:get-first-best areapath) +;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; (if (and srvrs +;; (not (null? srvrs))) +;; (car srvrs) +;; #f))) +;; +;; (define (server:get-rand-best areapath) +;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; (if (and (list? srvrs) +;; (not (null? srvrs))) +;; (let* ((len (length srvrs)) +;; (idx (random len))) +;; (list-ref srvrs idx)) +;; #f))) +;; +;; +;; (define (server:record->url servr) +;; (match-let (((mod-time host port start-time pid) +;; servr)) +;; (if (and host port) +;; (conc host ":" port) +;; #f))) +;; +;; (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. +;; (if *my-client-signature* *my-client-signature* +;; (let ((sig (server:mk-signature))) +;; (set! *my-client-signature* sig) +;; *my-client-signature*))) +;; +;; ;; kind start up of servers, wait 40 seconds before allowing another server for a given +;; ;; run-id to be launched +;; (define (server:kind-run areapath) +;; (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? +;; (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun +;; (call-num (car last-run-dat)) +;; (when-run (cadr last-run-dat)) +;; (run-delay (+ (case call-num +;; ((0) 0) +;; ((1) 20) +;; ((2) 300) +;; (else 600)) +;; (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously +;; (lock-file (conc areapath "/logs/server-start.lock"))) +;; (if (> (- (current-seconds) when-run) run-delay) +;; (begin +;; (common:simple-file-lock-and-wait lock-file expire-time: 15) +;; (server:run areapath) +;; (thread-sleep! 5) ;; don't release the lock for at least a few seconds +;; (common:simple-file-release-lock lock-file))) +;; (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) +;; +;; (define (server:start-and-wait areapath #!key (timeout 60)) +;; (let ((give-up-time (+ (current-seconds) timeout))) +;; (let loop ((server-url (server:check-if-running areapath)) +;; (try-num 0)) +;; (if (or server-url +;; (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. +;; server-url +;; (let ((num-ok (length (server:get-best (server:get-list areapath))))) +;; (if (and (> try-num 0) ;; first time through simply wait a little while then try again +;; (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one +;; (server:kind-run areapath)) +;; (thread-sleep! 5) +;; (loop (server:check-if-running areapath) +;; (+ try-num 1))))))) +;; +;; (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. +;; +;; (define (server:get-num-servers #!key (numservers 2)) +;; (let ((ns (string->number +;; (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) +;; (or ns numservers))) +;; +;; ;; no longer care if multiple servers are started by accident. older servers will drop off in time. +;; ;; +;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) +;; (let* ((ns (server:get-num-servers)) +;; (servers (server:get-best (server:get-list areapath)))) +;; ;; (print "servers: " servers " ns: " ns) +;; (if (or (and servers +;; (null? servers)) +;; (not servers) +;; (and (list? servers) +;; (< (length servers) (random ns)))) ;; somewhere between 0 and numservers +;; #f +;; (let loop ((hed (car servers)) +;; (tal (cdr servers))) +;; (let ((res (server:check-server hed))) +;; (if res +;; res +;; (if (null? tal) +;; #f +;; (loop (car tal)(cdr tal))))))))) +;; +;; ;; ping the given server +;; ;; +;; (define (server:check-server server-record) +;; (let* ((server-url (server:record->url server-record)) +;; (res (case *transport-type* +;; ((http)(server:ping server-url)) +;; ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) +;; ))) +;; (if res +;; server-url +;; #f))) +;; +;; ;; DO STUFF HERE - BUG +;; (define (server:kill servr) +;; (match-let (((mod-time hostname port start-time pid) +;; servr)) +;; #;(tasks:kill-server hostname pid) +;; #f +;; )) +;; +;; ;; called in megatest.scm, host-port is string hostname:port +;; ;; +;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running +;; ;; in the same process as the server. +;; ;; +;; (define (server:ping host-port-in #!key (do-exit #f)) +;; (let ((host:port (if (not host-port-in) ;; use read-dotserver to find +;; #f ;; (server:check-if-running *toppath*) +;; ;; (if (number? host-port-in) ;; we were handed a server-id +;; ;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) +;; ;; ;; (print "srec: " srec " host-port-in: " host-port-in) +;; ;; (if srec +;; ;; (conc (vector-ref srec 3) ":" (vector-ref srec 4)) +;; ;; (conc "no such server-id " host-port-in))) +;; host-port-in))) ;; ) +;; (let* ((host-port (if host:port +;; (let ((slst (string-split host:port ":"))) +;; (if (eq? (length slst) 2) +;; (list (car slst)(string->number (cadr slst))) +;; #f)) +;; #f))) +;; ;; (toppath (launch:setup))) +;; ;; (print "host-port=" host-port) +;; (if (not host-port) +;; (begin +;; (if host-port-in +;; (debug:print 0 *default-log-port* "ERROR: bad host:port")) +;; (if do-exit (exit 1)) +;; #f) +;; (let* ((iface (car host-port)) +;; (port (cadr host-port)) +;; (server-dat (http-transport:client-connect iface port)) +;; (login-res (rmt:login-no-auto-client-setup server-dat))) +;; (if (and (list? login-res) +;; (car login-res)) +;; (begin +;; ;; (print "LOGIN_OK") +;; (if do-exit (exit 0)) +;; #t) +;; (begin +;; ;; (print "LOGIN_FAILED") +;; (if do-exit (exit 1)) +;; #f))))))) +;; +;; ;; run ping in separate process, safest way in some cases +;; ;; +;; (define (server:ping-server ifaceport) +;; (with-input-from-pipe +;; (conc (common:get-megatest-exe) " -ping " ifaceport) +;; (lambda () +;; (let loop ((inl (read-line)) +;; (res "NOREPLY")) +;; (if (eof-object? inl) +;; (case (string->symbol res) +;; ((NOREPLY) #f) +;; ((LOGIN_OK) #t) +;; (else #f)) +;; (loop (read-line) inl)))))) +;; +;; ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). +;; ;; +;; (define (server:login toppath) +;; (lambda (toppath) +;; (set! *db-last-access* (current-seconds)) ;; might not be needed. +;; (if (equal? *toppath* toppath) +;; #t +;; #f))) +;; +;; ;; timeout is hms string: 1h 5m 3s, default is 1 minute +;; ;; +;; (define (server:expiration-timeout) +;; (let ((tmo (configf:lookup *configdat* "server" "timeout"))) +;; (if (and (string? tmo) +;; (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below +;; (* 3600 (string->number tmo)) +;; 60))) +;; +;; (define (server:get-best-guess-address hostname) +;; (let ((res #f)) +;; (for-each +;; (lambda (adr) +;; (if (not (eq? (u8vector-ref adr 0) 127)) +;; (set! res adr))) +;; ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME +;; (vector->list (hostinfo-addresses (hostname->hostinfo hostname)))) +;; (string-intersperse +;; (map number->string +;; (u8vector->list +;; (if res res (hostname->ip hostname)))) "."))) +;; +;; ;; moving this here as it needs access to db and cannot be in common. +;; ;; +;; (define (server:writable-watchdog dbstruct) +;; (thread-sleep! 0.05) ;; delay for startup +;; (let ((legacy-sync (common:run-sync?)) +;; (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300)) +;; (debug-mode (debug:debug-mode 1)) +;; (last-time (current-seconds)) +;; (no-sync-db (db:open-no-sync-db)) +;; (sync-duration 0) ;; run time of the sync in milliseconds +;; ;;(this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))) +;; ) +;; (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls +;; (debug:print-info 2 *default-log-port* "Periodic sync thread started.") +;; (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id) );; " this-wd-num="this-wd-num) +;; (if (and legacy-sync (not *time-to-exit*)) +;; (let* (;;(dbstruct (db:setup)) +;; (mtdb (dbr:dbstruct-mtdb dbstruct)) +;; (mtpath (db:dbdat-get-path mtdb)) +;; (tmp-area (common:get-db-tmp-area)) +;; (start-file (conc tmp-area "/.start-sync")) +;; (end-file (conc tmp-area "/.end-sync"))) +;; (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") +;; (let loop () +;; ;; sync for filesystem local db writes +;; ;; +;; (mutex-lock! *db-multi-sync-mutex*) +;; (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write +;; (sync-in-progress *db-sync-in-progress*) +;; (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 5)) +;; (should-sync (and (not *time-to-exit*) +;; (> (- (current-seconds) *db-last-sync*) min-intersync-delay))) ;; sync every five seconds minimum, deprecated logic, can probably be removed +;; (start-time (current-seconds)) +;; (cpu-load-adj (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f))) +;; (mt-mod-time (file-modification-time mtpath)) +;; (last-sync-start (if (common:file-exists? start-file) +;; (file-modification-time start-file) +;; 0)) +;; (last-sync-end (if (common:file-exists? end-file) +;; (file-modification-time end-file) +;; 10)) +;; (sync-period (+ 3 (* cpu-load-adj 30))) ;; as adjusted load increases increase the sync period +;; (recently-synced (and (< (- start-time mt-mod-time) sync-period) ;; not useful if sync didn't modify megatest.db! +;; (< mt-mod-time last-sync-start))) +;; (sync-done (<= last-sync-start last-sync-end)) +;; (sync-stale (> start-time (+ last-sync-start sync-stale-seconds))) +;; (will-sync (and (not *time-to-exit*) ;; do not start a sync if we are in the process of exiting +;; (or need-sync should-sync) +;; (or sync-done sync-stale) +;; (not sync-in-progress) +;; (not recently-synced)))) +;; (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop. need-sync="need-sync" sync-in-progress=" sync-in-progress +;; " should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync +;; " sync-done=" sync-done " sync-period=" sync-period) +;; (if (and (> sync-period 5) +;; (common:low-noise-print 30 "sync-period")) +;; (debug:print-info 0 *default-log-port* "Increased sync period due to long sync times, sync took: " sync-period " seconds.")) +;; ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) +;; ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) +;; (if will-sync (set! *db-sync-in-progress* #t)) +;; (mutex-unlock! *db-multi-sync-mutex*) +;; (if will-sync +;; (let (;; (max-sync-duration (configf:lookup-number *configdat* "server" "max-sync-duration")) ;; KEEPING THIS AVAILABLE BUT SHOULD NOT USE, I'M PRETTY SURE IT DOES NOT WORK! +;; (sync-start (current-milliseconds))) +;; (with-output-to-file start-file (lambda ()(print (current-process-id)))) +;; +;; ;; put lock here +;; +;; ;; (if (or (not max-sync-duration) +;; ;; (< sync-duration max-sync-duration)) ;; NOTE: db:sync-to-megatest.db keeps track of time of last sync and syncs incrementally +;; (let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive +;; (set! sync-duration (- (current-milliseconds) sync-start)) +;; (if (> res 0) ;; some records were transferred, keep the db alive +;; (begin +;; (mutex-lock! *heartbeat-mutex*) +;; (set! *db-last-access* (current-seconds)) +;; (mutex-unlock! *heartbeat-mutex*) +;; (debug:print-info 0 *default-log-port* "sync called, " res " records transferred.")) +;; (debug:print-info 2 *default-log-port* "sync called but zero records transferred"))))) +;; ;; ;; TODO: factor this next routine out into a function +;; ;; (with-input-from-pipe ;; this should not block other threads but need to verify this +;; ;; (conc "megatest -sync-to-megatest.db -m testsuite:" (common:get-area-name) ":" *toppath*) +;; ;; (lambda () +;; ;; (let loop ((inl (read-line)) +;; ;; (res #f)) +;; ;; (if (eof-object? inl) +;; ;; (begin +;; ;; (set! sync-duration (- (current-milliseconds) sync-start)) +;; ;; (cond +;; ;; ((not res) +;; ;; (debug:print 0 *default-log-port* "ERROR: sync from /tmp db to megatest.db appears to have failed. Recommended that you stop your runs and run \"megatest -cleanup-db\"")) +;; ;; ((> res 0) +;; ;; (mutex-lock! *heartbeat-mutex*) +;; ;; (set! *db-last-access* (current-seconds)) +;; ;; (mutex-unlock! *heartbeat-mutex*)))) +;; ;; (let ((num-synced (let ((matches (string-match "^Synced (\\d+).*$" inl))) +;; ;; (if matches +;; ;; (string->number (cadr matches)) +;; ;; #f)))) +;; ;; (loop (read-line) +;; ;; (or num-synced res)))))))))) +;; (if will-sync +;; (begin +;; (mutex-lock! *db-multi-sync-mutex*) +;; (set! *db-sync-in-progress* #f) +;; (set! *db-last-sync* start-time) +;; (with-output-to-file end-file (lambda ()(print (current-process-id)))) +;; +;; ;; release lock here +;; +;; (mutex-unlock! *db-multi-sync-mutex*))) +;; (if (and debug-mode +;; (> (- start-time last-time) 60)) +;; (begin +;; (set! last-time start-time) +;; (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*)))))) +;; +;; ;; keep going unless time to exit +;; ;; +;; (if (not *time-to-exit*) +;; (let delay-loop ((count 0)) +;; ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) +;; +;; (if (and (not *time-to-exit*) +;; (< count 6)) ;; was 11, changing to 4. +;; (begin +;; (thread-sleep! 1) +;; (delay-loop (+ count 1)))) +;; (if (not *time-to-exit*) (loop)))) +;; ;; time to exit, close the no-sync db here +;; (db:no-sync-close-db no-sync-db) +;; (if (common:low-noise-print 30) +;; (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))))) ;;" this-wd-num="this-wd-num))))))) +;;