Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -4,14 +4,15 @@ INSTALL=install SRCFILES = common.scm items.scm launch.scm \ ods.scm runconfig.scm server.scm configf.scm \ db.scm keys.scm margs.scm megatest-version.scm \ process.scm runs.scm tasks.scm tests.scm genexample.scm \ - http-transport.scm filedb.scm \ + http-transport.scm nmsg-transport.scm filedb.scm \ client.scm gutils.scm synchash.scm daemon.scm mt.scm dcommon.scm \ tree.scm ezsteps.scm lock-queue.scm sdb.scm \ - rmt.scm api.scm tdb.scm portlogger.scm + rmt.scm api.scm tdb.scm rpc-transport.scm \ + portlogger.scm archive.scm # Eggs to install (straightforward ones) EGGS=matchable readline apropos base64 regex-literals format regex-case test coops trace csv \ dot-locking posix-utils posix-extras directory-utils hostinfo tcp-server rpc csv-xml fmt \ json md5 awful http-client spiffy uri-common intarweb spiffy-request-vars \ @@ -58,11 +59,11 @@ tests.o runs.o dashboard.o dashboard-tests.o dashboard-main.o : run_records.scm db.o ezsteps.o keys.o launch.o megatest.o monitor.o runs-for-ref.o runs.o tests.o : key_records.scm tests.o tasks.o dashboard-tasks.o : task_records.scm runs.o : test_records.scm megatest.o : megatest-fossil-hash.scm -client.scm common.scm configf.scm dashboard-guimonitor.scm dashboard-tests.scm dashboard.scm db.scm dcommon.scm ezsteps.scm fs-transport.scm http-transport.scm index-tree.scm items.scm keys.scm launch.scm megatest.scm monitor.scm mt.scm newdashboard.scm runconfig.scm runs.scm server.scm tdb.scm tests.scm tree.scm zmq-transport.scm : common_records.scm +client.scm common.scm configf.scm dashboard-guimonitor.scm dashboard-tests.scm dashboard.scm db.scm dcommon.scm ezsteps.scm fs-transport.scm http-transport.scm index-tree.scm items.scm keys.scm launch.scm megatest.scm monitor.scm mt.scm newdashboard.scm runconfig.scm runs.scm server.scm tdb.scm tests.scm tree.scm zmq-transport.scm : common_records.scm rpc-transport.scm # Temporary while transitioning to new routine # runs.o : run-tests-queue-classic.scm run-tests-queue-new.scm megatest-fossil-hash.scm : $(SRCFILES) megatest.scm *_records.scm Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -10,10 +10,11 @@ ;;====================================================================== (declare (unit api)) (declare (uses rmt)) (declare (uses db)) +(declare (uses tasks)) ;; allow these queries through without starting a server ;; (define api:read-only-queries '(get-key-val-pairs @@ -44,94 +45,191 @@ get-prev-run-ids get-run-ids-matching-target get-runs-by-patt get-steps-data login - testmeta-get-record)) + testmeta-get-record + have-incompletes? + synchash-get + )) + +(define api:write-queries + '( + ;; SERVERS + start-server + kill-server + + ;; TESTS + test-set-state-status-by-id + delete-test-records + delete-old-deleted-test-records + test-set-status-state + test-set-top-process-pid + roll-up-pass-fail-counts + update-fail-pass-counts + + ;; RUNS + register-run + set-tests-state-status + delete-run + lock/unlock-run + update-run-event_time + mark-incomplete + + ;; STEPS + teststep-set-status! + + ;; TEST DATA + test-data-rollup + csv->test-data + + ;; MISC + sync-inmem->db + + ;; TESTMETA + testmeta-add-record + testmeta-update-field + + ;; TASKS + tasks-add + tasks-set-state-given-param-key + )) ;; These are called by the server on recipt of /api calls ;; - keep it simple, only return the actual result of the call, i.e. no meta info here ;; -(define (api:execute-requests dbstruct cmd params) - (case (string->symbol cmd) - ;; SERVERS - ((start-server) (apply server:kind-run params)) - ((kill-server) (set! *server-run* #f)) - - ;; KEYS - ((get-key-val-pairs) (apply db:get-key-val-pairs dbstruct params)) - ((get-keys) (db:get-keys dbstruct)) - - ;; TESTS - ((test-toplevel-num-items) (apply db:test-toplevel-num-items dbstruct params)) - ((get-test-info-by-id) (apply db:get-test-info-by-id dbstruct params)) - ((test-get-rundir-from-test-id) (apply db:test-get-rundir-from-test-id dbstruct params)) - ((test-set-state-status-by-id) (apply db:test-set-state-status-by-id dbstruct params)) - ((get-count-tests-running) (apply db:get-count-tests-running dbstruct params)) - ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params)) - ((delete-test-records) (apply db:delete-test-records dbstruct params)) - ;; ((delete-test-step-records) (apply db:delete-test-step-records dbstruct params)) - ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params)) - ((test-set-status-state) (apply db:test-set-status-state dbstruct params)) - ((get-previous-test-run-record) (apply db:get-previous-test-run-record dbstruct params)) - ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params)) - ((test-get-logfile-info) (apply db:test-get-logfile-info dbstruct params)) - ((test-get-records-for-index-file) (apply db:test-get-records-for-index-file dbstruct params)) - ((get-testinfo-state-status) (apply db:get-testinfo-state-status dbstruct params)) - ((test-set-top-process-pid) (apply db:test-set-top-process-pid dbstruct params)) - ((test-get-top-process-pid) (apply db:test-get-top-process-pid dbstruct params)) - ((test-get-paths-matching-keynames-target-new) (apply db:test-get-paths-matching-keynames-target-new dbstruct params)) - ((get-prereqs-not-met) (apply db:get-prereqs-not-met dbstruct params)) - ((roll-up-pass-fail-counts) (apply db:roll-up-pass-fail-counts dbstruct params)) - ((update-fail-pass-counts) (apply db:general-call dbstruct 'update-pass-fail-counts params)) - ((get-count-tests-running-for-run-id) (apply db:get-count-tests-running-for-run-id dbstruct params)) - - ;; RUNS - ((get-run-info) (apply db:get-run-info dbstruct params)) - ((get-run-status) (apply db:get-run-status dbstruct params)) - ((set-run-status) (apply db:set-run-status dbstruct params)) - ((register-run) (apply db:register-run dbstruct params)) - ((set-tests-state-status) (apply db:set-tests-state-status dbstruct params)) - ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) - ((get-test-id) (apply db:get-test-id dbstruct params)) +;; - returns #( flag result ) +;; +(define (api:execute-requests dbstruct dat) + (handle-exceptions + exn + (let ((call-chain (get-call-chain))) + (print-call-chain (current-error-port)) + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens + (if (not (vector? dat)) ;; it is an error to not receive a vector + (vector #f #f "remote must be called with a vector") + (vector ;; return a vector + the returned data structure + #t + (let ((cmd (vector-ref dat 0)) + (params (vector-ref dat 1))) + (case (if (symbol? cmd) + cmd + (string->symbol cmd)) + + ;;=============================================== + ;; READ/WRITE QUERIES + ;;=============================================== + + ;; SERVERS + ((start-server) (apply server:kind-run params)) + ((kill-server) (set! *server-run* #f)) + + ;; TESTS + ((test-set-state-status-by-id) (apply db:test-set-state-status-by-id dbstruct params)) + ((delete-test-records) (apply db:delete-test-records dbstruct params)) + ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params)) + ((test-set-status-state) (apply db:test-set-status-state dbstruct params)) + ((test-set-top-process-pid) (apply db:test-set-top-process-pid dbstruct params)) + ((roll-up-pass-fail-counts) (apply db:roll-up-pass-fail-counts dbstruct params)) + ((update-fail-pass-counts) (apply db:general-call dbstruct 'update-pass-fail-counts params)) + ((test-set-archive-block-id) (apply db:test-set-archive-block-id dbstruct params)) + + ;; RUNS + ((register-run) (apply db:register-run dbstruct params)) + ((set-tests-state-status) (apply db:set-tests-state-status dbstruct params)) + ((delete-run) (apply db:delete-run dbstruct params)) + ((lock/unlock-run) (apply db:lock/unlock-run dbstruct params)) + ((update-run-event_time) (apply db:update-run-event_time dbstruct params)) + + ;; STEPS + ((teststep-set-status!) (apply db:teststep-set-status! dbstruct params)) + + ;; TEST DATA + ((test-data-rollup) (apply db:test-data-rollup dbstruct params)) + ((csv->test-data) (apply db:csv->test-data dbstruct params)) + + ;; MISC + ((sync-inmem->db) (let ((run-id (car params))) + (db:sync-touched dbstruct run-id force-sync: #t))) + ((mark-incomplete) (apply db:find-and-mark-incomplete dbstruct params)) + + ;; TESTMETA + ((testmeta-add-record) (apply db:testmeta-add-record dbstruct params)) + ((testmeta-update-field) (apply db:testmeta-update-field dbstruct params)) + + ;; TASKS + ((tasks-add) (apply tasks:add dbstruct params)) + ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params)) + + ;; ARCHIVES + ;; ((archive-get-allocations) + ((archive-register-disk) (apply db:archive-register-disk dbstruct params)) + ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params)) + ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey)) + + ;;====================================================================== + ;; READ ONLY QUERIES + ;;====================================================================== + + ;; KEYS + ((get-key-val-pairs) (apply db:get-key-val-pairs dbstruct params)) + ((get-keys) (db:get-keys dbstruct)) + + ;; TESTS + ((test-toplevel-num-items) (apply db:test-toplevel-num-items dbstruct params)) + ((get-test-info-by-id) (apply db:get-test-info-by-id dbstruct params)) + ((test-get-rundir-from-test-id) (apply db:test-get-rundir-from-test-id dbstruct params)) + ((get-count-tests-running) (apply db:get-count-tests-running dbstruct params)) + ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params)) + ;; ((delete-test-step-records) (apply db:delete-test-step-records dbstruct params)) + ((get-previous-test-run-record) (apply db:get-previous-test-run-record dbstruct params)) + ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params)) + ((test-get-logfile-info) (apply db:test-get-logfile-info dbstruct params)) + ((test-get-records-for-index-file) (apply db:test-get-records-for-index-file dbstruct params)) + ((get-testinfo-state-status) (apply db:get-testinfo-state-status dbstruct params)) + ((test-get-top-process-pid) (apply db:test-get-top-process-pid dbstruct params)) + ((test-get-paths-matching-keynames-target-new) (apply db:test-get-paths-matching-keynames-target-new dbstruct params)) + ((get-prereqs-not-met) (apply db:get-prereqs-not-met dbstruct params)) + ((get-count-tests-running-for-run-id) (apply db:get-count-tests-running-for-run-id dbstruct params)) + ((synchash-get) (apply synchash:server-get dbstruct params)) + + ;; RUNS + ((get-run-info) (apply db:get-run-info dbstruct params)) + ((get-run-status) (apply db:get-run-status dbstruct params)) + ((set-run-status) (apply db:set-run-status dbstruct params)) + ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) + ((get-test-id) (apply db:get-test-id dbstruct params)) ((get-tests-for-run-mindata) (apply db:get-tests-for-run-mindata dbstruct params)) - ((get-run-name-from-id) (apply db:get-run-name-from-id dbstruct params)) - ((delete-run) (apply db:delete-run dbstruct params)) - ((get-runs) (apply db:get-runs dbstruct params)) - ((get-all-run-ids) (db:get-all-run-ids dbstruct)) - ((get-prev-run-ids) (apply db:get-prev-run-ids dbstruct params)) - ((get-run-ids-matching-target) (apply db:get-run-ids-matching-target dbstruct params)) - ((get-runs-by-patt) (apply db:get-runs-by-patt dbstruct params)) - ((lock/unlock-run) (apply db:lock/unlock-run dbstruct params)) - ((update-run-event_time) (apply db:update-run-event_time dbstruct params)) - ((find-and-mark-incomplete) (apply db:find-and-mark-incomplete dbstruct params)) - - ;; STEPS - ((teststep-set-status!) (apply db:teststep-set-status! dbstruct params)) - - ;; TEST DATA - ((test-data-rollup) (apply db:test-data-rollup dbstruct params)) - ((csv->test-data) (apply db:csv->test-data dbstruct params)) - ((get-steps-data) (apply db:get-steps-data dbstruct params)) - - ;; MISC - ((login) (apply db:login dbstruct params)) - ((general-call) (let ((stmtname (car params)) - (run-id (cadr params)) - (realparams (cddr params))) - (db:with-db dbstruct run-id #t ;; these are all for modifying the db - (lambda (db) - (db:general-call db stmtname realparams))))) - ((sync-inmem->db) (db:sync-touched dbstruct run-id force-sync: #t)) - ((sdb-qry) (apply sdb:qry params)) - - ;; TESTMETA - ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) - ((testmeta-add-record) (apply db:testmeta-add-record dbstruct params)) - ((testmeta-update-field) (apply db:testmeta-update-field dbstruct params)) - (else - (list "ERROR" 0)))) + ((get-runs) (apply db:get-runs dbstruct params)) + ((get-all-run-ids) (db:get-all-run-ids dbstruct)) + ((get-prev-run-ids) (apply db:get-prev-run-ids dbstruct params)) + ((get-run-ids-matching-target) (apply db:get-run-ids-matching-target dbstruct params)) + ((get-runs-by-patt) (apply db:get-runs-by-patt dbstruct params)) + ((get-run-name-from-id) (apply db:get-run-name-from-id dbstruct params)) + + ;; STEPS + ((get-steps-data) (apply db:get-steps-data dbstruct params)) + + ;; MISC + ((have-incompletes?) (apply db:have-incompletes? dbstruct params)) + ((login) (apply db:login dbstruct params)) + ((general-call) (let ((stmtname (car params)) + (run-id (cadr params)) + (realparams (cddr params))) + (db:with-db dbstruct run-id #t ;; these are all for modifying the db + (lambda (db) + (db:general-call db stmtname realparams))))) + ((sdb-qry) (apply sdb:qry params)) + ((ping) (current-process-id)) + + ;; TESTMETA + ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) + + ;; TASKS + ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)))))))) + ;; http-server send-response ;; api:process-request ;; db:* ;; @@ -138,18 +236,19 @@ ;; NB// Runs on the server as part of the server loop ;; (define (api:process-request dbstruct $) ;; the $ is the request vars proc (let* ((cmd ($ 'cmd)) (paramsj ($ 'params)) - (params (db:string->obj paramsj)) ;; (rmt:json-str->dat paramsj)) - (res (api:execute-requests dbstruct cmd params))) + (params (db:string->obj paramsj transport: 'http)) ;; (rmt:json-str->dat paramsj)) + (resdat (api:execute-requests dbstruct (vector cmd params))) ;; #( flag result ) + (res (vector-ref resdat 1))) ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds ;; (rmt:dat->json-str ;; (if (or (string? res) ;; (list? res) ;; (number? res) ;; (boolean? res)) ;; res ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) - (db:obj->string res))) + (db:obj->string res transport: 'http))) Index: archive.scm ================================================================== --- archive.scm +++ archive.scm @@ -7,12 +7,283 @@ ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. ;; strftime('%m/%d/%Y %H:%M:%S','now','localtime') -(use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking format) +(use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking format md5 message-digest) (import (prefix sqlite3 sqlite3:)) (declare (unit archive)) (declare (uses db)) (declare (uses common)) +(include "common_records.scm") +(include "db_records.scm") + +;;====================================================================== +;; +;;====================================================================== + +;; NOT CURRENTLY USED +;; +(define (archive:main linktree target runname testname itempath options) + (let ((testdir (conc linktree "/" target "/" runname "/" testname "/" itempatt)) + (flavor 'plain) ;; type of machine to run jobs on + (maxload 1.5) ;; max allowed load for this work + (adisks (archive:get-archive-disks))) + ;; get testdir size + ;; - hand off du to job mgr + (if (and (file-exists? testdir) + (file-is-writable? testdir)) + (let* ((dused (jobrunner:run-job + flavor ;; machine type + maxload ;; max allowed load + '() ;; prevars - environment vars to set for the job + common:get-disk-space-used ;; if a proc call it, if a string it is a unix command + (list testdir))) + (apath (archive:get-archive testname itempath dused))) + (jobrunner:run-job + flavor + maxload + '() + archive:run-bup + (list testdir apath)))))) + +;; Get archive disks from megatest.config +;; +(define (archive:get-archive-disks) + (let ((section (configf:get-section *configdat* "archive-disks"))) + (if section + section + '()))) + +;; look for the best candidate archive area, else create new +;; area +;; +(define (archive:get-archive testname itempath dused) + ;; look up in archive_allocations if there is a pre-used archive + ;; with adequate diskspace + ;; + (let* ((existing-blocks (rmt:archive-get-allocations testname itempath dused)) + (candidate-disks (map (lambda (block) + (list + (vector-ref block 1) ;; archive-area-name + (vector-ref block 2))) ;; disk-path + existing-blocks))) + (or (common:get-disk-with-most-free-space candidate-disks dused) + (archive:allocate-new-archive-block testname itempath)))) + +;; allocate a new archive area +;; +(define (archive:allocate-new-archive-block run-area-home testsuite-name dneeded) + (let* ((adisks (archive:get-archive-disks)) + (best-disk (common:get-disk-with-most-free-space adisks dneeded))) + (if best-disk + (let* ((bdisk-name (car best-disk)) + (bdisk-path (cdr best-disk)) + (area-key (substring (message-digest-string (md5-primitive) run-area-home) 0 5)) + (bdisk-id (rmt:archive-register-disk bdisk-name bdisk-path (get-df bdisk-path))) + (archive-name (let ((sec (current-seconds))) + (conc (time->string (seconds->local-time sec) "%Y") + "_q" (seconds->quarter sec) "/" + testsuite-name "_" area-key))) + (archive-path (conc bdisk-path "/" archive-name)) + (block-id (rmt:archive-register-block-name bdisk-id archive-path))) + ;; (allocation-id (rmt:archive-allocate-testsuite/area-to-block block-id testsuite-name area-key))) + (if block-id ;; (and block-id allocation-id) + (cons block-id archive-path) + #f)) + #f))) + +;; archive - run bup +;; +;; 1. create the bup dir if not exists +;; 2. start the du of each directory +;; 3. gen index +;; 4. save +;; +(define (archive:run-bup archive-command run-id run-name tests) + ;; move the getting of archive space down into the below block so that a single run can + ;; allocate as needed should a disk fill up + ;; + (let* ((min-space (string->number (or (configf:lookup *configdat* "archive" "minspace") "1000"))) + (archive-info (archive:allocate-new-archive-block *toppath* (common:get-testsuite-name) min-space)) + (archive-dir (if archive-info (cdr archive-info) #f)) + (archive-id (if archive-info (car archive-info) -1)) + (disk-groups (make-hash-table)) + (test-groups (make-hash-table)) ;; these two (disk and test groups) could be combined nicely + (bup-exe (or (configf:lookup *configdat* "archive" "bup") "bup")) + (compress (or (configf:lookup *configdat* "archive" "compress") "9")) + (linktree (configf:lookup *configdat* "setup" "linktree"))) + + (if (not archive-dir) ;; no archive disk found, this is fatal + (begin + (debug:print 0 "FATAL: No archive disks found. Please add disks with at least " min-space " MB space to the [archive-disks] section of megatest.config") + (debug:print 0 " use [archive] minspace to specify minimum available space") + (debug:print 0 " disks: " (string-intersperse (map cadr (archive:get-archive-disks)) "\n ")) + (exit 1)) + (debug:print-info 0 "Using path " archive-dir " for archiving")) + + ;; from the test info bin the path to the test by stem + ;; + (for-each + (lambda (test-dat) + (let* ((item-path (db:test-get-item-path test-dat)) + (test-name (db:test-get-testname test-dat)) + (test-id (db:test-get-id test-dat)) + (run-id (db:test-get-run_id test-dat)) + (target (string-intersperse (map cadr (rmt:get-key-val-pairs run-id)) "/")) + + (toplevel/children (and (db:test-get-is-toplevel test-dat) + (> (rmt:test-toplevel-num-items run-id test-name) 0))) + (test-partial-path (conc target "/" run-name "/" (runs:make-full-test-name test-name item-path))) + ;; note the trailing slash to get the dir inspite of it being a link + (test-path (conc linktree "/" test-partial-path)) + (test-physical-path (if (file-exists? test-path) (read-symbolic-link test-path #t) #f)) + (partial-path-index (if test-physical-path (substring-index test-partial-path test-physical-path) #f)) + (test-base (if (and partial-path-index + test-physical-path ) + (substring test-physical-path + 0 + partial-path-index) + #f))) + + (if (or toplevel/children + (not (file-exists? test-path))) + #f + (begin + (debug:print 0 + "From test-dat=" test-dat " derived the following:\n" + "test-partial-path = " test-partial-path "\n" + "test-path = " test-path "\n" + "test-physical-path = " test-physical-path "\n" + "partial-path-index = " partial-path-index "\n" + "test-base = " test-base) + (hash-table-set! disk-groups test-base (cons test-physical-path (hash-table-ref/default disk-groups test-base '()))) + (hash-table-set! test-groups test-base (cons test-dat (hash-table-ref/default test-groups test-base '()))) + test-path)))) + tests) + ;; for each disk-group + (for-each + (lambda (disk-group) + (debug:print 0 "Processing disk-group " disk-group) + (let* ((test-paths (hash-table-ref disk-groups disk-group)) + ;; ((string-intersperse (map cadr (rmt:get-key-val-pairs 1)) "-") + (bup-init-params (list "-d" archive-dir "init")) + (bup-index-params (append (list "-d" archive-dir "index") test-paths)) + (bup-save-params (append (list "-d" archive-dir "save" ;; (conc "--strip-path=" linktree) + (conc "-" compress) ;; or (conc "--compress=" compress) + "-n" (conc (common:get-testsuite-name) "-" run-id) + (conc "--strip-path=" disk-group)) + test-paths)) + (print-prefix #f)) ;; "Running: ")) ;; change to #f to turn off printing + (if (not (file-exists? archive-dir)) + (create-directory archive-dir #t)) + (if (not (file-exists? (conc archive-dir "/HEAD"))) + (begin + ;; replace this with jobrunner stuff enventually + (debug:print-info 0 "Init bup in " archive-dir) + (run-n-wait bup-exe params: bup-init-params print-cmd: print-prefix))) + (debug:print-info 0 "Indexing data to be archived") + (run-n-wait bup-exe params: bup-index-params print-cmd: print-prefix) + (debug:print-info 0 "Archiving data with bup") + (run-n-wait bup-exe params: bup-save-params print-cmd: print-prefix) + (for-each + (lambda (test-dat) + (let ((test-id (db:test-get-id test-dat)) + (run-id (db:test-get-run_id test-dat))) + (rmt:test-set-archive-block-id run-id test-id archive-id) + (if (member archive-command '("save-remove")) + (runs:remove-test-directory test-dat 'archive-remove)))) + (hash-table-ref test-groups disk-group)))) + (hash-table-keys disk-groups)) + #t)) + +(define (archive:bup-restore archive-command run-id run-name tests) ;; move the getting of archive space down into the below block so that a single run can + ;; allocate as needed should a disk fill up + ;; + (let* ((disk-groups (make-hash-table)) + (test-groups (make-hash-table)) ;; these two (disk and test groups) could be combined nicely + (bup-exe (or (configf:lookup *configdat* "archive" "bup") "bup")) + (linktree (configf:lookup *configdat* "setup" "linktree"))) + + ;; from the test info bin the path to the test by stem + ;; + (for-each + (lambda (test-dat) + (let* ((item-path (db:test-get-item-path test-dat)) + (test-name (db:test-get-testname test-dat)) + (test-id (db:test-get-id test-dat)) + (run-id (db:test-get-run_id test-dat)) + (target (string-intersperse (map cadr (rmt:get-key-val-pairs run-id)) "/")) + + (toplevel/children (and (db:test-get-is-toplevel test-dat) + (> (rmt:test-toplevel-num-items run-id test-name) 0))) + (test-partial-path (conc target "/" run-name "/" (runs:make-full-test-name test-name item-path))) + ;; note the trailing slash to get the dir inspite of it being a link + (test-path (conc linktree "/" test-partial-path)) + (test-physical-path (if (file-exists? test-path) (read-symbolic-link test-path #t) #f)) + (partial-path-index (if test-physical-path (substring-index test-partial-path test-physical-path) #f)) + (test-base (if (and partial-path-index + test-physical-path ) + (substring test-physical-path + 0 + partial-path-index) + #f))) + + ;; CREATE WORK AREA + (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat #!key (remtries 2)) + + ;; DO BUP RESTORE + + + + (if (or toplevel/children + (not (file-exists? test-path))) + #f + (begin + (debug:print 0 + "From test-dat=" test-dat " derived the following:\n" + "test-partial-path = " test-partial-path "\n" + "test-path = " test-path "\n" + "test-physical-path = " test-physical-path "\n" + "partial-path-index = " partial-path-index "\n" + "test-base = " test-base) + (hash-table-set! disk-groups test-base (cons test-physical-path (hash-table-ref/default disk-groups test-base '()))) + (hash-table-set! test-groups test-base (cons test-dat (hash-table-ref/default test-groups test-base '()))) + test-path)))) + tests) + ;; for each disk-group + (for-each + (lambda (disk-group) + (debug:print 0 "Processing disk-group " disk-group) + (let* ((test-paths (hash-table-ref disk-groups disk-group)) + ;; ((string-intersperse (map cadr (rmt:get-key-val-pairs 1)) "-") + (bup-init-params (list "-d" archive-dir "init")) + (bup-index-params (append (list "-d" archive-dir "index") test-paths)) + (bup-save-params (append (list "-d" archive-dir "save" ;; (conc "--strip-path=" linktree) + (conc "-" compress) ;; or (conc "--compress=" compress) + "-n" (conc (common:get-testsuite-name) "-" run-id) + (conc "--strip-path=" disk-group)) + test-paths)) + (print-prefix #f)) ;; "Running: ")) ;; change to #f to turn off printing + (if (not (file-exists? archive-dir)) + (create-directory archive-dir #t)) + (if (not (file-exists? (conc archive-dir "/HEAD"))) + (begin + ;; replace this with jobrunner stuff enventually + (debug:print-info 0 "Init bup in " archive-dir) + (run-n-wait bup-exe params: bup-init-params print-cmd: print-prefix))) + (debug:print-info 0 "Indexing data to be archived") + (run-n-wait bup-exe params: bup-index-params print-cmd: print-prefix) + (debug:print-info 0 "Archiving data with bup") + (run-n-wait bup-exe params: bup-save-params print-cmd: print-prefix) + (for-each + (lambda (test-dat) + (let ((test-id (db:test-get-id test-dat)) + (run-id (db:test-get-run_id test-dat))) + (rmt:test-set-archive-block-id run-id test-id archive-id) + (if (member archive-command '("save-remove")) + (runs:remove-test-directory test-dat 'archive-remove)))) + (hash-table-ref test-groups disk-group)))) + (hash-table-keys disk-groups)) + #t)) Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -41,10 +41,109 @@ (define (client:logout serverdat) (let ((ok (and (socket? serverdat) (cdb:logout serverdat *toppath* (client:get-signature))))) ok)) +(define (client:connect iface port) + (case (server:get-transport) + ((rpc) (rpc:client-connect iface port)) + ((http) (http:client-connect iface port)) + ((zmq) (zmq:client-connect iface port)) + (else (rpc:client-connect iface port)))) + +(define (client:setup run-id #!key (remaining-tries 10) (failed-connects 0)) + (case (server:get-transport) + ((rpc) (rpc-transport:client-setup run-id)) ;;(client:setup-rpc run-id)) + ((http)(client:setup-http run-id)) + (else (rpc-transport:client-setup run-id)))) ;; (client:setup-rpc run-id)))) + +;; (define (client:login-no-auto-setup server-info run-id) +;; (case (server:get-transport) +;; ((rpc) (rpc:login-no-auto-client-setup server-info run-id)) +;; ((http) (rmt:login-no-auto-client-setup server-info run-id)) +;; (else (rpc:login-no-auto-client-setup server-info run-id)))) +;; +;; (define (client:setup-rpc run-id) +;; (debug:print 0 "INFO: client:setup remaining-tries=" remaining-tries) +;; (if (<= remaining-tries 0) +;; (begin +;; (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id) +;; (exit 1)) +;; (let ((host-info (hash-table-ref/default *runremote* run-id #f))) +;; (debug:print-info 0 "client:setup host-info=" host-info ", remaining-tries=" remaining-tries) +;; (if host-info +;; (let* ((iface (car host-info)) +;; (port (cadr host-info)) +;; (start-res (client:connect iface port)) +;; ;; (ping-res (server:ping-server run-id iface port)) +;; (ping-res (client:login-no-auto-setup start-res run-id))) +;; (if ping-res ;; sucessful login? +;; (begin +;; (hash-table-set! *runremote* run-id start-res) +;; start-res) ;; return the server info +;; (if (member remaining-tries '(3 4 6)) +;; (begin ;; login failed +;; (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" host-info) +;; (hash-table-delete! *runremote* run-id) +;; (open-run-close tasks:server-force-clean-run-record +;; tasks:open-db +;; run-id +;; (car host-info) +;; (cadr host-info) +;; " client:setup (host-info=#t)") +;; (thread-sleep! 5) +;; (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1))) +;; (begin +;; (debug:print 25 "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info) +;; (thread-sleep! 5) +;; (client:setup run-id remaining-tries: (- remaining-tries 1)))))) +;; ;; YUK: rename server-dat here +;; (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id))) +;; (debug:print-info 0 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) +;; (if server-dat +;; (let* ((iface (tasks:hostinfo-get-interface server-dat)) +;; (port (tasks:hostinfo-get-port server-dat)) +;; (start-res (http-transport:client-connect iface port)) +;; ;; (ping-res (server:ping-server run-id iface port)) +;; (ping-res (rmt:login-no-auto-client-setup start-res run-id))) +;; (if start-res +;; (begin +;; (hash-table-set! *runremote* run-id start-res) +;; start-res) +;; (if (member remaining-tries '(2 5)) +;; (begin ;; login failed +;; (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) +;; (hash-table-delete! *runremote* run-id) +;; (open-run-close tasks:server-force-clean-run-record +;; tasks:open-db +;; run-id +;; (tasks:hostinfo-get-interface server-dat) +;; (tasks:hostinfo-get-port server-dat) +;; " client:setup (server-dat = #t)") +;; (thread-sleep! 2) +;; (server:try-running run-id) +;; (thread-sleep! 10) ;; give server a little time to start up +;; (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1))) +;; (begin +;; (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) +;; (thread-sleep! 5) +;; (client:setup run-id remaining-tries: (- remaining-tries 1)))))) +;; (begin ;; no server registered +;; (if (eq? remaining-tries 2) +;; (begin +;; ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)") +;; (client:setup run-id remaining-tries: 10)) +;; (begin +;; (thread-sleep! 2) +;; (debug:print 25 "INFO: client:setup start-res (not defined here), run-id=" run-id ", server-dat=" server-dat) +;; (if (< (open-run-close tasks:num-in-available-state tasks:open-db run-id) 3) +;; (begin +;; ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)") +;; (server:try-running run-id))) +;; (thread-sleep! 10) ;; give server a little time to start up +;; (client:setup run-id remaining-tries: (- remaining-tries 1))))))))))) + ;; Do all the connection work, look up the transport type and set up the ;; connection if required. ;; ;; There are two scenarios. ;; 1. We are a test manager and we received *transport-type* and *runremote* via cmdline @@ -53,75 +152,63 @@ ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; -(define (client:setup run-id #!key (remaining-tries 100) (failed-connects 0)) +(define (client:setup-http run-id #!key (remaining-tries 10) (failed-connects 0)) (debug:print-info 2 "client:setup remaining-tries=" remaining-tries) (let* ((tdbdat (tasks:open-db))) (if (<= remaining-tries 0) (begin (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id) (exit 1)) - (let ((host-info (hash-table-ref/default *runremote* run-id #f))) - (if host-info - (let* ((iface (http-transport:server-dat-get-iface host-info)) - (port (http-transport:server-dat-get-port host-info)) - (start-res (http-transport:client-connect iface port)) - (ping-res (rmt:login-no-auto-client-setup start-res run-id))) - (if ping-res ;; sucessful login? - (begin - (debug:print-info 2 "client:setup, ping is good using host-info=" host-info ", remaining-tries=" remaining-tries) - ;; Why add the close-connections here? - ;; (http-transport:close-connections run-id) - (hash-table-set! *runremote* run-id start-res) - start-res) ;; return the server info - ;; have host info but no ping. shutdown the current connection and try again - (begin ;; login failed - (debug:print-info 1 "client:setup, ping is bad for start-res=" start-res " and *runremote*=" host-info) - (http-transport:close-connections run-id) - (hash-table-delete! *runremote* run-id) - (if (< remaining-tries 8) - (thread-sleep! 5) - (thread-sleep! 1)) - (client:setup run-id remaining-tries: (- remaining-tries 1))))) - ;; YUK: rename server-dat here - (let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) - (debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if server-dat - (let* ((iface (tasks:hostinfo-get-interface server-dat)) - (port (tasks:hostinfo-get-port server-dat)) - (start-res (http-transport:client-connect iface port)) - (ping-res (rmt:login-no-auto-client-setup start-res run-id))) - (if (and start-res - ping-res) - (begin - (hash-table-set! *runremote* run-id start-res) - (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res)) - start-res) - (begin ;; login failed but have a server record, clean out the record and try again - (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) - (http-transport:close-connections run-id) - (hash-table-delete! *runremote* run-id) - (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) - run-id - (tasks:hostinfo-get-interface server-dat) - (tasks:hostinfo-get-port server-dat) - " client:setup (server-dat = #t)") - (thread-sleep! 2) - (server:try-running run-id) - (thread-sleep! 10) ;; give server a little time to start up - (client:setup run-id remaining-tries: (- remaining-tries 1))))) - (begin ;; no server registered - (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id))) - (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) - (thread-sleep! 2) - (if (< num-available 2) - (begin - (server:try-running run-id))) - (thread-sleep! 10) ;; give server a little time to start up - (client:setup run-id remaining-tries: (- remaining-tries 1))))))))))) + (let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) + (debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) + (if server-dat + (let* ((iface (tasks:hostinfo-get-interface server-dat)) + (hostname (tasks:hostinfo-get-hostname server-dat)) + (port (tasks:hostinfo-get-port server-dat)) + (start-res (case *transport-type* + ((http)(http-transport:client-connect iface port)) + ((nmsg)(nmsg-transport:client-connect hostname port)))) + (ping-res (case *transport-type* + ((http)(rmt:login-no-auto-client-setup start-res run-id)) + ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id))) + (if logininfo + (car (vector-ref logininfo 1)) + #f)))))) + (if (and start-res + ping-res) + (begin + (hash-table-set! *runremote* run-id start-res) + (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res)) + start-res) + (begin ;; login failed but have a server record, clean out the record and try again + (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) + (case *transport-type* + ((http)(http-transport:close-connections run-id))) + (hash-table-delete! *runremote* run-id) + (tasks:kill-server-run-id run-id) + (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) + run-id + (tasks:hostinfo-get-interface server-dat) + (tasks:hostinfo-get-port server-dat) + " client:setup (server-dat = #t)") + (if (> remaining-tries 8) + (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little + (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time + (server:try-running run-id) + (thread-sleep! 5) ;; give server a little time to start up + (client:setup run-id remaining-tries: (- remaining-tries 1)) + ))) + (begin ;; no server registered + (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id))) + (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) + (if (< num-available 2) + (server:try-running run-id)) + (thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. + (client:setup run-id remaining-tries: (- remaining-tries 1))))))))) ;; keep this as a function to ease future (define (client:start run-id server-info) (http-transport:client-connect (tasks:hostinfo-get-interface server-info) (tasks:hostinfo-get-port server-info))) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -66,11 +66,11 @@ (define *db-access-mutex* (make-mutex)) ;; SERVER (define *my-client-signature* #f) (define *transport-type* 'http) -(define *rpc:listener* #f) ;; if set up for server communication this will hold the tcp port +(define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg (define *runremote* (make-hash-table)) ;; if set up for server communication this will hold (define *max-cache-size* 0) (define *logged-in-clients* (make-hash-table)) (define *client-non-blocking-mode* #f) (define *server-id* #f) @@ -177,11 +177,12 @@ (2 "RUNNING") (3 "REMOTEHOSTSTART") (4 "LAUNCHED") (5 "KILLED") (6 "KILLREQ") - (7 "STUCK"))) + (7 "STUCK") + (8 "ARCHIVED"))) (define *common:std-statuses* '((0 "PASS") (1 "WARN") (2 "FAIL") @@ -193,11 +194,11 @@ (8 "STUCK/DEAD") (9 "ABORT"))) ;; These are stopping conditions that prevent a test from being run (define *common:cant-run-states-sym* - '(COMPLETED KILLED WAIVED UNKNOWN INCOMPLETE ABORT)) + '(COMPLETED KILLED WAIVED UNKNOWN INCOMPLETE ABORT ARCHIVED)) ;;====================================================================== ;; D E B U G G I N G S T U F F ;;====================================================================== @@ -226,10 +227,11 @@ (let ((run-ids (hash-table-keys *db-local-sync*))) (if (and (not (null? run-ids)) (configf:lookup *configdat* "setup" "megatest-db")) (db:multi-db-sync run-ids 'new2old))) (if *dbstruct-db* (db:close-all *dbstruct-db*)) + (if *inmemdb* (db:close-all *inmemdb*)) (if (and *megatest-db* (sqlite3:database? *megatest-db*)) (begin (sqlite3:interrupt! *megatest-db*) (sqlite3:finalize! *megatest-db* #t) @@ -442,24 +444,10 @@ (define (nice-path dir) (normalize-pathname (if (absolute-pathname? dir) dir (conc (current-directory) "/" dir)))) -(define (get-df path) - (let* ((df-results (cmd-run->list (conc "df " path))) - (space-rx (regexp "([0-9]+)\\s+([0-9]+)%")) - (freespc #f)) - ;; (write df-results) - (for-each (lambda (l) - (let ((match (string-search space-rx l))) - (if match - (let ((newval (string->number (cadr match)))) - (if (number? newval) - (set! freespc newval)))))) - (car df-results)) - freespc)) - (define (get-cpu-load) (car (common:get-cpu-load))) ;; (let* ((load-res (cmd-run->list "uptime")) ;; (load-rx (regexp "load average:\\s+(\\d+)")) ;; (cpu-load #f)) @@ -512,10 +500,67 @@ (let* ((uname-res (cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) (uname #f)) (if (null? (car uname-res)) "unknown" (caar uname-res)))) + +;;====================================================================== +;; D I S K S P A C E +;;====================================================================== + +(define (common:get-disk-space-used fpath) + (with-input-from-pipe (conc "/usr/bin/du -s " fpath) read)) + +(define (get-df path) + (let* ((df-results (cmd-run->list (conc "df " path))) + (space-rx (regexp "([0-9]+)\\s+([0-9]+)%")) + (freespc #f)) + ;; (write df-results) + (for-each (lambda (l) + (let ((match (string-search space-rx l))) + (if match + (let ((newval (string->number (cadr match)))) + (if (number? newval) + (set! freespc newval)))))) + (car df-results)) + freespc)) + +;; paths is list of lists ((name path) ... ) +;; +(define (common:get-disk-with-most-free-space disks minsize) + (let ((best #f) + (bestsize 0)) + (for-each + (lambda (disk-num) + (let* ((dirpath (cadr (assoc disk-num disks))) + (freespc (cond + ((not (directory? dirpath)) + (if (common:low-noise-print 50 "disks not a dir " disk-num) + (debug:print 0 "WARNING: disk " disk-num " at path " dirpath " is not a directory - ignoring it.")) + -1) + ((not (file-write-access? dirpath)) + (if (common:low-noise-print 50 "disks not writeable " disk-num) + (debug:print 0 "WARNING: disk " disk-num " at path " dirpath " is not writeable - ignoring it.")) + -1) + ((not (eq? (string-ref dirpath 0) #\/)) + (if (common:low-noise-print 50 "disks not a proper path " disk-num) + (debug:print 0 "WARNING: disk " disk-num " at path " dirpath " is not a fully qualified path - ignoring it.")) + -1) + (else + (get-df dirpath))))) + (if (> freespc bestsize) + (begin + (set! best (cons disk-num dirpath)) + (set! bestsize freespc))))) + (map car disks)) + (if (and best (> bestsize minsize)) + best + #f))) ;; #f means no disk candidate found + +;;====================================================================== +;; E N V I R O N M E N T V A R S +;;====================================================================== (define (save-environment-as-files fname #!key (ignorevars (list "USER" "HOME" "DISPLAY" "LS_COLORS" "XKEYSYMDB" "EDITOR" "MAKEFLAGS" "MAKEF"))) (let ((envvars (get-environment-variables)) (whitesp (regexp "[^a-zA-Z0-9_\\-:,.\\/%$]"))) (with-output-to-file (conc fname ".csh") Index: configf.scm ================================================================== --- configf.scm +++ configf.scm @@ -465,11 +465,12 @@ (ref-dat (configf:read-file dat-path #f #t)) (ref-assoc (map (lambda (key) (list key (hash-table-ref ref-dat key))) (hash-table-keys ref-dat)))) ;; (hash-table->alist ref-dat))) - (set! data (append data (list (list sheet-name ref-assoc)))))) + ;; (set! data (append data (list (list sheet-name ref-assoc)))))) + (set! data (cons (list sheet-name ref-assoc) data)))) sheets) (list data "NO ERRORS")))))) ;; map over all pairs in a three level hierarchial alist and apply a function to the keys/val ;; @@ -489,6 +490,32 @@ (let* ((valtmp (assoc varname sectiondat)) (val (if valtmp (cadr valtmp) ""))) (proc sheetname sectionname varname val))) (map car sectiondat)))) (map car sheetdat)))) - (map car data))) + (map car data)) + data) + +;;====================================================================== +;; C O N F I G T O / F R O M A L I S T +;;====================================================================== + +(define (configf:config->alist cfgdat) + (hash-table->alist cfgdat)) + +(define (configf:alist->config adat) + (let ((ht (make-hash-table))) + (for-each + (lambda (section) + (hash-table-set! ht (car section)(cdr section))) + adat) + ht)) + +(define (configf:read-alist fname) + (configf:alist->config + (with-input-from-file fname read))) + +(define (configf:write-alist cdat fname) + (with-output-to-file fname + (lambda () + (pp (configf:config->alist cdat))))) + Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -475,11 +475,11 @@ (teststatus (db:test-get-status test)) (teststate (db:test-get-state test)) ;;(teststart (db:test-get-event_time test)) ;;(runtime (db:test-get-run_duration test)) (buttontxt (cond - ((equal? teststate "COMPLETED") teststatus) + ((member teststate '("COMPLETED" "ARCHIVED")) teststatus) ((and (equal? teststate "NOT_STARTED") (member teststatus '("ZERO_ITEMS" "BLOCKED" "PREQ_FAIL" "PREQ_DISCARDED" "TIMED_OUT" "KEEP_TRYING" "TEN_STRIKES"))) teststatus) (else teststate))) @@ -1473,13 +1473,14 @@ (file-modification-time *monitor-db-path*) -1)) (run-update-time (current-seconds)) (recalc (dashboard:recalc modtime *please-update-buttons* *last-db-update-time*))) (if (and (eq? *current-tab-number* 0) - (> monitor-modtime *last-monitor-update-time*)) + (or (> monitor-modtime *last-monitor-update-time*) + (> (- run-update-time *last-monitor-update-time*) 5))) ;; update every 1/2 minute just in case (begin - (set! *last-monitor-update-time* monitor-modtime) + (set! *last-monitor-update-time* run-update-time) ;; monitor-modtime) (if dashboard:update-servers-table (dashboard:update-servers-table)))) (if recalc (begin (case *current-tab-number* ((0) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -234,18 +234,19 @@ (begin (dbr:dbstruct-set-localdb! dbstruct run-id db) ;; (dbr:dbstruct-set-inmem! dbstruct db) ;; direct access ... db) (begin (dbr:dbstruct-set-inmem! dbstruct inmem) - (sqlite3:execute db "DELETE FROM tests WHERE state='DELETED';") ;; they just slow us down in this context + ;; dec 14, 2014 - keep deleted records available. hunch is that they are needed for id placeholders + ;; (sqlite3:execute db "DELETE FROM tests WHERE state='DELETED';") ;; they just slow us down in this context (db:sync-tables db:sync-tests-only db inmem) - (db:delay-if-busy refdb) ;; dbpath: (db:dbdat-get-path refdb)) + (db:delay-if-busy refdb) ;; dbpath: (db:dbdat-get-path refdb)) ;; What does delaying here achieve? (dbr:dbstruct-set-refdb! dbstruct refdb) - (db:sync-tables db:sync-tests-only db refdb) - ;; sync once more to deal with delays - (db:sync-tables db:sync-tests-only db inmem) - (db:sync-tables db:sync-tests-only db refdb) + (db:sync-tables db:sync-tests-only inmem refdb) ;; use inmem as the reference, don't read again from db + ;; sync once more to deal with delays? + ;; (db:sync-tables db:sync-tests-only db inmem) + ;; (db:sync-tables db:sync-tests-only inmem refdb) inmem)))))) ;; This routine creates the db. It is only called if the db is not already ls opened ;; (define (db:open-main dbstruct) ;; (conc *toppath* "/megatest.db") (car *configinfo*))) @@ -296,11 +297,11 @@ (refdb (dbr:dbstruct-get-refdb dbstruct)) (olddb (dbr:dbstruct-get-olddb dbstruct)) ;; (runid (dbr:dbstruct-get-run-id dbstruct)) ) (debug:print-info 4 "Syncing for run-id: " run-id) - (mutex-lock! *http-mutex*) + ;; (mutex-lock! *http-mutex*) (if (eq? run-id 0) ;; runid equal to 0 is main.db (if maindb (if (or (not (number? mtime)) (not (number? stime)) @@ -325,16 +326,16 @@ (> mtime stime) force-sync) (begin (db:delay-if-busy rundb) (db:delay-if-busy olddb) + (dbr:dbstruct-set-stime! dbstruct (current-milliseconds)) (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb))) - (dbr:dbstruct-set-stime! dbstruct (current-milliseconds)) - (mutex-unlock! *http-mutex*) + ;; (mutex-unlock! *http-mutex*) num-synced) (begin - (mutex-unlock! *http-mutex*) + ;; (mutex-unlock! *http-mutex*) 0)))))) (define (db:close-main dbstruct) (let ((maindb (dbr:dbstruct-get-main dbstruct))) (if maindb @@ -493,13 +494,18 @@ (debug:print 0 " status: " ((condition-property-accessor 'sqlite3 'status) exn)) (debug:print 0 " src db: " (db:dbdat-get-path fromdb)) (for-each (lambda (dbdat) (debug:print 0 " dbpath: " (db:dbdat-get-path dbdat))) (cons todb slave-dbs)) - (if *server-run* ;; we are inside a server - (set! *time-to-exit* #t) ;; let watch dog know that it is time to die. - (exit 1))) + (if *server-run* ;; we are inside a server, throw a sync-failed error + (signal (make-composite-condition + (make-property-condition 'sync-failed 'message "db:sync-tables failed in a server context."))))) + + ;; (set! *time-to-exit* #t) ;; let watch dog know that it is time to die. + ;; (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") + ;; (portlogger:open-run-close portlogger:set-port port "released") + ;; (exit 1))) (cond ((not fromdb) (debug:print 3 "WARNING: db:sync-tables called with fromdb missing") -1) ((not todb) (debug:print 3 "WARNING: db:sync-tables called with todb missing") -2) ((not (sqlite3:database? (db:dbdat-get-db fromdb))) (debug:print 0 "ERROR: db:sync-tables called with fromdb not a database " fromdb) -3) @@ -545,10 +551,14 @@ (set! fromdats (cons fromdat fromdats)) (set! fromdat '()) (set! totrecords (+ totrecords 1))))) (db:dbdat-get-db fromdb) full-sel) + + ;; tack on remaining records in fromdat + (if (not (null? fromdat)) + (set! fromdats (cons fromdat fromdats))) (debug:print-info 2 "found " totrecords " records to sync") ;; read the target table (sqlite3:for-each-row @@ -617,10 +627,11 @@ ;; (define (db:multi-db-sync run-ids . options) (let* ((toppath (launch:setup-for-run)) (dbstruct (if toppath (make-dbr:dbstruct path: toppath) #f)) (mtdb (if toppath (db:open-megatest-db))) + (allow-cleanup (if run-ids #f #t)) (run-ids (if run-ids run-ids (if toppath (begin (db:delay-if-busy mtdb) (db:get-all-run-ids mtdb))))) @@ -663,21 +674,47 @@ (db:replace-test-records dbstruct run-id testrecs) (sqlite3:finalize! (db:dbdat-get-db (dbr:dbstruct-get-rundb dbstruct))))) run-ids))) ;; now ensure all newdb data are synced to megatest.db + ;; do not use the run-ids list passed in to the function + ;; (if (member 'new2old options) - (for-each - (lambda (run-id) - (let* ((fromdb (if toppath (make-dbr:dbstruct path: toppath local: #t) #f)) - (frundb (db:dbdat-get-db (db:get-db fromdb run-id)))) - ;; (db:delay-if-busy frundb) - ;; (db:delay-if-busy mtdb) - (if (eq? run-id 0) - (db:sync-tables (db:sync-main-list dbstruct) (db:get-db fromdb #f) mtdb) - (db:sync-tables db:sync-tests-only (db:get-db fromdb run-id) mtdb)))) - (cons 0 run-ids))) + (let* ((maindb (make-dbr:dbstruct path: toppath local: #t)) + (src-run-ids (db:get-all-run-ids (db:dbdat-get-db (db:get-db maindb 0)))) + (all-run-ids (sort (delete-duplicates (cons 0 src-run-ids)) <)) + (count 1) + (total (length all-run-ids)) + (dead-runs '())) + (for-each + (lambda (run-id) + (debug:print 0 "Processing run " (if (eq? run-id 0) " main.db " run-id) ", " count " of " total) + (set! count (+ count 1)) + (let* ((fromdb (if toppath (make-dbr:dbstruct path: toppath local: #t) #f)) + (frundb (db:dbdat-get-db (db:get-db fromdb run-id)))) + ;; (db:delay-if-busy frundb) + ;; (db:delay-if-busy mtdb) + ;; (db:clean-up frundb) + (if (eq? run-id 0) + (begin + (db:sync-tables (db:sync-main-list dbstruct) (db:get-db fromdb #f) mtdb) + (set! dead-runs (db:clean-up-maindb (db:get-db fromdb #f)))) + (begin + ;; NB// must sync first to ensure deleted tests get marked as such in megatest.db + (db:sync-tables db:sync-tests-only (db:get-db fromdb run-id) mtdb) + (db:clean-up-rundb (db:get-db fromdb run-id)) + )))) + all-run-ids) + ;; removed deleted runs + (let ((dbdir (tasks:get-task-db-path))) + (for-each (lambda (run-id) + (let ((fullname (conc dbdir "/" run-id ".db"))) + (if (file-exists? fullname) + (begin + (debug:print 0 "Removing database file for deleted run " fullname) + (delete-file fullname))))) + dead-runs)))) ;; (db:close-all dbstruct) ;; (sqlite3:finalize! mdb) )) ;; keeping it around for debugging purposes only @@ -769,10 +806,48 @@ avg_runtime REAL, avg_disk REAL, tags TEXT DEFAULT '', jobgroup TEXT DEFAULT 'default', CONSTRAINT test_meta_constraint UNIQUE (testname));") + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS tasks_queue (id INTEGER PRIMARY KEY, + action TEXT DEFAULT '', + owner TEXT, + state TEXT DEFAULT 'new', + target TEXT DEFAULT '', + name TEXT DEFAULT '', + testpatt TEXT DEFAULT '', + keylock TEXT, + params TEXT, + creation_time TIMESTAMP DEFAULT (strftime('%s','now')), + execution_time TIMESTAMP);") + ;; archive disk areas, cached info from [archive-disks] + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS archive_disks ( + id INTEGER PRIMARY KEY, + archive_area_name TEXT, + disk_path TEXT, + last_df INTEGER DEFAULT -1, + last_df_time TIMESTAMP DEFAULT (strftime('%s','now')), + creation_time TIMESTAMP DEFAULT (strftime('%','now')));") + ;; individual bup (or tar) data chunks + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS archive_blocks ( + id INTEGER PRIMARY KEY, + archive_disk_id INTEGER, + disk_path TEXT, + last_du INTEGER DEFAULT -1, + last_du_time TIMESTAMP DEFAULT (strftime('%s','now')), + creation_time TIMESTAMP DEFAULT (strftime('%','now')));") + ;; tests allocated to what chunks. reusing a chunk for a test/item_path is very efficient + ;; NB// the per run/test recording of where the archive is stored is done in the test + ;; record. + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS archive_allocations ( + id INTEGER PRIMARY KEY, + archive_block_id INTEGER, + testname TEXT, + item_path TEXT, + creation_time TIMESTAMP DEFAULT (strftime('%','now')));") + ;; move this clean up call somewhere else + (sqlite3:execute db "DELETE FROM tasks_queue WHERE state='done' AND creation_time < ?;" (- (current-seconds)(* 24 60 60))) ;; remove older than 24 hrs (sqlite3:execute db (conc "CREATE INDEX IF NOT EXISTS runs_index ON runs (runname" (if havekeys "," "") keystr ");")) ;; (sqlite3:execute db "CREATE VIEW runs_tests AS SELECT * FROM runs INNER JOIN tests ON runs.id=tests.run_id;") (sqlite3:execute db "CREATE TABLE IF NOT EXISTS extradat (id INTEGER PRIMARY KEY, run_id INTEGER, key TEXT, val TEXT);") (sqlite3:execute db "CREATE TABLE IF NOT EXISTS metadat (id INTEGER PRIMARY KEY, var TEXT, val TEXT, CONSTRAINT metadat_constraint UNIQUE (var));") @@ -809,11 +884,11 @@ run_duration INTEGER DEFAULT 0, comment TEXT DEFAULT '', event_time TIMESTAMP DEFAULT (strftime('%s','now')), fail_count INTEGER DEFAULT 0, pass_count INTEGER DEFAULT 0, - archived INTEGER DEFAULT 0, -- 0=no, 1=in progress, 2=yes + archived INTEGER DEFAULT 0, -- 0=no, > 1=archive block id where test data can be found CONSTRAINT testsconstraint UNIQUE (run_id, testname, item_path));") (sqlite3:execute db "CREATE INDEX IF NOT EXISTS tests_index ON tests (run_id, testname, item_path);") (sqlite3:execute db "CREATE TABLE IF NOT EXISTS test_steps (id INTEGER PRIMARY KEY, test_id INTEGER, @@ -852,12 +927,130 @@ test_id INTEGER, update_time TIMESTAMP, cpuload INTEGER DEFAULT -1, diskfree INTEGER DEFAULT -1, diskusage INTGER DEFAULT -1, - run_duration INTEGER DEFAULT 0);"))) + run_duration INTEGER DEFAULT 0);") + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS archives ( + id INTEGER PRIMARY KEY, + test_id INTEGER, + state TEXT DEFAULT 'new', + status TEXT DEFAULT 'n/a', + archive_type TEXT DEFAULT 'bup', + du INTEGER, + archive_path TEXT);"))) db) + +;;====================================================================== +;; A R C H I V E S +;;====================================================================== + +;; dneeded is minimum space needed, scan for existing archives that +;; are on disks with adequate space and already have this test/itempath +;; archived +;; +(define (db:archive-get-allocations dbstruct testname itempath dneeded) + (let* ((dbdat (db:get-db dbstruct #f)) ;; archive tables are in main.db + (db (db:dbdat-get-db dbdat)) + (res '()) + (blocks '())) ;; a block is an archive chunck that can be added too if there is space + (sqlite3:for-each-row + (lambda (id archive-disk-id disk-path last-du last-du-time) + (set! res (cons (vector id archive-disk-id disk-path last-du last-du-time) res))) + db + "SELECT b.id,b.archive_disk_id,b.disk_path,b.last_du,b.last_du_time FROM archive_blocks AS b + INNER JOIN archive_allocations AS a ON a.archive_block_id=b.id + WHERE a.testname=? AND a.item_path=?;" + testname itempath) + ;; Now res has list of candidate paths, look in archive_disks for candidate with potential free space + (if (null? res) + '() + (sqlite3:for-each-row + (lambda (id archive-area-name disk-path last-df last-df-time) + (set! blocks (cons (vector id archive-area-name disk-path last-df last-df-time) blocks))) + db + (conc + "SELECT d.id,d.archive_area_name,disk_path,last_df,last_df_time FROM archive_disks AS d + INNER JOIN archive_blocks AS b ON d.id=b.archive_disk_id + WHERE b.id IN (" (string-intersperse (map conc res) ",") ") AND + last_df > ?;") + dneeded)) + blocks)) + +;; returns id of the record, register a disk allocated to archiving and record it's last known +;; available space +;; +(define (db:archive-register-disk dbstruct bdisk-name bdisk-path df) + (let* ((dbdat (db:get-db dbstruct #f)) ;; archive tables are in main.db + (db (db:dbdat-get-db dbdat)) + (res #f)) + (sqlite3:for-each-row + (lambda (id) + (set! res id)) + db + "SELECT id FROM archive_disks WHERE archive_area_name=? AND disk_path=?;" + bdisk-name bdisk-path) + (if res ;; record exists, update df and return id + (begin + (sqlite3:execute db "UPDATE archive_disks SET last_df=?,last_df_time=(strftime('%s','now')) + WHERE archive_area_name=? AND disk_path=?;" + df bdisk-name bdisk-path) + res) + (begin + (sqlite3:execute + db + "INSERT OR REPLACE INTO archive_disks (archive_area_name,disk_path,last_df) + VALUES (?,?,?);" + bdisk-name bdisk-path df) + (db:archive-register-disk dbstruct bdisk-name bdisk-path df))))) + +;; record an archive path created on a given archive disk (identified by it's bdisk-id) +;; if path starts with / then it is full, otherwise it is relative to the archive disk +;; preference is to store the relative path. +;; +(define (db:archive-register-block-name dbstruct bdisk-id archive-path #!key (du #f)) + (let* ((dbdat (db:get-db dbstruct #f)) ;; archive tables are in main.db + (db (db:dbdat-get-db dbdat)) + (res #f)) + ;; first look to see if this path is already registered + (sqlite3:for-each-row + (lambda (id) + (set! res id)) + db + "SELECT id FROM archive_blocks WHERE archive_disk_id=? AND disk_path=?;" + bdisk-id archive-path) + (if res ;; record exists, update du if applicable and return res + (begin + (if du (sqlite3:exectute db "UPDATE archive_blocks SET last_du=?,last_du_time=(strftime('%s','now')) + WHERE archive_disk_id=? AND disk_path=?;" + bdisk-id archive-path du)) + res) + (begin + (sqlite3:execute db "INSERT OR REPLACE INTO archive_blocks (archive_disk_id,disk_path,last_du) + VALUES (?,?,?);" + bdisk-id archive-path (or du 0)) + (db:archive-register-block-name dbstruct bdisk-id archive-path du: du))))) + + +;; The "archived" field in tests is overloaded; 0 = not archived, > 0 archived in block with given id +;; +(define (db:test-set-archive-block-id dbstruct run-id test-id archive-block-id) + (db:with-db + dbstruct + run-id + #f + (lambda (db) + (sqlite3:execute db "UPDATE tests SET archived=? WHERE id=?;" + archive-block-id test-id)))) + + +;; (define (db:archive-allocate-testsuite/area-to-block block-id testsuite-name areakey) +;; (let* ((dbdat (db:get-db dbstruct #f)) ;; archive tables are in main.db +;; (db (db:dbdat-get-db dbdat)) +;; (res '()) +;; (blocks '())) ;; a block is an archive chunck that can be added too if there is space +;; (sqlite3:for-each-row #f) ;;====================================================================== ;; L O G G I N G D B ;;====================================================================== @@ -895,10 +1088,64 @@ ;;====================================================================== ;;====================================================================== ;; M A I N T E N A N C E ;;====================================================================== + +(define (db:have-incompletes? dbstruct run-id ovr-deadtime) + (let* ((dbdat (db:get-db dbstruct run-id)) + (db (db:dbdat-get-db dbdat)) + (incompleted '()) + (oldlaunched '()) + (toplevels '()) + (deadtime-str (configf:lookup *configdat* "setup" "deadtime")) + (deadtime (if (and deadtime-str + (string->number deadtime-str)) + (string->number deadtime-str) + 7200))) ;; two hours + (if (number? ovr-deadtime)(set! deadtime ovr-deadtime)) + + ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes + ;; + ;; HOWEVER: this code in run:test seems to work fine + ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) + ;; (db:test-get-run_duration testdat))) + ;; 600) + (db:delay-if-busy dbdat) + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (begin + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (debug:print-info 0 "Found old toplevel test in RUNNING state, test-id=" test-id)) + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))) + db + "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');" + run-id deadtime) + + ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config + ;; + (db:delay-if-busy dbdat) + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))) + db + "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');" + run-id) + + (debug:print-info 18 "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.") + (if (and (null? incompleted) + (null? oldlaunched) + (null? toplevels)) + #f + #t))) ;; select end_time-now from ;; (select testname,item_path,event_time+run_duration as ;; end_time,strftime('%s','now') as now from tests where state in ;; ('RUNNING','REMOTEHOSTSTART','LAUNCED')); @@ -993,11 +1240,11 @@ ;; 2. Look at run records ;; a. If have tests that are not deleted, set state='unknown' ;; b. .... ;; (define (db:clean-up dbdat) - (debug:print 0 "WARNING: db clean up not fully ported to v1.60, cleanup action will be on megatest.db") + ;; (debug:print 0 "WARNING: db clean up not fully ported to v1.60, cleanup action will be on megatest.db") (let* ((db (db:dbdat-get-db dbdat)) (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM tests)+(SELECT count(id) FROM runs);")) (statements (map (lambda (stmt) (sqlite3:prepare db stmt)) @@ -1027,10 +1274,99 @@ (map sqlite3:finalize! statements) (sqlite3:finalize! count-stmt) ;; (db:find-and-mark-incomplete db) (db:delay-if-busy dbdat) (sqlite3:execute db "VACUUM;"))) + +;; Clean out old junk and vacuum the database +;; +;; Ultimately do something like this: +;; +;; 1. Look at test records either deleted or part of deleted run: +;; a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown' +;; b. If test dir gone, delete the test record +;; 2. Look at run records +;; a. If have tests that are not deleted, set state='unknown' +;; b. .... +;; +(define (db:clean-up-rundb dbdat) + ;; (debug:print 0 "WARNING: db clean up not fully ported to v1.60, cleanup action will be on megatest.db") + (let* ((db (db:dbdat-get-db dbdat)) + (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM tests);")) + (statements + (map (lambda (stmt) + (sqlite3:prepare db stmt)) + (list + ;; delete all tests that belong to runs that are 'deleted' + ;; (conc "DELETE FROM tests WHERE run_id NOT IN (" (string-intersperse (map conc valid-runs) ",") ");") + ;; delete all tests that are 'DELETED' + "DELETE FROM tests WHERE state='DELETED';" + )))) + (db:delay-if-busy dbdat) + (sqlite3:with-transaction + db + (lambda () + (sqlite3:for-each-row (lambda (tot) + (debug:print-info 0 "Records count before clean: " tot)) + count-stmt) + (map sqlite3:execute statements) + (sqlite3:for-each-row (lambda (tot) + (debug:print-info 0 "Records count after clean: " tot)) + count-stmt))) + (map sqlite3:finalize! statements) + (sqlite3:finalize! count-stmt) + ;; (db:find-and-mark-incomplete db) + (db:delay-if-busy dbdat) + (sqlite3:execute db "VACUUM;"))) + +;; Clean out old junk and vacuum the database +;; +;; Ultimately do something like this: +;; +;; 1. Look at test records either deleted or part of deleted run: +;; a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown' +;; b. If test dir gone, delete the test record +;; 2. Look at run records +;; a. If have tests that are not deleted, set state='unknown' +;; b. .... +;; +(define (db:clean-up-maindb dbdat) + ;; (debug:print 0 "WARNING: db clean up not fully ported to v1.60, cleanup action will be on megatest.db") + (let* ((db (db:dbdat-get-db dbdat)) + (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM runs);")) + (statements + (map (lambda (stmt) + (sqlite3:prepare db stmt)) + (list + ;; delete all tests that belong to runs that are 'deleted' + ;; (conc "DELETE FROM tests WHERE run_id NOT IN (" (string-intersperse (map conc valid-runs) ",") ");") + ;; delete all tests that are 'DELETED' + "DELETE FROM runs WHERE state='deleted';" + ))) + (dead-runs '())) + (sqlite3:for-each-row + (lambda (run-id) + (set! dead-runs (cons run-id dead-runs))) + db + "SELECT id FROM runs WHERE state='deleted';") + (db:delay-if-busy dbdat) + (sqlite3:with-transaction + db + (lambda () + (sqlite3:for-each-row (lambda (tot) + (debug:print-info 0 "Records count before clean: " tot)) + count-stmt) + (map sqlite3:execute statements) + (sqlite3:for-each-row (lambda (tot) + (debug:print-info 0 "Records count after clean: " tot)) + count-stmt))) + (map sqlite3:finalize! statements) + (sqlite3:finalize! count-stmt) + ;; (db:find-and-mark-incomplete db) + (db:delay-if-busy dbdat) + (sqlite3:execute db "VACUUM;") + dead-runs)) ;;====================================================================== ;; M E T A G E T A N D S E T V A R S ;;====================================================================== @@ -2296,26 +2632,26 @@ ;;====================================================================== ;; QUEUE UP META, TEST STATUS AND STEPS REMOTE ACCESS ;;====================================================================== ;; NOTE: Can remove the regex and base64 encoding for zmq -(define (db:obj->string obj) - (case *transport-type* +(define (db:obj->string obj #!key (transport 'http)) + (case transport ;; ((fs) obj) ((http fs) (string-substitute (regexp "=") "_" (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda ()(serialize obj))))) #t)) - ((zmq)(with-output-to-string (lambda ()(serialize obj)))) + ((zmq nmsg)(with-output-to-string (lambda ()(serialize obj)))) (else obj))) -(define (db:string->obj msg) - (case *transport-type* +(define (db:string->obj msg #!key (transport 'http)) + (case transport ;; ((fs) msg) ((http fs) (if (string? msg) (with-input-from-string (z3:decode-buffer @@ -2323,12 +2659,12 @@ (string-substitute (regexp "_") "=" msg #t))) (lambda ()(deserialize))) (begin (debug:print 0 "ERROR: reception failed. Received " msg " but cannot translate it.") - #f))) ;; crude reply for when things go awry - ((zmq)(with-input-from-string msg (lambda ()(deserialize)))) + msg))) ;; crude reply for when things go awry + ((zmq nmsg)(with-input-from-string msg (lambda ()(deserialize)))) (else msg))) (define (db:test-set-status-state dbstruct run-id test-id status state msg) (let ((dbdat (db:get-db dbstruct run-id))) (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) @@ -2348,18 +2684,10 @@ (db:general-call dbdat 'top-test-set (list "LAUNCHED" test-name)) (db:general-call dbdat 'top-test-set-per-pf-counts (list test-name run-id test-name test-name test-name)))) #f) #f)) -(define (db:tests-register-test dbstruct run-id test-name item-path) - (db:with-db - dbstruct - run-id - #t - (lambda (db) - (sqlite3:execute db 'register-test run-id test-name item-path)))) - (define (db:test-get-logfile-info dbstruct run-id test-name) (db:with-db dbstruct run-id #f Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -136,21 +136,26 @@ (get-tests-sig (conc (client:get-signature) " get-tests")) (get-details-sig (conc (client:get-signature) " get-test-details")) ;; test-ids to get and display are indexed on window-id in curr-test-ids hash (test-ids (hash-table-values (dboard:data-get-curr-test-ids *data*))) - - (run-changes (synchash:client-get 'db:get-runs get-runs-sig (length keypatts) data runname #f #f keypatts)) + ;; run-id is #f in next line to send the query to server 0 + (run-changes (synchash:client-get 'db:get-runs get-runs-sig (length keypatts) data #f runname #f #f keypatts)) (tests-detail-changes (if (not (null? test-ids)) (synchash:client-get 'db:get-test-info-by-ids get-details-sig 0 data #f test-ids) '())) ;; Now can calculate the run-ids (run-hash (hash-table-ref/default data get-runs-sig #f)) (run-ids (if run-hash (filter number? (hash-table-keys run-hash)) '())) - (test-changes (synchash:client-get 'db:get-tests-for-runs-mindata get-tests-sig 0 data run-ids testpatt states statuses #f)) + (all-test-changes (let ((res (make-hash-table))) + (for-each (lambda (run-id) + (if (> run-id 0) + (hash-table-set! res run-id (synchash:client-get 'db:get-tests-for-run-mindata get-tests-sig 0 data run-id 1 testpatt states statuses #f)))) + run-ids) + res)) (runs-hash (hash-table-ref/default data get-runs-sig #f)) (header (hash-table-ref/default runs-hash "header" #f)) (run-ids (sort (filter number? (hash-table-keys runs-hash)) (lambda (a b) (let* ((record-a (hash-table-ref runs-hash a)) @@ -192,10 +197,11 @@ ;; Scan all tests to be displayed and organise all the test names, respecting what is in the hash table ;; Do this analysis in the order of the run-ids, the most recent run wins (for-each (lambda (run-id) (let* ((run-path (hash-table-ref (dboard:data-get-run-keys *data*) run-id)) + (test-changes (hash-table-ref all-test-changes run-id)) (new-test-dat (car test-changes)) (removed-tests (cadr test-changes)) (tests (sort (map cadr (filter (lambda (testrec) (eq? run-id (db:mintest-get-run_id (cadr testrec)))) new-test-dat)) @@ -253,11 +259,11 @@ )) ;; set the cell text and color ;; (debug:print 2 "rownum:colnum=" rownum ":" colnum ", state=" status) (iup:attribute-set! (dboard:data-get-runs-matrix *data*) (conc rownum ":" colnum) - (if (string=? state "COMPLETED") + (if (member state '("ARCHIVED" "COMPLETED")) status state)) (iup:attribute-set! (dboard:data-get-runs-matrix *data*) (conc "BGCOLOR" rownum ":" colnum) (car (gutils:get-color-for-state-status state status))) @@ -269,11 +275,11 @@ (if updater (updater (hash-table-ref/default data get-details-sig #f)))) (iup:attribute-set! (dboard:data-get-runs-matrix *data*) "REDRAW" "ALL") ;; (debug:print 2 "run-changes: " run-changes) ;; (debug:print 2 "test-changes: " test-changes) - (list run-changes test-changes))) + (list run-changes all-test-changes))) ;;====================================================================== ;; TESTS DATA ;;====================================================================== Index: docs/manual/megatest_manual.html ================================================================== --- docs/manual/megatest_manual.html +++ docs/manual/megatest_manual.html @@ -1,1284 +1,1361 @@ - - - - - -The Megatest Users Manual - - - - - -
-
-

Preface

-
-

This book is organised as three sub-books; getting started, writing tests and reference.

-
-

Why Megatest?

-

The Megatest project was started for two reasons, the first was an -immediate and pressing need for a generalized tool to manage a suite -of regression tests and the second was the fact that the author had -written or maintained several such tools at different companies over -the years and it seemed a good thing to have a single open source -tool, flexible enough to meet the needs of any team doing continuous -integrating and or running a complex suite of tests for release -qualification.

-
-
-

Megatest Design Philosophy

-

Megatest is intended to provide the minimum needed resources to make -writing a suite of tests and tasks for implementing continuous build -for software, design engineering or process control (via owlfs for -example) without being specialized for any specific problem -space. Megatest in of itself does not know what constitutes a PASS or -FAIL of a test. In most cases megatest is best used in conjunction -with logpro or a similar tool to parse, analyze and decide on the test -outcome.

-
-
-

Megatest Architecture

-

All data to specify the tests and configure the system is stored in -plain text files. All system state is stored in an sqlite3 -database. Tests are launched using the launching system available for -the distributed compute platform in use. A template script is provided -which can launch jobs on local and remote Linux hosts. Currently -megatest uses the network filesystem to call home to your master -sqlite3 database.

-
-
-
-

Road Map

-

Note 1: This road-map is tentative and subject to change without notice.

-

Note 2: Starting over. Old plan is commented out.

-
-

Current Items

-
-
-

ww05 - migrate to inmem-db

-

Keep as much the same as possible. Add internal reference to almost -eliminate contention on db(s).

-
    -
  1. -

    -Add internal reference db -

    -
  2. -
  3. -

    -Verify that actions are accessing correct db -

    -
      -
    1. -

      --runtests - inmem -

      -
    2. -
    3. -

      --list-runs - local (but not megatest.db) -

      -
    4. -
    5. -

      -dashboard - local (but not megatest.db) -

      -
    6. -
    -
  4. -
  5. -

    -Mirror db to /var/tmp… -

    -
  6. -
  7. -

    -Dashboard read db from per-run db. -

    -
  8. -
  9. -

    -Dashboard read db from /var/tmp -

    -
  10. -
  11. -

    -Runs register in tasks table in monitor.db -

    -
  12. -
  13. -

    -Server polls tasks table for next action (in addition?) -

    -
  14. -
  15. -

    -Change run loop to execute in server, triggered by call to polling of tasks table -

    -
  16. -
-
-
-
-

Getting Started

-
-
Getting started with Megatest
-
-

How to install Megatest and set it up for running your regressions and continuous integration process.

-
-
-

Installation

-
-
-

Dependencies

-

Chicken scheme and a number of "eggs" are required for building -Megatest. See the script installall.sch in the utils directory of the -distribution for a mostly automated way to install everything needed -for building Megatest on Linux.

-


[An example footnote.]

-

And now for something completely different: monkeys, lions and -tigers (Bengal and Siberian) using the alternative syntax index -entries. - - - -Note that multi-entry terms generate separate index entries.

-

Here are a couple of image examples: an -images/smallnew.png - -example inline image followed by an example block image:

-
-
-Tiger image -
-
Figure 1. Tiger block image
-
-

Followed by an example table:

-
- - --- - - - - - - - - - - - - - - - -
Table 1. An example table
Option Description

-a USER GROUP

Add USER to GROUP.

-R GROUP

Disables access to GROUP.

-
-
-
Example 1. An example example
-
-

Lorum ipum…

-
-
-
-

Sub-section with Anchor

-

Sub-section at level 2.

-
-

Chapter Sub-section

-

Sub-section at level 3.

-
-
Chapter Sub-section
-

Sub-section at level 4.

-

This is the maximum sub-section depth supported by the distributed -AsciiDoc configuration. -
[A second example footnote.]

-
-
-
-
-
-
-

The Second Chapter

-
-

An example link to anchor at start of the first sub-section.

-

An example link to a bibliography entry [taoup].

-
-
-

Writing Tests

-
-

The First Chapter of the Second Part

-
-

Chapters grouped into book parts are at level 1 and can contain -sub-sections.

-
-
-

How To Do Things

-
-

Tricks

-
-

This section is a compendium of a various useful tricks for debugging, -configuring and generally getting the most out of Megatest.

-
-
-
-

Limiting your running jobs

-
-

The following example will limit a test in the jobgroup "group1" to no more than 10 tests simultaneously.

-

In your testconfig:

-
-
-
[test_meta]
-jobgroup group1
-
-

In your megatest.config:

-
-
-
[jobgroups]
-group1 10
-custdes 4
-
-
-
-
-

Debugging Tricks

-
-
-

Examining The Environment

-
-

During Config File Processing

-
-
-

Organising Your Tests and Tasks

-
-
-
[tests-paths]
-1 #{get misc parent}/simplerun/tests
-
-
-
-
[setup]
-
-

The runscript method is a brute force way to run scripts where the -user is responsible for setting STATE and STATUS

-
-
-
runscript main.csh
-
-
-
-
-

Debugging Server Problems

-
-
-
sudo lsof -i
-sudo netstat -lptu
-sudo netstat -tulpn
-
-
-
-
-

Reference

-
-

The First Chapter of the Second Part

-
-

Chapters grouped into book parts are at level 1 and can contain -sub-sections.

-
-
-
-

The testconfig File

-
-
-

Setup section

-
-

Header

-
-
-
[setup]
-
-

The runscript method is a brute force way to run scripts where the -user is responsible for setting STATE and STATUS

-
-
-
runscript main.csh
-
-
-
-
-

Requirements section

-
-

Header

-
-
-
[requirements]
-
-
-
-

Wait on Other Tests

-
-
-
# A normal waiton waits for the prior tests to be COMPLETED
-# and PASS, CHECK or WAIVED
-waiton test1 test2
-
-
-
-

Mode

-

The default (i.e. if mode is not specified) is normal. All pre-dependent tests -must be COMPLETED and PASS, CHECK or WAIVED before the test will start

-
-
-
mode   normal
-
-

The toplevel mode requires only that the prior tests are COMPLETED.

-
-
-
mode toplevel
-
-

A item based waiton will start items in a test when the -same-named item is COMPLETED and PASS, CHECK or WAIVED -in the prior test

-
-
-
mode itemmatch
-
-
-
-
# With a toplevel test you may wish to generate your list
-# of tests to run dynamically
-#
-# waiton #{shell get-valid-tests-to-run.sh}
-
-
-
-

Run time limit

-
-
-
runtimelim 1h 2m 3s  # this will automatically kill the test if it runs for more than 1h 2m and 3s
-
-
-
-

Skip

-
-
-

Header

-
-
-
[skip]
-
-
-
-

Skip on Still-running Tests

-
-
-
# NB// If the prevrunning line exists with *any* value the test will
-# automatically SKIP if the same-named test is currently RUNNING
-
-prevrunning x
-
-
-
-

Skip if a File Exists

-
-
-
fileexists /path/to/a/file # skip if /path/to/a/file exists
-
-
-
-

Controlled waiver propagation

-

If test is FAIL and previous test in run with same MT_TARGET is WAIVED then apply the following rules from the testconfig: -If a waiver check is specified in the testconfig apply the check and if it passes then set this FAIL to WAIVED

-

Waiver check has two parts, 1) a list of waiver, rulename, filepatterns and 2) the rulename script spec (note that "diff" and "logpro" are predefined)

-
-
-
###### EXAMPLE FROM testconfig #########
-# matching file(s) will be diff'd with previous run and logpro applied
-# if PASS or WARN result from logpro then WAIVER state is set
-#
-[waivers]
-# logpro_file    rulename      input_glob
-waiver_1         logpro        lookittmp.log
-
-[waiver_rules]
-
-# This builtin rule is the default if there is no <waivername>.logpro file
-# diff   diff %file1% %file2%
-
-# This builtin rule is applied if a <waivername>.logpro file exists
-# logpro diff %file1% %file2% | logpro %waivername%.logpro %waivername%.html
-
-
-
-
-

Ezsteps

-

To transfer the environment to the next step you can do the following:

-
-
-
$MT_MEGATEST -env2file .ezsteps/${stepname}
-
-
-
-

Triggers

-

In your testconfig triggers can be specified

-
-
-
[triggers]
-
-# Call script running.sh when test goes to state=RUNNING, status=PASS
-RUNNING/PASS running.sh
-
-# Call script running.sh any time state goes to RUNNING
-RUNNING/ running.sh
-
-# Call script onpass.sh any time status goes to PASS
-PASS/ onpass.sh
-
-

Scripts called will have; test-id test-rundir trigger, added to the commandline.

-

HINT

-

To start an xterm (useful for debugging), use a command line like the following:

-
-
-
[triggers]
-COMPLETED/ xterm -e bash -s --
-
-
- - - -
-
Note
-
There is a trailing space after the --
-
-
-
-

Megatest Internals

-
-
-server.png -
-
-
-
-
-
-

Appendix A: Example Appendix

-
-

One or more optional appendixes go here at section level zero.

-
-

Appendix Sub-section

-
- - - -
-
Note
-
Preface and appendix subsections start out of sequence at level -2 (level 1 is skipped). This only applies to multi-part book -documents.
-
-
-
-
-
-

Example Bibliography

-
-

The bibliography list is a style of AsciiDoc bulleted list.

-
    -
  • -

    -[taoup] Eric Steven Raymond. The Art of Unix - Programming. Addison-Wesley. ISBN 0-13-142901-9. -

    -
  • -
  • -

    -[walsh-muellner] Norman Walsh & Leonard Muellner. - DocBook - The Definitive Guide. O’Reilly & Associates. 1999. - ISBN 1-56592-580-7. -

    -
  • -
-
-
-
-

Example Glossary

-
-

Glossaries are optional. Glossaries entries are an example of a style -of AsciiDoc labeled lists.

-
-
-A glossary term -
-
-

- The corresponding (indented) definition. -

-
-
-A second glossary term -
-
-

- The corresponding (indented) definition. -

-
-
-
-
-
-

Example Colophon

-
-

Text at the end of a book describing facts about its production.

-
-
-
-

Example Index

-
-
-
-
-

- - - + + + + + +The Megatest Users Manual + + + + + +
+
+

Preface

+
+

This book is organised as three sub-books; getting started, writing tests and reference.

+
+

Why Megatest?

+

The Megatest project was started for two reasons, the first was an +immediate and pressing need for a generalized tool to manage a suite +of regression tests and the second was the fact that the author had +written or maintained several such tools at different companies over +the years and it seemed a good thing to have a single open source +tool, flexible enough to meet the needs of any team doing continuous +integrating and or running a complex suite of tests for release +qualification.

+
+
+

Megatest Design Philosophy

+

Megatest is intended to provide the minimum needed resources to make +writing a suite of tests and tasks for implementing continuous build +for software, design engineering or process control (via owlfs for +example) without being specialized for any specific problem +space. Megatest in of itself does not know what constitutes a PASS or +FAIL of a test. In most cases megatest is best used in conjunction +with logpro or a similar tool to parse, analyze and decide on the test +outcome.

+
+
+

Megatest Architecture

+

All data to specify the tests and configure the system is stored in +plain text files. All system state is stored in an sqlite3 +database. Tests are launched using the launching system available for +the distributed compute platform in use. A template script is provided +which can launch jobs on local and remote Linux hosts. Currently +megatest uses the network filesystem to call home to your master +sqlite3 database.

+
+
+
+

Road Map

+

Note 1: This road-map is tentative and subject to change without notice.

+

Note 2: Starting over. Old plan is commented out.

+
+

Current Items

+
+
+

ww05 - migrate to inmem-db

+

Keep as much the same as possible. Add internal reference to almost +eliminate contention on db(s).

+
    +
  1. +

    +Add internal reference db +

    +
  2. +
  3. +

    +Verify that actions are accessing correct db +

    +
      +
    1. +

      +-runtests - inmem +

      +
    2. +
    3. +

      +-list-runs - local (but not megatest.db) +

      +
    4. +
    5. +

      +dashboard - local (but not megatest.db) +

      +
    6. +
    +
  4. +
  5. +

    +Mirror db to /var/tmp… +

    +
  6. +
  7. +

    +Dashboard read db from per-run db. +

    +
  8. +
  9. +

    +Dashboard read db from /var/tmp +

    +
  10. +
  11. +

    +Runs register in tasks table in monitor.db +

    +
  12. +
  13. +

    +Server polls tasks table for next action (in addition?) +

    +
  14. +
  15. +

    +Change run loop to execute in server, triggered by call to polling of tasks table +

    +
  16. +
+
+
+
+

Getting Started

+
+
Getting started with Megatest
+
+

How to install Megatest and set it up for running your regressions and continuous integration process.

+
+
+

Installation

+
+
+

Dependencies

+

Chicken scheme and a number of "eggs" are required for building +Megatest. See the script installall.sch in the utils directory of the +distribution for a mostly automated way to install everything needed +for building Megatest on Linux.

+


[An example footnote.]

+

And now for something completely different: monkeys, lions and +tigers (Bengal and Siberian) using the alternative syntax index +entries. + + + +Note that multi-entry terms generate separate index entries.

+

Here are a couple of image examples: an +images/smallnew.png + +example inline image followed by an example block image:

+
+
+Tiger image +
+
Figure 1. Tiger block image
+
+

Followed by an example table:

+
+ + +++ + + + + + + + + + + + + + + + +
Table 1. An example table
Option Description

-a USER GROUP

Add USER to GROUP.

-R GROUP

Disables access to GROUP.

+
+
+
Example 1. An example example
+
+

Lorum ipum…

+
+
+
+

Sub-section with Anchor

+

Sub-section at level 2.

+
+

Chapter Sub-section

+

Sub-section at level 3.

+
+
Chapter Sub-section
+

Sub-section at level 4.

+

This is the maximum sub-section depth supported by the distributed +AsciiDoc configuration. +
[A second example footnote.]

+
+
+
+
+
+
+

The Second Chapter

+
+

An example link to anchor at start of the first sub-section.

+

An example link to a bibliography entry [taoup].

+
+
+

Writing Tests

+
+

The First Chapter of the Second Part

+
+

Chapters grouped into book parts are at level 1 and can contain +sub-sections.

+
+
+

How To Do Things

+
+

Tricks

+
+

This section is a compendium of a various useful tricks for debugging, +configuring and generally getting the most out of Megatest.

+
+
+
+

Limiting your running jobs

+
+

The following example will limit a test in the jobgroup "group1" to no more than 10 tests simultaneously.

+

In your testconfig:

+
+
+
[test_meta]
+jobgroup group1
+
+

In your megatest.config:

+
+
+
[jobgroups]
+group1 10
+custdes 4
+
+
+
+
+

Debugging Tricks

+
+
+

Examining The Environment

+
+

During Config File Processing

+
+
+

Organising Your Tests and Tasks

+
+
+
[tests-paths]
+1 #{get misc parent}/simplerun/tests
+
+
+
+
[setup]
+
+

The runscript method is a brute force way to run scripts where the +user is responsible for setting STATE and STATUS

+
+
+
runscript main.csh
+
+
+
+
+

Debugging Server Problems

+
+
+
sudo lsof -i
+sudo netstat -lptu
+sudo netstat -tulpn
+
+
+
+
+

Reference

+
+

The testconfig File

+
+
+

Setup section

+
+

Header

+
+
+
[setup]
+
+

The runscript method is a brute force way to run scripts where the +user is responsible for setting STATE and STATUS

+
+
+
runscript main.csh
+
+
+
+
+

Requirements section

+
+

Header

+
+
+
[requirements]
+
+
+
+

Wait on Other Tests

+
+
+
# A normal waiton waits for the prior tests to be COMPLETED
+# and PASS, CHECK or WAIVED
+waiton test1 test2
+
+
+
+

Mode

+

The default (i.e. if mode is not specified) is normal. All pre-dependent tests +must be COMPLETED and PASS, CHECK or WAIVED before the test will start

+
+
+
mode   normal
+
+

The toplevel mode requires only that the prior tests are COMPLETED.

+
+
+
mode toplevel
+
+

A item based waiton will start items in a test when the +same-named item is COMPLETED and PASS, CHECK or WAIVED +in the prior test

+
+
+
mode itemmatch
+
+
+
+
# With a toplevel test you may wish to generate your list
+# of tests to run dynamically
+#
+# waiton #{shell get-valid-tests-to-run.sh}
+
+
+
+

Run time limit

+
+
+
runtimelim 1h 2m 3s  # this will automatically kill the test if it runs for more than 1h 2m and 3s
+
+
+
+

Skip

+
+
+

Header

+
+
+
[skip]
+
+
+
+

Skip on Still-running Tests

+
+
+
# NB// If the prevrunning line exists with *any* value the test will
+# automatically SKIP if the same-named test is currently RUNNING
+
+prevrunning x
+
+
+
+

Skip if a File Exists

+
+
+
fileexists /path/to/a/file # skip if /path/to/a/file exists
+
+
+
+

Controlled waiver propagation

+

If test is FAIL and previous test in run with same MT_TARGET is WAIVED then apply the following rules from the testconfig: +If a waiver check is specified in the testconfig apply the check and if it passes then set this FAIL to WAIVED

+

Waiver check has two parts, 1) a list of waiver, rulename, filepatterns and 2) the rulename script spec (note that "diff" and "logpro" are predefined)

+
+
+
###### EXAMPLE FROM testconfig #########
+# matching file(s) will be diff'd with previous run and logpro applied
+# if PASS or WARN result from logpro then WAIVER state is set
+#
+[waivers]
+# logpro_file    rulename      input_glob
+waiver_1         logpro        lookittmp.log
+
+[waiver_rules]
+
+# This builtin rule is the default if there is no <waivername>.logpro file
+# diff   diff %file1% %file2%
+
+# This builtin rule is applied if a <waivername>.logpro file exists
+# logpro diff %file1% %file2% | logpro %waivername%.logpro %waivername%.html
+
+
+
+
+

Ezsteps

+

To transfer the environment to the next step you can do the following:

+
+
+
$MT_MEGATEST -env2file .ezsteps/${stepname}
+
+
+
+

Triggers

+

In your testconfig triggers can be specified

+
+
+
[triggers]
+
+# Call script running.sh when test goes to state=RUNNING, status=PASS
+RUNNING/PASS running.sh
+
+# Call script running.sh any time state goes to RUNNING
+RUNNING/ running.sh
+
+# Call script onpass.sh any time status goes to PASS
+PASS/ onpass.sh
+
+

Scripts called will have; test-id test-rundir trigger, added to the commandline.

+

HINT

+

To start an xterm (useful for debugging), use a command line like the following:

+
+
+
[triggers]
+COMPLETED/ xterm -e bash -s --
+
+
+ + + +
+
Note
+
There is a trailing space after the --
+
+
+
+
+
+

Programming API

+
+

These routines can be called from the megatest repl.

+
+ + +++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Table 2. API Server Management Calls
API Call Purpose comments Returns Comments

(rmt:login run-id)

Verify the the version, testsuite area etc. are correct.

#( #t "successful login" )

(rmt:start-server run-id)

#( success/fail n/a )

(rmt:kill-server run-id)

#( success/fail n/a )

Works only if the server is still reachable

+
+
+ + +++++ + + + + + + + + + + + + + + + + + + + + + + + +
Table 3. API Keys Related Calls
API Call Purpose comments Returns Comments

(rmt:get-key-val-pairs run-id)

#t=success/#f=fail

Works only if the server is still reachable

(rmt:get-keys run-id)

( key1 key2 … )

+
+
+

Megatest Internals

+
+
+server.png +
+
+
+
+
+
+

Appendix A: Example Appendix

+
+

One or more optional appendixes go here at section level zero.

+
+

Appendix Sub-section

+
+ + + +
+
Note
+
Preface and appendix subsections start out of sequence at level +2 (level 1 is skipped). This only applies to multi-part book +documents.
+
+
+
+
+
+

Example Bibliography

+
+

The bibliography list is a style of AsciiDoc bulleted list.

+
    +
  • +

    +[taoup] Eric Steven Raymond. The Art of Unix + Programming. Addison-Wesley. ISBN 0-13-142901-9. +

    +
  • +
  • +

    +[walsh-muellner] Norman Walsh & Leonard Muellner. + DocBook - The Definitive Guide. O’Reilly & Associates. 1999. + ISBN 1-56592-580-7. +

    +
  • +
+
+
+
+

Example Glossary

+
+

Glossaries are optional. Glossaries entries are an example of a style +of AsciiDoc labeled lists.

+
+
+A glossary term +
+
+

+ The corresponding (indented) definition. +

+
+
+A second glossary term +
+
+

+ The corresponding (indented) definition. +

+
+
+
+
+
+

Example Colophon

+
+

Text at the end of a book describing facts about its production.

+
+
+
+

Example Index

+
+
+
+
+

+ + + Index: docs/manual/reference.txt ================================================================== --- docs/manual/reference.txt +++ docs/manual/reference.txt @@ -1,15 +1,9 @@ Reference ========= -The First Chapter of the Second Part ------------------------------------- - -Chapters grouped into book parts are at level 1 and can contain -sub-sections. - The testconfig File ------------------- Setup section ~~~~~~~~~~~~~ @@ -178,8 +172,31 @@ [triggers] COMPLETED/ xterm -e bash -s -- ----------------- NOTE: There is a trailing space after the -- + +Programming API +--------------- + +These routines can be called from the megatest repl. + +.API Server Management Calls +[width="70%",cols="^,2m,2m,2m",frame="topbot",options="header,footer"] +|====================== +|API Call | Purpose comments | Returns | Comments +|(rmt:start-server run-id) | | #( success/fail n/a ) | +|(rmt:kill-server run-id) | | #( success/fail n/a ) | Works only if the server is still reachable +|(rmt:login run-id) | Verify the the version, testsuite area etc. are correct. | #( #t "successful login" ) | +|====================== + +.API Keys Related Calls +[width="70%",cols="^,2m,2m,2m",frame="topbot",options="header,footer"] +|====================== +|API Call | Purpose comments | Returns | Comments +|(rmt:get-keys run-id) | | ( key1 key2 ... ) | +| (rmt:get-key-val-pairs run-id) | | #t=success/#f=fail | Works only if the server is still reachable +|====================== + :numbered!: ADDED fs-transport.scm Index: fs-transport.scm ================================================================== --- /dev/null +++ fs-transport.scm @@ -0,0 +1,44 @@ + +;; Copyright 2006-2012, Matthew Welland. +;; +;; This program is made available under the GNU GPL version 2.0 or +;; greater. See the accompanying file COPYING for details. +;; +;; This program is distributed WITHOUT ANY WARRANTY; without even the +;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +;; PURPOSE. + +(require-extension (srfi 18) extras tcp s11n) + +(use sqlite3 srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) +(import (prefix sqlite3 sqlite3:)) + +(use spiffy uri-common intarweb http-client spiffy-request-vars) + +(tcp-buffer-size 2048) + +(declare (unit fs-transport)) + +(declare (uses common)) +(declare (uses db)) +(declare (uses tests)) +(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. + +(include "common_records.scm") +(include "db_records.scm") + + +;;====================================================================== +;; F S T R A N S P O R T S E R V E R +;;====================================================================== + +;; There is no "server" per se but a convience routine to make it non +;; necessary to be reopening the db over and over again. +;; + +(define (fs:process-queue-item packet) + (if (not *megatest-db*) ;; we will require that (setup-for-run) has already been called + (set! *megatest-db* (open-db))) + (debug:print-info 11 "fs:process-queue-item called with packet=" packet) + (db:process-queue-item *megatest-db* packet)) + Index: gutils.scm ================================================================== --- gutils.scm +++ gutils.scm @@ -23,11 +23,11 @@ (null? (filter (lambda (x)(> x 3)) delta)))) (define (gutils:get-color-for-state-status state status);; #!key (get-label #f)) ;; ((if get-label cadr car) (case (string->symbol state) - ((COMPLETED) + ((COMPLETED ARCHIVED) (case (string->symbol status) ((PASS) (list "70 249 73" status)) ((WARN WAIVED) (list "255 172 13" status)) ((SKIP) (list "230 230 0" status)) (else (list "223 33 49" status)))) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -76,10 +76,16 @@ (debug:print-info 0 "portlogger recommended port: " start-port) (root-path (if link-tree-path link-tree-path (current-directory))) ;; WARNING: SECURITY HOLE. FIX ASAP! (handle-directory spiffy-directory-listing) + (handle-exception (lambda (exn chain) + (signal (make-composite-condition + (make-property-condition + 'server + 'message "server error"))))) + ;; http-transport:handle-directory) ;; simple-directory-handler) ;; Setup the web server and a /ctrl interface ;; (vhost-map `(((* any) . ,(lambda (continue) ;; open the db on the first call @@ -93,31 +99,10 @@ (send-response body: (api:process-request *inmemdb* $) ;; the $ is the request vars proc headers: '((content-type text/plain))) (mutex-lock! *heartbeat-mutex*) (set! *last-db-access* (current-seconds)) (mutex-unlock! *heartbeat-mutex*)) - ;; This is the /ctrl path where data is handed to the server and - ;; responses - ;; ((equal? (uri-path (request-uri (current-request))) - ;; '(/ "ctrl")) - ;; (let* ((packet (db:string->obj dat)) - ;; (qtype (cdb:packet-get-qtype packet))) - ;; (debug:print-info 12 "server=> received packet=" packet) - ;; (if (not (member qtype '(sync ping))) - ;; (begin - ;; (mutex-lock! *heartbeat-mutex*) - ;; (set! *last-db-access* (current-seconds)) - ;; (mutex-unlock! *heartbeat-mutex*))) - ;; ;; (mutex-lock! *db:process-queue-mutex*) ;; trying a mutex - ;; ;; (set! res (open-run-close db:process-queue-item open-db packet)) - ;; (set! res (db:process-queue-item db packet)) - ;; ;; (mutex-unlock! *db:process-queue-mutex*) - ;; (debug:print-info 11 "Return value from db:process-queue-item is " res) - ;; (send-response body: (conc "ctrl data\n" - ;; res - ;; "") - ;; headers: '((content-type text/plain))))) ((equal? (uri-path (request-uri (current-request))) '(/ "")) (send-response body: (http-transport:main-page))) ((equal? (uri-path (request-uri (current-request))) '(/ "runs")) @@ -243,28 +228,30 @@ (http-transport:server-dat-get-api-req serverdat) (begin (debug:print 0 "FATAL ERROR: http-transport:client-api-send-receive called with no server info") (exit 1)))) (res #f) - (success #t)) - (handle-exceptions - exn - (if (> numretries 0) - (begin - (mutex-unlock! *http-mutex*) - (thread-sleep! 1) - (handle-exceptions - exn - (debug:print 0 "WARNING: closing connections failed. Server at " fullurl " almost certainly dead") - (close-all-connections!)) - (debug:print 0 "WARNING: Failed to communicate with server, trying again, numretries left: " numretries) - (http-transport:client-api-send-receive run-id serverdat cmd params numretries: (- numretries 1))) - (begin - (mutex-unlock! *http-mutex*) - (tasks:kill-server-run-id run-id) - #f)) - (begin + (success #t) + (sparams (db:obj->string params transport: 'http))) +;; (condition-case +;; handle-exceptions +;; exn +;; (if (> numretries 0) +;; (begin +;; (mutex-unlock! *http-mutex*) +;; (thread-sleep! 1) +;; (handle-exceptions +;; exn +;; (debug:print 0 "WARNING: closing connections failed. Server at " fullurl " almost certainly dead") +;; (close-all-connections!)) +;; (debug:print 0 "WARNING: Failed to communicate with server, trying again, numretries left: " numretries) +;; (http-transport:client-api-send-receive run-id serverdat cmd sparams numretries: (- numretries 1))) +;; (begin +;; (mutex-unlock! *http-mutex*) +;; (tasks:kill-server-run-id run-id) +;; #f)) +;; (begin (debug:print-info 11 "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n") ;; set up the http-client here (max-retry-attempts 1) ;; consider all requests indempotent (retry-request? (lambda (request) @@ -276,25 +263,31 @@ (mutex-lock! *http-mutex*) ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines) ;; ((exn http client-error) e (print e))) (set! res (vector success - (handle-exceptions - exn - (begin - (set! success #f) - (debug:print 0 "WARNING: failure in with-input-from-request to " fullurl ". Killing associated server to allow clean retry.") - (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) - (hash-table-delete! *runremote* run-id) - ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine. - #f) - (with-input-from-request ;; was dat - fullurl - (list (cons 'key "thekey") - (cons 'cmd cmd) - (cons 'params params)) - read-string)))) + (db:string->obj + (handle-exceptions + exn + (begin + (set! success #f) + (debug:print 0 "WARNING: failure in with-input-from-request to " fullurl ".") + (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (hash-table-delete! *runremote* run-id) + ;; Killing associated server to allow clean retry.") + (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? + (mutex-unlock! *http-mutex*) + (signal (make-composite-condition + (make-property-condition 'commfail 'message "failed to connect to server"))) + "communications failed") + (with-input-from-request ;; was dat + fullurl + (list (cons 'key "thekey") + (cons 'cmd cmd) + (cons 'params sparams)) + read-string)) + transport: 'http))) ;; Shouldn't this be a call to the managed call-all-connections stuff above? (close-all-connections!) (mutex-unlock! *http-mutex*) )) (time-out (lambda () @@ -305,11 +298,24 @@ (thread-start! th1) (thread-start! th2) (thread-join! th1) (thread-terminate! th2) (debug:print-info 11 "got res=" res) - res))))) + (if (vector? res) + (if (vector-ref res 0) + res + (begin ;; note: this code also called in nmsg-transport - consider consolidating it + (debug:print 0 "ERROR: error occured at server, info=" (vector-ref res 2)) + (debug:print 0 " client call chain:") + (print-call-chain (current-error-port)) + (debug:print 0 " server call chain:") + (pp (vector-ref res 1) (current-error-port)) + (signal (vector-ref result 0)))) + (signal (make-composite-condition + (make-property-condition + 'timeout + 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server"))))))) ;; careful closing of connections stored in *runremote* ;; (define (http-transport:close-connections run-id) (let* ((server-dat (hash-table-ref/default *runremote* run-id #f))) @@ -318,17 +324,18 @@ (close-connection! api-dat) #t) #f))) -(define (make-http-transport:server-dat)(make-vector 5)) +(define (make-http-transport:server-dat)(make-vector 6)) (define (http-transport:server-dat-get-iface vec) (vector-ref vec 0)) (define (http-transport:server-dat-get-port vec) (vector-ref vec 1)) (define (http-transport:server-dat-get-api-uri vec) (vector-ref vec 2)) (define (http-transport:server-dat-get-api-url vec) (vector-ref vec 3)) (define (http-transport:server-dat-get-api-req vec) (vector-ref vec 4)) (define (http-transport:server-dat-get-last-access vec) (vector-ref vec 5)) +(define (http-transport:server-dat-get-socket vec) (vector-ref vec 6)) (define (http-transport:server-dat-make-url vec) (if (and (http-transport:server-dat-get-iface vec) (http-transport:server-dat-get-port vec)) (conc "http://" @@ -336,11 +343,15 @@ ":" (http-transport:server-dat-get-port vec)) #f)) (define (http-transport:server-dat-update-last-access vec) - (vector-set! vec 5 (current-seconds))) + (if (vector? vec) + (vector-set! vec 5 (current-seconds)) + (begin + (print-call-chain (current-error-port)) + (debug:print 0 "ERROR: call to http-transport:server-dat-update-last-access with non-vector!!")))) ;; ;; connect ;; (define (http-transport:client-connect iface port) @@ -355,10 +366,11 @@ ;; (define (http-transport:keep-running server-id run-id) ;; if none running or if > 20 seconds since ;; server last used then start shutdown ;; This thread waits for the server to come alive + (debug:print-info 0 "Starting the sync-back, keep alive thread in server for run-id=" run-id) (let* ((tdbdat (tasks:open-db)) (server-info (let loop ((start-time (current-seconds)) (changed #t) (last-sdat "not this")) (let ((sdat #f)) @@ -383,47 +395,62 @@ (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) (last-access 0) - (server-timeout (let ((tmo (configf:lookup *configdat* "server" "timeout"))) - (if (and (string? tmo) - (string->number tmo)) - (* 60 60 (string->number tmo)) - ;; (* 3 24 60 60) ;; default to three days - (* 60 1) ;; default to one minute - ;; (* 60 60 25) ;; default to 25 hours - )))) + (server-timeout (server:get-timeout))) (let loop ((count 0) - (server-state 'available)) + (server-state 'available) + (bad-sync-count 0)) + ;; Use this opportunity to sync the inmemdb to db - (let ((start-time (current-milliseconds)) - (sync-time #f) - (rem-time #f)) - ;; inmemdb is a dbstruct - (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) - (set! sync-time (- (current-milliseconds) start-time)) - (set! rem-time (quotient (- 4000 sync-time) 1000)) - (debug:print 2 "SYNC: time= " sync-time ", rem-time=" rem-time) - - ;; - ;; set_running after our first pass through and start the db - ;; - (if (eq? server-state 'available) - (begin - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") - (thread-sleep! 5) ;; give some margin for queries to complete before switching from file based access to server based access - (set! *inmemdb* (db:setup run-id)) - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running"))) - - (if (and (<= rem-time 4) - (> rem-time 0)) - (thread-sleep! rem-time) - (thread-sleep! 4))) ;; fallback for if the math is changed ... + (if *inmemdb* + (let ((start-time (current-milliseconds)) + (sync-time #f) + (rem-time #f)) + ;; inmemdb is a dbstruct + (condition-case + (db:sync-touched *inmemdb* *run-id* force-sync: #t) + ((sync-failed)(cond + ((> bad-sync-count 10) ;; time to give up + (http-transport:server-shutdown server-id port)) + (else ;; (> bad-sync-count 0) ;; we've had a fail or two, delay and loop + (thread-sleep! 5) + (loop count server-state (+ bad-sync-count 1))))) + ((exn) + (debug:print 0 "ERROR: error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server") + (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running crashed") + (exit))) + (set! sync-time (- (current-milliseconds) start-time)) + (set! rem-time (quotient (- 4000 sync-time) 1000)) + (debug:print 2 "SYNC: time= " sync-time ", rem-time=" rem-time) + + (if (and (<= rem-time 4) + (> rem-time 0)) + (thread-sleep! rem-time) + (thread-sleep! 4))) ;; fallback for if the math is changed ... + + ;; + ;; no *inmemdb* yet, set running after our first pass through and start the db + ;; + (if (eq? server-state 'available) + (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers + (if (equal? new-server-id server-id) + (begin + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") + (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access + (set! *inmemdb* (db:setup run-id)) + ;; force initialization + ;; (db:get-db *inmemdb* #t) + (db:get-db *inmemdb* run-id) + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")) + (begin ;; gotta exit nicely + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") + (http-transport:server-shutdown server-id port)))))) (if (< count 1) ;; 3x3 = 9 secs aprox - (loop (+ count 1) 'running)) + (loop (+ count 1) 'running bad-sync-count)) ;; Check that iface and port have not changed (can happen if server port collides) (mutex-lock! *heartbeat-mutex*) (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) @@ -445,28 +472,22 @@ ;; no_traffic, no running tests, if server 0, no running servers ;; ;; (let ((wait-on-running (configf:lookup *configdat* "server" "wait-on-running"))) ;; wait on running tasks (if not true then exit on time out) ;; (if (and *server-run* - ;; (or (> (+ last-access server-timeout) (current-seconds))) -;; (and (eq? run-id 0) -;; (> (tasks:num-servers-non-zero-running tdb) 0)) -;; (and (not (eq? run-id 0)) ;; only makes sense in non-zero run-id servers -;; (> (db:get-count-tests-actually-running *inmemdb* run-id) 0)) -;; )) (begin (debug:print-info 0 "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) ;; ;; Consider implementing some smarts here to re-insert the record or kill self is ;; the db indicates so ;; ;; (if (tasks:server-am-i-the-server? tdb run-id) ;; (tasks:server-set-state! tdb server-id "running")) ;; - (loop 0 server-state)) + (loop 0 server-state bad-sync-count)) (http-transport:server-shutdown server-id port))))) (define (http-transport:server-shutdown server-id port) (let ((tdbdat (tasks:open-db))) (debug:print-info 0 "Starting to shutdown the server.") @@ -545,10 +566,22 @@ (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. (thread-start! th3) (set! *didsomething* #t) (thread-join! th2) (exit)))))) + +(define (http:ping run-id host-port) + (let* ((server-dat (http-transport:client-connect (car host-port)(cadr host-port))) + (login-res (rmt:login-no-auto-client-setup server-dat run-id))) + (if (and (list? login-res) + (car login-res)) + (begin + (print "LOGIN_OK") + (exit 0)) + (begin + (print "LOGIN_FAILED") + (exit 1))))) (define (http-transport:server-signal-handler signum) (signal-mask! signum) (handle-exceptions exn Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -451,17 +451,34 @@ ;; have chicken/egg scenario. need to read megatest.config then read it again. Going to ;; pass on that idea for now ;; special case (if (or force (not (hash-table? *configdat*))) ;; no need to re-open on every call (begin - (set! *configinfo* (find-and-read-config - (if (args:get-arg "-config")(args:get-arg "-config") "megatest.config") - environ-patt: "env-override" - given-toppath: (get-environment-variable "MT_RUN_AREA_HOME") - pathenvvar: "MT_RUN_AREA_HOME")) + (set! *configinfo* (or (if (get-environment-variable "MT_CMDINFO") ;; we are inside a test - do not reprocess configs + (let ((alistconfig (conc (get-environment-variable "MT_LINKTREE") "/" + (get-environment-variable "MT_TARGET") "/" + (get-environment-variable "MT_RUNNAME") "/" + ".megatest.cfg"))) + (if (file-exists? alistconfig) + (list (configf:read-alist alistconfig) + (get-environment-variable "MT_RUN_AREA_HOME")) + #f)) + #f) ;; no config cached - give up + (find-and-read-config + (if (args:get-arg "-config")(args:get-arg "-config") "megatest.config") + environ-patt: "env-override" + given-toppath: (get-environment-variable "MT_RUN_AREA_HOME") + pathenvvar: "MT_RUN_AREA_HOME"))) (set! *configdat* (if (car *configinfo*)(car *configinfo*) #f)) (set! *toppath* (if (car *configinfo*)(cadr *configinfo*) #f)) + (let* ((tmptransport (configf:lookup *configdat* "server" "transport")) + (transport (if tmptransport (string->symbol tmptransport) 'http))) + (if (member transport '(http rpc nmsg)) + (set! *transport-type* transport) + (begin + (debug:print 0 "ERROR: Unrecognised transport " transport) + (exit)))) (let ((linktree (configf:lookup *configdat* "setup" "linktree"))) ;; link tree is critical (if linktree (if (not (file-exists? linktree)) (begin (handle-exceptions @@ -489,47 +506,53 @@ (if (and *toppath* (directory-exists? *toppath*)) (setenv "MT_RUN_AREA_HOME" *toppath*) (begin (debug:print 0 "ERROR: failed to find the top path to your Megatest area.") - (exit 1)))))) + (exit 1))) + ))) *toppath*) + +(define (launch:cache-config) + ;; if we have a linktree and -runtests and -target and the directory exists dump the config + ;; to megatest-(current-seconds).cfg and symlink it to megatest.cfg + (if (and *configdat* + (args:get-arg "-runtests")) + (let* ((linktree (get-environment-variable "MT_LINKTREE")) + (target (common:args-get-target)) + (runname (or (args:get-arg "-runname") + (args:get-arg ":runname"))) + (fulldir (conc linktree "/" + target "/" + runname))) + (debug:print-info 0 "Have -runtests with target=" target ", runname=" runname ", fulldir=" fulldir) + (if (file-exists? linktree) ;; can't proceed without linktree + (begin + (if (not (file-exists? fulldir)) + (create-directory fulldir #t)) ;; need to protect with exception handler + (if (and target + runname + (file-exists? fulldir)) + (let ((tmpfile (conc fulldir "/.megatest.cfg." (current-seconds))) + (targfile (conc fulldir "/.megatest.cfg"))) + (debug:print-info 0 "Caching megatest.config in " fulldir "/.megatest.cfg") + (configf:write-alist *configdat* tmpfile) + (system (conc "ln -sf " tmpfile " " targfile)) + ))))))) (define (get-best-disk confdat) (let* ((disks (hash-table-ref/default confdat "disks" #f)) - (best #f) - (bestsize 0)) + (minspace (let ((m (configf:lookup confdat "setup" "minspace"))) + (string->number (or m "10000"))))) (if disks - (for-each - (lambda (disk-num) - (let* ((dirpath (cadr (assoc disk-num disks))) - (freespc (cond - ((not (directory? dirpath)) - (if (common:low-noise-print 50 "disks not a dir " disk-num) - (debug:print 0 "WARNING: disk " disk-num " at path " dirpath " is not a directory - ignoring it.")) - -1) - ((not (file-write-access? dirpath)) - (if (common:low-noise-print 50 "disks not writeable " disk-num) - (debug:print 0 "WARNING: disk " disk-num " at path " dirpath " is not writeable - ignoring it.")) - -1) - ((not (eq? (string-ref dirpath 0) #\/)) - (if (common:low-noise-print 50 "disks not a proper path " disk-num) - (debug:print 0 "WARNING: disk " disk-num " at path " dirpath " is not a fully qualified path - ignoring it.")) - -1) - (else - (get-df dirpath))))) - (if (> freespc bestsize) - (begin - (set! best dirpath) - (set! bestsize freespc))))) - (map car disks))) - (if (and best (> bestsize 0)) - best - (begin - (if (common:low-noise-print 20 "no valid disks") - (debug:print 0 "ERROR: No valid disks found in megatest.config. Please add some to your [disks] section and ensure the directory exists!")) - (exit 1))))) + (let ((res (common:get-disk-with-most-free-space disks minspace))) ;; min size of 1000, seems tad dumb + (if res + (cdr res) + (begin + (if (common:low-noise-print 20 "no valid disks") + (debug:print 0 "ERROR: No valid disks found in megatest.config. Please add some to your [disks] section and ensure the directory exists!")) + (exit 1))))))) ;; Desired directory structure: ;; ;; - - -. ;; | @@ -581,11 +604,16 @@ (debug:print 0 "WARNING: linktree did not exist! Creating it now at " linktree) (create-directory linktree #t))) ;; (system (conc "mkdir -p " linktree)))) ;; create the directory for the tests dir links, this is needed no matter what... (if (and (not (directory-exists? lnkbase)) (not (file-exists? lnkbase))) - (create-directory lnkbase #t)) + (handle-exceptions + exn + (begin + (debug:print "ERROR: Problem creating linktree base at " lnkbase) + (print-error-message exn (current-error-port))) + (create-directory lnkbase #t))) ;; update the toptest record with its location rundir, cache the path ;; This wass highly inefficient, one db write for every subtest, potentially ;; thousands of unnecessary updates, cache the fact it was set and don't set it ;; again. @@ -644,11 +672,14 @@ ;; (rmt:general-call 'test-set-rundir run-id lnkpath testname "") ;; toptest-path) (if (or (not curr-test-path) (not (directory-exists? toptest-path))) (begin (debug:print-info 2 "Creating " toptest-path " and link " lnkpath) - (create-directory toptest-path #t) + (handle-exceptions + exn + #f ;; don't care to catch and deal with errors here for now. + (create-directory toptest-path #t)) (hash-table-set! *toptest-paths* testname toptest-path))))) ;; The toptest path has been created, the link to the test in the linktree has ;; been created. Now, if this is an iterated test the real test dir must be created (if (not not-iterated) ;; this is an iterated test @@ -767,11 +798,11 @@ ;; set up the run work area for this test (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin (debug:print-info 0 "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) - (runs:remove-test-directory #f testinfo #t))) ;; remove data only, do not perturb the record + (runs:remove-test-directory testinfo 'remove-data-only))) ;; remove data only, do not perturb the record ;; prevent overlapping actions - set to LAUNCHED as early as possible ;; (tests:test-set-status! run-id test-id "LAUNCHED" "n/a" #f #f) ;; (if launch-results launch-results "FAILED")) (rmt:roll-up-pass-fail-counts run-id test-name item-path "LAUNCHED") Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -1,7 +1,7 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1,1101 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.6006) +(define megatest-version 1.6007) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -8,15 +8,20 @@ ;; PURPOSE. ;; (include "common.scm") ;; (include "megatest-version.scm") -(use sqlite3 srfi-1 posix regex regex-case srfi-69 base64 format readline apropos json - http-client directory-utils z3 srfi-18) ;; extras) +(use sqlite3 srfi-1 posix regex regex-case srfi-69 base64 format readline apropos json http-client directory-utils rpc ;; (srfi 18) extras) + http-client srfi-18) ;; zmq extras) + +;; Added for csv stuff - will be removed +;; +(use sparse-vectors) (import (prefix sqlite3 sqlite3:)) (import (prefix base64 base64:)) +(import (prefix rpc rpc:)) ;; (use zmq) (declare (uses common)) (declare (uses megatest-version)) @@ -27,12 +32,11 @@ (declare (uses client)) (declare (uses tests)) (declare (uses genexample)) (declare (uses daemon)) (declare (uses db)) -;; (declare (uses sdb)) -;; (declare (uses filedb)) + (declare (uses tdb)) (declare (uses mt)) (declare (uses api)) (declare (uses tasks)) ;; only used for debugging. @@ -131,10 +135,11 @@ -update-meta : update the tests metadata for all tests -setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are overwritten by values set in config files. -server -|hostname : start the server (reduces contention on megatest.db), use - to automatically figure out hostname + -transport http|zmq : use http or zmq for transport (default is http) -daemonize : fork into background and disconnect from stdin/out -log logfile : send stdout and stderr to logfile -list-servers : list the servers -stop-server id : stop server specified by id (see output of -list-servers), use 0 to kill all @@ -144,12 +149,17 @@ -ping run-id|host:port : ping server, exit with 0 if found Utilities -env2file fname : write the environment to fname.csh and fname.sh -refdb2dat refdb : convert refdb to sexp or to format specified by -dumpmode - formats: perl, ruby, sqlite3 + formats: perl, ruby, sqlite3, csv (for csv the -o param + will substitute %s for the sheet name in generating + multiple sheets) -o : output file for refdb2dat (defaults to stdout) + -archive cmd : archive runs specified by selectors to one of disks specified + in the [archive-disks] section. + cmd: keep-html, restore, save, save-remove Spreadsheet generation -extract-ods fname.ods : extract an open document spreadsheet from the database -pathmod path : insert path, i.e. path/runame/itempath/logfile.html will clear the field if no rundir/testname/itempath/logfile @@ -206,10 +216,12 @@ ":units" ;; misc "-start-dir" "-server" "-stop-server" + "-transport" + "-kill-server" "-port" "-extract-ods" "-pathmod" "-env2file" "-setvars" @@ -226,10 +238,11 @@ "-run-id" "-ping" "-refdb2dat" "-o" "-log" + "-archive" ) (list "-h" "-help" "--help" "-version" "-force" "-xterm" @@ -241,11 +254,10 @@ "-summarize-items" "-gui" "-daemonize" "-preclean" ;; misc - "-archive" "-repl" "-lock" "-unlock" "-list-servers" "-run-wait" ;; wait on a run to complete (i.e. no RUNNING) @@ -283,15 +295,18 @@ args:arg-hash 0)) ;; The watchdog is to keep an eye on things like db sync etc. ;; +(define *time-zero* (current-seconds)) (define *watchdog* (make-thread (lambda () (thread-sleep! 0.05) ;; delay for startup - (let ((legacy-sync (configf:lookup *configdat* "setup" "megatest-db"))) + (let ((legacy-sync (configf:lookup *configdat* "setup" "megatest-db")) + (debug-mode (debug:debug-mode 1)) + (last-time (current-seconds))) (let loop () ;; sync for filesystem local db writes ;; (let ((start-time (current-seconds)) (servers-started (make-hash-table))) @@ -310,19 +325,28 @@ ;; (begin ;; (debug:print-info 0 "Sync is taking a long time, start up a server to assist for run " run-id) ;; (server:kind-run run-id))))) (hash-table-delete! *db-local-sync* run-id))) (mutex-unlock! *db-multi-sync-mutex*)) - (hash-table-keys *db-local-sync*))) - + (hash-table-keys *db-local-sync*)) + (if (and debug-mode + (> (- start-time last-time) 60)) + (begin + (set! last-time start-time) + (debug:print-info 1 "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*)))))) + ;; keep going unless time to exit ;; (if (not *time-to-exit*) - (begin - (thread-sleep! 1) ;; wait one second before syncing again - (loop))))) - "Watchdog thread"))) + (let delay-loop ((count 0)) + (if (and (not *time-to-exit*) + (< count 11)) ;; aprox 5-6 seconds + (begin + (thread-sleep! 1) + (delay-loop (+ count 1)))) + (loop)))))) + "Watchdog thread")) (thread-start! *watchdog*) (if (args:get-arg "-log") (let ((oup (open-output-file (args:get-arg "-log")))) @@ -412,24 +436,72 @@ x " => ")) (common:get-disks *configdat*)) "\n")) (set! *didsomething* #t))) + +(define (make-sparse-array) + (let ((a (make-sparse-vector))) + (sparse-vector-set! a 0 (make-sparse-vector)) + a)) + +(define (sparse-array? a) + (and (sparse-vector? a) + (sparse-vector? (sparse-vector-ref a 0)))) + +(define (sparse-array-ref a x y) + (let ((row (sparse-vector-ref a x))) + (if row + (sparse-vector-ref row y) + #f))) + +(define (sparse-array-set! a x y val) + (let ((row (sparse-vector-ref a x))) + (if row + (sparse-vector-set! row y val) + (let ((new-row (make-sparse-vector))) + (sparse-vector-set! a x new-row) + (sparse-vector-set! new-row y val))))) + +;; csv processing record +(define (make-refdb:csv) + (vector + (make-sparse-array) + (make-hash-table) + (make-hash-table) + 0 + 0)) +(define-inline (refdb:csv-get-svec vec) (vector-ref vec 0)) +(define-inline (refdb:csv-get-rows vec) (vector-ref vec 1)) +(define-inline (refdb:csv-get-cols vec) (vector-ref vec 2)) +(define-inline (refdb:csv-get-maxrow vec) (vector-ref vec 3)) +(define-inline (refdb:csv-get-maxcol vec) (vector-ref vec 4)) +(define-inline (refdb:csv-set-svec! vec val)(vector-set! vec 0 val)) +(define-inline (refdb:csv-set-rows! vec val)(vector-set! vec 1 val)) +(define-inline (refdb:csv-set-cols! vec val)(vector-set! vec 2 val)) +(define-inline (refdb:csv-set-maxrow! vec val)(vector-set! vec 3 val)) +(define-inline (refdb:csv-set-maxcol! vec val)(vector-set! vec 4 val)) + +(define (get-dat results sheetname) + (or (hash-table-ref/default results sheetname #f) + (let ((tmp-vec (make-refdb:csv))) + (hash-table-set! results sheetname tmp-vec) + tmp-vec))) (if (args:get-arg "-refdb2dat") (let* ((input-db (args:get-arg "-refdb2dat")) (out-file (args:get-arg "-o")) (out-fmt (or (args:get-arg "-dumpmode") "scheme")) (out-port (if (and out-file - (not (equal? out-fmt "sqlite3"))) + (not (member out-fmt '("sqlite3" "csv")))) (open-output-file out-file) (current-output-port))) (res-data (configf:read-refdb input-db)) (data (car res-data)) (msg (cadr res-data))) (if (not data) - (debug:print 0 data) ;; some error occurred + (debug:print 0 "Bad input? data=" data) ;; some error occurred (with-output-to-port out-port (lambda () (case (string->symbol out-fmt) ((scheme)(pp data)) ((perl) @@ -452,10 +524,81 @@ (lambda (sheetname) (print "data[\"" sheetname "\"] = {}")) initproc2: (lambda (sheetname sectionname) (print "data[\"" sheetname "\"][\"" sectionname "\"] = {}")))) + ((csv) + (let* ((results (make-hash-table)) ;; (make-sparse-array))) + (row-cols (make-hash-table))) ;; hash of hashes where section => ht { row- => num or col- => num + ;; (print "data=") + ;; (pp data) + (configf:map-all-hier-alist + data + (lambda (sheetname sectionname varname val) + ;; (print "sheetname: " sheetname ", sectionname: " sectionname ", varname: " varname ", val: " val) + (let* ((dat (get-dat results sheetname)) + (vec (refdb:csv-get-svec dat)) + (rownames (refdb:csv-get-rows dat)) + (colnames (refdb:csv-get-cols dat)) + (currrown (hash-table-ref/default rownames varname #f)) + (currcoln (hash-table-ref/default colnames sectionname #f)) + (rown (or currrown + (let* ((lastn (refdb:csv-get-maxrow dat)) + (newrown (+ lastn 1))) + (refdb:csv-set-maxrow! dat newrown) + newrown))) + (coln (or currcoln + (let* ((lastn (refdb:csv-get-maxcol dat)) + (newcoln (+ lastn 1))) + (refdb:csv-set-maxcol! dat newcoln) + newcoln)))) + (if (not (sparse-array-ref vec 0 coln)) ;; (eq? rown 0) + (begin + (sparse-array-set! vec 0 coln sectionname) + ;; (print "sparse-array-ref " 0 "," coln "=" (sparse-array-ref vec 0 coln)) + )) + (if (not (sparse-array-ref vec rown 0)) ;; (eq? coln 0) + (begin + (sparse-array-set! vec rown 0 varname) + ;; (print "sparse-array-ref " rown "," 0 "=" (sparse-array-ref vec rown 0)) + )) + (if (not currrown)(hash-table-set! rownames varname rown)) + (if (not currcoln)(hash-table-set! colnames sectionname coln)) + ;; (print "dat=" dat ", rown=" rown ", coln=" coln) + (sparse-array-set! vec rown coln val) + ;; (print "sparse-array-ref " rown "," coln "=" (sparse-array-ref vec rown coln)) + ))) + (for-each + (lambda (sheetname) + (let* ((sheetdat (get-dat results sheetname)) + (svec (refdb:csv-get-svec sheetdat)) + (maxrow (refdb:csv-get-maxrow sheetdat)) + (maxcol (refdb:csv-get-maxcol sheetdat)) + (fname (if out-file + (string-substitute "%s" sheetname out-file) ;; "/foo/bar/%s.csv") + (conc sheetname ".csv")))) + (with-output-to-file fname + (lambda () + ;; (print "Sheetname: " sheetname) + (let loop ((row 0) + (col 0) + (curr-row '()) + (result '())) + (let* ((val (sparse-array-ref svec row col)) + (disp-val (if val + (conc "\"" val "\"") + ""))) + (if (> col 0)(display ",")) + (display disp-val) + (cond + ((> row maxrow)(display "\n") result) + ((>= col maxcol) + (display "\n") + (loop (+ row 1) 0 '() (append result (list curr-row)))) + (else + (loop row (+ col 1) (append curr-row (list val)) result))))))))) + (hash-table-keys results)))) ((sqlite3) (let* ((db-file (or out-file (pathname-file input-db))) (db-exists (file-exists? db-file)) (db (sqlite3:open-database db-file))) (if (not db-exists)(sqlite3:execute db "CREATE TABLE data (sheet,section,var,val);")) @@ -475,10 +618,18 @@ (if (args:get-arg "-ping") (let* ((run-id (string->number (args:get-arg "-run-id"))) (host:port (args:get-arg "-ping"))) (server:ping run-id host:port))) +;; (set! *did-something* #t) +;; (begin +;; (print ((rpc:procedure 'testing (car host-port)(cadr host-port)))) +;; (case (server:get-transport) +;; ((http)(http:ping run-id host-port)) +;; ((rpc) (rpc:procedure 'server:login (car host-port)(cadr host-port));; *toppath*)) ;; (rpc-transport:ping run-id (car host-port)(cadr host-port))) +;; (else (debug:print 0 "ERROR: No transport set")(exit))))) + ;;====================================================================== ;; Start the server - can be done in conjunction with -runall or -runtests (one day...) ;; we start the server if not running else start the client thread ;;====================================================================== @@ -502,11 +653,12 @@ equal? (hash-table-keys args:arg-hash) '("-list-servers" "-stop-server" "-show-cmdinfo" - "-list-runs"))) + "-list-runs" + "-ping"))) (if (launch:setup-for-run) (let ((run-id (and (args:get-arg "-run-id") (string->number (args:get-arg "-run-id"))))) ;; (set! *fdb* (filedb:open-db (conc *toppath* "/db/paths.db"))) ;; if not list or kill then start a client (if appropriate) @@ -908,10 +1060,11 @@ (if (or (args:get-arg "-test-files")(args:get-arg "-test-paths")) ;; if we are in a test use the MT_CMDINFO data (if (getenv "MT_CMDINFO") (let* ((startingdir (current-directory)) (cmdinfo (common:read-encoded-string (getenv "MT_CMDINFO"))) + (transport (assoc/default 'transport cmdinfo)) (testpath (assoc/default 'testpath cmdinfo)) (test-name (assoc/default 'test-name cmdinfo)) (runscript (assoc/default 'runscript cmdinfo)) (db-host (assoc/default 'db-host cmdinfo)) (run-id (assoc/default 'run-id cmdinfo)) @@ -951,49 +1104,16 @@ ;;====================================================================== ;; Archive tests ;;====================================================================== ;; Archive tests matching target, runname, and testpatt (if (args:get-arg "-archive") - ;; if we are in a test use the MT_CMDINFO data - (if (getenv "MT_CMDINFO") - (let* ((startingdir (current-directory)) - (cmdinfo (common:read-encoded-string (getenv "MT_CMDINFO"))) - (testpath (assoc/default 'testpath cmdinfo)) - (test-name (assoc/default 'test-name cmdinfo)) - (runscript (assoc/default 'runscript cmdinfo)) - (db-host (assoc/default 'db-host cmdinfo)) - (run-id (assoc/default 'run-id cmdinfo)) - (itemdat (assoc/default 'itemdat cmdinfo)) - (state (args:get-arg ":state")) - (status (args:get-arg ":status")) - (target (args:get-arg "-target"))) - (change-directory testpath) - (if (not target) - (begin - (debug:print 0 "ERROR: -target is required.") - (exit 1))) - (if (not (launch:setup-for-run)) - (begin - (debug:print 0 "Failed to setup, giving up on -archive, exiting") - (exit 1))) - (let* ((keys (rmt:get-keys)) - (paths (tests:test-get-paths-matching keys target))) - (set! *didsomething* #t) - (for-each (lambda (path) - (print path)) - paths)) - ;; (if (sqlite3:database? db)(sqlite3:finalize! db)) - ) - ;; else do a general-run-call - (general-run-call - "-test-paths" - "Get paths to tests" - (lambda (target runname keys keyvals) - (let* ((paths (tests:test-get-paths-matching keys target))) - (for-each (lambda (path) - (print path)) - paths)))))) + ;; else do a general-run-call + (general-run-call + "-archive" + "Archive" + (lambda (target runname keys keyvals) + (operate-on 'archive)))) ;;====================================================================== ;; Extract a spreadsheet from the runs database ;;====================================================================== @@ -1033,10 +1153,11 @@ (if (not (getenv "MT_CMDINFO")) (begin (debug:print 0 "ERROR: MT_CMDINFO env var not set, -step must be called *inside* a megatest invoked environment!") (exit 5)) (let* ((cmdinfo (common:read-encoded-string (getenv "MT_CMDINFO"))) + (transport (assoc/default 'transport cmdinfo)) (testpath (assoc/default 'testpath cmdinfo)) (test-name (assoc/default 'test-name cmdinfo)) (runscript (assoc/default 'runscript cmdinfo)) (db-host (assoc/default 'db-host cmdinfo)) (run-id (assoc/default 'run-id cmdinfo)) @@ -1057,12 +1178,12 @@ (if (args:get-arg "-step") (begin (megatest:step (args:get-arg "-step") - (args:get-arg ":state") - (args:get-arg ":status") + (or (args:get-arg "-state")(args:get-arg ":state")) + (or (args:get-arg "-status")(args:get-arg ":status")) (args:get-arg "-setlog") (args:get-arg "-m")) ;; (if db (sqlite3:finalize! db)) (set! *didsomething* #t))) @@ -1079,10 +1200,11 @@ (begin (debug:print 0 "ERROR: MT_CMDINFO env var not set, commands -test-status, -runstep and -setlog must be called *inside* a megatest environment!") (exit 5)) (let* ((startingdir (current-directory)) (cmdinfo (common:read-encoded-string (getenv "MT_CMDINFO"))) + (transport (assoc/default 'transport cmdinfo)) (testpath (assoc/default 'testpath cmdinfo)) (test-name (assoc/default 'test-name cmdinfo)) (runscript (assoc/default 'runscript cmdinfo)) (db-host (assoc/default 'db-host cmdinfo)) (run-id (assoc/default 'run-id cmdinfo)) ADDED mlaunch.scm Index: mlaunch.scm ================================================================== --- /dev/null +++ mlaunch.scm @@ -0,0 +1,26 @@ +;; Copyright 2006-2014, Matthew Welland. +;; +;; This program is made available under the GNU GPL version 2.0 or +;; greater. See the accompanying file COPYING for details. +;; +;; This program is distributed WITHOUT ANY WARRANTY; without even the +;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +;; PURPOSE. + +;; strftime('%m/%d/%Y %H:%M:%S','now','localtime') + +;;====================================================================== +;; MLAUNCH +;; +;; take jobs from the given queue and keep launching them keeping +;; the cpu load at the targeted level +;; +;;====================================================================== + +(use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking format) +(import (prefix sqlite3 sqlite3:)) + +(declare (unit mlaunch)) +(declare (uses db)) +(declare (uses common)) + Index: newdashboard.scm ================================================================== --- newdashboard.scm +++ newdashboard.scm @@ -78,10 +78,14 @@ ;; (begin ;; (set! *runremote* (string-split (args:get-arg "-host" ":"))) ;; (client:launch)) ;; (client:launch)) +;; ease debugging by loading ~/.dashboardrc +(let ((debugcontrolf (conc (get-environment-variable "HOME") "/.dashboardrc"))) + (if (file-exists? debugcontrolf) + (load debugcontrolf))) (define *dbdir* (conc (configf:lookup *configdat* "setup" "linktree") "/.db")) (define *dbstruct-local* (make-dbr:dbstruct path: *dbdir* local: #t)) (define *db-file-path* (db:dbfile-path 0)) ADDED nmsg-transport.scm Index: nmsg-transport.scm ================================================================== --- /dev/null +++ nmsg-transport.scm @@ -0,0 +1,358 @@ + +;; Copyright 2006-2012, Matthew Welland. +;; +;; This program is made available under the GNU GPL version 2.0 or +;; greater. See the accompanying file COPYING for details. +;; +;; This program is distributed WITHOUT ANY WARRANTY; without even the +;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +;; PURPOSE. + +(require-extension (srfi 18) extras tcp s11n) + +(use sqlite3 srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) +(import (prefix sqlite3 sqlite3:)) + +;; (use nanomsg) + +(declare (unit nmsg-transport)) + +(declare (uses common)) +(declare (uses db)) +(declare (uses tests)) +(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. +(declare (uses server)) + +(include "common_records.scm") +(include "db_records.scm") + +;; Transition to pub --> sub with pull <-- push +;; +;; 1. client sends request to server via push to the pull port +;; 2. server puts request in queue or processes immediately as appropriate +;; 3. server puts responses from completed requests into pub port +;; +;; TODO +;; +;; Done Tested +;; [x] [ ] 1. Add columns pullport pubport to servers table +;; [x] [ ] 2. Add rm of monitor.db if older than 11/12/2012 +;; [x] [ ] 3. Add create of pullport and pubport with finding of available ports +;; [x] [ ] 4. Add client compose of request +;; [x] [ ] - name of client: testname/itempath-test_id-hostname +;; [x] [ ] - name of request: callname, params +;; [x] [ ] - request key: f(clientname, callname, params) +;; [x] [ ] 5. Add processing of subscription hits +;; [x] [ ] - done when get key +;; [x] [ ] - return results +;; [x] [ ] 6. Add timeout processing +;; [x] [ ] - after 60 seconds +;; [ ] [ ] i. check server alive, connect to new if necessary +;; [ ] [ ] ii. resend request +;; [ ] [ ] 7. Turn self ping back on + +(define (nmsg-transport:make-server-url hostport #!key (bindall #f)) + (if (not hostport) + #f + (conc "tcp://" (if bindall "*" (car hostport)) ":" (cadr hostport)))) + +(define *server-loop-heart-beat* (current-seconds)) +(define *heartbeat-mutex* (make-mutex)) + +;;====================================================================== +;; S E R V E R +;;====================================================================== + +(define (nmsg-transport:run dbstruct hostn run-id server-id #!key (retrynum 1000)) + (debug:print 2 "Attempting to start the server ...") + (let* ((start-port (portlogger:open-run-close portlogger:find-port)) + (server-thread (make-thread (lambda () + (nmsg-transport:try-start-server dbstruct run-id start-port server-id)) + "server thread")) + (tdbdat (tasks:open-db))) + (thread-start! server-thread) + (thread-sleep! 0.1) + (if (nmsg-transport:ping hostn start-port timeout: 2 expected-key: (current-process-id)) + (let ((interface (if (equal? hostn "-")(get-host-name) hostn))) + (tasks:server-set-interface-port (db:delay-if-busy tdbdat) server-id interface start-port) + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") + (set! *server-info* (list hostn start-port)) ;; probably not needed anymore? currently used by keep-running + (thread-sleep! 3) ;; give some margin for queries to complete before switching from file based access to server based access + ;; (set! *inmemdb* dbstruct) + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") + (thread-start! (make-thread + (lambda ()(nmsg-transport:keep-running server-id run-id)) + "keep running")) + (thread-join! server-thread)) + (if (> retrynum 0) + (begin + (debug:print 0 "WARNING: Failed to connect to server (self) on host " hostn ":" start-port ", trying again.") + (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature") + (portlogger:open-run-close portlogger:set-failed start-port) + (nmsg-transport:run dbstruct hostn run-id server-id)) + (begin + (debug:print 0 "ERROR: could not find an open port to start server on. Giving up") + (exit 1)))))) + +(define (nmsg-transport:try-start-server dbstruct run-id portnum server-id) + (let ((repsoc (nn-socket 'rep))) + (nn-bind repsoc (conc "tcp://*:" portnum)) + (let loop ((msg-in (nn-recv repsoc))) + (let* ((dat (db:string->obj msg-in transport: 'nmsg))) + (debug:print 0 "server, received: " dat) + (let ((result (api:execute-requests dbstruct dat))) + (debug:print 0 "server, sending: " result) + (nn-send repsoc (db:obj->string result transport: 'nmsg))) + (loop (nn-recv repsoc)))))) + +;; all routes though here end in exit ... +;; +(define (nmsg-transport:launch run-id) + (let* ((tdbdat (tasks:open-db)) + (dbstruct (db:setup run-id)) + (hostn (or (args:get-arg "-server") "-"))) + (set! *run-id* run-id) + (set! *inmemdb* dbstruct) + ;; with nbfake daemonize isn't really needed + ;; + ;; (if (args:get-arg "-daemonize") + ;; (begin + ;; (daemon:ize) + ;; (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it + ;; (begin + ;; (current-error-port *alt-log-file*) + ;; (current-output-port *alt-log-file*))))) + (if (server:check-if-running run-id) + (begin + (debug:print-info 0 "Server for run-id " run-id " already running") + (exit 0))) + (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)) + (remtries 4)) + (if (not server-id) + (if (> remtries 0) + (begin + (thread-sleep! 2) + (if (not (server:check-if-running run-id)) + (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id) + (- remtries 1)) + (begin + (debug:print-info 0 "Another server took the slot, exiting") + (exit 0)))) + (begin + ;; since we didn't get the server lock we are going to clean up and bail out + (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") + (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch") + )) + ;; locked in a server id, try to start up + (nmsg-transport:run dbstruct hostn run-id server-id)) + (set! *didsomething* #t) + (exit)))) + +;;====================================================================== +;; S E R V E R U T I L I T I E S +;;====================================================================== + +(define (nmsg-transport:mk-signature) + (message-digest-string (md5-primitive) + (with-output-to-string + (lambda () + (write (list (current-directory) + (argv))))))) + +;;====================================================================== +;; C L I E N T S +;;====================================================================== + +;; ping the server at host:port +;; return the open socket if successful (return-socket == #t) +;; expect the key expected-key returned in payload +;; send our-key or #f as payload +;; +(define (nmsg-transport:ping hostn port #!key (timeout 3)(return-socket #t)(expected-key #f)(our-key #f)(socket #f)) + ;; send a random number along with pid and check that we get it back + (let* ((host (if (or (not hostn) + (equal? hostn "-")) ;; use localhost + (get-host-name) + hostn)) + (req (or socket + (let ((soc (nn-socket 'req))) + (nn-connect soc (conc "tcp://" host ":" port)) + soc))) + (success #t) + (dat (vector "ping" our-key)) + (result (condition-case + (nmsg-transport:client-api-send-receive-raw req dat timeout: timeout) + ((timeout)(set! success #f) #f))) + (key (if success + (vector-ref result 1) + #f))) + (debug:print 0 "success=" success ", key=" key ", expected-key=" expected-key ", equal? " (equal? key expected-key)) + (if (and success + (or (not expected-key) ;; just getting a reply is good enough then + (equal? key expected-key))) + (if return-socket + req + (begin + (if (not socket)(nn-close req)) ;; don't want a side effect of closing socket if handed it + #t)) + (begin + (if (not socket)(nn-close req)) ;; failed to ping, close socket as side effect + #f)))) + +;; send data to server, wait max of timeout seconds for a response. +;; return #( success/fail result ) +;; +;; for effiency it is easier to do the obj->string and string->obj here. +;; +(define (nmsg-transport:client-api-send-receive-raw socreq indat #!key (enable-send #t)(timeout 25)) + (let* ((success #f) + (result #f) + (keepwaiting #t) + (dat (db:obj->string indat transport: 'nmsg)) + (send-recv (make-thread + (lambda () + (nn-send socreq dat) + (let* ((res (nn-recv socreq))) + (set! success #t) + (set! result (db:string->obj res transport: 'nmsg)))) + "send-recv")) + (timeout (make-thread + (lambda () + (let loop ((count 0)) + (thread-sleep! 1) + (debug:print-info 1 "send-receive-raw, still waiting after " count " seconds...") + (if (and keepwaiting (< count timeout)) ;; yes, this is very aproximate + (loop (+ count 1)))) + (if keepwaiting + (begin + (print "timeout waiting for ping") + (thread-terminate! send-recv)))) + "timeout"))) + ;; replace with condition-case? + (handle-exceptions + exn + (set! result "timeout") + (thread-start! timeout) + (thread-start! send-recv) + (thread-join! send-recv) + (if success (thread-terminate! timeout))) + ;; raise timeout error if timed out + (if success + (if (and (vector? result) + (vector-ref result 0)) ;; did it fail at the server? + result ;; nope, all good + (begin + (debug:print 0 "ERROR: error occured at server, info=" (vector-ref result 2)) + (debug:print 0 " client call chain:") + (print-call-chain (current-error-port)) + (debug:print 0 " server call chain:") + (pp (vector-ref result 1) (current-error-port)) + (signal (vector-ref result 0)))) + (signal (make-composite-condition + (make-property-condition 'timeout 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))) + +;; run nmsg-transport:keep-running in a parallel thread to monitor that the db is being +;; used and to shutdown after sometime if it is not. +;; +(define (nmsg-transport:keep-running server-id run-id) + ;; if none running or if > 20 seconds since + ;; server last used then start shutdown + ;; This thread waits for the server to come alive + (let* ((server-info (let loop () + (let ((sdat #f)) + (mutex-lock! *heartbeat-mutex*) + (set! sdat *server-info*) + (mutex-unlock! *heartbeat-mutex*) + (if sdat + (begin + (debug:print-info 0 "keep-running got sdat=" sdat) + sdat) + (begin + (thread-sleep! 0.5) + (loop)))))) + (iface (car server-info)) + (port (cadr server-info)) + (last-access 0) + (tdbdat (tasks:open-db)) + (server-timeout (let ((tmo (configf:lookup *configdat* "server" "timeout"))) + (if (and (string? tmo) + (string->number tmo)) + (* 60 60 (string->number tmo)) + ;; (* 3 24 60 60) ;; default to three days + (* 60 1) ;; default to one minute + ;; (* 60 60 25) ;; default to 25 hours + )))) + (print "Keep-running got server pid " server-id ", using iface " iface " and port " port) + (let loop ((count 0)) + (thread-sleep! 4) ;; no need to do this very often + ;; NB// sync currently does NOT return queue-length + (let () ;; (queue-len (cdb:client-call server-info 'sync #t 1))) + ;; (print "Server running, count is " count) + (if (< count 1) ;; 3x3 = 9 secs aprox + (loop (+ count 1))) + + (mutex-lock! *heartbeat-mutex*) + (set! last-access *last-db-access*) + (mutex-unlock! *heartbeat-mutex*) + (db:sync-touched *inmemdb* run-id force-sync: #t) + (if (and *server-run* + (> (+ last-access server-timeout) + (current-seconds))) + (begin + (debug:print-info 0 "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) + (loop 0)) + (begin + (debug:print-info 0 "Starting to shutdown the server.") + (set! *time-to-exit* #t) + (db:sync-touched *inmemdb* run-id force-sync: #t) + (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running") + (debug:print-info 0 "Server shutdown complete. Exiting") + (exit) + )))))) + +;;====================================================================== +;; C L I E N T S +;;====================================================================== + +(define (nmsg-transport:client-connect iface portnum) + (let* ((reqsoc (nmsg-transport:ping iface portnum return-socket: #t))) + (vector iface portnum #f #f #f (current-seconds) reqsoc))) + +;; returns result, there is no sucess/fail flag - handled via excpections +;; +(define (nmsg-transport:client-api-send-receive run-id connection-info cmd param #!key (remtries 5)) + ;; NB// In the html version of this routine there is a call to + ;; tasks:kill-server-run-id when there is an exception + (mutex-lock! *http-mutex*) + (let* ((packet (vector cmd param)) + (reqsoc (http-transport:server-dat-get-socket connection-info)) + (res (nmsg-transport:client-api-send-receive-raw reqsoc packet))) +;; (status (vector-ref rawres 0)) +;; (result (vector-ref rawres 1))) + (mutex-unlock! *http-mutex*) + res)) ;; (vector status (if status (db:string->obj result transport: 'nmsg) result)))) + +;;====================================================================== +;; J U N K +;;====================================================================== + +;; DO NOT USE +;; +(define (nmsg-transport:client-signal-handler signum) + (handle-exceptions + exn + (debug:print " ... exiting ...") + (let ((th1 (make-thread (lambda () + (if (not *received-response*) + (receive-message* *runremote*))) ;; flush out last call if applicable + "eat response")) + (th2 (make-thread (lambda () + (debug:print 0 "ERROR: Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.") + (thread-sleep! 3) ;; give the flush three seconds to do it's stuff + (debug:print 0 " Done.") + (exit 4)) + "exit on ^C timer"))) + (thread-start! th2) + (thread-start! th1) + (thread-join! th2)))) + Index: process.scm ================================================================== --- process.scm +++ process.scm @@ -101,12 +101,23 @@ (append result (list curr))) result)))) ;; here is an example line where the shell is sh or bash ;; "find / -print 2&>1 > findall.log" -(define (run-n-wait cmdline) - (let ((pid (process-run cmdline))) +(define (run-n-wait cmdline #!key (params #f)(print-cmd #f)) + (if print-cmd + (debug:print 0 + (if (string? print-cmd) + print-cmd + "") + cmdline + (if params + (string-intersperse params " ") + ""))) + (let ((pid (if params + (process-run cmdline params) + (process-run cmdline)))) (let loop ((i 0)) (let-values (((pid-val exit-status exit-code) (process-wait pid #t))) (if (eq? pid-val 0) (begin (thread-sleep! 2) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -13,30 +13,44 @@ (declare (unit rmt)) (declare (uses api)) (declare (uses tdb)) (declare (uses http-transport)) +(declare (uses nmsg-transport)) ;; ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! ;; ;; ;; For debugging add the following to ~/.megatestrc -;; +;;a ;; (require-library trace) ;; (import trace) ;; (trace ;; rmt:send-receive ;; api:execute-requests ;; ) +;; generate entries for ~/.megatestrc with the following +;; +;; grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u + ;;====================================================================== ;; S U P P O R T F U N C T I O N S ;;====================================================================== -;; #t means - please start a server! +;; NOT USED? +;; +;; (define (rmt:call-transport run-id connection-info cmd jparams) +;; (case (server:get-transport) +;; ((rpc) ( rpc-transport:client-api-send-receive run-id connection-info cmd jparams)) +;; ((http) (http-transport:client-api-send-receive run-id connection-info cmd jparams)) +;; ((fs) ( fs-transport:client-api-send-receive run-id connection-info cmd jparams)) +;; ((zmq) (zmq-transport:client-api-send-receive run-id connection-info cmd jparams)) +;; (else ( rpc-transport:client-api-send-receive run-id connection-info cmd jparams)))) + ;; (define (rmt:write-frequency-over-limit? cmd run-id) (and (not (member cmd api:read-only-queries)) (let* ((tmprec (hash-table-ref/default *write-frequency* run-id #f)) (record (if tmprec tmprec @@ -53,10 +67,13 @@ (begin (debug:print-info 1 "db write rate too high, starting a server, count=" count " start=" start " run-id=" run-id " queries-per-second=" queries-per-second) #t) #f)))) +;; if a server is either running or in the process of starting call client:setup +;; else return #f to let the calling proc know that there is no server available +;; (define (rmt:get-connection-info run-id) (let ((cinfo (hash-table-ref/default *runremote* run-id #f))) (if cinfo cinfo ;; NB// can cache the answer for server running for 10 seconds ... @@ -63,63 +80,91 @@ ;; ;; (and (not (rmt:write-frequency-over-limit? cmd run-id)) (if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id) (client:setup run-id) #f)))) -;; cmd is a symbol -;; vars is a json string encoding the parameters for the call -;; -(define (rmt:send-receive cmd rid params #!key (attemptnum 0)) +(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id +(define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected ;; clean out old connections (mutex-lock! *db-multi-sync-mutex*) - (let ((expire-time (- (current-seconds) 60))) + (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin (for-each (lambda (run-id) (let ((connection (hash-table-ref/default *runremote* run-id #f))) - (if (and connection - (< (http-transport:server-dat-get-last-access connection) expire-time)) - (begin - (debug:print-info 0 "Discarding connection to server for run-id " run-id ", too long between accesses") - (hash-table-delete! *runremote* run-id))))) + (if (and (vector? connection) + (< (http-transport:server-dat-get-last-access connection) expire-time)) + (begin + (debug:print-info 0 "Discarding connection to server for run-id " run-id ", too long between accesses") + ;; SHOULD CLOSE THE CONNECTION HERE + (case *transport-type* + ((nmsg)(nn-close (http-transport:server-dat-get-socket + (hash-table-ref *runremote* run-id))))) + (hash-table-delete! *runremote* run-id))))) (hash-table-keys *runremote*))) (mutex-unlock! *db-multi-sync-mutex*) + ;; (mutex-lock! *send-receive-mutex*) (let* ((run-id (if rid rid 0)) - (connection-info (rmt:get-connection-info run-id)) - (jparams (db:obj->string params))) + (connection-info (rmt:get-connection-info run-id))) + ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also) (if connection-info ;; use the server if have connection info - (let* ((dat (http-transport:client-api-send-receive run-id connection-info cmd jparams)) - (res (if (and dat (vector? dat)) (vector-ref dat 1) #f)) - (success (if (and dat (vector? dat)) (vector-ref dat 0) #f))) - (http-transport:server-dat-update-last-access connection-info) + (let* ((dat (case *transport-type* + ((http)(condition-case + (http-transport:client-api-send-receive run-id connection-info cmd params) + ((commfail)(vector #f "communications fail")) + ((exn)(vector #f "other fail")))) + ((nmsg)(condition-case + (nmsg-transport:client-api-send-receive run-id connection-info cmd params) + ((timeout)(vector #f "timeout talking to server")))) + (else (exit)))) + (success (if (vector? dat) (vector-ref dat 0) #f)) + (res (if (vector? dat) (vector-ref dat 1) #f))) + (if (vector? connection-info)(http-transport:server-dat-update-last-access connection-info)) (if success - (db:string->obj res) - ;; (if (< attemptnum 100) - ;; (begin - ;; (hash-table-delete! *runremote* run-id) - ;; (thread-sleep! 0.5) - ;; (rmt:send-receive cmd rid params attempnum: (+ attemptnum 1))) - ;; (begin - ;; (print-call-chain (current-error-port)) - ;; (debug:print 0 "ERROR: too many attempts to communicate have failed. Giving up. Kill your mtest processes and start over") - ;; (exit 1))))) + (begin + ;; (mutex-unlock! *send-receive-mutex*) + (case *transport-type* + ((http) res) ;; (db:string->obj res)) + ((nmsg) res))) ;; (vector-ref res 1))) (begin ;; let ((new-connection-info (client:setup run-id))) - (debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.") + (debug:print 0 "WARNING: Communication failed, trying call to rmt:send-receive again.") + ;; (case *transport-type* + ;; ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info)))) (hash-table-delete! *runremote* run-id) ;; don't keep using the same connection + ;; NOTE: killing server causes this process to block forever. No idea why. Dec 2. + ;; (if (eq? (modulo attemptnum 5) 0) + ;; (tasks:kill-server-run-id run-id tag: "api-send-receive-failed")) + ;; (mutex-unlock! *send-receive-mutex*) ;; close the mutex here to allow other threads access to communications + (tasks:start-and-wait-for-server (tasks:open-db) run-id 15) + ;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1)))))) ;; no longer killing the server in http-transport:client-api-send-receive ;; may kill it here but what are the criteria? ;; start with three calls then kill server - (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id)) - (thread-sleep! 2) + ;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id)) + ;; (thread-sleep! 2) (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))))) - (if (and (< attemptnum 10) - (tasks:need-server run-id)) + ;; no connection info? try to start a server, or access locally if no + ;; server and the query is read-only + ;; + ;; Note: The tasks db was checked for a server in starting mode in the rmt:get-connection-info call + ;; + (if (and (< attemptnum 15) + (member cmd api:write-queries)) (begin + (hash-table-delete! *runremote* run-id) + ;; (mutex-unlock! *send-receive-mutex*) (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10) - (rmt:send-receive cmd rid params (+ attemptnum 1))) - (rmt:open-qry-close-locally cmd run-id params))))) + ;; (client:setup run-id) ;; client setup happens in rmt:get-connection-info + (thread-sleep! (random 5)) ;; give some time to settle and minimize collison? + (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) + (begin + ;; (debug:print 0 "ERROR: Communication failed!") + ;; (mutex-unlock! *send-receive-mutex*) + ;; (exit) + (rmt:open-qry-close-locally cmd run-id params) + ))))) (define (rmt:update-db-stats run-id rawcmd params duration) (mutex-lock! *db-stats-mutex*) (handle-exceptions exn @@ -128,11 +173,11 @@ (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) (print "exn=" (condition->list exn)) #f) ;; if this fails we don't care, it is just stats (let* ((cmd (conc "run-id=" run-id " " (if (eq? rawcmd 'general-call) (car params) rawcmd))) (stat-vec (hash-table-ref/default *db-stats* cmd #f))) - (if (not stat-vec) + (if (not (vector? stat-vec)) (let ((newvec (vector 0 0))) (hash-table-set! *db-stats* cmd newvec) (set! stat-vec newvec))) (vector-set! stat-vec 0 (+ (vector-ref stat-vec 0) 1)) (vector-set! stat-vec 1 (+ (vector-ref stat-vec 1) duration)))) @@ -174,42 +219,60 @@ (cons 'none 0)) (loop (car tal)(cdr tal) newmax-cmd currmax))))))) (mutex-unlock! *db-stats-mutex*) res)) -(define (rmt:open-qry-close-locally cmd run-id params) +(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5)) (let* ((dbstruct-local (if *dbstruct-db* *dbstruct-db* (let* ((dbdir (conc (configf:lookup *configdat* "setup" "linktree") "/.db")) (db (make-dbr:dbstruct path: dbdir local: #t))) (set! *dbstruct-db* db) db))) - (db-file-path (db:dbfile-path 0))) - ;; (read-only (not (file-read-access? db-file-path))) - (let* ((start (current-milliseconds)) - (res (api:execute-requests dbstruct-local (symbol->string cmd) params)) - (duration (- (current-milliseconds) start))) - (rmt:update-db-stats run-id cmd params duration) - ;; mark this run as dirty if this was a write - (if (not (member cmd api:read-only-queries)) - (let ((start-time (current-seconds))) - (mutex-lock! *db-multi-sync-mutex*) - ;; (if (not (hash-table-ref/default *db-local-sync* run-id #f)) - ;; just set it every time. Is a write more expensive than a read and does it matter? - (hash-table-set! *db-local-sync* (or run-id 0) start-time) ;; the oldest "write" - (mutex-unlock! *db-multi-sync-mutex*))) - res))) + (db-file-path (db:dbfile-path 0)) + ;; (read-only (not (file-read-access? db-file-path))) + (start (current-milliseconds)) + (resdat (api:execute-requests dbstruct-local (vector (symbol->string cmd) params))) + (success (vector-ref resdat 0)) + (res (vector-ref resdat 1)) + (duration (- (current-milliseconds) start))) + (if (not success) + (if (> remretries 0) + (begin + (debug:print 0 "ERROR: local query failed. Trying again.") + (thread-sleep! (/ (random 5000) 1000)) ;; some random delay + (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1))) + (begin + (debug:print 0 "ERROR: too many retries in rmt:open-qry-close-locally, giving up") + #f)) + (begin + (rmt:update-db-stats run-id cmd params duration) + ;; mark this run as dirty if this was a write + (if (not (member cmd api:read-only-queries)) + (let ((start-time (current-seconds))) + (mutex-lock! *db-multi-sync-mutex*) + ;; (if (not (hash-table-ref/default *db-local-sync* run-id #f)) + ;; just set it every time. Is a write more expensive than a read and does it matter? + (hash-table-set! *db-local-sync* (or run-id 0) start-time) ;; the oldest "write" + (mutex-unlock! *db-multi-sync-mutex*))) + res)))) (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) (let* ((run-id (if run-id run-id 0)) - (jparams (db:obj->string params)) ;; (rmt:dat->json-str params)) - (dat (http-transport:client-api-send-receive run-id connection-info cmd jparams))) - (if (and dat (vector-ref dat 0)) - (db:string->obj (vector-ref dat 1)) - (begin - (debug:print 0 "ERROR: rmt:send-receive-no-auto-client-setup failed, attempting to continue. Got " dat) - dat)))) + ;; (jparams (db:obj->string params)) ;; (rmt:dat->json-str params)) + (res (handle-exceptions + exn + #f + (http-transport:client-api-send-receive run-id connection-info cmd params)))) +;; ((commfail) (vector #f "communications fail"))))) + (if (and res (vector-ref res 0)) + (vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!! + #f))) +;; (db:string->obj (vector-ref dat 1)) +;; (begin +;; (debug:print 0 "ERROR: rmt:send-receive-no-auto-client-setup failed, attempting to continue. Got " dat) +;; dat)))) ;; Wrap json library for strings (why the ports crap in the first place?) (define (rmt:dat->json-str dat) (with-output-to-string (lambda () @@ -242,14 +305,17 @@ (define (rmt:login run-id) (rmt:send-receive 'login run-id (list *toppath* megatest-version run-id *my-client-signature*))) ;; This login does no retries under the hood - it acts a bit like a ping. +;; Deprecated for nmsg-transport. ;; (define (rmt:login-no-auto-client-setup connection-info run-id) - (rmt:send-receive-no-auto-client-setup connection-info 'login run-id (list *toppath* megatest-version run-id *my-client-signature*))) - + (case *transport-type* + ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login run-id (list *toppath* megatest-version run-id *my-client-signature*))) + ((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*))))) + ;; hand off a call to one of the db:queries statements ;; added run-id to make looking up the correct db possible ;; (define (rmt:general-call stmtname run-id . params) (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params))) @@ -315,10 +381,14 @@ (begin (debug:print "ERROR: rmt:get-tests-for-run called with bad run-id=" run-id) (print-call-chain (current-error-port)) '()))) +;; get stuff via synchash +(define (rmt:synchash-get run-id proc synckey keynum params) + (rmt:send-receive 'synchash-get run-id (list run-id proc synckey keynum params))) + ;; IDEA: Threadify these - they spend a lot of time waiting ... ;; (define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) (let ((multi-run-mutex (make-mutex)) (run-id-list (if run-ids @@ -325,33 +395,31 @@ run-ids (rmt:get-all-run-ids))) (result '())) (if (null? run-id-list) '() - (for-each - (lambda (th) - - (thread-join! th)) ;; I assume that joining completed threads just moves on - (let loop ((hed (car run-id-list)) - (tal (cdr run-id-list)) - (threads '())) - (let* ((newthread (make-thread - (lambda () - (let ((res (rmt:send-receive 'get-tests-for-run-mindata hed (list hed testpatt states status not-in)))) - (if (list? res) - (begin - (mutex-lock! multi-run-mutex) - (set! result (append result res)) - (mutex-unlock! multi-run-mutex)) - (debug:print 0 "ERROR: get-tests-for-run-mindata failed for run-id " hed ", testpatt " testpatt ", states " states ", status " status ", not-in " not-in)))) - (conc "multi-run-thread for run-id " hed))) - (newthreads (cons newthread threads))) - (thread-start! newthread) - (thread-sleep! 0.5) ;; give that thread some time to start - (if (null? tal) - newthreads - (loop (car tal)(cdr tal) newthreads)))))) + (let loop ((hed (car run-id-list)) + (tal (cdr run-id-list)) + (threads '())) + (if (> (length threads) 5) + (loop hed tal (filter (lambda (th)(not (member (thread-state th) '(terminated dead)))) threads)) + (let* ((newthread (make-thread + (lambda () + (let ((res (rmt:send-receive 'get-tests-for-run-mindata hed (list hed testpatt states status not-in)))) + (if (list? res) + (begin + (mutex-lock! multi-run-mutex) + (set! result (append result res)) + (mutex-unlock! multi-run-mutex)) + (debug:print 0 "ERROR: get-tests-for-run-mindata failed for run-id " hed ", testpatt " testpatt ", states " states ", status " status ", not-in " not-in)))) + (conc "multi-run-thread for run-id " hed))) + (newthreads (cons newthread threads))) + (thread-start! newthread) + (thread-sleep! 0.5) ;; give that thread some time to start + (if (null? tal) + newthreads + (loop (car tal)(cdr tal) newthreads)))))) result)) ;; ;; IDEA: Threadify these - they spend a lot of time waiting ... ;; ;; ;; (define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) @@ -482,11 +550,12 @@ (define (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit) (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit))) (define (rmt:find-and-mark-incomplete run-id ovr-deadtime) - (rmt:send-receive 'find-and-mark-incomplete run-id (list run-id ovr-deadtime))) + (if (rmt:send-receive 'have-incompletes? run-id (list run-id ovr-deadtime)) + (rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime)))) ;;====================================================================== ;; M U L T I R U N Q U E R I E S ;;====================================================================== @@ -573,5 +642,37 @@ (define (rmt:test-data-rollup run-id test-id status) (rmt:send-receive 'test-data-rollup run-id (list run-id test-id status))) (define (rmt:csv->test-data run-id test-id csvdata) (rmt:send-receive 'csv->test-data run-id (list run-id test-id csvdata))) + +;;====================================================================== +;; T A S K S +;;====================================================================== + +(define (rmt:tasks-find-task-queue-records target run-name test-patt state-patt action-patt) + (rmt:send-receive 'find-task-queue-records #f (list target run-name test-patt state-patt action-patt))) + +(define (rmt:tasks-add action owner target runname testpatt params) + (rmt:send-receive 'tasks-add #f (list action owner target runname testpatt params))) + +(define (rmt:tasks-set-state-given-param-key param-key new-state) + (rmt:send-receive 'tasks-set-state-given-param-key #f (list param-key new-state))) + +;;====================================================================== +;; A R C H I V E S +;;====================================================================== + +(define (rmt:archive-get-allocations testname itempath dneeded) + (rmt:send-receive 'archive-get-allocations #f (list testname itempath dneeded))) + +(define (rmt:archive-register-block-name bdisk-id archive-path) + (rmt:send-receive 'archive-register-block-name #f (list bdisk-id archive-path))) + +(define (rmt:archive-allocate-testsuite/area-to-block block-id testsuite-name areakey) + (rmt:send-receive 'archive-allocate-test-to-block #f (list block-id testsuite-name areakey))) + +(define (rmt:archive-register-disk bdisk-name bdisk-path df) + (rmt:send-receive 'archive-register-disk #f (list bdisk-name bdisk-path df))) + +(define (rmt:test-set-archive-block-id run-id test-id archive-block-id) + (rmt:send-receive 'test-set-archive-block-id run-id (list run-id test-id archive-block-id))) ADDED rpc-transport.scm Index: rpc-transport.scm ================================================================== --- /dev/null +++ rpc-transport.scm @@ -0,0 +1,226 @@ + +;; Copyright 2006-2012, Matthew Welland. +;; +;; This program is made available under the GNU GPL version 2.0 or +;; greater. See the accompanying file COPYING for details. +;; +;; This program is distributed WITHOUT ANY WARRANTY; without even the +;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +;; PURPOSE. + +(require-extension (srfi 18) extras tcp s11n rpc) +(import (prefix rpc rpc:)) + +(use sqlite3 srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) +(import (prefix sqlite3 sqlite3:)) + +(declare (unit rpc-transport)) + +(declare (uses common)) +(declare (uses db)) +(declare (uses tests)) +(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. + +(include "common_records.scm") +(include "db_records.scm") + +;; procstr is the name of the procedure to be called as a string +(define (rpc-transport:autoremote procstr params) + (handle-exceptions + exn + (begin + (debug:print 1 "Remote failed for " proc " " params) + (apply (eval (string->symbol procstr)) params)) + ;; (if *runremote* + ;; (apply (eval (string->symbol (conc "remote:" procstr))) params) + (apply (eval (string->symbol procstr)) params))) + +;; all routes though here end in exit ... +;; +;; start_server? +;; +(define (rpc-transport:launch run-id) + (set! *run-id* run-id) + (if (args:get-arg "-daemonize") + (daemon:ize)) + (if (server:check-if-running run-id) + (begin + (debug:print 0 "INFO: Server for run-id " run-id " already running") + (exit 0))) + (let loop ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id)) + (remtries 4)) + (if (not server-id) + (if (> remtries 0) + (begin + (thread-sleep! 2) + (loop (open-run-close tasks:server-lock-slot tasks:open-db run-id) + (- remtries 1))) + (begin + ;; since we didn't get the server lock we are going to clean up and bail out + (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") + (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db " rpc-transport:launch"))) + (begin + (rpc-transport:run (if (args:get-arg "-server")(args:get-arg "-server") "-") run-id server-id) + (exit))))) + +(define (rpc-transport:run hostn run-id server-id) + (debug:print 2 "Attempting to start the rpc server ...") + ;; (trace rpc:publish-procedure!) + + (rpc:publish-procedure! 'server:login server:login) + (rpc:publish-procedure! 'testing (lambda () "Just testing")) + + (let* ((db #f) + (hostname (get-host-name)) + (ipaddrstr (let ((ipstr (if (string=? "-" hostn) + ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") + (server:get-best-guess-address hostname) + #f))) + (if ipstr ipstr hostn))) ;; hostname))) + (start-port (open-run-close tasks:server-get-next-port tasks:open-db)) + (link-tree-path (configf:lookup *configdat* "setup" "linktree")) + (rpc:listener (rpc-transport:find-free-port-and-open (rpc:default-server-port))) + (th1 (make-thread + (lambda () + ((rpc:make-server rpc:listener) #t)) + "rpc:server")) + ;; (cute (rpc:make-server rpc:listener) "rpc:server") + ;; 'rpc:server)) + (hostname (if (string=? "-" hostn) + (get-host-name) + hostn)) + (ipaddrstr (if (string=? "-" hostn) + (server:get-best-guess-address hostname) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") + #f)) + (portnum (rpc:default-server-port)) + (host:port (conc (if ipaddrstr ipaddrstr hostname) ":" portnum)) + (tdb (tasks:open-db))) + (thread-start! th1) + (set! db *inmemdb*) + (open-run-close tasks:server-set-interface-port + tasks:open-db + server-id + ipaddrstr portnum) + (debug:print 0 "Server started on " host:port) + + ;; (trace rpc:publish-procedure!) + ;; (rpc:publish-procedure! 'server:login server:login) + ;; (rpc:publish-procedure! 'testing (lambda () "Just testing")) + + ;;====================================================================== + ;; ;; end of publish-procedure section + ;;====================================================================== + ;; + (on-exit (lambda () + (open-run-close tasks:server-set-state! tasks:open-db server-id "stopped"))) + + (set! *rpc:listener* rpc:listener) + (tasks:server-set-state! tdb server-id "running") + (set! *inmemdb* (db:setup run-id)) + ;; if none running or if > 20 seconds since + ;; server last used then start shutdown + (let loop ((count 0)) + (thread-sleep! 5) ;; no need to do this very often + (let ((numrunning -1)) ;; (db:get-count-tests-running db))) + (if (or (> numrunning 0) + (> (+ *last-db-access* 60)(current-seconds))) + (begin + (debug:print-info 0 "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*)) + (loop (+ 1 count))) + (begin + (debug:print-info 0 "Starting to shutdown the server side") + (open-run-close tasks:server-delete-record tasks:open-db server-id " rpc-transport:try-start-server stop") + (thread-sleep! 10) + (debug:print-info 0 "Max cached queries was " *max-cache-size*) + (debug:print-info 0 "Server shutdown complete. Exiting") + )))))) + +(define (rpc-transport:find-free-port-and-open port) + (handle-exceptions + exn + (begin + (print "Failed to bind to port " (rpc:default-server-port) ", trying next port") + (rpc-transport:find-free-port-and-open (+ port 1))) + (rpc:default-server-port port) + (tcp-read-timeout 240000) + (tcp-listen (rpc:default-server-port) 10000))) + +(define (rpc-transport:ping run-id host port) + (handle-exceptions + exn + (begin + (print "SERVER_NOT_FOUND") + (exit 1)) + (let ((login-res ((rpc:procedure 'server:login host port) *toppath*))) + (if (and (list? login-res) + (car login-res)) + (begin + (print "LOGIN_OK") + (exit 0)) + (begin + (print "LOGIN_FAILED") + (exit 1)))))) + +(define (rpc-transport:client-setup run-id #!key (remtries 10)) + (if *runremote* + (begin + (debug:print 0 "ERROR: Attempt to connect to server but already connected") + #f) + (let* ((host-info (hash-table-ref/default *runremote* run-id #f))) ;; (open-run-close db:get-var #f "SERVER")) + (if host-info + (let ((iface (car host-info)) + (port (cadr host-info)) + (ping-res ((rpc:procedure 'server:login host port) *toppath*))) + (if ping-res + (let ((server-dat (list iface port #f #f #f))) + (hash-table-set! *runremote* run-id server-dat) + server-dat) + (begin + (server:try-running run-id) + (thread-sleep! 2) + (rpc-transport:client-setup run-id (- remtries 1))))) + (let* ((server-db-info (open-run-close tasks:get-server tasks:open-db run-id))) + (debug:print-info 0 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) + (if server-db-info + (let* ((iface (tasks:hostinfo-get-interface server-db-info)) + (port (tasks:hostinfo-get-port server-db-info)) + (server-dat (list iface port #f #f #f)) + (ping-res ((rpc:procedure 'server:login host port) *toppath*))) + (if start-res + (begin + (hash-table-set! *runremote* run-id server-dat) + server-dat) + (begin + (server:try-running run-id) + (thread-sleep! 2) + (rpc-transport:client-setup run-id (- remtries 1))))) + (begin + (server:try-running run-id) + (thread-sleep! 2) + (rpc-transport:client-setup run-id (- remtries 1))))))))) +;; +;; (port (if (and hostinfo (> (length hostdat) 1))(cadr hostdat) #f))) +;; (if (and port +;; (string->number port)) +;; (let ((portn (string->number port))) +;; (debug:print-info 2 "Setting up to connect to host " host ":" port) +;; (handle-exceptions +;; exn +;; (begin +;; (debug:print 0 "ERROR: Failed to open a connection to the server at host: " host " port: " port) +;; (debug:print 0 " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn)) +;; ;; (open-run-close +;; ;; (lambda (db . param) +;; ;; (sqlite3:execute db "DELETE FROM metadat WHERE var='SERVER'")) +;; ;; #f) +;; (set! *runremote* #f)) +;; (if (and (not (args:get-arg "-server")) ;; no point in the server using the server using the server +;; ((rpc:procedure 'server:login host portn) *toppath*)) +;; (begin +;; (debug:print-info 2 "Logged in and connected to " host ":" port) +;; (set! *runremote* (vector host portn))) +;; (begin +;; (debug:print-info 2 "Failed to login or connect to " host ":" port) +;; (set! *runremote* #f))))) +;; (debug:print-info 2 "no server available"))))) + Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -19,10 +19,11 @@ (declare (uses items)) (declare (uses runconfig)) (declare (uses tests)) (declare (uses server)) (declare (uses mt)) +(declare (uses archive)) ;; (declare (uses filedb)) (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") @@ -160,13 +161,13 @@ (hash-table-set! *runs:denoise* key currtime) #t) #f))) (define (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) - ;;(thread-sleep! (cond - ;; ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while - ;; (else 0))) + (thread-sleep! (cond + ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while + (else 0))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (config-lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) @@ -223,17 +224,17 @@ (set-signal-handler! signal/int (lambda (signum) (signal-mask! signum) (print "Received signal " signum ", cleaning up before exit. Please wait...") (let ((tdbdat (tasks:open-db))) - (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "killed")) + (rmt:tasks-set-state-given-param-key task-key "killed")) (print "Killed by signal " signum ". Exiting") (exit))) ;; register this run in monitor.db - (tasks:add (db:delay-if-busy tdbdat) "run-tests" user target runname test-patts task-key) ;; params) - (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "running") + (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) + (rmt:tasks-set-state-given-param-key task-key "running") (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process (if (file-exists? runconfigf) (setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target) (debug:print 0 "WARNING: You do not have a run config file: " runconfigf)) @@ -367,20 +368,33 @@ (debug:print-info 1 "Adding " required-tests " to the run queue")) ;; NOTE: these are all parent tests, items are not expanded yet. (debug:print-info 4 "test-records=" (hash-table->alist test-records)) (let ((reglen (configf:lookup *configdat* "setup" "runqueue"))) (if (> (length (hash-table-keys test-records)) 0) - (let* ((keep-going #t) + (let* ((keep-going #t) + (run-queue-retries 5) (th1 (make-thread (lambda () - (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry)) + (handle-exceptions + exn + (begin + (print-call-chain (current-error-port)) + (debug:print 0 "ERROR: failure in runs:run-tests-queue thread, error: " ((condition-property-accessor 'exn 'message) exn)) + (if (> run-queue-retries 0) + (begin + (set! run-queue-retries (- run-queue-retries 1)) + (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry)))) + (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry))) "runs:run-tests-queue")) (th2 (make-thread (lambda () ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going - (rmt:find-and-mark-incomplete run-id #f))) ;; ovr-deadtime))) + (handle-exceptions + exn + (debug:print 0 "error in calling find-and-mark-incomplete for run-id " run-id) + (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) run-ids))) "runs: mark-incompletes"))) (thread-start! th1) (thread-start! th2) (thread-join! th1) @@ -394,11 +408,11 @@ (if (not (hash-table-ref/default flags "-rerun" #f)) (hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS")) (runs:run-tests target runname test-patts user flags run-count: (- run-count 1))))) (debug:print-info 0 "No tests to run"))) (debug:print-info 4 "All done by here") - (tasks:set-state-given-param-key (db:delay-if-busy tdbdat) task-key "done") + (rmt:tasks-set-state-given-param-key task-key "done") ;; (sqlite3:finalize! tasks-db) )) ;; loop logic. These are used in runs:run-tests-queue to make it a bit more readable. @@ -944,11 +958,11 @@ (num-running (rmt:get-count-tests-running-for-run-id run-id))) ;; every couple minutes verify the server is there for this run (if (and (common:low-noise-print 60 "try start server" run-id) (tasks:need-server run-id)) - (tasks:start-and-wait-for-server tdbdat run-id 10)) + (tasks:start-and-wait-for-server tdbdat run-id 10)) ;; NOTE: delay and wait is done under the hood (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running 240)) @@ -1401,11 +1415,11 @@ ;; 'remove-runs ;; 'set-state-status ;; ;; NB// should pass in keys? ;; -(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(remove-data-only #f)) +(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(mode 'remove-all)(options '())) (common:clear-caches) ;; clear all caches (let* ((db #f) (tdbdat (tasks:open-db)) (keys (rmt:get-keys)) (rundat (mt:get-runs-by-patt keys runnamepatt target)) @@ -1435,20 +1449,21 @@ (run-state (db:get-value-by-header run header "state")) (run-name (db:get-value-by-header run header "runname")) (tests (if (not (equal? run-state "locked")) (proc-get-tests run-id) '())) - (lasttpath "/does/not/exist/I/hope")) + (lasttpath "/does/not/exist/I/hope") + (worker-thread #f)) (debug:print-info 4 "runs:operate-on run=" run ", header=" header) (if (not (null? tests)) (begin (case action ((remove-runs) (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) ;; seek and kill in flight -runtests with % as testpatt here (if (equal? testpatt "%") - (tasks:kill-runner (db:delay-if-busy tdbdat) target run-name) + (tasks:kill-runner target run-name) (debug:print 0 "not attempting to kill any run launcher processes as testpatt is " testpatt)) (debug:print 1 "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((set-state-status) (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) (debug:print 1 "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname"))) @@ -1455,12 +1470,24 @@ ((print-run) (debug:print 1 "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header) action) ((run-wait) (debug:print 1 "Waiting for run " runkey ", run=" runnamepatt " to complete")) + ((archive) + (debug:print 1 "Archiving data for run: " runkey " " (db:get-value-by-header run header "runname")) + (set! worker-thread (make-thread (lambda () + (case (string->symbol (args:get-arg "-archive")) + ((save save-remove keep-html)(archive:run-bup (args:get-arg "-archive") run-id run-name tests)) + ((restore)(archive:bup-restore (args:get-arg "-archive") run-id run-name tests)) + (else (debug:print 0 "ERROR: unrecognised sub command to -archive. Run \"megatest\" to see help")))) + "archive-bup-thread")) + (thread-start! worker-thread)) (else (debug:print-info 0 "action not recognised " action))) + + ;; actions that operate on one test at a time can be handled below + ;; (let ((sorted-tests (sort tests (lambda (a b)(let ((dira ;; (rmt:sdb-qry 'getstr (db:test-get-rundir a)) ;; ) ;; (filedb:get-path *fdb* (db:test-get-rundir a))) (dirb ;; (rmt:sdb-qry 'getstr (db:test-get-rundir b))) ;; ) ;; ((filedb:get-path *fdb* (db:test-get-rundir b)))) (if (and (string? dira)(string? dirb)) @@ -1522,11 +1549,11 @@ ;; NOTE: This is suboptimal as the testdata will be used later and the state/status may have changed ... (if (null? tal) (loop new-test-dat tal) (loop (car tal)(append tal (list new-test-dat))))) (begin - (runs:remove-test-directory db new-test-dat remove-data-only) + (runs:remove-test-directory new-test-dat mode) ;; 'remove-all) (if (not (null? tal)) (loop (car tal)(cdr tal)))))))) ((set-state-status) (debug:print-info 2 "new state " (car state-status) ", new status " (cadr state-status)) (mt:test-set-state-status-by-id run-id (db:test-get-id test) (car state-status)(cadr state-status) #f) @@ -1536,12 +1563,21 @@ (debug:print-info 2 "still waiting, " (length tests) " tests still running") (thread-sleep! 10) (let ((new-tests (proc-get-tests run-id))) (if (null? new-tests) (debug:print-info 1 "Run completed according to zero tests matching provided criteria.") - (loop (car new-tests)(cdr new-tests)))))))) - ))))) + (loop (car new-tests)(cdr new-tests))))) + ((archive) + (if (not toplevel-with-children) + (begin + (debug:print-info 0 "Estimating disk space usage for " test-fulln) + (debug:print-info 0 " " (common:get-disk-space-used (conc run-dir "/"))))) + (if (not (null? tal)) + (loop (car tal)(cdr tal)))) + ))) + ) + (if worker-thread (thread-join! worker-thread)))))) ;; remove the run if zero tests remain (if (eq? action 'remove-runs) (let ((remtests (mt:get-tests-for-run (db:get-value-by-header run header "id") #f '("DELETED") '("n/a") not-in: #t))) (if (null? remtests) ;; no more tests remaining (let* ((dparts (string-split lasttpath "/")) @@ -1562,18 +1598,19 @@ runs) ;; (sqlite3:finalize! (db:delay-if-busy tdbdat)) ) #t) -(define (runs:remove-test-directory db test remove-data-only) +(define (runs:remove-test-directory test mode) ;; remove-data-only) (let* ((run-dir (db:test-get-rundir test)) ;; run dir is from the link tree (real-dir (if (file-exists? run-dir) (resolve-pathname run-dir) #f))) - (if remove-data-only - (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f) - (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f)) + (case mode + ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f)) + ((remove-all) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f)) + ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f))) (debug:print-info 1 "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) (if (and real-dir (> (string-length real-dir) 5) (file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc. (begin ;; let* ((realpath (resolve-pathname run-dir))) @@ -1602,13 +1639,14 @@ (not (member run-dir (list "n/a" "/tmp/badname")))) (debug:print 0 "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink") (debug:print 0 "NOTE: the run dir for this test is undefined. Test may have already been deleted.")) )) ;; Only delete the records *after* removing the directory. If things fail we have a record - (if remove-data-only - (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "NOT_STARTED" "n/a" #f) - (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test))))) + (case mode + ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "NOT_STARTED" "n/a" #f)) + ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVED" #f #f)) + (else (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test)))))) ;;====================================================================== ;; Routines for manipulating runs ;;====================================================================== @@ -1623,13 +1661,14 @@ (exit 3)) ((not runname) (debug:print 0 "ERROR: Missing required parameter for " switchname ", you must specify the run name with -runname runname") (exit 3)) (else - (let ((db #f) + (let (;; (db #f) (keys #f)) - (if (not (launch:setup-for-run)) + (if (launch:setup-for-run) + (launch:cache-config) (begin (debug:print 0 "Failed to setup, exiting") (exit 1))) ;; (if (args:get-arg "-server") ;; (cdb:remote-run server:start db (args:get-arg "-server"))) @@ -1641,12 +1680,13 @@ (if (hash-table-ref/default runconfig (args:get-arg "-reqtarg") #f) (keys:target-set-args keys (args:get-arg "-reqtarg") args:arg-hash) (begin (debug:print 0 "ERROR: [" (args:get-arg "-reqtarg") "] not found in " runconfigf) - (if db (sqlite3:finalize! db)) - (exit 1)))) + ;; (if db (sqlite3:finalize! db)) + (exit 1) + ))) (if (args:get-arg "-target") (keys:target-set-args keys (args:get-arg "-target" args:arg-hash) args:arg-hash))) (if (not (car *configinfo*)) (begin (debug:print 0 "ERROR: Attempted to " action-desc " but run area config file not found") @@ -1653,11 +1693,11 @@ (exit 1)) ;; Extract out stuff needed in most or many calls ;; here then call proc (let* ((keyvals (keys:target->keyval keys target))) (proc target runname keys keyvals))) - (if db (sqlite3:finalize! db)) + ;; (if db (sqlite3:finalize! db)) (set! *didsomething* #t)))))) ;;====================================================================== ;; Lock/unlock runs ;;====================================================================== Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -20,12 +20,13 @@ (declare (uses common)) (declare (uses db)) (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. (declare (uses synchash)) (declare (uses http-transport)) +(declare (uses rpc-transport)) +(declare (uses nmsg-transport)) (declare (uses launch)) -;; (declare (uses zmq-transport)) (declare (uses daemon)) (include "common_records.scm") (include "db_records.scm") @@ -47,37 +48,59 @@ ;; all routes though here end in exit ... ;; ;; start_server ;; (define (server:launch run-id) - (http-transport:launch run-id)) - -;;====================================================================== -;; Q U E U E M A N A G E M E N T -;;====================================================================== - -;; We don't want to flush the queue if it was just flushed -(define *server:last-write-flush* (current-milliseconds)) + (case *transport-type* + ((http)(http-transport:launch run-id)) + ((nmsg)(nmsg-transport:launch run-id)) + ((rpc) (rpc-transport:launch run-id)) + (else (debug:print 0 "ERROR: unknown server type " *transport-type*)))) +;; (else (debug:print 0 "ERROR: No known transport set, transport=" transport ", using rpc") +;; (rpc-transport:launch run-id))))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== +;; Get the transport +(define (server:get-transport) + (if *transport-type* + *transport-type* + (let ((ttype (string->symbol + (or (args:get-arg "-transport") + (configf:lookup *configdat* "server" "transport") + "rpc")))) + (set! *transport-type* ttype) + ttype))) + ;; Generate a unique signature for this server (define (server:mk-signature) (message-digest-string (md5-primitive) (with-output-to-string (lambda () (write (list (current-directory) (argv))))))) - ;; When using zmq this would send the message back (two step process) ;; with spiffy or rpc this simply returns the return data to be returned ;; (define (server:reply return-addr query-sig success/fail result) - (db:obj->string (vector success/fail query-sig result))) + (debug:print-info 11 "server:reply return-addr=" return-addr ", result=" result) + ;; (send-message pubsock target send-more: #t) + ;; (send-message pubsock + (case (server:get-transport) + ((rpc) (db:obj->string (vector success/fail query-sig result))) + ((http) (db:obj->string (vector success/fail query-sig result))) + ((zmq) + (let ((pub-socket (vector-ref *runremote* 1))) + (send-message pub-socket return-addr send-more: #t) + (send-message pub-socket (db:obj->string (vector success/fail query-sig result))))) + ((fs) result) + (else + (debug:print 0 "ERROR: unrecognised transport type: " *transport-type*) + result))) ;; Given a run id start a server process ### NOTE ### > file 2>&1 ;; if the run-id is zero and the target-host is set ;; try running on that host ;; @@ -108,10 +131,16 @@ (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) ;; (system cmdln) (pop-directory))) + +(define (server:get-client-signature) + (if *my-client-signature* *my-client-signature* + (let ((sig (server:mk-signature))) + (set! *my-client-signature* sig) + *my-client-signature*))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched (define (server:kind-run run-id) (let ((last-run-time (hash-table-ref/default *server-kind-run* run-id #f))) @@ -136,13 +165,17 @@ ;; note: client:start will set *runremote*. this needs to be changed ;; also, client:start will login to the server, also need to change that. ;; ;; client:start returns #t if login was successful. ;; - (let ((res (server:ping-server run-id - (tasks:hostinfo-get-interface server) - (tasks:hostinfo-get-port server)))) + (let ((res (case *transport-type* + ((http)(server:ping-server run-id + (tasks:hostinfo-get-interface server) + (tasks:hostinfo-get-port server))) + ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) + (tasks:hostinfo-get-port server) + timeout: 2))))) ;; if the server didn't respond we must remove the record (if res #t (begin (debug:print-info 0 "server at " server " not responding, removing record") @@ -196,5 +229,27 @@ (case (string->symbol res) ((NOREPLY) #f) ((LOGIN_OK) #t) (else #f)) (loop (read-line) inl)))))) + +(define (server:login toppath) + (lambda (toppath) + (set! *last-db-access* (current-seconds)) + (if (equal? *toppath* toppath) + (begin + ;; (debug:print-info 2 "login successful") + #t) + (begin + ;; (debug:print-info 2 "login failed") + #f)))) + +(define (server:get-timeout) + (let ((tmo (configf:lookup *configdat* "server" "timeout"))) + (if (and (string? tmo) + (string->number tmo)) + (* 60 60 (string->number tmo)) + ;; (* 3 24 60 60) ;; default to three days + (* 60 1) ;; default to one minute + ;; (* 60 60 25) ;; default to 25 hours + ))) + Index: synchash.scm ================================================================== --- synchash.scm +++ synchash.scm @@ -62,13 +62,12 @@ ;; (cdb:remote-run db:get-num-runs #f "%") ;; (cdb:remote-run db:get-runs #f runnamepatt numruns *start-run-offset* keypatts) ;; ;; keynum => the field to use as the unique key (usually 0 but can be other field) ;; -(define (synchash:client-get proc synckey keynum synchash . params) - (let* ((data ;; (apply cdb:remote-run synchash:server-get #f proc synckey keynum params)) - (apply synchash:server-get #f proc synckey keynum params)) +(define (synchash:client-get proc synckey keynum synchash run-id . params) + (let* ((data (rmt:synchash-get run-id proc synckey keynum params)) (newdat (car data)) (removs (cadr data)) (myhash (hash-table-ref/default synchash synckey #f))) (if (not myhash) (begin @@ -89,17 +88,18 @@ ;; data)) ;; return the changed and deleted list (list newdat removs))) ;; synchash)) (define *synchashes* (make-hash-table)) -(define (synchash:server-get indb proc synckey keynum . params) +(define (synchash:server-get dbstruct run-id proc synckey keynum params) ;; (debug:print-info 2 "synckey: " synckey ", keynum: " keynum ", params: " params) - (let* ((db (if indb indb (db:open-megatest-db))) + (let* ((dbdat (db:get-db dbstruct run-id)) + (db (db:dbdat-get-db dbdat)) (synchash (hash-table-ref/default *synchashes* synckey #f)) (newdat (apply (case proc ((db:get-runs) db:get-runs) - ((db:get-tests-for-runs-mindata) db:get-tests-for-runs-mindata) + ((db:get-tests-for-run-mindata) db:get-tests-for-run-mindata) ((db:get-test-info-by-ids) db:get-test-info-by-ids) (else (print "ERROR: sync for hash " proc " not setup! Edits needed in synchash.scm") print)) db params)) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -12,10 +12,11 @@ (use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking format) (import (prefix sqlite3 sqlite3:)) (declare (unit tasks)) (declare (uses db)) +(declare (uses rmt)) (declare (uses common)) (include "task_records.scm") ;;====================================================================== @@ -103,21 +104,24 @@ (db:set-sync mdb) ;; (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;")) ;; (if (or (and (not exists) ;; (file-write-access? *toppath*)) ;; (not (file-read-access? dbpath))) ;; (begin - (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS tasks_queue (id INTEGER PRIMARY KEY, - action TEXT DEFAULT '', - owner TEXT, - state TEXT DEFAULT 'new', - target TEXT DEFAULT '', - name TEXT DEFAULT '', - testpatt TEXT DEFAULT '', - keylock TEXT, - params TEXT, - creation_time TIMESTAMP, - execution_time TIMESTAMP);") + ;; + ;; TASKS QUEUE MOVED TO main.db + ;; + ;; (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS tasks_queue (id INTEGER PRIMARY KEY, + ;; action TEXT DEFAULT '', + ;; owner TEXT, + ;; state TEXT DEFAULT 'new', + ;; target TEXT DEFAULT '', + ;; name TEXT DEFAULT '', + ;; testpatt TEXT DEFAULT '', + ;; keylock TEXT, + ;; params TEXT, + ;; creation_time TIMESTAMP, + ;; execution_time TIMESTAMP);") (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS monitors (id INTEGER PRIMARY KEY, pid INTEGER, start_time TIMESTAMP, last_update TIMESTAMP, hostname TEXT, @@ -145,11 +149,10 @@ login_time TIMESTAMP, logout_time TIMESTAMP DEFAULT -1, CONSTRAINT clients_constraint UNIQUE (pid,hostname));") ;)) - (sqlite3:execute mdb "DELETE FROM tasks_queue WHERE state='done' AND creation_time < ?;" (- (current-seconds)(* 24 60 60))) ;; remove older than 24 hrs (set! *task-db* (cons mdb dbpath)) *task-db*)))) ;;====================================================================== ;; Server and client management @@ -167,29 +170,30 @@ (define (tasks:server-lock-slot mdb run-id) (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") (if (< (tasks:num-in-available-state mdb run-id) 4) (begin (tasks:server-set-available mdb run-id) - ;; (thread-sleep! 2) ;; Try removing this. It may not be needed. + (thread-sleep! (/ (random 1500) 1000)) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed. (tasks:server-am-i-the-server? mdb run-id)) #f)) ;; register that this server may come online (first to register goes though with the process) (define (tasks:server-set-available mdb run-id) (sqlite3:execute mdb "INSERT INTO servers (pid,hostname,port,pubport,start_time, priority,state,mt_version,heartbeat, interface,transport,run_id) VALUES(?, ?, ?, ?, strftime('%s','now'), ?, ?, ?,-1,?, ?, ?);" - (current-process-id) ;; pid - (get-host-name) ;; hostname - -1 ;; port - -1 ;; pubport - (random 1000) ;; priority (used a tiebreaker on get-available) - "available" ;; state - (common:version-signature) ;; mt_version - -1 ;; interface - "http" ;; transport + (current-process-id) ;; pid + (get-host-name) ;; hostname + -1 ;; port + -1 ;; pubport + (random 1000) ;; priority (used a tiebreaker on get-available) + "available" ;; state + (common:version-signature) ;; mt_version + -1 ;; interface + ;; (conc (server:get-transport)) ;; transport + (conc *transport-type*) ;; transport run-id )) (define (tasks:num-in-available-state mdb run-id) (let ((res 0)) @@ -277,24 +281,26 @@ port)))))) (define (tasks:server-am-i-the-server? mdb run-id) (let* ((all (tasks:server-get-servers-vying-for-run-id mdb run-id)) (first (if (null? all) - (begin (debug:print 0 "ERROR: no servers listed, should be at least one by now.") - (sqlite3:finalize! mdb) - (exit 1)) - (car (db:get-rows all)))) - (header (db:get-header all)) - (id (db:get-value-by-header first header "id")) - (hostname (db:get-value-by-header first header "hostname")) - (pid (db:get-value-by-header first header "pid")) - (priority (db:get-value-by-header first header "priority"))) - (debug:print 0 "INFO: am-i-the-server got record " first) - ;; for now a basic check. add tiebreaking by priority later - (if (and (equal? hostname (get-host-name)) - (equal? pid (current-process-id))) - id + #f;; (begin (debug:print 0 "ERROR: no servers listed, should be at least one by now.") + ;; (sqlite3:finalize! mdb) + ;; (exit 1)) + (car (db:get-rows all))))) + (if first + (let* ((header (db:get-header all)) + (id (db:get-value-by-header first header "id")) + (hostname (db:get-value-by-header first header "hostname")) + (pid (db:get-value-by-header first header "pid")) + (priority (db:get-value-by-header first header "priority"))) + ;; (debug:print 0 "INFO: am-i-the-server got record " first) + ;; for now a basic check. add tiebreaking by priority later + (if (and (equal? hostname (get-host-name)) + (equal? pid (current-process-id))) + id + #f)) #f))) ;; Use: (db:get-value-by-header (car (db:get-rows dat)) (db:get-header dat) "fieldname") ;; to extract info from the structure returned ;; @@ -355,24 +361,25 @@ mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id) res)) (define (tasks:need-server run-id) - (let ((forced (configf:lookup *configdat* "server" "required")) - (maxqry (cdr (rmt:get-max-query-average run-id))) - (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) - (cond - (forced - (if (common:low-noise-print 60 run-id "server required is set") - (debug:print-info 0 "Server required is set, starting server.")) - #t) - ((> maxqry threshold) - (if (common:low-noise-print 60 run-id "Max query time execeeded") - (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, starting server.")) - #t) - (else - #f)))) + (configf:lookup *configdat* "server" "required")) + +;; (maxqry (cdr (rmt:get-max-query-average run-id))) +;; (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) +;; (cond +;; (forced +;; (if (common:low-noise-print 60 run-id "server required is set") +;; (debug:print-info 0 "Server required is set, starting server for run-id " run-id ".")) +;; #t) +;; ((> maxqry threshold) +;; (if (common:low-noise-print 60 run-id "Max query time execeeded") +;; (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, server needed for run-id " run-id ".")) +;; #t) +;; (else +;; #f)))) ;; try to start a server and wait for it to be available ;; (define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries) ;; ensure a server is running for this run @@ -381,23 +388,37 @@ (if (and (not server-dat) (< delay-time delay-max-tries)) (begin (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id) (debug:print 0 "Try starting server for run-id " run-id)) + (thread-sleep! (/ (random 2000) 1000)) (server:kind-run run-id) - (thread-sleep! (min delay-time 5)) + (thread-sleep! (min delay-time 1)) (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1)))))) (define (tasks:get-all-servers mdb) (let ((res '())) (sqlite3:for-each-row (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res))) mdb - "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport,run_id FROM servers WHERE state NOT LIKE 'defunct%' ORDER BY start_time DESC;") + "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport,run_id + FROM servers WHERE state NOT LIKE 'defunct%' ORDER BY start_time DESC;") res)) + +(define (tasks:get-server-records mdb run-id) + (let ((res '())) + (sqlite3:for-each-row + (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) + ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 + (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res))) + mdb + "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport,run_id + FROM servers WHERE run_id=? AND state NOT LIKE 'defunct%' ORDER BY start_time DESC;" + run-id) + (reverse res))) ;; no elegance here ... ;; (define (tasks:kill-server hostname pid) (debug:print-info 0 "Attempting to kill server process " pid " on host " hostname) @@ -422,167 +443,18 @@ (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) - -;;====================================================================== -;; Tasks and Task monitors -;;====================================================================== - - -;;====================================================================== -;; Tasks -;;====================================================================== - - - -;;====================================================================== -;; Task Monitors -;;====================================================================== - -(define (tasks:register-monitor db mdb) - (let* ((pid (current-process-id)) - (hostname (get-host-name)) - (userinfo (user-information (current-user-id))) - (username (car userinfo))) - (print "Register monitor, pid: " pid ", hostname: " hostname ", username: " username) - (sqlite3:execute mdb "INSERT INTO monitors (pid,start_time,last_update,hostname,username) VALUES (?,strftime('%s','now'),strftime('%s','now'),?,?);" - pid hostname username))) - -(define (tasks:get-num-alive-monitors mdb) - (let ((res 0)) - (sqlite3:for-each-row - (lambda (count) - (set! res count)) - mdb - "SELECT count(id) FROM monitors WHERE last_update < (strftime('%s','now') - 300) AND username=?;" - (car (user-information (current-user-id)))) - res)) - -;; register a task -(define (tasks:add mdb action owner target runname testpatt params) - (sqlite3:execute mdb "INSERT INTO tasks_queue (action,owner,state,target,name,testpatt,params,creation_time,execution_time) - VALUES (?,?,'new',?,?,?,?,strftime('%s','now'),0);" - action - owner - target - runname - testpatt - (if params params ""))) - -(define (keys:key-vals-hash->target keys key-params) - (let ((tmp (hash-table-ref/default key-params (vector-ref (car keys) 0) ""))) - (if (> (length keys) 1) - (for-each (lambda (key) - (set! tmp (conc tmp "/" (hash-table-ref/default key-params (vector-ref key 0) "")))) - (cdr keys))) - tmp)) - -;; for use from the gui -(define (tasks:add-from-params mdb action keys key-params var-params) - (let ((target (keys:key-vals-hash->target keys key-params)) - (owner (car (user-information (current-user-id)))) - (runname (hash-table-ref/default var-params "runname" #f)) - (testpatts (hash-table-ref/default var-params "testpatts" "%")) - (params (hash-table-ref/default var-params "params" ""))) - (tasks:add mdb action owner target runname testpatts params))) - -;; return one task from those who are 'new' OR 'waiting' AND more than 10sec old -;; -(define (tasks:snag-a-task mdb) - (let ((res #f) - (keytxt (conc (current-process-id) "-" (get-host-name) "-" (car (user-information (current-user-id)))))) - - ;; first randomly set a new to pid-hostname-hostname - (sqlite3:execute - mdb - "UPDATE tasks_queue SET keylock=? WHERE id IN - (SELECT id FROM tasks_queue - WHERE state='new' OR - (state='waiting' AND (strftime('%s','now')-execution_time) > 10) OR - state='reset' - ORDER BY RANDOM() LIMIT 1);" keytxt) - - (sqlite3:for-each-row - (lambda (id . rem) - (set! res (apply vector id rem))) - mdb - "SELECT id,action,owner,state,target,name,test,item,params,creation_time,execution_time FROM tasks_queue WHERE keylock=? ORDER BY execution_time ASC LIMIT 1;" keytxt) - (if res ;; yep, have work to be done - (begin - (sqlite3:execute mdb "UPDATE tasks_queue SET state='inprogress',execution_time=strftime('%s','now') WHERE id=?;" - (tasks:task-get-id res)) - res) - #f))) - -(define (tasks:reset-stuck-tasks mdb) - (let ((res '())) - (sqlite3:for-each-row - (lambda (id delta) - (set! res (cons id res))) - mdb - "SELECT id,strftime('%s','now')-execution_time AS delta FROM tasks_queue WHERE state='inprogress' AND delta>700 ORDER BY delta DESC LIMIT 2;") - (sqlite3:execute - mdb - (conc "UPDATE tasks_queue SET state='reset' WHERE id IN ('" (string-intersperse (map conc res) "','") "');")))) - -;; return all tasks in the tasks_queue table -;; -(define (tasks:get-tasks mdb types states) - (let ((res '())) - (sqlite3:for-each-row - (lambda (id . rem) - (set! res (cons (apply vector id rem) res))) - mdb - (conc "SELECT id,action,owner,state,target,name,test,item,params,creation_time,execution_time - FROM tasks_queue " - ;; WHERE - ;; state IN " statesstr " AND - ;; action IN " actionsstr - " ORDER BY creation_time DESC;")) - res)) - -;; remove tasks given by a string of numbers comma separated -(define (tasks:remove-queue-entries mdb task-ids) - (sqlite3:execute mdb (conc "DELETE FROM tasks_queue WHERE id IN (" task-ids ");"))) - -;; -(define (tasks:start-monitor db mdb) - (if (> (tasks:get-num-alive-monitors mdb) 2) ;; have two running, no need for more - (debug:print-info 1 "Not starting monitor, already have more than two running") - (let* ((megatestdb (conc *toppath* "/megatest.db")) - (monitordbf (conc (configf:lookup *configdat* "setup" "linktree") "/.db/monitor.db")) - (last-db-update 0)) ;; (file-modification-time megatestdb))) - (task:register-monitor mdb) - (let loop ((count 0) - (next-touch 0)) ;; next-touch is the time where we need to update last_update - ;; if the db has been modified we'd best look at the task queue - (let ((modtime (file-modification-time megatestdbpath ))) - (if (> modtime last-db-update) - (tasks:process-queue db mdb last-db-update megatestdb next-touch)) - ;; WARNING: Possible race conditon here!! - ;; should this update be immediately after the task-get-action call above? - (if (> (current-seconds) next-touch) - (begin - (tasks:monitors-update mdb) - (loop (+ count 1)(+ (current-seconds) 240))) - (loop (+ count 1) next-touch))))))) - -(define (tasks:process-queue db mdb) - (let* ((task (tasks:snag-a-task mdb)) - (action (if task (tasks:task-get-action task) #f))) - (if action (print "tasks:process-queue task: " task)) - (if action - (case (string->symbol action) - ((run) (tasks:start-run db mdb task)) - ((remove) (tasks:remove-runs db mdb task)) - ((lock) (tasks:lock-runs db mdb task)) - ;; ((monitor) (tasks:start-monitor db task)) - ((rollup) (tasks:rollup-runs db mdb task)) - ((updatemeta)(tasks:update-meta db mdb task)) - ((kill) (tasks:kill-monitors db mdb task)))))) +;;====================================================================== +;; M O N I T O R S +;;====================================================================== + +(define (tasks:remove-monitor-record mdb) + (sqlite3:execute mdb "DELETE FROM monitors WHERE pid=? AND hostname=?;" + (current-process-id) + (get-host-name))) (define (tasks:get-monitors mdb) (let ((res '())) (sqlite3:for-each-row (lambda (a . rem) @@ -590,27 +462,10 @@ mdb "SELECT id,pid,strftime('%m/%d/%Y %H:%M',datetime(start_time,'unixepoch'),'localtime'),strftime('%m/%d/%Y %H:%M:%S',datetime(last_update,'unixepoch'),'localtime'),hostname,username FROM monitors ORDER BY last_update ASC;") (reverse res) )) -(define (tasks:tasks->text tasks) - (let ((fmtstr "~10a~10a~10a~12a~20a~12a~12a~10a")) - (conc (format #f fmtstr "id" "action" "owner" "state" "target" "runname" "testpatts" "params") "\n" - (string-intersperse - (map (lambda (task) - (format #f fmtstr - (tasks:task-get-id task) - (tasks:task-get-action task) - (tasks:task-get-owner task) - (tasks:task-get-state task) - (tasks:task-get-target task) - (tasks:task-get-name task) - (tasks:task-get-test task) - ;; (tasks:task-get-item task) - (tasks:task-get-params task))) - tasks) "\n")))) - (define (tasks:monitors->text-table monitors) (let ((fmtstr "~4a~8a~20a~20a~10a~10a")) (conc (format #f fmtstr "id" "pid" "start time" "last update" "hostname" "user") "\n" (string-intersperse (map (lambda (monitor) @@ -637,65 +492,259 @@ (set! deadlist (cons id deadlist))) mdb "SELECT id,pid,hostname,last_update,strftime('%s','now')-last_update AS delta FROM monitors WHERE delta > 700;") (sqlite3:execute mdb (conc "DELETE FROM monitors WHERE id IN ('" (string-intersperse (map conc deadlist) "','") "');"))) ) - -(define (tasks:remove-monitor-record mdb) - (sqlite3:execute mdb "DELETE FROM monitors WHERE pid=? AND hostname=?;" - (current-process-id) - (get-host-name))) - -(define (tasks:set-state mdb task-id state) - (sqlite3:execute mdb "UPDATE tasks_queue SET state=? WHERE id=?;" - state - task-id)) +(define (tasks:register-monitor db mdb) + (let* ((pid (current-process-id)) + (hostname (get-host-name)) + (userinfo (user-information (current-user-id))) + (username (car userinfo))) + (print "Register monitor, pid: " pid ", hostname: " hostname ", username: " username) + (sqlite3:execute mdb "INSERT INTO monitors (pid,start_time,last_update,hostname,username) VALUES (?,strftime('%s','now'),strftime('%s','now'),?,?);" + pid hostname username))) + +(define (tasks:get-num-alive-monitors mdb) + (let ((res 0)) + (sqlite3:for-each-row + (lambda (count) + (set! res count)) + mdb + "SELECT count(id) FROM monitors WHERE last_update < (strftime('%s','now') - 300) AND username=?;" + (car (user-information (current-user-id)))) + res)) + +;; +(define (tasks:start-monitor db mdb) + (if (> (tasks:get-num-alive-monitors mdb) 2) ;; have two running, no need for more + (debug:print-info 1 "Not starting monitor, already have more than two running") + (let* ((megatestdb (conc *toppath* "/megatest.db")) + (monitordbf (conc (configf:lookup *configdat* "setup" "linktree") "/.db/monitor.db")) + (last-db-update 0)) ;; (file-modification-time megatestdb))) + (task:register-monitor mdb) + (let loop ((count 0) + (next-touch 0)) ;; next-touch is the time where we need to update last_update + ;; if the db has been modified we'd best look at the task queue + (let ((modtime (file-modification-time megatestdbpath ))) + (if (> modtime last-db-update) + (tasks:process-queue db mdb last-db-update megatestdb next-touch)) + ;; WARNING: Possible race conditon here!! + ;; should this update be immediately after the task-get-action call above? + (if (> (current-seconds) next-touch) + (begin + (tasks:monitors-update mdb) + (loop (+ count 1)(+ (current-seconds) 240))) + (loop (+ count 1) next-touch))))))) + +;;====================================================================== +;; T A S K S Q U E U E +;; +;; NOTE:: These operate on task_queue which is in main.db +;; +;;====================================================================== + +;; NOTE: It might be good to add one more layer of checking to ensure +;; that no task gets run in parallel. + + + +;; register a task +(define (tasks:add dbstruct action owner target runname testpatt params) + (db:with-db + dbstruct #f #t + (lambda (db) + (sqlite3:execute db "INSERT INTO tasks_queue (action,owner,state,target,name,testpatt,params,creation_time,execution_time) + VALUES (?,?,'new',?,?,?,?,strftime('%s','now'),0);" + action + owner + target + runname + testpatt + (if params params ""))))) + +(define (keys:key-vals-hash->target keys key-params) + (let ((tmp (hash-table-ref/default key-params (vector-ref (car keys) 0) ""))) + (if (> (length keys) 1) + (for-each (lambda (key) + (set! tmp (conc tmp "/" (hash-table-ref/default key-params (vector-ref key 0) "")))) + (cdr keys))) + tmp)) + +;; for use from the gui, not ported +;; +;; (define (tasks:add-from-params mdb action keys key-params var-params) +;; (let ((target (keys:key-vals-hash->target keys key-params)) +;; (owner (car (user-information (current-user-id)))) +;; (runname (hash-table-ref/default var-params "runname" #f)) +;; (testpatts (hash-table-ref/default var-params "testpatts" "%")) +;; (params (hash-table-ref/default var-params "params" ""))) +;; (tasks:add mdb action owner target runname testpatts params))) + +;; return one task from those who are 'new' OR 'waiting' AND more than 10sec old +;; +(define (tasks:snag-a-task dbstruct) + (let ((res #f) + (keytxt (conc (current-process-id) "-" (get-host-name) "-" (car (user-information (current-user-id)))))) + (db:with-db + dbstruct #f #t + (lambda (db) + ;; first randomly set a new to pid-hostname-hostname + (sqlite3:execute + db + "UPDATE tasks_queue SET keylock=? WHERE id IN + (SELECT id FROM tasks_queue + WHERE state='new' OR + (state='waiting' AND (strftime('%s','now')-execution_time) > 10) OR + state='reset' + ORDER BY RANDOM() LIMIT 1);" keytxt) + + (sqlite3:for-each-row + (lambda (id . rem) + (set! res (apply vector id rem))) + db + "SELECT id,action,owner,state,target,name,test,item,params,creation_time,execution_time FROM tasks_queue WHERE keylock=? ORDER BY execution_time ASC LIMIT 1;" keytxt) + (if res ;; yep, have work to be done + (begin + (sqlite3:execute db "UPDATE tasks_queue SET state='inprogress',execution_time=strftime('%s','now') WHERE id=?;" + (tasks:task-get-id res)) + res) + #f))))) + +(define (tasks:reset-stuck-tasks dbstruct) + (let ((res '())) + (db:with-db + dbstruct #f #t + (lambda (db) + (sqlite3:for-each-row + (lambda (id delta) + (set! res (cons id res))) + db + "SELECT id,strftime('%s','now')-execution_time AS delta FROM tasks_queue WHERE state='inprogress' AND delta>700 ORDER BY delta DESC LIMIT 2;") + (sqlite3:execute + db + (conc "UPDATE tasks_queue SET state='reset' WHERE id IN ('" (string-intersperse (map conc res) "','") "');") + ))))) + +;; return all tasks in the tasks_queue table +;; +(define (tasks:get-tasks dbstruct types states) + (let ((res '())) + (db:with-db + dbstruct #f #f + (lambda (db) + (sqlite3:for-each-row + (lambda (id . rem) + (set! res (cons (apply vector id rem) res))) + db + (conc "SELECT id,action,owner,state,target,name,test,item,params,creation_time,execution_time + FROM tasks_queue " + ;; WHERE + ;; state IN " statesstr " AND + ;; action IN " actionsstr + " ORDER BY creation_time DESC;")) + res)))) + +;; remove tasks given by a string of numbers comma separated +(define (tasks:remove-queue-entries dbstruct task-ids) + (db:with-db + dbstruct #f #t + (lambda (db) + (sqlite3:execute db (conc "DELETE FROM tasks_queue WHERE id IN (" task-ids ");"))))) + +(define (tasks:process-queue dbstruct) + (let* ((task (tasks:snag-a-task dbstruct)) + (action (if task (tasks:task-get-action task) #f))) + (if action (print "tasks:process-queue task: " task)) + (if action + (case (string->symbol action) + ((run) (tasks:start-run dbstruct task)) + ((remove) (tasks:remove-runs dbstruct task)) + ((lock) (tasks:lock-runs dbstruct task)) + ;; ((monitor) (tasks:start-monitor db task)) + ((rollup) (tasks:rollup-runs dbstruct task)) + ((updatemeta)(tasks:update-meta dbstruct task)) + ((kill) (tasks:kill-monitors dbstruct task)))))) + +(define (tasks:tasks->text tasks) + (let ((fmtstr "~10a~10a~10a~12a~20a~12a~12a~10a")) + (conc (format #f fmtstr "id" "action" "owner" "state" "target" "runname" "testpatts" "params") "\n" + (string-intersperse + (map (lambda (task) + (format #f fmtstr + (tasks:task-get-id task) + (tasks:task-get-action task) + (tasks:task-get-owner task) + (tasks:task-get-state task) + (tasks:task-get-target task) + (tasks:task-get-name task) + (tasks:task-get-test task) + ;; (tasks:task-get-item task) + (tasks:task-get-params task))) + tasks) "\n")))) + +(define (tasks:set-state dbstruct task-id state) + (db:with-db + dbstruct #f #t + (lambda (db) + (sqlite3:execute db "UPDATE tasks_queue SET state=? WHERE id=?;" + state + task-id)))) ;;====================================================================== ;; Access using task key (stored in params; (hash-table->alist flags) hostname pid ;;====================================================================== -(define (tasks:param-key->id mdb task-params) - (handle-exceptions - exn - #f - (sqlite3:first-result mdb "SELECT id FROM tasks_queue WHERE params LIKE ?;" task-params))) - -(define (tasks:set-state-given-param-key mdb param-key new-state) - (sqlite3:execute mdb "UPDATE tasks_queue SET state=? WHERE params LIKE ?;" new-state param-key)) - -(define (tasks:get-records-given-param-key mdb param-key state-patt action-patt test-patt) - (handle-exceptions - exn - '() - (sqlite3:first-row mdb "SELECT id,action,owner,state,target,name,testpatt,keylock,params WHERE - params LIKE ? AND state LIKE ? AND action LIKE ? AND testpatt LIKE ?;" - param-key state-patt action-patt test-patt))) - - -;;====================================================================== -;; Rogue items, no place to put these yet -;;====================================================================== - -(define (tasks:find-task-queue-records mdb target run-name test-patt state-patt action-patt) +(define (tasks:param-key->id dbstruct task-params) + (db:with-db + dbstruct #f #f + (lambda (db) + (handle-exceptions + exn + #f + (sqlite3:first-result db "SELECT id FROM tasks_queue WHERE params LIKE ?;" + task-params))))) + +(define (tasks:set-state-given-param-key dbstruct param-key new-state) + (db:with-db + dbstruct #f #t + (lambda (db) + (sqlite3:execute db "UPDATE tasks_queue SET state=? WHERE params LIKE ?;" new-state param-key)))) + +(define (tasks:get-records-given-param-key dbstruct param-key state-patt action-patt test-patt) + (db:with-db + dbstruct #f #f + (lambda (db) + (handle-exceptions + exn + '() + (sqlite3:first-row db "SELECT id,action,owner,state,target,name,testpatt,keylock,params WHERE + params LIKE ? AND state LIKE ? AND action LIKE ? AND testpatt LIKE ?;" + param-key state-patt action-patt test-patt))))) + + +(define (tasks:find-task-queue-records dbstruct target run-name test-patt state-patt action-patt) ;; (handle-exceptions ;; exn ;; '() ;; (sqlite3:first-row - (let ((res '())) + (let ((db (db:delay-if-busy (db:get-db dbstruct #f))) + (res '())) (sqlite3:for-each-row (lambda (a . b) (set! res (cons (cons a b) res))) - mdb "SELECT id,action,owner,state,target,name,testpatt,keylock,params FROM tasks_queue - WHERE - target = ? AND name = ? AND state LIKE ? AND action LIKE ? AND testpatt LIKE ?;" + db "SELECT id,action,owner,state,target,name,testpatt,keylock,params FROM tasks_queue + WHERE + target = ? AND name = ? AND state LIKE ? AND action LIKE ? AND testpatt LIKE ?;" target run-name state-patt action-patt test-patt) res)) ;; ) - -(define (tasks:kill-runner mdb target run-name) - (let ((records (tasks:find-task-queue-records mdb target run-name "%" "running" "run-tests")) +;; kill any runner processes (i.e. processes handling -runtests) that match target/runname +;; +;; do a remote call to get the task queue info but do the killing as self here. +;; +(define (tasks:kill-runner target run-name) + (let ((records (rmt:tasks-find-task-queue-records target run-name "%" "running" "run-tests")) (hostpid-rx (regexp "\\s+(\\w+)\\s+(\\d+)$"))) ;; host pid is at end of param string (if (null? records) (debug:print 0 "No run launching processes found for " target " / " run-name) (debug:print 0 "Found " (length records) " run(s) to kill.")) (for-each @@ -725,45 +774,38 @@ (setenv "TARGETHOST_LOGF" "server-kills.log") (system (conc "nbfake kill " pid)) (if old-targethost (setenv "TARGETHOST" old-targethost)) (unsetenv "TARGETHOST") (unsetenv "TARGETHOST_LOGF")))) - (debug:print 0 "ERROR: no record or improper record for " target "/" run-name " in tasks_queue in monitor.db")))) + (debug:print 0 "ERROR: no record or improper record for " target "/" run-name " in tasks_queue in main.db")))) records))) - -;;====================================================================== -;; The routines to process tasks -;;====================================================================== - -;; NOTE: It might be good to add one more layer of checking to ensure -;; that no task gets run in parallel. - -(define (tasks:start-run db mdb task) - (let ((flags (make-hash-table))) - (hash-table-set! flags "-rerun" "NOT_STARTED") - (if (not (string=? (tasks:task-get-params task) "")) - (hash-table-set! flags "-setvars" (tasks:task-get-params task))) - (print "Starting run " task) - ;; sillyness, just call the damn routine with the task vector and be done with it. FIXME SOMEDAY - (runs:run-tests db - (tasks:task-get-target task) - (tasks:task-get-name task) - (tasks:task-get-test task) - (tasks:task-get-item task) - (tasks:task-get-owner task) - flags) - (tasks:set-state mdb (tasks:task-get-id task) "waiting"))) - -(define (tasks:rollup-runs db mdb task) - (let* ((flags (make-hash-table)) - (keys (db:get-keys db)) - (keyvals (keys:target-keyval keys (tasks:task-get-target task)))) - ;; (hash-table-set! flags "-rerun" "NOT_STARTED") - (print "Starting rollup " task) - ;; sillyness, just call the damn routine with the task vector and be done with it. FIXME SOMEDAY - (runs:rollup-run db - keys - keyvals - (tasks:task-get-name task) - (tasks:task-get-owner task)) - (tasks:set-state mdb (tasks:task-get-id task) "waiting"))) +;; (define (tasks:start-run dbstruct mdb task) +;; (let ((flags (make-hash-table))) +;; (hash-table-set! flags "-rerun" "NOT_STARTED") +;; (if (not (string=? (tasks:task-get-params task) "")) +;; (hash-table-set! flags "-setvars" (tasks:task-get-params task))) +;; (print "Starting run " task) +;; ;; sillyness, just call the damn routine with the task vector and be done with it. FIXME SOMEDAY +;; (runs:run-tests db +;; (tasks:task-get-target task) +;; (tasks:task-get-name task) +;; (tasks:task-get-test task) +;; (tasks:task-get-item task) +;; (tasks:task-get-owner task) +;; flags) +;; (tasks:set-state mdb (tasks:task-get-id task) "waiting"))) +;; +;; (define (tasks:rollup-runs db mdb task) +;; (let* ((flags (make-hash-table)) +;; (keys (db:get-keys db)) +;; (keyvals (keys:target-keyval keys (tasks:task-get-target task)))) +;; ;; (hash-table-set! flags "-rerun" "NOT_STARTED") +;; (print "Starting rollup " task) +;; ;; sillyness, just call the damn routine with the task vector and be done with it. FIXME SOMEDAY +;; (runs:rollup-run db +;; keys +;; keyvals +;; (tasks:task-get-name task) +;; (tasks:task-get-owner task)) +;; (tasks:set-state mdb (tasks:task-get-id task) "waiting"))) + ADDED testnanomsg/basic-req-rep.scm Index: testnanomsg/basic-req-rep.scm ================================================================== --- /dev/null +++ testnanomsg/basic-req-rep.scm @@ -0,0 +1,3 @@ +(use nanomsg srfi-18 sqlite3 numbers) + +(define resp (nn-socket 'rep)) ADDED testnanomsg/mockupclient.scm Index: testnanomsg/mockupclient.scm ================================================================== --- /dev/null +++ testnanomsg/mockupclient.scm @@ -0,0 +1,42 @@ +(use zmq posix numbers) + +(define cname "Bob") +(define runtime 10) +(let ((args (argv))) + (if (< (length args) 3) + (begin + (print "Usage: mockupclient clientname runtime") + (exit)) + (begin + (set! cname (cadr args)) + (set! runtime (string->number (caddr args)))))) + +;; (define start-delay (/ (random 100) 9)) +;; (define runtime (+ 1 (/ (random 200) 2))) + +(print "Starting client " cname " with runtime " runtime) + +(include "mockupclientlib.scm") + +(set! endtime (+ (current-seconds) runtime)) + +;; first ping the server to ensure we have a connection +(if (server-ping cname 5) + (print "SUCCESS: Client " cname " connected to server") + (begin + (print "ERROR: Client " cname " failed ping of server, exiting") + (exit))) + +(let loop () + (let ((x (random 15)) + (varname (list-ref (list "hello" "goodbye" "saluton" "kiaorana")(random 4)))) + (case x + ;; ((1)(dbaccess cname 'sync "nodat" #f)) + ((2 3 4 5)(dbaccess cname 'set varname (random 999))) + ((6 7 8 9 10)(print cname ": Get \"" varname "\" " (dbaccess cname 'get varname #f))) + (else + (thread-sleep! 0.011))) + (if (< (current-seconds) endtime) + (loop)))) + +(print "Client " cname " all done!!") ADDED testnanomsg/mockupclientlib.scm Index: testnanomsg/mockupclientlib.scm ================================================================== --- /dev/null +++ testnanomsg/mockupclientlib.scm @@ -0,0 +1,58 @@ +(define reqs (nn-socket 'req)) + +(connect-socket reqs "tcp://localhost:6563") + +(thread-sleep! 0.2) + +(define (server-ping cname timeout) + (let ((msg (conc cname ":ping:" timeout)) + (maxtime (+ (current-seconds) timeout))) + (print "pinging server from " cname " with timeout " timeout) + (let loop ((res #f)) + (if (< maxtime (current-seconds)) + #f ;; failed to ping + (if (equal? res "Got ping") + #t + (begin + (print "Ping received from server " res) + (send-message push msg) + (thread-sleep! 0.1) + (loop (receive-message sub non-blocking: #t)))))))) + +(define (dbaccess cname cmd var val #!key (numtries 20)) + (let* ((msg (conc cname ":" cmd ":" (if val (conc var " " val) var))) + (res #f) + (mtx1 (make-mutex)) + (do-access (lambda () + (let ((tmpres #f)) + (print "Sending msg: " msg) + (send-message push msg) + (print "Message " msg " sent") + (print "Client " cname " waiting for response to " msg) + (print "Client " cname " received address " (receive-message* sub)) + (set! tmpres (receive-message* sub)) + (mutex-lock! mtx1) + (set! res tmpres) + (mutex-unlock! mtx1)))) + (th1 (make-thread do-access "do access")) + (th2 (make-thread (lambda () + (let ((result #f)) + (mutex-lock! mtx1) + (set! result res) + (mutex-unlock! mtx1) + (thread-sleep! 5) + (if (not result) + (if (> numtries 0) + (begin + (print "WARNING: access timed out for " cname ", trying again. Trys remaining=" numtries) + (dbaccess cname cmd var val numtries: (- numtries 1))) + (begin + (print "ERROR: dbaccess timed out. Exiting") + (exit))))) + "timeout thread")))) + (thread-start! th1) + (thread-start! th2) + (thread-join! th1) + (if res (print "SUCCESS: received " res " with " numtries " remaining possible attempts")) + res)) + ADDED testnanomsg/mockupserver.scm Index: testnanomsg/mockupserver.scm ================================================================== --- /dev/null +++ testnanomsg/mockupserver.scm @@ -0,0 +1,146 @@ +;; pub/sub with envelope address +;; Note that if you don't insert a sleep, the server will crash with SIGPIPE as soon +;; as a client disconnects. Also a remaining client may receive tons of +;; messages afterward. + +(use nanomsg srfi-18 sqlite3 numbers) + +(define resp (nn-socket 'rep)) +(define cname "server") +(define total-db-accesses 0) +(define start-time (current-seconds)) + +(nn-bind resp "tcp://*:6563") + +(thread-sleep! 0.2) + +(define (open-db) + (let* ((dbpath "mockup.db") + (dbexists (file-exists? dbpath)) + (db (open-database dbpath)) ;; (never-give-up-open-db dbpath)) + (handler (make-busy-timeout 10))) + (set-busy-handler! db handler) + (if (not dbexists) + (for-each + (lambda (stmt) + (execute db stmt)) + (list + "PRAGMA SYNCHRONOUS=0;" + "CREATE TABLE clients (id INTEGER PRIMARY KEY,name TEXT,num_accesses INTEGER DEFAULT 0);" + "CREATE TABLE vars (var TEXT,val TEXT,CONSTRAINT vars_constraint UNIQUE (var));"))) + db)) + +(define cid-cache (make-hash-table)) + +(define (get-client-id db cname) + (let ((cid (hash-table-ref/default cid-cache cname #f))) + (if cid + cid + (begin + (execute db "INSERT OR REPLACE INTO clients (name) VALUES(?);" cname) + (for-each-row + (lambda (id) + (set! cid id)) + db + "SELECT id FROM clients WHERE name=?;" cname) + (hash-table-set! cid-cache cname cid) + (set! total-db-accesses (+ total-db-accesses 2)) + cid)))) + +(define (count-client db cname) + (let ((cid (get-client-id db cname))) + (execute db "UPDATE clients SET num_accesses=num_accesses+1 WHERE id=?;" cid) + (set! total-db-accesses (+ total-db-accesses 1)) + )) + +(define db (open-db)) +;; (define queuelst '()) +;; (define mx1 (make-mutex)) + +(define max-queue-len 0) + +(define (process-queue queuelst) + (let ((queuelen (length queuelst))) + (if (> queuelen max-queue-len) + (set! max-queue-len queuelen)) + (for-each + (lambda (item) + (let ((cname (vector-ref item 1)) + (clcmd (vector-ref item 2)) + (cdata (vector-ref item 3))) + (send-message pub cname send-more: #t) + (send-message pub (case clcmd + ((sync) + (conc queuelen)) + ((set) + (set! total-db-accesses (+ total-db-accesses 1)) + (apply execute db "INSERT OR REPLACE INTO vars (var,val) VALUES (?,?);" (string-split cdata)) + "ok") + ((get) + (set! total-db-accesses (+ total-db-accesses 1)) + (let ((res "noval")) + (for-each-row + (lambda (val) + (set! res val)) + db + "SELECT val FROM vars WHERE var=?;" cdata) + res)) + (else (conc "unk cmd: " clcmd)))))) + queuelst))) + +;; SERVER THREAD +(define th1 (make-thread + (lambda () + (let ((last-run 0)) ;; current-seconds when run last + (let loop ((queuelst '())) + (let* ((indat (receive-message* pull)) + (parts (string-split indat ":")) + (cname (car parts)) ;; client name + (clcmd (string->symbol (cadr parts))) ;; client cmd + (cdata (caddr parts)) ;; client data + (svect (vector (current-seconds) cname clcmd cdata))) ;; record for the queue + ;; (print "Server received message: " indat) + (count-client db cname) + (case clcmd + ((ping) + (print "Got ping from " cname) + (send-message pub cname send-more: #t) + (send-message pub "Got ping") + (loop queuelst)) + ((sync) ;; just process the queue + (print "Got sync from " cname) + (process-queue (cons svect queuelst)) + (loop '())) + ((get) + (process-queue (cons svect queuelst)) + (loop '())) + (else + (loop (cons svect queuelst)))))))) + "server thread")) + +(include "mockupclientlib.scm") + +;; SYNC THREAD +;; send a sync to the pull port +(define th2 (make-thread + (lambda () + (let ((last-action-time (current-seconds))) + (let loop () + (thread-sleep! 5) + (let ((queuelen (string->number (dbaccess "server" 'sync "nada" #f))) + (last-action-delta #f)) + (if (> queuelen 1)(set! last-action-time (current-seconds))) + (set! last-action-delta (- (current-seconds) last-action-time)) + (print "Server: Got queuelen=" queuelen ", last-action-delta=" last-action-delta) + (if (< last-action-delta 60) + (loop) + (print "Server exiting, 25 seconds since last access")))))) + "sync thread")) + +(thread-start! th1) +(thread-start! th2) +(thread-join! th2) + +(let* ((run-time (- (current-seconds) start-time)) + (queries/second (/ total-db-accesses run-time))) + (print "Server exited! Total db accesses=" total-db-accesses " in " run-time " seconds for " queries/second " queries/second with max queue length of: " max-queue-len)) ADDED testnanomsg/pipeline.scm Index: testnanomsg/pipeline.scm ================================================================== --- /dev/null +++ testnanomsg/pipeline.scm @@ -0,0 +1,25 @@ +;; watch nanomsg's pipeline load-balancer in action. +(use nanomsg) + +(define push (nn-socket 'push)) +(define pull1 (nn-socket 'pull)) +(define pull2 (nn-socket 'pull)) + +(nn-bind push "inproc://test") +(nn-connect pull1 "inproc://test") +(nn-connect pull2 "inproc://test") + +(nn-send push "a") +(nn-send push "b") +(nn-send push "c") +(nn-send push "d") + +(define ((th sock)) + (print (current-thread) ": " (nn-recv sock)) + (print (current-thread) ": " (nn-recv sock)) + (print (current-thread) " is done")) + +(thread-start! (th pull1)) +(thread-start! (th pull2)) + +(thread-sleep! 1) ADDED testnanomsg/req-rep-client.scm Index: testnanomsg/req-rep-client.scm ================================================================== --- /dev/null +++ testnanomsg/req-rep-client.scm @@ -0,0 +1,30 @@ +;; watch nanomsg's pipeline load-balancer in action. +(use nanomsg) + +(define req (nn-socket 'req)) + +(nn-connect req "tcp://localhost:22022") + +;; (with-output-to-string (lambda ()(serialize obj))) +(define (client-send-receive soc msg) + (nn-send soc msg) + (nn-recv soc)) + +(define ((talk-to-server soc)) + (let loop ((cnt 20)) + (let ((name (list-ref '("Matt" "Tom" "Bob" "Jill" "James" "Jane")(random 6)))) + (print "Sending " name) + (print (client-send-receive req name)) + (if (> cnt 0)(loop (- cnt 1))))) + (print (client-send-receive req "quit")) + (nn-close req) + (exit)) + +;; (thread-start! (lambda () +;; (thread-sleep! 20) +;; (print "Give up on waiting for the server") +;; (nn-close req) +;; (exit))) + +(thread-join! (thread-start! (talk-to-server req))) + ADDED testnanomsg/req-rep-server.scm Index: testnanomsg/req-rep-server.scm ================================================================== --- /dev/null +++ testnanomsg/req-rep-server.scm @@ -0,0 +1,90 @@ +;; watch nanomsg's pipeline load-balancer in action. +(use nanomsg) + +;; (use trace) +;; (trace nn-bind nn-socket nn-assert nn-recv nn-send thread-terminate! nn-close ) + +(define port 22022) +(define host "127.0.0.1") + +(define rep (nn-socket 'rep)) + +(print "connecting, got: " (nn-bind rep (conc "tcp://" "*" ":" port))) + +(define (server soc) + (print "server starting") + (let loop ((msg-in (nn-recv soc))) + (print "server received: " msg-in) + (cond + ((equal? msg-in "quit") + (nn-send soc "Ok, quitting")) + ((and (>= (string-length msg-in) 4) + (equal? (substring msg-in 0 4) "ping")) + (nn-send soc (conc (current-process-id))) + (loop (nn-recv soc))) + ;;((and (>= (string-length msg-in) + (else + (let ((this-task (random 15))) + (thread-sleep! this-task) + (nn-send soc (conc "hello " msg-in " this task took " this-task " seconds to complete")) + (loop (nn-recv soc))))))) + +(define (ping-self host port #!key (return-socket #t)) + ;; send a random number along with pid and check that we get it back + (let* ((req (nn-socket 'req)) + (key "ping") + (success #f) + (keepwaiting #t) + (ping (make-thread + (lambda () + (print "ping: sending string \"" key "\", expecting " (current-process-id)) + (nn-send req key) + (let ((result (nn-recv req))) + (if (equal? (conc (current-process-id)) result) + (begin + (print "ping, success: received \"" result "\"") + (set! success #t)) + (begin + (print "ping, failed: received key \"" result "\"") + (set! keepwaiting #f) + (set! success #f))))) + "ping")) + (timeout (make-thread (lambda () + (let loop ((count 0)) + (thread-sleep! 1) + (print "still waiting after count seconds...") + (if (and keepwaiting (< count 10)) + (loop (+ count 1)))) + (if keepwaiting + (begin + (print "timeout waiting for ping") + (thread-terminate! ping)))) + "timeout"))) + (nn-connect req (conc "tcp://" host ":" port)) + (handle-exceptions + exn + (begin + (print-call-chain) + (print 0 " message: " ((condition-property-accessor 'exn 'message) exn)) + (print "exn=" (condition->list exn)) + (print "ping failed to connect to " host ":" port)) + (thread-start! timeout) + (thread-start! ping) + (thread-join! ping) + (if success (thread-terminate! timeout))) + (if return-socket + (if success req #f) + (begin + (nn-close req) + success)))) + +(let ((server-thread (make-thread (lambda ()(server rep)) "server"))) + (thread-start! server-thread) + ;; (thread-sleep! 1) + (if (ping-self host port) + (begin + (thread-join! server-thread) + (nn-close rep)) + (print "ping failed"))) + +(exit) ADDED testnanomsg/req-rep.scm Index: testnanomsg/req-rep.scm ================================================================== --- /dev/null +++ testnanomsg/req-rep.scm @@ -0,0 +1,30 @@ +;; watch nanomsg's pipeline load-balancer in action. +(use nanomsg) + +(define req (nn-socket 'req)) +(define rep (nn-socket 'rep)) + +(nn-bind rep "inproc://test") +(nn-connect req "inproc://test") + +(define (client-send-receive soc msg) + (nn-send soc msg) + (nn-recv soc)) + +(define ((server soc)) + (let loop ((msg-in (nn-recv soc))) + (if (not (equal? msg-in "quit")) + (begin + (nn-send soc (conc "hello " msg-in)) + (loop (nn-recv soc)))))) + +(thread-start! (server rep)) + +(print (client-send-receive req "Matt")) +(print (client-send-receive req "Tom")) + +;; (client-send-receive req "quit") + +(nn-close req) +(nn-close rep) +(exit) ADDED testrpc/client.scm Index: testrpc/client.scm ================================================================== --- /dev/null +++ testrpc/client.scm @@ -0,0 +1,8 @@ +;;;; client.scm +(use rpc posix) + +(define call (rpc:procedure 'foo "localhost")) + +(do ((i 10 (sub1 i))) + ((zero? i)) + (print "-> " (call (random 100)))) ADDED testrpc/server.scm Index: testrpc/server.scm ================================================================== --- /dev/null +++ testrpc/server.scm @@ -0,0 +1,15 @@ +;;;; server.scm +(use rpc) + +(rpc:publish-procedure! + 'foo + (lambda (x) + (print "foo: " x) + #f)) + +(rpc:publish-procedure! + 'fini + (lambda () (print "fini") (thread-start! (lambda () (thread-sleep! 3) (print "terminate") (exit))) #f)) + +((rpc:make-server (tcp-listen (rpc:default-server-port))) #t) + Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -36,11 +36,11 @@ (let* ((test-search-path (tests:get-tests-search-path *configdat*))) (tests:get-valid-tests (make-hash-table) test-search-path))) (define (tests:get-tests-search-path cfgdat) (let ((paths (map cadr (configf:get-section cfgdat "tests-paths")))) - (cons (conc *toppath* "/tests") paths))) + (append paths (list (conc *toppath* "/tests"))))) (define (tests:get-valid-tests test-registry tests-paths) (if (null? tests-paths) test-registry (let loop ((hed (car tests-paths)) Index: tests/Makefile ================================================================== --- tests/Makefile +++ tests/Makefile @@ -34,11 +34,11 @@ cd ..;make -j && make install cd fullrun;$(MEGATEST) -stop-server 0 repl : cd ..;make -j && make install - cd fullrun;$(MEGATEST) -repl + cd fullrun;$(MEGATEST) -:b -repl test0 : cleanprep cd simplerun ; $(MEGATEST) -server - -debug $(DEBUG) test1 : cleanprep @@ -69,10 +69,11 @@ test4a : cleanprep cd fullrun;time $(MEGATEST) -debug $(DEBUG) -preclean -runtests all_toplevel -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_b -m "This is a comment specific to a run" -v $(LOGGING) # NOTE: Only one instance can be a server test5 : cleanprep + rm -f fullrun/a*.log fullrun/logs/* @echo "WARNING: No longer running fullprep, test converage may be lessened" cd fullrun;sleep 0;$(MEGATEST) -preclean -runtests % -target $(TARGET) :runname $(RUNNAME)_aa -debug $(DEBUG) $(LOGGING) > aa.log 2> aa.log & cd fullrun;sleep 0;$(MEGATEST) -preclean -runtests % -target $(TARGET) :runname $(RUNNAME)_ab -debug $(DEBUG) $(LOGGING) > ab.log 2> ab.log & cd fullrun;sleep 5;$(MEGATEST) -preclean -runtests % -target $(TARGET) :runname $(RUNNAME)_ac -debug $(DEBUG) $(LOGGING) > ac.log 2> ac.log & cd fullrun;sleep 8;$(MEGATEST) -preclean -runtests % -target $(TARGET) :runname $(RUNNAME)_ad -debug $(DEBUG) $(LOGGING) > ad.log 2> ad.log & @@ -116,27 +117,27 @@ test9 : minsetup test9a test9b test9c test9d test9e test9a : @echo Run super-simple mintest e, no waitons. cd mintest;$(DASHBOARD)& - cd mintest;$(MEGATEST) -preclean -runtests e -target $(VER) :runname `date +%H.%M.%S` -debug $(DEBUG) + cd mintest;$(MEGATEST) -preclean -runtests e -target $(VER) -runname $(shell date +%H.%M.%S) -debug $(DEBUG) test9b : @echo Run simple mintest d with one waiton c - cd mintest;$(MEGATEST) -preclean -runtests d -target $(VER) :runname `date +%H.%M.%S` -debug $(DEBUG) + cd mintest;$(MEGATEST) -preclean -runtests d -target $(VER) -runname `date +%H.%M.%S` -debug $(DEBUG) test9c : @echo Run mintest a with full waiton chain a -> b -> c -> d -> e - cd mintest;$(MEGATEST) -preclean -runtests a -target $(VER) :runname `date +%H.%M.%S` -debug $(DEBUG) + cd mintest;$(MEGATEST) -preclean -runtests a -target $(VER) -runname `date +%H.%M.%S` -debug $(DEBUG) test9d : @echo Run an itemized test with no items - cd mintest;$(MEGATEST) -preclean -runtests g -target $(VER) :runname `date +%H.%M.%S` -debug $(DEBUG) + cd mintest;$(MEGATEST) -preclean -runtests g -target $(VER) -runname `date +%H.%M.%S` -debug $(DEBUG) test9e : @echo Run mintest a1 with full waiton chain with d1fail: a1 -> b1 -> c1 -> d1fail -> e1 - cd mintest;$(MEGATEST) -preclean -runtests a1 -target $(VER) :runname `date +%H.%M.%S` -debug $(DEBUG) + cd mintest;$(MEGATEST) -preclean -runtests a1 -target $(VER) -runname `date +%H.%M.%S` -debug $(DEBUG) test10 : @echo Run a bunch of different targets simultaneously (cd fullrun;$(MEGATEST) -server - ;sleep 2)& for targ in mint/btrfs/mintdir sunos/sshfs/loc; do \ @@ -157,11 +158,11 @@ cd mintest;$(MEGATEST) -server - -debug $(DEBUG) > server.log 2> server.log & sleep 3 cd mintest;$(DASHBOARD) -rows 18 & cleanprep : ../*.scm Makefile */*.config - mkdir -p fullrun/tmp/mt_runs fullrun/tmp/mt_links + mkdir -p fullrun/tmp/mt_runs fullrun/tmp/mt_links /tmp/$(USER)/adisk1 cd ..;make -j;make install rm -f */logging.db touch cleanprep fullprep : cleanprep Index: tests/fullrun/megatest.config ================================================================== --- tests/fullrun/megatest.config +++ tests/fullrun/megatest.config @@ -31,13 +31,10 @@ # yes, anything else is no run-wait yes -# Use http instead of direct filesystem access -# transport http -# transport fs # If set to "default" the old code is used. Otherwise defaults to 200 or uses # numeric value given. # runqueue 20 @@ -124,27 +121,34 @@ MAX_ALLOWED_LOAD 200 # XTERM [system xterm] # RUNDEAD [system exit 56] [server] + +# Use http instead of direct filesystem access +transport http +# transport fs +# transport nmsg + synchronous 0 # If the server can't be started on this port it will try the next port until # it succeeds -port 8080 +port 9080 # This server will keep running this number of hours after last access. # Three minutes is 0.05 hours # timeout 0.025 -timeout 0.1 +timeout 0.061 # Server is required - slower but more resistant to Sqlite issues. -# required yes +required yes # Start server when average query takes longer than this +# server-query-threshold 55500 server-query-threshold 100 -# 55500 +timeout 0.01 # daemonize yes # hostname #{scheme (get-host-name)} ## disks are: @@ -167,5 +171,35 @@ sqlite3 6 blockz 10 # to your jobgroups where N is the number of parallel runs you are likely to see # +#====================================================================== +# Machine flavors +# +# These specify lists of hosts or scripts to use or call for various +# flavors of task. +# +#====================================================================== + +[flavors] + +plain hosts: xena, phoebe +strong command: NBFAKE_HOST=zeus nbfake +arm hosts: cubian + +[archive] + +# use machines of these flavor +useflavors plain +targsize 2G + +# minimum space required on an archive disk before allowing archiving to start (MB) +minspace 10 + +[archive-disks] + +# Archives will be organised under these paths like this: +# / +# Within the archive the data is structured like this: +# /// +disk0 /tmp/#{getenv USER}/adisk1 Index: tests/mintest/megatest.config ================================================================== --- tests/mintest/megatest.config +++ tests/mintest/megatest.config @@ -1,11 +1,11 @@ [fields] X TEXT [setup] max_concurrent_jobs 50 -linktree #{getenv PWD}/linktree +linktree #{getenv MT_RUN_AREA_HOME}/linktree transport http [server] port 8090 Index: tests/unittests/basicserver.scm ================================================================== --- tests/unittests/basicserver.scm +++ tests/unittests/basicserver.scm @@ -14,61 +14,67 @@ ;; NON Server tests go here (test #f #f (db:dbdat-get-path *db*)) (test #f #f (db:get-run-name-from-id *db* run-id)) -(test #f '("SYSTEM" "RELEASE") (rmt:get-keys)) +;; (test #f '("SYSTEM" "RELEASE") (rmt:get-keys)) ;; (exit) ;; Server tests go here -(test #f #f (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id)) -(server:kind-run run-id) -(test "did server start within 20 seconds?" - #t - (let loop ((remtries 20) - (running (tasks:server-running-or-starting? (db:delay-if-busy - (tasks:open-db)) - run-id))) - (if running - (> running 0) - (if (> remtries 0) - (begin - (thread-sleep! 1.1) - (loop (- remtries 1) - (tasks:server-running-or-starting? (db:delay-if-busy - (tasks:open-db)) - run-id))))))) - -(test "did server become available" #t - (let loop ((remtries 10) - (res (tasks:get-server (db:delay-if-busy (tasks:open-db)) run-id))) - (if res - (vector? res) - (begin - (if (> remtries 0) - (begin - (thread-sleep! 1.1) - (loop (- remtries 1)(tasks:get-server (db:delay-if-busy (tasks:open-db)) run-id))) - res))))) +(for-each + (lambda (run-id) + (test #f #f (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id)) + (server:kind-run run-id) + (test "did server start within 20 seconds?" + #t + (let loop ((remtries 20) + (running (tasks:server-running-or-starting? (db:delay-if-busy + (tasks:open-db)) + run-id))) + (if running + (> running 0) + (if (> remtries 0) + (begin + (thread-sleep! 1) + (loop (- remtries 1) + (tasks:server-running-or-starting? (db:delay-if-busy + (tasks:open-db)) + run-id))))))) + + (test "did server become available" #t + (let loop ((remtries 10) + (res (tasks:get-server (db:delay-if-busy (tasks:open-db)) run-id))) + (if res + (vector? res) + (begin + (if (> remtries 0) + (begin + (thread-sleep! 1.1) + (loop (- remtries 1)(tasks:get-server (db:delay-if-busy (tasks:open-db)) run-id))) + res))))) + ) + (list 0 1)) (define user (current-user-name)) (define runname "mytestrun") (define keys (rmt:get-keys)) (define runinfo #f) (define keyvals '(("SYSTEM" "abc")("RELEASE" "def"))) -(define header (vector "SYSTEM" "RELEASE" "id" "runname" "state" "status" "owner" "event_time")) +(define header (list "SYSTEM" "RELEASE" "id" "runname" "state" "status" "owner" "event_time")) ;; Setup ;; (test #f #f (not (client:setup run-id))) (test #f #f (not (hash-table-ref/default *runremote* run-id #f))) ;; Login ;; -(test #f '(#t "successful login") (rmt:login-no-auto-client-setup (hash-table-ref/default *runremote* run-id #f) run-id)) -(test #f '(#t "successful login") (rmt:login run-id)) +(test #f'(#t "successful login") + (rmt:login-no-auto-client-setup (hash-table-ref/default *runremote* run-id #f) run-id)) +(test #f '(#t "successful login") + (rmt:login run-id)) ;; Keys ;; (test #f '("SYSTEM" "RELEASE") (rmt:get-keys)) @@ -75,40 +81,83 @@ ;; No data in db ;; (test #f '() (rmt:get-all-run-ids)) (test #f #f (rmt:get-run-name-from-id run-id)) (test #f - (let ((runrec (vector #f #f))) - (vector-set! runrec header 0) - (vector-set! runrec (vector #f #f #f #f) 1) - runrec) + (vector + header + (vector #f #f #f #f)) (rmt:get-run-info run-id)) ;; Insert data into db ;; (test #f 1 (rmt:register-run keyvals runname "new" "n/a" user)) ;; (test #f #f (rmt:get-runs-by-patt keys runname)) (test #f #t (rmt:general-call 'register-test run-id run-id "test-one" "")) +(define test-one-id #f) +(test #f 1 (let ((test-id (rmt:get-test-id run-id "test-one" ""))) + (set! test-one-id test-id) + test-id)) +(define test-one-rec #f) +(test #f "test-one" (let ((test-rec (rmt:get-test-info-by-id run-id test-one-id))) + (set! test-one-rec test-rec) + (vector-ref test-rec 2))) ;; With data in db ;; +(print "Using runame=" runname) (test #f '(1) (rmt:get-all-run-ids)) (test #f runname (rmt:get-run-name-from-id run-id)) (test #f runname (let ((run-info (rmt:get-run-info run-id))) (db:get-value-by-header (db:get-rows run-info) (db:get-header run-info) "runname"))) - ;; (vector header (vector "abc" "def" 1 "mytestrun" "new" "n/a" "matt" 1416280640.0)) - +(for-each (lambda (run-id) ;; test killing server ;; (tasks:kill-server-run-id run-id) (test #f #f (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id)) +) +(list 0 1)) + +;; Tests to assess reading/writing while servers are starting/stopping +(define start-time (current-seconds)) +(let loop ((test-state 'start)) + (let* ((server-dats (tasks:get-server-records (db:delay-if-busy (tasks:open-db)) run-id)) + (first-dat (if (not (null? server-dats)) + (car server-dats) + #f))) + (map (lambda (dat) + (apply print (intersperse (vector->list dat) ", "))) + server-dats) + (test #f test-one-rec (rmt:get-test-info-by-id run-id test-one-id)) + (thread-sleep! 1) + (case test-state + ((start) + (print "Trying to start server") + (server:kind-run run-id) + (loop 'server-started)) + ((server-started) + (case (vector-ref first-dat) + ((running) + (print "Server appears to be running. Now ask it to shutdown") + (rmt:kill-server run-id) + (loop 'server-shutdown)) + ((shutting-down) + (loop test-state)) + (else (print "Don't know what to do if get here")))) + ((server-shutdown) + (loop test-state))))) + +;;====================================================================== +;; END OF TESTS +;;====================================================================== + ;; (test #f #f (client:setup run-id)) ;; (set! *transport-type* 'http) ;; Index: tests/unittests/server.scm ================================================================== --- tests/unittests/server.scm +++ tests/unittests/server.scm @@ -4,113 +4,190 @@ ;; Run like this: ;; ;; (cd ..;make && make install) && ./rununittest.sh server 1;(cd simplerun;megatest -stop-server 0) -(set! *transport-type* 'http) - -(test "setup for run" #t (begin (setup-for-run) - (string? (getenv "MT_RUN_AREA_HOME")))) - -(test "server-register, get-best-server" #t (let ((res #f)) - (open-run-close tasks:server-register tasks:open-db 1 "bob" 1234 100 'live 'http) - (set! res (open-run-close tasks:get-best-server tasks:open-db)) - (number? (vector-ref res 3)))) - -(test "de-register server" #f (let ((res #f)) - (open-run-close tasks:server-deregister tasks:open-db "bob" port: 1234) - (vector? (open-run-close tasks:get-best-server tasks:open-db)))) - -(define server-pid #f) - -;; Not sure how the following should work, replacing it with system of megatest -server -;; (test "launch server" #t (let ((pid (process-fork (lambda () -;; ;; (daemon:ize) -;; (server:launch 'http))))) -;; (set! server-pid pid) -;; (number? pid))) -(system "../../bin/megatest -server - -debug 22 > server.log 2> server.log &") - -(let loop ((n 10)) - (thread-sleep! 1) ;; need to wait for server to start. - (let ((res (open-run-close tasks:get-best-server tasks:open-db))) - (print "tasks:get-best-server returned " res) - (if (and (not res) - (> n 0)) - (loop (- n 1))))) - -(test "get-best-server" #t (begin - (client:launch) - (let ((dat (open-run-close tasks:get-best-server tasks:open-db))) - (vector? dat)))) - -(define *keys* (keys:config-get-fields *configdat*)) -(define *keyvals* (keys:target->keyval *keys* "a/b/c")) - -(test #f #t (string? (car *runremote*))) -(test #f '(#t "successful login") (rmt:login)) ;; *runremote* *toppath* *my-client-signature*))) - -(test #f #f (rmt:get-test-info-by-id 1 99)) ;; get non-existant test - -;; RUNS -(test #f 1 (rmt:register-run *keyvals* "firstrun" "new" "n/a" (current-user-name))) -(test "get run info" "firstrun" (let ((rinfo (rmt:get-run-info 1))) - (vector-ref (vector-ref rinfo 1) 3))) -(test "get runname from id" "firstrun" (rmt:get-run-name-from-id 1)) - -;; TESTS -(test "get tests (no data)" '() (rmt:get-tests-for-run 1 "%" '() '() #f #f #f #f #f #f)) -(test "register test" #t (rmt:general-call 'register-test 1 1 "test1" "")) -(test "get tests (some data)" 1 (length (rmt:get-tests-for-run 1 "%" '() '() #f #f #f #f #f #f))) -(test "get test id" 1 (rmt:get-test-id 1 "test1" "")) - -(test "sync back" #t (> (rmt:sync-inmem->db) 0)) -(test "get test id from main" 1 (db:get-test-id *db* 1 "test1" "")) - -(test "get keys" #t (list? (rmt:get-keys))) -(test "set comment" #t (begin (rmt:general-call 'set-test-comment 1 "this is a comment" 1) #t)) -(test "get comment" "this is a comment" (let ((trec (rmt:get-test-info-by-id 1 1))) - (db:test-get-comment trec))) - -;; MORE RUNS -(test "get runs" #t (let* ((runs (rmt:get-runs "%" #f #f '())) - (header (vector-ref runs 0)) - (data (vector-ref runs 1))) - (and (list? header) - (list? data) - (vector? (car data))))) - -(test "get local testinfo" "test1" (vector-ref (db:get-testinfo-state-status *db* 1 1) 2)) -(test "get testinfo" "test1" (vector-ref (rmt:get-testinfo-state-status 1 1) 2)) - -;;====================================================================== -;; D B -;;====================================================================== - -(test "pass fail counts" #t (rmt:general-call 'pass-fail-counts 10 9 1)) -(test "get pass fail counts" 19 (let ((dat (rmt:get-test-info-by-id 1))) - (+ (db:test-get-pass_count dat) - (db:test-get-fail_count dat)))) - -(define testregistry (make-hash-table)) -(for-each - (lambda (tname) - (for-each - (lambda (itempath) - (let ((tkey (conc tname "/" itempath)) - (rpass (random 10)) - (rfail (random 10))) - (hash-table-set! testregistry tkey (list tname itempath)) - (rmt:general-call 'register-test 1 tname itempath) - (let* ((tid (rmt:get-test-id 1 tname itempath)) - (tdat (rmt:get-test-info-by-id tid))) - (rmt:general-call 'pass-fail-counts rpass rfail (db:test-get-id tdat)) - (let* ((resdat (rmt:get-test-info-by-id tid))) - (test "set/get pass fail counts" (list rpass rfail) - (list (db:test-get-pass_count resdat) - (db:test-get-fail_count resdat))))))) - (list "" "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"))) - (list "test1" "test2" "test3" "test4" "test5")) - - -(test #f '(#t "exit process started") (rmt:kill-server)) ;; *toppath* *my-client-signature* #f))) - +(delete-file* "logs/1.log") +(define run-id 1) + +(test "setup for run" #t (begin (launch:setup-for-run) + (string? (getenv "MT_RUN_AREA_HOME")))) + +;; Insert data into db +;; +(define user (current-user-name)) +(define runname "mytestrun") +(define keys (rmt:get-keys)) +(define runinfo #f) +(define keyvals '(("SYSTEM" "abc")("RELEASE" "def"))) +(define header (list "SYSTEM" "RELEASE" "id" "runname" "state" "status" "owner" "event_time")) + +(test #f 1 (rmt:register-run keyvals runname "new" "n/a" user)) +;; (test #f #f (rmt:get-runs-by-patt keys runname)) +(test #f #t (rmt:general-call 'register-test run-id run-id "test-one" "")) +(define test-one-id #f) +(test #f 30001 (let ((test-id (rmt:get-test-id run-id "test-one" ""))) + (set! test-one-id test-id) + test-id)) +(define test-one-rec #f) +(test #f "test-one" (let ((test-rec (rmt:get-test-info-by-id run-id test-one-id))) + (set! test-one-rec test-rec) + (vector-ref test-rec 2))) + +(use trace) +(import trace) +;; (trace +;; rmt:send-receive +;; rmt:open-qry-close-locally +;; ) + +;; Tests to assess reading/writing while servers are starting/stopping +(define start-time (current-seconds)) +(let loop ((test-state 'start)) + (let* ((server-dats (tasks:get-server-records (db:delay-if-busy (tasks:open-db)) run-id)) + (first-dat (if (not (null? server-dats)) + (car server-dats) + #f)) + (server-state (or (and first-dat (string->symbol (vector-ref first-dat 8))) 'no-dat))) + (if first-dat + (map (lambda (dat) + (apply print (intersperse (vector->list dat) ", "))) + server-dats) + (print "No server")) + (test #f test-one-rec (rmt:get-test-info-by-id run-id test-one-id)) + (thread-sleep! 1) + (case test-state + ((start) + (print "Trying to start server") + (server:kind-run run-id) + (loop 'server-started)) + ((server-started) + (case server-state + ((running) + (print "Server appears to be running. Now ask it to shutdown") + (rmt:kill-server run-id) + ;; (trace rmt:open-qry-close-locally rmt:send-receive) + (loop 'shutdown-started)) + ((available) + (loop test-state)) + ((shutting-down) + (loop test-state)) + ((no-dat) + (loop test-state)) + (else (print "Don't know what to do if get here")))) + ((shutdown-started) + (case server-state + ((no-dat) + (print "Server appears to have shutdown, ending this test")) + (else + (loop test-state))))))) + +(exit) + +;; (set! *transport-type* 'http) +;; +;; (test "setup for run" #t (begin (setup-for-run) +;; (string? (getenv "MT_RUN_AREA_HOME")))) +;; +;; (test "server-register, get-best-server" #t (let ((res #f)) +;; (open-run-close tasks:server-register tasks:open-db 1 "bob" 1234 100 'live 'http) +;; (set! res (open-run-close tasks:get-best-server tasks:open-db)) +;; (number? (vector-ref res 3)))) +;; +;; (test "de-register server" #f (let ((res #f)) +;; (open-run-close tasks:server-deregister tasks:open-db "bob" port: 1234) +;; (vector? (open-run-close tasks:get-best-server tasks:open-db)))) +;; +;; (define server-pid #f) +;; +;; ;; Not sure how the following should work, replacing it with system of megatest -server +;; ;; (test "launch server" #t (let ((pid (process-fork (lambda () +;; ;; ;; (daemon:ize) +;; ;; (server:launch 'http))))) +;; ;; (set! server-pid pid) +;; ;; (number? pid))) +;; (system "../../bin/megatest -server - -debug 22 > server.log 2> server.log &") +;; +;; (let loop ((n 10)) +;; (thread-sleep! 1) ;; need to wait for server to start. +;; (let ((res (open-run-close tasks:get-best-server tasks:open-db))) +;; (print "tasks:get-best-server returned " res) +;; (if (and (not res) +;; (> n 0)) +;; (loop (- n 1))))) +;; +;; (test "get-best-server" #t (begin +;; (client:launch) +;; (let ((dat (open-run-close tasks:get-best-server tasks:open-db))) +;; (vector? dat)))) +;; +;; (define *keys* (keys:config-get-fields *configdat*)) +;; (define *keyvals* (keys:target->keyval *keys* "a/b/c")) +;; +;; (test #f #t (string? (car *runremote*))) +;; (test #f '(#t "successful login") (rmt:login)) ;; *runremote* *toppath* *my-client-signature*))) +;; +;; (test #f #f (rmt:get-test-info-by-id 1 99)) ;; get non-existant test +;; +;; ;; RUNS +;; (test #f 1 (rmt:register-run *keyvals* "firstrun" "new" "n/a" (current-user-name))) +;; (test "get run info" "firstrun" (let ((rinfo (rmt:get-run-info 1))) +;; (vector-ref (vector-ref rinfo 1) 3))) +;; (test "get runname from id" "firstrun" (rmt:get-run-name-from-id 1)) +;; +;; ;; TESTS +;; (test "get tests (no data)" '() (rmt:get-tests-for-run 1 "%" '() '() #f #f #f #f #f #f)) +;; (test "register test" #t (rmt:general-call 'register-test 1 1 "test1" "")) +;; (test "get tests (some data)" 1 (length (rmt:get-tests-for-run 1 "%" '() '() #f #f #f #f #f #f))) +;; (test "get test id" 1 (rmt:get-test-id 1 "test1" "")) +;; +;; (test "sync back" #t (> (rmt:sync-inmem->db) 0)) +;; (test "get test id from main" 1 (db:get-test-id *db* 1 "test1" "")) +;; +;; (test "get keys" #t (list? (rmt:get-keys))) +;; (test "set comment" #t (begin (rmt:general-call 'set-test-comment 1 "this is a comment" 1) #t)) +;; (test "get comment" "this is a comment" (let ((trec (rmt:get-test-info-by-id 1 1))) +;; (db:test-get-comment trec))) +;; +;; ;; MORE RUNS +;; (test "get runs" #t (let* ((runs (rmt:get-runs "%" #f #f '())) +;; (header (vector-ref runs 0)) +;; (data (vector-ref runs 1))) +;; (and (list? header) +;; (list? data) +;; (vector? (car data))))) +;; +;; (test "get local testinfo" "test1" (vector-ref (db:get-testinfo-state-status *db* 1 1) 2)) +;; (test "get testinfo" "test1" (vector-ref (rmt:get-testinfo-state-status 1 1) 2)) +;; +;; ;;====================================================================== +;; ;; D B +;; ;;====================================================================== +;; +;; (test "pass fail counts" #t (rmt:general-call 'pass-fail-counts 10 9 1)) +;; (test "get pass fail counts" 19 (let ((dat (rmt:get-test-info-by-id 1))) +;; (+ (db:test-get-pass_count dat) +;; (db:test-get-fail_count dat)))) +;; +;; (define testregistry (make-hash-table)) +;; (for-each +;; (lambda (tname) +;; (for-each +;; (lambda (itempath) +;; (let ((tkey (conc tname "/" itempath)) +;; (rpass (random 10)) +;; (rfail (random 10))) +;; (hash-table-set! testregistry tkey (list tname itempath)) +;; (rmt:general-call 'register-test 1 tname itempath) +;; (let* ((tid (rmt:get-test-id 1 tname itempath)) +;; (tdat (rmt:get-test-info-by-id tid))) +;; (rmt:general-call 'pass-fail-counts rpass rfail (db:test-get-id tdat)) +;; (let* ((resdat (rmt:get-test-info-by-id tid))) +;; (test "set/get pass fail counts" (list rpass rfail) +;; (list (db:test-get-pass_count resdat) +;; (db:test-get-fail_count resdat))))))) +;; (list "" "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"))) +;; (list "test1" "test2" "test3" "test4" "test5")) +;; +;; +;; (test #f '(#t "exit process started") (rmt:kill-server)) ;; *toppath* *my-client-signature* #f))) +;; Index: txtdb/nada3/RunsToLock.dat ================================================================== --- txtdb/nada3/RunsToLock.dat +++ txtdb/nada3/RunsToLock.dat @@ -1,6 +1,7 @@ [def] +def def ghi jkl qrst uvwx yz12 Index: utils/Makefile.installall ================================================================== --- utils/Makefile.installall +++ utils/Makefile.installall @@ -43,11 +43,11 @@ # Eggs to install (straightforward ones) EGGS=matchable readline apropos base64 regex-literals format regex-case test coops trace csv \ dot-locking posix-utils posix-extras directory-utils hostinfo tcp-server rpc csv-xml fmt \ json md5 awful http-client spiffy uri-common intarweb spiffy-request-vars \ spiffy-directory-listing ssax sxml-serializer sxml-modifications sql-de-lite \ - srfi-19 refdb ini-file + srfi-19 refdb ini-file sparse-vectors # # Derived variables # @@ -145,10 +145,26 @@ cd sqlite-autoconf-$(SQLITE3_VERSION);./configure --prefix=$(PREFIX);make;make install $(PREFIX)/lib/sqlite3.so : $(PREFIX)/bin/sqlite3 CSC_OPTIONS="-I$(PREFIX)/include -L$(PREFIX)/lib" $(CHICKEN_INSTALL) $(PROX) sqlite3 +#====================================================================== +# N A N O M S G +#====================================================================== + +nanomsg-0.5-beta.tar.gz : + wget http://download.nanomsg.org/nanomsg-0.5-beta.tar.gz + +nanomsg-0.5-beta/COPYING : nanomsg-0.5-beta.tar.gz + tar xfvz nanomsg-0.5-beta.tar.gz + +$(PREFIX)/bin/nanocat : nanomsg-0.5-beta/COPYING + cd nanomsg-0.5-beta;./configure --prefix=$(PREFIX);make;make install + +$(PREFIX)/lib/nanomsg.so : $(PREFIX)/bin/nanocat + CSC_OPTIONS="-I$(PREFIX)/include -L$(PREFIX)/lib" $(CHICKEN_INSTALL) $(PROX) nanomsg + #====================================================================== # M A T T S U T I L S #====================================================================== opensrc.fossil : Index: utils/plot-code.scm ================================================================== --- utils/plot-code.scm +++ utils/plot-code.scm @@ -1,7 +1,14 @@ #!/mfs/pkgs/chicken/4.8.0.5/bin/csi -nbq +;; Coming soon (right?) Usage: plot-code file1.scm,file2.scm "fun1,fun2,x*" *.scm > plot.dot +;; Usage: plot-code file1.scm,file2.scm *.scm > plot.dot +;; dot -Tpdf plot.dot > plot.pdf +;; first param is comma separated list of files to include in the map, use - to do all +;; second param is list of regexs for functions to include in the map +;; third param is list of files to scan + (use regex srfi-69 srfi-13) (define targs #f) (define files (cddddr (argv))) @@ -15,10 +22,12 @@ (define defn-rx (regexp "^\\s*\\(define\\s+\\(([^\\s\\)]+).*")) (define all-regexs (make-hash-table)) (define all-fns '()) + +;; for the se (define (print-err . data) (with-output-to-port (current-error-port) (lambda () (apply print data)))) @@ -81,11 +90,32 @@ newres))) (if (null? tal) res (loop (car tal)(cdr tal) res))))) '())) - + +;; (define mm-header #< +;; +;; +;; MMHEADER +;; +;; (define (add-node text) +;; +;; ) +;; +;; minimal mindmap file +;; +;; +;; +;; +;; +;; +;; +;; +;; + ;; Gather the usages (print "digraph G {") (define curr-cluster-num 0) (define function-calls '()) ADDED utils/trace/trace.import.scm Index: utils/trace/trace.import.scm ================================================================== --- /dev/null +++ utils/trace/trace.import.scm @@ -0,0 +1,32 @@ +;;;; trace.import.scm - GENERATED BY CHICKEN 4.9.0.1 -*- Scheme -*- + +(eval '(import + scheme + chicken + csi + advice + extras + ports + data-structures + (except srfi-1 break) + miscmacros)) +(##sys#register-compiled-module + 'trace + (list) + '((breakpoint . trace#breakpoint) + (trace . trace#trace) + (untrace . trace#untrace) + (break . trace#break) + (unbreak . trace#unbreak) + (trace-output-port . trace#trace-output-port) + (continue . trace#continue) + (c . trace#c) + (traced? . trace#traced?) + (trace-module . trace#trace-module) + (untrace-module . trace#untrace-module) + (trace-verbose . trace#trace-verbose) + (trace/untrace . trace#trace/untrace)) + (list) + (list)) + +;; END OF FILE ADDED utils/trace/trace.meta Index: utils/trace/trace.meta ================================================================== --- /dev/null +++ utils/trace/trace.meta @@ -0,0 +1,10 @@ +;;;; trace.meta -*- Scheme -*- + + +((category tools) + (synopsis "tracing and breakpoints") + (author "felix winkelmann") + (license "public domain") + (needs advice ; don't we all? + miscmacros) + (files "tests/run.scm" "trace.meta" "trace.release-info" "trace.scm" "trace.setup") ) ADDED utils/trace/trace.scm Index: utils/trace/trace.scm ================================================================== --- /dev/null +++ utils/trace/trace.scm @@ -0,0 +1,259 @@ +;;;; trace.scm + + +(module trace (breakpoint + trace untrace + break unbreak + trace-output-port + continue c + traced? + trace-module untrace-module + trace-verbose + trace/untrace) + +(import scheme chicken csi) + +(use advice extras ports data-structures) +(require-library srfi-1) +(import (except srfi-1 break) miscmacros) + + +(define *last-breakpoint* #f) +(define *traced-procedures* '()) +(define *broken-procedures* '()) +(define *trace-indent-level* 0) + +(define trace-output-port (make-parameter (current-output-port))) +(define trace-verbose (make-parameter #t)) + +(define (break-entry name args) + ;; Does _not_ unwind! + (##sys#call-with-current-continuation + (lambda (c) + (let ((exn (##sys#make-structure + 'condition + '(exn breakpoint) + (list '(exn . message) "*** breakpoint ***" + '(exn . arguments) (list (cons name args)) + '(exn . location) name + '(exn . continuation) c) ) ) ) + (set! *last-breakpoint* exn) + (signal exn) ) ) ) ) + +(define (break-resume exn) + (let ((a (member '(exn . continuation) (##sys#slot exn 2)))) + (if a + ((cadr a) (void)) + (error "condition has no continuation" exn) ) ) ) + +(define (breakpoint #!optional (name 'breakpoint)) + (break-entry name '()) ) + +(define (trace-indent) + (let ((port (trace-output-port))) + (do ((i (fxmin 3 *trace-indent-level*) (fx- i 1))) + ((fx<= i 0)) + (write-char #\space port) ) + (fprintf port "[~a] " *trace-indent-level*) ) ) + +(define (traced-procedure-entry name args) + (let ((port (trace-output-port))) + (trace-indent) + (set! *trace-indent-level* (fx+ 1 *trace-indent-level*)) + (write (cons name args) port) + (write ", Called from: " port) + (write (conc (car (reverse (get-call-chain))))) + (write-char #\newline port) + (flush-output port) ) ) + +(define (traced-procedure-exit name results) + (let ((port (trace-output-port))) + (set! *trace-indent-level* (fx- *trace-indent-level* 1)) + (trace-indent) + (fprintf port "~a -> " name) + (if results + (for-each + (lambda (x) + (write x port) + (write-char #\space port) ) + results) + (display "(escaping)" port)) + (write-char #\newline port) + (flush-output port) ) ) + +(define (procedure-name proc) + (cond ((procedure-information proc) => + (lambda (info) + (if (pair? info) (car info) info) ) ) + (else ')) ) + +(define (do-trace procs) + (for-each + (lambda (s) + (ensure procedure? s) + (cond ((traced? s) + (warning "procedure already traced" s) ) + (else + (let ((name (procedure-name s))) + (when (trace-verbose) + (fprintf (current-error-port) "; tracing ~a~%" name)) + (set! *traced-procedures* (cons (cons s name) *traced-procedures*)) + (advise + 'around s + (lambda (next args) + (let ((results #f)) + (dynamic-wind + (cut traced-procedure-entry name args) + (lambda () + (call-with-values (cut apply next args) + (lambda rs + (set! results rs) + (apply values rs)))) + (cut traced-procedure-exit name results)))) + '*trace*))))) + procs) ) + +(define (do-untrace-all) + (define (unadvise* p) + (ignore-errors (unadvise p '*trace*))) + (for-each + (lambda (proc) + (let ((proc (car proc))) + (when (trace-verbose) + (fprintf (current-error-port) "; untracing ~a~%" (procedure-name proc)) + (unadvise* proc)))) + *traced-procedures*) + (set! *traced-procedures* '())) + +(define (do-untrace procs) + (for-each + (lambda (s) + (ensure procedure? s) + (let ((p (assq s *traced-procedures*)) + (name (procedure-name s))) + (cond ((not p) (warning "procedure not traced" name)) + (else + (when (trace-verbose) + (fprintf (current-error-port) "; untracing ~a~%" name)) + (ignore-errors (unadvise s '*trace*)) + (set! *traced-procedures* + (delete + p *traced-procedures* + eq?)))))) + procs) ) + +(define (do-break procs) + (for-each + (lambda (s) + (let ((name (procedure-name s))) + (ensure procedure? s) + (cond ((assq s *broken-procedures*) + (warning "procedure already has break-point" name)) + (else + (when (trace-verbose) + (fprintf (current-error-port) "; setting break-point in ~a~%" name)) + (set! *broken-procedures* (cons (cons s name) *broken-procedures*)) + (advise + 'before s + (lambda (args) + (break-entry name args) ) + '*break*) ) ))) + procs) ) + +(define (do-unbreak procs) + (for-each + (lambda (s) + (ensure procedure? s) + (let ((p (assq s *broken-procedures*)) + (name (procedure-name s))) + (cond ((not p) (warning "procedure has no breakpoint" name)) + (else + (when (trace-verbose) + (fprintf (current-error-port) "; removing break-point in ~a~%" name)) + (ignore-errors (unadvise s '*break*)) + (set! *broken-procedures* (delete p *broken-procedures* eq?) ) ) ) ) ) + procs) ) + +(define (do-unbreak-all) + (for-each + (lambda (bp) + (ignore-errors (unadvise (car bp) '*break*))) + *broken-procedures*) + (set! *broken-procedures* '()) + (void)) + +(define (trace . procs) + (cond ((null? procs) + (when (pair? *traced-procedures*) + (printf "Traced:~%~%") + (for-each (lambda (p) (printf " ~a~%" (cdr p))) *traced-procedures*)) ) + (else + (do-trace procs) ) ) ) + +(define (untrace . procs) + (cond ((null? procs) (do-untrace-all)) + (else (do-untrace procs))) + (void)) + +(define (break . procs) + (cond ((null? procs) + (when (pair? *broken-procedures*) + (printf "Breakpoints:~%~%") + (for-each (lambda (p) (printf " ~a~%" (cdr p))) *broken-procedures*)) ) + (else + (do-break procs) ) ) ) + +(define (unbreak . procs) + (cond ((null? procs) (do-unbreak-all)) + (else (do-unbreak procs)))) + +(define (continue #!optional (bp *last-breakpoint*)) + (cond (*last-breakpoint* + (let ((exn *last-breakpoint*)) + (set! *last-breakpoint* #f) + (break-resume exn) ) ) + (else (display "no breakpoint pending\n") ) ) ) + +(define c continue) + +(define (traced? proc) + (assq proc *traced-procedures*)) + +(define (trace/untrace . procs) + (for-each + (lambda (proc) + ((if (traced? proc) do-untrace do-trace) (list proc))) + procs)) + +(define (walk-module mname proc) + (let* ((m (##sys#find-module mname)) + (exps (nth-value 1 (##sys#module-exports m)))) + (for-each + (lambda (exp) + (let* ((realname (cdr exp)) + (prim (get realname '##core#primitive))) + (if prim + (warning "export is a core-library primitive - not traced" (car exp)) + (when (##sys#symbol-has-toplevel-binding? realname) + (let ((val (##sys#slot realname 0))) + (when (procedure? val) + (proc val))))))) + exps))) + +(define (trace-module . mnames) + (for-each + (lambda (mname) + (walk-module mname trace)) + mnames)) + +(define (untrace-module . mnames) + (for-each + (lambda (mname) + (walk-module + mname + (lambda (proc) + (when (traced? proc) + (do-untrace (list proc)))))) + mnames)) + +) ADDED utils/trace/trace.setup Index: utils/trace/trace.setup ================================================================== --- /dev/null +++ utils/trace/trace.setup @@ -0,0 +1,9 @@ +;;;; trace.setup -*- Scheme -*- + + +(compile -s trace.scm -O3 -d1 -j trace) +(compile -s trace.import.scm -O3 -d0) + +(install-extension + 'trace + '("trace.so" "trace.import.so")) ADDED zmq-transport.scm Index: zmq-transport.scm ================================================================== --- /dev/null +++ zmq-transport.scm @@ -0,0 +1,493 @@ + +;; Copyright 2006-2012, Matthew Welland. +;; +;; This program is made available under the GNU GPL version 2.0 or +;; greater. See the accompanying file COPYING for details. +;; +;; This program is distributed WITHOUT ANY WARRANTY; without even the +;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +;; PURPOSE. + +(require-extension (srfi 18) extras tcp s11n) + +(use sqlite3 srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) +(import (prefix sqlite3 sqlite3:)) + +(use zmq) + +(declare (unit zmq-transport)) + +(declare (uses common)) +(declare (uses db)) +(declare (uses tests)) +(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. +(declare (uses server)) + +(include "common_records.scm") +(include "db_records.scm") + +;; Transition to pub --> sub with pull <-- push +;; +;; 1. client sends request to server via push to the pull port +;; 2. server puts request in queue or processes immediately as appropriate +;; 3. server puts responses from completed requests into pub port +;; +;; TODO +;; +;; Done Tested +;; [x] [ ] 1. Add columns pullport pubport to servers table +;; [x] [ ] 2. Add rm of monitor.db if older than 11/12/2012 +;; [x] [ ] 3. Add create of pullport and pubport with finding of available ports +;; [x] [ ] 4. Add client compose of request +;; [x] [ ] - name of client: testname/itempath-test_id-hostname +;; [x] [ ] - name of request: callname, params +;; [x] [ ] - request key: f(clientname, callname, params) +;; [x] [ ] 5. Add processing of subscription hits +;; [x] [ ] - done when get key +;; [x] [ ] - return results +;; [x] [ ] 6. Add timeout processing +;; [x] [ ] - after 60 seconds +;; [ ] [ ] i. check server alive, connect to new if necessary +;; [ ] [ ] ii. resend request +;; [ ] [ ] 7. Turn self ping back on + +(define (zmq-transport:make-server-url hostport) + (if (not hostport) + #f + (conc "tcp://" (car hostport) ":" (cadr hostport)))) + +(define *server-loop-heart-beat* (current-seconds)) +(define *heartbeat-mutex* (make-mutex)) + +;;====================================================================== +;; S E R V E R +;;====================================================================== + +(define-inline (zmqsock:get-pub dat)(vector-ref dat 0)) +(define-inline (zmqsock:get-pull dat)(vector-ref dat 1)) +(define-inline (zmqsock:set-pub! dat s)(vector-set! dat s 0)) +(define-inline (zmqsock:set-pull! dat s)(vector-set! dat s 0)) + +(define (zmq-transport:run hostn) + (debug:print 2 "Attempting to start the server ...") + (if (not *toppath*) + (if (not (setup-for-run)) + (begin + (debug:print 0 "ERROR: cannot find megatest.config, cannot start server, exiting") + (exit)))) + (let* ((db (open-db)) ;; here we *do not* want to be opening and closing the db + (zmq-sdat1 #f) + (zmq-sdat2 #f) + (pull-socket #f) + (pub-socket #f) + (p1 #f) + (p2 #f) + (zmq-sockets-dat #f) + (iface (if (string=? "-" hostn) + "*" ;; (get-host-name) + hostn)) + (hostname (get-host-name)) + (ipaddrstr (let ((ipstr (if (string=? "-" hostn) + (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") + #f))) + (if ipstr ipstr hostname))) + (last-run 0)) + (set! zmq-sockets-dat (zmq-transport:setup-ports ipaddrstr (if (args:get-arg "-port") + (string->number (args:get-arg "-port")) + (+ 5000 (random 1001))))) + + (set! zmq-sdat1 (car zmq-sockets-dat)) + (set! pull-socket (cadr zmq-sdat1)) ;; (iface s port) + (set! p1 (caddr zmq-sdat1)) + + (set! zmq-sdat2 (cadr zmq-sockets-dat)) + (set! pub-socket (cadr zmq-sdat2)) + (set! p2 (caddr zmq-sdat2)) + + (set! *cache-on* #t) + + (set! *runremote* (vector pull-socket pub-socket)) ;; overloading the use of *runremote* BUG!? + + ;; what to do when we quit + ;; +;; (on-exit (lambda () +;; (if (and *toppath* *server-info*) +;; (open-run-close tasks:server-deregister-self tasks:open-db (car *server-info*)) +;; (let loop () +;; (let ((queue-len 0)) +;; (thread-sleep! (random 5)) +;; (mutex-lock! *incoming-mutex*) +;; (set! queue-len (length *incoming-data*)) +;; (mutex-unlock! *incoming-mutex*) +;; (if (> queue-len 0) +;; (begin +;; (debug:print-info 0 "Queue not flushed, waiting ...") +;; (loop)))))))) + + ;; The heavy lifting + ;; + ;; make-vector-record cdb packet client-sig qtype immediate query-sig params qtime + ;; + (debug:print-info 11 "Server setup complete, start listening for messages") + (let loop ((queue-lst '())) + (let* ((rawmsg (receive-message* pull-socket)) + (packet (db:string->obj rawmsg)) + (qtype (cdb:packet-get-qtype packet))) + (debug:print-info 12 "server=> received packet=" packet) + (if (not (member qtype '(sync ping))) + (begin + (mutex-lock! *heartbeat-mutex*) + (set! *last-db-access* (current-seconds)) + (mutex-unlock! *heartbeat-mutex*))) + (if #t ;; (cdb:packet-get-immediate packet) ;; process immediately or put in queue + (begin + (db:process-queue-item db packet) + ;; (open-run-close db:process-queue #f pub-socket (cons packet queue-lst)) + + (loop '())) + (loop (cons packet queue-lst))))))) + +;; run zmq-transport:keep-running in a parallel thread to monitor that the db is being +;; used and to shutdown after sometime if it is not. +;; +(define (zmq-transport:keep-running) + ;; if none running or if > 20 seconds since + ;; server last used then start shutdown + ;; This thread waits for the server to come alive + (let* ((server-info (let loop () + (let ((sdat #f)) + (mutex-lock! *heartbeat-mutex*) + (set! sdat *server-info*) + (mutex-unlock! *heartbeat-mutex*) + (if sdat sdat + (begin + (debug:print 12 "WARNING: server not started yet, waiting few seconds before trying again") + (sleep 4) + (loop)))))) + (iface (cadr server-info)) + (pullport (caddr server-info)) + (pubport (cadddr server-info)) ;; id interface pullport pubport) + ;; (zmq-sockets (zmq-transport:client-connect iface pullport pubport)) + (last-access 0)) + (debug:print-info 11 "heartbeat started for zmq server on " iface " " pullport " " pubport) + (let loop ((count 0)) + (thread-sleep! 4) ;; no need to do this very often + ;; NB// sync currently does NOT return queue-length + ;; GET REAL QUEUE LENGTH FROM THE VARIABLE + (let ((queue-len 0)) ;; FOR NOW DO NOT DO THIS (cdb:client-call zmq-sockets 'sync #t 1))) + ;; (print "Server running, count is " count) + (if (< count 1) ;; 3x3 = 9 secs aprox + (loop (+ count 1))) + + ;; NOTE: Get rid of this mechanism! It really is not needed... + (open-run-close tasks:server-update-heartbeat tasks:open-db (car server-info)) + + ;; (if ;; (or (> numrunning 0) ;; stay alive for two days after last access + (mutex-lock! *heartbeat-mutex*) + (set! last-access *last-db-access*) + (mutex-unlock! *heartbeat-mutex*) + (if (> (+ last-access + ;; (* 50 60 60) ;; 48 hrs + ;; 60 ;; one minute + ;; (* 60 60) ;; one hour + (* 45 60) ;; 45 minutes, until the db deletion bug is fixed. + ) + (current-seconds)) + (begin + (debug:print-info 2 "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) + (loop 0)) + (begin + (debug:print-info 0 "Starting to shutdown the server.") + ;; need to delete only *my* server entry (future use) + (set! *time-to-exit* #t) + (open-run-close tasks:server-deregister-self tasks:open-db (get-host-name)) + (thread-sleep! 1) + (debug:print-info 0 "Max cached queries was " *max-cache-size*) + (debug:print-info 0 "Server shutdown complete. Exiting") + (exit))))))) + +(define (zmq-transport:find-free-port-and-open iface s port stype #!key (trynum 50)) + (let ((s (if s s (make-socket stype))) + (p (if (number? port) port 5555)) + (old-handler (current-exception-handler))) + (handle-exceptions + exn + (begin + (debug:print 0 "Failed to bind to port " p ", trying next port") + (debug:print 0 " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn)) + ;; (old-handler) + ;; (print-call-chain) + (if (> trynum 0) + (zmq-transport:find-free-port-and-open iface s (+ p 1) trynum: (- trynum 1)) + (debug:print-info 0 "Tried ports up to " p + " but all were in use. Please try a different port range by starting the server with parameter \" -port N\" where N is the starting port number to use")) + (exit)) ;; To exit or not? That is the question. + (let ((zmq-url (conc "tcp://" iface ":" p))) + (debug:print 2 "Trying to start server on " zmq-url) + (bind-socket s zmq-url) + (list iface s port))))) + +(define (zmq-transport:setup-ports ipaddrstr startport) + (let* ((s1 (zmq-transport:find-free-port-and-open ipaddrstr #f startport 'pull)) + (p1 (caddr s1)) + (s2 (zmq-transport:find-free-port-and-open ipaddrstr #f (+ 1 (if p1 p1 (+ startport 1))) 'pub)) + (p2 (caddr s2))) + (set! *runremote* #f) + (debug:print 0 "Server started on " ipaddrstr " ports " p1 " and " p2) + (mutex-lock! *heartbeat-mutex*) + (set! *server-info* (open-run-close tasks:server-register + tasks:open-db + (current-process-id) + ipaddrstr p1 + 0 + 'live + 'zmq + pubport: p2)) + (debug:print-info 11 "*server-info* set to " *server-info*) + (mutex-unlock! *heartbeat-mutex*) + (list s1 s2))) + +(define (zmq-transport:mk-signature) + (message-digest-string (md5-primitive) + (with-output-to-string + (lambda () + (write (list (current-directory) + (argv))))))) + +;;====================================================================== +;; S E R V E R U T I L I T I E S +;;====================================================================== + +;;====================================================================== +;; C L I E N T S +;;====================================================================== + +;; +(define (zmq-transport:client-socket-connect iface port #!key (context #f)(type 'req)(subscriptions '())) + (debug:print-info 3 "client-connect " iface ":" port ", type=" type ", subscriptions=" subscriptions) + (let ((connect-ok #f) + (zmq-socket (if context + (make-socket type context) + (make-socket type))) + (conurl (zmq-transport:make-server-url (list iface port)))) + (if (socket? zmq-socket) + (begin + ;; first apply subscriptions + (for-each (lambda (subscription) + (debug:print 2 "Subscribing to " subscription) + (socket-option-set! zmq-socket 'subscribe subscription)) + subscriptions) + (connect-socket zmq-socket conurl) + zmq-socket) + (begin + (debug:print 0 "ERROR: Failed to open socket to " conurl) + #f)))) + +(define (zmq-transport:client-connect iface pullport pubport) + (let* ((push-socket (zmq-transport:client-socket-connect iface pullport type: 'push)) + (sub-socket (zmq-transport:client-socket-connect iface pubport + type: 'sub + subscriptions: (list (server:get-client-signature) "all"))) + (zmq-sockets (vector push-socket sub-socket)) + (login-res #f)) + (debug:print-info 11 "zmq-transport:client-connect started. Next is login") + (set! login-res (server:client-login zmq-sockets)) + (if (and (not (null? login-res)) + (car login-res)) + (begin + (debug:print-info 2 "Logged in and connected to " iface ":" pullport "/" pubport ".") + (set! *runremote* zmq-sockets) + zmq-sockets) + (begin + (debug:print-info 2 "Failed to login or connect to " conurl) + (set! *runremote* #f) + #f)))) + +;; run zmq-transport:keep-running in a parallel thread to monitor that the db is being +;; used and to shutdown after sometime if it is not. +;; +(define (zmq-transport:keep-running) + ;; if none running or if > 20 seconds since + ;; server last used then start shutdown + ;; This thread waits for the server to come alive + (let* ((server-info (let loop () + (let ((sdat #f)) + (mutex-lock! *heartbeat-mutex*) + (set! sdat *runremote*) + (mutex-unlock! *heartbeat-mutex*) + (if sdat sdat + (begin + (sleep 4) + (loop)))))) + (iface (car server-info)) + (port (cadr server-info)) + (last-access 0) + (tdb (tasks:open-db)) + (spid (tasks:server-get-server-id tdb #f iface port #f))) + (print "Keep-running got server pid " spid ", using iface " iface " and port " port) + (let loop ((count 0)) + (thread-sleep! 4) ;; no need to do this very often + ;; NB// sync currently does NOT return queue-length + (let () ;; (queue-len (cdb:client-call server-info 'sync #t 1))) + ;; (print "Server running, count is " count) + (if (< count 1) ;; 3x3 = 9 secs aprox + (loop (+ count 1))) + + ;; NOTE: Get rid of this mechanism! It really is not needed... + (tasks:server-update-heartbeat tdb spid) + + ;; (if ;; (or (> numrunning 0) ;; stay alive for two days after last access + (mutex-lock! *heartbeat-mutex*) + (set! last-access *last-db-access*) + (mutex-unlock! *heartbeat-mutex*) + (if (> (+ last-access + ;; (* 50 60 60) ;; 48 hrs + ;; 60 ;; one minute + ;; (* 60 60) ;; one hour + (* 45 60) ;; 45 minutes, until the db deletion bug is fixed. + ) + (current-seconds)) + (begin + (debug:print-info 2 "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) + (loop 0)) + (begin + (debug:print-info 0 "Starting to shutdown the server.") + ;; need to delete only *my* server entry (future use) + (set! *time-to-exit* #t) + (tasks:server-deregister-self tdb (get-host-name)) + (thread-sleep! 1) + (debug:print-info 0 "Max cached queries was " *max-cache-size*) + (debug:print-info 0 "Server shutdown complete. Exiting") + (exit))))))) + +;; all routes though here end in exit ... +(define (zmq-transport:launch) + (if (not *toppath*) + (if (not (setup-for-run)) + (begin + (debug:print 0 "ERROR: cannot find megatest.config, exiting") + (exit)))) + (debug:print-info 2 "Starting zmq server") + (if *toppath* + (let* (;; (th1 (make-thread (lambda () + ;; (let ((server-info #f)) + ;; ;; wait for the server to be online and available + ;; (let loop () + ;; (debug:print-info 2 "Waiting for the server to come online before starting heartbeat") + ;; (thread-sleep! 2) + ;; (mutex-lock! *heartbeat-mutex*) + ;; (set! server-info *server-info* ) + ;; (mutex-unlock! *heartbeat-mutex*) + ;; (if (not server-info)(loop))) + ;; (debug:print 2 "Server alive, starting self-ping") + ;; (zmq-transport:self-ping server-info) + ;; )) + ;; "Self ping")) + (th2 (make-thread (lambda () + (zmq-transport:run + (if (args:get-arg "-server") + (args:get-arg "-server") + "-"))) "Server run")) + ;; (th3 (make-thread (lambda ()(zmq-transport:keep-running)) "Keep running")) + ) + (set! *client-non-blocking-mode* #t) + ;; (thread-start! th1) + (thread-start! th2) + ;; (thread-start! th3) + (set! *didsomething* #t) + ;; (thread-join! th3) + (thread-join! th2) + ) + (debug:print 0 "ERROR: Failed to setup for megatest"))) + +(define (zmq-transport:client-signal-handler signum) + (handle-exceptions + exn + (debug:print " ... exiting ...") + (let ((th1 (make-thread (lambda () + (if (not *received-response*) + (receive-message* *runremote*))) ;; flush out last call if applicable + "eat response")) + (th2 (make-thread (lambda () + (debug:print 0 "ERROR: Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.") + (thread-sleep! 3) ;; give the flush three seconds to do it's stuff + (debug:print 0 " Done.") + (exit 4)) + "exit on ^C timer"))) + (thread-start! th2) + (thread-start! th1) + (thread-join! th2)))) + +(define (zmq-transport:client-launch) + (set-signal-handler! signal/int zmq-transport:client-signal-handler) + (if (zmq-transport:client-setup) + (debug:print-info 2 "connected as client") + (begin + (debug:print 0 "ERROR: Failed to connect as client") + (exit)))) + +;;====================================================================== +;; Defunct functions +;;====================================================================== + +;; ping a server and return number of clients or #f (if no response) +;; NOT IN USE! +(define (zmq-transport:ping host port #!key (secs 10)(return-socket #f)) + (cdb:use-non-blocking-mode + (lambda () + (let* ((res #f) + (th1 (make-thread + (lambda () + (let* ((zmq-context (make-context 1)) + (zmq-socket (zmq-transport:client-connect host port context: zmq-context))) + (if zmq-socket + (if (zmq-transport:client-login zmq-socket) + (let ((numclients (cdb:num-clients zmq-socket))) + (if (not return-socket) + (begin + (zmq-transport:client-logout zmq-socket) + (close-socket zmq-socket))) + (set! res (list #t numclients (if return-socket zmq-socket #f)))) + (begin + ;; (close-socket zmq-socket) + (set! res (list #f "CAN'T LOGIN" #f)))) + (set! res (list #f "CAN'T CONNECT" #f))))) + "Ping: th1")) + (th2 (make-thread + (lambda () + (let loop ((count 1)) + (debug:print-info 1 "Ping " count " server on " host " at port " port) + (thread-sleep! 2) + (if (< count (/ secs 2)) + (loop (+ count 1)))) + ;; (thread-terminate! th1) + (set! res (list #f "TIMED OUT" #f))) + "Ping: th2"))) + (thread-start! th2) + (thread-start! th1) + (handle-exceptions + exn + (set! res (list #f "TIMED OUT" #f)) + (thread-join! th1 secs)) + res)))) + +;; (define (zmq-transport:self-ping server-info) +;; ;; server-info: server-id interface pullport pubport +;; (let ((iface (list-ref server-info 1)) +;; (pullport (list-ref server-info 2)) +;; (pubport (list-ref server-info 3))) +;; (zmq-transport:client-connect iface pullport pubport) +;; (let loop () +;; (thread-sleep! 2) +;; (cdb:client-call *runremote* 'ping #t) +;; (debug:print 4 "zmq-transport:self-ping - I'm alive on " iface ":" pullport "/" pubport "!") +;; (mutex-lock! *heartbeat-mutex*) +;; (set! *server-loop-heart-beat* (current-seconds)) +;; (mutex-unlock! *heartbeat-mutex*) +;; (loop)))) + +(define (zmq-transport:reply pubsock target query-sig success/fail result) + (debug:print-info 11 "zmq-transport:reply target=" target ", result=" result) + (send-message pubsock target send-more: #t) + (send-message pubsock (db:obj->string (vector success/fail query-sig result)))) +