Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -1,6 +1,8 @@ # make install CSCOPTS='-accumulate-profile -profile-name $(PWD)/profile-ww$(shell date +%V.%u)' +# rm .o ; make install CSCOPTS='-profile' ; ... ; chicken-profile | less + PREFIX=$(PWD) CSCOPTS= INSTALL=install SRCFILES = common.scm items.scm launch.scm \ ods.scm runconfig.scm server.scm configf.scm \ Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -107,11 +107,11 @@ (define (api:execute-requests dbstruct dat) (handle-exceptions exn (let ((call-chain (get-call-chain))) (print-call-chain (current-error-port)) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (if (not (vector? dat)) ;; it is an error to not receive a vector (vector #f #f "remote must be called with a vector") (vector ;; return a vector + the returned data structure #t Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -1267,10 +1267,11 @@ (iup:treebox #:value 0 #:name "Runs" #:expand "YES" #:addexpanded "NO" + #:size "10x" #:selection-cb (lambda (obj id state) (debug:catch-and-dump (lambda () (let* ((run-path (tree:node->path obj id)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -204,11 +204,11 @@ ;; (if (file-exists? fname) ;; (let ((db (sqlite3:open-database fname))) ;; (sqlite3:set-busy-handler! db (make-busy-timeout 136000)) ;; (db:set-sync db) ;; (sqlite3:execute db "PRAGMA synchronous = 0;") ;; db) - (let* ((parent-dir (pathname-directory fname)) + (let* ((parent-dir (or (pathname-directory fname)(current-directory))) ;; no parent? go local (dir-writable (file-write-access? parent-dir)) (file-exists (file-exists? fname)) (file-write (if file-exists (file-write-access? fname) dir-writable ))) @@ -783,31 +783,31 @@ UPDATE run_stats SET last_update=(strftime('%s','now')) WHERE id=old.id; END;")) (define (db:cache-for-read-only source target) - (let* ((toppath (launch:setup)) - (cache-db (db:open-megatest-db path: target)) - (source-db (db:open-megatest-db path: source)) - (curr-time (current-seconds)) - (res '())) - (print source-db) - (begin - (if (not (file-exists? target)) - ((db:sync-tables (db:sync-main-list source-db) source-db cache-db) - (db:sync-tables db:sync-tests-only source-db cache-db) - (db:clean-up-rundb cache-db)) - ((sqlite3:for-each-row - (lambda (id release runname state status owner event_time comment fail_count pass_count ) - (set! res (cons (id release runname state status owner event_time comment fail_count pass_count ) res))) - (db:dbdat-get-db source-db) - "SELECT id, release, runname, state, status, owner, event_time, comment, fail_count, pass_count FROM runs;")) - ) - (print res) - (sqlite3:finalize! (db:dbdat-get-db cache-db)) - )) - ) + (let* ((toppath (launch:setup)) + (cache-db (db:open-megatest-db path: target)) + (source-db (db:open-megatest-db path: source)) + (curr-time (current-seconds)) + (res '())) + (print source-db) + (begin + (if (not (file-exists? target)) + ((db:sync-tables (db:sync-main-list source-db) source-db cache-db) + (db:sync-tables db:sync-tests-only source-db cache-db) + (db:clean-up-rundb cache-db)) + ((sqlite3:for-each-row + (lambda (id release runname state status owner event_time comment fail_count pass_count ) + (set! res (cons (id release runname state status owner event_time comment fail_count pass_count ) res))) + (db:dbdat-get-db source-db) + "SELECT id, release, runname, state, status, owner, event_time, comment, fail_count, pass_count FROM runs;")) + ) + (print res) + (sqlite3:finalize! (db:dbdat-get-db cache-db)) + )) + ) ;; options: ;; ;; 'killservers - kills all servers ;; 'dejunk - removes junk records @@ -3076,14 +3076,14 @@ (string-substitute (regexp "=") "_" (base64:base64-encode (z3:encode-buffer (with-output-to-string - (lambda ()(serialize obj))))) + (lambda ()(serialize obj))))) ;; BB: serialize - this is what causes problems between different builds of megatest communicating. serialize is sensitive to binary image of mtest. #t)) ((zmq nmsg)(with-output-to-string (lambda ()(serialize obj)))) - (else obj))) + (else obj))) ;; rpc (define (db:string->obj msg #!key (transport 'http)) (case transport ;; ((fs) msg) ((http fs) @@ -3096,11 +3096,11 @@ (lambda ()(deserialize))) (begin (debug:print-error 0 *default-log-port* "reception failed. Received " msg " but cannot translate it.") msg))) ;; crude reply for when things go awry ((zmq nmsg)(with-input-from-string msg (lambda ()(deserialize)))) - (else msg))) + (else msg))) ;; rpc (define (db:test-set-status-state dbstruct run-id test-id status state msg) (let ((dbdat (db:get-db dbstruct run-id))) (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) (db:general-call dbdat 'set-test-start-time (list test-id))) Index: docs/manual/megatest_manual.html ================================================================== --- docs/manual/megatest_manual.html +++ docs/manual/megatest_manual.html @@ -867,12 +867,83 @@

Road Map

-

Note 1: This road-map is tentative and subject to change without notice.

-

Note 2: Starting over. Old plan is commented out.

+

Note 1: This road-map is still evolving and subject to change without notice.

+
+

Architecture Refactor

+
+

Goals

+
    +
  1. +

    +Reduce load on the file system. Sqlite3 files on network filesystem can be + a burden. +

    +
  2. +
  3. +

    +Reduce number of servers and frequency of start/stop. This is mostly an + issue of clutter but also a reduction in "moving parts". +

    +
  4. +
  5. +

    +Coalesce activities to a single home host where possible. Give the user + feedback that they have started the dashboard on a host other than the + home host. +

    +
  6. +
  7. +

    +Reduce number of processes involved in managing running tests. +

    +
  8. +
+
+
+

Changes Needed

+
    +
  1. +

    +ACID compliant db will be on /tmp and synced to megatest.db with a five + second max delay. +

    +
  2. +
  3. +

    +Read/writes to db for processes on homehost will go direct to /tmp + megatest.db file. +

    +
  4. +
  5. +

    +Read/wites fron non-homehost processes will go through one server. Bulk + reads (e.g. for dashboard or list-runs) will be cached on the current host + in /tmp and synced from the home megatest.db in the testsuite area. +

    +
  6. +
  7. +

    +Db syncs rely on the target db file timestame minus some margin. +

    +
  8. +
  9. +

    +Since bulk reads do not use the server we can switch to simple RPC for the + network transport. +

    +
  10. +
  11. +

    +Test running manager process extended to manage multiple running tests. +

    +
  12. +
+
+

Current Items

ww05 - migrate to inmem-db

    @@ -1359,10 +1430,21 @@
    [setup]
     nodot
    +
+
+

Dashboard settings

+
+
Runs tab buttons, font and size
+
+
[dashboard]
+btn-height x14
+btn-fontsz 10
+cell-width 60
+

Database settings

number (args:get-arg "-run-id"))))) + (string->number (args:get-arg "-run-id")))) + (transport-type (string->symbol (or (args:get-arg "-transport") "http")))) (if run-id (begin - (server:launch run-id) + (server:launch run-id transport-type) (set! *didsomething* #t)) (debug:print-error 0 *default-log-port* "server requires run-id be specified with -run-id"))) ;; Not a server? This section will decide how to communicate ;; @@ -764,10 +764,11 @@ (if (null? (lset-intersection equal? (hash-table-keys args:arg-hash) '("-list-servers" "-stop-server" + "-kill-server" "-show-cmdinfo" "-list-runs" "-ping"))) (if (launch:setup) (let ((run-id (and (args:get-arg "-run-id") @@ -786,18 +787,20 @@ ;; MAY STILL NEED THIS ;; (set! *megatest-db* (make-dbr:dbstruct path: *toppath* local: #t)))))))))) (if (or (args:get-arg "-list-servers") - (args:get-arg "-stop-server")) + (args:get-arg "-stop-server") + (args:get-arg "-kill-server")) (let ((tl (launch:setup))) (if tl (let* ((tdbdat (tasks:open-db)) (servers (tasks:get-all-servers (db:delay-if-busy tdbdat))) (fmtstr "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n") (servers-to-kill '()) - (killinfo (args:get-arg "-stop-server")) + (kill-switch (if (args:get-arg "-kill-server") "-9" "")) + (killinfo (or (args:get-arg "-stop-server") (args:get-arg "-kill-server") )) (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f)) (sid (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f))) (format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface:OutPort" "InPort" "LastBeat" "State" "Transport") (format #t fmtstr "==" "=====" "===" "====" "=================" "======" "========" "=====" "=========") (for-each @@ -827,12 +830,12 @@ (format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update (if status "alive" "dead") transport) (if (or (equal? id sid) (equal? sid 0)) ;; kill all/any (begin - (debug:print-info 0 *default-log-port* "Attempting to stop server with pid " pid) - (tasks:kill-server status hostname pullport pid transport))))) + (debug:print-info 0 *default-log-port* "Attempting to kill "kill-switch" server with pid " pid) + (tasks:kill-server hostname pid kill-switch: kill-switch))))) servers) (debug:print-info 1 *default-log-port* "Done with listservers") (set! *didsomething* #t) (exit)) ;; must do, would have to add checks to many/all calls below (exit)))) Index: rpc-transport.scm ================================================================== --- rpc-transport.scm +++ rpc-transport.scm @@ -38,32 +38,34 @@ ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (rpc-transport:launch run-id) - (set! *run-id* run-id) - (if (args:get-arg "-daemonize") - (daemon:ize)) - (if (server:check-if-running run-id) - (begin - (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") - (exit 0))) - (let loop ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id)) - (remtries 4)) - (if (not server-id) - (if (> remtries 0) - (begin - (thread-sleep! 2) - (loop (open-run-close tasks:server-lock-slot tasks:open-db run-id) - (- remtries 1))) - (begin - ;; since we didn't get the server lock we are going to clean up and bail out - (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") - (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db " rpc-transport:launch"))) - (begin - (rpc-transport:run (if (args:get-arg "-server")(args:get-arg "-server") "-") run-id server-id) - (exit))))) + (let* ((tdbdat (tasks:open-db))) + (BB> "rpc-transport:launch fired for run-id="run-id) + (set! *run-id* run-id) + (if (args:get-arg "-daemonize") + (daemon:ize)) + (if (server:check-if-running run-id) + (begin + (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") + (exit 0))) + (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)) + (remtries 4)) + (if (not server-id) + (if (> remtries 0) + (begin + (thread-sleep! 2) + (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id) + (- remtries 1))) + (begin + ;; since we didn't get the server lock we are going to clean up and bail out + (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") + (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " rpc-transport:launch"))) + (begin + (rpc-transport:run (if (args:get-arg "-server")(args:get-arg "-server") "-") run-id server-id) + (exit)))))) (define (rpc-transport:run hostn run-id server-id) (debug:print 2 *default-log-port* "Attempting to start the rpc server ...") ;; (trace rpc:publish-procedure!) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1324,11 +1324,11 @@ ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) (thread-sleep! 5) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) (prev-num-running 0)) - ;; (debug:print 0 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running) + (BB> "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup *configdat* "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -47,16 +47,17 @@ ;; all routes though here end in exit ... ;; ;; start_server ;; -(define (server:launch run-id) - (case *transport-type* +(define (server:launch run-id transport-type) + (BB> "server:launch fired for run-id="run-id" transport-type="transport-type) + (case transport-type ((http)(http-transport:launch run-id)) ;;((nmsg)(nmsg-transport:launch run-id)) ((rpc) (rpc-transport:launch run-id)) - (else (debug:print-error 0 *default-log-port* "unknown server type " *transport-type*)))) + (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type)))) ;; (else (debug:print-error 0 *default-log-port* "No known transport set, transport=" transport ", using rpc") ;; (rpc-transport:launch run-id))))) ;;====================================================================== ;; S E R V E R U T I L I T I E S @@ -101,10 +102,11 @@ result))) ;; Given a run id start a server process ### NOTE ### > file 2>&1 ;; if the run-id is zero and the target-host is set ;; try running on that host +;; incidental: rotate logs in logs/ dir. ;; (define (server:run run-id) (let* ((curr-host (get-host-name)) (curr-ip (server:get-best-guess-address curr-host)) (target-host (configf:lookup *configdat* "server" "homehost" )) @@ -116,12 +118,15 @@ "") " -m testsuite:" testsuite))) ;; (conc " >> " logfile " 2>&1 &"))))) (debug:print 0 *default-log-port* "INFO: Starting server (" cmdln ") as none running ...") (push-directory *toppath*) (if (not (directory-exists? "logs"))(create-directory "logs")) + ;; Rotate logs, logic: - ;; if > 500k and older than 1 week, remove previous compressed log and compress this log + ;; if > 500k and older than 1 week: + ;; remove previous compressed log and compress this log + ;; (directory-fold (lambda (file rem) (if (and (string-match "^.*.log" file) (> (file-size (conc "logs/" file)) 200000)) (let ((gzfile (conc "logs/" file ".gz"))) @@ -141,19 +146,20 @@ (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) (not (equal? curr-ip target-host))) (begin (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) + (setenv "TARGETHOST_LOGF" logfile) (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) ;; (system cmdln) (pop-directory))) -(define (server:get-client-signature) +(define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) *my-client-signature*))) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -229,10 +229,18 @@ (define (tasks:server-force-clean-run-record mdb run-id iface port tag) (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=? AND interface=? AND port=?;" (conc "defunct" tag) run-id iface port)) + +;; BB> adding missing func for --list-servers +(define (tasks:server-deregister mdb hostname #!key (pullport #f) (pid #f) (action #f)) ;;pullport pid: pid action: 'delete)) + (if (eq? action 'delete) + (sqlite3:execute mdb "DELETE FROM servers WHERE pid=? AND port=? AND hostname=?;" pid pullport hostname) + (sqlite3:execute mdb "UPDATE servers SET state='defunct', heartbeat=strftime('%s','now') WHERE hostname=? AND pid=?;" + hostname pid))) + (define (tasks:server-delete-records-for-this-pid mdb tag) (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE hostname=? AND pid=?;" (conc "defunct" tag) (get-host-name) (current-process-id))) (define (tasks:server-delete-record mdb server-id tag) @@ -423,15 +431,15 @@ run-id) (reverse res))) ;; no elegance here ... ;; -(define (tasks:kill-server hostname pid) +(define (tasks:kill-server hostname pid #!key (kill-switch "")) (debug:print-info 0 *default-log-port* "Attempting to kill server process " pid " on host " hostname) (setenv "TARGETHOST" hostname) (setenv "TARGETHOST_LOGF" "server-kills.log") - (system (conc "nbfake kill " pid)) + (system (conc "nbfake kill "kill-switch" "pid)) (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")) ;; look up a server by run-id and send it a kill, also delete the record for that server ;; Index: utils/viewscreen ================================================================== --- utils/viewscreen +++ utils/viewscreen @@ -6,14 +6,14 @@ fi if [[ $(screen -list | egrep 'Attached|Detached'|awk '{print $1}') == "" ]];then # echo "No screen found for displaying to. Run \"screen\" in an xterm" # exit 1 - xterm -e screen -e^rr & + xterm -e screen -e^ff & sleep 1 screen -X hardstatus off screen -X hardstatus alwayslastline screen -X hardstatus string '%{= kG}[ %{G}%H %{g}][%= %{= kw}%?%-Lw%?%{r}(%{W}%n*%f%t%?(%u)%?%{r})%{w}%?%+Lw%?%?%= %{g}][%{B} %m-%d %{W} %c %{g}]' fi cmd="cd $PWD;$*" -screen -X screen bash -c "$cmd;echo \"Press any key to continue, ctrl-r to see other windows\";bash -c 'read -n 1 -s'" +screen -X screen bash -c "$cmd;echo \"Press any key to continue, ctrl-f to see other windows\";bash -c 'read -n 1 -s'"