Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -49,10 +49,11 @@
     test-get-paths-matching-keynames-target-new
     get-prereqs-not-met
     get-count-tests-running-for-run-id
     get-run-info
     get-run-status
+    get-run-state
     get-run-stats
     get-run-times
     get-targets
     get-target
     ;; register-run
@@ -158,10 +159,15 @@
             (params            (vector-ref dat 1))
             (start-t           (current-milliseconds))
             (readonly-mode     (dbr:dbstruct-read-only dbstruct))
             (readonly-command  (member cmd api:read-only-queries))
             (writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))
+            (foo               (begin
+                                 (common:telemetry-log (conc "api-in:"(->string cmd))
+                                                       payload: `((params . ,params)))
+                                 
+                                 #t))
             (res    
              (if writecmd-in-readonly-mode
                  (conc "attempt to run write command "cmd" on a read-only database")
                  (case cmd
                    ;;===============================================
@@ -281,11 +287,13 @@
 		   ((get-test-times)                  (apply db:get-test-times dbstruct params))
 
                    ;; RUNS
                    ((get-run-info)                 (apply db:get-run-info dbstruct params))
                    ((get-run-status)               (apply db:get-run-status dbstruct params))
+                   ((get-run-state)                (apply db:get-run-state dbstruct params))
                    ((set-run-status)               (apply db:set-run-status dbstruct params))
+                   ((set-run-state-status)  			 (apply db:set-run-state-status dbstruct params))
                    ((get-tests-for-run)            (apply db:get-tests-for-run dbstruct params))
                    ((get-test-id)                  (apply db:get-test-id dbstruct params))
                    ((get-tests-for-run-mindata)    (apply db:get-tests-for-run-mindata dbstruct params))
                    ((get-tests-for-runs-mindata)   (apply db:get-tests-for-runs-mindata dbstruct params))
                    ((get-runs)                     (apply db:get-runs dbstruct params))
@@ -330,19 +338,28 @@
                    ;; TASKS 
                    ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))
 		   (else
 		    (debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
 		    (conc "ERROR: BAD api call " cmd))))))
+
        
        ;; save all stats
        (let ((delta-t (- (current-milliseconds)
 			 start-t)))
 	 (hash-table-set! *db-api-call-time* cmd
 			  (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
        (if writecmd-in-readonly-mode
-	   (vector #f res)
-           (vector #t res)))))))
+           (begin ;; write command refused on a read-only database: log and return a failure
+             (common:telemetry-log (conc "api-out:"(->string cmd))
+                                   payload: `((params . ,params)
+                                              (ok-res . #f))) ;; agrees with the #f flag in the vector returned below
+	     (vector #f res))
+           (begin ;; every other case: log and return an ok result
+             (common:telemetry-log (conc "api-out:"(->string cmd))
+                                   payload: `((params . ,params)
+                                              (ok-res . #t))) ;; agrees with the #t flag in the vector returned below
+             (vector #t res))))))))
 
 ;; http-server  send-response
 ;;                 api:process-request
 ;;                    db:*
 ;;

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -17,11 +17,11 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;======================================================================
 
 (use srfi-1 data-structures posix regex-case (prefix base64 base64:)
-     format dot-locking csv-xml z3 ;; sql-de-lite
+     format dot-locking csv-xml z3 udp ;; sql-de-lite
      hostinfo md5 message-digest typed-records directory-utils stack
      matchable regex posix (srfi 18) extras ;; tcp 
      (prefix nanomsg nmsg:)
      (prefix sqlite3 sqlite3:)
      pkts (prefix dbi dbi:)
@@ -81,10 +81,11 @@
     (length (glob (conc "/proc/" pid "/fd/*")))
     (length  (filter identity (map socket? (glob (conc "/proc/" pid "/fd/*")))))
   )
 )
 
+  
 
 ;; GLOBALS
 
 ;; CONTEXTS
 (defstruct cxt
@@ -887,10 +888,11 @@
 	    (debug:print-info 13 *default-log-port* "watchdog done."))
 	  (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))))
 
 
 (define (std-exit-procedure)
+  ;;(common:telemetry-log-close)
   (on-exit (lambda () 0))
   ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
   (let ((no-hurry  (if *time-to-exit* ;; hurry up
 		       #f
 		       (begin
@@ -1591,22 +1593,25 @@
      (with-output-to-file fullpath (lambda ()(pp dat))))))
 
 ;; get cpu load by reading from /proc/loadavg, return all three values
 ;;
 (define (common:get-cpu-load remote-host)
-  (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
-    (or (common:get-cached-info actual-hostname "cpu-load")
-	(let ((result (if remote-host
-			  (map (lambda (res)
-				 (if (eof-object? res) 9e99 res))
-			       (with-input-from-pipe 
-				   (conc "ssh " remote-host " cat /proc/loadavg")
+  (handle-exceptions
+   exn
+   '(99 99 99) ;; fallback load triple returned when anything in the body below raises
+   (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
+     (or (common:get-cached-info actual-hostname "cpu-load") ;; a cached value, when present, wins over a fresh read
+	 (let ((result (if remote-host
+			   (map (lambda (res)
+				  (if (eof-object? res) 9e99 res)) ;; an EOF from the remote read is replaced by a huge load value
+			        (with-input-from-pipe 
+				 (conc "ssh " remote-host " cat /proc/loadavg")
 				 (lambda ()(list (read)(read)(read)))))
-			  (with-input-from-file "/proc/loadavg" 
-			    (lambda ()(list (read)(read)(read)))))))
-	  (common:write-cached-info actual-hostname "cpu-load" result)
-	  result))))
+			   (with-input-from-file "/proc/loadavg" 
+			     (lambda ()(list (read)(read)(read)))))))
+	   (common:write-cached-info actual-hostname "cpu-load" result) ;; store the fresh reading under this hostname
+	                     result)))))
 
 ;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads
 ;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc.
 ;;  keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load
 ;;
@@ -3049,5 +3054,69 @@
            exn
            #t ;; just ignore it, it might have died in the meantime so joining it will throw an exception
            (thread-join! thread))
            )))
    (hash-table-keys *common:thread-punchlist*)))
+
+(define *common:telemetry-log-state* 'startup)
+(define *common:telemetry-log-socket* #f)
+;; Open the telemetry UDP socket once; the state moves from 'startup to 'open, 'not-needed or 'broken.
+(define (common:telemetry-log-open)
+  ;; Only act while still in 'startup; in any other state this procedure does nothing.
+  (if (eq? *common:telemetry-log-state* 'startup)
+      ;; The destination is looked up in the "telemetry" section (host / port) of *configdat*.
+      (let* ((serverhost (configf:lookup *configdat* "telemetry" "host"))
+             (serverport (configf:lookup-number *configdat* "telemetry" "port")))
+        (set! *common:telemetry-log-state*
+              (handle-exceptions
+               exn
+               (begin
+                 (debug:print-info 0 *default-log-port* "common-telemetry-log open udp port failure")
+                 'broken)
+               (if (and serverhost serverport)
+                   (let* ((s (udp-open-socket)))
+                     ;;(udp-bind! s #f 0)
+                     (udp-connect! s serverhost serverport)
+                     (set! *common:telemetry-log-socket* s)
+                     'open)
+                   'not-needed))))))
+;; Send one telemetry datagram for EVENT; PAYLOAD is keyword-only, i.e. pass it as  payload: <datum>.
+(define (common:telemetry-log event #!key (payload '()))
+  (if (eq? *common:telemetry-log-state* 'startup)
+      (common:telemetry-log-open))
+  ;; Nothing is sent unless the state is 'open.
+  (if (eq? 'open *common:telemetry-log-state*)
+      (handle-exceptions
+       exn
+       (begin
+         (debug:print-info 0 *default-log-port* "common-telemetry-log comms failure ; disabled (no server?)")
+         ;;(define *common:telemetry-log-state* 'broken-or-no-server-preclose)
+         ;;(common:telemetry-log-close)
+         (set! *common:telemetry-log-state* 'broken-or-no-server) ;; set!, not define: the global state must change so later calls skip sending
+         (set! *common:telemetry-log-socket* #f)
+         )
+       (if (and *common:telemetry-log-socket* event) ;; TODO - filter on event against telemetry.want-events
+           (let* ((user (or (get-environment-variable "USER") "unknown"))
+                  (host (or (get-environment-variable "HOST") "unknown"))
+                  (start (conc "[megatest "event"]"))
+                  (toppath (or *toppath* "/dev/null"))
+                  (payload-serialized ;; pretty-printed text of payload, z3-encoded, then base64-encoded
+                   (base64:base64-encode
+                    (z3:encode-buffer
+                     (with-output-to-string (lambda () (pp payload))))))
+                  (msg     (conc user":"host":"start":"(current-process-id)":"(car (argv))":" ;; colon-separated record
+                                 toppath":"payload-serialized)))
+             (udp-send *common:telemetry-log-socket* msg))))))
+;; Close the telemetry socket if one may be open; the state ends as 'closed, or 'closed-fail if closing raises.
+(define (common:telemetry-log-close)
+  (when (or (member *common:telemetry-log-state* '(broken-or-no-server-preclose open)) *common:telemetry-log-socket*)
+    (handle-exceptions
+     exn
+     (begin
+       (set! *common:telemetry-log-state* 'closed-fail) ;; set!, not define: update the global state variable
+       (debug:print-info 0 *default-log-port* "common-telemetry-log closure failure")
+       )
+     (begin
+       (set! *common:telemetry-log-state* 'closed) ;; replaced by 'closed-fail in the handler if the close below raises
+       (udp-close-socket *common:telemetry-log-socket*)
+       (set! *common:telemetry-log-socket* #f)))))
+

Index: configf.scm
==================================================================
--- configf.scm
+++ configf.scm
@@ -774,14 +774,14 @@
     ht))
 
 ;; if 
 (define (configf:read-alist fname)
   (handle-exceptions
-      exn
-      #f
-    (configf:alist->config
-     (with-input-from-file fname read))))
+   exn
+   #f ;; any exception while reading or converting fname yields #f instead of propagating
+   (configf:alist->config
+    (with-input-from-file fname read)))) ;; a single datum is read from fname and handed to configf:alist->config
 
 (define (configf:write-alist cdat fname)
   (if (not (common:faux-lock fname))
       (debug:print 0 *default-log-port* "INFO: Could not get lock on " fname))
   (let* ((dat  (configf:config->alist cdat))

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -406,10 +406,13 @@
 
 ;; Open the classic megatest.db file (defaults to open in toppath)
 ;;
 ;;   NOTE: returns a dbdat not a dbstruct!
 ;;
+
+;;(define (db:reopen-megatest-db
+
 (define (db:open-megatest-db #!key (path #f)(name #f))
   (let* ((dbdir        (or path *toppath*))
          (dbpath       (conc  dbdir "/" (or name "megatest.db")))
 	 (dbexists     (common:file-exists? dbpath))
 	 (db           (db:lock-create-open dbpath
@@ -1642,39 +1645,61 @@
 
 (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
   (let* ((incompleted '())
 	 (oldlaunched '())
 	 (toplevels   '())
-	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime")) ;; FIXME suspect test run time & deadtime are not well matched; resulting in COMPLETED/DEAD status of an a-ok running test
-	 (deadtime     (if (and deadtime-str
-				(string->number deadtime-str))
-			   (string->number deadtime-str)
-			   7200))) ;; two hours
+         (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime")))
+         (server-start-allowance 200)
+         (server-overloaded-budget 200)
+         (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30))
+         (launch-monitor-on-time-budget 30)
+         (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget))
+         (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30))
+         (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default))
+         (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))
+         (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period)
+         )
     (db:with-db 
      dbstruct #f #f
      (lambda (db)
-       (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
-       
        ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
        ;;
        ;; HOWEVER: this code in run:test seems to work fine
        ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
        ;;                     (db:test-get-run_duration testdat)))
        ;;                    600) 
        ;; (db:delay-if-busy dbdat)
        (sqlite3:for-each-row 
-        (lambda (test-id run-dir uname testname item-path)
+        (lambda (test-id run-dir uname testname item-path event-time run-duration)
+          (if (and (equal? uname "n/a")
+                   (equal? item-path "")) ;; this is a toplevel test
+              ;; what to do with toplevel? call rollup?
+              (begin
+                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+              (begin
+                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
+                (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration))))
+        db
+        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');"
+        run-id running-deadtime)
+
+       
+       (sqlite3:for-each-row 
+        (lambda (test-id run-dir uname testname item-path event-time run-duration)
           (if (and (equal? uname "n/a")
                    (equal? item-path "")) ;; this is a toplevel test
               ;; what to do with toplevel? call rollup?
               (begin
                 (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
                 (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
-              (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
+              (begin
+                (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)
+                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
         db
-        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');"
-        run-id deadtime)
+        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');"
+        run-id remotehoststart-deadtime)
 
        ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
        ;;
        ;; (db:delay-if-busy dbdat)
        (sqlite3:for-each-row
@@ -1681,11 +1706,13 @@
         (lambda (test-id run-dir uname testname item-path)
           (if (and (equal? uname "n/a")
                    (equal? item-path "")) ;; this is a toplevel test
               ;; what to do with toplevel? call rollup?
               (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-              (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
+              (begin
+                (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id" 1 day since event_time marked")
+                (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))
         db
         "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
         run-id)
        
        (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
@@ -1702,15 +1729,15 @@
               ;;      		    incompleted))
               (min-incompleted-ids (map car incompleted)) ;; do 'em all
               (all-ids             (append min-incompleted-ids (map car oldlaunched))))
          (if (> (length all-ids) 0)
              (begin
-               (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE")
+               (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD")
                (for-each
                 (lambda (test-id)
-                  (db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test failed to complete")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS.  ref ticket 220546828
-
+                  (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))
+                  ;;(db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS.  ref ticket 220546828
                 all-ids))))))))
 
 ;; ALL REPLACED BY THE BLOCK ABOVE
 ;;
 ;; 	    (sqlite3:execute 
@@ -2625,10 +2652,24 @@
 	  (set! res status))
 	db
 	"SELECT status FROM runs WHERE id=?;" 
 	run-id)
        res))))
+;; Return the state column of the runs row whose id is run-id, or "n/a" when no row matches.
+(define (db:get-run-state dbstruct run-id)
+  (db:with-db
+   dbstruct #f #f
+   (lambda (db)
+     (let ((run-state "n/a"))
+       (sqlite3:for-each-row
+        (lambda (row-state)
+          (set! run-state row-state))
+        db
+        "SELECT state FROM runs WHERE id=?;"
+        run-id)
+       run-state))))
+
 
 ;;======================================================================
 ;; K E Y S
 ;;======================================================================
 
@@ -3008,11 +3049,11 @@
 (db:with-db
    dbstruct
    run-id
    #f
    (lambda (db)
-      (print "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=" run-id)  
+      ;(print "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=" run-id)  
      (sqlite3:first-result
       db
       "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=?;" run-id))))
 
 (define (db:get-count-tests-running-in-jobgroup dbstruct run-id jobgroup)
@@ -3699,11 +3740,11 @@
 	 (item-path    (db:test-get-item-path testdat))
          (tl-testdat   (db:get-test-info dbstruct run-id test-name ""))
          (tl-test-id   (if tl-testdat
 			   (db:test-get-id tl-testdat)
 			   #f)))
-    (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
+    (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) 
 	(db:general-call dbstruct 'set-test-start-time (list test-id)))
     (mutex-lock! *db-transaction-mutex*)
     (db:with-db
      dbstruct #f #f
      (lambda (db)
@@ -3787,11 +3828,11 @@
                                          "\n\n")
 
                         ;; NB// Pass the db so it is part of the transaction
          (list newstate newstatus)))
 
-(define (db:set-state-status-and-roll-up-run dbstruct run-id)
+(define (db:set-state-status-and-roll-up-run dbstruct run-id curr-state curr-status)
     (mutex-lock! *db-transaction-mutex*)
     (db:with-db
      dbstruct #f #f
      (lambda (db)
        (let ((tr-res
@@ -3800,12 +3841,12 @@
                (lambda ()
                    (let* ((state-status-counts  (db:get-all-state-status-counts-for-run dbstruct run-id))
 													(state-stauses (db:roll-up-rules state-status-counts #f #f ))
                           (newstate (car state-stauses))
                           (newstatus (cadr state-stauses))) 
-                    
-                   (db:set-run-state-status dbstruct run-id newstate newstatus ))))))
+                    (if (or (not (eq? newstate curr-state)) (not (eq?  newstatus curr-status)))
+                   (db:set-run-state-status dbstruct run-id newstate newstatus )))))))
          (mutex-unlock! *db-transaction-mutex*)
          tr-res))))
 
 
 (define (db:get-all-state-status-counts-for-run dbstruct run-id)

ADDED   docs/manual/devnotes.txt
Index: docs/manual/devnotes.txt
==================================================================
--- /dev/null
+++ docs/manual/devnotes.txt
@@ -0,0 +1,37 @@
+Developer Notes
+---------------
+
+Collected here are some topics that may interest the megatest developer.
+
+telemetry
+~~~~~~~~~
+
+A new feature introduced in v1.6525 provides a centralized debug messaging system.  Debugging client-server issues
+is greatly aided by a centralized, time-coherent log of events across test execution, server, and runner.  This
+is provided by the telemetry feature.
+
+
+source code call example
+
+
+[source,ini]
+ [telemetry]
+ host <IP where telemetry daemon will run>
+ port <UDP port to listen -- we recommend 5929>
+ want-events <comma separated list of telemetry-log keywords>
+
+[source,ini]
+ [telemetry]
+ host 10.38.32.91
+ port 5929
+ want-events ALL
+
+[source,scheme]
+(common:telemetry-log <keyword string> payload: <data to attach>)
+
+
+Usage:
+1. Add a telemetry section to megatest.config
+2. Start the telemetry daemon: telemetry-daemon -a start -l /tmp/my-telemetry.log
+3. Run megatest
+4. Examine / parse the telemetry log

Index: docs/manual/megatest_manual.html
==================================================================
--- docs/manual/megatest_manual.html
+++ docs/manual/megatest_manual.html
@@ -900,69 +900,10 @@
 sqlite3 database. Megatest has been used with the Intel Netbatch and
 lsf (also known as openlava) batch systems and it should be
 straightforward to use it with other similar systems.</p></div>
 </div>
 </div>
-</div>
-<div class="sect1">
-<h2 id="_overview">Overview</h2>
-<div class="sectionbody">
-<div class="sect2">
-<h3 id="_stand_alone_megatest_area">Stand-alone Megatest Area</h3>
-<div class="paragraph"><p>A single, stand-alone, Megatest based testsuite or "area" is
-sufficient for most validation, automation and build problems.</p></div>
-<div class="imageblock">
-<div class="content">
-<img src="megatest-stand-alone-area.png" alt="Static">
-</div>
-</div>
-<div class="paragraph"><p>Megatest is designed as a distributed or decoupled system. This means
-you can run the areas stand-alone with no additional
-infrastructure. I.e. there are no databases, web servers or other
-centralized resources needed. However as your needs grow you can
-integrate multiple areas into a bigger system.</p></div>
-<div class="sect3">
-<h4 id="_component_descriptions">Component Descriptions</h4>
-<div class="olist arabic"><ol class="arabic">
-<li>
-<p>
-Multi-area dashboard and xterm. A gui (the dashboard) is usually the
-  best option for controlling and launching runs but all operations
-  can also be done from the commandline. Note: The not yet released
-  multi-area dashboard replaces the old dashboard for browsing and
-  controlling runs but for managing a single area the old dashboard
-  works very well.
-</p>
-</li>
-<li>
-<p>
-Area/testsuite. This is your testsuite or automation definition and
-  consists of the information in megatest.config, runconfigs.config
-  and your testconfigs along with any custom scripting that can&#8217;t be
-  done with the native Megatest features.
-</p>
-</li>
-<li>
-<p>
-If your testsuite or build automation is too large to run on a
-  single instance you can distribute your jobs into a compute server
-  pool. The only current requirements are password-less ssh access and
-  a network filesystem.
-</p>
-</li>
-</ol></div>
-</div>
-</div>
-<div class="sect2">
-<h3 id="_full_system_architecture">Full System Architecture</h3>
-<div class="imageblock">
-<div class="content">
-<img src="megatest-system-architecture.png" alt="Static">
-</div>
-</div>
-</div>
-</div>
 </div>
 <div class="sect1">
 <h2 id="_installation">Installation</h2>
 <div class="sectionbody">
 <div class="sect2">
@@ -3033,10 +2974,10 @@
 </div>
 <div id="footnotes"><hr></div>
 <div id="footer">
 <div id="footer-text">
 Version 1.0<br>
-Last updated 2018-11-29 09:32:52 PST
+Last updated 2019-02-19 11:13:54 PST
 </div>
 </div>
 </body>
 </html>

Index: docs/megatest-training.odp
==================================================================
--- docs/megatest-training.odp
+++ docs/megatest-training.odp
cannot compute difference between binary files

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -377,14 +377,16 @@
 			     start-seconds)))))
 	 (kill-tries 0))
     ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
     ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
     (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
+
     (let loop ((minutes   (calc-minutes))
 	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
 	       (disk-free (get-df (current-directory)))
                (last-sync (current-seconds)))
+      (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
       (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
              (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                    (delta (abs (- load cpu-load))))
                               (if (> delta 0.1) ;; don't bother updating with small changes
                                   load
@@ -395,24 +397,43 @@
                                    (delta (abs (- df disk-free))))
                               (if (and (> df 0)
                                        (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                   df
                                   #f)))
-             (do-sync       (or new-cpu-load new-disk-free over-time)))
+             (do-sync       (or new-cpu-load new-disk-free over-time))
+
+             (test-info   (rmt:get-test-info-by-id run-id test-id))
+             (state       (db:test-get-state test-info))
+             (status      (db:test-get-status test-info))
+             (kill-reason  "no kill reason specified")
+             (kill-job?    #f))
+        (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
+        (cond
+         ((test-get-kill-request run-id test-id)
+          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
+          (set! kill-job? #t))
+         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
+          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
+          (set! kill-job? #t))
+         ((equal? status "DEAD")
+          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
+          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
+          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
+          (set! kill-job? #f)))
+
         (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
-	(set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat))
-			    (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
-						(time-exceeded (> run-seconds runtlim)))
-					   (if time-exceeded
-					       (begin
-						 (debug:print-info 0 *default-log-port* "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)
-						 #t)
-					       #f)))))
-        (if do-sync
-            (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
+        (launch:handle-zombie-tests run-id)
+        (when do-sync
+          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
+          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
+          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
+          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
+          (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
+        
 	(if kill-job? 
 	    (begin
+              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
 	      (mutex-lock! m)
 	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
 	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
 	      ;;       between tries?
 	      (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0))
@@ -442,17 +463,19 @@
 				  #f
 				  (process-signal pid-num signal/kill)))
 			       (process:get-sub-pids pid))))
 		       ;;    (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive"))))
 		       pids)
-		      (tests:test-set-status! run-id test-id "KILLED"  "KILLED" (args:get-arg "-m") #f))
+                      ;; BB: question to Matt -- does the tests:test-state-status! encompass rollup to toplevel?  If not, should it?
+		      (tests:test-set-status! run-id test-id "KILLED"  "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt
 		    (begin
 		      (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2)
-		      (tests:test-set-status! run-id test-id "KILLED"  "FAILED TO KILL" (args:get-arg "-m") #f)
+		      (tests:test-set-status! run-id test-id "KILLED"  "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt
 		      )))
 	      (mutex-unlock! m)
-	      ;; no point in sticking around. Exit now.
+	      ;; no point in sticking around. Exit now. But run end of run before exiting?
+        (launch:end-of-run-check run-id)
 	      (exit)))
 	(if (hash-table-ref/default misc-flags 'keep-going #f)
 	    (begin
 	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
 	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
@@ -573,18 +596,19 @@
 	  (let ((sighand (lambda (signum)
 			   ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting
 			   (if (eq? signum signal/stop)
 			       (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting."))
 			   (set! *time-to-exit* #t)
-			   (print "Received signal " signum ", cleaning up before exit. Please wait...")
+			   (print "Received signal " signum ", cleaning up before exit (set this test to COMPLETED/ABORT) . Please wait...")
 			   (let ((th1 (make-thread (lambda ()
-						     (rmt:test-set-state-status run-id test-id "INCOMPLETE" "KILLED" #f)
+                                                     (print "set test to COMPLETED/ABORT begin.")
+						     (rmt:test-set-state-status run-id test-id "COMPLETED" "ABORT" "received kill signal")
+                                                     (print "set test to COMPLETED/ABORT complete.")
 						     (print "Killed by signal " signum ". Exiting")
-						     (thread-sleep! 1)
 						     (exit 1))))
 				 (th2 (make-thread (lambda ()
-						     (thread-sleep! 2)
+						     (thread-sleep! 20)
 						     (debug:print 0 *default-log-port* "Done")
 						     (exit 4)))))
 			     (thread-start! th2)
 			     (thread-start! th1)
 			     (thread-join! th2)))))
@@ -605,25 +629,28 @@
 	    (cond
              ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.
 	     ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
 	      (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
 	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
-	      (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
+
+              (rmt:general-call 'set-test-start-time #f test-id)
+              (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
 	      ) ;; prime it for running
 	     ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART"))
 	      (if (process:alive-on-host? test-host test-pid)
 		  (debug:print-error 0 *default-log-port* "test state is "  (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed")
 		  (exit)))
 	     ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")))
 	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
+              (rmt:general-call 'set-test-start-time #f test-id)
 	      (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
 	      )
 	     (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))
 	      (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed")
 	      (exit))))
 	  
-	  (debug:print 2 *default-log-port* "Exectuing " test-name " (id: " test-id ") on " (get-host-name))
+	  (debug:print 2 *default-log-port* "Executing " test-name " (id: " test-id ") on " (get-host-name))
 	  (set! keys       (rmt:get-keys))
 	  ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process
 	  ;; one of these is defunct/redundant ...
 	  (if (not (launch:setup force-reread: #t))
 	      (begin
@@ -835,34 +862,38 @@
 ;; 0 RUNNING ==> this is actually the first condition, should not get here
 
 (define (launch:end-of-run-check run-id )
     (let*	((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
            (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
-           (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id))))
-     ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing  
+           (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
+           (current-state (rmt:get-run-state run-id))
+           (current-status (rmt:get-run-status run-id)))
+     ;; Query the metadata table (rmt:get-var) to check whether everything has completed: only if all-test-launched = "yes" does not-completed-cnt = 0 mean everything is completed; if no entry is found in the table, do nothing.
+     (debug:print 0 *default-log-port* "rollup run state/status")                      
+     (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
+ 
      (cond 
        ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" ))
-           	(debug:print 0 *default-log-port* "rollup run state/status")                      
-          	(rmt:set-state-status-and-roll-up-run  run-id)
-          	(debug:print 0 *default-log-port* "look for  post hook.")
+           	(debug:print 0 *default-log-port* "look for  post hook.")
           	(runs:run-post-hook run-id))
         ((> running-cnt 3) 
         	  (debug:print 0 *default-log-port* "There are " running-cnt " tests running." ))
         ((> running-cnt 0)
             (debug:print 0 *default-log-port* "running cnt > 0 but <= 3 kill-running-tests-if-dead" )
    				  (let ((kill-cnt (launch:kill-tests-if-dead run-id)))
            			(if (and all-test-launched  (equal? all-test-launched "yes") (eq? kill-cnt running-cnt))
            					(launch:end-of-run-check run-id)))) ;;todo
         (else  (debug:print 0 *default-log-port* "Should it get here?? May be everything is not launched yet. Running test cnt:" running-cnt " Not completed test cnt:" not-completed-cnt)
-         (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f)))
-       (let loop ((running-test (car running-tests))
-			     (tal    (cdr running-tests)))
+         (let* ((not-completed-tests (rmt:get-tests-for-run run-id "%" `("NOT_STARTED" "RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f)))
+       (if (> (length not-completed-tests) 0) 
+           (let loop ((running-test (car not-completed-tests))
+			     (tal    (cdr not-completed-tests)))
 		       (let* ((test-name (vector-ref running-test 2))
                  (item-path (vector-ref running-test 11)))
-			       	(debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed")
+			       	(debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed")
               (if (not (null? tal))
-				  (loop (car tal) (cdr tal))))))))))        
+				  (loop (car tal) (cdr tal)))))))))))        
  
 (define (launch:is-test-alive host pid)
 (if (and host pid (not (equal? host "n/a")))
 (let* ((cmd (conc "ssh " host " pstree -A " pid))
       (output (with-input-from-pipe cmd read-lines)))
@@ -1405,10 +1436,32 @@
 	    (begin
 	      (debug:print-error 0 *default-log-port* "Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries)
 	      ;; 
 	      (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1)))
 	    (list #f #f)))))
+
+
+;; Rate-limited zombie scan for run-id: calls rmt:find-and-mark-incomplete at most
+;; once per 2 x deadtime seconds, where deadtime is looked up as "setup"/"deadtime"
+;; in *configdat* (120 when unset). The time of the last scan is stored with
+;; rmt:set-var under "zombiescan-runid-<run-id>".
+(define (launch:handle-zombie-tests run-id)
+  (let* ((key       (conc "zombiescan-runid-"run-id))
+         (now       (current-seconds)) ;; sampled once so threshold and stored stamp agree
+         (deadtime  (or (configf:lookup-number *configdat* "setup" "deadtime") 120))
+         (threshold (- now (* 2 deadtime)))
+         (val       (rmt:get-var key)) ;; NOTE(review): assumed to be a number or #f -- confirm
+         ;; scan when no previous scan is recorded or the last one is older than threshold
+         (do-scan?  (or (not val) (< val threshold))))
+    (when do-scan?
+      (debug:print 1 *default-log-port* "INFO: search and mark zombie tests")
+      (rmt:set-var key now)
+      (rmt:find-and-mark-incomplete run-id #f))))
+
+
+
+
 
 ;; 1. look though disks list for disk with most space
 ;; 2. create run dir on disk, path name is meaningful
 ;; 3. create link from run dir to megatest runs area 
 ;; 4. remotely run the test on allocated host

Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -18,6 +18,6 @@
 ;; Always use two or four digit decimal
 ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
 
 (declare (unit megatest-version))
 
-(define megatest-version 1.6524)
+(define megatest-version 1.6525)

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -109,13 +109,14 @@
 
 Launching and managing runs
   -run                    : run all tests or as specified by -testpatt
   -remove-runs            : remove the data for a run, requires -runname and -testpatt
                             Optionally use :state and :status, use -keep-records to remove only
-                            the run data.
-  -kill-runs               : kill existing run(s) (all incomplete tests killed)
-  -kill-rerun              : kill an existing run (all incomplete tests killed and run is rerun)
+                            the run data. Use -kill-wait to override the 10 second
+                            per test wait after kill delay. 
+  -kill-runs              : kill existing run(s) (all incomplete tests killed)
+  -kill-rerun             : kill an existing run (all incomplete tests killed and run is rerun)
   -set-state-status X,Y   : set state to X and status to Y, requires controls per -remove-runs
   -rerun FAIL,WARN...     : force re-run for tests with specificed status(s)
   -rerun-clean            : set all tests not COMPLETED+PASS,WARN,WAIVED to NOT_STARTED,n/a
                             and then run the specified testpatt with -preclean
   -rerun-all              : set all tests to NOT_STARTED,n/a and run with -preclean
@@ -129,11 +130,11 @@
   -no-cache               : do not use the cached config files. 
   -one-pass               : launch as many tests as you can but do not wait for more to be ready
   -remove-keep N          : remove all but N most recent runs per target; use '-actions, -age, -precmd'
   -age <age>              : 120d,3h,20m to apply only to runs older than the 
                                  specified age. NB// M=month, m=minute
-  -actions <action>       : print,remove-runs,archive to specify action to take
+  -actions <action>[,...] : actions to take; print,remove-runs,archive,kill-runs
   -precmd                 : insert a wrapper command in front of the commands run
 
 Selectors (e.g. use for -runtests, -remove-runs, -set-state-status, -list-runs etc.)
   -target key1/key2/...   : run for key1, key2, etc.
   -reqtarg key1/key2/...  : run for key1, key2, etc. but key1/key2 must be in runconfigs
@@ -357,10 +358,11 @@
                         "-src-target"
                         "-src-runname"
                         "-diff-email"
 			"-sync-to"			
 			"-pgsync"
+			"-kill-wait"    ;; wait this long before removing test (default is 10 sec)
                         "-diff-html"
 			)
  		 (list  "-h" "-help" "--help"
 			"-manual"
 			"-version"
@@ -380,23 +382,25 @@
 			"-clean-cache"
 			"-no-cache"
 			"-cache-db"
                         "-use-db-cache"
                         "-prepend-contour"
+
 			;; misc
 			"-repl"
 			"-lock"
 			"-unlock"
 			"-list-servers"
 			"-kill-servers"
                         "-run-wait"      ;; wait on a run to complete (i.e. no RUNNING)
-			"-one-pass"       ;;
+			"-one-pass"      ;;
 			"-local"         ;; run some commands using local db access
-      "-generate-html"
-      "-generate-html-structure" 
+			"-generate-html"
+			"-generate-html-structure" 
 			"-list-run-time"
                         "-list-test-time"
+			
 			;; misc queries
 			"-list-disks"
 			"-list-targets"
 			"-list-db-targets"
 			"-show-runconfig"

Index: mt-pg.sql
==================================================================
--- mt-pg.sql
+++ mt-pg.sql
@@ -82,11 +82,11 @@
        tag_id   INTEGER DEFAULT 0,
        area_id  INTEGER DEFAULT 0,
        CONSTRAINT areatagconstraint UNIQUE (tag_id, area_id));
 
 CREATE VIEW area_tag_view as 
-select a.id as aid, t.id as tid,area_name,tag_name from areas as a inner join area_tags as at on at.area_id = a.id
+select a.id as aid, t.id as tid,area_name,tag_name,area_path from areas as a inner join area_tags as at on at.area_id = a.id
 inner join tags as t on t.id = at.tag_id  ;
 
 INSERT INTO areas (id,area_name,area_path) VALUES (0,'local','.');
 
 CREATE TABLE IF NOT EXISTS ttype (
@@ -309,15 +309,16 @@
 
 
 
 CREATE TABLE IF NOT EXISTS users_webviews(
  id      SERIAL  PRIMARY KEY   ,
- user_id   INTEGER NOT NULL,
- webview_id  INTEGER NOT NULL,
- deleted     INTEGER default 0,
- searchpattern TEXT Default '',
- web_page TEXT Default '',
+ user_id         INTEGER NOT NULL,
+ webview_id  		 INTEGER NOT NULL,
+ deleted     		 INTEGER default 0,
+ searchpattern 	 TEXT Default '',
+ web_page   		 TEXT Default '',
+ is_default      boolean default 'f',
  other_search_data TEXT Default ''
 );
 
 CREATE TABLE IF NOT EXISTS cctrl_info(
  id      SERIAL  PRIMARY KEY   ,

Index: mtut.scm
==================================================================
--- mtut.scm
+++ mtut.scm
@@ -828,11 +828,12 @@
 (define (contains list x) (cond ((null? list) #f) ((eq? (car list) x) #t) (else (contains (cdr list) x))))
 
 ;; collect all needed data and create run pkts for contours with changed inputs
 ;;
 (define (generate-run-pkts mtconf toppath)
-  (let ((std-runname (conc "sched"  (time->string (seconds->local-time (current-seconds)) "%M%H%d"))))
+  (let ((std-runname (conc "sched"  (time->string (seconds->local-time (current-seconds)) "%M%H%d")))
+        (packets-generated 0))
     (common:with-queue-db
      mtconf
      (lambda (pktsdirs pktsdir pdb)
        (let* ((rgconfdat (find-and-read-config (conc toppath "/runconfigs.config")))
 	      (rgconf    (car rgconfdat))
@@ -1011,13 +1012,14 @@
                          ;;(print "Areas: " all-areas)
                          (for-each 
                            (lambda (area) 
 ;Add code to check whether area is valid
 			     (if 
-                   (if (args:get-arg "-target") 
+                   ;; This code checks whether the target has been passed in via argument, and only runs the specified target
+                   (and (< packets-generated 4)  (if (args:get-arg "-target") 
                      (if (string= (args:get-arg "-target") runkey) (area-allowed? area "area-needs-to-be-run" runkey contour #f) #f) 
-                     (area-allowed? area "area-needs-to-be-run" runkey contour #f))
+                     (area-allowed? area "area-needs-to-be-run" runkey contour #f)))
        
 			     (let* ((script (car cmd))
 				(params (cdr cmd))
 				(cmd    (conc script " " contour " " area " " runkey " " std-runname " " action " " params))
 				(res    (handle-exceptions
@@ -1096,13 +1098,16 @@
 							(aval       (or (configf:lookup mtconf "areas" area) ""))
                                     			(aval-alist (common:val->alist aval))
 
 							(targets (map-targets mtconf aval-alist runkey area contour)))
                                         (pp targets)
-				        (for-each (lambda (target) (create-run-pkt mtconf action area runkey target new-runname mode-patt
+				        (for-each (lambda (target) 
+                                                     (create-run-pkt mtconf action area runkey target new-runname mode-patt
                                                       tag-expr pktsdir reason contour sched dbdest append
-                                                      runtrans)) targets)
+                                                      runtrans)
+                                                     (set! packets-generated (+ packets-generated 1))
+                                                  ) targets)
 					;; Add filter for targets
 
                                        ;;(create-run-pkt mtconf action area runkey target runname
                                        ;;               pktsdir reason contour dbdest append
                                        ;;               runtrans)

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -55,10 +55,15 @@
 
 ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
 ;;
 (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
 
+  (common:telemetry-log (conc "rmt:"(->string cmd))
+                        payload: `((rid . ,rid)
+                                   (params . ,params)))
+                          
+  
   ;;DOT digraph megatest_state_status {
   ;;DOT   ranksep=0;
   ;;DOT   // rankdir=LR;
   ;;DOT   node [shape="box"];
   ;;DOT "rmt:send-receive" -> MUTEXLOCK;
@@ -692,12 +697,12 @@
 ;; state and status are extra hints not usually used in the calculation
 ;;
 (define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment)
   (rmt:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment)))
 
-(define (rmt:set-state-status-and-roll-up-run run-id)
-  (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id)))
+(define (rmt:set-state-status-and-roll-up-run run-id state status)
+  (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id state status)))
 
 
 (define (rmt:update-pass-fail-counts run-id test-name)
   (rmt:general-call 'update-pass-fail-counts run-id test-name test-name test-name))
 
@@ -755,13 +760,21 @@
   (rmt:send-receive 'lock/unlock-run #f (list run-id lock unlock user)))
 
 ;; set/get status
 (define (rmt:get-run-status run-id)
   (rmt:send-receive 'get-run-status #f (list run-id)))
+
+;; send the 'get-run-state command (dispatched to db:get-run-state in api.scm) for run-id
+(define (rmt:get-run-state run-id)
+  (rmt:send-receive 'get-run-state #f (list run-id)))
 
 (define (rmt:set-run-status run-id run-status #!key (msg #f))
   (rmt:send-receive 'set-run-status #f (list run-id run-status msg)))
+
+;; send the 'set-run-state-status command (dispatched to db:set-run-state-status in api.scm) with run-id, state and status
+(define (rmt:set-run-state-status run-id state status )
+  (rmt:send-receive 'set-run-state-status #f (list run-id state status)))
 
 (define (rmt:update-run-event_time run-id)
   (rmt:send-receive 'update-run-event_time #f (list run-id)))
 
 (define (rmt:get-runs-by-patt  keys runnamepatt targpatt offset limit fields last-runs-update  #!key  (sort-order "asc")) ;; fields of #f uses default

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -450,10 +450,17 @@
 
     ;; register this run in monitor.db
     (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params)
     (rmt:tasks-set-state-given-param-key task-key "running")
 
+    (common:telemetry-log "run-tests"
+                          payload:
+                          `( (target . ,target)
+                             (run-name . ,runname)
+                             (test-patts . ,test-patts) ) )
+
+    
     ;; Now generate all the tests lists
     (set! all-tests-registry (tests:get-all))   ;; hash of testname => path-to-test
     (set! all-test-names     (hash-table-keys all-tests-registry))
     ;; filter first for allowed-tests (from -tagexpr) then for test-patts.
     (set! test-names         (tests:filter-test-names
@@ -515,10 +522,11 @@
     ;; run the run prehook if there are no tests yet run for this run:
     ;;
     (runs:run-pre-hook run-id)
     ;; mark all test launced flag as false in the meta table 
     (rmt:set-var (conc "lunch-complete-" run-id) "no")
+    (rmt:set-run-state-status run-id "new" "n/a")
     ;; now add non-directly referenced dependencies (i.e. waiton)
     ;;======================================================================
     ;; refactoring this block into tests:get-full-data
     ;;
     ;; What happended, this code is now duplicated in tests!?
@@ -2003,13 +2011,19 @@
 		       ((print)
 			(print " " (simple-run-runname run)
 			       " " (time->string (seconds->local-time (simple-run-event_time run)) "WW%V.%u %H:%M:%S")
 			       " " (if remove "REMOVE" "")))
 		       ((remove-runs)
-			(if remove (system (conc precmd " megatest -remove-runs -target " target " -runname " (simple-run-runname run) " -testpatt %"))))
+			(if remove (system (conc precmd " megatest -remove-runs -target " target " -runname " (simple-run-runname run) " -testpatt %"
+						 (if (member 'kill-runs actions) ;; if kill-runs is specified then set -kill-wait to 0
+						     " -kill-wait 0"
+						     "")))))
 		       ((archive)
-			(if remove (system (conc precmd " megatest -archive save-remove -target " target " -runname " (simple-run-runname run) " -testpatt %"))))))
+			(if remove (system (conc precmd " megatest -archive save-remove -target " target " -runname " (simple-run-runname run) " -testpatt %"))))
+		       ((kill-runs)
+			(if remove (system (conc precmd " megatest -kill-runs -target " target " -runname " (simple-run-runname run) " -testpatt %"))))
+			))
 		   actions))))
           sorted)))
      ;; (print "Sorted: " (map simple-run-event_time sorted))
      ;; (print "Remove: " (map simple-run-event_time to-remove))))
      (hash-table-keys runs-ht))
@@ -2137,11 +2151,11 @@
 		       (toplevel-retries (make-hash-table)) ;; try three times to loop through and remove top level tests
 		       (test-retry-time  (make-hash-table))
                        (backgrounded-remove-status     (make-hash-table))
                        (backgrounded-remove-last-visit (make-hash-table))
                        (backgrounded-remove-result     (make-hash-table))
-		       (allow-run-time   10)) ;; seconds to allow for killing tests before just brutally killing 'em
+		       (allow-run-time   (string->number (or (args:get-arg "-kill-wait") "10")))) ;; seconds to allow for killing tests before just brutally killing 'em
 		   (let loop ((test (car sorted-tests))
 			      (tal  (cdr sorted-tests)))
 		     (let* ((test-id       (db:test-get-id test))
 			    (new-test-dat  (rmt:get-test-info-by-id run-id test-id)))
 		       (if (not new-test-dat)
@@ -2155,14 +2169,16 @@
 				   ;; (rmt:sdb-qry 'getid 
 				   (db:test-get-rundir new-test-dat)) ;; )    ;; run dir is from the link tree
                                   (has-subrun    (and (subrun:subrun-test-initialized? run-dir)
                                                       (not (subrun:subrun-removed? run-dir))))
 				  (test-state    (db:test-get-state new-test-dat))
+                                  (test-status    (db:test-get-status new-test-dat))
 				  (test-fulln    (db:test-get-fullname new-test-dat))
 				  (uname         (db:test-get-uname    new-test-dat))
 				  (toplevel-with-children (and (db:test-get-is-toplevel test)
 							       (> (rmt:test-toplevel-num-items run-id test-name) 0))))
+                             
 			     (case action
 			       ((remove-runs)
 				;; if the test is a toplevel-with-children issue an error and do not remove
 				(cond
                                  (toplevel-with-children
@@ -2275,10 +2291,16 @@
                                  ((member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
                                   (debug:print 1 *default-log-port* "INFO: issuing killreq to test "test-fulln)
                                   (mt:test-set-state-status-by-id run-id (db:test-get-id test) "KILLREQ" "n/a" #f)
                                   (if (not (null? tal))
 				      (loop (car tal)(cdr tal))))
+                                 ((and (member test-status '("PREQ_FAIL" "PREQ_DISCARDED" "BLOCKED" "ZERO_ITEMS" "KEEP_TRYING" "TEN_STRIKES" "TIMED_OUT")))
+                                  (rmt:set-state-status-and-roll-up-items run-id (db:test-get-id test) 'foo "NOT_STARTED" "n/a" (conc "kill-run moved from "test-state":"test-status" to NOT_STARTED:n/a"))
+                                  ;;(mt:test-set-state-status-by-id run-id (db:test-get-id test) "NOT_STARTED" "n/a" (conc "kill-run moved from "test-state":"test-status" to NOT_STARTED:n/a"))
+                                  (if (not (null? tal))
+				      (loop (car tal)(cdr tal)))
+                                  )
                                  (else
                                   (if (not (null? tal))
 				      (loop (car tal)(cdr tal)))
                                   )))
 			       ((set-state-status)

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -528,10 +528,12 @@
 		(let* ((start-time (current-milliseconds))
                        (res (system sync-cmd)))
                   (cond
                    ((eq? 0 res)
 		    (delete-file* (conc mtdbfile ".backup"))
+                    (if (eq? 0 (file-size sync-log))
+                        (delete-file sync-log))
 		    (system (conc "/bin/mv " staging-file " " mtdbfile))
                     (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "(/ (- (current-milliseconds) start-time) 1000)" sec")
                     #t)
                    (else
                     (system (conc "/bin/cp "sync-log" "sync-log".fail"))

Index: tasks.scm
==================================================================
--- tasks.scm
+++ tasks.scm
@@ -993,10 +993,11 @@
 ;; get runs changed since last sync
 ;; (define (tasks:sync-test-data dbh cached-info area-info)
 ;;   (let* ((
 
 (define (tasks:sync-to-postgres configdat dest)
+  (print "In sync")
   (let* ((dbh         (pgdb:open configdat dbname: dest))
 	 (area-info   (pgdb:get-area-by-path dbh *toppath*))
 	 (cached-info (make-hash-table))
 	 (start       (current-seconds))
    (test-patt   (if (args:get-arg "-testpatt")
@@ -1014,15 +1015,17 @@
           (exit 1)))
      (if (and (not target)  run-name)
        (begin
 					(print "Error: Provide target")
           (exit 1)))
-
+    ;; NOTE: leftover debugging statements (print "123") and (exit 1) used to sit here;
+    ;; they ended the process before anything was synced, so they have been dropped.
     (for-each (lambda (dtype)
 		(hash-table-set! cached-info dtype (make-hash-table)))
 	      '(runs targets tests steps data))
     (hash-table-set! cached-info 'start start) ;; when done we'll set sync times to this
+    (print "here")
     (if area-info
 	(let* ((last-sync-time (vector-ref area-info 3))
 	       (smallest-last-update-time  (make-hash-table))
          (changed      (if (and target run-name)
                             (rmt:get-run-record-ids target run-name (rmt:get-keys) test-patt)
@@ -1035,10 +1038,11 @@
          (area-tag    (if (args:get-arg "-area-tag") 
                                  (args:get-arg "-area-tag")
                                  (if (args:get-arg "-area") 
                                    (args:get-arg "-area") 
                                    ""))))
+          (print "here2")
            (if (and (equal? area-tag "") (not (pgdb:is-area-taged dbh (vector-ref area-info 0))))
             (set! area-tag *default-area-tag*)) 
            (if (not (equal? area-tag "")) 
              (task:add-area-tag dbh area-info area-tag)) 
 	  (if (or (not (null? test-ids)) (not (null? run-ids)))

ADDED   telemetry-daemon
Index: telemetry-daemon
==================================================================
--- /dev/null
+++ telemetry-daemon
@@ -0,0 +1,265 @@
+#!/usr/bin/env python
+# -*- Mode: Python; -*- 
+## Tiny Syslog Server in Python.
+##
+## This is a tiny syslog server that is able to receive UDP based syslog
+## entries on a specified port and save them to a file.
+## That's it... it does nothing else...
+
+ 
+import os
+import sys, os, time, atexit
+from signal import SIGTERM
+import logging
+import logging.handlers
+import SocketServer
+import datetime
+from subprocess import call
+import argparse
+import os
+import socket
+
+## code to determine this host's IP on non-loopback interface
+if os.name != "nt":
+    import fcntl
+    import struct
+
+    def get_interface_ip(ifname):
+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        return socket.inet_ntoa(fcntl.ioctl(s.fileno(), 0x8915, struct.pack('256s',
+                                ifname[:15]))[20:24])
+
+def get_lan_ip():
+    ip = socket.gethostbyname(socket.gethostname())
+    if ip.startswith("127.") and os.name != "nt":
+        interfaces = [
+            "eth0",
+            "eth1",
+            "eth2",
+            "wlan0",
+            "wlan1",
+            "wifi0",
+            "ath0",
+            "ath1",
+            "ppp0",
+            ]
+        for ifname in interfaces:
+            try:
+                ip = get_interface_ip(ifname)
+                break
+            except IOError:
+                pass
+    return ip
+
+class Daemon(object):
+        """
+        A generic daemon class.
+       
+        Usage: subclass the Daemon class and override the run() method
+        """
+        def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'):
+                self.stdin = stdin
+                self.stdout = stdout
+                self.stderr = stderr
+                self.pidfile = pidfile
+       
+        def daemonize(self):
+                """
+                do the UNIX double-fork magic, see Stevens' "Advanced
+                Programming in the UNIX Environment" for details (ISBN 0201563177)
+                http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16
+                """
+                try:
+                        pid = os.fork()
+                        if pid > 0:
+                                # exit first parent
+                                sys.exit(0)
+                except OSError, e:
+                        sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror))
+                        sys.exit(1)
+       
+                # decouple from parent environment
+                os.chdir("/")
+                os.setsid()
+                os.umask(0)
+       
+                # do second fork
+                try:
+                        pid = os.fork()
+                        if pid > 0:
+                                # exit from second parent
+                                sys.exit(0)
+                except OSError, e:
+                        sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror))
+                        sys.exit(1)
+       
+                # redirect standard file descriptors
+                sys.stdout.flush()
+                sys.stderr.flush()
+                si = file(self.stdin, 'r')
+                so = file(self.stdout, 'a+')
+                se = file(self.stderr, 'a+', 0)
+                os.dup2(si.fileno(), sys.stdin.fileno())
+                os.dup2(so.fileno(), sys.stdout.fileno())
+                os.dup2(se.fileno(), sys.stderr.fileno())
+       
+                # write pidfile
+                atexit.register(self.delpid)
+                pid = str(os.getpid())
+                file(self.pidfile,'w+').write("%s\n" % pid)
+       
+        def delpid(self):
+                os.remove(self.pidfile)
+ 
+        def start(self):
+                """
+                Start the daemon
+                """
+                # Check for a pidfile to see if the daemon already runs
+                try:
+                        pf = file(self.pidfile,'r')
+                        pid = int(pf.read().strip())
+                        pf.close()
+                except IOError:
+                        pid = None
+       
+                if pid:
+                        message = "pidfile %s already exist. Daemon already running?\n"
+                        sys.stderr.write(message % self.pidfile)
+                        sys.exit(1)
+               
+                # Start the daemon
+                self.daemonize()
+                self.run()
+ 
+        def stop(self):
+                """
+                Stop the daemon: SIGTERM the pid from the pidfile until it has exited
+                """
+                import errno  # local import; only the ESRCH check below needs it
+                # Get the pid from the pidfile (missing or garbled file -> no pid)
+                try:
+                        with open(self.pidfile, 'r') as pf:
+                                pid = int(pf.read().strip())
+                except (IOError, ValueError):
+                        pid = None
+       
+                if not pid:
+                        message = "pidfile %s does not exist. Daemon not running?\n"
+                        sys.stderr.write(message % self.pidfile)
+                        return # not an error in a restart
+ 
+                # Keep signalling until os.kill() raises OSError
+                try:
+                        while 1:
+                                os.kill(pid, SIGTERM)
+                                time.sleep(0.1)
+                except OSError, err:
+                        # compare errno instead of searching the locale-dependent message
+                        if err.errno == errno.ESRCH:
+                                if os.path.exists(self.pidfile):
+                                        os.remove(self.pidfile)
+                        else:
+                                print str(err)
+                                sys.exit(1)
+ 
+        def restart(self):
+                """
+                Restart the daemon: stop() it (which just returns if no pidfile is found), then start()
+                """
+                self.stop()
+                self.start()
+ 
+        def run(self):
+                """
+                Override this method in a subclass of Daemon. start() calls it right after
+                self.daemonize(); this base version has no body beyond this docstring.
+                """
+
+# setup logging module so that the log can be moved aside and will reopen for append
+def log_setup(logfile):
+    """Send INFO-and-above records from the root logger to `logfile`, message text only."""
+    watched = logging.handlers.WatchedFileHandler(logfile)
+    # WatchedFileHandler is what lets the log be moved aside and reopened
+    watched.setFormatter(logging.Formatter('%(message)s',''))
+    root = logging.getLogger()
+    root.addHandler(watched)
+    root.setLevel(logging.INFO)
+
+
+class SyslogUDPHandler(SocketServer.BaseRequestHandler):
+	"""Log each received datagram as '<iso-timestamp>:<sender-ip>:<payload>'."""
+	def handle(self):
+		# self.request[0] is the raw datagram; the socket in [1] is not needed
+		data = bytes.decode(self.request[0].strip())
+		sender = self.client_address[0]
+		print "%s : %s" % (sender, data)  # statement form: print(a, b) shows a tuple repr in Python 2
+		logging.info(datetime.datetime.now().isoformat() + ":" + sender + ":" + str(data))
+
+
+
+class TelemetryLogDaemon(Daemon):
+  """Daemon subclass whose run() serves SyslogUDPHandler on a UDP socket, logging to `logfile`."""
+  def __init__(self, pidfile, logfile, server_ip, server_port):
+    self.logfile, self.server_ip, self.server_port = logfile, server_ip, server_port
+    super(TelemetryLogDaemon, self).__init__(pidfile)
+    
+  def run(self):
+    log_setup(self.logfile)
+    listen_on = (self.server_ip, int(self.server_port))
+    udp_server = SocketServer.UDPServer(listen_on, SyslogUDPHandler)
+    udp_server.serve_forever(poll_interval=0.5)
+
+
+def main():
+  """
+  Command-line entry point: parse options, then start, stop or restart the
+  telemetry log daemon, or serve in the foreground (nodaemon).
+  """
+  # PWD may be unset (cron, some init systems); fall back to the real working directory
+  default_log_file = (os.environ.get('PWD') or os.getcwd()) + "/telemetry.log"
+
+  parser = argparse.ArgumentParser(description = 'telemetry-daemon')
+  actions="start,restart,stop,nodaemon".split(",")
+  
+  parser.add_argument("-a", "--action", required=True, choices=actions, help="manage daemon: start, stop, restart, or nodaemon to serve in the foreground")
+  parser.add_argument("-p", "--server-port", default="5929", help="specify alternate udp port number, default is 5929")
+  parser.add_argument("-i", "--server-ip", default=get_lan_ip(), help="specify IP if heuristics to get local host lan ip fails")
+  parser.add_argument("-l", "--log-file", default=default_log_file, help="specify log file to write")
+  parser.add_argument("-z", "--pid-file", default=default_log_file + ".pidfile", help="specify pidfile")
+  opts = parser.parse_args()
+
+  tld = TelemetryLogDaemon(opts.pid_file, opts.log_file, opts.server_ip, opts.server_port)
+
+  # shown on start and restart; kept in one place so the two cannot drift apart
+  config_hint = """Example addition to megatest.config to enable telemetry:
+
+[telemetry]
+host %s
+port %s
+want-events ALL
+
+    """ % (opts.server_ip, opts.server_port)
+
+  if opts.action == "start":
+    print "Info: Starting server"
+    print config_hint
+    tld.start()
+  elif opts.action == "stop":
+    tld.stop()
+  elif opts.action == "restart":
+    print "Info: Restarting server"
+    print config_hint
+    tld.restart()
+  elif opts.action == "nodaemon":
+    # serve in the foreground: the same work as TelemetryLogDaemon.run(), without detaching
+    log_setup(opts.log_file)
+    server = SocketServer.UDPServer((opts.server_ip,int(opts.server_port)), SyslogUDPHandler)
+    server.serve_forever(poll_interval=0.5)
+  
+if __name__ == '__main__':  # run main() only when executed as a script, not on import
+  main()
+
+
+
+

Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -1517,11 +1517,11 @@
 ;; if .testconfig exists in test directory read and return it
 ;; else if have cached copy in *testconfigs* return it IFF there is a section "have fulldata"
 ;; else read the testconfig file
 ;;   if have path to test directory save the config as .testconfig and return it
 ;;
-(define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f)(allow-write-cache #t))
+(define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f)(allow-write-cache #t)(wait-a-minute #f))
   (let* ((use-cache    (common:use-cache?))
 	 (cache-path   (tests:get-test-path-from-environment))
 	 (cache-file   (and cache-path (conc cache-path "/.testconfig")))
 	 (cache-exists (and cache-file
 			    (not force-create)  ;; if force-create then pretend there is no cache to read
@@ -1545,13 +1545,37 @@
 	      dat
 	      ;; no cached data available
 	      (let* ((treg         (or test-registry
 				       (tests:get-all)))
 		     (test-path    (or (hash-table-ref/default treg test-name #f)
+                                       (let* ((local-tcdir (conc (getenv "MT_LINKTREE") "/"
+                                                                 (getenv "MT_TARGET") "/"
+                                                                 (getenv "MT_RUNNAME") "/"
+                                                                 test-name "/" item-path))
+                                              (local-tcfg (conc local-tcdir "/testconfig")))
+                                         (if (common:file-exists? local-tcfg)
+                                             local-tcdir
+                                             #f))
 				       (conc *toppath* "/tests/" test-name)))
 		     (test-configf (conc test-path "/testconfig"))
-		     (testexists   (and (common:file-exists? test-configf)(file-read-access? test-configf)))
+		     (testexists   (let loopa ((tries-left 30))
+                                     (cond
+                                      (
+                                       (and (common:file-exists? test-configf)(file-read-access? test-configf))
+                                       #t)
+                                      (
+                                       (common:file-exists? test-configf)
+                                       (debug:print 0 *default-log-port* "WARNING: Cannot read testconfig file: "test-configf)
+                                       #f)
+                                      (
+                                       (and wait-a-minute (> tries-left 0))
+                                       (thread-sleep! 10)
+                                       (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf" will retry in 10 seconds.  Tries left: "tries-left) ;; BB: this fires
+                                       (loopa (sub1 tries-left)))
+                                      (else
+                                       (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf) ;; BB: this fires
+                                       #f))))
 		     (tcfg         (if testexists
 				       (read-config test-configf #f system-allowed
 						    environ-patt: (if system-allowed
 								      "pre-launch-env-vars"
 								      #f))
@@ -1562,11 +1586,11 @@
 			 cache-file
 			 (file-write-access? cache-path)
 			 allow-write-cache)
 		    (let ((tpath (conc cache-path "/.testconfig")))
 		      (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath)
-                      (if (not (common:in-running-test?))
+                      (if (and tcfg (not (common:in-running-test?)))
                           (configf:write-alist tcfg tpath))))
 		tcfg))))))
   
 ;; sort tests by priority and waiton
 ;; Move test specific stuff to a test unit FIXME one of these days