Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -49,10 +49,11 @@ test-get-paths-matching-keynames-target-new get-prereqs-not-met get-count-tests-running-for-run-id get-run-info get-run-status + get-run-state get-run-stats get-run-times get-targets get-target ;; register-run @@ -158,10 +159,15 @@ (params (vector-ref dat 1)) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) + (foo (begin + (common:telemetry-log (conc "api-in:"(->string cmd)) + payload: `((params . ,params))) + + #t)) (res (if writecmd-in-readonly-mode (conc "attempt to run write command "cmd" on a read-only database") (case cmd ;;=============================================== @@ -281,11 +287,13 @@ ((get-test-times) (apply db:get-test-times dbstruct params)) ;; RUNS ((get-run-info) (apply db:get-run-info dbstruct params)) ((get-run-status) (apply db:get-run-status dbstruct params)) + ((get-run-state) (apply db:get-run-state dbstruct params)) ((set-run-status) (apply db:set-run-status dbstruct params)) + ((set-run-state-status) (apply db:set-run-state-status dbstruct params)) ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) ((get-test-id) (apply db:get-test-id dbstruct params)) ((get-tests-for-run-mindata) (apply db:get-tests-for-run-mindata dbstruct params)) ((get-tests-for-runs-mindata) (apply db:get-tests-for-runs-mindata dbstruct params)) ((get-runs) (apply db:get-runs dbstruct params)) @@ -330,19 +338,28 @@ ;; TASKS ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) (else (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) (conc "ERROR: BAD api call " cmd)))))) + ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode - (vector #f res) - (vector #t res))))))) + (begin + (common:telemetry-log (conc "api-out:"(->string cmd)) + payload: `((params . ,params) + (ok-res . #t))) + (vector #f res)) + (begin + (common:telemetry-log (conc "api-out:"(->string cmd)) + payload: `((params . ,params) + (ok-res . #f))) + (vector #t res)))))))) ;; http-server send-response ;; api:process-request ;; db:* ;; Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -17,11 +17,11 @@ ;; along with Megatest. If not, see . ;;====================================================================== (use srfi-1 data-structures posix regex-case (prefix base64 base64:) - format dot-locking csv-xml z3 ;; sql-de-lite + format dot-locking csv-xml z3 udp ;; sql-de-lite hostinfo md5 message-digest typed-records directory-utils stack matchable regex posix (srfi 18) extras ;; tcp (prefix nanomsg nmsg:) (prefix sqlite3 sqlite3:) pkts (prefix dbi dbi:) @@ -81,10 +81,11 @@ (length (glob (conc "/proc/" pid "/fd/*"))) (length (filter identity (map socket? (glob (conc "/proc/" pid "/fd/*"))))) ) ) + ;; GLOBALS ;; CONTEXTS (defstruct cxt @@ -887,10 +888,11 @@ (debug:print-info 13 *default-log-port* "watchdog done.")) (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost")))) (define (std-exit-procedure) + ;;(common:telemetry-log-close) (on-exit (lambda () 0)) ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) (let ((no-hurry (if *time-to-exit* ;; hurry up #f (begin @@ -1591,22 +1593,25 @@ (with-output-to-file fullpath (lambda ()(pp dat)))))) ;; get cpu load by reading from /proc/loadavg, return all three values ;; (define (common:get-cpu-load remote-host) - (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) - (or (common:get-cached-info actual-hostname "cpu-load") - (let ((result (if remote-host - (map (lambda (res) - (if (eof-object? res) 9e99 res)) - (with-input-from-pipe - (conc "ssh " remote-host " cat /proc/loadavg") + (handle-exceptions + exn + '(99 99 99) + (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) + (or (common:get-cached-info actual-hostname "cpu-load") + (let ((result (if remote-host + (map (lambda (res) + (if (eof-object? res) 9e99 res)) + (with-input-from-pipe + (conc "ssh " remote-host " cat /proc/loadavg") (lambda ()(list (read)(read)(read))))) - (with-input-from-file "/proc/loadavg" - (lambda ()(list (read)(read)(read))))))) - (common:write-cached-info actual-hostname "cpu-load" result) - result)))) + (with-input-from-file "/proc/loadavg" + (lambda ()(list (read)(read)(read))))))) + (common:write-cached-info actual-hostname "cpu-load" result) + result))))) ;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads ;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc. ;; keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load ;; @@ -3049,5 +3054,69 @@ exn #t ;; just ignore it, it might have died in the meantime so joining it will throw an exception (thread-join! thread)) ))) (hash-table-keys *common:thread-punchlist*))) + +(define *common:telemetry-log-state* 'startup) +(define *common:telemetry-log-socket* #f) + +(define (common:telemetry-log-open) + (if (eq? *common:telemetry-log-state* 'startup) + (let* ((serverhost (configf:lookup *configdat* "telemetry" "host")) + (serverport (configf:lookup-number *configdat* "telemetry" "port")) + (user (or (get-environment-variable "USER") "unknown")) + (host (or (get-environment-variable "HOST") "unknown"))) + (set! *common:telemetry-log-state* + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "common-telemetry-log open udp port failure") + 'broken) + (if (and serverhost serverport user host) + (let* ((s (udp-open-socket))) + ;;(udp-bind! s #f 0) + (udp-connect! s serverhost serverport) + (set! *common:telemetry-log-socket* s) + 'open) + 'not-needed)))))) + +(define (common:telemetry-log event #!key (payload '())) + (if (eq? *common:telemetry-log-state* 'startup) + (common:telemetry-log-open)) + + (if (eq? 'open *common:telemetry-log-state*) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "common-telemetry-log comms failure ; disabled (no server?)") + ;;(define *common:telemetry-log-state* 'broken-or-no-server-preclose) + ;;(common:telemetry-log-close) + (define *common:telemetry-log-state* 'broken-or-no-server) + (set! *common:telemetry-log-socket* #f) + ) + (if (and *common:telemetry-log-socket* event) ;; TODO - filter on event against telemetry.want-events + (let* ((user (or (get-environment-variable "USER") "unknown")) + (host (or (get-environment-variable "HOST") "unknown")) + (start (conc "[megatest "event"]")) + (toppath (or *toppath* "/dev/null")) + (payload-serialized + (base64:base64-encode + (z3:encode-buffer + (with-output-to-string (lambda () (pp payload)))))) + (msg (conc user":"host":"start":"(current-process-id)":"(car (argv))":" + toppath":"payload-serialized))) + (udp-send *common:telemetry-log-socket* msg)))))) + +(define (common:telemetry-log-close) + (when (or (member *common:telemetry-log-state* '(broken-or-no-server-preclose open)) *common:telemetry-log-socket*) + (handle-exceptions + exn + (begin + (define *common:telemetry-log-state* 'closed-fail) + (debug:print-info 0 *default-log-port* "common-telemetry-log closure failure") + ) + (begin + (define *common:telemetry-log-state* 'closed) + (udp-close-socket *common:telemetry-log-socket*) + (set! *common:telemetry-log-socket* #f))))) + Index: configf.scm ================================================================== --- configf.scm +++ configf.scm @@ -774,14 +774,14 @@ ht)) ;; if (define (configf:read-alist fname) (handle-exceptions - exn - #f - (configf:alist->config - (with-input-from-file fname read)))) + exn + #f + (configf:alist->config + (with-input-from-file fname read)))) (define (configf:write-alist cdat fname) (if (not (common:faux-lock fname)) (debug:print 0 *default-log-port* "INFO: Could not get lock on " fname)) (let* ((dat (configf:config->alist cdat)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -406,10 +406,13 @@ ;; Open the classic megatest.db file (defaults to open in toppath) ;; ;; NOTE: returns a dbdat not a dbstruct! ;; + +;;(define (db:reopen-megatest-db + (define (db:open-megatest-db #!key (path #f)(name #f)) (let* ((dbdir (or path *toppath*)) (dbpath (conc dbdir "/" (or name "megatest.db"))) (dbexists (common:file-exists? dbpath)) (db (db:lock-create-open dbpath @@ -1642,39 +1645,61 @@ (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) - (deadtime-str (configf:lookup *configdat* "setup" "deadtime")) ;; FIXME suspect test run time & deadtime are not well matched; resulting in COMPLETED/DEAD status of an a-ok running test - (deadtime (if (and deadtime-str - (string->number deadtime-str)) - (string->number deadtime-str) - 7200))) ;; two hours + (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime"))) + (server-start-allowance 200) + (server-overloaded-budget 200) + (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30)) + (launch-monitor-on-time-budget 30) + (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget)) + (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30)) + (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default)) + (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period))) + (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period) + ) (db:with-db dbstruct #f #f (lambda (db) - (if (number? ovr-deadtime)(set! deadtime ovr-deadtime)) - ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes ;; ;; HOWEVER: this code in run:test seems to work fine ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) ;; (db:test-get-run_duration testdat))) ;; 600) ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row - (lambda (test-id run-dir uname testname item-path) + (lambda (test-id run-dir uname testname item-path event-time run-duration) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (begin + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) + (begin + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) + (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)))) + db + "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');" + run-id running-deadtime) + + + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path event-time run-duration) (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (begin (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) - (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))) + (begin + (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration) + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) db - "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');" - run-id deadtime) + "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');" + run-id remotehoststart-deadtime) ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config ;; ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row @@ -1681,11 +1706,13 @@ (lambda (test-id run-dir uname testname item-path) (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))) + (begin + (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id" 1 day since event_time marked") + (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))) db "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');" run-id) (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.") @@ -1702,15 +1729,15 @@ ;; incompleted)) (min-incompleted-ids (map car incompleted)) ;; do 'em all (all-ids (append min-incompleted-ids (map car oldlaunched)))) (if (> (length all-ids) 0) (begin - (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE") + (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD") (for-each (lambda (test-id) - (db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test failed to complete")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS. ref ticket 220546828 - + (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) + ;;(db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS. ref ticket 220546828 all-ids)))))))) ;; ALL REPLACED BY THE BLOCK ABOVE ;; ;; (sqlite3:execute @@ -2625,10 +2652,24 @@ (set! res status)) db "SELECT status FROM runs WHERE id=?;" run-id) res)))) + +(define (db:get-run-state dbstruct run-id) + (let ((res "n/a")) + (db:with-db + dbstruct #f #f + (lambda (db) + (sqlite3:for-each-row + (lambda (status) + (set! res status)) + db + "SELECT state FROM runs WHERE id=?;" + run-id) + res)))) + ;;====================================================================== ;; K E Y S ;;====================================================================== @@ -3008,11 +3049,11 @@ (db:with-db dbstruct run-id #f (lambda (db) - (print "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=" run-id) + ;(print "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=" run-id) (sqlite3:first-result db "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=?;" run-id)))) (define (db:get-count-tests-running-in-jobgroup dbstruct run-id jobgroup) @@ -3699,11 +3740,11 @@ (item-path (db:test-get-item-path testdat)) (tl-testdat (db:get-test-info dbstruct run-id test-name "")) (tl-test-id (if tl-testdat (db:test-get-id tl-testdat) #f))) - (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) + (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) (db:general-call dbstruct 'set-test-start-time (list test-id))) (mutex-lock! *db-transaction-mutex*) (db:with-db dbstruct #f #f (lambda (db) @@ -3787,11 +3828,11 @@ "\n\n") ;; NB// Pass the db so it is part of the transaction (list newstate newstatus))) -(define (db:set-state-status-and-roll-up-run dbstruct run-id) +(define (db:set-state-status-and-roll-up-run dbstruct run-id curr-state curr-status) (mutex-lock! *db-transaction-mutex*) (db:with-db dbstruct #f #f (lambda (db) (let ((tr-res @@ -3800,12 +3841,12 @@ (lambda () (let* ((state-status-counts (db:get-all-state-status-counts-for-run dbstruct run-id)) (state-stauses (db:roll-up-rules state-status-counts #f #f )) (newstate (car state-stauses)) (newstatus (cadr state-stauses))) - - (db:set-run-state-status dbstruct run-id newstate newstatus )))))) + (if (or (not (eq? newstate curr-state)) (not (eq? newstatus curr-status))) + (db:set-run-state-status dbstruct run-id newstate newstatus ))))))) (mutex-unlock! *db-transaction-mutex*) tr-res)))) (define (db:get-all-state-status-counts-for-run dbstruct run-id) ADDED docs/manual/devnotes.txt Index: docs/manual/devnotes.txt ================================================================== --- /dev/null +++ docs/manual/devnotes.txt @@ -0,0 +1,37 @@ +Developer Notes +--------------- + +Collected here are some topics that may interest the megatest developer. + +telemetry +~~~~~~~~~ + +A new feature introduced in v1.6525 allows a centralized debug messaging system. Debugging client-server issues +is greatly aided by a centralized, time coherent log of events across test execution, server, and runner. This +is provided by the telemetry feature + + +source code call example + + +[source,ini] + [telemetry] + host + port + want-events + + +Usage: +1. Add telemetry section to megatest.config +2. Start telemetry daemon telemetry-daemon -a start -l /tmp/my-telemetry.log +3. Run megatest +4. examine / parse telemetry log Index: docs/manual/megatest_manual.html ================================================================== --- docs/manual/megatest_manual.html +++ docs/manual/megatest_manual.html @@ -900,69 +900,10 @@ sqlite3 database. Megatest has been used with the Intel Netbatch and lsf (also known as openlava) batch systems and it should be straightforward to use it with other similar systems.

- -
-

Overview

-
-
-

Stand-alone Megatest Area

-

A single, stand-alone, Megatest based testsuite or "area" is -sufficient for most validation, automation and build problems.

-
-
-Static -
-
-

Megatest is designed as a distributed or decoupled system. This means -you can run the areas stand-alone with no additional -infrastructure. I.e. there are no databases, web servers or other -centralized resources needed. However as your needs grow you can -integrate multiple areas into a bigger system.

-
-

Component Descriptions

-
    -
  1. -

    -Multi-area dashboard and xterm. A gui (the dashboard) is usually the - best option for controlling and launching runs but all operations - can also be done from the commandline. Note: The not yet released - multi-area dashboard replaces the old dashboard for browsing and - controlling runs but for managing a single area the old dashboard - works very well. -

    -
  2. -
  3. -

    -Area/testsuite. This is your testsuite or automation definition and - consists of the information in megatest.config, runconfigs.config - and your testconfigs along with any custom scripting that can’t be - done with the native Megatest features. -

    -
  4. -
  5. -

    -If your testsuite or build automation is too large to run on a - single instance you can distribute your jobs into a compute server - pool. The only current requirements are password-less ssh access and - a network filesystem. -

    -
  6. -
-
-
-
-

Full System Architecture

-
-
-Static -
-
-
-

Installation

@@ -3033,10 +2974,10 @@

Index: docs/megatest-training.odp ================================================================== --- docs/megatest-training.odp +++ docs/megatest-training.odp cannot compute difference between binary files Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -377,14 +377,16 @@ start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) + (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) + (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load @@ -395,24 +397,43 @@ (delta (abs (- df disk-free)))) (if (and (> df 0) (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) - (do-sync (or new-cpu-load new-disk-free over-time))) + (do-sync (or new-cpu-load new-disk-free over-time)) + + (test-info (rmt:get-test-info-by-id run-id test-id)) + (state (db:test-get-state test-info)) + (status (db:test-get-status test-info)) + (kill-reason "no kill reason specified") + (kill-job? #f)) + (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) + (cond + ((test-get-kill-request run-id test-id) + (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") + (set! kill-job? #t)) + ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) + (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) + (set! kill-job? #t)) + ((equal? status "DEAD") + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) + (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") + ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING + (set! kill-job? #f))) + (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) - (set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)) - (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) - (time-exceeded (> run-seconds runtlim))) - (if time-exceeded - (begin - (debug:print-info 0 *default-log-port* "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim) - #t) - #f))))) - (if do-sync - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) + (launch:handle-zombie-tests run-id) + (when do-sync + ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) + ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) + (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) + (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))) + (if kill-job? (begin + (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries? (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0)) @@ -442,17 +463,19 @@ #f (process-signal pid-num signal/kill))) (process:get-sub-pids pid)))) ;; (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive")))) pids) - (tests:test-set-status! run-id test-id "KILLED" "KILLED" (args:get-arg "-m") #f)) + ;; BB: question to Matt -- does the tests:test-state-status! encompass rollup to toplevel? If not, should it? + (tests:test-set-status! run-id test-id "KILLED" "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt (begin (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2) - (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (args:get-arg "-m") #f) + (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt ))) (mutex-unlock! m) - ;; no point in sticking around. Exit now. + ;; no point in sticking around. Exit now. But run end of run before exiting? + (launch:end-of-run-check run-id) (exit))) (if (hash-table-ref/default misc-flags 'keep-going #f) (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta @@ -573,18 +596,19 @@ (let ((sighand (lambda (signum) ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting (if (eq? signum signal/stop) (debug:print-error 0 *default-log-port* "attempt to STOP process. Exiting.")) (set! *time-to-exit* #t) - (print "Received signal " signum ", cleaning up before exit. Please wait...") + (print "Received signal " signum ", cleaning up before exit (set this test to COMPLETED/ABORT) . Please wait...") (let ((th1 (make-thread (lambda () - (rmt:test-set-state-status run-id test-id "INCOMPLETE" "KILLED" #f) + (print "set test to COMPLETED/ABORT begin.") + (rmt:test-set-state-status run-id test-id "COMPLETED" "ABORT" "received kill signal") + (print "set test to COMPLETED/ABORT complete.") (print "Killed by signal " signum ". Exiting") - (thread-sleep! 1) (exit 1)))) (th2 (make-thread (lambda () - (thread-sleep! 2) + (thread-sleep! 20) (debug:print 0 *default-log-port* "Done") (exit 4))))) (thread-start! th2) (thread-start! th1) (thread-join! th2))))) @@ -605,25 +629,28 @@ (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") - (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) + + (rmt:general-call 'set-test-start-time #f test-id) + (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) ;; prime it for running ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART")) (if (process:alive-on-host? test-host test-pid) (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) " and process " test-pid " is still running on host " test-host ", cannot proceed") (exit))) ((not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ"))) ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") + (rmt:general-call 'set-test-start-time #f test-id) (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") (exit)))) - (debug:print 2 *default-log-port* "Exectuing " test-name " (id: " test-id ") on " (get-host-name)) + (debug:print 2 *default-log-port* "Executing " test-name " (id: " test-id ") on " (get-host-name)) (set! keys (rmt:get-keys)) ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process ;; one of these is defunct/redundant ... (if (not (launch:setup force-reread: #t)) (begin @@ -835,34 +862,38 @@ ;; 0 RUNNING ==> this is actually the first condition, should not get here (define (launch:end-of-run-check run-id ) (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id)) (running-cnt (rmt:get-count-tests-running-for-run-id run-id)) - (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))) - ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing + (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id))) + (current-state (rmt:get-run-state run-id)) + (current-status (rmt:get-run-status run-id))) + ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing + (debug:print 0 *default-log-port* "rollup run state/status") + (rmt:set-state-status-and-roll-up-run run-id current-state current-status) + (cond ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" )) - (debug:print 0 *default-log-port* "rollup run state/status") - (rmt:set-state-status-and-roll-up-run run-id) - (debug:print 0 *default-log-port* "look for post hook.") + (debug:print 0 *default-log-port* "look for post hook.") (runs:run-post-hook run-id)) ((> running-cnt 3) (debug:print 0 *default-log-port* "There are " running-cnt " tests running." )) ((> running-cnt 0) (debug:print 0 *default-log-port* "running cnt > 0 but <= 3 kill-running-tests-if-dead" ) (let ((kill-cnt (launch:kill-tests-if-dead run-id))) (if (and all-test-launched (equal? all-test-launched "yes") (eq? kill-cnt running-cnt)) (launch:end-of-run-check run-id)))) ;;todo (else (debug:print 0 *default-log-port* "Should it get here?? May be everything is not launched yet. Running test cnt:" running-cnt " Not completed test cnt:" not-completed-cnt) - (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f))) - (let loop ((running-test (car running-tests)) - (tal (cdr running-tests))) + (let* ((not-completed-tests (rmt:get-tests-for-run run-id "%" `("NOT_STARTED" "RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f))) + (if (> (length not-completed-tests) 0) + (let loop ((running-test (car not-completed-tests)) + (tal (cdr not-completed-tests))) (let* ((test-name (vector-ref running-test 2)) (item-path (vector-ref running-test 11))) - (debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed") + (debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed") (if (not (null? tal)) - (loop (car tal) (cdr tal)))))))))) + (loop (car tal) (cdr tal))))))))))) (define (launch:is-test-alive host pid) (if (and host pid (not (equal? host "n/a"))) (let* ((cmd (conc "ssh " host " pstree -A " pid)) (output (with-input-from-pipe cmd read-lines))) @@ -1405,10 +1436,32 @@ (begin (debug:print-error 0 *default-log-port* "Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) ;; (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1))) (list #f #f))))) + + +(define (launch:handle-zombie-tests run-id) + (let* ((key (conc "zombiescan-runid-"run-id)) + (now (current-seconds)) + (threshold (- (current-seconds) (* 2 (or (configf:lookup-number *configdat* "setup" "deadtime") 120)))) + (val (rmt:get-var key)) + (do-scan? + (cond + ((not val) + #t) + ((< val threshold) + #t) + (else #f)))) + (when do-scan? + (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") + (rmt:set-var key (current-seconds)) + (rmt:find-and-mark-incomplete run-id #f)))) + + + + ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.6524) +(define megatest-version 1.6525) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -109,13 +109,14 @@ Launching and managing runs -run : run all tests or as specified by -testpatt -remove-runs : remove the data for a run, requires -runname and -testpatt Optionally use :state and :status, use -keep-records to remove only - the run data. - -kill-runs : kill existing run(s) (all incomplete tests killed) - -kill-rerun : kill an existing run (all incomplete tests killed and run is rerun) + the run data. Use -kill-wait to override the 10 second + per test wait after kill delay. + -kill-runs : kill existing run(s) (all incomplete tests killed) + -kill-rerun : kill an existing run (all incomplete tests killed and run is rerun) -set-state-status X,Y : set state to X and status to Y, requires controls per -remove-runs -rerun FAIL,WARN... : force re-run for tests with specificed status(s) -rerun-clean : set all tests not COMPLETED+PASS,WARN,WAIVED to NOT_STARTED,n/a and then run the specified testpatt with -preclean -rerun-all : set all tests to NOT_STARTED,n/a and run with -preclean @@ -129,11 +130,11 @@ -no-cache : do not use the cached config files. -one-pass : launch as many tests as you can but do not wait for more to be ready -remove-keep N : remove all but N most recent runs per target; use '-actions, -age, -precmd' -age : 120d,3h,20m to apply only to runs older than the specified age. NB// M=month, m=minute - -actions : print,remove-runs,archive to specify action to take + -actions [,...] : actions to take; print,remove-runs,archive,kill-runs -precmd : insert a wrapper command in front of the commands run Selectors (e.g. use for -runtests, -remove-runs, -set-state-status, -list-runs etc.) -target key1/key2/... : run for key1, key2, etc. -reqtarg key1/key2/... : run for key1, key2, etc. but key1/key2 must be in runconfigs @@ -357,10 +358,11 @@ "-src-target" "-src-runname" "-diff-email" "-sync-to" "-pgsync" + "-kill-wait" ;; wait this long before removing test (default is 10 sec) "-diff-html" ) (list "-h" "-help" "--help" "-manual" "-version" @@ -380,23 +382,25 @@ "-clean-cache" "-no-cache" "-cache-db" "-use-db-cache" "-prepend-contour" + ;; misc "-repl" "-lock" "-unlock" "-list-servers" "-kill-servers" "-run-wait" ;; wait on a run to complete (i.e. no RUNNING) - "-one-pass" ;; + "-one-pass" ;; "-local" ;; run some commands using local db access - "-generate-html" - "-generate-html-structure" + "-generate-html" + "-generate-html-structure" "-list-run-time" "-list-test-time" + ;; misc queries "-list-disks" "-list-targets" "-list-db-targets" "-show-runconfig" Index: mt-pg.sql ================================================================== --- mt-pg.sql +++ mt-pg.sql @@ -82,11 +82,11 @@ tag_id INTEGER DEFAULT 0, area_id INTEGER DEFAULT 0, CONSTRAINT areatagconstraint UNIQUE (tag_id, area_id)); CREATE VIEW area_tag_view as -select a.id as aid, t.id as tid,area_name,tag_name from areas as a inner join area_tags as at on at.area_id = a.id +select a.id as aid, t.id as tid,area_name,tag_name,area_path from areas as a inner join area_tags as at on at.area_id = a.id inner join tags as t on t.id = at.tag_id ; INSERT INTO areas (id,area_name,area_path) VALUES (0,'local','.'); CREATE TABLE IF NOT EXISTS ttype ( @@ -309,15 +309,16 @@ CREATE TABLE IF NOT EXISTS users_webviews( id SERIAL PRIMARY KEY , - user_id INTEGER NOT NULL, - webview_id INTEGER NOT NULL, - deleted INTEGER default 0, - searchpattern TEXT Default '', - web_page TEXT Default '', + user_id INTEGER NOT NULL, + webview_id INTEGER NOT NULL, + deleted INTEGER default 0, + searchpattern TEXT Default '', + web_page TEXT Default '', + is_default boolean default 'f', other_search_data TEXT Default '' ); CREATE TABLE IF NOT EXISTS cctrl_info( id SERIAL PRIMARY KEY , Index: mtut.scm ================================================================== --- mtut.scm +++ mtut.scm @@ -828,11 +828,12 @@ (define (contains list x) (cond ((null? list) #f) ((eq? (car list) x) #t) (else (contains (cdr list) x)))) ;; collect all needed data and create run pkts for contours with changed inputs ;; (define (generate-run-pkts mtconf toppath) - (let ((std-runname (conc "sched" (time->string (seconds->local-time (current-seconds)) "%M%H%d")))) + (let ((std-runname (conc "sched" (time->string (seconds->local-time (current-seconds)) "%M%H%d"))) + (packets-generated 0)) (common:with-queue-db mtconf (lambda (pktsdirs pktsdir pdb) (let* ((rgconfdat (find-and-read-config (conc toppath "/runconfigs.config"))) (rgconf (car rgconfdat)) @@ -1011,13 +1012,14 @@ ;;(print "Areas: " all-areas) (for-each (lambda (area) ;Add code to check whether area is valid (if - (if (args:get-arg "-target") + ;; This code checks whether the target has been passed in via argument, and only runs the specified target + (and (< packets-generated 4) (if (args:get-arg "-target") (if (string= (args:get-arg "-target") runkey) (area-allowed? area "area-needs-to-be-run" runkey contour #f) #f) - (area-allowed? area "area-needs-to-be-run" runkey contour #f)) + (area-allowed? area "area-needs-to-be-run" runkey contour #f))) (let* ((script (car cmd)) (params (cdr cmd)) (cmd (conc script " " contour " " area " " runkey " " std-runname " " action " " params)) (res (handle-exceptions @@ -1096,13 +1098,16 @@ (aval (or (configf:lookup mtconf "areas" area) "")) (aval-alist (common:val->alist aval)) (targets (map-targets mtconf aval-alist runkey area contour))) (pp targets) - (for-each (lambda (target) (create-run-pkt mtconf action area runkey target new-runname mode-patt + (for-each (lambda (target) + (create-run-pkt mtconf action area runkey target new-runname mode-patt tag-expr pktsdir reason contour sched dbdest append - runtrans)) targets) + runtrans) + (set! packets-generated (+ packets-generated 1)) + ) targets) ;; Add filter for targets ;;(create-run-pkt mtconf action area runkey target runname ;; pktsdir reason contour dbdest append ;; runtrans) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -55,10 +55,15 @@ ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected + (common:telemetry-log (conc "rmt:"(->string cmd)) + payload: `((rid . ,rid) + (params . ,params))) + + ;;DOT digraph megatest_state_status { ;;DOT ranksep=0; ;;DOT // rankdir=LR; ;;DOT node [shape="box"]; ;;DOT "rmt:send-receive" -> MUTEXLOCK; @@ -692,12 +697,12 @@ ;; state and status are extra hints not usually used in the calculation ;; (define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment) (rmt:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment))) -(define (rmt:set-state-status-and-roll-up-run run-id) - (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id))) +(define (rmt:set-state-status-and-roll-up-run run-id state status) + (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id state status))) (define (rmt:update-pass-fail-counts run-id test-name) (rmt:general-call 'update-pass-fail-counts run-id test-name test-name test-name)) @@ -755,13 +760,21 @@ (rmt:send-receive 'lock/unlock-run #f (list run-id lock unlock user))) ;; set/get status (define (rmt:get-run-status run-id) (rmt:send-receive 'get-run-status #f (list run-id))) + +(define (rmt:get-run-state run-id) + (rmt:send-receive 'get-run-state #f (list run-id))) + (define (rmt:set-run-status run-id run-status #!key (msg #f)) (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) + +(define (rmt:set-run-state-status run-id state status ) + (rmt:send-receive 'set-run-state-status #f (list run-id state status))) + (define (rmt:update-run-event_time run-id) (rmt:send-receive 'update-run-event_time #f (list run-id))) (define (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update #!key (sort-order "asc")) ;; fields of #f uses default Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -450,10 +450,17 @@ ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") + (common:telemetry-log "run-tests" + payload: + `( (target . ,target) + (run-name . ,runname) + (test-patts . ,test-patts) ) ) + + ;; Now generate all the tests lists (set! all-tests-registry (tests:get-all)) ;; hash of testname => path-to-test (set! all-test-names (hash-table-keys all-tests-registry)) ;; filter first for allowed-tests (from -tagexpr) then for test-patts. (set! test-names (tests:filter-test-names @@ -515,10 +522,11 @@ ;; run the run prehook if there are no tests yet run for this run: ;; (runs:run-pre-hook run-id) ;; mark all test launced flag as false in the meta table (rmt:set-var (conc "lunch-complete-" run-id) "no") + (rmt:set-run-state-status run-id "new" "n/a") ;; now add non-directly referenced dependencies (i.e. waiton) ;;====================================================================== ;; refactoring this block into tests:get-full-data ;; ;; What happended, this code is now duplicated in tests!? @@ -2003,13 +2011,19 @@ ((print) (print " " (simple-run-runname run) " " (time->string (seconds->local-time (simple-run-event_time run)) "WW%V.%u %H:%M:%S") " " (if remove "REMOVE" ""))) ((remove-runs) - (if remove (system (conc precmd " megatest -remove-runs -target " target " -runname " (simple-run-runname run) " -testpatt %")))) + (if remove (system (conc precmd " megatest -remove-runs -target " target " -runname " (simple-run-runname run) " -testpatt %" + (if (member 'kill-runs actions) ;; if kill-runs is specified then set -kill-wait to 0 + " -kill-wait 0" + ""))))) ((archive) - (if remove (system (conc precmd " megatest -archive save-remove -target " target " -runname " (simple-run-runname run) " -testpatt %")))))) + (if remove (system (conc precmd " megatest -archive save-remove -target " target " -runname " (simple-run-runname run) " -testpatt %")))) + ((kill-runs) + (if remove (system (conc precmd " megatest -kill-runs -target " target " -runname " (simple-run-runname run) " -testpatt %")))) + )) actions)))) sorted))) ;; (print "Sorted: " (map simple-run-event_time sorted)) ;; (print "Remove: " (map simple-run-event_time to-remove)))) (hash-table-keys runs-ht)) @@ -2137,11 +2151,11 @@ (toplevel-retries (make-hash-table)) ;; try three times to loop through and remove top level tests (test-retry-time (make-hash-table)) (backgrounded-remove-status (make-hash-table)) (backgrounded-remove-last-visit (make-hash-table)) (backgrounded-remove-result (make-hash-table)) - (allow-run-time 10)) ;; seconds to allow for killing tests before just brutally killing 'em + (allow-run-time (string->number (or (args:get-arg "-kill-wait") "10")))) ;; seconds to allow for killing tests before just brutally killing 'em (let loop ((test (car sorted-tests)) (tal (cdr sorted-tests))) (let* ((test-id (db:test-get-id test)) (new-test-dat (rmt:get-test-info-by-id run-id test-id))) (if (not new-test-dat) @@ -2155,14 +2169,16 @@ ;; (rmt:sdb-qry 'getid (db:test-get-rundir new-test-dat)) ;; ) ;; run dir is from the link tree (has-subrun (and (subrun:subrun-test-initialized? run-dir) (not (subrun:subrun-removed? run-dir)))) (test-state (db:test-get-state new-test-dat)) + (test-status (db:test-get-status new-test-dat)) (test-fulln (db:test-get-fullname new-test-dat)) (uname (db:test-get-uname new-test-dat)) (toplevel-with-children (and (db:test-get-is-toplevel test) (> (rmt:test-toplevel-num-items run-id test-name) 0)))) + (case action ((remove-runs) ;; if the test is a toplevel-with-children issue an error and do not remove (cond (toplevel-with-children @@ -2275,10 +2291,16 @@ ((member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ")) (debug:print 1 *default-log-port* "INFO: issuing killreq to test "test-fulln) (mt:test-set-state-status-by-id run-id (db:test-get-id test) "KILLREQ" "n/a" #f) (if (not (null? tal)) (loop (car tal)(cdr tal)))) + ((and (member test-status '("PREQ_FAIL" "PREQ_DISCARDED" "BLOCKED" "ZERO_ITEMS" "KEEP_TRYING" "TEN_STRIKES" "TIMED_OUT"))) + (rmt:set-state-status-and-roll-up-items run-id (db:test-get-id test) 'foo "NOT_STARTED" "n/a" (conc "kill-run moved from "test-state":"test-status" to NOT_STARTED:n/a")) + ;;(mt:test-set-state-status-by-id run-id (db:test-get-id test) "NOT_STARTED" "n/a" (conc "kill-run moved from "test-state":"test-status" to NOT_STARTED:n/a")) + (if (not (null? tal)) + (loop (car tal)(cdr tal))) + ) (else (if (not (null? tal)) (loop (car tal)(cdr tal))) ))) ((set-state-status) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -528,10 +528,12 @@ (let* ((start-time (current-milliseconds)) (res (system sync-cmd))) (cond ((eq? 0 res) (delete-file* (conc mtdbfile ".backup")) + (if (eq? 0 (file-size sync-log)) + (delete-file sync-log)) (system (conc "/bin/mv " staging-file " " mtdbfile)) (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "(/ (- (current-milliseconds) start-time) 1000)" sec") #t) (else (system (conc "/bin/cp "sync-log" "sync-log".fail")) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -993,10 +993,11 @@ ;; get runs changed since last sync ;; (define (tasks:sync-test-data dbh cached-info area-info) ;; (let* (( (define (tasks:sync-to-postgres configdat dest) + (print "In sync") (let* ((dbh (pgdb:open configdat dbname: dest)) (area-info (pgdb:get-area-by-path dbh *toppath*)) (cached-info (make-hash-table)) (start (current-seconds)) (test-patt (if (args:get-arg "-testpatt") @@ -1014,15 +1015,17 @@ (exit 1))) (if (and (not target) run-name) (begin (print "Error: Provide target") (exit 1))) - + (print "123") + (exit 1) (for-each (lambda (dtype) (hash-table-set! cached-info dtype (make-hash-table))) '(runs targets tests steps data)) (hash-table-set! cached-info 'start start) ;; when done we'll set sync times to this + (print "here") (if area-info (let* ((last-sync-time (vector-ref area-info 3)) (smallest-last-update-time (make-hash-table)) (changed (if (and target run-name) (rmt:get-run-record-ids target run-name (rmt:get-keys) test-patt) @@ -1035,10 +1038,11 @@ (area-tag (if (args:get-arg "-area-tag") (args:get-arg "-area-tag") (if (args:get-arg "-area") (args:get-arg "-area") "")))) + (print "here2") (if (and (equal? area-tag "") (not (pgdb:is-area-taged dbh (vector-ref area-info 0)))) (set! area-tag *default-area-tag*)) (if (not (equal? area-tag "")) (task:add-area-tag dbh area-info area-tag)) (if (or (not (null? test-ids)) (not (null? run-ids))) ADDED telemetry-daemon Index: telemetry-daemon ================================================================== --- /dev/null +++ telemetry-daemon @@ -0,0 +1,265 @@ +#!/usr/bin/env python +# -*- Mode: Python; -*- +## Tiny Syslog Server in Python. +## +## This is a tiny syslog server that is able to receive UDP based syslog +## entries on a specified port and save them to a file. +## That's it... it does nothing else... + + +import os +import sys, os, time, atexit +from signal import SIGTERM +import logging +import logging.handlers +import SocketServer +import datetime +from subprocess import call +import argparse +import os +import socket + +## code to determine this host's IP on non-loopback interface +if os.name != "nt": + import fcntl + import struct + + def get_interface_ip(ifname): + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + return socket.inet_ntoa(fcntl.ioctl(s.fileno(), 0x8915, struct.pack('256s', + ifname[:15]))[20:24]) + +def get_lan_ip(): + ip = socket.gethostbyname(socket.gethostname()) + if ip.startswith("127.") and os.name != "nt": + interfaces = [ + "eth0", + "eth1", + "eth2", + "wlan0", + "wlan1", + "wifi0", + "ath0", + "ath1", + "ppp0", + ] + for ifname in interfaces: + try: + ip = get_interface_ip(ifname) + break + except IOError: + pass + return ip + +class Daemon(object): + """ + A generic daemon class. + + Usage: subclass the Daemon class and override the run() method + """ + def __init__(self, pidfile, stdin='/dev/null', stdout='/dev/null', stderr='/dev/null'): + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.pidfile = pidfile + + def daemonize(self): + """ + do the UNIX double-fork magic, see Stevens' "Advanced + Programming in the UNIX Environment" for details (ISBN 0201563177) + http://www.erlenstar.demon.co.uk/unix/faq_2.html#SEC16 + """ + try: + pid = os.fork() + if pid > 0: + # exit first parent + sys.exit(0) + except OSError, e: + sys.stderr.write("fork #1 failed: %d (%s)\n" % (e.errno, e.strerror)) + sys.exit(1) + + # decouple from parent environment + os.chdir("/") + os.setsid() + os.umask(0) + + # do second fork + try: + pid = os.fork() + if pid > 0: + # exit from second parent + sys.exit(0) + except OSError, e: + sys.stderr.write("fork #2 failed: %d (%s)\n" % (e.errno, e.strerror)) + sys.exit(1) + + # redirect standard file descriptors + sys.stdout.flush() + sys.stderr.flush() + si = file(self.stdin, 'r') + so = file(self.stdout, 'a+') + se = file(self.stderr, 'a+', 0) + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + + # write pidfile + atexit.register(self.delpid) + pid = str(os.getpid()) + file(self.pidfile,'w+').write("%s\n" % pid) + + def delpid(self): + os.remove(self.pidfile) + + def start(self): + """ + Start the daemon + """ + # Check for a pidfile to see if the daemon already runs + try: + pf = file(self.pidfile,'r') + pid = int(pf.read().strip()) + pf.close() + except IOError: + pid = None + + if pid: + message = "pidfile %s already exist. Daemon already running?\n" + sys.stderr.write(message % self.pidfile) + sys.exit(1) + + # Start the daemon + self.daemonize() + self.run() + + def stop(self): + """ + Stop the daemon + """ + # Get the pid from the pidfile + try: + pf = file(self.pidfile,'r') + pid = int(pf.read().strip()) + pf.close() + except IOError: + pid = None + + if not pid: + message = "pidfile %s does not exist. Daemon not running?\n" + sys.stderr.write(message % self.pidfile) + return # not an error in a restart + + # Try killing the daemon process + try: + while 1: + os.kill(pid, SIGTERM) + time.sleep(0.1) + except OSError, err: + err = str(err) + if err.find("No such process") > 0: + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + else: + print str(err) + sys.exit(1) + + def restart(self): + """ + Restart the daemon + """ + self.stop() + self.start() + + def run(self): + """ + You should override this method when you subclass Daemon. It will be called after the process has been + daemonized by start() or restart(). + """ + +# setup logging module so that the log can be moved aside and will reopen for append +def log_setup(logfile): + log_handler = logging.handlers.WatchedFileHandler(logfile) + formatter = logging.Formatter( + '%(message)s','') + log_handler.setFormatter(formatter) + logger = logging.getLogger() + logger.addHandler(log_handler) + logger.setLevel(logging.INFO) + + +class SyslogUDPHandler(SocketServer.BaseRequestHandler): + def handle(self): + data = bytes.decode(self.request[0].strip()) + socket = self.request[1] + print( "%s : " % self.client_address[0], str(data)) + timestamp = datetime.datetime.now().isoformat() + logline = timestamp + ":"+self.client_address[0] + ":" + str(data) + logging.info(str(logline)) + + + +class TelemetryLogDaemon(Daemon): + def __init__(self, pidfile, logfile, server_ip, server_port): + self.logfile = logfile + self.server_ip = server_ip + self.server_port = server_port + super(TelemetryLogDaemon, self).__init__(pidfile) + + def run(self): + log_setup(self.logfile) + server = SocketServer.UDPServer((self.server_ip,int(self.server_port)), SyslogUDPHandler) + server.serve_forever(poll_interval=0.5) + + +def main(): + default_log_file = os.environ['PWD'] + "/telemetry.log" + + parser = argparse.ArgumentParser(description = 'telemetry-daemon') + actions="start,restart,stop,nodaemon".split(",") + + parser.add_argument("-a", "--action", required=True, choices=actions, help="manage daemon: start stop or restart") + parser.add_argument("-p", "--server-port", default="5929", help="specify alternate udp port number, default is 5929") + parser.add_argument("-i", "--server-ip", default=get_lan_ip(), help="specify IP if heuristics to get local host lan ip fails") + parser.add_argument("-l", "--log-file", default=default_log_file, help="specify log file to write") + parser.add_argument("-z", "--pid-file", default=default_log_file + ".pidfile", help="specify pidfile") + opts = parser.parse_args() + + tld = TelemetryLogDaemon(opts.pid_file, opts.log_file, opts.server_ip, opts.server_port) + + if opts.action == "start": + print "Info: Starting server" + print """Example addition to megatest.config to enable telemetry: + +[telemetry] +host %s +port %s +want-events ALL + + """ % (opts.server_ip, opts.server_port) + tld.start() + + elif opts.action == "stop": + tld.stop() + elif opts.action == "restart": + + print "Info: Restarting server" + print """Example addition to megatest.config to enable telemetry: + +[telemetry] +host %s +port %s +want-events ALL + + """ % (opts.server_ip, opts.server_port) + tld.restart() + elif opts.action == "nodaemon": + log_setup(opts.log_file) + server = SocketServer.UDPServer((opts.server_ip,int(opts.server_port)), SyslogUDPHandler) + server.serve_forever(poll_interval=0.5) + +if __name__ == '__main__': + main() + + + + Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1517,11 +1517,11 @@ ;; if .testconfig exists in test directory read and return it ;; else if have cached copy in *testconfigs* return it IFF there is a section "have fulldata" ;; else read the testconfig file ;; if have path to test directory save the config as .testconfig and return it ;; -(define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f)(allow-write-cache #t)) +(define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f)(allow-write-cache #t)(wait-a-minute #f)) (let* ((use-cache (common:use-cache?)) (cache-path (tests:get-test-path-from-environment)) (cache-file (and cache-path (conc cache-path "/.testconfig"))) (cache-exists (and cache-file (not force-create) ;; if force-create then pretend there is no cache to read @@ -1545,13 +1545,37 @@ dat ;; no cached data available (let* ((treg (or test-registry (tests:get-all))) (test-path (or (hash-table-ref/default treg test-name #f) + (let* ((local-tcdir (conc (getenv "MT_LINKTREE") "/" + (getenv "MT_TARGET") "/" + (getenv "MT_RUNNAME") "/" + test-name "/" item-path)) + (local-tcfg (conc local-tcdir "/testconfig"))) + (if (common:file-exists? local-tcfg) + local-tcdir + #f)) (conc *toppath* "/tests/" test-name))) (test-configf (conc test-path "/testconfig")) - (testexists (and (common:file-exists? test-configf)(file-read-access? test-configf))) + (testexists (let loopa ((tries-left 30)) + (cond + ( + (and (common:file-exists? test-configf)(file-read-access? test-configf)) + #t) + ( + (common:file-exists? test-configf) + (debug:print 0 *default-log-port* "WARNING: Cannot read testconfig file: "test-configf) + #f) + ( + (and wait-a-minute (> tries-left 0)) + (thread-sleep! 10) + (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf" will retry in 10 seconds. Tries left: "tries-left) ;; BB: this fires + (loopa (sub1 tries-left))) + (else + (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf) ;; BB: this fires + #f)))) (tcfg (if testexists (read-config test-configf #f system-allowed environ-patt: (if system-allowed "pre-launch-env-vars" #f)) @@ -1562,11 +1586,11 @@ cache-file (file-write-access? cache-path) allow-write-cache) (let ((tpath (conc cache-path "/.testconfig"))) (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath) - (if (not (common:in-running-test?)) + (if (and tcfg (not (common:in-running-test?))) (configf:write-alist tcfg tpath)))) tcfg)))))) ;; sort tests by priority and waiton ;; Move test specific stuff to a test unit FIXME one of these days