Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1676,11 +1676,11 @@ (begin (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) - (debug:print-info 0 *default-log-port* "BB> Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)))) + (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)))) db "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');" run-id running-deadtime) @@ -1691,11 +1691,11 @@ ;; what to do with toplevel? call rollup? (begin (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin - (debug:print-info 0 *default-log-port* "BB> Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration) + (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded remotehoststart-deadtime "remotehoststart-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration) (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) db "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=?
AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');" run-id remotehoststart-deadtime) @@ -1707,11 +1707,11 @@ (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (begin - (debug:print-info 0 *default-log-port* "BB> Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded remotehoststart-deadtime "remotehoststart-deadtime) + (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id" 1 day since event_time marked") (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))) db "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');" run-id) Index: docs/manual/megatest_manual.html ================================================================== --- docs/manual/megatest_manual.html +++ docs/manual/megatest_manual.html @@ -900,69 +900,10 @@ sqlite3 database. Megatest has been used with the Intel Netbatch and lsf (also known as openlava) batch systems and it should be straightforward to use it with other similar systems.

- -
-

Overview

-
-
-

Stand-alone Megatest Area

-

A single, stand-alone, Megatest based testsuite or "area" is -sufficient for most validation, automation and build problems.

-
-
-Static -
-
-

Megatest is designed as a distributed or decoupled system. This means -you can run the areas stand-alone with no additional -infrastructure. I.e. there are no databases, web servers or other -centralized resources needed. However as your needs grow you can -integrate multiple areas into a bigger system.

-
-

Component Descriptions

-
    -
  1. -

    -Multi-area dashboard and xterm. A gui (the dashboard) is usually the - best option for controlling and launching runs but all operations - can also be done from the commandline. Note: The not yet released - multi-area dashboard replaces the old dashboard for browsing and - controlling runs but for managing a single area the old dashboard - works very well. -

    -
  2. -
  3. -

    -Area/testsuite. This is your testsuite or automation definition and - consists of the information in megatest.config, runconfigs.config - and your testconfigs along with any custom scripting that can’t be - done with the native Megatest features. -

    -
  4. -
  5. -

    -If your testsuite or build automation is too large to run on a - single instance you can distribute your jobs into a compute server - pool. The only current requirements are password-less ssh access and - a network filesystem. -

    -
  6. -
-
-
-
-

Full System Architecture

-
-
-Static -
-
-
-

Installation

@@ -3033,10 +2974,10 @@

Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -382,11 +382,11 @@ (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) - (BB> "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync) + (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load @@ -404,11 +404,11 @@ (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) - (BB> "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period) + (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? 
#t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) @@ -421,13 +421,13 @@ (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (when do-sync ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) - (BB> "launch:monitor-job - dosync started at "(current-seconds)) + (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) - (BB> "launch:monitor-job - dosync finished at "(current-seconds))) + (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m)