Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -28,11 +28,11 @@ ezsteps.scm lock-queue.scm sdb.scm rmt.scm api.scm \ subrun.scm portlogger.scm archive.scm env.scm \ diff-report.scm cgisetup/models/pgdb.scm # module source files -MSRCFILES = +MSRCFILES = adjutant.scm mutils.scm # ftail.scm rmtmod.scm commonmod.scm removed # MSRCFILES = ducttape-lib.scm pkts.scm stml2.scm cookie.scm mutils.scm \ # mtargs.scm commonmod.scm dbmod.scm adjutant.scm ulex.scm \ # rmtmod.scm apimod.scm Index: TODO ================================================================== --- TODO +++ TODO @@ -16,10 +16,17 @@ # along with Megatest. If not, see . TODO ==== +WW38 +. Add test_rundat to no-sync ==> correction, put in /.meta/test-run.dat +. Add STATE/STATUS transitions to .meta/test-run.dat or similar +. Swizzle update-test-rundat to operate on no-sync +. Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync +. On state/status change update tests table with duration + WW15 . fill newview matrix with data, filter pipeline gui elements . improve [script], especially indent handling WW16 @@ -35,11 +42,10 @@ . break command line into sections; all, run control, queries, utilities etc. . pull in ftfplan (not integrated, just code pulled in) WW20 . ./configure => ubuntu, sles11, sles12, rh7 -. Jenkins junit XML support . Add output flushing in teamcity support . Switch to using simple runs query everywhere . Add end_time to runs and add a rollup call that sets state, status and end_time Future Index: adjutant.scm ================================================================== --- adjutant.scm +++ adjutant.scm @@ -22,12 +22,23 @@ (module adjutant * (import scheme chicken data-structures extras files) (import (prefix sqlite3 sqlite3:) posix typed-records srfi-18 srfi-69 - md5 message-digest + md5 message-digest matchable regex srfi-1) -(define (adjutant-run) - (print "Running the adjutant!")) +(define (adjutant-run host-type rmt:no-sync-take-job) + (print "Running the adjutant!") + (let loop ((wait-count 0)) + (if (< wait-count 10) ;; 6 x 10 seconds = one minute + (let* ((dat (rmt:no-sync-take-job host-type))) + (match dat + ((id ht vars exekey cmdline state event-time last-update) + (system cmdline) + (loop 0)) + (else + (thread-sleep! 10) + (loop (+ wait-count 1))))) + (print "I'm bored. Exiting.")))) ) Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -251,17 +251,20 @@ ;; NO SYNC DB ((no-sync-set) (apply db:no-sync-set *no-sync-db* params)) ((no-sync-get/default) (apply db:no-sync-get/default *no-sync-db* params)) ((no-sync-del!) (apply db:no-sync-del! *no-sync-db* params)) ((no-sync-get-lock) (apply db:no-sync-get-lock *no-sync-db* params)) + ((no-sync-add-job) (apply db:no-sync-add-job *no-sync-db* params)) + ((no-sync-take-job) (apply db:no-sync-take-job *no-sync-db* params)) + ((no-sync-job-records-clean) (apply db:no-sync-job-records-clean *no-sync-db* params)) ;; ARCHIVES ;; ((archive-get-allocations) ((archive-register-disk) (apply db:archive-register-disk dbstruct params)) ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params)) ;; ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey)) - + ;;====================================================================== ;; READ ONLY QUERIES ;;====================================================================== ;; KEYS Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -567,11 +567,11 @@ (- num-logs max-allowed)))) (for-each (lambda (file) (let* ((fullname (conc "logs/" file))) (if (directory? fullname) - (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") + (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") (handle-exceptions exn (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn) (delete-file* fullname))))) files) @@ -3171,31 +3171,39 @@ ;; arm cubie01 cubie02 ;; x86_64 zeus xena myth01 ;; allhosts #{g hosts arm} #{g hosts x86_64} ;; ;; [host-types] +;; C/M/A lets megatest know this launcher provides C cores, M bytes memory for architecture A +;; 2/2G/arm smart -cores 2 -memory 2G -arch arm ;; general #MTLOWESTLOAD #{g hosts allhosts} ;; arm #MTLOWESTLOAD #{g hosts arm} ;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo +;; +;; NOTE: host-rules is ONLY used for MTLOWESTLOAD ;; ;; [host-rules] ;; # maxnload => max normalized load ;; # maxnjobs => max jobs per cpu ;; # maxjobrate => max jobs per second ;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 ;; ;; [launchers] ;; envsetup general -;; xor/%/n 4C16G +;; xor/%/n 2/2G/arm ;; % nbgeneral ;; ;; [jobtools] ;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. ;; flexi-launcher yes ;; launcher nbfake +;; mode adjutant|normal (default is normal) +;; +;; +;; mode is 'normal (i.e. directly use launcher) or 'adjutant (i.e. use adjutant) ;; -(define (common:get-launcher configdat testname itempath) +(define (common:get-launcher configdat testname itempath mode) (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no"))) (let* ((launchers (hash-table-ref/default configdat "launchers" '()))) (if (null? launchers) @@ -3202,32 +3210,36 @@ fallback-launcher (let loop ((hed (car launchers)) (tal (cdr launchers))) (let ((patt (car hed)) (host-type (cadr hed))) - (if (tests:match patt testname itempath) + (if (tests:match patt testname itempath) ;; have a launcher match for this test (begin (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) - (let ((launcher (configf:lookup configdat "host-types" host-type))) + (let ((launcher (configf:lookup configdat "host-types" host-type))) ;; find the actual launcher from the host-types table + ;; if we are in adjutant mode then we want to return both host-type and launcher (if launcher (let* ((launcher-parts (string-split launcher)) (launcher-exe (car launcher-parts))) (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) (count 100)) (if targ-host (conc "remrun " targ-host) (if (> count 0) + (begin (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) (thread-sleep! (- 101 count)) (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) (- count 1))) (begin (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) (exit))))) - launcher)) + (case mode + ((adjutant) (list host-type launcher)) + (else launcher)))) (begin (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) (if (null? tal) fallback-launcher (loop (car tal)(cdr tal))))))) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -212,10 +212,12 @@ (hash-table-set! (dboard:commondat-tabdats commondat) tabnum tabdat)) +(define *updater-running* #f) ;; move this into one of the stucts + ;; gets and calls updater list based on curr-tab-num ;; (define (dboard:common-run-curr-updaters commondat #!key (tab-num #f)) (if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat (let* ((tnum (or tab-num (dboard:commondat-curr-tab-num commondat))) @@ -240,11 +242,13 @@ tnum (cons updater curr-updaters)))) ;; data for each specific tab goes here ;; -(defstruct dboard:tabdat +(defstruct dboard:tabdat + ;; flags + ((already-running #f) : boolean) ;; the updater is already running. skip ;; runs ((allruns '()) : list) ;; list of dboard:rundat records ((allruns-by-id (make-hash-table)) : hash-table) ;; hash of run-id -> dboard:rundat records ((done-runs '()) : list) ;; list of runs already drawn ((not-done-runs '()) : list) ;; list of runs not yet drawn @@ -645,11 +649,11 @@ ;; (define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals) (let* ((start-time (current-seconds)) (access-mode (dboard:tabdat-access-mode tabdat)) (num-to-get (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get") - "200"))) + "60"))) ;; was 200, which is fine in a normal run area. (states (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat))) (statuses (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat))) (do-not-use-db-file-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab (do-not-use-query-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab (sort-info (get-curr-sort)) @@ -716,10 +720,11 @@ (if got-all (begin (dboard:rundat-last-update-set! run-dat (- start-time 2)) (dboard:rundat-run-data-offset-set! run-dat 0)) (begin + ;;; (thread-sleep! 0.25) ;; give the rest of the gui some time to update. <-- this did NOT help (dboard:rundat-run-data-offset-set! run-dat (+ num-to-get (dboard:rundat-run-data-offset run-dat))))) (for-each (lambda (tdat) @@ -833,11 +838,13 @@ ;; ;; create a virtual table of all the tests ;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") ) ;; (define (dboard:update-rundat tabdat runnamepatt numruns testnamepatt keypatts) - (let* ((access-mode (dboard:tabdat-access-mode tabdat)) + (dboard:tabdat-already-running-set! tabdat #t) + (let* (;; (already-running (dboard:tabdat-already-running tabdat)) + (access-mode (dboard:tabdat-access-mode tabdat)) (keys (dboard:tabdat-keys tabdat)) ;; (db:dispatch-query access-mode rmt:get-keys db:get-keys))) (last-runs-update (- (dboard:tabdat-last-runs-update tabdat) 2)) (allruns (rmt:get-runs runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts)) ;;(allruns-tree (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f)) (allruns-tree (rmt:get-runs-by-patt keys "%" #f #f #f #f 0)) ;; last-runs-update));;'("id" "runname") @@ -901,27 +908,26 @@ (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id) (hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct)) (if (or (null? tal) (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update (begin - (when (> elapsed-time 2) + #;(when (> elapsed-time 2) (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.") (let* ((old-val (iup:attribute *tim* "TIME")) (new-val (number->string (inexact->exact (floor (* 2 (string->number old-val))))))) (if (< (string->number new-val) 5000) - ((debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val) - (iup:attribute-set! *tim* "TIME" new-val)))) - - - ) + (begin + (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val) + (iup:attribute-set! *tim* "TIME" new-val))))) (dboard:tabdat-allruns-set! tabdat new-res) maxtests) (if (> (dboard:rundat-run-data-offset run-struct) 0) (loop run tal new-res newmaxtests) ;; not done getting data for this run (loop (car tal)(cdr tal) new-res newmaxtests))))))) (dboard:tabdat-filters-changed-set! tabdat #f) - (dboard:update-tree tabdat runs-hash header tb))) + (dboard:update-tree tabdat runs-hash header tb) + (dboard:tabdat-already-running-set! tabdat #f))) (define *collapsed* (make-hash-table)) (define (toggle-hide lnum uidat) ; fulltestname) (let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum)) @@ -2505,14 +2511,10 @@ ;; insert extra widget here (if extra-widget extra-widget (iup:hbox)) ;; empty widget - - - - ))) (let* ((status-toggles (map (lambda (status) (iup:toggle (conc status) #:fontsize 8 ;; btn-fontsz ;; "10" @@ -3725,28 +3727,36 @@ ;; (define (tabdat-values tabdat) ;; runs update-rundat using the various filters from the gui ;; (define (dashboard:do-update-rundat tabdat) + ;; this seems like a good place to check for already running and skip if so + ;; + ;; (set! *updater-running* #t) +;;(if (dboard:tabdat-already-running tabdat) +;; (begin +;; (debug:print-info 0 *default-log-port* "Dashboard overloaded - updates will be slow, skipping update.") +;; (dboard:tabdat-target tabdat)) (dboard:update-rundat tabdat (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%") (dboard:tabdat-numruns tabdat) (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%") ;; generate key patterns from the target stored in tabdat (let* ((dbkeys (dboard:tabdat-dbkeys tabdat))) (let ((fres (if (dboard:tabdat-target tabdat) - (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%")))) - (map (lambda (k v)(list k v)) dbkeys ptparts)) - (let ((res '())) - (for-each (lambda (key) - (if (not (equal? key "runname")) - (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f))) - (if val (set! res (cons (list key val) res)))))) - dbkeys) - res)))) - fres)))) + (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%")))) + (map (lambda (k v)(list k v)) dbkeys ptparts)) + (let ((res '())) + (for-each (lambda (key) + (if (not (equal? key "runname")) + (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f))) + (if val (set! res (cons (list key val) res)))))) + dbkeys) + res)))) + fres))) + #;(set! *updater-running* #f)) (define (dashboard:runs-tab-updater commondat tab-num) (debug:catch-and-dump (lambda () (let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num)) @@ -3801,22 +3811,25 @@ (dashboard:runs-tab-updater commondat 1)) tab-num: 2) (iup:callback-set! *tim* "ACTION_CB" (lambda (time-obj) - (let ((update-is-running #f)) - (mutex-lock! (dboard:commondat-update-mutex commondat)) - (set! update-is-running (dboard:commondat-updating commondat)) - (if (not update-is-running) - (dboard:commondat-updating-set! commondat #t)) - (mutex-unlock! (dboard:commondat-update-mutex commondat)) - (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update - (begin - (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat) - (mutex-lock! (dboard:commondat-update-mutex commondat)) - (dboard:commondat-updating-set! commondat #f) - (mutex-unlock! (dboard:commondat-update-mutex commondat))) + (if (not *updater-running*) + (begin + ;; (mutex-lock! (dboard:commondat-update-mutex commondat)) + ;; (set! update-is-running (dboard:commondat-updating commondat)) + ;;(if (not update-is-running) + ;; (dboard:commondat-updating-set! commondat #t)) + ;;(mutex-unlock! (dboard:commondat-update-mutex commondat)) + ;;(if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update + ;; (begin + (set! *updater-running* #t) + (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat) + (set! *updater-running* #f) + ;; (mutex-lock! (dboard:commondat-update-mutex commondat)) + ;; (dboard:commondat-updating-set! commondat #f) + ;; (mutex-unlock! (dboard:commondat-update-mutex commondat))) )) 1)))) (let ((th1 (make-thread (lambda () (thread-sleep! 1) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1534,11 +1534,11 @@ state TEXT DEFAULT 'new', status TEXT DEFAULT 'n/a', archive_type TEXT DEFAULT 'bup', du INTEGER, archive_path TEXT);"))) - (print "creating trigges from init") + ;; (print "creating trigges from init") (db:create-triggers db) db)) ;; ) ;;====================================================================== ;; A R C H I V E S @@ -2142,14 +2142,62 @@ (db-exists (common:file-exists? dbname)) (db (sqlite3:open-database dbname))) (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)) (if (not db-exists) (begin - (sqlite3:execute db "PRAGMA synchronous = 0;") (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));") (sqlite3:execute db "PRAGMA journal_mode=WAL;"))) + ;; MOVE THIS TABLE CREATION TO THE (begin above in about six months (it is Sep 2020 right now). + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS jobs_queue (id INTEGER PRIMARY KEY, host_type TEXT, cores INTEGER, memory TEXT, vars TEXT, exekey TEXT, cmdline TEXT, state TEXT, event_time INTEGER, last_update INTEGER);") + ;; not sure I'll use this next one. I prefer if tests simply append to a file: + ;; last-update-seconds cpuload tmpspace rundirspace + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS test_extra_data (id INTEGER PRIMARY KEY, run_id INTEGER, test_id INTEGER, last_seen_running INTEGER);") + (sqlite3:execute db "PRAGMA synchronous = 0;") db)) + +(define (db:no-sync-add-job db-in host-type vars-list exekey cmdline) + (sqlite3:execute (db:no-sync-db db-in) "INSERT INTO jobs_queue (host_type,vars,exekey,cmdline,state,event_time,last_update) VALUES (?,?,?,?,?,?,?);" + host-type vars-list exekey cmdline "waiting" (current-seconds)(current-seconds))) + +;; find next job (waiting longest) that matches host-type - future, we'll find jobs that fit if no exact match +(define (db:no-sync-take-job db-in host-type) + (let* ((db (db:no-sync-db db-in)) + (stmt1 "SELECT id,host_type,vars,exekey,cmdline,state,event_time,last_update FROM jobs_queue WHERE host_type=? AND state != 'taken' ORDER BY event_time ASC;") + (stmt1h (sqlite3:prepare db stmt1)) + (stmt2 "UPDATE jobs_queue SET state='taken',last_update=? WHERE id=?;") + (stmt2h (sqlite3:prepare db stmt2)) + (res (sqlite3:with-transaction + db + (lambda () + (let* ((matching-jobs (sqlite3:fold-row + (lambda (res . row) ;; id host-type vars exekey state event-time last-update) + (cons row res)) + '() + stmt1h + host-type))) + (if (null? matching-jobs) + #f + (let ((choosen-one (let loop ((tal matching-jobs) + (res #f)) ;; put bestest one in here + (if (null? tal) + res + (let ((curr (car tal)) + (rem (cdr tal))) + curr) ;; here we will compare with res, if better candidate the loop with curr else loop with res + )))) + (if choosen-one ;; we need to mark it as taken + (sqlite3:execute stmt2h (current-seconds) (car choosen-one))) + choosen-one))))))) + (sqlite3:finalize! stmt1h) ;; it'd be nice to cache these and finalize on exit. + (sqlite3:finalize! stmt2h) + res)) + +;; clean out old jobs in queue, i.e. taken and event_time > 24 hrs ago +;; +(define (db:no-sync-job-records-clean db) + (sqlite3:execute (db:no-sync-db db) "DELETE FROM jobs_queue WHERE state='taken' AND event_time < ?;" (- (current-seconds)(* 24 3600)))) + ;; if we are not a server create a db handle. this is not finalized ;; so watch for problems. I'm still not clear if it is needed to manually ;; finalize sqlite3 dbs with the sqlite3 egg. ;; @@ -2167,12 +2215,13 @@ (sqlite3:execute (db:no-sync-db db) "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES (?,?);" var val)) (define (db:no-sync-del! db var) (sqlite3:execute (db:no-sync-db db) "DELETE FROM no_sync_metadat WHERE var=?;" var)) -(define (db:no-sync-get/default db var default) - (let ((res default)) +(define (db:no-sync-get/default db-in var default) + (let ((db (db:no-sync-db db-in)) + (res default)) (sqlite3:for-each-row (lambda (val) (set! res val)) (db:no-sync-db db) "SELECT val FROM no_sync_metadat WHERE var=?;" @@ -3467,11 +3516,11 @@ (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs))) run-ids))) -;; Get test data using test_id, run-id is not used +;; Get test data using test_id, run-id is not used - but it will be! ;; (define (db:get-test-info-by-id dbstruct run-id test-id) (db:with-db dbstruct #f ;; run-id Index: docs/manual/megatest_manual.html ================================================================== --- docs/manual/megatest_manual.html +++ docs/manual/megatest_manual.html @@ -769,13 +769,13 @@

The Megatest Users Manual

Matt Welland
<matt@kiatoa.com>
version 1.5, June 2020 -
-
Table of Contents
- +
+
Table of Contents
+

Preface

@@ -3440,10 +3440,48 @@

+
+
+

Test Plan

+
+
+

Tests

+

itemwait|33

+

rerun-downstream-item|20

+

rerunclean|20

+

fullrun|18

+

goodtests|18

+

kill-rerun|17

+

items-runconfigvars|16

+

ro_test|16

+

runconfig-tests|16

+

env-pollution|13

+

itemmap|11

+

testpatt_envvar|10

+

toprun|10

+

chained-waiton|8

+

skip-on-fileexists|8

+

killrun_preqfail|7

+

subrun|6

+

dependencies|5

+

itemwait-simple|4

+

rollup|4

+

end-of-run|3

+

killrun|3

+

listener|3

+

test2|3

+

testpatt|3

+

env-pollution-usecacheno|2

+

set-values|2 +envvars|1 +listruns-tests|1 +subrun-usecases|1

+
+

Megatest Internals

@@ -3461,11 +3499,11 @@

Index: docs/manual/megatest_manual.txt ================================================================== --- docs/manual/megatest_manual.txt +++ docs/manual/megatest_manual.txt @@ -116,10 +116,12 @@ include::writing_tests.txt[] include::howto.txt[] include::reference.txt[] + +include::testplan.txt[] Megatest Internals ------------------ ["graphviz", "server.png"] ADDED docs/manual/testplan.txt Index: docs/manual/testplan.txt ================================================================== --- /dev/null +++ docs/manual/testplan.txt @@ -0,0 +1,81 @@ +// This file is part of Megatest. +// +// Megatest is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// Megatest is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with Megatest. If not, see . + +// Copyright 2006-2020, Matthew Welland. + +Test Plan +--------- + +Tests +~~~~~ + +itemwait|33 + +rerun-downstream-item|20 + +rerunclean|20 + +fullrun|18 + +goodtests|18 + +kill-rerun|17 + +items-runconfigvars|16 + +ro_test|16 + +runconfig-tests|16 + +env-pollution|13 + +itemmap|11 + +testpatt_envvar|10 + +toprun|10 + +chained-waiton|8 + +skip-on-fileexists|8 + +killrun_preqfail|7 + +subrun|6 + +dependencies|5 + +itemwait-simple|4 + +rollup|4 + +end-of-run|3 + +killrun|3 + +listener|3 + +test2|3 + +testpatt|3 + +env-pollution-usecacheno|2 + +set-values|2 +envvars|1 +listruns-tests|1 +subrun-usecases|1 + + Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -205,17 +205,19 @@ (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) - (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) + (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) - (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) + ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - + ;; top of loop encountered at "(current-seconds)" with + ;; last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load @@ -233,33 +235,28 @@ (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) - (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) + #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) - (when do-sync - ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) - ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) - (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) - (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))) - + (if do-sync ;; save meta data about the running of this test + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this @@ -312,11 +309,11 @@ (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) - (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional + (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) @@ -465,11 +462,13 @@ (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. - ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun + ((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun + (and (equal? (db:test-get-state test-info) "COMPLETED") ;; completed/abort => rerun if asked + (member (db:test-get-status test-info) '("ABORT")))) (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) @@ -1355,12 +1354,20 @@ (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") (rmt:set-var key (current-seconds)) (rmt:find-and-mark-incomplete run-id #f)))) - +(defstruct launch:ajt + (vars '()) + (exekey #f) + (host-type #f) + (test-sig #f) + (cmdline #f)) +;; append vars +(define (launch:ajt-add-vars dat vars) + (launch:ajt-vars-set! dat (append (launch:ajt-vars dat) vars))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host @@ -1367,44 +1374,41 @@ ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex - (let* ( ;; (lock-key (conc "test-" test-id)) - ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) - ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds - ;; (if (car lock) - ;; #t - ;; (if (> (current-seconds) expire-time) - ;; (begin - ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path) - ;; (rmt:no-sync-del! lock-key) ;; destroy the lock - ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; - ;; (begin - ;; (thread-sleep! 1) - ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) + (let* (;; locking code removed from here commented out and pasted at end of file (item-path (item-list->path itemdat)) - (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) + (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour"))) + ;; launcher-mode will be 'adjutant or 'normal + (launcher-mode (string->symbol (or (configf:lookup *configdat* "jobtools" "mode") "normal"))) + (ajtdat (make-launch:ajt))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 0))) (if (> launch-delay delta) (begin (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. (debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds")) (thread-sleep! (- launch-delay delta)) (loop (- (current-seconds) *last-launch*) launch-delay)))) (change-directory *toppath*) - (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars) - (append - (list - (list "MT_RUN_AREA_HOME" *toppath*) - (list "MT_TEST_NAME" test-name) - (list "MT_RUNNAME" runname) - (list "MT_ITEMPATH" item-path) - (list "MT_CONTOUR" contour) - ) - itemdat)) + (let ((var-list (append + (list + (list "MT_RUN_AREA_HOME" *toppath*) + (list "MT_TEST_NAME" test-name) + (list "MT_RUNNAME" runname) + (list "MT_ITEMPATH" item-path) + (list "MT_CONTOUR" contour) + ) + itemdat))) + ;; consolidate this code with the code in megatest.scm for + ;; "-execute", *maybe* - the longer they are set the longer + ;; each launch takes (must be non-overlapping with the vars) + (alist->env-vars var-list) + ;; the var-list into the ajtdat adjutant record whether it is needed or not. + (launch:ajt-add-vars ajtdat var-list)) + (let* ((tregistry (tests:get-all)) ;; third param (below) is system-allowed ;; for tconfig, why do we allow fallback to test-conf? (tconfig (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t) (begin (debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. This is likely not desired.") @@ -1422,26 +1426,17 @@ ;; (memory (configf:lookup tconfig "requirements" "memory")) ;; (hosts (configf:lookup *configdat* "jobtools" "workhosts")) ;; I'm pretty sure this was never completed (remote-megatest (configf:lookup *configdat* "setup" "executable")) (run-time-limit (or (configf:lookup tconfig "requirements" "runtimelim") (configf:lookup *configdat* "setup" "runtimelim"))) - ;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to - ;; allow running from dashboard. Extract the path - ;; from the called megatest and convert dashboard - ;; or dboard to megatest (local-megatest (common:find-local-megatest)) - #;(local-megatest (let* ((lm (car (argv))) - (dir (pathname-directory lm)) - (exe (pathname-strip-directory lm))) - (conc (if dir (conc dir "/") "") - (case (string->symbol exe) - ((dboard) "../megatest") - ((mtest) "../megatest") - ((dashboard) "megatest") - (else exe))))) - (launcher (common:get-launcher *configdat* test-name item-path)) ;; (configf:lookup *configdat* "jobtools" "launcher")) - (test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path + (launcher (let ((l (common:get-launcher *configdat* test-name item-path launcher-mode))) + (if (string? l) + (string-split l) + l))) ;; some nonhomogenuity here. '(cmd param1 param2 ...) OR '(host-type launcher) + ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path + (test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) (work-area #f) (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all (diskpath #f) (cmdparms #f) (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) @@ -1451,15 +1446,24 @@ (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) (if (args:get-arg "-logging")(list "-logging") '()) (if (configf:lookup *configdat* "misc" "profilesw") (list (configf:lookup *configdat* "misc" "profilesw")) '())))) + ;; save the test-sig in the ajtdat record + (launch:ajt-test-sig-set! ajtdat test-sig) + ;; go ahead and figure out if we have a host-type from the + ;; launcher call above and save it in the ajtdat record + (if (and (eq? launcher-mode 'adjutant) + (list? launcher) + (> (length launcher) 1)) + (launch:ajt-host-type-set! ajtdat (car launcher))) + ;; (if hosts (set! hosts (string-split hosts))) ;; set the megatest to be called on the remote host (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest")) (set! mt-bindir-path (pathname-directory remote-megatest)) - (if launcher (set! launcher (string-split launcher))) + ;; (if launcher (set! launcher (string-split launcher))) ;; yuk! ;; set up the run work area for this test (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) @@ -1512,64 +1516,92 @@ (list 'runtlim (if run-time-limit (common:hms-string->seconds run-time-limit) #f)) (list 'env-ovrd (hash-table-ref/default *configdat* "env-override" '())) (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) (list 'runname runname) (list 'mt-bindir-path mt-bindir-path)))))))) + ;; save the cmdparms in the ajtdat + (launch:ajt-exekey-set! ajtdat cmdparms) ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records run-id test-id) ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway (if (common:file-exists? work-area) (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir - (cond - ;; ((and launcher hosts) ;; must be using ssh hostname - ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) - ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms)))) + + ;; save the command line for adjutant mode (might never be needed but best to assemble it here) + (launch:ajt-cmdline-set! ajtdat (string-intersperse + (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) + (cond (launcher (set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) - ;; (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms)))) (else (if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section")) (set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" "")))))) - ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" ""))))) + (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm")))) (debug:print 1 *default-log-port* "Launching " work-area) ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done (debug:print 4 *default-log-port* "fullcmd: " fullcmd) (set! *last-launch* (current-seconds)) ;; all that junk above takes time, set this as late as possible. - (let* ((commonprevvals (alist->env-vars - (hash-table-ref/default *configdat* "env-override" '()))) - (miscprevvals (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute" - (append (list (list "MT_TEST_RUN_DIR" work-area) - (list "MT_TEST_NAME" test-name) - (list "MT_ITEM_INFO" (conc itemdat)) - (list "MT_RUNNAME" runname) - (list "MT_TARGET" mt_target) - (list "MT_ITEMPATH" item-path) - ) - itemdat))) - (testprevvals (alist->env-vars - (hash-table-ref/default tconfig "pre-launch-env-overrides" '()))) + (let* ((env-override-vars (hash-table-ref/default *configdat* "env-override" '())) + (commonprevvals (alist->env-vars env-override-vars)) + (misc-vars (append (list (list "MT_TEST_RUN_DIR" work-area) + (list "MT_TEST_NAME" test-name) + (list "MT_ITEM_INFO" (conc itemdat)) + (list "MT_RUNNAME" runname) + (list "MT_TARGET" mt_target) + (list "MT_ITEMPATH" item-path)) + itemdat)) + (miscprevvals (alist->env-vars misc-vars));; consolidate this code with the code in megatest.scm for "-execute" + (test-vars (hash-table-ref/default tconfig "pre-launch-env-overrides" '())) + (testprevvals (alist->env-vars test-vars)) + ;; Launchwait defaults to true, must override it to turn off wait (launchwait (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t)) - (launch-results-prev (apply (if launchwait ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed. - process:cmd-run-with-stderr-and-exitcode->list - process-run) - (if useshell - (let ((cmdstr (string-intersperse fullcmd " "))) - (if launchwait - cmdstr - (conc cmdstr " >> mt_launch.log 2>&1 &"))) - (car fullcmd)) - (if useshell - '() - (cdr fullcmd)))) + ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed. + (launch-results-prev (if (eq? launcher-mode 'adjutant) + '(#t 0) ;; just some fake data to fool downstream but non-applicable code + (apply (if launchwait + process:cmd-run-with-stderr-and-exitcode->list + process-run) + (if useshell + (let ((cmdstr (string-intersperse fullcmd " "))) + (if launchwait + cmdstr + (conc cmdstr " >> mt_launch.log 2>&1 &"))) + (car fullcmd)) + (if useshell + '() + (cdr fullcmd))))) (success (if launchwait (equal? 0 (cadr launch-results-prev)) #t)) (launch-results (if launchwait (car launch-results-prev) launch-results-prev))) - (if (not success) + + (launch:ajt-add-vars ajtdat env-override-vars) + (launch:ajt-add-vars ajtdat misc-vars) + (launch:ajt-add-vars ajtdat test-vars) + + ;; if in adjutant mode we register the job in the jobs_queue + ;; then fire off an adjutant runner + ;; + (if (eq? launcher-mode 'adjutant) + (let* ((adjutant-runner-cmd (append (cdr launcher) + (list remote-megatest "-adjutant" + (launch:ajt-host-type ajtdat) + "-start-dir" *toppath*))) + (adj-cmd (conc (string-intersperse (map conc adjutant-runner-cmd) " ") + "&"))) + (rmt:no-sync-add-job + (launch:ajt-host-type ajtdat) + (conc (launch:ajt-vars ajtdat)) + (launch:ajt-exekey ajtdat) + (launch:ajt-cmdline ajtdat)) + (print "adj-cmd: " adj-cmd) + (system adj-cmd) + )) + + (if (not success) (tests:test-set-status! run-id test-id "COMPLETED" "DEAD" "launcher failed; exited non-zero; check mt_launch.log" #f)) ;; (if launch-results launch-results "FAILED")) - (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. ;; (rmt:no-sync-del! lock-key) ;; release the lock for starting this test (if (not launchwait) ;; give the OS a little time to allow the process to start (thread-sleep! 0.01)) (with-output-to-file "mt_launch.log" (lambda () @@ -1591,10 +1623,14 @@ (process-signal (current-process-id) signal/kill) )) (alist->env-vars miscprevvals) (alist->env-vars testprevvals) (alist->env-vars commonprevvals) + ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. + ;; the unlock previously was further up. This seemed wrong as we should not proceed until the + ;; vars have been reset. + (mutex-unlock! *launch-setup-mutex*) launch-results)) (change-directory *toppath*) (thread-sleep! (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.0)))) ;; recover a test where the top controlling mtest may have died @@ -1620,5 +1656,21 @@ ;; now wait on that process if all is correct ;; periodically update the db with runtime ;; when the process exits look at the db, if still RUNNING after 10 seconds set ;; state/status appropriately (process-wait pid))) + + + ;; (lock-key (conc "test-" test-id)) + ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) + ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds + ;; (if (car lock) + ;; #t + ;; (if (> (current-seconds) expire-time) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path) + ;; (rmt:no-sync-del! lock-key) ;; destroy the lock + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; + ;; (begin + ;; (thread-sleep! 1) + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) + Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.6568) +(define megatest-version 1.6569) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -39,10 +39,17 @@ (declare (uses mt)) (declare (uses api)) (declare (uses tasks)) ;; only used for debugging. (declare (uses env)) (declare (uses diff-report)) + +(declare (uses mutils)) +(import mutils) + +(declare (uses adjutant)) +(import adjutant) + ;; (declare (uses ftail)) ;; (import ftail) (define *db* #f) ;; this is only for the repl, do not use in general!!!! @@ -51,18 +58,18 @@ (include "db_records.scm") (include "run_records.scm") (include "megatest-fossil-hash.scm") (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (prefix base64 base64:) - readline apropos json http-client directory-utils typed-records + readline apropos json http-client directory-utils typed-records matchable http-client srfi-18 extras format) ;; Added for csv stuff - will be removed ;; (use sparse-vectors) -(require-library mutils) +;; (require-library mutils) (define *usage-log-file* #f) ;; put path to file for logging usage in this var in the ~/.megatestrc file (define *usage-use-seconds* #t) ;; for Epoc seconds in usage logging change this to #t in ~/.megatestrc file ;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file @@ -199,11 +206,11 @@ -update-meta : update the tests metadata for all tests -setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are overwritten by values set in config files. -server -|hostname : start the server (reduces contention on megatest.db), use - to automatically figure out hostname - -adjutant C,M : start the server/adjutant with allocated cores C and Mem M (Gig), + -adjutant host-type : start the server/adjutant with given host-type use 0,0 to auto use full machine -transport http|rpc : use http or rpc for transport (default is http) -log logfile : send stdout and stderr to logfile -list-servers : list the servers -kill-servers : kill all servers @@ -912,12 +919,25 @@ ;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to ;; a specific Megatest area. Detail are being hashed out and this may change. ;; (if (args:get-arg "-adjutant") - (begin - (adjutant-run) + (let* ((host-type (args:get-arg "-adjutant"))) + (launch:setup) ;; dang it, wish this wasn't needed + (print "Running the adjutant!") + (let loop ((wait-count 0)) + (if (< wait-count 10) ;; 6 x 10 seconds = one minute + (let* ((dat (rmt:no-sync-take-job host-type))) + (match dat + ((id ht vars exekey cmdline state event-time last-update) + (system cmdline) + (loop 0)) + (else + (thread-sleep! 10) + (loop (+ wait-count 1))))) + (print "I'm bored. Exiting."))) + ;; (adjutant-run (args:get-arg "-ajutant") rmt:no-sync-take-job) (set! *didsomething* #t))) (if (or (args:get-arg "-list-servers") (args:get-arg "-kill-servers")) (let ((tl (launch:setup))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -22,10 +22,12 @@ (declare (unit rmt)) (declare (uses api)) (declare (uses http-transport)) (include "common_records.scm") +(include "db_records.scm") + ;; (declare (uses rmtmod)) ;; (import rmtmod) ;; @@ -54,18 +56,39 @@ (client:setup areapath) #f)))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id +(define *rmt-query-last-call-time* 0) +(define *rmt-query-last-rest-time* 0) ;; last time there was at least a 1/2 second rest - giving other processes access to the db + +;; NOTE: This query rest algorythm will not adapt to long query times. REDESIGN NEEDED. TODO. FIXME. +;; +(define (rmt:query-rest) + (let* ((now (current-milliseconds))) + (cond + ((> (- now *rmt-query-last-call-time*) 500) ;; it's been a while since last query - no need to rest + (set! *rmt-query-last-rest-time* now) + (set! *rmt-query-last-call-time* now)) + ((> (- now *rmt-query-last-rest-time*) 5000) ;; no natural rests have happened + (debug:print 0 *default-log-port* "query rest needed. blocking for 1/2 second.") + (thread-sleep! 0.5) ;; force a rest of a half second + (set! *rmt-query-last-rest-time* now) + (set! *rmt-query-last-call-time* now)) + (else ;; sufficient rests have occurred, just record the last query time + (set! *rmt-query-last-call-time* now))))) + ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected #;(common:telemetry-log (conc "rmt:"(->string cmd)) payload: `((rid . ,rid) (params . ,params))) - + (if (not (equal? (configf:lookup *configdat* "setup" "query-rest") "no")) + (rmt:query-rest)) + (if (> attemptnum 2) (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) (cond ((> attemptnum 2) (thread-sleep! 0.05)) @@ -525,15 +548,24 @@ (rmt:general-call 'register-test run-id run-id test-name item-path)) (define (rmt:get-test-id run-id testname item-path) (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) -;; run-id is NOT used +;; run-id is NOT used - but it will be! ;; (define (rmt:get-test-info-by-id run-id test-id) (if (number? test-id) - (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)) + (let* ((testdat (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))) + (trundatf (conc (db:test-get-rundir testdat) "/.mt_data/test-run.dat"))) + ;; now we can update a couple fields from the filesystem + (if (and (db:test-get-rundir testdat) + (file-exists? trundatf)) + (let* ((duration (db:test-get-run_duration testdat)) + (event-time (db:test-get-event_time testdat)) + (last-touch (file-modification-time trundatf))) + (db:test-set-run_duration! testdat (max duration (- last-touch event-time))))) + testdat) (begin (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) (print-call-chain (current-error-port)) #f))) @@ -927,10 +959,19 @@ (define (rmt:no-sync-del! var) (rmt:send-receive 'no-sync-del! #f `(,var))) (define (rmt:no-sync-get-lock keyname) (rmt:send-receive 'no-sync-get-lock #f `(,keyname))) + +(define (rmt:no-sync-add-job host-type vars-list exekey cmdline) + (rmt:send-receive 'no-sync-add-job #f `(,host-type ,vars-list ,exekey ,cmdline))) + +(define (rmt:no-sync-take-job host-type) + (rmt:send-receive 'no-sync-take-job #f `(,host-type))) + +(define (rmt:no-sync-job-records-clean) + (rmt:set-receive 'no-sync-job-records-clean #f '())) ;;====================================================================== ;; A R C H I V E S ;;====================================================================== Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -585,14 +585,17 @@ (if (not (configf:lookup *configdat* "server" "disable-db-snapshot")) (common:snapshot-file mtdbfile subdir: ".db-snapshot")) (delete-file* staging-file) (let* ((start-time (current-milliseconds)) (res (system sync-cmd)) + (dbbackupfile (conc mtdbfile ".backup")) (res2 (cond - ((eq? 0 res) - (delete-file* (conc mtdbfile ".backup")) + ((eq? 0 res ) + (if (file-exists? dbbackupfile) + (delete-file* dbbackupfile) + ) (if (eq? 0 (file-size sync-log)) (delete-file sync-log)) (system (conc "/bin/mv " staging-file " " mtdbfile)) (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1944,52 +1944,53 @@ tdb "SELECT count(id) FROM test_rundat;") res)) 0) -(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname) - (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1)) - (if (and cpuload diskfree) - (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id)) - (if minutes - (rmt:general-call 'update-run-duration run-id minutes test-id)) - (if (and uname hostname) - (rmt:general-call 'update-uname-host run-id uname hostname test-id))) +;; +(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)(tmpfree #f)) + (if (get-environment-variable "MT_TEST_RUN_DIR") + (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data")) + (or-dash (lambda (instr) + (cond + ((not instr) "") ;; #f -> blank, indicates value unchanged since last measurement taken + ((string? instr)(if (string-search " " instr) (conc "\"" instr "\"") instr)) + (else instr)))) + (file-new (not (directory-exists? dest-dir)))) + (if file-new (create-directory dest-dir #t)) + (let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append))) + (with-output-to-port outp + (lambda () + (if file-new + (print "epoch_time,run_id,test_id,cpuload,diskfree,tmpfree,run_minutes,hostname,uname")) + (print (current-seconds) "," (or-dash run-id) "," (or-dash test-id) "," + (or-dash cpuload) "," (or-dash diskfree) "," (or-dash tmpfree) "," + (or-dash minutes) "," (or-dash hostname) "," + (or-dash uname)))) ;; put uname last as it has spaces in it + (close-output-port outp))) + (begin + (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1)))) + (if update-db + (begin + (if (and cpuload diskfree) + (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id)) + (if minutes + (rmt:general-call 'update-run-duration run-id minutes test-id)) + (if (and uname hostname) + (rmt:general-call 'update-uname-host run-id uname hostname test-id))))) ;; This one is for running with no db access (i.e. via rmt: internally) -(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries) +(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f)) ;; (define (tests:set-full-meta-info test-id run-id minutes work-area) ;; (let ((remtries 10)) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) + (tmpfree (get-df "/tmp")) (uname (get-uname "-srvpio")) (hostname (get-host-name))) - (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname))) - -;; (define (tests:set-partial-meta-info test-id run-id minutes work-area) -#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries) - (let* ((cpuload (get-cpu-load)) - (diskfree (get-df (current-directory))) - (remtries 10)) - (handle-exceptions - exn - (if (> remtries 0) - (begin - (print-call-chain (current-error-port)) - (debug:print-info 0 *default-log-port* "WARNING: failed to set meta info. Will try " remtries " more times") - (set! remtries (- remtries 1)) - (thread-sleep! 10) - (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1))) - (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) - (debug:print-error 0 *default-log-port* "tried for over a minute to update meta info and failed. Giving up") - (debug:print 0 *default-log-port* "EXCEPTION: database probably overloaded or unreadable.") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 5 *default-log-port* "exn=" (condition->list exn)) - (debug:print 0 *default-log-port* " status: " ((condition-property-accessor 'sqlite3 'status) exn)) - (print-call-chain (current-error-port)))) - (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes) - ))) + (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db tmpfree: tmpfree))) + ;;====================================================================== ;; A R C H I V I N G ;;======================================================================