Index: TODO ================================================================== --- TODO +++ TODO @@ -16,10 +16,17 @@ # along with Megatest. If not, see . TODO ==== +WW38 +. Add test_rundat to no-sync ==> correction, put in /.meta/test-run.dat +. Add STATE/STATUS transitions to .meta/test-run.dat or similar +. Swizzle update-test-rundat to operate on no-sync +. Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync +. On state/status change update tests table with duration + WW15 . fill newview matrix with data, filter pipeline gui elements . improve [script], especially indent handling WW16 @@ -35,11 +42,10 @@ . break command line into sections; all, run control, queries, utilities etc. . pull in ftfplan (not integrated, just code pulled in) WW20 . ./configure => ubuntu, sles11, sles12, rh7 -. Jenkins junit XML support . Add output flushing in teamcity support . Switch to using simple runs query everywhere . Add end_time to runs and add a rollup call that sets state, status and end_time Future Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -543,11 +543,11 @@ (handle-exceptions exn #f (if (directory? fullname) (begin - (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") + (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") (inc-stat "directories")) (begin (delete-file* fullname) (inc-stat "deleted"))) (hash-table-delete! all-files file))))))) @@ -567,11 +567,11 @@ (- num-logs max-allowed)))) (for-each (lambda (file) (let* ((fullname (conc "logs/" file))) (if (directory? fullname) - (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") + (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") (handle-exceptions exn (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn) (delete-file* fullname))))) files) @@ -1766,24 +1766,37 @@ ;; - load-in, load as from uptime, NOT normalized ;; - numcpus, number of cpus, ideally use the real cpus, not threads ;; (define (common:get-delay load-in numcpus) (let* ((ratio (/ load-in numcpus)) - (new-option (configf:lookup *configdat* "load" "new-load-method"))) + (new-option (configf:lookup *configdat* "load" "new-load-method")) + (paramstr (or (configf:lookup *configdat* "load" "exp-params") + "15 12 1281453987.9543 0.75")) ;; 5 4 10 1")) + (paramlst (map string->number (string-split paramstr)))) (if new-option - (begin - (cond ((and (>= ratio 0) (< ratio .5)) - 0 - ) - ((and (>= ratio 0.5) (<= ratio .9)) - (* ratio (/ 5 .9))) - ((and (> ratio .9) (<= ratio 1.1)) - (+ 5 (* (- ratio .9) (/ 55 .2)))) - ((> ratio 1.1) - 60))) - (max (/ (expt 5 (* 4 ratio)) 10) 0)))) - + (begin + (cond ((and (>= ratio 0) (< ratio .5)) + 0) + ((and (>= ratio 0.5) (<= ratio .9)) + (* ratio (/ 5 .9))) + ((and (> ratio .9) (<= ratio 1.1)) + (+ 5 (* (- ratio .9) (/ 55 .2)))) + ((> ratio 1.1) + 60))) + (match paramlst + ((r1 r2 s1 s2) + (debug:print 3 *default-log-port* "Using params r1=" r1 " r2=" r2 " s1=" s1 " s2=" s2) + (min (max (/ (expt r1 (* r2 s2 ratio)) s1) 0) 30)) + (else + (debug:print 0 *default-log-port* "BAD exp-params, should be \"r1 r2 s1 s2\" but got " paramstr) + 30))))) + +(define (common:print-delay-table) + (let loop ((x 0)) + (print x "," (common:get-delay x 1)) + (if (< x 2) + (loop (+ x 0.1))))) (define (get-cpu-load #!key (remote-host #f)) (car (common:get-cpu-load remote-host))) ;; (let* ((load-res (process:cmd-run->list "uptime")) ;; (load-rx (regexp "load average:\\s+(\\d+)")) @@ -1800,30 +1813,44 @@ ;; get values from cached info from dropping file in logs dir ;; e.g. key is host and dtype is normalized-load ;; (define (common:get-cached-info key dtype #!key (age 10)) (if *toppath* - (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))) + (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")) + (delfile (lambda () + (debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn) + (delete-file* fullpath) + #f))) (if (and (file-exists? fullpath) (file-read-access? fullpath)) (handle-exceptions exn (begin (debug:print 0 *default-log-port* "failed to get cached info from " fullpath ", exn=" exn) #f) (debug:print 2 *default-log-port* "reading file " fullpath) - (let ((real-age (- (current-seconds)(file-change-time fullpath)))) + (let ((real-age (- (current-seconds) + (handle-exceptions + exn + (begin + (debug:print 1 *default-log-port* "Failed to read mod time on file " + fullpath ", using 0, exn=" exn) + 0) + (file-change-time fullpath))))) (if (< real-age age) (handle-exceptions exn - (begin - (debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn) - (delete-file* fullpath) - #f) - (with-input-from-file fullpath read)) + (delfile) + (let* ((res (with-input-from-file fullpath read))) + (if (eof-object? res) + (begin + (delfile) + #f) + res))) (begin - (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") + (debug:print-info 2 *default-log-port* "file " fullpath + " is too old (" real-age" seconds) to trust, skipping reading it") #f)))) (begin (debug:print 2 *default-log-port* "not reading file " fullpath) #f))) #f)) @@ -2089,33 +2116,38 @@ #f (common:get-homehost))) (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxnormload msg hh))) +(define *numcpus-cache* (make-hash-table)) (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) - (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often! - (let* ((proc (lambda () - (let loop ((numcpu 0) - (inl (read-line))) - (if (eof-object? inl) - (if (> numcpu 0) - numcpu - #f) ;; if zero return #f so caller knows that things are not working - (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl) - (+ numcpu 1) - numcpu) - (read-line)))))) - (result (if remote-host - (with-input-from-pipe - (conc "ssh " remote-host " cat /proc/cpuinfo") - proc) - (with-input-from-file "/proc/cpuinfo" proc)))) - (if (and (number? result) - (> result 0)) - (common:write-cached-info actual-host "num-cpus" result)) - result)))) + ;; hosts had better not be changing the number of cpus too often! + (or (hash-table-ref/default *numcpus-cache* actual-host #f) + (let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) + (let* ((proc (lambda () + (let loop ((numcpu 0) + (inl (read-line))) + (if (eof-object? inl) + (if (> numcpu 0) + numcpu + #f) ;; if zero return #f so caller knows that things are not working + (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl) + (+ numcpu 1) + numcpu) + (read-line)))))) + (result (if remote-host + (with-input-from-pipe + (conc "ssh " remote-host " cat /proc/cpuinfo") + proc) + (with-input-from-file "/proc/cpuinfo" proc)))) + (if (and (number? result) + (> result 0)) + (common:write-cached-info actual-host "num-cpus" result)) + result)))) + (hash-table-set! *numcpus-cache* actual-host numcpus) + numcpus)))) ;; wait for normalized cpu load to drop below maxload ;; (define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5)) (let ((num-cpus (common:get-num-cpus remote-host))) @@ -2155,11 +2187,14 @@ (effective-host (or remote-host "localhost")) (normalized-effective-load (/ effective-load numcpus)) (will-wait (> normalized-effective-load maxnormload))) (if (> recommended-delay 0) (let* ((actual-delay (min recommended-delay 30))) - (debug:print-info 0 *default-log-port* "Load control, delaying " actual-delay " seconds to maintain safe load.") + (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) + (debug:print-info 0 *default-log-port* "Load control, delaying " + actual-delay " seconds to maintain safe load. current normalized effective load is " + normalized-effective-load".")) (thread-sleep! actual-delay))) (cond ;; bad data, try again to get the data ((not will-wait) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -212,10 +212,12 @@ (hash-table-set! (dboard:commondat-tabdats commondat) tabnum tabdat)) +(define *updater-running* #f) ;; move this into one of the stucts + ;; gets and calls updater list based on curr-tab-num ;; (define (dboard:common-run-curr-updaters commondat #!key (tab-num #f)) (if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat (let* ((tnum (or tab-num (dboard:commondat-curr-tab-num commondat))) @@ -240,11 +242,13 @@ tnum (cons updater curr-updaters)))) ;; data for each specific tab goes here ;; -(defstruct dboard:tabdat +(defstruct dboard:tabdat + ;; flags + ((already-running #f) : boolean) ;; the updater is already running. skip ;; runs ((allruns '()) : list) ;; list of dboard:rundat records ((allruns-by-id (make-hash-table)) : hash-table) ;; hash of run-id -> dboard:rundat records ((done-runs '()) : list) ;; list of runs already drawn ((not-done-runs '()) : list) ;; list of runs not yet drawn @@ -645,11 +649,11 @@ ;; (define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals) (let* ((start-time (current-seconds)) (access-mode (dboard:tabdat-access-mode tabdat)) (num-to-get (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get") - "200"))) + "50"))) ;; was 200, which is fine in a normal run area. (states (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat))) (statuses (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat))) (do-not-use-db-file-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab (do-not-use-query-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab (sort-info (get-curr-sort)) @@ -716,10 +720,11 @@ (if got-all (begin (dboard:rundat-last-update-set! run-dat (- start-time 2)) (dboard:rundat-run-data-offset-set! run-dat 0)) (begin + ;;; (thread-sleep! 0.25) ;; give the rest of the gui some time to update. <-- this did NOT help (dboard:rundat-run-data-offset-set! run-dat (+ num-to-get (dboard:rundat-run-data-offset run-dat))))) (for-each (lambda (tdat) @@ -833,11 +838,13 @@ ;; ;; create a virtual table of all the tests ;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") ) ;; (define (dboard:update-rundat tabdat runnamepatt numruns testnamepatt keypatts) - (let* ((access-mode (dboard:tabdat-access-mode tabdat)) + (dboard:tabdat-already-running-set! tabdat #t) + (let* (;; (already-running (dboard:tabdat-already-running tabdat)) + (access-mode (dboard:tabdat-access-mode tabdat)) (keys (dboard:tabdat-keys tabdat)) ;; (db:dispatch-query access-mode rmt:get-keys db:get-keys))) (last-runs-update (- (dboard:tabdat-last-runs-update tabdat) 2)) (allruns (rmt:get-runs runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts)) ;;(allruns-tree (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f)) (allruns-tree (rmt:get-runs-by-patt keys "%" #f #f #f #f 0)) ;; last-runs-update));;'("id" "runname") @@ -901,27 +908,26 @@ (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id) (hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct)) (if (or (null? tal) (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update (begin - (when (> elapsed-time 2) + #;(when (> elapsed-time 2) (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.") (let* ((old-val (iup:attribute *tim* "TIME")) (new-val (number->string (inexact->exact (floor (* 2 (string->number old-val))))))) (if (< (string->number new-val) 5000) - ((debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val) - (iup:attribute-set! *tim* "TIME" new-val)))) - - - ) + (begin + (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val) + (iup:attribute-set! *tim* "TIME" new-val))))) (dboard:tabdat-allruns-set! tabdat new-res) maxtests) (if (> (dboard:rundat-run-data-offset run-struct) 0) (loop run tal new-res newmaxtests) ;; not done getting data for this run (loop (car tal)(cdr tal) new-res newmaxtests))))))) (dboard:tabdat-filters-changed-set! tabdat #f) - (dboard:update-tree tabdat runs-hash header tb))) + (dboard:update-tree tabdat runs-hash header tb) + (dboard:tabdat-already-running-set! tabdat #f))) (define *collapsed* (make-hash-table)) (define (toggle-hide lnum uidat) ; fulltestname) (let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum)) @@ -2505,14 +2511,10 @@ ;; insert extra widget here (if extra-widget extra-widget (iup:hbox)) ;; empty widget - - - - ))) (let* ((status-toggles (map (lambda (status) (iup:toggle (conc status) #:fontsize 8 ;; btn-fontsz ;; "10" @@ -3725,28 +3727,36 @@ ;; (define (tabdat-values tabdat) ;; runs update-rundat using the various filters from the gui ;; (define (dashboard:do-update-rundat tabdat) + ;; this seems like a good place to check for already running and skip if so + ;; + ;; (set! *updater-running* #t) +;;(if (dboard:tabdat-already-running tabdat) +;; (begin +;; (debug:print-info 0 *default-log-port* "Dashboard overloaded - updates will be slow, skipping update.") +;; (dboard:tabdat-target tabdat)) (dboard:update-rundat tabdat (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%") (dboard:tabdat-numruns tabdat) (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%") ;; generate key patterns from the target stored in tabdat (let* ((dbkeys (dboard:tabdat-dbkeys tabdat))) (let ((fres (if (dboard:tabdat-target tabdat) - (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%")))) - (map (lambda (k v)(list k v)) dbkeys ptparts)) - (let ((res '())) - (for-each (lambda (key) - (if (not (equal? key "runname")) - (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f))) - (if val (set! res (cons (list key val) res)))))) - dbkeys) - res)))) - fres)))) + (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%")))) + (map (lambda (k v)(list k v)) dbkeys ptparts)) + (let ((res '())) + (for-each (lambda (key) + (if (not (equal? key "runname")) + (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f))) + (if val (set! res (cons (list key val) res)))))) + dbkeys) + res)))) + fres))) + #;(set! *updater-running* #f)) (define (dashboard:runs-tab-updater commondat tab-num) (debug:catch-and-dump (lambda () (let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num)) @@ -3801,22 +3811,25 @@ (dashboard:runs-tab-updater commondat 1)) tab-num: 2) (iup:callback-set! *tim* "ACTION_CB" (lambda (time-obj) - (let ((update-is-running #f)) - (mutex-lock! (dboard:commondat-update-mutex commondat)) - (set! update-is-running (dboard:commondat-updating commondat)) - (if (not update-is-running) - (dboard:commondat-updating-set! commondat #t)) - (mutex-unlock! (dboard:commondat-update-mutex commondat)) - (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update - (begin - (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat) - (mutex-lock! (dboard:commondat-update-mutex commondat)) - (dboard:commondat-updating-set! commondat #f) - (mutex-unlock! (dboard:commondat-update-mutex commondat))) + (if (not *updater-running*) + (begin + ;; (mutex-lock! (dboard:commondat-update-mutex commondat)) + ;; (set! update-is-running (dboard:commondat-updating commondat)) + ;;(if (not update-is-running) + ;; (dboard:commondat-updating-set! commondat #t)) + ;;(mutex-unlock! (dboard:commondat-update-mutex commondat)) + ;;(if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update + ;; (begin + (set! *updater-running* #t) + (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat) + (set! *updater-running* #f) + ;; (mutex-lock! (dboard:commondat-update-mutex commondat)) + ;; (dboard:commondat-updating-set! commondat #f) + ;; (mutex-unlock! (dboard:commondat-update-mutex commondat))) )) 1)))) (let ((th1 (make-thread (lambda () (thread-sleep! 1) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -459,11 +459,11 @@ (print "Attempt to safely close sqlite3 db failed. Trying again. exn=" exn) (thread-sleep! 3) (sqlite3:interrupt! db) (db:safely-close-sqlite3-db db stmtcache try-num: (- try-num 1))) (if (sqlite3:database? db) - (let* ((stmts (hash-table-ref/default stmt-cache db #f))) + (let* ((stmts (and stmt-cache (hash-table-ref/default stmt-cache db #f)))) (if stmts (map sqlite3:finalize! (hash-table-values stmts))) (sqlite3:finalize! db) #t) #f)))) @@ -826,16 +826,17 @@ ) ;; first pass implementation, just insert all changed rows (for-each (lambda (targdb) - (let* ((db (db:dbdat-get-db targdb)) - (drp-trigger (if (member "last_update" field-names) - (db:drop-trigger db tablename) - #f)) - (is-trigger-dropped (if (member "last_update" field-names) - (db:is-trigger-dropped db tablename) #f)) + (let* ((db (db:dbdat-get-db targdb)) + (drp-trigger (if (member "last_update" field-names) + (db:drop-trigger db tablename) + #f)) + (is-trigger-dropped (if (member "last_update" field-names) + (db:is-trigger-dropped db tablename) + #f)) (stmth (sqlite3:prepare db full-ins))) ;; (db:delay-if-busy targdb) ;; NO WAITING (if (member "last_update" field-names) (debug:print-info 3 *default-log-port* "is-trigger-dropped: " is-trigger-dropped)) (for-each @@ -1224,29 +1225,29 @@ FOR EACH ROW BEGIN UPDATE runs SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ) - (list "update_run_stats_trigger" "CREATE TRIGGER IF NOT EXISTS update_run_stats_trigger AFTER UPDATE ON run_stats + (list "update_run_stats_trigger" "CREATE TRIGGER IF NOT EXISTS update_run_stats_trigger AFTER UPDATE ON run_stats FOR EACH ROW BEGIN UPDATE run_stats SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ) - (list "update_tests_trigger" "CREATE TRIGGER IF NOT EXISTS update_tests_trigger AFTER UPDATE ON tests + (list "update_tests_trigger" "CREATE TRIGGER IF NOT EXISTS update_tests_trigger AFTER UPDATE ON tests FOR EACH ROW BEGIN UPDATE tests SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ) - (list "update_teststeps_trigger" "CREATE TRIGGER IF NOT EXISTS update_teststeps_trigger AFTER UPDATE ON test_steps + (list "update_teststeps_trigger" "CREATE TRIGGER IF NOT EXISTS update_teststeps_trigger AFTER UPDATE ON test_steps FOR EACH ROW BEGIN UPDATE test_steps SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ) - (list "update_test_data_trigger" "CREATE TRIGGER IF NOT EXISTS update_test_data_trigger AFTER UPDATE ON test_data + (list "update_test_data_trigger" "CREATE TRIGGER IF NOT EXISTS update_test_data_trigger AFTER UPDATE ON test_data FOR EACH ROW BEGIN UPDATE test_data SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ))) @@ -1261,40 +1262,43 @@ (for-each (lambda (key) (sqlite3:execute db (cadr key))) db:trigger-list)) (define (db:drop-all-triggers dbstruct) -(db:with-db + (db:with-db dbstruct #f #f (lambda (db) -(db:drop-triggers db)))) + (db:drop-triggers db)))) (define (db:is-trigger-dropped db tbl-name) - (let* ((trigger-name (if (equal? tbl-name "test_steps") - "update_teststeps_trigger" - (conc "update_" tbl-name "_trigger")))) - (sqlite3:for-each-row - (lambda (name) - ;(print name) - (set! res (vector name))) - db - "select name from sqlite_master where type = 'trigger' ;" - ))) + (let* ((trigger-name (if (equal? tbl-name "test_steps") + "update_teststeps_trigger" + (conc "update_" tbl-name "_trigger"))) + (res #f)) + (sqlite3:for-each-row + (lambda (name) + (if (equal? name trigger-name) + (set! res #t))) + db + "SELECT name FROM sqlite_master WHERE type = 'trigger' ;" + ))) (define (db:drop-triggers db) - (for-each (lambda (key) - (sqlite3:execute db (conc "drop trigger " (car key)))) - db:trigger-list)) + (for-each + (lambda (key) + (sqlite3:execute db (conc "drop trigger if exists " (car key)))) + db:trigger-list)) (define (db:drop-trigger db tbl-name) - (let* ((trigger-name (if (equal? tbl-name "test_steps") - "update_teststeps_trigger" - (conc "update_" tbl-name "_trigger")))) - (for-each (lambda (key) - (if (equal? (car key) trigger-name) - (sqlite3:execute db (conc "drop trigger " trigger-name)))) - db:trigger-list))) + (let* ((trigger-name (if (equal? tbl-name "test_steps") + "update_teststeps_trigger" + (conc "update_" tbl-name "_trigger")))) + (for-each + (lambda (key) + (if (equal? (car key) trigger-name) + (sqlite3:execute db (conc "drop trigger if exists " trigger-name)))) + db:trigger-list))) (define (db:create-trigger db tbl-name) (let* ((trigger-name (if (equal? tbl-name "test_steps") "update_teststeps_trigger" (conc "update_" tbl-name "_trigger")))) @@ -1530,11 +1534,11 @@ state TEXT DEFAULT 'new', status TEXT DEFAULT 'n/a', archive_type TEXT DEFAULT 'bup', du INTEGER, archive_path TEXT);"))) - (print "creating trigges from init") + ;; (print "creating trigges from init") (db:create-triggers db) db)) ;; ) ;;====================================================================== ;; A R C H I V E S @@ -1772,15 +1776,33 @@ #f ) (with-input-from-file infile read-lines) ))) -;; select end_time-now from -;; (select testname,item_path,event_time+run_duration as -;; end_time,strftime('%s','now') as now from tests where state in -;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED')); - +;; check duration against test-run.dat file if it exists and update the value in +;; the db if necessary +;; +(define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration) + (let* ((datf (conc run-dir ".mt_data/test-run.dat")) + (modt (if (and (file-exists? datf) + (file-read-access? datf)) + (file-modification-time datf) + #f)) ;; (+ event-time run-duration)))) + (alt-run-duration (if modt + (- modt event-time) + #f))) + (if (and alt-run-duration + (> alt-run-duration run-duration)) + (begin + (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration) + (db:with-db + dbstruct #f #f + (lambda (db) + (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id) + #t))) + #f))) ;; #f = we did NOT adjust the time + (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) ;; The default running-deadtime is 720 seconds = 12 minutes. @@ -1824,37 +1846,39 @@ ;; (db:test-get-run_duration testdat))) ;; 600) ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (begin - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) - (begin - (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) - (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" - test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds) - " event-time="event-time" run-duration="run-duration)))) + (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (begin + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) + (begin + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) + (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" + test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds) + " event-time="event-time" run-duration="run-duration))))) stmth1 run-id running-deadtime) ;; default time 720 seconds - + (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (begin - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) - (begin - (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id - " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time - " run-duration="run-duration) - (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) + (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (begin + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) + (begin + (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id + " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time + " run-duration="run-duration) + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))) stmth2 run-id remotehoststart-deadtime) ;; default time 230 seconds ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config ;; @@ -2180,12 +2204,12 @@ (if newres newres res)) res))) -(define (db:no-sync-close-db db stmtcache) - (db:safely-close-sqlite3-db db stmtcache)) +(define (db:no-sync-close-db db stmt-cache) + (db:safely-close-sqlite3-db db stmt-cache)) ;; transaction protected lock aquisition ;; either: ;; fails returns (#f . lock-creation-time) ;; succeeds (returns (#t . lock-creation-time) @@ -2196,13 +2220,14 @@ (sqlite3:with-transaction db (lambda () (handle-exceptions exn - (let ((lock-time (current-seconds))) - (sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time) - `(#t . ,lock-time)) + (let ((lock-time (current-seconds))) + (debug:print-info 2 *default-log-port* "db:no-sync-get-lock keyname=" keyname ", lock-time=" lock-time ", exn=" exn) + (sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time) + `(#t . ,lock-time)) `(#f . ,(sqlite3:first-result db "SELECT val FROM no_sync_metadat WHERE var=?;" keyname))))))) ;; use a global for some primitive caching, it is just silly to @@ -2238,11 +2263,12 @@ (n 0)) (if (equal? hed field) (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row=" row " header=" header " field=" field ", exn=" exn) + (debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row=" + row " header=" header " field=" field ", exn=" exn) #f) (vector-ref row n)) (if (null? tal) #f (loop (car tal)(cdr tal)(+ n 1))))))) ;; Accessors for the header/data structure @@ -3461,11 +3487,11 @@ (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs))) run-ids))) -;; Get test data using test_id, run-id is not used +;; Get test data using test_id, run-id is not used - but it will be! ;; (define (db:get-test-info-by-id dbstruct run-id test-id) (db:with-db dbstruct #f ;; run-id @@ -4479,11 +4505,11 @@ (db (db:dbdat-get-db dbdat)) ;; we'll return this so (db:delay--if-busy can be called inline (dbfj (conc dbpath "-journal"))) (if (handle-exceptions exn (begin - (debug:print-info 0 *default-log-port* "WARNING: failed to test for existance of " dbfj) + (debug:print-info 0 *default-log-port* "WARNING: failed to test for existance of " dbfj ", exn=" exn) (thread-sleep! 1) (db:delay-if-busy count (- count 1))) (common:file-exists? dbfj)) (case count ((6) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -566,17 +566,17 @@ (debug:print 0 *default-log-port* msg) (if (common:file-exists? full-serv-fname) (system (conc "sleep 1;mv -f " full-serv-fname " " new-serv-fname)) (debug:print 0 *default-log-port* "INFO: cannot move " full-serv-fname " to " new-serv-fname)) (exit))))) - (if (and (not start-time-old) ;; last server start try was less than five seconds ago + #;(if (and (not start-time-old) ;; last server start try was less than five seconds ago (not server-starting)) (begin (cleanup-proc "NOT starting server, there is either a recently started server or a server in process of starting") (exit))) ;; lets not even bother to start if there are already three or more server files ready to go - (let* ((num-alive (server:get-num-alive (server:get-list *toppath*)))) + #;(let* ((num-alive (server:get-num-alive (server:get-list *toppath*)))) (if (> num-alive 3) (begin (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up")) (exit)))) (common:save-pkt `((action . start) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -205,17 +205,19 @@ (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) - (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) + (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) - (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) + ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - + ;; top of loop encountered at "(current-seconds)" with + ;; last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load @@ -233,33 +235,28 @@ (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) - (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) + #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) - (when do-sync - ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) - ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) - (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) - (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))) - + (if do-sync ;; save meta data about the running of this test + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this @@ -312,11 +309,11 @@ (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) - (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional + (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) @@ -1304,12 +1301,12 @@ (debug:print 2 *default-log-port* " - creating run area in " test-path) (handle-exceptions exn (begin (debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) - ", exiting, exn=" exn) - (exit 1)) + ", continuing (might cause downstream issues?), exn=" exn) + #f) (create-directory test-path #t)) (debug:print 2 *default-log-port* " - creating link from: " test-path "\n" " to: " lnktarget) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -22,10 +22,12 @@ (declare (unit rmt)) (declare (uses api)) (declare (uses http-transport)) (include "common_records.scm") +(include "db_records.scm") + ;; (declare (uses rmtmod)) ;; (import rmtmod) ;; @@ -525,15 +527,24 @@ (rmt:general-call 'register-test run-id run-id test-name item-path)) (define (rmt:get-test-id run-id testname item-path) (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) -;; run-id is NOT used +;; run-id is NOT used - but it will be! ;; (define (rmt:get-test-info-by-id run-id test-id) (if (number? test-id) - (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)) + (let* ((testdat (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))) + (trundatf (conc (db:test-get-rundir testdat) "/.mt_data/test-run.dat"))) + ;; now we can update a couple fields from the filesystem + (if (and (db:test-get-rundir testdat) + (file-exists? trundatf)) + (let* ((duration (db:test-get-run_duration testdat)) + (event-time (db:test-get-event_time testdat)) + (last-touch (file-modification-time trundatf))) + (db:test-set-run_duration! testdat (max duration (- last-touch event-time))))) + testdat) (begin (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) (print-call-chain (current-error-port)) #f))) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -16,11 +16,12 @@ ;; along with Megatest. If not, see . ;; strftime('%m/%d/%Y %H:%M:%S','now','localtime') (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) - posix-extras directory-utils pathname-expand typed-records format sxml-serializer sxml-modifications) + posix-extras directory-utils pathname-expand typed-records format sxml-serializer + sxml-modifications matchable) (declare (unit runs)) (declare (uses db)) (declare (uses common)) (declare (uses items)) @@ -45,14 +46,19 @@ (defstruct runs:dat reglen regfull runname max-concurrent-jobs run-id test-patts required-tests test-registry registry-mutex flags keyvals run-info all-tests-registry - can-run-more-tests + ;; stores results from last runs:can-run-more-tests + (can-run-more-tests #f) ;; (list can-run-more-flag num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit) ((can-run-more-tests-count 0) : fixnum) (last-fuel-check 0) ;; time when we last checked fuel (beginning-of-time (current-seconds)) + (load-mgmt-function #f) + (wait-for-jobs-function #f) + (last-load-check-time 0) + (last-jobs-check-time 0) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup @@ -89,11 +95,11 @@ lock-files))) (if fresh-locks (begin (if (runs:lownoise "runners-softlock-wait" 360) (debug:print-info 0 *default-log-port* "Other runners in flight, giving up some time...")) - (thread-sleep! 10)) + (thread-sleep! 2)) (begin (if (runs:lownoise "runners-softlock-nowait" 360) (debug:print-info 0 *default-log-port* "No runners in flight, updating softlock")) (let* ((ouf (open-output-file my-lock-file))) (with-output-to-port ouf @@ -313,25 +319,12 @@ ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) - (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move - ;; to cond clauses below where we determine we - ;; have too many jobs running rather than each - ;; time the and condition above is true (which - ;; seems like always)? - ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way - 0) - ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time - (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ...")) - 10) ;; obviously haven't had any work to do for a while - (else 0))) -;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero? -;; (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01) -;; ))) - + (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) @@ -539,11 +532,11 @@ ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") - (common:telemetry-log "run-tests" + #;(common:telemetry-log "run-tests" payload: `( (target . ,target) (run-name . ,runname) (test-patts . ,test-patts) ) ) @@ -991,13 +984,17 @@ (null? non-completed)) (debug:print-info 4 *default-log-port* "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) - (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; + (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;; ;; getting here likely means the system is way overloaded, kill a full minute before continuing - (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) + ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!! + ;; No runsdat, can't do this yet + ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + ;; + (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") @@ -1123,11 +1120,29 @@ (conc " WARNING: t is not a vector=" t ))) prereqs-not-met) ", ") ") fails: " fails "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) - + ;; well, first lets see if cpu load throttling is enabled. If so wait around until the + ;; average cpu load is under the threshold before continuing + ;; + (if (runs:dat-load-mgmt-function runsdat) + ((runs:dat-load-mgmt-function runsdat)) + (runs:dat-load-mgmt-function-set! + runsdat + (lambda () + ;; jobtools maxload is useful for where the full Megatest run is done on one machine + (if (and (not (common:on-homehost?)) + maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized + (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) + + ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues + (if maxhomehostload + (common:wait-for-homehost-load maxhomehostload + (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))))) + + (if (and (not (null? prereqs-not-met)) (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) @@ -1212,29 +1227,17 @@ ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! *max-tries-hash* (make-hash-table)) - ;; well, first lets see if cpu load throttling is enabled. If so wait around until the - ;; average cpu load is under the threshold before continuing - - ;; jobtools maxload is useful for where the full Megatest run is done on one machine - (if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized - (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) - - ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues - (if maxhomehostload - (common:wait-for-homehost-load maxhomehostload - (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) - - (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) + (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) (if (or (not (null? tal))(not (null? reg))) - (runs:loop-values tal reg reglen regfull reruns) + (runs:loop-values tal reg reglen regfull reruns) ;; hed should be dropped at this time #f)) ;; must be we have unmet prerequisites ;; (else @@ -1251,19 +1254,18 @@ (member 'toplevel testmode)) (begin ;; couldn't run, take a breather (if (runs:lownoise "Waiting for more work to do..." 60) (debug:print-info 0 *default-log-port* "Waiting for more work to do...")) - (thread-sleep! 1) + + ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + (thread-sleep! 5) (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (begin (let ((my-test-id (rmt:get-test-id run-id test-name item-path))) - (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2")) - - - + (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2")) (if (or (not (null? reg))(not (null? tal))) (if (vector? hed) (begin (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path " from the launch list as it has prerequistes that are FAIL") @@ -1581,19 +1583,10 @@ itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) - ;; -- removed BB 17ww28 - no longer needed. - ;; every 15 minutes verify the server is there for this run - ;; (if (and (common:low-noise-print 240 "try start server" run-id) - ;; (not (or (and *runremote* - ;; (remote-server-url *runremote*) - ;; (server:ping (remote-server-url *runremote*))) - ;; (server:check-if-running *toppath*)))) - ;; (server:kind-run *toppath*)) - (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1))) @@ -1636,11 +1629,11 @@ "\n regfull: " regfull "\n reglen: " reglen "\n length reg: " (length reg) ) - (runs:parallel-runners-mgmt runsdat) + ;; (runs:parallel-runners-mgmt runsdat) ;; check for hed in waitons => this would be circular, remove it and issue an ;; error (if (member test-name waitons) (begin @@ -1672,19 +1665,67 @@ (debug:print-info 4 *default-log-port* "cond branch - " "rtq-2") (debug:print-info 4 *default-log-port* "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) + ;;(runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) + + ;; gonna try a strategy change here. + ;; + ;; check if can run more tests. if yes, continue, if no, rest until can run more + ;; look at the test jobgroup and tot jobs running + ;; + ;; NOTE: This does NOT actually gate here, only captures the proc to be called later + ;; + (if (not (runs:dat-wait-for-jobs-function runsdat)) + (runs:dat-wait-for-jobs-function-set! + runsdat + (lambda (testdat-in) + (let* ((jobgroup (runs:testdat-jobgroup testdat-in)) + (can-run-more-tests (runs:dat-can-run-more-tests runsdat)) + (last-jobs-check-time (runs:dat-last-jobs-check-time runsdat)) + (should-check-jobs (match can-run-more-tests + ((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params) + (if (< (- max-concurrent-jobs num-running) 25) + (begin + (debug:print-info 0 *default-log-port* + "less than 20 jobs headroom, ("max-concurrent-jobs + "-"num-running")>20. Forcing prelaunch check.") + #t) + #f)) + (else #f)))) ;; no record yet + (if should-check-jobs + (let loop-can-run-more + ((res (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) + (remtries 1440)) ;; we can wait for up to two hours for jobs to get done + (match res + ((run-more num-running . rem) + (if (or run-more + (< remtries 1)) + (begin + (if (runs:lownoise "num-running" 30) + (debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs)) + (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through + (begin + (if (runs:lownoise "num-running" 10) + (debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of " + max-concurrent-jobs " allowed.")) + (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable + + ;; wait for load here + (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) + (- remtries 1))))))) + ))))) + + ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) - ;; This would be a good place to block on homehost load - + ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed + (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) - - - (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running - (let ((loop-list (runs:process-expanded-tests runsdat testdat))) + (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated @@ -1743,19 +1784,20 @@ ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS ((or (procedure? items)(eq? items 'have-procedure)) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-4") - (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) + (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) - (car can-run-more)) - (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here + (car can-run-more)) ;; itemized test expanded here + (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup + max-concurrent-jobs run-id waitons item-path + testmode test-record can-run-more items runname + tconfig reglen test-registry test-records itemmaps))) (if loop-list (apply loop loop-list) - (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed) - ) - ) + (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed))) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) @@ -1788,11 +1830,13 @@ ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched (rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) + ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle + (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes (prev-num-running 0)) ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup *configdat* "setup" "run-wait") "yes")) @@ -1807,11 +1851,12 @@ (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) - (thread-sleep! 5) + ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. @@ -1866,11 +1911,11 @@ (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)) ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step ;; -(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry) +(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry runsdat testdat-rec) ;; All these vars might be referenced by the testconfig file reader ;; ;; NEED to reprocess testconfig here, ensuring that item variables are available. ;; This is for Tal's issue with item-specific env vars not being set for use in skip. ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273 @@ -1944,12 +1989,13 @@ (set! test-id (rmt:get-test-id run-id test-name item-path)))) (debug:print-info 4 *default-log-port* "test-id=" test-id ", run-id=" run-id ", test-name=" test-name ", item-path=\"" item-path "\"") (set! testdat (rmt:get-test-info-by-id run-id test-id)) (if (not testdat) (begin - (debug:print-info 0 *default-log-port* "WARNING: server is overloaded, trying again in one second") - (thread-sleep! 1) + (debug:print-info 0 *default-log-port* "WARNING: server is overloaded, trying again in two seconds") + ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + (thread-sleep! 2) (loop))))) (if (not testdat) ;; should NOT happen (debug:print-error 0 *default-log-port* "failed to get test record for test-id " test-id)) (set! test-id (db:test-get-id testdat)) (if (common:file-exists? test-path) @@ -2056,15 +2102,23 @@ (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test) (debug:print-info 1 *default-log-port* "SKIPPING Test " full-test-name " due to " skip-test)) ;; ;; Here the test is handed off to launch.scm for launch-test to complete the launch process ;; - (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) - (begin - (print "ERROR: Failed to launch the test. Exiting as soon as possible") - (set! *globalexitstatus* 1) ;; - (process-signal (current-process-id) signal/kill)))))))) + (begin + ;; wait for less than max jobs here + (if (runs:dat-wait-for-jobs-function runsdat) + ((runs:dat-wait-for-jobs-function runsdat) testdat-rec)) + + (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) + (begin + (print "ERROR: Failed to launch the test. Exiting as soon as possible") + (set! *globalexitstatus* 1) ;; + (process-signal (current-process-id) signal/kill)) + ) + ;; wait again here? + )))))) ((KILLED) (debug:print 1 *default-log-port* "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.") (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED)) ((LAUNCHED REMOTEHOSTSTART RUNNING) (debug:print 2 *default-log-port* "NOTE: " test-name " is already running")) @@ -2535,11 +2589,11 @@ (if (not (null? tal)) (loop (car tal)(cdr tal)))) ((run-wait) ;; BB TODO - manage has-subrun case (debug:print-info 2 *default-log-port* "still waiting, " (length tests) " tests still running") - (thread-sleep! 10) + (thread-sleep! 5) (let ((new-tests (proc-get-tests run-id))) (if (null? new-tests) (debug:print-info 1 *default-log-port* "Run completed according to zero tests matching provided criteria.") (loop (car new-tests)(cdr new-tests))))) ((archive) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -202,13 +202,13 @@ (if (if (directory-exists? (conc areapath "/logs")) '() (if (file-write-access? areapath) (begin (condition-case - (create-directory (conc areapath "/logs") #t) - (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) - (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list."))) + (create-directory (conc areapath "/logs") #t) + (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) + (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) (directory-exists? (conc areapath "/logs"))) '())) (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) (num-serv-logs (length server-logs))) (if (null? server-logs) @@ -215,15 +215,15 @@ '() (let loop ((hed (car server-logs)) (tal (cdr server-logs)) (res '())) (let* ((mod-time (handle-exceptions - exn - (begin - (print "failed to get modification time on " hed ", exn=" exn) - (current-seconds)) ;; 0 - (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted + exn + (begin + (print "failed to get modification time on " hed ", exn=" exn) + (current-seconds)) ;; 0 + (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted (down-time (- (current-seconds) mod-time)) (serv-dat (if (or (< num-serv-logs 10) (< down-time 900)) ;; day-seconds)) (server:logf-get-start-info hed) '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at @@ -231,16 +231,16 @@ (fmatch (string-match fname-rx hed)) (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) (new-res (if (null? serv-dat) res (cons (append serv-rec (list pid)) res)))) - (if (null? tal) - (if (and limit - (> (length new-res) limit)) - new-res ;; (take new-res limit) <= need intelligent sorting before this will work - new-res) - (loop (car tal)(cdr tal) new-res))))))))) + (if (null? tal) + (if (and limit + (> (length new-res) limit)) + new-res ;; (take new-res limit) <= need intelligent sorting before this will work + new-res) + (loop (car tal)(cdr tal) new-res))))))))) (define (server:get-num-alive srvlst) (let ((num-alive 0)) (for-each (lambda (server) @@ -274,14 +274,15 @@ (mod-time (list-ref rec 0))) ;; (print "start-time: " start-time " mod-time: " mod-time) (and start-time mod-time (> (- now start-time) 0) ;; been running at least 0 seconds (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds - (< (- now start-time) - (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) - 180) - (random 360))) ;; under one hour running time +/- 180 + (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set + (< (- now start-time) + (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) + 180) + (random 360)))) ;; under one hour running time +/- 180 )) #f)) srvlst) (lambda (a b) (< (list-ref a 3) @@ -321,25 +322,39 @@ *my-client-signature*))) ;; wait for server=start-last to be three seconds old ;; (define (server:wait-for-server-start-last-flag areapath) - (let* ((start-flag (conc areapath "/logs/server-start-last"))) + (let* ((start-flag (conc areapath "/logs/server-start-last")) + ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) + (reftime (configf:lookup-number *configdat* "server" "idletime" default: 4)) + (server-key (conc (get-host-name) "-" (current-process-id)))) (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) - (reftime (+ 3 (random 5))) - (delta (- (current-seconds) fmodtime))) - (if (> delta reftime) ;; good enough - (begin - (debug:print-info 0 *default-log-port* "Ready to start server, last start: " - fmodtime ", delta: " delta ", reftime: " reftime) - (system (conc "touch " start-flag))) ;; lazy but safe - (begin - (thread-sleep! 5) - (server:wait-for-server-start-last-flag areapath)))) - (system (conc "touch " start-flag))))) - + (delta (- (current-seconds) fmodtime)) + (all-go (> delta reftime))) + (if (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Something not right with soft server locking: exn=" exn) + #t) + (and all-go + (begin + (with-output-to-file start-flag + (lambda () + (print server-key))) + (thread-sleep! 0.25) + (let ((res (with-input-from-file start-flag + (lambda () + (read-line))))) + (equal? server-key res))))) + #t ;; (system (conc "touch " start-flag)) ;; lazy but safe + (begin + (debug:print-info 0 *default-log-port* "Gating server start, last start: " + fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go) + (thread-sleep! reftime) + (server:wait-for-server-start-last-flag areapath))))))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched ;; (define (server:kind-run areapath) @@ -356,12 +371,13 @@ ((2) 300) (else 600)) (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously (lock-file (conc areapath "/logs/server-start.lock"))) (if (> (- (current-seconds) when-run) run-delay) - (begin + (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 15) + (system (conc "touch " start-flag)) ;; lazy but safe (server:run areapath) (thread-sleep! 2) ;; don't release the lock for at least a few seconds (common:simple-file-release-lock lock-file))) (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) @@ -574,18 +590,26 @@ (if (not (configf:lookup *configdat* "server" "disable-db-snapshot")) (common:snapshot-file mtdbfile subdir: ".db-snapshot")) (delete-file* staging-file) (let* ((start-time (current-milliseconds)) (res (system sync-cmd)) + (dbbackupfile (conc mtdbfile ".backup")) (res2 (cond - ((eq? 0 res) - (delete-file* (conc mtdbfile ".backup")) + ((eq? 0 res ) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Problem making backup db file, exn=" exn) + #t) + (if (file-exists? dbbackupfile) + (delete-file* dbbackupfile) + ) (if (eq? 0 (file-size sync-log)) (delete-file sync-log)) (system (conc "/bin/mv " staging-file " " mtdbfile)) - + ) (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) (set! off-time (calculate-off-time last-sync-seconds (cond ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1)) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1944,28 +1944,52 @@ tdb "SELECT count(id) FROM test_rundat;") res)) 0) -(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname) - (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1)) - (if (and cpuload diskfree) - (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id)) - (if minutes - (rmt:general-call 'update-run-duration run-id minutes test-id)) - (if (and uname hostname) - (rmt:general-call 'update-uname-host run-id uname hostname test-id))) +;; +(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)(tmpfree #f)) + (if (get-environment-variable "MT_TEST_RUN_DIR") + (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data")) + (or-dash (lambda (instr) + (cond + ((not instr) "") ;; #f -> blank, indicates value unchanged since last measurement taken + ((string? instr)(if (string-search " " instr) (conc "\"" instr "\"") instr)) + (else instr)))) + (file-new (not (directory-exists? dest-dir)))) + (if file-new (create-directory dest-dir #t)) + (let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append))) + (with-output-to-port outp + (lambda () + (if file-new + (print "epoch_time,run_id,test_id,cpuload,diskfree,tmpfree,run_minutes,hostname,uname")) + (print (current-seconds) "," (or-dash run-id) "," (or-dash test-id) "," + (or-dash cpuload) "," (or-dash diskfree) "," (or-dash tmpfree) "," + (or-dash minutes) "," (or-dash hostname) "," + (or-dash uname)))) ;; put uname last as it has spaces in it + (close-output-port outp))) + (begin + (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1)))) + (if update-db + (begin + (if (and cpuload diskfree) + (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id)) + (if minutes + (rmt:general-call 'update-run-duration run-id minutes test-id)) + (if (and uname hostname) + (rmt:general-call 'update-uname-host run-id uname hostname test-id))))) ;; This one is for running with no db access (i.e. via rmt: internally) -(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries) +(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f)) ;; (define (tests:set-full-meta-info test-id run-id minutes work-area) ;; (let ((remtries 10)) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) + (tmpfree (get-df "/tmp")) (uname (get-uname "-srvpio")) (hostname (get-host-name))) - (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname))) + (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db tmpfree: tmpfree))) ;; (define (tests:set-partial-meta-info test-id run-id minutes work-area) #;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory)))