Changes In Branch v1.65-shell-problem Excluding Merge-Ins
This is equivalent to a diff from 0a9e690b28 to 3d9c1de633
2020-05-01
| ||
20:30 | Cherrypicked inter-test-delay and cached load to 10s check-in: aa29985039 user: mrwellan tags: v1.65-broken | |
18:13 | Fixed (I hope) inter-test-delay support. Reduced useless noise on cpu load checking messages. Closed-Leaf check-in: 3d9c1de633 user: mrwellan tags: v1.65-shell-problem, v1.65-broken | |
15:58 | Upped allowed time for reading cached load data to 10 sec. The load is over a 60 sec. window, does 10 seconds really make a difference? check-in: 6625f3bc6a user: mrwellan tags: v1.65-shell-problem, v1.65-broken | |
2020-04-30
| ||
22:44 | Partial fix for #{shell ...} problem in testconfigs check-in: 3ccc64251f user: mrwellan tags: v1.65-shell-problem, v1.65-broken | |
16:22 | rewrite absurdly long log file names if over 250 chars long check-in: 0a9e690b28 user: mrwellan tags: v1.65-broken | |
2020-04-29
| ||
14:39 | Added speculative fix (try few times) to lauch where it is trying to get the test-info. check-in: 18da8b6a61 user: mrwellan tags: v1.65-broken | |
Modified common.scm from [d0760d6348] to [7eb16cae49].
︙ | ︙ | |||
484 485 486 487 488 489 490 | (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age (if (not (directory-exists? "logs"))(create-directory "logs")) (directory-fold (lambda (file rem) (handle-exceptions exn (begin | | | | > | 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 | (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age (if (not (directory-exists? "logs"))(create-directory "logs")) (directory-fold (lambda (file rem) (handle-exceptions exn (begin (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore.") (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) ;; (print-call-chain (current-error-port)) ;; ) (let* ((fullname (conc "logs/" file)) (mod-time (file-modification-time fullname)) (file-age (- (current-seconds) mod-time))) (hash-table-set! all-files file mod-time) (if (or (and (string-match "^.*.log" file) (> (file-size fullname) 200000)) (and (string-match "^server-.*.log" file) |
︙ | ︙ | |||
1697 1698 1699 1700 1701 1702 1703 | ;; (set! cpu-load newval)))))) ;; (car load-res)) ;; cpu-load)) ;; get values from cached info from dropping file in logs dir ;; e.g. key is host and dtype is normalized-load ;; | | | | 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 | ;; (set! cpu-load newval)))))) ;; (car load-res)) ;; cpu-load)) ;; get values from cached info from dropping file in logs dir ;; e.g. key is host and dtype is normalized-load ;; (define (common:get-cached-info key dtype #!key (age 10)) (if *toppath* (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))) (if (and (file-exists? fullpath) (file-read-access? fullpath)) (handle-exceptions exn #f (debug:print 2 *default-log-port* "reading file " fullpath) (let ((real-age (- (current-seconds)(file-change-time fullpath)))) (if (< real-age age) (with-input-from-file fullpath read) (begin (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") #f)))) (begin (debug:print 2 *default-log-port* "not reading file " fullpath) #f))) #f)) (define (common:write-cached-info key dtype dat) |
︙ | ︙ | |||
1984 1985 1986 1987 1988 1989 1990 | numcpu) (read-line)))))) (result (if remote-host (with-input-from-pipe (conc "ssh " remote-host " cat /proc/cpuinfo") proc) (with-input-from-file "/proc/cpuinfo" proc)))) | > > | | 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 | numcpu) (read-line)))))) (result (if remote-host (with-input-from-pipe (conc "ssh " remote-host " cat /proc/cpuinfo") proc) (with-input-from-file "/proc/cpuinfo" proc)))) (if (and (number? result) (> result 0)) (common:write-cached-info actual-host "num-cpus" result)) result)))) ;; wait for normalized cpu load to drop below maxload ;; (define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5)) (let ((num-cpus (common:get-num-cpus remote-host))) (if num-cpus |
︙ | ︙ | |||
2014 2015 2016 2017 2018 2019 2020 | maxload-in (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? (first (car loadavg)) (next (cadr loadavg)) (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 (loadjmp (- first next)) (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously | > > | | | 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 | maxload-in (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? (first (car loadavg)) (next (cadr loadavg)) (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 (loadjmp (- first next)) (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously ;; let's let the user know once in a long while that load checking is happening but not constantly report it (if (> (random 100) 75) ;; about 25% of the time (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)) (cond ((and (> first adjload) (> count 0)) (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) (thread-sleep! adjwait) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) ((and (> loadjmp numcpus) |
︙ | ︙ |
Modified megatest.scm from [ee60e5eddb] to [4b27fb4644].
︙ | ︙ | |||
238 239 240 241 242 243 244 | -refdb2dat refdb : convert refdb to sexp or to format specified by s-dumpmode formats: perl, ruby, sqlite3, csv (for csv the -o param will substitute %s for the sheet name in generating multiple sheets) -o : output file for refdb2dat (defaults to stdout) -archive cmd : archive runs specified by selectors to one of disks specified in the [archive-disks] section. | | > > | 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 | -refdb2dat refdb : convert refdb to sexp or to format specified by s-dumpmode formats: perl, ruby, sqlite3, csv (for csv the -o param will substitute %s for the sheet name in generating multiple sheets) -o : output file for refdb2dat (defaults to stdout) -archive cmd : archive runs specified by selectors to one of disks specified in the [archive-disks] section. cmd: keep-html, restore, save, save-remove, get (use -dest to set destination), -include and -exclude to include or exclude files) -generate-html : create a simple html dashboard for browsing your runs -generate-html-structure : create a top level html veiw to list targets/runs and a Run view within each run directory. -list-run-time : list time requered to complete runs. It supports following switches -run-patt <patt> -target-patt <patt> -dumpmode <csv,json,plain-text> -list-test-time : list time requered to complete each test in a run. It following following arguments -runname <patt> -target <patt> -dumpmode <csv,json,plain-text> -syscheck : do some very basic checks; write access and space in tmp, home, runs, links and |
︙ | ︙ |
Modified runs.scm from [c265f57285] to [04a7c3fd06].
︙ | ︙ | |||
235 236 237 238 239 240 241 | (define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) | | > > > > | < | | > > | 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 | (define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move ;; to cond clauses below where we determine we ;; have too many jobs running rather than each ;; time the and condition above is true (which ;; seems like always)? ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ...")) 10) ;; obviously haven't had any work to do for a while (else ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero? (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) |
︙ | ︙ |
Modified tests.scm from [bf1af44b82] to [fbd7ebda22].
︙ | ︙ | |||
166 167 168 169 170 171 172 | (items:get-items-from-config tconfig)) (else #f)))) ;; not iterated ;; returns waitons waitors tconfigdat ;; (define (tests:get-waitons test-name all-tests-registry) | | | 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | (items:get-items-from-config tconfig)) (else #f)))) ;; not iterated ;; returns waitons waitors tconfigdat ;; (define (tests:get-waitons test-name all-tests-registry) (let* ((config (tests:get-testconfig test-name #f all-tests-registry #t))) ;;'return-procs))) ;; assuming no problems with immediate evaluation, this could be simplified ('return-procs -> #t) (let ((instr (if config (configf:lookup config "requirements" "waiton") (begin ;; No config means this is a non-existant test (debug:print-error 0 *default-log-port* "non-existent required test \"" test-name "\"") (exit 1)))) (instr2 (if config (configf:lookup config "requirements" "waitor") |
︙ | ︙ | |||
1577 1578 1579 1580 1581 1582 1583 | (getenv "MT_TARGET") "/" (getenv "MT_RUNNAME") "/" test-name "/" item-path)) (local-tcfg (conc local-tcdir "/testconfig"))) (if (common:file-exists? local-tcfg) local-tcdir #f)) | > > | > < | < | < | | > | | 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 | (getenv "MT_TARGET") "/" (getenv "MT_RUNNAME") "/" test-name "/" item-path)) (local-tcfg (conc local-tcdir "/testconfig"))) (if (common:file-exists? local-tcfg) local-tcdir #f)) (begin (debug:print-info 0 *default-log-port* "reading testconfig for "test-full-name" from tests/"test-name" directory.") (conc *toppath* "/tests/" test-name)) ;; should this fallback exist? )) (test-configf (conc test-path "/testconfig")) (testexists (let loopa ((tries-left 30)) (cond ((and (common:file-exists? test-configf)(file-read-access? test-configf)) #t) ((common:file-exists? test-configf) (debug:print 0 *default-log-port* "WARNING: Cannot read testconfig file: "test-configf) #f) ((and wait-a-minute (> tries-left 0)) (thread-sleep! 10) (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf " will retry in 10 seconds. Tries left: "tries-left) ;; BB: this fires (loopa (sub1 tries-left))) (else (debug:print 0 *default-log-port* "WARNING: testconfig file does not exist: "test-configf) ;; BB: this fires #f)))) (tcfg (if testexists (configf:read-file test-configf #f system-allowed environ-patt: (if system-allowed "pre-launch-env-vars" #f)) #f))) (if (and tcfg cache-file) (hash-table-set! tcfg "have fulldata" #t)) ;; mark this as fully read data (if tcfg (hash-table-set! *testconfigs* test-full-name tcfg)) (if (and testexists |
︙ | ︙ |