Index: archive.scm ================================================================== --- archive.scm +++ archive.scm @@ -90,11 +90,11 @@ (pscript-cmd (conc pscript " " testsuite-name " " target " " run-name " " test-name)) (apath (if pscript (handle-exceptions exn (begin - (debug:print 0 "ERROR: script \"" pscript-cmd "\" failed to run properly.") + (debug:print 0 *default-log-port* "ERROR: script \"" pscript-cmd "\" failed to run properly.") (exit 1)) (with-input-from-pipe pscript-cmd read-line)) #f)) ;; this is the user-calculated archive path @@ -116,13 +116,16 @@ ;; (allocation-id (rmt:archive-allocate-testsuite/area-to-block block-id testsuite-name area-key))) (if block-id ;; (and block-id allocation-id) (let ((res (cons block-id archive-path))) (hash-table-set! blockid-cache key res) res) - #f)) - #f)) ;; no best disk found - ))) + (begin + (debug:print 0 *default-log-port* "WARNING: no disk found for " target ", " run-name ", " test-name ", archive-path=" archive-path) + #f))) + (begin + (debug:print 0 *default-log-port* "WARNING: no disk found for " target ", " run-name ", " test-name ", block-id=" block-id) + #f)))))) ;; no best disk found ;; archive - run bup ;; ;; 1. create the bup dir if not exists ;; 2. start the du of each directory @@ -246,11 +249,20 @@ (arch-group (hash-table-ref arch-groups test-base)) (arch-info (car arch-group)) ;; don't know yet how this will work, can I get more than one possibility? (archive-id (car arch-info)) (archive-dir (cdr arch-info))) (debug:print 0 *default-log-port* "Processing disk-group " test-base) - (let* ((test-paths (hash-table-ref disk-groups test-base))) + (let* ((test-paths-in (hash-table-ref disk-groups test-base)) + (test-paths (if (args:get-arg "-include") + (let ((subpaths (string-split (args:get-arg "-include") ","))) + (apply append + (map (lambda (p) + (map (lambda (subp) + (conc p "/" subp)) + subpaths)) + test-paths-in))) + test-paths-in))) (if (not (common:file-exists? archive-dir)) (create-directory archive-dir #t)) (case archiver ((bup) ;; Archive using bup (let* ((bup-init-params (list "-d" archive-dir "init")) @@ -343,11 +355,14 @@ (archive-block-id (db:test-get-archived test-dat)) (archive-block-info (rmt:test-get-archive-block-info archive-block-id)) (archive-path (if (vector? archive-block-info) (vector-ref archive-block-info 2) ;; look in db.scm for test-get-archive-block-info for the vector record info #f)) ;; no archive found? - (archive-internal-path (conc (common:get-testsuite-name) "-" run-id "/latest/" test-partial-path))) + (archive-internal-path (conc (common:get-testsuite-name) "-" run-id "/latest/" test-partial-path)) + (include-paths (args:get-arg "-include")) + (exclude-pattern (args:get-arg "-exclude-rx")) + (exclude-file (args:get-arg "-exclude-rx-from"))) ;; some sanity checks, move an existing path out of the way - iif it is not a toplevel with children ;; (if (and (not toplevel/children) ;; special handling needed for toplevel with children prev-test-physical-path @@ -440,18 +455,25 @@ (archive-block-info (rmt:test-get-archive-block-info archive-block-id)) (archive-path (if (vector? archive-block-info) (vector-ref archive-block-info 2) #f)) (archive-internal-path (conc (common:get-testsuite-name) "-" run-id - "/latest/" test-partial-path))) + "/latest/" test-partial-path)) + (include-paths (args:get-arg "-include")) + (exclude-pattern (args:get-arg "-exclude-rx")) + (exclude-file (args:get-arg "-exclude-rx-from"))) (if (and archive-path ;; no point in proceeding if there is no actual archive (not toplevel/children)) (begin - (let* ((bup-restore-params (list "-d" archive-path "restore" "-C" (or destpath "data") - ;; " " ;; What is the empty string for? - archive-internal-path))) + (let* ((bup-restore-params (append (list "-d" archive-path "restore" "-C" (or destpath "data")) + ;; " " ;; What is the empty string for? + (if include-paths + (map (lambda (p) + (conc archive-internal-path "/" p)) + (string-split include-paths ",")) + (list archive-internal-path))))) (debug:print-info 0 *default-log-port* "Restoring archived data to " (or destpath "data") " from archive in " archive-path " ... " archive-internal-path) (run-n-wait bup-exe params: bup-restore-params print-cmd: #t))) (let ((new-rem-tests (filter (lambda (tdat) (or (not (eq? (db:test-get-id tdat) test-id)) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -484,13 +484,14 @@ (directory-fold (lambda (file rem) (handle-exceptions exn (begin - (debug:print-info 0 *default-log-port* "unable to rotate log " file ", probably handled by another process.") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (print-call-chain (current-error-port))) + (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore.") + (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + ;; (print-call-chain (current-error-port)) ;; + ) (let* ((fullname (conc "logs/" file)) (mod-time (file-modification-time fullname)) (file-age (- (current-seconds) mod-time))) (hash-table-set! all-files file mod-time) (if (or (and (string-match "^.*.log" file) @@ -1714,11 +1715,11 @@ ;; cpu-load)) ;; get values from cached info from dropping file in logs dir ;; e.g. key is host and dtype is normalized-load ;; -(define (common:get-cached-info key dtype #!key (age 5)) +(define (common:get-cached-info key dtype #!key (age 10)) (if *toppath* (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))) (if (and (file-exists? fullpath) (file-read-access? fullpath)) (handle-exceptions @@ -1727,11 +1728,11 @@ (debug:print 2 *default-log-port* "reading file " fullpath) (let ((real-age (- (current-seconds)(file-change-time fullpath)))) (if (< real-age age) (with-input-from-file fullpath read) (begin - (debug:print 1 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") + (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") #f)))) (begin (debug:print 2 *default-log-port* "not reading file " fullpath) #f))) #f)) @@ -1993,11 +1994,13 @@ (result (if remote-host (with-input-from-pipe (conc "ssh " remote-host " cat /proc/cpuinfo") proc) (with-input-from-file "/proc/cpuinfo" proc)))) - (if (> result 0)(common:write-cached-info actual-host "num-cpus" result)) + (if (and (number? result) + (> result 0)) + (common:write-cached-info actual-host "num-cpus" result)) result)))) ;; wait for normalized cpu load to drop below maxload ;; (define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5)) @@ -2023,12 +2026,14 @@ (first (car loadavg)) (next (cadr loadavg)) (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 (loadjmp (- first next)) (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously - (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload - ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp) + ;; let's let the user know once in a long while that load checking is happening but not constantly report it + (if (> (random 100) 75) ;; about 25% of the time + (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload + ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)) (cond ((and (> first adjload) (> count 0)) (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) (thread-sleep! adjwait) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1464,20 +1464,18 @@ (set! res id)) db "SELECT id FROM archive_blocks WHERE archive_disk_id=? AND disk_path=?;" bdisk-id archive-path) (if res ;; record exists, update du if applicable and return res - (begin - (if du (sqlite3:execute db "UPDATE archive_blocks SET last_du=?,last_du_time=(strftime('%s','now')) + (if du (sqlite3:execute db "UPDATE archive_blocks SET last_du=?,last_du_time=(strftime('%s','now')) WHERE archive_disk_id=? AND disk_path=?;" - bdisk-id archive-path du)) - res) + bdisk-id archive-path du)) (begin (sqlite3:execute db "INSERT OR REPLACE INTO archive_blocks (archive_disk_id,disk_path,last_du) VALUES (?,?,?);" bdisk-id archive-path (or du 0)) - (db:archive-register-block-name dbstruct bdisk-id archive-path du: du))) + (set! res (db:archive-register-block-name dbstruct bdisk-id archive-path du: du)))) (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat) res)) ;; The "archived" field in tests is overloaded; 0 = not archived, > 0 archived in block with given id Index: genexample.scm ================================================================== --- genexample.scm +++ genexample.scm @@ -156,18 +156,18 @@ (print "") (print "[setup]") (print "# Adjust max_concurrent_jobs to limit how much you load your machines") (print "max_concurrent_jobs 50\n") (print "# This is your link path. Avoid moving it once set.") - (print "linktree " (common:real-path lntree)) + (print "linktree " lntree) ;; (common:real-path lntree)) (print "\n# Job tools are more advanced ways to control how your jobs are launched") (print "[jobtools]\nuseshell yes\nlauncher nbfake\nmaxload 1.5\n") (print "# You can override environment variables for all your tests here") (print "[env-override]\nEXAMPLE_VAR example value\n") (print "# As you run more tests you may need to add additional disks, the names are arbitrary but must be unique") - (print "[disks]\ndisk0 " (common:real-path firstd)))) - + (print "[disks]\ndisk0 " firstd))) ;; (common:real-path firstd)))) + (print "================== I'm now creating a runconfigs.config file for you with a default section. You can use this file to set variables for your tests based on the \"target\" (the combination @@ -195,11 +195,11 @@ ;; Now create a test and logpro file (print "================== You now have the basic common files for your megatest setup. Next run -\"megatest -gen-test\" to create a test. +\"megatest -create-test \" to create a test. Thank you for using Megatest. You can edit your config files and create tests in the " path " directory Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -627,11 +627,11 @@ ;; (let* ((test-info (let loop ((tries 0)) (let ((tinfo (rmt:get-test-info-by-id run-id test-id))) (if tinfo tinfo - (if (> tries 10) + (if (> tries 5) #f (begin (thread-sleep! (+ 1 (* tries 10))) (loop (+ tries 1)))))))) (test-host (if test-info Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -226,11 +226,12 @@ will substitute %s for the sheet name in generating multiple sheets) -o : output file for refdb2dat (defaults to stdout) -archive cmd : archive runs specified by selectors to one of disks specified in the [archive-disks] section. - cmd: keep-html, restore, save, save-remove, get (use -dest to set destination) + cmd: keep-html, restore, save, save-remove, get (use + -dest to set destination), -include path1,path2... to get or save specific files -generate-html : create a simple html dashboard for browsing your runs -generate-html-structure : create a top level html veiw to list targets/runs and a Run view within each run directory. -list-run-time : list time requered to complete runs. It supports following switches -run-patt -target-patt -dumpmode -list-test-time : list time requered to complete each test in a run. It following following arguments @@ -330,13 +331,18 @@ ;; move runs stuff here "-remove-keep" "-set-run-status" "-age" + + ;; archive "-archive" "-actions" "-precmd" + "-include" + "-exclude-rx" + "-exclude-rx-from" "-debug" ;; for *verbosity* > 2 "-create-test" "-override-timeout" "-test-files" ;; -test-paths is for listing all @@ -506,13 +512,19 @@ (if start-watchdog (thread-start! *watchdog*))) ;; bracket open-output-file with code to make leading directory if it does not exist and handle exceptions -(define (open-logfile logpath) +(define (open-logfile logpath-in) (condition-case - (let* ((log-dir (or (pathname-directory logpath) "."))) + (let* ((log-dir (or (pathname-directory logpath-in) ".")) + (fname (pathname-strip-directory logpath-in)) + (logpath (if (> (string-length fname) 250) + (let ((newlogf (conc log-dir "/" (common:get-signature fname) ".log"))) + (debug:print 0 *default-log-port* "WARNING: log file " logpath-in " path too long, converted to " newlogf) + newlogf) + logpath-in))) (if (not (directory-exists? log-dir)) (system (conc "mkdir -p " log-dir))) (open-output-file logpath)) (exn () (debug:print-error 0 *default-log-port* "Could not open log file for write: "logpath) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -237,16 +237,21 @@ ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) - (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move to cond clauses below where we determine we have too many jobs running rather than each time the and condition above is true (which seems like always)? - ((> (runs:dat-can-run-more-tests-count runsdat) 20) + (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move + ;; to cond clauses below where we determine we + ;; have too many jobs running rather than each + ;; time the and condition above is true (which + ;; seems like always)? + ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ...")) - (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.1) ;; was 2 - );; obviously haven't had any work to do for a while - (else 0))) + 10) ;; obviously haven't had any work to do for a while + (else + ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero? + (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count)