@@ -49,11 +49,11 @@ ;; arguments - thunk, message (define (common:fail-safe thunk warning-message-on-exception) (handle-exceptions exn (begin - (debug:print-info 0 *default-log-port* "notable but nonfatal condition - "warning-message-on-exception) + (debug:print-info 0 *default-log-port* "notable but nonfatal condition - "warning-message-on-exception", exn=" exn) (debug:print-info 0 *default-log-port* (string-substitute "\n?Error:" "nonfatal condition:" (with-output-to-string (lambda () (print-error-message exn) )))) @@ -61,17 +61,19 @@ #f) (thunk))) (define getenv get-environment-variable) (define (safe-setenv key val) - (if (or (substring-index "!" key) (substring-index ":" key)) ;; variables containing : are for internal use and cannot be environment variables. + (if (or (substring-index "!" key) + (substring-index ":" key) ;; variables containing : are for internal use and cannot be environment variables. + (substring-index "." key)) ;; periods are not allowed in environment variables (debug:print-error 4 *default-log-port* "skip setting internal use only variables containing \":\" or starting with \"!\"") (if (and (string? val) (string? key)) (handle-exceptions exn - (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val) + (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val ", exn=" exn) (setenv key val)) (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)))) (define home (getenv "HOME")) (define user (getenv "USER")) @@ -506,11 +508,11 @@ (directory-fold (lambda (file rem) (handle-exceptions exn (begin - (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore.") + (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore. exn=" exn) (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) ;; (print-call-chain (current-error-port)) ;; ) (let* ((fullname (conc "logs/" file)) (mod-time (file-modification-time fullname)) @@ -541,11 +543,11 @@ (handle-exceptions exn #f (if (directory? fullname) (begin - (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") + (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") (inc-stat "directories")) (begin (delete-file* fullname) (inc-stat "deleted"))) (hash-table-delete! all-files file))))))) @@ -565,14 +567,14 @@ (- num-logs max-allowed)))) (for-each (lambda (file) (let* ((fullname (conc "logs/" file))) (if (directory? fullname) - (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") + (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") (handle-exceptions exn - (debug:print-error 0 *default-log-port* "failed to remove " fullname) + (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn) (delete-file* fullname))))) files) (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files.")))))) ;; Force a megatest cleanup-db if version is changed and skip-version-check not specified @@ -595,11 +597,11 @@ (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "Failed to switch versions.") + (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (print-call-chain (current-error-port)) (exit 1)) (common:cleanup-db dbstruct))) ((not (common:file-exists? mtconf)) @@ -706,11 +708,11 @@ (handle-exceptions exn (handle-exceptions exn (begin - (debug:print-error 0 *default-log-port* "received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print-error 0 *default-log-port* "received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (print-call-chain (current-error-port)) #f) (read (open-input-string (base64:base64-decode instr)))) (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) @@ -944,11 +946,11 @@ *db-cache-path* (if *toppath* ;; common:get-create-writeable-dir (handle-exceptions exn (begin - (debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path*) + (debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path* ", exn=" exn) (exit 1)) (let ((dbpath (common:get-create-writeable-dir (list (conc "/tmp/" (current-user-name) "/megatest_localdb/" (common:get-testsuite-name) "/" @@ -1189,11 +1191,12 @@ (file-write-access? hed) hed) (handle-exceptions exn (begin - (debug:print-info 0 *default-log-port* "could not create " hed ", this might cause problems down the road.") + (debug:print-info 0 *default-log-port* "could not create " hed + ", this might cause problems down the road. exn=" exn) #f) (create-directory hed #t))))) (if (and (string? res) (directory? res)) res @@ -1333,12 +1336,14 @@ ;; ;; returns the directory or #f ;; (define (common:directory-writable? path-string) (handle-exceptions - exn - #f + exn + (begin + (debug:print 0 *default-log-port* "Failed to identify access to " path-string ", exn=" exn) + #f) (if (and (directory-exists? path-string) (file-write-access? path-string)) path-string #f))) @@ -1427,16 +1432,20 @@ (handle-exceptions exn (if (> trynum 0) (let ((delay-time (* (- 5 trynum) 5))) (mutex-unlock! *homehost-mutex*) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying " delay-time " seconds and trying again, message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying " + delay-time " seconds and trying again, message: " ((condition-property-accessor 'exn 'message) exn) + ", exn=" exn) (thread-sleep! delay-time) (common:get-homehost trynum: (- trynum 1))) (begin (mutex-unlock! *homehost-mutex*) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file after trying five times. Giving up and exiting, message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "ERROR: ["(common:human-time) + "] Failed to read .homehost file after trying five times. Giving up and exiting, message: " + ((condition-property-accessor 'exn 'message) exn)) (exit 1))) (let ((hhf (conc *toppath* "/.homehost"))) (if (common:file-exists? hhf) (with-input-from-file hhf read-line) (if (file-write-access? *toppath*) @@ -1687,26 +1696,32 @@ ;; lazy-safe get file mod time. on any error (file not existing etc.) return 0 ;; (define (common:lazy-modification-time fpath) (handle-exceptions exn - 0 - (file-modification-time fpath))) + (begin + (debug:print 0 *default-log-port* "Failed to get modifcation time for " fpath ", treating it as zero. exn=" exn) + 0) + (if (file-exists? fpath) + (file-modification-time fpath) + 0))) ;; find timestamp of newest file associated with a sqlite db file (define (common:lazy-sqlite-db-modification-time fpath) (let* ((glob-list (handle-exceptions exn - `(,(conc "/no/such/file, message: " ((condition-property-accessor 'exn 'message) exn))) + (begin + (debug:print 0 *default-log-port* "Failed to glob " fpath "*, exn=" exn) + `(,(conc "/no/such/file, message: " ((condition-property-accessor 'exn 'message) exn)))) (glob (conc fpath "*")))) (file-list (if (eq? 0 (length glob-list)) '("/no/such/file") glob-list))) (apply max - (map - common:lazy-modification-time - file-list)))) + (map + common:lazy-modification-time + file-list)))) ;; return a nice clean pathname made absolute (define (common:nice-path dir) (let ((match (string-match "^(~[^\\/]*)(\\/.*|)$" dir))) (if match ;; using ~ for home? @@ -1720,11 +1735,11 @@ (define (common:read-link-f path) (handle-exceptions exn (begin - (debug:print-error 0 *default-log-port* "command \"/bin/readlink -f " path "\" failed.") + (debug:print-error 0 *default-log-port* "command \"/bin/readlink -f " path "\" failed. exn=" exn) path) ;; just give up (with-input-from-pipe (conc "/bin/readlink -f " path) (lambda () (read-line))))) @@ -1763,11 +1778,12 @@ (debug:print-info 1 *default-log-port* " removing bad file " fullpath) (delete-file* fullpath) #f) (with-input-from-file fullpath read)) (begin - (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") + (debug:print-info 2 *default-log-port* "file " fullpath + " is too old (" real-age" seconds) to trust, skipping reading it") #f)))) (begin (debug:print 2 *default-log-port* "not reading file " fullpath) #f))) #f)) @@ -1776,50 +1792,56 @@ (if *toppath* (let* ((fulldir (conc *toppath* "/.sysdata")) (fullpath (conc fulldir "/" key "-" dtype ".log"))) (if (not (file-exists? fulldir))(create-directory fulldir #t)) (handle-exceptions - exn - #f - (with-output-to-file fullpath (lambda ()(pp dat))))) + exn + (begin + (debug:print 0 *default-log-path* "failed to write file " fullpath ", exn=" exn) + #f) + (with-output-to-file fullpath (lambda ()(pp dat))))) #f)) (define (common:raw-get-remote-host-load remote-host) (handle-exceptions - exn - #f ;; more specific handling of errors needed - (with-input-from-pipe - (conc "ssh " remote-host " cat /proc/loadavg") - (lambda ()(list (read)(read)(read)))))) + exn + (begin + (debug:print 0 *default-log-port* "failed to ssh to " remote-host " and get loadavg. exn=" exn) + #f) ;; more specific handling of errors needed + (with-input-from-pipe + (conc "ssh " remote-host " cat /proc/loadavg") + (lambda ()(list (read)(read)(read)))))) ;; get cpu load by reading from /proc/loadavg, return all three values ;; (define (common:get-cpu-load remote-host) (handle-exceptions - exn - '(-99 -99 -99) - (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) - (or (common:get-cached-info actual-hostname "cpu-load") - (let ((result (if remote-host - (map (lambda (res) - (if (eof-object? res) 9e99 res)) - (with-input-from-pipe - (conc "ssh " remote-host " cat /proc/loadavg") - (lambda ()(list (read)(read)(read))))) - (with-input-from-file "/proc/loadavg" - (lambda ()(list (read)(read)(read))))))) - (match - result - ((l1 l2 l3) - (if (and (number? l1) + exn + (begin + (debug:print 0 *default-log-port* "failed to ssh or read loadavg from host " remote-host ", exn=" exn) + '(-99 -99 -99)) + (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) + (or (common:get-cached-info actual-hostname "cpu-load") + (let ((result (if remote-host + (map (lambda (res) + (if (eof-object? res) 9e99 res)) + (with-input-from-pipe + (conc "ssh " remote-host " cat /proc/loadavg") + (lambda ()(list (read)(read)(read))))) + (with-input-from-file "/proc/loadavg" + (lambda ()(list (read)(read)(read))))))) + (match + result + ((l1 l2 l3) + (if (and (number? l1) (number? l2) (number? l3)) - (begin - (common:write-cached-info actual-hostname "cpu-load" result) - result) - '(-1 -1 -1))) ;; -1 is bad result - (else '(-2 -2 -2)))))))) + (begin + (common:write-cached-info actual-hostname "cpu-load" result) + result) + '(-1 -1 -1))) ;; -1 is bad result + (else '(-2 -2 -2)))))))) ;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads ;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc. ;; keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load ;; @@ -2097,11 +2119,12 @@ (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp)) (cond ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable (> num-tries 0)) - (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.") + (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " + first ", we'll sleep 10s and try " num-tries " more times.") (thread-sleep! 10) (common:wait-for-cpuload maxload-in numcpus-in waitdelay count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1))) ((and (> first adjmaxload) (> count 0)) @@ -2433,10 +2456,12 @@ ("mode-patt" . "-modepatt") ("run-name" . "-runname") ("contour" . "-contour") ("target" . "-target") ("test-patt" . "-testpatt") + ("rerun" . "-rerun") + ("setvars" . "-setvars") ("msg" . "-m") ("log" . "-log") ("start-dir" . "-start-dir") ("new" . "-set-state-status")))) (if (eq? flavor 'switch-symbol) @@ -3018,10 +3043,11 @@ (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) (count 100)) (if targ-host (conc "remrun " targ-host) (if (> count 0) + (begin (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) (thread-sleep! (- 101 count)) (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) (- count 1))) @@ -3342,13 +3368,15 @@ (for-each (lambda (thread-name) (let* ((thread (hash-table-ref/default *common:thread-punchlist* thread-name #f))) (if thread (handle-exceptions - exn - #t ;; just ignore it, it might have died in the meantime so joining it will throw an exception - (thread-join! thread)) + exn + (begin + (debug:print 0 *default-log-port* "joining threads failed. exn=" exn) + #t) ;; just ignore it, it might have died in the meantime so joining it will throw an exception + (thread-join! thread)) ))) (hash-table-keys *common:thread-punchlist*))) (define *common:telemetry-log-state* 'startup) (define *common:telemetry-log-socket* #f)