Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -142,11 +142,11 @@ ;; (define (api:execute-requests dbstruct dat) (handle-exceptions exn (let ((call-chain (get-call-chain))) - (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat) + (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn) (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector Index: archive.scm ================================================================== --- archive.scm +++ archive.scm @@ -90,11 +90,11 @@ (pscript-cmd (conc pscript " " testsuite-name " " target " " run-name " " test-name)) (apath (if pscript (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "ERROR: script \"" pscript-cmd "\" failed to run properly.") + (debug:print 0 *default-log-port* "ERROR: script \"" pscript-cmd "\" failed to run properly. exn=" exn) (exit 1)) (with-input-from-pipe pscript-cmd read-line)) #f)) ;; this is the user-calculated archive path Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -49,11 +49,11 @@ ;; arguments - thunk, message (define (common:fail-safe thunk warning-message-on-exception) (handle-exceptions exn (begin - (debug:print-info 0 *default-log-port* "notable but nonfatal condition - "warning-message-on-exception) + (debug:print-info 0 *default-log-port* "notable but nonfatal condition - "warning-message-on-exception", exn=" exn) (debug:print-info 0 *default-log-port* (string-substitute "\n?Error:" "nonfatal condition:" (with-output-to-string (lambda () (print-error-message exn) )))) @@ -61,17 +61,19 @@ #f) (thunk))) (define getenv get-environment-variable) (define (safe-setenv key val) - (if (or (substring-index "!" key) (substring-index ":" key)) ;; variables containing : are for internal use and cannot be environment variables. + (if (or (substring-index "!" key) + (substring-index ":" key) ;; variables containing : are for internal use and cannot be environment variables. + (substring-index "." key)) ;; periods are not allowed in environment variables (debug:print-error 4 *default-log-port* "skip setting internal use only variables containing \":\" or starting with \"!\"") (if (and (string? val) (string? key)) (handle-exceptions exn - (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val) + (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val ", exn=" exn) (setenv key val)) (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)))) (define home (getenv "HOME")) (define user (getenv "USER")) @@ -506,11 +508,11 @@ (directory-fold (lambda (file rem) (handle-exceptions exn (begin - (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore.") + (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore. exn=" exn) (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) ;; (print-call-chain (current-error-port)) ;; ) (let* ((fullname (conc "logs/" file)) (mod-time (file-modification-time fullname)) @@ -568,11 +570,11 @@ (let* ((fullname (conc "logs/" file))) (if (directory? fullname) (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") (handle-exceptions exn - (debug:print-error 0 *default-log-port* "failed to remove " fullname) + (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn) (delete-file* fullname))))) files) (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files.")))))) ;; Force a megatest cleanup-db if version is changed and skip-version-check not specified @@ -595,11 +597,11 @@ (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "Failed to switch versions.") + (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (print-call-chain (current-error-port)) (exit 1)) (common:cleanup-db dbstruct))) ((not (common:file-exists? mtconf)) @@ -706,11 +708,11 @@ (handle-exceptions exn (handle-exceptions exn (begin - (debug:print-error 0 *default-log-port* "received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print-error 0 *default-log-port* "received bad encoded string \"" instr "\", message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (print-call-chain (current-error-port)) #f) (read (open-input-string (base64:base64-decode instr)))) (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) @@ -944,11 +946,11 @@ *db-cache-path* (if *toppath* ;; common:get-create-writeable-dir (handle-exceptions exn (begin - (debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path*) + (debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path* ", exn=" exn) (exit 1)) (let ((dbpath (common:get-create-writeable-dir (list (conc "/tmp/" (current-user-name) "/megatest_localdb/" (common:get-testsuite-name) "/" @@ -1189,11 +1191,11 @@ (file-write-access? hed) hed) (handle-exceptions exn (begin - (debug:print-info 0 *default-log-port* "could not create " hed ", this might cause problems down the road.") + (debug:print-info 0 *default-log-port* "could not create " hed ", this might cause problems down the road. exn=" exn) #f) (create-directory hed #t))))) (if (and (string? res) (directory? res)) res @@ -1333,12 +1335,14 @@ ;; ;; returns the directory or #f ;; (define (common:directory-writable? path-string) (handle-exceptions - exn - #f + exn + (begin + (debug:print 0 *default-log-port* "Failed to identify access to " path-string ", exn=" exn) + #f) (if (and (directory-exists? path-string) (file-write-access? path-string)) path-string #f))) @@ -1427,16 +1431,20 @@ (handle-exceptions exn (if (> trynum 0) (let ((delay-time (* (- 5 trynum) 5))) (mutex-unlock! *homehost-mutex*) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying " delay-time " seconds and trying again, message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying " + delay-time " seconds and trying again, message: " ((condition-property-accessor 'exn 'message) exn) + ", exn=" exn) (thread-sleep! delay-time) (common:get-homehost trynum: (- trynum 1))) (begin (mutex-unlock! *homehost-mutex*) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file after trying five times. Giving up and exiting, message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "ERROR: ["(common:human-time) + "] Failed to read .homehost file after trying five times. Giving up and exiting, message: " + ((condition-property-accessor 'exn 'message) exn)) (exit 1))) (let ((hhf (conc *toppath* "/.homehost"))) (if (common:file-exists? hhf) (with-input-from-file hhf read-line) (if (file-write-access? *toppath*) @@ -1687,26 +1695,30 @@ ;; lazy-safe get file mod time. on any error (file not existing etc.) return 0 ;; (define (common:lazy-modification-time fpath) (handle-exceptions exn - 0 + (begin + (debug:print 0 *default-log-port* "Failed to get modifcation time for " fpath ", treating it as zero. exn=" exn) + 0) (file-modification-time fpath))) ;; find timestamp of newest file associated with a sqlite db file (define (common:lazy-sqlite-db-modification-time fpath) (let* ((glob-list (handle-exceptions exn - `(,(conc "/no/such/file, message: " ((condition-property-accessor 'exn 'message) exn))) + (begin + (debug:print 0 *default-log-port* "Failed to glob " fpath "*, exn=" exn) + `(,(conc "/no/such/file, message: " ((condition-property-accessor 'exn 'message) exn)))) (glob (conc fpath "*")))) (file-list (if (eq? 0 (length glob-list)) '("/no/such/file") glob-list))) (apply max - (map - common:lazy-modification-time - file-list)))) + (map + common:lazy-modification-time + file-list)))) ;; return a nice clean pathname made absolute (define (common:nice-path dir) (let ((match (string-match "^(~[^\\/]*)(\\/.*|)$" dir))) (if match ;; using ~ for home? @@ -1720,11 +1732,11 @@ (define (common:read-link-f path) (handle-exceptions exn (begin - (debug:print-error 0 *default-log-port* "command \"/bin/readlink -f " path "\" failed.") + (debug:print-error 0 *default-log-port* "command \"/bin/readlink -f " path "\" failed. exn=" exn) path) ;; just give up (with-input-from-pipe (conc "/bin/readlink -f " path) (lambda () (read-line))))) @@ -1750,19 +1762,21 @@ (if *toppath* (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))) (if (and (file-exists? fullpath) (file-read-access? fullpath)) (handle-exceptions - exn - #f + exn + (begin + (debug:print 0 *default-log-port* "failed to get cached info from " fullpath ", exn=" exn) + #f) (debug:print 2 *default-log-port* "reading file " fullpath) (let ((real-age (- (current-seconds)(file-change-time fullpath)))) (if (< real-age age) (handle-exceptions exn (begin - (debug:print-info 1 *default-log-port* " removing bad file " fullpath) + (debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn) (delete-file* fullpath) #f) (with-input-from-file fullpath read)) (begin (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") @@ -1776,50 +1790,56 @@ (if *toppath* (let* ((fulldir (conc *toppath* "/.sysdata")) (fullpath (conc fulldir "/" key "-" dtype ".log"))) (if (not (file-exists? fulldir))(create-directory fulldir #t)) (handle-exceptions - exn - #f - (with-output-to-file fullpath (lambda ()(pp dat))))) + exn + (begin + (debug:print 0 *default-log-path* "failed to write file " fullpath ", exn=" exn) + #f) + (with-output-to-file fullpath (lambda ()(pp dat))))) #f)) (define (common:raw-get-remote-host-load remote-host) (handle-exceptions - exn - #f ;; more specific handling of errors needed - (with-input-from-pipe - (conc "ssh " remote-host " cat /proc/loadavg") - (lambda ()(list (read)(read)(read)))))) + exn + (begin + (debug:print 0 *default-log-port* "failed to ssh to " remote-host " and get loadavg. exn=" exn) + #f) ;; more specific handling of errors needed + (with-input-from-pipe + (conc "ssh " remote-host " cat /proc/loadavg") + (lambda ()(list (read)(read)(read)))))) ;; get cpu load by reading from /proc/loadavg, return all three values ;; (define (common:get-cpu-load remote-host) (handle-exceptions - exn - '(-99 -99 -99) - (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) - (or (common:get-cached-info actual-hostname "cpu-load") - (let ((result (if remote-host - (map (lambda (res) - (if (eof-object? res) 9e99 res)) - (with-input-from-pipe - (conc "ssh " remote-host " cat /proc/loadavg") - (lambda ()(list (read)(read)(read))))) - (with-input-from-file "/proc/loadavg" - (lambda ()(list (read)(read)(read))))))) - (match - result - ((l1 l2 l3) - (if (and (number? l1) + exn + (begin + (debug:print 0 *default-log-port* "failed to ssh or read loadavg from host " remote-host ", exn=" exn) + '(-99 -99 -99)) + (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) + (or (common:get-cached-info actual-hostname "cpu-load") + (let ((result (if remote-host + (map (lambda (res) + (if (eof-object? res) 9e99 res)) + (with-input-from-pipe + (conc "ssh " remote-host " cat /proc/loadavg") + (lambda ()(list (read)(read)(read))))) + (with-input-from-file "/proc/loadavg" + (lambda ()(list (read)(read)(read))))))) + (match + result + ((l1 l2 l3) + (if (and (number? l1) (number? l2) (number? l3)) - (begin - (common:write-cached-info actual-hostname "cpu-load" result) - result) - '(-1 -1 -1))) ;; -1 is bad result - (else '(-2 -2 -2)))))))) + (begin + (common:write-cached-info actual-hostname "cpu-load" result) + result) + '(-1 -1 -1))) ;; -1 is bad result + (else '(-2 -2 -2)))))))) ;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads ;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc. ;; keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load ;; @@ -3203,11 +3223,11 @@ (pktsdir (car pktsdirs))) ;; assume it is there (hash-table-set! *pkts-info* 'pkts-dir pktsdir) pktsdir)))) (handle-exceptions exn - (debug:print-info 0 "failed to write out packet to " pktsdir) ;; don't care if this failed for now but MUST FIX - BUG!! + (debug:print-info 0 "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!! (if (not (file-exists? pktsdir)) (create-directory pktsdir #t)) (with-output-to-file (conc pktsdir "/" uuid ".pkt") (lambda () @@ -3350,13 +3370,15 @@ (for-each (lambda (thread-name) (let* ((thread (hash-table-ref/default *common:thread-punchlist* thread-name #f))) (if thread (handle-exceptions - exn - #t ;; just ignore it, it might have died in the meantime so joining it will throw an exception - (thread-join! thread)) + exn + (begin + (debug:print 0 *default-log-port* "joining threads failed. exn=" exn) + #t) ;; just ignore it, it might have died in the meantime so joining it will throw an exception + (thread-join! thread)) ))) (hash-table-keys *common:thread-punchlist*))) (define *common:telemetry-log-state* 'startup) (define *common:telemetry-log-socket* #f) Index: configf.scm ================================================================== --- configf.scm +++ configf.scm @@ -65,11 +65,11 @@ ;; (equal? "!" (substring str 0 1))) ;; null string or starts with ! are preserved but NOT set in the environment str (handle-exceptions exn (begin - (debug:print-error 0 *default-log-port* "problem evaluating \"" str "\" in the shell environment") + (debug:print-error 0 *default-log-port* "problem evaluating \"" str "\" in the shell environment, exn=" exn) #f) (let ((cmdres (process:cmd-run->list (conc "echo " str)))) (if (null? cmdres) "" (caar cmdres))))) ;; ) @@ -130,11 +130,11 @@ (else "(lambda (ht)(print \"ERROR\") \"ERROR\")")))) ;; (print "fullcmd=" fullcmd) (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "WARNING: failed to process config input \"" l "\"") + (debug:print 0 *default-log-port* "WARNING: failed to process config input \"" l "\", exn=" exn) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) ;; (print "exn=" (condition->list exn)) (set! result (conc "#{( " cmdtype ") " cmd "}, full expansion: " fullcmd))) (if (or allow-system (not (member cmdtype '("system" "shell" "sh")))) @@ -334,11 +334,15 @@ (common:nice-path (conc (if curr-conf-dir curr-conf-dir ".") "/" include-file))))) - (let ((all-matches (sort (handle-exceptions exn (list) (glob full-conf)) string<=?))) + (let ((all-matches (sort (handle-exceptions exn + (begin + (debug:print '(2 9) *default-log-port* "glob of " full-conf " gave no match. , exn=" exn) + (list)) + (glob full-conf)) string<=?))) (if (null? all-matches) (begin (debug:print '(2 9) #f "INFO: include file(s) matching " include-file " not found (called from " path ")") (debug:print 2 *default-log-port* " " full-conf)) (for-each @@ -775,14 +779,16 @@ ht)) ;; if (define (configf:read-alist fname) (handle-exceptions - exn - #f - (configf:alist->config - (with-input-from-file fname read)))) + exn + (begin + (debug:print 0 *default-log-port* "read of alist " fname " failed. exn=" exn) + #f) + (configf:alist->config + (with-input-from-file fname read)))) (define (configf:write-alist cdat fname) (if (not (common:faux-lock fname)) (debug:print 0 *default-log-port* "INFO: Could not get lock on " fname)) (let* ((dat (configf:config->alist cdat)) @@ -795,14 +801,16 @@ (if (common:file-exists? fname) ;; now verify it is readable (if (configf:read-alist fname) #t ;; data is good. (begin (handle-exceptions - exn - #f - (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.") - (delete-file fname)) + exn + (begin + (debug:print 0 *default-log-port* "deleting " fname " failed, exn=" exn) + #f) + (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.") + (delete-file fname)) #f)) #f)))) (common:faux-unlock fname) res)) Index: dashboard-context-menu.scm ================================================================== --- dashboard-context-menu.scm +++ dashboard-context-menu.scm @@ -319,11 +319,11 @@ (lambda () (if scheme-match (begin (handle-exceptions exn - (print "error with custom menu scheme") + (print "error with custom menu scheme, exn=" exn) (begin ;;(BB> "gonna eval it!") (eval (with-input-from-string (cadr scheme-match) read))))) (common:run-a-command command-line with-vars: #t)))))))) #f))) Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -513,21 +513,25 @@ ;; this next block was added to fix a bug where variables were ;; needed. Revisit this. (runconfig (let ((runconfigf (conc *toppath* "/runconfigs.config"))) ;; no rush but it would be good to convert this call to use runconfig:read (if (common:file-exists? runconfigf) (handle-exceptions - exn - #f ;; do nothing, just keep on trucking .... + exn + (begin + (debug:print 0 *default-log-port* "failed to set up environment for " runconfigf ", exn=" exn) + #f) ;; do nothing, just keep on trucking .... (setup-env-defaults runconfigf run-id (make-hash-table) keydat environ-patt: keystring)) (make-hash-table)))) (testconfig (begin ;; (runs:set-megatest-env-vars run-id inrunname: runname testname: test-name itempath: item-path) (runs:set-megatest-env-vars run-id inkeyvals: keydat inrunname: runname intarget: keystring testname: testname itempath: item-path) ;; these may be needed by the launching process (handle-exceptions - exn ;; NOTE: I've no idea why this was written this way. Research, study and fix needed! - (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f allow-write-cache: #f) - (tests:get-testconfig (db:test-get-testname testdat) item-path test-registry #t allow-write-cache: #f)))) + exn ;; NOTE: I've no idea why this was written this way. Research, study and fix needed! + (begin + (debug:print 0 *default-log-port* "testconfig load using " item-path " failed, trying " (db:test-get-item-path testdat) ", exn=" exn) + (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f allow-write-cache: #f)) + (tests:get-testconfig (db:test-get-testname testdat) item-path test-registry #t allow-write-cache: #f)))) (viewlog (lambda (x) (if (common:file-exists? logfile) ;(system (conc "firefox " logfile "&")) (dcommon:run-html-viewer logfile) (message-window (conc "File " logfile " not found"))))) @@ -561,13 +565,16 @@ (> (current-milliseconds)(+ last-update 10000)) ;; force update even 10 seconds request-update)) (newtestdat (if need-update ;; NOTE: BUG HIDER, try to eliminate this exception handler (handle-exceptions - exn - (debug:print-info 0 *default-log-port* "test db access issue in examine test for run-id " run-id ", test-id " test-id ": " ((condition-property-accessor 'exn 'message) exn)) - (rmt:get-test-info-by-id run-id test-id ))))) + exn + (begin + (debug:print-info 0 *default-log-port* "test db access issue in examine test for run-id " run-id + ", test-id " test-id ": " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) + #f) + (rmt:get-test-info-by-id run-id test-id))))) ;; (print "INFO: need-update= " need-update " curr-mod-time = " curr-mod-time) (cond ((and need-update newtestdat) (set! testdat newtestdat) (set! teststeps (augment-teststeps (tests:get-compressed-steps run-id test-id))) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -2127,11 +2127,11 @@ (file-read-access? source)) (handle-exceptions exn (begin (print-call-chain) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (debug:print 0 *default-log-port* "ERROR: failed to load " source ", try loading in the repl: megatest -repl") (set! success #f)) (load source)) (begin (debug:print 0 *default-log-port* "ERROR: cannot find file to load: \"" source "\" for user view " view-name))) @@ -2139,11 +2139,11 @@ (if success (handle-exceptions exn (begin (print-call-chain) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (debug:print 0 *default-log-port* "ERROR: failed call procedure " viewgen ", with; tab-num=" tab-num ", view-name=" view-name ", and views-cfgdat and megatest configdat as parameters. To debug try loading in the repl: megatest -repl") (set! success #f)) (print "Adding tab " view-name " with proc " viewgen) @@ -2156,11 +2156,11 @@ (lambda () (handle-exceptions exn (begin (print-call-chain) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (debug:print 0 *default-log-port* "ERROR: failed call procedure \"" updater "\", with; tabnum=" tab-num ", view-name=" view-name ", and views-cfgdat and megatest configdat as parameters. To debug try loading in the repl: megatest -repl") (set! success #f)) (debug:print 4 *default-log-port* "Running updater for tab " view-name " with proc " updater " and tab-num: " tab-num) @@ -3044,11 +3044,12 @@ (define (dashboard:get-youngest-run-db-mod-time dbdir) (handle-exceptions exn (begin - (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir) + (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " + ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir ", exn=" exn) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (common:max (map (lambda (filen) (file-modification-time filen)) (glob (conc dbdir "/*.db*")))))) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -123,11 +123,11 @@ (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) ;; check for (exn sqlite3) ((condition-property-accessor 'exn 'message) exn) (if (eq? err-status 'done) default (begin - (debug:print-error 0 *default-log-port* " query " stmt " failed, params: " params ", error: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print-error 0 *default-log-port* " query " stmt " failed, params: " params ", error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (print-call-chain (current-error-port)) default))) (apply sqlite3:first-result db stmt params))) ;; Get/open a database @@ -184,17 +184,10 @@ (debug:print-info 0 *default-log-port* *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access")) (if (common:low-noise-print 600 (conc "parallel-api-requests" *max-api-process-requests*)) (debug:print-info 2 *default-log-port* "Parallel api request count: " *api-process-request-count* " max parallel requests: " *max-api-process-requests*)) (condition-case (begin - ;;;;;;;;; (handle-exceptions - ;;;;;;;;; exn - ;;;;;;;;; (begin - ;;;;;;;;; (print-call-chain (current-error-port)) - ;;;;;;;;; (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn)) - ;;;;;;;;; ;; there is no recovering at this time. exit - ;;;;;;;;; (exit 50)) (if use-mutex (mutex-lock! *db-with-db-mutex*)) (let ((res (apply proc db params))) (if use-mutex (mutex-unlock! *db-with-db-mutex*)) ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w)) (if dbdat (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat)) @@ -460,14 +453,15 @@ (define (db:safely-close-sqlite3-db db stmt-cache #!key (try-num 3)) (if (<= try-num 0) #f (handle-exceptions exn - (begin - (thread-sleep! 3) - (sqlite3:interrupt! db) - (db:safely-close-sqlite3-db db try-num: (- try-num 1))) + (begin + (print "Attempt to safely close sqlite3 db failed. Trying again. exn=" exn) + (thread-sleep! 3) + (sqlite3:interrupt! db) + (db:safely-close-sqlite3-db db try-num: (- try-num 1))) (if (sqlite3:database? db) (let* ((stmts (hash-table-ref/default stmt-cache db #f))) (if stmts (map sqlite3:finalize! (hash-table-values stmts))) (sqlite3:finalize! db) #t) @@ -477,11 +471,11 @@ (define (db:close-all dbstruct) (if (dbr:dbstruct? dbstruct) (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "WARNING: Finalizing failed, " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "WARNING: Finalizing failed, " ((condition-property-accessor 'exn 'message) exn) ", note - exn=" exn) (print-call-chain *default-log-port*)) ;; (db:sync-touched dbstruct 0 force-sync: #t) ;; NO. Do not do this here. Instead we rely on a server to be started when there are writes, even if the server itself is not going to be used as a server. (let ((tdbs (map db:dbdat-get-db (stack->list (dbr:dbstruct-dbstack dbstruct)))) (mdb (db:dbdat-get-db (dbr:dbstruct-mtdb dbstruct))) @@ -645,10 +639,11 @@ ;; (else ;; ((equal? fname "megatest.db") ;; this file can be regenerated if needed (handle-exceptions exn (begin + (print "Problems trying to repair the db, exn=" exn) ;; (db:move-and-recreate-db dbdat) (if (> numtries 0) (db:repair-db dbdat numtries: (- numtries 1)) #f) (debug:print 0 *default-log-port* "FATAL: file " dbpath " was found corrupted, an attempt to fix has been made but you must start over.") @@ -838,13 +833,13 @@ (db:drop-trigger db tablename) #f)) (is-trigger-dropped (if (member "last_update" field-names) (db:is-trigger-dropped db tablename) #f)) (stmth (sqlite3:prepare db full-ins))) - (db:delay-if-busy targdb) ;; NO WAITING - (if (member "last_update" field-names) - (debug:print-info 3 *default-log-port* "is-trigger-dropped: " is-trigger-dropped)) + ;; (db:delay-if-busy targdb) ;; NO WAITING + (if (member "last_update" field-names) + (debug:print-info 3 *default-log-port* "is-trigger-dropped: " is-trigger-dropped)) (for-each (lambda (fromdat-lst) (sqlite3:with-transaction db (lambda () @@ -1097,11 +1092,11 @@ ) ;; clear out junk records ;; ((dejunk) - (db:delay-if-busy mtdb) ;; ok to delay on mtdb + ;; (db:delay-if-busy mtdb) ;; ok to delay on mtdb (when (file-write-access? (db:dbdat-get-path mtdb)) (db:clean-up mtdb)) (db:clean-up tmpdb) (db:clean-up refndb)) ;; sync runs, test_meta etc. @@ -2243,11 +2238,11 @@ (n 0)) (if (equal? hed field) (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row=" row " header=" header " field=" field) + (debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row=" row " header=" header " field=" field ", exn=" exn) #f) (vector-ref row n)) (if (null? tal) #f (loop (car tal)(cdr tal)(+ n 1))))))) ;; Accessors for the header/data structure @@ -4648,11 +4643,11 @@ (begin (handle-exceptions exn (begin (debug:print 0 *default-log-port* - "WARNING: itemmap has problem \"" itemmap "\", patt: " patt ", repl: " repl) + "WARNING: itemmap has problem \"" itemmap "\", patt: " patt ", repl: " repl ", exn=" exn) res) (string-substitute patt repl res)) ) Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -1434,11 +1434,12 @@ (define (dashboard:get-youngest-run-db-mod-time dbdir) (handle-exceptions exn (begin - (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir) + (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) + " db-dir="dbdir ", exn=" exn) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (common:max (map (lambda (filen) (file-modification-time filen)) (glob (conc dbdir "/*.db*")))))) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -269,11 +269,11 @@ (set! success #f) (if (debug:debug-mode 1) (debug:print-info 0 *default-log-port* "couldn't talk to server, trying again ...") (begin (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") - (debug:print 0 *default-log-port* " message: " msg) + (debug:print 0 *default-log-port* " message: " msg ", exn=" exn) (debug:print 0 *default-log-port* " cmd: " cmd " params: " params) (debug:print 0 *default-log-port* " call-chain: " call-chain))) (if runremote (remote-conndat-set! runremote #f)) ;; Killing associated server to allow clean retry.") @@ -332,11 +332,11 @@ (let ((api-dat (http-transport:server-dat-get-api-uri server-dat))) (handle-exceptions exn (begin (print-call-chain *default-log-port*) - (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn))) + (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)) (close-connection! api-dat) ;;(close-idle-connections!) #t)) #f))) @@ -494,11 +494,11 @@ (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn - (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk?") + (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk? exn=" exn) (if (not *server-overloaded*) (change-file-times server-log-file curr-time curr-time))))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -274,11 +274,11 @@ (lambda (pid) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to kill process with pid " pid ", possibly already killed.") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)) (debug:print 0 *default-log-port* "WARNING: Request received to kill job " pid) ;; " (attempt # " kill-tries ")") (debug:print-info 0 *default-log-port* "Signal mask=" (signal-mask)) ;; (if (process:alive? pid) ;; (begin (map (lambda (pid-num) @@ -286,13 +286,15 @@ (process:get-sub-pids pid)) (thread-sleep! 5) ;; (if (process:process-alive? pid) (map (lambda (pid-num) (handle-exceptions - exn - #f - (process-signal pid-num signal/kill))) + exn + (begin + (debug:print 0 *default-log-port* " .... had trouble sending kill to " pid-num ", exn=" exn) + #f) + (process-signal pid-num signal/kill))) (process:get-sub-pids pid)))) ;; (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive")))) pids) ;; BB: question to Matt -- does the tests:test-state-status! encompass rollup to toplevel? If not, should it? (tests:test-set-status! run-id test-id "KILLED" "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt @@ -1039,18 +1041,18 @@ (begin (handle-exceptions exn (begin (debug:print-error 0 *default-log-port* "Something went wrong when trying to create linktree dir at " linktree) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (exit 1)) (create-directory linktree #t)))) (handle-exceptions exn (begin (debug:print-error 0 *default-log-port* "Something went wrong when trying to create link to linktree at " *toppath*) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)) (let ((tlink (conc *toppath* "/lt"))) (if (not (common:file-exists? tlink)) (create-symbolic-link linktree tlink))))) (begin (debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config") @@ -1111,11 +1113,15 @@ ;;(exit 1) (if (null? disks) (cons 1 (conc *toppath* "/runs")) (let ((paths (sort disks (lambda (x y) (> (string-length (cadr x)) (string-length (cadr y))))))) (let loop ((head (car paths)) (tail (cdr paths))) - (let ((result (handle-exceptions exn #f (create-directory (cadr head) #t)))) + (let ((result (handle-exceptions exn + (begin + (debug:print 0 *default-log-port* "failed to create dir " (cadr head) ", exn=" exn) + #f) + (create-directory (cadr head) #t)))) (if result result (if (null? tail) (cons 1 (conc *toppath* "/runs")) (loop (car tail) (cdr tail))))))))))) @@ -1207,11 +1213,11 @@ (let ((success (if (and (not (common:directory-exists? lnkbase)) (not (common:file-exists? lnkbase))) (handle-exceptions exn (begin - (debug:print-error 0 *default-log-port* "Problem creating linktree base at " lnkbase) + (debug:print-error 0 *default-log-port* "Problem creating linktree base at " lnkbase ", exn=" exn) (print-error-message exn (current-error-port)) #t) (create-directory lnkbase #t) #f)))) (if (and (not success)(> done 0)) @@ -1230,28 +1236,31 @@ (let ((iterated-parent (pathname-directory (conc lnkpath "/" item-path)))) (debug:print-info 2 *default-log-port* "Creating iterated parent " iterated-parent) (handle-exceptions exn (begin - (debug:print-error 0 *default-log-port* " Failed to create directory " iterated-parent ((condition-property-accessor 'exn 'message) exn) ", continuing but link tree may be corrupted") + (debug:print-error 0 *default-log-port* " Failed to create directory " iterated-parent ((condition-property-accessor 'exn 'message) exn) + ", continuing but link tree may be corrupted, exn=" exn) #;(exit 1)) (create-directory iterated-parent #t)))) (if (symbolic-link? lnkpath) (handle-exceptions exn (begin - (debug:print-error 0 *default-log-port* " Failed to remove symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", continuing but link tree may be corrupted.") + (debug:print-error 0 *default-log-port* " Failed to remove symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) + ", continuing but link tree may be corrupted. exn=" exn) #;(exit 1)) (delete-file lnkpath))) (if (not (or (common:file-exists? lnkpath) (symbolic-link? lnkpath))) (handle-exceptions exn (begin - (debug:print-error 0 *default-log-port* " Failed to create symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", continuing but link tree may be corrupted.") + (debug:print-error 0 *default-log-port* " Failed to create symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) + ", continuing but link tree may be corrupted. exn=" exn) #;(exit 1)) (create-symbolic-link toptest-path lnkpath))) ;; NB - This was not working right - some top tests are not getting the path set!!! ;; @@ -1278,12 +1287,14 @@ (if (or (not curr-test-path) (not (directory-exists? toptest-path))) (begin (debug:print-info 2 *default-log-port* "Creating " toptest-path " and link " lnkpath) (handle-exceptions - exn - #f ;; don't care to catch and deal with errors here for now. + exn + (begin + (debug:print 0 *default-log-port* "failed to create directory " toptest-path ", exn=" exn) + #f) (create-directory toptest-path #t)) (hash-table-set! *toptest-paths* testname toptest-path))))) ;; The toptest path has been created, the link to the test in the linktree has ;; been created. Now, if this is an iterated test the real test dir must be created @@ -1292,11 +1303,12 @@ (debug:print 2 *default-log-port* "Setting up sub test run area") (debug:print 2 *default-log-port* " - creating run area in " test-path) (handle-exceptions exn (begin - (debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ", exiting") + (debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) + ", exiting, exn=" exn) (exit 1)) (create-directory test-path #t)) (debug:print 2 *default-log-port* " - creating link from: " test-path "\n" " to: " lnktarget) @@ -1303,11 +1315,11 @@ ;; If there is already a symlink delete it and recreate it. (handle-exceptions exn (begin - (debug:print-error 0 *default-log-port* " Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting") + (debug:print-error 0 *default-log-port* " Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting, exn=" exn) (exit)) (if (symbolic-link? lnktarget) (delete-file lnktarget)) (if (not (common:file-exists? lnktarget)) (create-symbolic-link test-path lnktarget))))) (if (not (directory? test-path)) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -380,11 +380,11 @@ (resdat (if (not (and read-only qry-is-write)) (let ((v (api:execute-requests dbstruct-local (vector (symbol->string cmd) params)))) (handle-exceptions ;; there has been a long history of receiving strange errors from values returned by the client when things go wrong.. exn ;; This is an attempt to detect that situation and recover gracefully (begin - (debug:print 0 *default-log-port* "ERROR: bad data from server " v " message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "ERROR: bad data from server " v " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (vector #t '())) ;; should always get a vector but if something goes wrong return a dummy (if (and (vector? v) (> (vector-length v) 1)) (let ((newvec (vector (vector-ref v 0)(vector-ref v 1)))) newvec) ;; by copying the vector while inside the error handler we should force the detection of a corrupted record Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -213,15 +213,17 @@ exn (let ((call-chain (get-call-chain)) (msg ((condition-property-accessor 'exn 'message) exn))) (if (< count 5) (begin ;; this call is colliding, do some crude stuff to fix it. - (debug:print 0 *default-log-port* "ERROR: *configdat* was inaccessible! This should never happen. Retry #" count) + (debug:print 0 *default-log-port* "ERROR: *configdat* was inaccessible! This should never happen. Retry #" count + ", exn=" exn) (launch:setup force-reread: #t) (fatal-loop (+ count 1))) (begin - (debug:print 0 *default-log-port* "FATAL: *configdat* was inaccessible! This should never happen. Retried " count " times. Message: " msg) + (debug:print 0 *default-log-port* "FATAL: *configdat* was inaccessible! This should never happen. Retried " count + " times. Message: " msg) (debug:print 0 *default-log-port* "Call chain:") (with-output-to-port *default-log-port* (lambda () (print "*configdat* is >>"*configdat*"<<") @@ -380,11 +382,11 @@ (if (null? existing-tests) (let* ((use-log-dir (if (not (directory-exists? log-dir)) (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir) + (debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir ", exn=" exn) #f) (create-directory log-dir #t) #t) #t)) (start-time (current-seconds)) @@ -391,11 +393,11 @@ (actual-logf (if use-log-dir full-log-fname log-file))) (handle-exceptions exn (begin (print-call-chain *default-log-port*) - (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (debug:print 0 *default-log-port* "ERROR: failed to run pre-hook " run-pre-hook ", check the log " log-file)) (debug:print-info 0 *default-log-port* "running run-pre-hook: \"" run-pre-hook "\", log is " actual-logf) (system (conc run-pre-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 *default-log-port* "pre-hook \"" run-pre-hook "\" took " (- (current-seconds) start-time) " seconds to run."))) (debug:print 0 *default-log-port* "Skipping pre-hook call \"" run-pre-hook "\" as there are existing tests for this run."))))) @@ -420,11 +422,11 @@ ;; (debug:print 0 *default-log-port* "Skipping post-hook call \"" run-post-hook "\" as there are existing tests for this run."))))) (let* ((use-log-dir (if (not (directory-exists? log-dir)) (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir) + (debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir ", exn=" exn) #f) (create-directory log-dir #t) #t) #t)) (start-time (current-seconds)) @@ -431,11 +433,11 @@ (actual-logf (if use-log-dir full-log-fname log-file))) (handle-exceptions exn (begin (print-call-chain *default-log-port*) - (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (debug:print 0 *default-log-port* "ERROR: failed to run post-hook " run-post-hook ", check the log " log-file)) (debug:print-info 0 *default-log-port* "running run-post-hook: \"" run-post-hook "\", log is " actual-logf) (system (conc run-post-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 *default-log-port* "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run.")))))) @@ -751,11 +753,11 @@ (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions exn - (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id) + (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn) (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) ;; (thread-start! th1) (thread-start! th2) @@ -2630,18 +2632,18 @@ (if (symbolic-link? run-dir) (begin (debug:print-info 1 *default-log-port* "Removing symlink " run-dir) (handle-exceptions exn - (debug:print-error 0 *default-log-port* " Failed to remove symlink " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue") + (debug:print-error 0 *default-log-port* " Failed to remove symlink " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue, exn=" exn) (delete-file run-dir))) (if (directory? run-dir) (if (> (directory-fold (lambda (f x)(+ 1 x)) 0 run-dir) 0) (debug:print 0 *default-log-port* "WARNING: refusing to remove " run-dir " as it is not empty") (handle-exceptions exn - (debug:print-error 0 *default-log-port* " Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue") + (debug:print-error 0 *default-log-port* " Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue, exn=" exn) (delete-directory run-dir))) (if (and run-dir (not (member run-dir (list "n/a" "/tmp/badname")))) (debug:print 0 *default-log-port* "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink") (debug:print 0 *default-log-port* "NOTE: the run dir for this test is undefined. Test may have already been deleted.")) @@ -2914,11 +2916,11 @@ (let* ((final-doc ((sxml-modify `("testsuite" insert-into (@ (name ,xml-ts-name) (tests ,tests-count) (errors ,error-cnt) (failures ,fail-cnt)))) new-doc))) (debug:print 0 *default-log-port* "modify attrib error=" error-cnt " fail= " fail-cnt) (handle-exceptions exn (let* ((msg ((condition-property-accessor 'exn 'message) exn))) - (debug:print 0 *default-log-port* (conc "WARNING: Failed to update file" xml-path". Message:" msg))) + (debug:print 0 *default-log-port* (conc "WARNING: Failed to update file" xml-path". Message:" msg ", exn=" exn))) (if (not (file-exists? xml-dir)) (create-directory xml-dir #t)) (if (not (rmt:no-sync-get/default keyname #f)) (begin @@ -2948,10 +2950,10 @@ (debug:print-info 0 *default-log-port* "Removing cached files:\n " (string-intersperse files "\n ")) (for-each (lambda (f) (handle-exceptions exn - (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f) + (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f ", exn=" exn) (delete-file f))) files)))) (debug:print-error 0 *default-log-port* "-clean-cache requires -runname.")) (debug:print-error 0 *default-log-port* "-clean-cache requires -target or -reqtarg")))