@@ -484,13 +484,14 @@ (directory-fold (lambda (file rem) (handle-exceptions exn (begin - (debug:print-info 0 *default-log-port* "unable to rotate log " file ", probably handled by another process.") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (print-call-chain (current-error-port))) + (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore.") + (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + ;; (print-call-chain (current-error-port)) ;; + ) (let* ((fullname (conc "logs/" file)) (mod-time (file-modification-time fullname)) (file-age (- (current-seconds) mod-time))) (hash-table-set! all-files file mod-time) (if (or (and (string-match "^.*.log" file) @@ -1714,11 +1715,11 @@ ;; cpu-load)) ;; get values from cached info from dropping file in logs dir ;; e.g. key is host and dtype is normalized-load ;; -(define (common:get-cached-info key dtype #!key (age 5)) +(define (common:get-cached-info key dtype #!key (age 10)) (if *toppath* (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))) (if (and (file-exists? fullpath) (file-read-access? fullpath)) (handle-exceptions @@ -1727,11 +1728,11 @@ (debug:print 2 *default-log-port* "reading file " fullpath) (let ((real-age (- (current-seconds)(file-change-time fullpath)))) (if (< real-age age) (with-input-from-file fullpath read) (begin - (debug:print 1 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") + (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") #f)))) (begin (debug:print 2 *default-log-port* "not reading file " fullpath) #f))) #f)) @@ -1993,11 +1994,13 @@ (result (if remote-host (with-input-from-pipe (conc "ssh " remote-host " cat /proc/cpuinfo") proc) (with-input-from-file "/proc/cpuinfo" proc)))) - (if (> result 0)(common:write-cached-info actual-host "num-cpus" result)) + (if (and (number? result) + (> result 0)) + (common:write-cached-info actual-host "num-cpus" result)) result)))) ;; wait for normalized cpu load to drop below maxload ;; (define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5)) @@ -2023,12 +2026,14 @@ (first (car loadavg)) (next (cadr loadavg)) (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 (loadjmp (- first next)) (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously - (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload - ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp) + ;; let's let the user know once in a long while that load checking is happening but not constantly report it + (if (> (random 100) 75) ;; about 25% of the time + (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload + ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)) (cond ((and (> first adjload) (> count 0)) (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) (thread-sleep! adjwait)