Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -1506,13 +1506,19 @@ exn #f (with-output-to-file fullpath (lambda ()(pp dat)))))) (define (common:get-cpu-load remote-host) + (handle-exceptions + exn + (lambda() + (list 50 50 50) + ) (let ((al (common:get-normalized-cpu-load remote-host))) (list (alist-ref '1m-load al) (alist-ref '5m-load al) (alist-ref '15m-load al))) ;;(common:get-cpu-load-original remote-host) + ) ) ;; get cpu load by reading from /proc/loadavg, return all three values ;; (define (common:get-cpu-load-original remote-host) (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) @@ -1537,40 +1543,39 @@ (begin (with-input-from-file (pathname-expand "~/.megatest/tquery") (lambda() (set! tqfilecontents (read-string)) )) - (set! tqfileparts (string-split (string-trim-both tqfilecontents) ":")) + (handle-exceptions exn + (lambda() + (sleep 1) + (common:get-normalized-cpu-load remote-host) + ) + (set! tqfileparts (string-split (string-trim-both tqfilecontents) ":")) + ) ;;(print "TQuery host: " (car tqfileparts)) ;;(print "TQuery port " (cadr tqfileparts)) ;;(print "Getting normalized cpu load for : " remote-host " via " (car tqfileparts) ":" (cadr tqfileparts)) ) (begin (process-run "nbfake /p/fdk/gwa/jmoon18/fossil/megatest/tquery -server -") - (sleep 5) + (sleep 2) ) ) (handle-exceptions exn (lambda() ;;(print "Need to start tquery server here:") (process-run "nbfake /p/fdk/gwa/jmoon18/fossil/megatest/tquery -server -") - (sleep 5) + (sleep 2) (common:get-normalized-cpu-load remote-host) ) (define-values (i o) (tcp-connect (car tqfileparts) (string->number (cadr tqfileparts)))) ;;(define-values (i o) (tcp-connect "plxcas102" 9000)) (write-line (conc "adj-cpuload-full:" (if remote-host remote-host (get-host-name))) o) ;;(write-line "adj-cpuload-full:plxcm5005" o) (with-input-from-string (read-line i) read) ) - - ;;(define new-al with-input-from-string (read-line i) read) - ;;new-al - ;;(set! loadstring (read-string i)) - ;;(with-input-from-string loadstring read) - ;;`((adj-proc-load 3.0) - ;; (adj-core-load 3.2)) ) (define (common:get-normalized-cpu-load-original remote-host) (let ((res (common:get-normalized-cpu-load-raw-original remote-host)) @@ -1769,11 +1774,11 @@ (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) -(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)) +(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 100) (msg #f)(remote-host #f)(force-maxload #f)) (let* ((loadavg (common:get-cpu-load remote-host)) (numcpus (if (< 1 numcpus-in) ;; not possible (common:get-num-cpus remote-host) numcpus-in)) (maxload (if force-maxload @@ -1781,22 +1786,24 @@ (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? (first (car loadavg)) (next (cadr loadavg)) (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 (loadjmp (- first next)) - (adjwait (min (+ 300 (random 10)) (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) ) ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously + (adjwait (min (+ 300 (random 10)) (* (+ (random 10)(/ (- 100 count) 10) waitdelay) (- first adjload) ) ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload - ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp) + ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp " ,adjwait: " adjwait " ,numcpus: " numcpus ", loadjmp: " loadjmp) (cond ((and (> first adjload) (> count 0)) (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) + (debug:print-info 1 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) (thread-sleep! adjwait) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) ((and (> loadjmp numcpus) (> count 0)) (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) + (debug:print-info 1 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) (thread-sleep! adjwait) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))))) (define (common:wait-for-homehost-load maxload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. @@ -1805,11 +1812,16 @@ (hh (if hh-dat (car hh-dat) #f)) (numcpus (common:get-num-cpus hh))) (common:wait-for-normalized-load maxload msg hh))) (define (common:get-num-cpus remote-host) - (alist-ref 'core (common:get-normalized-cpu-load remote-host)) + (handle-exceptions exn + (lambda() + 2 + ) + (alist-ref 'core (common:get-normalized-cpu-load remote-host)) + ) ) (define (common:get-num-cpus-orig remote-host) (let* ((actual-host (or remote-host (get-host-name)))) (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often!