Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -242,19 +242,32 @@ ;; (define (common:rotate-logs) (if (not (directory-exists? "logs"))(create-directory "logs")) (directory-fold (lambda (file rem) - (if (and (string-match "^.*.log" file) - (> (file-size (conc "logs/" file)) 200000)) - (let ((gzfile (conc "logs/" file ".gz"))) - (if (file-exists? gzfile) - (begin - (debug:print-info 0 *default-log-port* "removing " gzfile) - (delete-file gzfile))) - (debug:print-info 0 *default-log-port* "compressing " file) - (system (conc "gzip logs/" file))))) + (handle-exceptions + exn + (debug:print-info 0 *default-log-port* "failed to rotate log " file ", probably handled by another process.") + (let* ((fullname (conc "logs/" file)) + (file-age (- (current-seconds)(file-modification-time fullname)))) + (if (or (and (string-match "^.*.log" file) + (> (file-size fullname) 200000)) + (and (string-match "^server-.*.log" file) + (> (- (current-seconds) (file-modification-time fullname)) + (* 8 60 60)))) + (let ((gzfile (conc fullname ".gz"))) + (if (file-exists? gzfile) + (begin + (debug:print-info 0 *default-log-port* "removing " gzfile) + (delete-file gzfile))) + (debug:print-info 0 *default-log-port* "compressing " file) + (system (conc "gzip " fullname))) + (if (> file-age (* (string->number (or (configf:lookup *configdat* "setup" "log-expire-days") "30")) 24 3600)) + (handle-exceptions + exn + #f + (delete-file fullname))))))) '() "logs")) ;; Force a megatest cleanup-db if version is changed and skip-version-check not specified ;; Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -101,11 +101,11 @@ (print help) (exit))) (if (not (common:on-homehost?)) (begin - (debug:print 0 "ERROR: Current policy requires running dashboard on homehost: " (common:get-homehost)))) + (debug:print 0 *default-log-port* "ERROR: Current policy requires running dashboard on homehost: " (common:get-homehost)))) ;; TODO: Move this inside (main) ;; (if (not (launch:setup)) (begin Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -219,13 +219,13 @@ (filter (lambda (rec) (let ((start-time (list-ref rec 3)) (mod-time (list-ref rec 0))) ;; (print "start-time: " start-time " mod-time: " mod-time) (and start-time mod-time - (> (- now start-time) 1) ;; been running at least 1 seconds + (> (- now start-time) 0) ;; been running at least 0 seconds (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds - (< (- now start-time) 3600) ;; under one hour running time + (< (- now start-time) (string->number (or (configf:lookup *configdat* "server" "runtime") "3600"))) ;; under one hour running time ))) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) @@ -251,24 +251,30 @@ *my-client-signature*))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched (define (server:kind-run areapath) - (let ((last-run-time (hash-table-ref/default *server-kind-run* areapath #f))) - (if (or (not last-run-time) - (> (- (current-seconds) last-run-time) 30)) - (begin - (server:run areapath) - (hash-table-set! *server-kind-run* areapath (current-seconds)))))) + (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun + (call-num (car last-run-dat)) + (when-run (cadr last-run-dat)) + (run-delay (+ (case call-num + ((0) 0) + ((1) 20) + ((2) 300) + (else 600)) + (random 5)))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously + (if (> (- (current-seconds) when-run) run-delay) + (server:run areapath)) + (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))) (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) (let loop ((server-url (server:check-if-running areapath))) (if (or server-url (> (current-seconds) give-up-time)) server-url - (let ((num-ok (server:get-best (server:get-list areapath)))) + (let ((num-ok (length (server:get-best (server:get-list areapath))))) (if (< num-ok 2) ;; if there are no decent candidates for servers then try starting a new one (server:kind-run areapath)) (thread-sleep! 5) (loop (server:check-if-running areapath))))))) @@ -284,25 +290,32 @@ (file-modification-time server-file)))))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) - (let* ((servers (server:get-best (server:get-list areapath))) - (best-server (if (null? servers) #f (car servers))) - (dotserver-url (if best-server - (server:record->url best-server) - #f))) ;; (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db))) - (if dotserver-url - (let* ((res (case *transport-type* - ((http)(server:ping dotserver-url)) - ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) - ))) - (if res - dotserver-url - (begin - ;; (server:kill best-server) - #f))) + (let* ((servers (server:get-best (server:get-list areapath)))) + (if (null? servers) + #f + (let loop ((hed (car servers)) + (tal (cdr servers))) + (let ((res (server:check-server hed))) + (if res + res + (if (null? tal) + #f + (loop (car tal)(cdr tal))))))))) + +;; ping the given server +;; +(define (server:check-server server-record) + (let* ((server-url (server:record->url server-record)) + (res (case *transport-type* + ((http)(server:ping server-url)) + ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) + ))) + (if res + server-url #f))) (define (server:kill servr) (match-let (((mod-time hostname port start-time pid) servr)) Index: utils/lock-stats.sh ================================================================== --- utils/lock-stats.sh +++ utils/lock-stats.sh @@ -1,11 +1,11 @@ #!/bin/bash while IFS=': ' read x x x x p x x i x; do if ! [[ ${i}x == "x" ]];then if ! $(echo $i|grep EOF >/dev/null);then - fname=$(sudo find -L "/proc/$p/fd" -maxdepth 1 -inum "$i" -exec readlink {} \; -quit) + fname=$(find -L "/proc/$p/fd" -maxdepth 1 -inum "$i" -exec readlink {} \; -quit) if $(echo $fname | grep megatest.db > /dev/null) || \ $(echo $fname | egrep '.db/\d+.db' > /dev/null);then echo $fname fi fi