Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -572,33 +572,36 @@ (define *wdnum*mutex (make-mutex)) ;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp ;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) ;; (define (common:watchdog) - (thread-sleep! 0.05) ;; delay for startup (let ((legacy-sync (common:run-sync?)) (debug-mode (debug:debug-mode 1)) (last-time (current-seconds)) - (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))) - ) + (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))) (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num) (if (and legacy-sync (not *time-to-exit*)) - (let ((dbstruct (db:setup))) + (let* ((dbstruct (db:setup)) + (mtdb (dbr:dbstruct-mtdb dbstruct)) + (mtpath (db:dbdat-get-path mtdb))) (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") (let loop () - ;;(BB> "watchdog loop. pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) ;; sync for filesystem local db writes ;; (mutex-lock! *db-multi-sync-mutex*) (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write (sync-in-progress *db-sync-in-progress*) (should-sync (and (not *time-to-exit*) (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum + (start-time (current-seconds)) + (mt-mod-time (file-modification-time mtpath)) + (recently-synced (> (- start-time mt-mod-time) 4)) (will-sync (and (or need-sync should-sync) - (not sync-in-progress))) - (start-time (current-seconds))) + (not sync-in-progress) + (not recently-synced)))) + ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) (if will-sync (set! *db-sync-in-progress* #t)) (mutex-unlock! *db-multi-sync-mutex*) (if will-sync (let ((res (common:sync-to-megatest.db dbstruct))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -13,14 +13,14 @@ ;; Database access ;;====================================================================== ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc -(use (srfi 18) extras tcp stack) ;; RADT => use of require-extension? +(use (srfi 18) extras tcp stack) (use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest base64 format dot-locking z3 typed-records matchable) (import (prefix sqlite3 sqlite3:)) -(import (prefix base64 base64:)) ;; RADT => prefix?? +(import (prefix base64 base64:)) (declare (unit db)) (declare (uses common)) (declare (uses keys)) (declare (uses ods)) @@ -143,11 +143,11 @@ (use-mutex (> *api-process-request-count* 25))) (if (and use-mutex (common:low-noise-print 120 "over-50-parallel-api-requests")) (debug:print-info 0 *default-log-port* *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access")) (if (common:low-noise-print 600 (conc "parallel-api-requests" *max-api-process-requests*)) - (debug:print-info 0 *default-log-port* "Parallel api request count: " *api-process-request-count* " max parallel requests: " *max-api-process-requests*)) + (debug:print-info 2 *default-log-port* "Parallel api request count: " *api-process-request-count* " max parallel requests: " *max-api-process-requests*)) (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn)) @@ -2040,13 +2040,16 @@ db "SELECT id FROM runs WHERE state != 'deleted' ORDER BY event_time DESC;") (reverse run-ids))))) ;; get some basic run stats +;; +;; data structure: ;; ;; ( (runname (( state count ) ... )) -;; ( ... +;; ( ... +;; (define (db:get-run-stats dbstruct) (let* ((totals (make-hash-table)) (curr (make-hash-table)) (res '()) (runs-info '())) Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -620,11 +620,11 @@ #:numlin-visible 5 )) (colnames (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId")) (updater (lambda () (if (dashboard:monitor-changed? commondat tabdat) - (let ((servers (server:get-list *toppath*))) + (let ((servers (server:get-list *toppath* limit: 10))) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat)))) (iup:attribute-set! servers-matrix "NUMLIN" (length servers)) ;; (set! colnum 0) ;; (for-each (lambda (colname) ;; ;; (print "colnum: " colnum " colname: " colname) @@ -637,11 +637,11 @@ (set! colnum 0) (match-let (((mod-time host port start-time pid) server)) (let* ((uptime (- (current-seconds) mod-time)) (runtime (if start-time - (- (current-seconds) start-time) + (- mod-time start-time) 0)) (vals (list "-" ;; (vector-ref server 0) ;; Id "-" ;; (vector-ref server 9) ;; MT-Ver pid ;; (vector-ref server 1) ;; Pid host ;; (vector-ref server 2) ;; Hostname Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -379,11 +379,11 @@ (if (not server-going) ;; *dbstruct-db* (begin (debug:print 0 *default-log-port* "SERVER: dbprep") (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) - (debug:print 0 *default-log-port* "SERVER: running") ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. + (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. (thread-start! *watchdog*))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) @@ -425,19 +425,20 @@ server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond ((and *server-run* - (> (+ last-access server-timeout) - (current-seconds))) + (> (+ last-access server-timeout) + (current-seconds)) + (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour. (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (change-file-times server-log-file curr-time curr-time))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else - (debug:print-info 0 *default-log-port* "Server timeed out. seconds since last db access: " (- (current-seconds) last-access)) + (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) (define (http-transport:server-shutdown port) (let ((tdbdat (tasks:open-db))) ;;(BB> "http-transport:server-shutdown called") Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -160,12 +160,13 @@ (list #f #f #f))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time pid ) ;; -(define (server:get-list areapath) - (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))) +(define (server:get-list areapath #!key (limit #f)) + (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) + (day-seconds (* 24 60 60))) ;; if the directory exists continue to get the list ;; otherwise attempt to create the logs dir and then ;; continue (if (if (directory-exists? (conc areapath "/logs")) #t @@ -175,24 +176,34 @@ (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list."))) (directory-exists? (conc areapath "/logs"))) #f)) - (let ((server-logs (glob (conc areapath "/logs/server-*.log")))) + (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) + (num-serv-logs (length server-logs))) (if (null? server-logs) '() (let loop ((hed (car server-logs)) (tal (cdr server-logs)) (res '())) - (let* ((mod-time (file-modification-time hed)) - (serv-dat (server:logf-get-start-info hed)) + (let* ((mod-time (file-modification-time hed)) + (down-time (- (current-seconds) mod-time)) + (serv-dat (if (or (< num-serv-logs 10) + (< down-time day-seconds)) + (server:logf-get-start-info hed) + '())) ;; don't waste time processing server files not touched in the past day if there are more than ten servers to look at (serv-rec (cons mod-time serv-dat)) (fmatch (string-match fname-rx hed)) (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) - (new-res (cons (append serv-rec (list pid)) res))) - (if (null? tal) - new-res + (new-res (if (null? serv-dat) + res + (cons (append serv-rec (list pid)) res)))) + (if (null? tal) + (if (and limit + (> (length new-res) limit)) + new-res ;; (take new-res limit) <= need intelligent sorting before this will work + new-res) (loop (car tal)(cdr tal) new-res))))))))) ;; given a list of servers get a list of valid servers, i.e. at least ;; 10 seconds old, has started and is less than 1 hour old and is ;; active (i.e. mod-time < 10 seconds @@ -209,11 +220,11 @@ (let ((start-time (list-ref rec 3)) (mod-time (list-ref rec 0))) ;; (print "start-time: " start-time " mod-time: " mod-time) (and start-time mod-time (> (- now start-time) 1) ;; been running at least 1 seconds - (< (- now mod-time) 10) ;; still alive - file touched in last 10 seconds + (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds (< (- now start-time) 3600) ;; under one hour running time ))) srvlst) (lambda (a b) (< (list-ref a 3)