Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -477,11 +477,11 @@ (data-synced 0) ;; count of changed records (tmp-area (common:get-db-tmp-area)) (old2new (member 'old2new options)) (dejunk (member 'dejunk options)) (killservers (member 'killservers options)) - (servers (server:get-list *toppath*)) + (servers (server:choose-server *toppath* 'all-valid)) ;; (server:get-list *toppath*)) (src-area (if old2new *toppath* tmp-area)) (dest-area (if old2new tmp-area *toppath*)) (dbfiles (if old2new (glob (conc *toppath* "/.megatest/*.db")) (glob (conc tmp-area "/.megatest/*.db")))) (keys (db:get-keys dbstruct)) (sync-durations (make-hash-table))) Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -704,11 +704,11 @@ #:numlin-visible 5 )) (colnames (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId")) (updater (lambda () (if (dashboard:monitor-changed? commondat tabdat) - (let ((servers (server:get-list *toppath* limit: 10))) + (let ((servers (server:choose-server *toppath* 'all-valid))) ;; (server:get-list *toppath* limit: 10))) (iup:attribute-set! servers-matrix "NUMLIN" (length servers)) ;; (set! colnum 0) ;; (for-each (lambda (colname) ;; ;; (print "colnum: " colnum " colname: " colname) ;; (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) @@ -716,26 +716,27 @@ ;; colnames) (set! rownum 1) (for-each (lambda (server) (set! colnum 0) - (match-let (((mod-time host port start-time server-id pid) + ;; (("192.168.0.127" 60215 1669088591.0 "c85484f764df7a8550b0224409bd4bcd") + (match-let (((host port start-time server-id pid) server)) - (let* ((uptime (- (current-seconds) mod-time)) + (let* (;; (uptime (- (current-seconds) mod-time)) (runtime (if start-time - (- mod-time start-time) + (- (current-seconds) start-time) 0)) (vals (list "-" ;; (vector-ref server 0) ;; Id "-" ;; (vector-ref server 9) ;; MT-Ver pid ;; (vector-ref server 1) ;; Pid host ;; (vector-ref server 2) ;; Hostname (conc host ":" port) ;; (conc (vector-ref server 3) ":" (vector-ref server 4)) ;; IP:Port (seconds->hr-min-sec runtime) ;; (- (current-seconds) start-time)) ;; (vector-ref server 6))) - (cond - ((< uptime 5) "alive") - ((< uptime 16) "probably alive");; less than 15 seconds since mod, call it alive (vector-ref server 8) ;; State - (else "dead")) + "-" #;(cond + ((< uptime 5) "alive") + ((< uptime 16) "probably alive");; less than 15 seconds since mod, call it alive (vector-ref server 8) ;; State + (else "dead")) "-" ;; (vector-ref server 12) ;; RunId ))) (for-each (lambda (val) (let* ((row-col (conc rownum ":" colnum)) (curr-val (iup:attribute servers-matrix row-col))) @@ -745,11 +746,11 @@ (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum)))) (set! colnum (+ 1 colnum)))) vals) (set! rownum (+ rownum 1))) (iup:attribute-set! servers-matrix "REDRAW" "ALL"))) - (sort servers (lambda (a b)(> (car a)(car b)))))))))) + (sort servers (lambda (a b)(> (caddr a)(caddr b)))))))))) (set! colnum 0) (for-each (lambda (colname) (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum)) (set! colnum (+ colnum 1))) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -436,11 +436,11 @@ (create-directory servinfodir #t)) (with-output-to-file servinf (lambda () (let* ((serv-id (server:mk-signature))) (set! *server-id* serv-id) - (print "SERVER STARTED: "ipaddr":"port" AT "(current-seconds)" server-id: "serv-id) + (print "SERVER STARTED: "ipaddr":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)) (print "started: "(seconds->year-week/day-time (current-seconds)))))) (set! *on-exit-procs* (cons (lambda () (delete-file* servinf)) *on-exit-procs*)) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -951,11 +951,11 @@ (if (or (args:get-arg "-list-servers") (args:get-arg "-kill-servers")) (let ((tl (launch:setup))) (if tl ;; all roads from here exit - (let* ((servers (server:get-list *toppath*)) + (let* ((servers (server:choose-server *toppath* 'all-valid)) ;; (server:get-list *toppath*)) (fmtstr "~33a~22a~20a~20a~8a\n")) (format #t fmtstr "ID" "host:port" "age (hms)" "Last mod" "State") (format #t fmtstr "==" "=========" "=========" "========" "=====") (for-each ;; ( mod-time host port start-time pid ) (lambda (server) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -185,108 +185,109 @@ ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 (define (server:logf-get-start-info logf) - (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id - (dbprep-rx (regexp "^SERVER: dbprep")) - (dbprep-found 0)) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf ", exn=" exn) - (list #f #f #f #f)) ;; no idea what went wrong, call it a bad server + (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id + (dbprep-rx (regexp "^SERVER: dbprep")) + (dbprep-found 0) + (bad-dat (list #f #f #f #f #f))) +;; (handle-exceptions +;; exn +;; (begin +;; (debug:print-info 0 *default-log-port* "Unable to get server info from " logf ", exn=" exn) +;; bad-dat) ;; no idea what went wrong, call it a bad server (with-input-from-file logf (lambda () (let loop ((inl (read-line)) (lnum 0)) (if (not (eof-object? inl)) (let ((mlst (string-match server-rx inl)) - (dbprep (string-match dbprep-rx inl)) - ) - (if dbprep - (set! dbprep-found 1) - ) + (dbprep (string-match dbprep-rx inl))) + (if dbprep (set! dbprep-found 1)) (if (not mlst) (if (< lnum 500) ;; give up if more than 500 lines of server log read (loop (read-line)(+ lnum 1)) (begin (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) - (list #f #f #f #f))) - (let ((dat (cdr mlst))) - (list (car dat) ;; host - (string->number (cadr dat)) ;; port - (string->number (caddr dat)) - (cadr (cddr dat)))))) + bad-dat)) + (match mlst + ((_ host port start server-id pid) + (list host + (string->number port) + (string->number start) + server-id + (string->number pid))) + (else + (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) + bad-dat)))) (begin - (if dbprep-found + (if dbprep-found (begin (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) - (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting? - ) - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))) - ) - (list #f #f #f #f))))))))) - -;; get a list of servers from the log files, with all relevant data -;; ( mod-time host port start-time pid ) -;; -(define (server:get-list areapath #!key (limit #f)) - (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) - (day-seconds (* 24 60 60))) - ;; if the directory exists continue to get the list - ;; otherwise attempt to create the logs dir and then - ;; continue - (if (if (directory-exists? (conc areapath "/logs")) - '() - (if (file-write-access? areapath) - (begin - (condition-case - (create-directory (conc areapath "/logs") #t) - (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) - (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) - (directory-exists? (conc areapath "/logs"))) - '())) - - ;; Get the list of server logs. - (let* ( - ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. - ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) - (server-logs (glob (conc areapath "/logs/server-*-*.log"))) - (num-serv-logs (length server-logs))) - (if (or (null? server-logs) (= num-serv-logs 0)) - (let () - (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) - '() - ) - (let loop ((hed (string-chomp (car server-logs))) - (tal (cdr server-logs)) - (res '())) - (let* ((mod-time (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) - (current-seconds)) ;; 0 - (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted - (down-time (- (current-seconds) mod-time)) - (serv-dat (if (or (< num-serv-logs 10) - (< down-time 900)) ;; day-seconds)) - (server:logf-get-start-info hed) - '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at - (serv-rec (cons mod-time serv-dat)) - (fmatch (string-match fname-rx hed)) - (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) - (new-res (if (null? serv-dat) - res - (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let - (if (null? tal) - (if (and limit - (> (length new-res) limit)) - new-res ;; (take new-res limit) <= need intelligent sorting before this will work - new-res) - (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) + (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? + (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) + bad-dat)))))));; ) + +;; ;; get a list of servers from the log files, with all relevant data +;; ;; ( mod-time host port start-time pid ) +;; ;; +;; (define (server:get-list areapath #!key (limit #f)) +;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) +;; (day-seconds (* 24 60 60))) +;; ;; if the directory exists continue to get the list +;; ;; otherwise attempt to create the logs dir and then +;; ;; continue +;; (if (if (directory-exists? (conc areapath "/logs")) +;; '() +;; (if (file-write-access? areapath) +;; (begin +;; (condition-case +;; (create-directory (conc areapath "/logs") #t) +;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) +;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) +;; (directory-exists? (conc areapath "/logs"))) +;; '())) +;; +;; ;; Get the list of server logs. +;; (let* ( +;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. +;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) +;; (server-logs (glob (conc areapath "/logs/server-*-*.log"))) +;; (num-serv-logs (length server-logs))) +;; (if (or (null? server-logs) (= num-serv-logs 0)) +;; (let () +;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) +;; '() +;; ) +;; (let loop ((hed (string-chomp (car server-logs))) +;; (tal (cdr server-logs)) +;; (res '())) +;; (let* ((mod-time (handle-exceptions +;; exn +;; (begin +;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) +;; (current-seconds)) ;; 0 +;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted +;; (down-time (- (current-seconds) mod-time)) +;; (serv-dat (if (or (< num-serv-logs 10) +;; (< down-time 900)) ;; day-seconds)) +;; (server:logf-get-start-info hed) +;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at +;; (serv-rec (cons mod-time serv-dat)) +;; (fmatch (string-match fname-rx hed)) +;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) +;; (new-res (if (null? serv-dat) +;; res +;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let +;; (if (null? tal) +;; (if (and limit +;; (> (length new-res) limit)) +;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work +;; new-res) +;; (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) #;(define (server:get-num-alive srvlst) (let ((num-alive 0)) (for-each (lambda (server) @@ -302,46 +303,46 @@ 0))) (if (< uptime 5)(set! num-alive (+ num-alive 1))))))) srvlst) num-alive)) -;; given a list of servers get a list of valid servers, i.e. at least -;; 10 seconds old, has started and is less than 1 hour old and is -;; active (i.e. mod-time < 10 seconds -;; -;; mod-time host port start-time pid -;; -;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off -;; and servers should stick around for about two hours or so. -;; -(define (server:get-best srvlst) - (let* ((nums (server:get-num-servers)) - (now (current-seconds)) - (slst (sort - (filter (lambda (rec) - (if (and (list? rec) - (> (length rec) 2)) - (let ((start-time (list-ref rec 3)) - (mod-time (list-ref rec 0))) - ;; (print "start-time: " start-time " mod-time: " mod-time) - (and start-time mod-time - (> (- now start-time) 0) ;; been running at least 0 seconds - (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds - (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set - (< (- now start-time) - (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) - 180) - (random 360)))) ;; under one hour running time +/- 180 - )) - #f)) - srvlst) - (lambda (a b) - (< (list-ref a 3) - (list-ref b 3)))))) - (if (> (length slst) nums) - (take slst nums) - slst))) +;; ;; given a list of servers get a list of valid servers, i.e. at least +;; ;; 10 seconds old, has started and is less than 1 hour old and is +;; ;; active (i.e. mod-time < 10 seconds +;; ;; +;; ;; mod-time host port start-time pid +;; ;; +;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off +;; ;; and servers should stick around for about two hours or so. +;; ;; +;; (define (server:get-best srvlst) +;; (let* ((nums (server:get-num-servers)) +;; (now (current-seconds)) +;; (slst (sort +;; (filter (lambda (rec) +;; (if (and (list? rec) +;; (> (length rec) 2)) +;; (let ((start-time (list-ref rec 3)) +;; (mod-time (list-ref rec 0))) +;; ;; (print "start-time: " start-time " mod-time: " mod-time) +;; (and start-time mod-time +;; (> (- now start-time) 0) ;; been running at least 0 seconds +;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds +;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set +;; (< (- now start-time) +;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) +;; 180) +;; (random 360)))) ;; under one hour running time +/- 180 +;; )) +;; #f)) +;; srvlst) +;; (lambda (a b) +;; (< (list-ref a 3) +;; (list-ref b 3)))))) +;; (if (> (length slst) nums) +;; (take slst nums) +;; slst))) ;; ;; switch from server:get-list to server:get-servers-info ;; ;; ;; (define (server:get-first-best areapath) ;; (let ((srvrs (server:get-best (server:get-list areapath)))) @@ -363,11 +364,11 @@ (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn) #f) - (match-let (((host port start-time server-id) + (match-let (((host port start-time server-id pid) servr)) (if server-id server-id #f)))) @@ -375,11 +376,11 @@ (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn) #f) - (match-let (((host port start-time server-id) + (match-let (((host port start-time server-id pid) servr)) (if (and host port) (conc host ":" port) #f)))) @@ -497,11 +498,15 @@ (thread-sleep! 3) (case mode ((homehost) (cons #f #f)) (else #f)))))) - +;; would like to eventually get rid of this +;; +(define (common:on-homehost?) + (server:choose-server *toppath* 'home?)) + ;; kind start up of server, wait before allowing another server for a given ;; area to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last