@@ -167,11 +167,13 @@ ;; given a path to a server log return: host port startseconds ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let (define (server:logf-get-start-info logf) - (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)"))) ;; SERVER STARTED: host:port AT timesecs server id + (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id + (dbprep-rx (regexp "^SERVER: dbprep")) + (dbprep-found 0)) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to get server info from " logf ", exn=" exn) (list #f #f #f #f)) ;; no idea what went wrong, call it a bad server @@ -179,11 +181,16 @@ logf (lambda () (let loop ((inl (read-line)) (lnum 0)) (if (not (eof-object? inl)) - (let ((mlst (string-match rx inl))) + (let ((mlst (string-match server-rx inl)) + (dbprep (string-match dbprep-rx inl)) + ) + (if dbprep + (set! dbprep-found 1) + ) (if (not mlst) (if (< lnum 500) ;; give up if more than 500 lines of server log read (loop (read-line)(+ lnum 1)) (begin (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) @@ -192,11 +199,17 @@ (list (car dat) ;; host (string->number (cadr dat)) ;; port (string->number (caddr dat)) (cadr (cddr dat)))))) (begin - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds)) + (if dbprep-found + (begin + (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds)) + (thread-sleep! 25) + ) + (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds)) + ) (list #f #f #f #f))))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time pid ) ;; @@ -214,15 +227,21 @@ (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) (directory-exists? (conc areapath "/logs"))) '())) - (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) + + ;; Get the list of server logs that do not contain "exiting". Ignore logs for servers that have exited. + (let* ((server-logs-cmd (conc "grep -iL exiting " areapath "/logs/server-*-*.log")) + (server-logs (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all)))) (num-serv-logs (length server-logs))) - (if (null? server-logs) - '() - (let loop ((hed (car server-logs)) + (if (or (null? server-logs) (= num-serv-logs 0)) + (let () + (debug:print 1 *default-log-port* "There are no servers running") + '() + ) + (let loop ((hed (string-chomp (car server-logs))) (tal (cdr server-logs)) (res '())) (let* ((mod-time (handle-exceptions exn (begin @@ -243,11 +262,11 @@ (if (null? tal) (if (and limit (> (length new-res) limit)) new-res ;; (take new-res limit) <= need intelligent sorting before this will work new-res) - (loop (car tal)(cdr tal) new-res))))))))) + (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) (define (server:get-num-alive srvlst) (let ((num-alive 0)) (for-each (lambda (server) @@ -359,10 +378,11 @@ (let* ((fmodtime (file-modification-time start-flag)) (delta (- (current-seconds) fmodtime)) (all-go (> delta reftime))) (if (and all-go (begin + (debug:print-info 0 *default-log-port* "Writing " start-flag) (with-output-to-file start-flag (lambda () (print server-key))) (thread-sleep! 0.25) (let ((res (with-input-from-file start-flag @@ -395,10 +415,11 @@ (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously (lock-file (conc areapath "/logs/server-start.lock"))) (if (> (- (current-seconds) when-run) run-delay) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 15) + (debug:print-info 0 *default-log-port* "server:kind-run: touching " start-flag) (system (conc "touch " start-flag)) ;; lazy but safe (server:run areapath) (thread-sleep! 2) ;; don't release the lock for at least a few seconds (common:simple-file-release-lock lock-file))) (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))