@@ -18,11 +18,11 @@ (declare (unit server)) (declare (uses common)) (declare (uses db)) (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. -(declare (uses synchash)) +;; (declare (uses synchash)) (declare (uses http-transport)) (declare (uses rpc-transport)) ;;(declare (uses nmsg-transport)) (declare (uses launch)) (declare (uses daemon)) @@ -112,11 +112,12 @@ " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") " -daemonize " "") ;; " -log " logfile " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) - (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) + (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) + (load-limit (configf:lookup-number *configdat* "jobtools" "maxhomehostload" default: 3.0))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) @@ -129,11 +130,11 @@ (begin (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) (setenv "TARGETHOST_LOGF" logfile) - (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever + (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) @@ -140,26 +141,29 @@ ;; given a path to a server log return: host port startseconds ;; (define (server:logf-get-start-info logf) (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs - (with-input-from-file - logf - (lambda () - (let loop ((inl (read-line)) - (lnum 0)) - (if (not (eof-object? inl)) - (let ((mlst (string-match rx inl))) - (if (not mlst) - (if (< lnum 500) ;; give up if more than 500 lines of server log read - (loop (read-line)(+ lnum 1)) - (list #f #f #f)) - (let ((dat (cdr mlst))) - (list (car dat) ;; host - (string->number (cadr dat)) ;; port - (string->number (caddr dat)))))) - (list #f #f #f))))))) + (handle-exceptions + exn + (list #f #f #f) ;; no idea what went wrong, call it a bad server + (with-input-from-file + logf + (lambda () + (let loop ((inl (read-line)) + (lnum 0)) + (if (not (eof-object? inl)) + (let ((mlst (string-match rx inl))) + (if (not mlst) + (if (< lnum 500) ;; give up if more than 500 lines of server log read + (loop (read-line)(+ lnum 1)) + (list #f #f #f)) + (let ((dat (cdr mlst))) + (list (car dat) ;; host + (string->number (cadr dat)) ;; port + (string->number (caddr dat)))))) + (list #f #f #f)))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time pid ) ;; (define (server:get-list areapath #!key (limit #f)) @@ -167,32 +171,35 @@ (day-seconds (* 24 60 60))) ;; if the directory exists continue to get the list ;; otherwise attempt to create the logs dir and then ;; continue (if (if (directory-exists? (conc areapath "/logs")) - #t + '() (if (file-write-access? areapath) (begin (condition-case (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list."))) (directory-exists? (conc areapath "/logs"))) - #f)) + '())) (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) (num-serv-logs (length server-logs))) (if (null? server-logs) '() (let loop ((hed (car server-logs)) (tal (cdr server-logs)) (res '())) - (let* ((mod-time (file-modification-time hed)) + (let* ((mod-time (handle-exceptions + exn + (current-seconds) ;; 0 + (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted (down-time (- (current-seconds) mod-time)) (serv-dat (if (or (< num-serv-logs 10) - (< down-time day-seconds)) - (server:logf-get-start-info hed) - '())) ;; don't waste time processing server files not touched in the past day if there are more than ten servers to look at + (< down-time 900)) ;; day-seconds)) + (server:logf-get-start-info hed) + '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at (serv-rec (cons mod-time serv-dat)) (fmatch (string-match fname-rx hed)) (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) (new-res (if (null? serv-dat) res @@ -200,11 +207,25 @@ (if (null? tal) (if (and limit (> (length new-res) limit)) new-res ;; (take new-res limit) <= need intelligent sorting before this will work new-res) - (loop (car tal)(cdr tal) new-res))))))))) + (loop (car tal)(cdr tal) new-res))))))))) + +(define (server:get-num-alive srvlst) + (let ((num-alive 0)) + (for-each + (lambda (server) + (match-let (((mod-time host port start-time pid) + server)) + (let* ((uptime (- (current-seconds) mod-time)) + (runtime (if start-time + (- mod-time start-time) + 0))) + (if (< uptime 5)(set! num-alive (+ num-alive 1)))))) + srvlst) + num-alive)) ;; given a list of servers get a list of valid servers, i.e. at least ;; 10 seconds old, has started and is less than 1 hour old and is ;; active (i.e. mod-time < 10 seconds ;; @@ -212,35 +233,52 @@ ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; and servers should stick around for about two hours or so. ;; (define (server:get-best srvlst) - (let ((now (current-seconds))) - (sort - (filter (lambda (rec) - (let ((start-time (list-ref rec 3)) - (mod-time (list-ref rec 0))) - ;; (print "start-time: " start-time " mod-time: " mod-time) - (and start-time mod-time - (> (- now start-time) 0) ;; been running at least 0 seconds - (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds - (< (- now start-time) - (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) - 180) - (random 360))) ;; under one hour running time +/- 180 - ))) - srvlst) - (lambda (a b) - (< (list-ref a 3) - (list-ref b 3)))))) + (let* ((nums (server:get-num-servers)) + (now (current-seconds)) + (slst (sort + (filter (lambda (rec) + (if (and (list? rec) + (> (length rec) 2)) + (let ((start-time (list-ref rec 3)) + (mod-time (list-ref rec 0))) + ;; (print "start-time: " start-time " mod-time: " mod-time) + (and start-time mod-time + (> (- now start-time) 0) ;; been running at least 0 seconds + (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds + (< (- now start-time) + (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) + 180) + (random 360))) ;; under one hour running time +/- 180 + )) + #f)) + srvlst) + (lambda (a b) + (< (list-ref a 3) + (list-ref b 3)))))) + (if (> (length slst) nums) + (take slst nums) + slst))) (define (server:get-first-best areapath) (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and srvrs (not (null? srvrs))) (car srvrs) #f))) + +(define (server:get-rand-best areapath) + (let ((srvrs (server:get-best (server:get-list areapath)))) + (if (and (list? srvrs) + (not (null? srvrs))) + (let* ((len (length srvrs)) + (idx (random len))) + (list-ref srvrs idx)) + #f))) + (define (server:record->url servr) (match-let (((mod-time host port start-time pid) servr)) (if (and host port) @@ -254,51 +292,62 @@ *my-client-signature*))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched (define (server:kind-run areapath) - (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun - (call-num (car last-run-dat)) - (when-run (cadr last-run-dat)) - (run-delay (+ (case call-num - ((0) 0) - ((1) 20) - ((2) 300) - (else 600)) - (random 5)))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously - (if (> (- (current-seconds) when-run) run-delay) - (server:run areapath)) - (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))) + (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? + (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun + (call-num (car last-run-dat)) + (when-run (cadr last-run-dat)) + (run-delay (+ (case call-num + ((0) 0) + ((1) 20) + ((2) 300) + (else 600)) + (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously + (lock-file (conc areapath "/logs/server-start.lock"))) + (if (> (- (current-seconds) when-run) run-delay) + (begin + (common:simple-file-lock-and-wait lock-file expire-time: 15) + (server:run areapath) + (thread-sleep! 5) ;; don't release the lock for at least a few seconds + (common:simple-file-release-lock lock-file))) + (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) - (let loop ((server-url (server:check-if-running areapath))) + (let loop ((server-url (server:check-if-running areapath)) + (try-num 0)) (if (or server-url (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. server-url (let ((num-ok (length (server:get-best (server:get-list areapath))))) - (if (< num-ok 1) ;; if there are no decent candidates for servers then try starting a new one + (if (and (> try-num 0) ;; first time through simply wait a little while then try again + (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one (server:kind-run areapath)) (thread-sleep! 5) - (loop (server:check-if-running areapath))))))) + (loop (server:check-if-running areapath) + (+ try-num 1))))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. -(define (server:dotserver-age-seconds areapath) - (let ((server-file (conc areapath "/.server"))) - (begin - (handle-exceptions - exn - #f - (- (current-seconds) - (file-modification-time server-file)))))) - +(define (server:get-num-servers #!key (numservers 2)) + (let ((ns (string->number + (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) + (or ns numservers))) + ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; -(define (server:check-if-running areapath) - (let* ((servers (server:get-best (server:get-list areapath)))) - (if (null? servers) +(define (server:check-if-running areapath) ;; #!key (numservers "2")) + (let* ((ns (server:get-num-servers)) + (servers (server:get-best (server:get-list areapath)))) + ;; (print "servers: " servers " ns: " ns) + (if (or (and servers + (null? servers)) + (not servers) + (and (list? servers) + (< (length servers) (random ns)))) ;; somewhere between 0 and numservers #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res @@ -381,22 +430,156 @@ ((NOREPLY) #f) ((LOGIN_OK) #t) (else #f)) (loop (read-line) inl)))))) +;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). +;; (define (server:login toppath) (lambda (toppath) (set! *db-last-access* (current-seconds)) ;; might not be needed. (if (equal? *toppath* toppath) #t #f))) -(define (server:get-timeout) - (let ((tmo (configf:lookup *configdat* "server" "timeout"))) +;; timeout is hms string: 1h 5m 3s, default is 1 minute +;; +(define (server:expiration-timeout) + (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) - (string->number tmo)) - (* 60 60 (string->number tmo)) - ;; (* 3 24 60 60) ;; default to three days - (* 60 1) ;; default to one minute - ;; (* 60 60 25) ;; default to 25 hours - ))) + (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below + (* 3600 (string->number tmo)) + 60))) + +;; moving this here as it needs access to db and cannot be in common. +;; +(define (server:writable-watchdog dbstruct) + (thread-sleep! 0.05) ;; delay for startup + (let ((legacy-sync (common:run-sync?)) + (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300)) + (debug-mode (debug:debug-mode 1)) + (last-time (current-seconds)) + (no-sync-db (db:open-no-sync-db)) + (sync-duration 0) ;; run time of the sync in milliseconds + (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))) + (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls + (debug:print-info 2 *default-log-port* "Periodic sync thread started.") + (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num) + (if (and legacy-sync (not *time-to-exit*)) + (let* (;;(dbstruct (db:setup)) + (mtdb (dbr:dbstruct-mtdb dbstruct)) + (mtpath (db:dbdat-get-path mtdb)) + (tmp-area (common:get-db-tmp-area)) + (start-file (conc tmp-area "/.start-sync")) + (end-file (conc tmp-area "/.end-sync"))) + (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") + (let loop () + ;; sync for filesystem local db writes + ;; + (mutex-lock! *db-multi-sync-mutex*) + (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write + (sync-in-progress *db-sync-in-progress*) + (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 5)) + (should-sync (and (not *time-to-exit*) + (> (- (current-seconds) *db-last-sync*) min-intersync-delay))) ;; sync every five seconds minimum, deprecated logic, can probably be removed + (start-time (current-seconds)) + (cpu-load-adj (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f))) + (mt-mod-time (file-modification-time mtpath)) + (last-sync-start (if (common:file-exists? start-file) + (file-modification-time start-file) + 0)) + (last-sync-end (if (common:file-exists? end-file) + (file-modification-time end-file) + 10)) + (sync-period (+ 3 (* cpu-load-adj 30))) ;; as adjusted load increases increase the sync period + (recently-synced (and (< (- start-time mt-mod-time) sync-period) ;; not useful if sync didn't modify megatest.db! + (< mt-mod-time last-sync-start))) + (sync-done (<= last-sync-start last-sync-end)) + (sync-stale (> start-time (+ last-sync-start sync-stale-seconds))) + (will-sync (and (not *time-to-exit*) ;; do not start a sync if we are in the process of exiting + (or need-sync should-sync) + (or sync-done sync-stale) + (not sync-in-progress) + (not recently-synced)))) + (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop. need-sync="need-sync" sync-in-progress=" sync-in-progress + " should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync + " sync-done=" sync-done " sync-period=" sync-period) + (if (and (> sync-period 5) + (common:low-noise-print 30 "sync-period")) + (debug:print-info 0 *default-log-port* "Increased sync period due to long sync times, sync took: " sync-period " seconds.")) + ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) + ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) + (if will-sync (set! *db-sync-in-progress* #t)) + (mutex-unlock! *db-multi-sync-mutex*) + (if will-sync + (let (;; (max-sync-duration (configf:lookup-number *configdat* "server" "max-sync-duration")) ;; KEEPING THIS AVAILABLE BUT SHOULD NOT USE, I'M PRETTY SURE IT DOES NOT WORK! + (sync-start (current-milliseconds))) + (with-output-to-file start-file (lambda ()(print (current-process-id)))) + + ;; put lock here + + ;; (if (or (not max-sync-duration) + ;; (< sync-duration max-sync-duration)) ;; NOTE: db:sync-to-megatest.db keeps track of time of last sync and syncs incrementally + (let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive + (set! sync-duration (- (current-milliseconds) sync-start)) + (if (> res 0) ;; some records were transferred, keep the db alive + (begin + (mutex-lock! *heartbeat-mutex*) + (set! *db-last-access* (current-seconds)) + (mutex-unlock! *heartbeat-mutex*) + (debug:print-info 0 *default-log-port* "sync called, " res " records transferred.")) + (debug:print-info 2 *default-log-port* "sync called but zero records transferred"))))) +;; ;; TODO: factor this next routine out into a function +;; (with-input-from-pipe ;; this should not block other threads but need to verify this +;; (conc "megatest -sync-to-megatest.db -m testsuite:" (common:get-area-name) ":" *toppath*) +;; (lambda () +;; (let loop ((inl (read-line)) +;; (res #f)) +;; (if (eof-object? inl) +;; (begin +;; (set! sync-duration (- (current-milliseconds) sync-start)) +;; (cond +;; ((not res) +;; (debug:print 0 *default-log-port* "ERROR: sync from /tmp db to megatest.db appears to have failed. Recommended that you stop your runs and run \"megatest -cleanup-db\"")) +;; ((> res 0) +;; (mutex-lock! *heartbeat-mutex*) +;; (set! *db-last-access* (current-seconds)) +;; (mutex-unlock! *heartbeat-mutex*)))) +;; (let ((num-synced (let ((matches (string-match "^Synced (\\d+).*$" inl))) +;; (if matches +;; (string->number (cadr matches)) +;; #f)))) +;; (loop (read-line) +;; (or num-synced res)))))))))) + (if will-sync + (begin + (mutex-lock! *db-multi-sync-mutex*) + (set! *db-sync-in-progress* #f) + (set! *db-last-sync* start-time) + (with-output-to-file end-file (lambda ()(print (current-process-id)))) + + ;; release lock here + + (mutex-unlock! *db-multi-sync-mutex*))) + (if (and debug-mode + (> (- start-time last-time) 60)) + (begin + (set! last-time start-time) + (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*)))))) + + ;; keep going unless time to exit + ;; + (if (not *time-to-exit*) + (let delay-loop ((count 0)) + ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) + + (if (and (not *time-to-exit*) + (< count 6)) ;; was 11, changing to 4. + (begin + (thread-sleep! 1) + (delay-loop (+ count 1)))) + (if (not *time-to-exit*) (loop)))) + ;; time to exit, close the no-sync db here + (db:no-sync-close-db no-sync-db) + (if (common:low-noise-print 30) + (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num)))))))