Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -36,13 +36,14 @@ artifacts.scm all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut # dbmod.import.o is just a hack here -mofiles/dbfile.o : mofiles/debugprint.o mofiles/commonmod.o dbmod.import.o +mofiles/dbfile.o : mofiles/debugprint.o mofiles/commonmod.o # dbmod.import.o db.o : dbmod.import.o mofiles/debugprint.o : mofiles/mtargs.o +mofiles/servermod.o : mofiles/artifacts.o # ftail.scm rmtmod.scm commonmod.scm removed # MSRCFILES = ducttape-lib.scm pkts.scm stml2.scm cookie.scm mutils.scm \ # mtargs.scm commonmod.scm dbmod.scm adjutant.scm ulex.scm \ # rmtmod.scm apimod.scm @@ -379,18 +380,18 @@ $(PREFIX)/bin/.$(ARCHSTR)/mtexec $(PREFIX)/bin/mtexec $(PREFIX)/bin/serialize-env \ $(PREFIX)/bin/nbfind $(PREFIX)/bin/mtrunner $(PREFIX)/bin/viewscreen $(PREFIX)/bin/mt_xterm \ $(PREFIX)/bin/mt-old-to-new.sh $(PREFIX)/bin/mt-new-to-old.sh \ $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun \ $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun $(PREFIX)/bin/mtutil \ - $(PREFIX)/bin/tcmt $(PREFIX)/share/db/mt-pg.sql \ + $(PREFIX)/share/db/mt-pg.sql \ $(PREFIX)/share/js/jquery-3.1.0.slim.min.js \ $(PREFIX)/bin/.$(ARCHSTR)/lib/libpangox-1.0.so \ $(PREFIX)/bin/.$(ARCHSTR)/lib/libpangox-1.0.so.0 \ $(PREFIX)/bin/.$(ARCHSTR)/lib/libxcb-xlib.so.0 # $(PREFIX)/bin/.$(ARCHSTR)/ndboard -# $(PREFIX)/bin/newdashboard +# $(PREFIX)/bin/newdashboard $(PREFIX)/bin/tcmt $(PREFIX)/bin/.$(ARCHSTR) : mkdir -p $(PREFIX)/bin/.$(ARCHSTR) mkdir -p $(PREFIX)/bin/.$(ARCHSTR)/lib Index: artifacts/artifacts.scm ================================================================== --- artifacts/artifacts.scm +++ artifacts/artifacts.scm @@ -96,10 +96,12 @@ ;; '((foods (fruit . f) ;; (meat . m))))) ;; => "beef" ;; +;; NOTE: We call artifacts "arfs" + (module artifacts ( ;; cards, util and misc ;; sort-cards ;; calc-sha1 @@ -139,10 +141,11 @@ get-value ;; looks up a value given a key in a dartifact flatten-all ;; merge the list of values from a query which includes a artifact into a flat alist <== really useful! check-artifact ;; artifact alists +get-artifact-fname write-alist->artifact read-artifact->alist ;; archive database ;; archive-open-db @@ -1089,15 +1092,18 @@ ;;====================================================================== ;; Read/write packets to files (convience functions) ;;====================================================================== +(define (get-artifact-fname targdir uuid) + (conc targdir "/" uuid ".artifact")) + ;; write alist to a artifact file ;; (define (write-alist->artifact targdir dat #!key (artifactspec '())(ptype #f)) (let-values (((uuid artifact)(alist->artifact dat artifactspec ptype: ptype))) - (with-output-to-file (conc targdir "/" uuid ".artifact") + (with-output-to-file (get-artifact-fname targdir uuid) (lambda () (print artifact))) uuid)) ;; return the uuid ;; read artifact into alist Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -58,11 +58,12 @@ ;; (declare (uses ftail)) ;; (import ftail) (import dbmod commonmod - dbfile) + dbfile + servermod) (define *db* #f) ;; this is only for the repl, do not use in general!!!! (include "common_records.scm") (include "key_records.scm") @@ -922,13 +923,29 @@ ;;====================================================================== ;; Server? Start up here. ;; (if (args:get-arg "-server") - (let ((tl (launch:setup))) + (let* ((tl (launch:setup)) + (srvdat (server:setup tl)) + (handler (lambda (dbstruct cmd params) + (api:execute-requests dbstruct (if (string? cmd) + (string->symbol cmd) + cmd) + (db:string->obj params))))) + (server:set-handler srvdat handler) + (srv-obj-to-str-set! srvdat db:obj->string) + (srv-str-to-obj-set! srvdat db:string->obj) + (srv-dbstruct-set! srvdat (db:setup #t)) + (thread-join! + (thread-start! (make-thread + (lambda () + (server:run srvdat))))) + ;; (server:launch 0 'http) - (http-transport:launch) + ;; (http-transport:launch) ;; NOTE: Need to replace this call + (set! *didsomething* #t))) ;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to ;; a specific Megatest area. Detail are being hashed out and this may change. ;; Index: servermod.scm ================================================================== --- servermod.scm +++ servermod.scm @@ -1,6 +1,6 @@ -;; Copyright 2006-2017, Matthew Welland. +;; Copyright 2006-2023, Matthew Welland. ;; ;; This file is part of Megatest. ;; ;; Megatest is free software: you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by @@ -12,13 +12,14 @@ ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see . -;; +;;====================================================================== (declare (unit servermod)) +(declare (uses artifacts)) (use md5 message-digest posix typed-records extras) (module servermod * @@ -29,453 +30,163 @@ extras md5 message-digest ports posix + srfi-18 typed-records data-structures - ) -(define *client-server-id* #f) + artifacts + ) (defstruct srv (areapath #f) (host #f) (pid #f) (type #f) (dir #f) - ) -;;====================================================================== -;; S E R V E R U T I L I T I E S -;;====================================================================== - -;; Generate a unique signature for this server -(define (mk-signature) - (message-digest-string (md5-primitive) - (with-output-to-string - (lambda () - (write (list (current-directory) - (current-process-id) - (argv))))))) - -(define (get-client-server-id) - (if *client-server-id* *client-server-id* - (let ((sig (mk-signature))) ;; clients re-use the server:mk-signature logic - (set! *client-server-id* sig) - *client-server-id*))) - -;; if srvdat is #f calculate host.pid -(define (get-host.pid srvdat) - (if srvdat - (conc (srv-host srvdat)"."(srv-pid srvdat)) - (conc (get-host-name)"."(current-process-id)))) + (incoming #f) + (dbstruct #f) + (handler #f) + (obj-to-str #f) + (str-to-obj #f) + ) ;; nearly every process in Megatest (if write access) starts a server so it ;; can receive messages to exit on request +;; servers have a type, mtserve, dboard, runner, execute? TOO COMPLICATED. ;; one server per run db file would be ideal. -;; servers have a type, mtserve, dboard, runner, execute - -;; mtrah/.servers//./incoming/*.artifact -;; | `attic -;; | -;; (note: not needed? (i)) `outgoing/./*.artifact -;; | `attic -;; `.host:port - -;; (i) Not needed if it is expected that all processes run a server +;; mtrah/.servers/./incoming/*.artifact +;; | `attic +;; | +;; `outgoing/./*.artifact +;; | `attic +;; `.host:port ;; on exit processes clean up. only mtserv or dboard clean up abandoned records? + +;; IDEA: All requests could go into one directory instead of server specific directory - need locking +;; don't get multiple processing of arfs ;; server:setup - setup the directory ;; server:launch - start a new mtserve process, possibly ;; using a launcher ;; server:run - run the long running thread that monitors ;; the .server area ;; server:exit - shutdown the server and exit ;; server:handle-request - take incoming request, process it, send response ;; back via best or fastest available transport + +;; call this with handler that takes dbstruct cmd and params after doing server:setup +;; and before starting server:run +;; +(define (server:set-handler srvdat handler) + (srv-handler-set! srvdat handler)) ;; set up the server area and return a server struct ;; NOTE: This will need to be gated by write-access ;; (define (server:setup areapath) (let* ((srvdat (make-srv areapath: areapath host: (get-host-name) ;; likely need to replace with ip address pid: (current-process-id) - ;; type: srvtype )) - (srvdir (conc areapath"/.server/"srvtype"/"(get-host.pid srvdat)))) + (srvdir (conc areapath"/.server/"(get-host.pid srvdat)))) (srv-dir-set! srvdat srvdir) + (srv-incoming-set! srvdat (conc srvdir"/incoming")) (create-directory srvdir #t) (for-each (lambda (d) (create-directory (conc srvdir"/"d))) '("incoming" "responses")) srvdat)) +(define *server-keep-running* #f) + +;; to cleanly shut the server down set *server-keep-running* to #f +;; +(define (server:run srvdat) + ;; create server arf + ;; put arf in srvdat-dir + ;; forever + ;; scan incoming dir + ;; foreach arf + ;; bundle into with-transaction, no-transaction + ;; foreach bundle + ;; process the request + ;; create results arf and write it to clients dir + ;; remove in-arf from incoming + (let* ((areapath (srv-areapath srvdat)) + (srvinfod (server:get-servinfo-dir areapath)) + (myarf (srv->alist srvdat)) + (myuuid (write-alist->artifact srvinfod myarf ptype: 'S)) + (arf-fname (get-artifact-fname srvinfod myuuid)) + (dbstruct (srv-dbstruct srvdat))) + (set! *server-keep-running* #t) + (let loop () + (let* ((start (current-milliseconds)) + (res (server:process-incoming srvdat)) + (delta (- (current-milliseconds) start))) + (thread-sleep! (if (> delta 500) + 0.1 + 0.9)) + (if (or (> res 0) ;; res is the number of requests that were found and processed + *server-keep-running*) + (loop)))))) + +;; read arfs from incoming, process them and put result arfs in proper dirs +;; return number requests found and processed +;; +(define (server:process-incoming srvdat) + (let* ((srvdir (srv-dir srvdat)) + (indir (srv-incoming srvdat)) + (arfs (glob (conc indir"/*.artifacts"))) + (handler (srv-handler srvdat)) + (obj->string (srv-obj-to-str srvdat)) + (dbstruct (srv-dbstruct srvdat))) + (let loop ((rem arfs)) + (if (not (null? arfs)) + (let* ((arf (car rem)) + (dat (read-artifact->alist arf)) + (ruuid (alist-ref 'Z dat)) + (host (alist-ref 'h dat)) + (pid (alist-ref 'i dat)) + (dest (conc srvdir"/"host"."pid"/responses")) + (cmd (alist-ref 'c dat)) + (params (alist-ref 'p dat)) + (res (handler dbstruct cmd params)) + (narf `((r . ,(obj->string res)) + (P . ,ruuid)))) + (delete-file arf) ;; add ability to save in bundles in archive area + (write-alist->artifact dest narf ptype: 'Q) + (loop (cdr rem))))) + (length arfs))) + +;; start a server process (NOT start server in this process) +;; ;; maybe check load before calling this? (define (server:launch areapath) (let* ((logd (conc areapath"/logs")) (logf (conc logd"/from-"(get-host.pid #f)".log"))) (if (not (file-exists? logd))(create-directory logd #t)) (setenv "NBFAKE_LOG" logf) (system (conc "nbfake mtserve -start-dir "areapath)))) -;; ;; When using zmq this would send the message back (two step process) -;; ;; with spiffy or rpc this simply returns the return data to be returned -;; ;; -;; (define (server:reply return-addr query-sig success/fail result) -;; (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result) -;; ;; (send-message pubsock target send-more: #t) -;; ;; (send-message pubsock -;; (case (server:get-transport) -;; ((rpc) (db:obj->string (vector success/fail query-sig result))) -;; ((http) (db:obj->string (vector success/fail query-sig result))) -;; ((fs) result) -;; (else -;; (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) -;; result))) -;; -;; ;; Given an area path, start a server process ### NOTE ### > file 2>&1 -;; ;; if the target-host is set -;; ;; try running on that host -;; ;; incidental: rotate logs in logs/ dir. -;; ;; -;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area -;; (let* ((testsuite (common:get-testsuite-name)) -;; (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) -;; (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") -;; "")) -;; (cmdln (conc (common:get-megatest-exe) -;; " -server - ";; (or target-host "-") -;; (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") -;; " -daemonize " -;; "") -;; ;; " -log " logfile -;; " -m testsuite:" testsuite -;; " " profile-mode -;; )) ;; (conc " >> " logfile " 2>&1 &"))))) -;; (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? -;; (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) -;; ;; we want the remote server to start in *toppath* so push there -;; (push-directory areapath) -;; (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") -;; (thread-start! log-rotate) -;; -;; ;; host.domain.tld match host? -;; ;; (if (and target-host -;; ;; ;; look at target host, is it host.domain.tld or ip address and does it -;; ;; ;; match current ip or hostname -;; ;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) -;; ;; (not (equal? curr-ip target-host))) -;; ;; (begin -;; ;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) -;; ;; (setenv "TARGETHOST" target-host))) -;; ;; -;; (setenv "TARGETHOST_LOGF" logfile) -;; (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time -;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) -;; (system (conc "nbfake " cmdln)) -;; (unsetenv "TARGETHOST_LOGF") -;; ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) -;; (thread-join! log-rotate) -;; (pop-directory))) -;; -;; ;; given a path to a server log return: host port startseconds server-id -;; ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let -;; ;; example of what it's looking for in the log file: -;; ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 -;; -;; (define (server:logf-get-start-info logf) -;; (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id -;; (dbprep-rx (regexp "^SERVER: dbprep")) -;; (dbprep-found 0) -;; (bad-dat (list #f #f #f #f #f))) -;; (handle-exceptions -;; exn -;; (begin -;; ;; WARNING: this is potentially dangerous to blanket ignore the errors -;; (if (file-exists? logf) -;; (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn)) -;; bad-dat) ;; no idea what went wrong, call it a bad server -;; (with-input-from-file -;; logf -;; (lambda () -;; (let loop ((inl (read-line)) -;; (lnum 0)) -;; (if (not (eof-object? inl)) -;; (let ((mlst (string-match server-rx inl)) -;; (dbprep (string-match dbprep-rx inl))) -;; (if dbprep (set! dbprep-found 1)) -;; (if (not mlst) -;; (if (< lnum 500) ;; give up if more than 500 lines of server log read -;; (loop (read-line)(+ lnum 1)) -;; (begin -;; (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) -;; bad-dat)) -;; (match mlst -;; ((_ host port start server-id pid) -;; (list host -;; (string->number port) -;; (string->number start) -;; server-id -;; (string->number pid))) -;; (else -;; (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) -;; bad-dat)))) -;; (begin -;; (if dbprep-found -;; (begin -;; (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) -;; (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? -;; (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) -;; bad-dat)))))))) -;; -;; ;; ;; get a list of servers from the log files, with all relevant data -;; ;; ;; ( mod-time host port start-time pid ) -;; ;; ;; -;; ;; (define (server:get-list areapath #!key (limit #f)) -;; ;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) -;; ;; (day-seconds (* 24 60 60))) -;; ;; ;; if the directory exists continue to get the list -;; ;; ;; otherwise attempt to create the logs dir and then -;; ;; ;; continue -;; ;; (if (if (directory-exists? (conc areapath "/logs")) -;; ;; '() -;; ;; (if (file-write-access? areapath) -;; ;; (begin -;; ;; (condition-case -;; ;; (create-directory (conc areapath "/logs") #t) -;; ;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) -;; ;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) -;; ;; (directory-exists? (conc areapath "/logs"))) -;; ;; '())) -;; ;; -;; ;; ;; Get the list of server logs. -;; ;; (let* ( -;; ;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. -;; ;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) -;; ;; (server-logs (glob (conc areapath "/logs/server-*-*.log"))) -;; ;; (num-serv-logs (length server-logs))) -;; ;; (if (or (null? server-logs) (= num-serv-logs 0)) -;; ;; (let () -;; ;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) -;; ;; '() -;; ;; ) -;; ;; (let loop ((hed (string-chomp (car server-logs))) -;; ;; (tal (cdr server-logs)) -;; ;; (res '())) -;; ;; (let* ((mod-time (handle-exceptions -;; ;; exn -;; ;; (begin -;; ;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) -;; ;; (current-seconds)) ;; 0 -;; ;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted -;; ;; (down-time (- (current-seconds) mod-time)) -;; ;; (serv-dat (if (or (< num-serv-logs 10) -;; ;; (< down-time 900)) ;; day-seconds)) -;; ;; (server:logf-get-start-info hed) -;; ;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at -;; ;; (serv-rec (cons mod-time serv-dat)) -;; ;; (fmatch (string-match fname-rx hed)) -;; ;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) -;; ;; (new-res (if (null? serv-dat) -;; ;; res -;; ;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let -;; ;; (if (null? tal) -;; ;; (if (and limit -;; ;; (> (length new-res) limit)) -;; ;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work -;; ;; new-res) -;; ;; (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) -;; -;; #;(define (server:get-num-alive srvlst) -;; (let ((num-alive 0)) -;; (for-each -;; (lambda (server) -;; (handle-exceptions -;; exn -;; (begin -;; (debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn)) -;; (match-let (((mod-time host port start-time server-id pid) -;; server)) -;; (let* ((uptime (- (current-seconds) mod-time)) -;; (runtime (if start-time -;; (- mod-time start-time) -;; 0))) -;; (if (< uptime 5)(set! num-alive (+ num-alive 1))))))) -;; srvlst) -;; num-alive)) -;; -;; ;; ;; given a list of servers get a list of valid servers, i.e. at least -;; ;; ;; 10 seconds old, has started and is less than 1 hour old and is -;; ;; ;; active (i.e. mod-time < 10 seconds -;; ;; ;; -;; ;; ;; mod-time host port start-time pid -;; ;; ;; -;; ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off -;; ;; ;; and servers should stick around for about two hours or so. -;; ;; ;; -;; ;; (define (server:get-best srvlst) -;; ;; (let* ((nums (server:get-num-servers)) -;; ;; (now (current-seconds)) -;; ;; (slst (sort -;; ;; (filter (lambda (rec) -;; ;; (if (and (list? rec) -;; ;; (> (length rec) 2)) -;; ;; (let ((start-time (list-ref rec 3)) -;; ;; (mod-time (list-ref rec 0))) -;; ;; ;; (print "start-time: " start-time " mod-time: " mod-time) -;; ;; (and start-time mod-time -;; ;; (> (- now start-time) 0) ;; been running at least 0 seconds -;; ;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds -;; ;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set -;; ;; (< (- now start-time) -;; ;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) -;; ;; 180) -;; ;; (random 360)))) ;; under one hour running time +/- 180 -;; ;; )) -;; ;; #f)) -;; ;; srvlst) -;; ;; (lambda (a b) -;; ;; (< (list-ref a 3) -;; ;; (list-ref b 3)))))) -;; ;; (if (> (length slst) nums) -;; ;; (take slst nums) -;; ;; slst))) -;; -;; ;; ;; switch from server:get-list to server:get-servers-info -;; ;; ;; -;; ;; (define (server:get-first-best areapath) -;; ;; (let ((srvrs (server:get-best (server:get-list areapath)))) -;; ;; (if (and srvrs -;; ;; (not (null? srvrs))) -;; ;; (car srvrs) -;; ;; #f))) -;; ;; -;; ;; (define (server:get-rand-best areapath) -;; ;; (let ((srvrs (server:get-best (server:get-list areapath)))) -;; ;; (if (and (list? srvrs) -;; ;; (not (null? srvrs))) -;; ;; (let* ((len (length srvrs)) -;; ;; (idx (random len))) -;; ;; (list-ref srvrs idx)) -;; ;; #f))) -;; -;; (define (server:record->id servr) -;; (handle-exceptions -;; exn -;; (begin -;; (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn) -;; #f) -;; (match-let (((host port start-time server-id pid) -;; servr)) -;; (if server-id -;; server-id -;; #f)))) -;; -;; (define (server:record->url servr) -;; (handle-exceptions -;; exn -;; (begin -;; (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn) -;; #f) -;; (match-let (((host port start-time server-id pid) -;; servr)) -;; (if (and host port) -;; (conc host ":" port) -;; #f)))) -;; -;; -;; ;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough. -;; ;; if it is old enough, overwrite it and wait 0.25 seconds. -;; ;; if it then has the wrong server key, wait + 1 and call this function recursively. -;; ;; -;; #;(define (server:wait-for-server-start-last-flag areapath) -;; (let* ((start-flag (conc areapath "/logs/server-start-last")) -;; ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) -;; (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) -;; (server-key (conc (get-host-name) "-" (current-process-id)))) -;; (if (file-exists? start-flag) -;; (let* ((fmodtime (file-modification-time start-flag)) -;; (delta (- (current-seconds) fmodtime)) -;; (old-enough (> delta idletime)) -;; (new-server-key "")) -;; ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than seconds, and the new file still has the same server key as you just wrote, return #t. -;; ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process. -;; (if (and old-enough -;; (begin -;; (debug:print-info 2 *default-log-port* "Writing " start-flag) -;; (with-output-to-file start-flag (lambda () (print server-key))) -;; (thread-sleep! 0.25) -;; (set! new-server-key (with-input-from-file start-flag (lambda () (read-line)))) -;; (equal? server-key new-server-key))) -;; #t -;; ;; If either of the above conditions is not true, print a "Gating server start" message, wait + 1, then call this function recursively. -;; (begin -;; (debug:print-info 0 *default-log-port* "Gating server start, last start: " -;; (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) -;; -;; (thread-sleep! ( + 1 idletime)) -;; (server:wait-for-server-start-last-flag areapath))))))) -;; -;; ;; oldest server alive determines host then choose random of youngest -;; ;; five servers on that host -;; ;; -;; (define (server:get-servers-info areapath) -;; ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") -;; (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo"))) -;; (if (not (file-exists? servinfodir)) -;; (create-directory servinfodir)) -;; (let* ((allfiles (glob (conc servinfodir"/*"))) -;; (res (make-hash-table))) -;; (for-each -;; (lambda (f) -;; (let* ((hostport (pathname-strip-directory f)) -;; (serverdat (server:logf-get-start-info f))) -;; (match serverdat -;; ((host port start server-id pid) -;; (if (and host port start server-id pid) -;; (hash-table-set! res hostport serverdat) -;; (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))) -;; (else -;; (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))))) -;; allfiles) -;; res))) -;; -;; ;; check the .servinfo directory, are there other servers running on this -;; ;; or another host? -;; ;; -;; ;; returns #t => ok to start another server -;; ;; #f => not ok to start another server -;; ;; -;; (define (server:minimal-check areapath) -;; (server:clean-up-old areapath) -;; (let* ((srvdir (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo")) -;; (servrs (glob (conc srvdir"/*"))) -;; (thishostip (server:get-best-guess-address (get-host-name))) -;; (thisservrs (glob (conc srvdir"/"thishostip":*"))) -;; (homehostinf (server:choose-server areapath 'homehost)) -;; (havehome (car homehostinf)) -;; (wearehome (cdr homehostinf))) -;; (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome -;; ", numservers: "(length thisservrs)) -;; (cond -;; ((not havehome) #t) ;; no homehost yet, go for it -;; ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another -;; ((and havehome (not wearehome)) #f) ;; we are not the home host -;; ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running -;; (else -;; (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs) -;; #t)))) -;; -;; -;; (define server-last-start 0) -;; +;;====================================================================== +;; OLD SERVER STUFF BELOW HERE +;;====================================================================== + +;; ;; servers start by setting up fs transport +;; ;; and put a flag file for that ASAP. +;; ;; they then set up tcp and put a flag file for +;; ;; that +;; ;; +;; (define *client-server-id* #f) ;; ;; ;; oldest server alive determines host then choose random of youngest ;; ;; five servers on that host ;; ;; ;; ;; mode: @@ -490,21 +201,20 @@ ;; ;; 1. sort by age descending ;; ;; 2. take five ;; ;; 3. check alive, discard if not and repeat ;; ;; first we clean up old server files ;; (server:clean-up-old areapath) -;; (let* ((since-last (- (current-seconds) server-last-start)) -;; (server-start-delay 10)) -;; (if ( < (- (current-seconds) server-last-start) 10 ) -;; (begin -;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) -;; (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") -;; (thread-sleep! server-start-delay) -;; ) -;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) -;; ) -;; ) +;; ;; (let* ((since-last (- (current-seconds) server-last-start)) +;; ;; (server-start-delay 10)) +;; ;; (if ( < (- (current-seconds) server-last-start) 10 ) +;; ;; (begin +;; ;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) +;; ;; (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") +;; ;; (thread-sleep! server-start-delay) +;; ;; ) +;; ;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) +;; ;; ) ;; (let* ((serversdat (server:get-servers-info areapath)) ;; (servkeys (hash-table-keys serversdat)) ;; (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last ;; (sort servkeys ;; list of "host:port" ;; (lambda (a b) @@ -519,15 +229,15 @@ ;; (host (list-ref oldest-dat 0)) ;; (all-valid (filter (lambda (x) ;; (equal? host (list-ref (hash-table-ref serversdat x) 0))) ;; by-time-asc)) ;; (best-ten (lambda () -;; (if (> (length all-valid) 11) -;; (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out -;; (if (> (length all-valid) 8) -;; (drop-right all-valid 1) -;; all-valid)))) +;; (if (> (length all-valid) 11) +;; (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out +;; (if (> (length all-valid) 8) +;; (drop-right all-valid 1) +;; all-valid)))) ;; (names->dats (lambda (names) ;; (map (lambda (x) ;; (hash-table-ref serversdat x)) ;; names))) ;; (am-home? (lambda () @@ -556,16 +266,29 @@ ;; (set! server-last-start (current-seconds)) ;; ;; (thread-sleep! 3) ;; (case mode ;; ((homehost) (cons #f #f)) ;; (else #f)))))) -;; -;; (define (server:get-servinfo-dir areapath) -;; (let* ((spath (conc areapath"/.servinfo"))) -;; (if (not (file-exists? spath)) -;; (create-directory spath #t)) -;; spath)) + +;;====================================================================== +;; S E R V E R U T I L I T I E S +;;====================================================================== + +(define (server:get-servinfo-dir areapath) + (let* ((spath (conc areapath"/.servinfo"))) + (if (not (file-exists? spath)) + (create-directory spath #t)) + spath)) + +;; ;; Generate a unique signature for this server +;; (define (mk-signature) +;; (message-digest-string (md5-primitive) +;; (with-output-to-string +;; (lambda () +;; (write (list (current-directory) +;; (current-process-id) +;; (argv))))))) ;; ;; (define (server:clean-up-old areapath) ;; ;; any server file that has not been touched in ten minutes is effectively dead ;; (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*")))) ;; (for-each @@ -585,309 +308,785 @@ ;; exn ;; (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile) ;; (delete-file sfile)))))) ;; sfiles))) ;; -;; ;; would like to eventually get rid of this -;; ;; -;; (define (common:on-homehost?) -;; (server:choose-server *toppath* 'home?)) -;; -;; ;; kind start up of server, wait before allowing another server for a given -;; ;; area to be launched -;; ;; -;; (define (server:kind-run areapath) -;; ;; look for $MT_RUN_AREA_HOME/logs/server-start-last -;; ;; and wait for it to be at least seconds old -;; ;; (server:wait-for-server-start-last-flag areapath) -;; (let loop () -;; (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2) -;; (begin -;; (if (common:low-noise-print 30 "our-host-load") -;; (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server.")) -;; (loop)))) -;; (if (< (server:choose-server areapath 'count) 20) -;; (server:run areapath)) -;; #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? -;; (let* ((lock-file (conc areapath "/logs/server-start.lock"))) -;; (let* ((start-flag (conc areapath "/logs/server-start-last"))) -;; (common:simple-file-lock-and-wait lock-file expire-time: 25) -;; (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) -;; (system (conc "touch " start-flag)) ;; lazy but safe -;; (server:run areapath) -;; (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". -;; (common:simple-file-release-lock lock-file))) -;; (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another."))) -;; -;; ;; this one seems to be the general entry point -;; ;; -;; (define (server:start-and-wait areapath #!key (timeout 60)) -;; (let ((give-up-time (+ (current-seconds) timeout))) -;; (let loop ((server-info (server:check-if-running areapath)) -;; (try-num 0)) -;; (if (or server-info -;; (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. -;; (server:record->url server-info) -;; (let* ( (servers (server:choose-server areapath 'all-valid)) -;; (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0))) -;; (if (and (> try-num 0) ;; first time through simply wait a little while then try again -;; (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one -;; (server:run areapath)) -;; (thread-sleep! 5) -;; (loop (server:check-if-running areapath) -;; (+ try-num 1))))))) -;; -;; (define (server:get-num-servers #!key (numservers 2)) -;; (let ((ns (string->number -;; (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) -;; (or ns numservers))) -;; -;; ;; no longer care if multiple servers are started by accident. older servers will drop off in time. -;; ;; -;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) -;; (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed -;; (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath)))) -;; (if (or (and servers -;; (null? servers)) -;; (not servers)) -;; ;; (and (list? servers) -;; ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers -;; #f -;; (let loop ((hed (car servers)) -;; (tal (cdr servers))) -;; (let ((res (server:check-server hed))) -;; (if res -;; hed -;; (if (null? tal) -;; #f -;; (loop (car tal)(cdr tal))))))))) -;; -;; ;; ping the given server -;; ;; -;; (define (server:check-server server-record) -;; (let* ((server-url (server:record->url server-record)) -;; (server-id (server:record->id server-record)) -;; (res (server:ping server-url server-id))) -;; (if res -;; server-url -;; #f))) -;; -;; (define (server:kill servr) -;; (handle-exceptions -;; exn -;; (begin -;; (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn) -;; #f) -;; (match-let (((mod-time hostname port start-time server-id pid) -;; servr)) -;; (tasks:kill-server hostname pid)))) -;; -;; ;; called in megatest.scm, host-port is string hostname:port -;; ;; -;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running -;; ;; in the same process as the server. -;; ;; -;; (define (server:ping host:port server-id #!key (do-exit #f)) -;; (let* ((host-port (cond -;; ((string? host:port) -;; (let ((slst (string-split host:port ":"))) -;; (if (eq? (length slst) 2) -;; (list (car slst)(string->number (cadr slst))) -;; #f))) -;; (else -;; #f)))) -;; (cond -;; ((and (list? host-port) -;; (eq? (length host-port) 2)) -;; (let* ((myrunremote (make-remote)) -;; (iface (car host-port)) -;; (port (cadr host-port)) -;; (server-dat (client:connect iface port server-id myrunremote)) -;; (login-res (rmt:login-no-auto-client-setup myrunremote))) -;; (if (and (list? login-res) -;; (car login-res)) -;; (begin -;; ;; (print "LOGIN_OK") -;; (if do-exit (exit 0)) -;; #t) -;; (begin -;; ;; (print "LOGIN_FAILED") -;; (if do-exit (exit 1)) -;; #f)))) -;; (else -;; (if host:port -;; (debug:print 0 *default-log-port* "ERROR: bad host:port "host:port)) -;; (if do-exit -;; (exit 1) -;; #f))))) -;; -;; ;; run ping in separate process, safest way in some cases -;; ;; -;; (define (server:ping-server ifaceport) -;; (with-input-from-pipe -;; (conc (common:get-megatest-exe) " -ping " ifaceport) -;; (lambda () -;; (let loop ((inl (read-line)) -;; (res "NOREPLY")) -;; (if (eof-object? inl) -;; (case (string->symbol res) -;; ((NOREPLY) #f) -;; ((LOGIN_OK) #t) -;; (else #f)) -;; (loop (read-line) inl)))))) -;; -;; ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). -;; ;; -;; (define (server:login toppath) -;; (lambda (toppath) -;; (set! *db-last-access* (current-seconds)) ;; might not be needed. -;; (if (equal? *toppath* toppath) -;; #t -;; #f))) -;; -;; ;; timeout is hms string: 1h 5m 3s, default is 1 minute -;; ;; This is currently broken. Just use the number of hours with no unit. -;; ;; Default is 60 seconds. -;; ;; -;; (define (server:expiration-timeout) -;; (let ((tmo (configf:lookup *configdat* "server" "timeout"))) -;; (if (and (string? tmo) -;; (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below -;; (* 3600 (string->number tmo)) -;; 600))) -;; -;; (define (server:get-best-guess-address hostname) -;; (let ((res #f)) -;; (for-each -;; (lambda (adr) -;; (if (not (eq? (u8vector-ref adr 0) 127)) -;; (set! res adr))) -;; ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME -;; (vector->list (hostinfo-addresses (hostname->hostinfo hostname)))) -;; (string-intersperse -;; (map number->string -;; (u8vector->list -;; (if res res (hostname->ip hostname)))) "."))) -;; -;; ;; (define server:sync-lock-token "SERVER_SYNC_LOCK") -;; ;; (define (server:release-sync-lock) -;; ;; (db:no-sync-del! *no-sync-db* server:sync-lock-token)) -;; ;; (define (server:have-sync-lock?) -;; ;; (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token)) -;; ;; (have-lock? (car have-lock-pair)) -;; ;; (lock-time (cdr have-lock-pair)) -;; ;; (lock-age (- (current-seconds) lock-time))) +;; (define (get-client-server-id) +;; (if *client-server-id* *client-server-id* +;; (let ((sig (mk-signature))) ;; clients re-use the server:mk-signature logic +;; (set! *client-server-id* sig) +;; *client-server-id*))) + +;; if srvdat is #f calculate host.pid +(define (get-host.pid srvdat) + (if srvdat + (conc (srv-host srvdat)"."(srv-pid srvdat)) + (conc (get-host-name)"."(current-process-id)))) + +;; ;; ;; When using zmq this would send the message back (two step process) +;; ;; ;; with spiffy or rpc this simply returns the return data to be returned +;; ;; ;; +;; ;; (define (server:reply return-addr query-sig success/fail result) +;; ;; (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result) +;; ;; ;; (send-message pubsock target send-more: #t) +;; ;; ;; (send-message pubsock +;; ;; (case (server:get-transport) +;; ;; ((rpc) (db:obj->string (vector success/fail query-sig result))) +;; ;; ((http) (db:obj->string (vector success/fail query-sig result))) +;; ;; ((fs) result) +;; ;; (else +;; ;; (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) +;; ;; result))) +;; ;; +;; ;; ;; Given an area path, start a server process ### NOTE ### > file 2>&1 +;; ;; ;; if the target-host is set +;; ;; ;; try running on that host +;; ;; ;; incidental: rotate logs in logs/ dir. +;; ;; ;; +;; ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area +;; ;; (let* ((testsuite (common:get-testsuite-name)) +;; ;; (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) +;; ;; (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") +;; ;; "")) +;; ;; (cmdln (conc (common:get-megatest-exe) +;; ;; " -server - ";; (or target-host "-") +;; ;; (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") +;; ;; " -daemonize " +;; ;; "") +;; ;; ;; " -log " logfile +;; ;; " -m testsuite:" testsuite +;; ;; " " profile-mode +;; ;; )) ;; (conc " >> " logfile " 2>&1 &"))))) +;; ;; (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? +;; ;; (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) +;; ;; ;; we want the remote server to start in *toppath* so push there +;; ;; (push-directory areapath) +;; ;; (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") +;; ;; (thread-start! log-rotate) +;; ;; +;; ;; ;; host.domain.tld match host? +;; ;; ;; (if (and target-host +;; ;; ;; ;; look at target host, is it host.domain.tld or ip address and does it +;; ;; ;; ;; match current ip or hostname +;; ;; ;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) +;; ;; ;; (not (equal? curr-ip target-host))) +;; ;; ;; (begin +;; ;; ;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) +;; ;; ;; (setenv "TARGETHOST" target-host))) +;; ;; ;; +;; ;; (setenv "TARGETHOST_LOGF" logfile) +;; ;; (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time +;; ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) +;; ;; (system (conc "nbfake " cmdln)) +;; ;; (unsetenv "TARGETHOST_LOGF") +;; ;; ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) +;; ;; (thread-join! log-rotate) +;; ;; (pop-directory))) +;; ;; +;; ;; ;; given a path to a server log return: host port startseconds server-id +;; ;; ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let +;; ;; ;; example of what it's looking for in the log file: +;; ;; ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 +;; ;; +;; ;; (define (server:logf-get-start-info logf) +;; ;; (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id +;; ;; (dbprep-rx (regexp "^SERVER: dbprep")) +;; ;; (dbprep-found 0) +;; ;; (bad-dat (list #f #f #f #f #f))) +;; ;; (handle-exceptions +;; ;; exn +;; ;; (begin +;; ;; ;; WARNING: this is potentially dangerous to blanket ignore the errors +;; ;; (if (file-exists? logf) +;; ;; (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn)) +;; ;; bad-dat) ;; no idea what went wrong, call it a bad server +;; ;; (with-input-from-file +;; ;; logf +;; ;; (lambda () +;; ;; (let loop ((inl (read-line)) +;; ;; (lnum 0)) +;; ;; (if (not (eof-object? inl)) +;; ;; (let ((mlst (string-match server-rx inl)) +;; ;; (dbprep (string-match dbprep-rx inl))) +;; ;; (if dbprep (set! dbprep-found 1)) +;; ;; (if (not mlst) +;; ;; (if (< lnum 500) ;; give up if more than 500 lines of server log read +;; ;; (loop (read-line)(+ lnum 1)) +;; ;; (begin +;; ;; (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) +;; ;; bad-dat)) +;; ;; (match mlst +;; ;; ((_ host port start server-id pid) +;; ;; (list host +;; ;; (string->number port) +;; ;; (string->number start) +;; ;; server-id +;; ;; (string->number pid))) +;; ;; (else +;; ;; (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) +;; ;; bad-dat)))) +;; ;; (begin +;; ;; (if dbprep-found +;; ;; (begin +;; ;; (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) +;; ;; (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? +;; ;; (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) +;; ;; bad-dat)))))))) +;; ;; +;; ;; ;; ;; get a list of servers from the log files, with all relevant data +;; ;; ;; ;; ( mod-time host port start-time pid ) +;; ;; ;; ;; +;; ;; ;; (define (server:get-list areapath #!key (limit #f)) +;; ;; ;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) +;; ;; ;; (day-seconds (* 24 60 60))) +;; ;; ;; ;; if the directory exists continue to get the list +;; ;; ;; ;; otherwise attempt to create the logs dir and then +;; ;; ;; ;; continue +;; ;; ;; (if (if (directory-exists? (conc areapath "/logs")) +;; ;; ;; '() +;; ;; ;; (if (file-write-access? areapath) +;; ;; ;; (begin +;; ;; ;; (condition-case +;; ;; ;; (create-directory (conc areapath "/logs") #t) +;; ;; ;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) +;; ;; ;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) +;; ;; ;; (directory-exists? (conc areapath "/logs"))) +;; ;; ;; '())) +;; ;; ;; +;; ;; ;; ;; Get the list of server logs. +;; ;; ;; (let* ( +;; ;; ;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. +;; ;; ;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) +;; ;; ;; (server-logs (glob (conc areapath "/logs/server-*-*.log"))) +;; ;; ;; (num-serv-logs (length server-logs))) +;; ;; ;; (if (or (null? server-logs) (= num-serv-logs 0)) +;; ;; ;; (let () +;; ;; ;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) +;; ;; ;; '() +;; ;; ;; ) +;; ;; ;; (let loop ((hed (string-chomp (car server-logs))) +;; ;; ;; (tal (cdr server-logs)) +;; ;; ;; (res '())) +;; ;; ;; (let* ((mod-time (handle-exceptions +;; ;; ;; exn +;; ;; ;; (begin +;; ;; ;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) +;; ;; ;; (current-seconds)) ;; 0 +;; ;; ;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted +;; ;; ;; (down-time (- (current-seconds) mod-time)) +;; ;; ;; (serv-dat (if (or (< num-serv-logs 10) +;; ;; ;; (< down-time 900)) ;; day-seconds)) +;; ;; ;; (server:logf-get-start-info hed) +;; ;; ;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at +;; ;; ;; (serv-rec (cons mod-time serv-dat)) +;; ;; ;; (fmatch (string-match fname-rx hed)) +;; ;; ;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) +;; ;; ;; (new-res (if (null? serv-dat) +;; ;; ;; res +;; ;; ;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let +;; ;; ;; (if (null? tal) +;; ;; ;; (if (and limit +;; ;; ;; (> (length new-res) limit)) +;; ;; ;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work +;; ;; ;; new-res) +;; ;; ;; (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) +;; ;; +;; ;; #;(define (server:get-num-alive srvlst) +;; ;; (let ((num-alive 0)) +;; ;; (for-each +;; ;; (lambda (server) +;; ;; (handle-exceptions +;; ;; exn +;; ;; (begin +;; ;; (debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn)) +;; ;; (match-let (((mod-time host port start-time server-id pid) +;; ;; server)) +;; ;; (let* ((uptime (- (current-seconds) mod-time)) +;; ;; (runtime (if start-time +;; ;; (- mod-time start-time) +;; ;; 0))) +;; ;; (if (< uptime 5)(set! num-alive (+ num-alive 1))))))) +;; ;; srvlst) +;; ;; num-alive)) +;; ;; +;; ;; ;; ;; given a list of servers get a list of valid servers, i.e. at least +;; ;; ;; ;; 10 seconds old, has started and is less than 1 hour old and is +;; ;; ;; ;; active (i.e. mod-time < 10 seconds +;; ;; ;; ;; +;; ;; ;; ;; mod-time host port start-time pid +;; ;; ;; ;; +;; ;; ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off +;; ;; ;; ;; and servers should stick around for about two hours or so. +;; ;; ;; ;; +;; ;; ;; (define (server:get-best srvlst) +;; ;; ;; (let* ((nums (server:get-num-servers)) +;; ;; ;; (now (current-seconds)) +;; ;; ;; (slst (sort +;; ;; ;; (filter (lambda (rec) +;; ;; ;; (if (and (list? rec) +;; ;; ;; (> (length rec) 2)) +;; ;; ;; (let ((start-time (list-ref rec 3)) +;; ;; ;; (mod-time (list-ref rec 0))) +;; ;; ;; ;; (print "start-time: " start-time " mod-time: " mod-time) +;; ;; ;; (and start-time mod-time +;; ;; ;; (> (- now start-time) 0) ;; been running at least 0 seconds +;; ;; ;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds +;; ;; ;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set +;; ;; ;; (< (- now start-time) +;; ;; ;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) +;; ;; ;; 180) +;; ;; ;; (random 360)))) ;; under one hour running time +/- 180 +;; ;; ;; )) +;; ;; ;; #f)) +;; ;; ;; srvlst) +;; ;; ;; (lambda (a b) +;; ;; ;; (< (list-ref a 3) +;; ;; ;; (list-ref b 3)))))) +;; ;; ;; (if (> (length slst) nums) +;; ;; ;; (take slst nums) +;; ;; ;; slst))) +;; ;; +;; ;; ;; ;; switch from server:get-list to server:get-servers-info +;; ;; ;; ;; +;; ;; ;; (define (server:get-first-best areapath) +;; ;; ;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; ;; ;; (if (and srvrs +;; ;; ;; (not (null? srvrs))) +;; ;; ;; (car srvrs) +;; ;; ;; #f))) +;; ;; ;; +;; ;; ;; (define (server:get-rand-best areapath) +;; ;; ;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; ;; ;; (if (and (list? srvrs) +;; ;; ;; (not (null? srvrs))) +;; ;; ;; (let* ((len (length srvrs)) +;; ;; ;; (idx (random len))) +;; ;; ;; (list-ref srvrs idx)) +;; ;; ;; #f))) +;; ;; +;; ;; (define (server:record->id servr) +;; ;; (handle-exceptions +;; ;; exn +;; ;; (begin +;; ;; (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn) +;; ;; #f) +;; ;; (match-let (((host port start-time server-id pid) +;; ;; servr)) +;; ;; (if server-id +;; ;; server-id +;; ;; #f)))) +;; ;; +;; ;; (define (server:record->url servr) +;; ;; (handle-exceptions +;; ;; exn +;; ;; (begin +;; ;; (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn) +;; ;; #f) +;; ;; (match-let (((host port start-time server-id pid) +;; ;; servr)) +;; ;; (if (and host port) +;; ;; (conc host ":" port) +;; ;; #f)))) +;; ;; +;; ;; +;; ;; ;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough. +;; ;; ;; if it is old enough, overwrite it and wait 0.25 seconds. +;; ;; ;; if it then has the wrong server key, wait + 1 and call this function recursively. +;; ;; ;; +;; ;; #;(define (server:wait-for-server-start-last-flag areapath) +;; ;; (let* ((start-flag (conc areapath "/logs/server-start-last")) +;; ;; ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) +;; ;; (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) +;; ;; (server-key (conc (get-host-name) "-" (current-process-id)))) +;; ;; (if (file-exists? start-flag) +;; ;; (let* ((fmodtime (file-modification-time start-flag)) +;; ;; (delta (- (current-seconds) fmodtime)) +;; ;; (old-enough (> delta idletime)) +;; ;; (new-server-key "")) +;; ;; ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than seconds, and the new file still has the same server key as you just wrote, return #t. +;; ;; ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process. +;; ;; (if (and old-enough +;; ;; (begin +;; ;; (debug:print-info 2 *default-log-port* "Writing " start-flag) +;; ;; (with-output-to-file start-flag (lambda () (print server-key))) +;; ;; (thread-sleep! 0.25) +;; ;; (set! new-server-key (with-input-from-file start-flag (lambda () (read-line)))) +;; ;; (equal? server-key new-server-key))) +;; ;; #t +;; ;; ;; If either of the above conditions is not true, print a "Gating server start" message, wait + 1, then call this function recursively. +;; ;; (begin +;; ;; (debug:print-info 0 *default-log-port* "Gating server start, last start: " +;; ;; (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) +;; ;; +;; ;; (thread-sleep! ( + 1 idletime)) +;; ;; (server:wait-for-server-start-last-flag areapath))))))) +;; ;; +;; ;; ;; oldest server alive determines host then choose random of youngest +;; ;; ;; five servers on that host +;; ;; ;; +;; ;; (define (server:get-servers-info areapath) +;; ;; ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") +;; ;; (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo"))) +;; ;; (if (not (file-exists? servinfodir)) +;; ;; (create-directory servinfodir)) +;; ;; (let* ((allfiles (glob (conc servinfodir"/*"))) +;; ;; (res (make-hash-table))) +;; ;; (for-each +;; ;; (lambda (f) +;; ;; (let* ((hostport (pathname-strip-directory f)) +;; ;; (serverdat (server:logf-get-start-info f))) +;; ;; (match serverdat +;; ;; ((host port start server-id pid) +;; ;; (if (and host port start server-id pid) +;; ;; (hash-table-set! res hostport serverdat) +;; ;; (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))) +;; ;; (else +;; ;; (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))))) +;; ;; allfiles) +;; ;; res))) +;; ;; +;; ;; ;; check the .servinfo directory, are there other servers running on this +;; ;; ;; or another host? +;; ;; ;; +;; ;; ;; returns #t => ok to start another server +;; ;; ;; #f => not ok to start another server +;; ;; ;; +;; ;; (define (server:minimal-check areapath) +;; ;; (server:clean-up-old areapath) +;; ;; (let* ((srvdir (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo")) +;; ;; (servrs (glob (conc srvdir"/*"))) +;; ;; (thishostip (server:get-best-guess-address (get-host-name))) +;; ;; (thisservrs (glob (conc srvdir"/"thishostip":*"))) +;; ;; (homehostinf (server:choose-server areapath 'homehost)) +;; ;; (havehome (car homehostinf)) +;; ;; (wearehome (cdr homehostinf))) +;; ;; (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome +;; ;; ", numservers: "(length thisservrs)) +;; ;; (cond +;; ;; ((not havehome) #t) ;; no homehost yet, go for it +;; ;; ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another +;; ;; ((and havehome (not wearehome)) #f) ;; we are not the home host +;; ;; ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running +;; ;; (else +;; ;; (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs) +;; ;; #t)))) +;; ;; +;; ;; +;; ;; (define server-last-start 0) +;; ;; +;; ;; +;; ;; ;; oldest server alive determines host then choose random of youngest +;; ;; ;; five servers on that host +;; ;; ;; +;; ;; ;; mode: +;; ;; ;; best - get best server (random of newest five) +;; ;; ;; home - get home host based on oldest server +;; ;; ;; info - print info +;; ;; (define (server:choose-server areapath #!optional (mode 'best)) +;; ;; ;; age is current-starttime +;; ;; ;; find oldest alive +;; ;; ;; 1. sort by age ascending and ping until good +;; ;; ;; find alive rand from youngest +;; ;; ;; 1. sort by age descending +;; ;; ;; 2. take five +;; ;; ;; 3. check alive, discard if not and repeat +;; ;; ;; first we clean up old server files +;; ;; (server:clean-up-old areapath) +;; ;; (let* ((since-last (- (current-seconds) server-last-start)) +;; ;; (server-start-delay 10)) +;; ;; (if ( < (- (current-seconds) server-last-start) 10 ) +;; ;; (begin +;; ;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) +;; ;; (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") +;; ;; (thread-sleep! server-start-delay) +;; ;; ) +;; ;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) +;; ;; ) +;; ;; ) +;; ;; (let* ((serversdat (server:get-servers-info areapath)) +;; ;; (servkeys (hash-table-keys serversdat)) +;; ;; (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last +;; ;; (sort servkeys ;; list of "host:port" +;; ;; (lambda (a b) +;; ;; (>= (list-ref (hash-table-ref serversdat a) 2) +;; ;; (list-ref (hash-table-ref serversdat b) 2)))) +;; ;; '()))) +;; ;; (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat) +;; ;; (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys) +;; ;; (if (not (null? by-time-asc)) +;; ;; (let* ((oldest (last by-time-asc)) +;; ;; (oldest-dat (hash-table-ref serversdat oldest)) +;; ;; (host (list-ref oldest-dat 0)) +;; ;; (all-valid (filter (lambda (x) +;; ;; (equal? host (list-ref (hash-table-ref serversdat x) 0))) +;; ;; by-time-asc)) +;; ;; (best-ten (lambda () +;; ;; (if (> (length all-valid) 11) +;; ;; (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out +;; ;; (if (> (length all-valid) 8) +;; ;; (drop-right all-valid 1) +;; ;; all-valid)))) +;; ;; (names->dats (lambda (names) +;; ;; (map (lambda (x) +;; ;; (hash-table-ref serversdat x)) +;; ;; names))) +;; ;; (am-home? (lambda () +;; ;; (let* ((currhost (get-host-name)) +;; ;; (bestadrs (server:get-best-guess-address currhost))) +;; ;; (or (equal? host currhost) +;; ;; (equal? host bestadrs)))))) +;; ;; (case mode +;; ;; ((info) +;; ;; (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) +;; ;; (print "youngest: "(hash-table-ref serversdat (car all-valid)))) +;; ;; ((home) host) +;; ;; ((homehost) (cons host (am-home?))) ;; shut up old code +;; ;; ((home?) (am-home?)) +;; ;; ((best-ten)(names->dats (best-ten))) +;; ;; ((all-valid)(names->dats all-valid)) +;; ;; ((best) (let* ((best-ten (best-ten)) +;; ;; (len (length best-ten))) +;; ;; (hash-table-ref serversdat (list-ref best-ten (random len))))) +;; ;; ((count)(length all-valid)) +;; ;; (else +;; ;; (debug:print 0 *default-log-port* "ERROR: invalid command "mode) +;; ;; #f))) +;; ;; (begin +;; ;; (server:run areapath) +;; ;; (set! server-last-start (current-seconds)) +;; ;; ;; (thread-sleep! 3) +;; ;; (case mode +;; ;; ((homehost) (cons #f #f)) +;; ;; (else #f)))))) +;; ;; +;; ;; (define (server:get-servinfo-dir areapath) +;; ;; (let* ((spath (conc areapath"/.servinfo"))) +;; ;; (if (not (file-exists? spath)) +;; ;; (create-directory spath #t)) +;; ;; spath)) +;; ;; +;; ;; (define (server:clean-up-old areapath) +;; ;; ;; any server file that has not been touched in ten minutes is effectively dead +;; ;; (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*")))) +;; ;; (for-each +;; ;; (lambda (sfile) +;; ;; (let* ((modtime (handle-exceptions +;; ;; exn +;; ;; (begin +;; ;; (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile) +;; ;; (current-seconds)) +;; ;; (file-modification-time sfile)))) +;; ;; (if (and (number? modtime) +;; ;; (> (- (current-seconds) modtime) +;; ;; 600)) +;; ;; (begin +;; ;; (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.") +;; ;; (handle-exceptions +;; ;; exn +;; ;; (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile) +;; ;; (delete-file sfile)))))) +;; ;; sfiles))) +;; ;; +;; ;; ;; would like to eventually get rid of this +;; ;; ;; +;; ;; (define (common:on-homehost?) +;; ;; (server:choose-server *toppath* 'home?)) +;; ;; +;; ;; ;; kind start up of server, wait before allowing another server for a given +;; ;; ;; area to be launched +;; ;; ;; +;; ;; (define (server:kind-run areapath) +;; ;; ;; look for $MT_RUN_AREA_HOME/logs/server-start-last +;; ;; ;; and wait for it to be at least seconds old +;; ;; ;; (server:wait-for-server-start-last-flag areapath) +;; ;; (let loop () +;; ;; (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2) +;; ;; (begin +;; ;; (if (common:low-noise-print 30 "our-host-load") +;; ;; (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server.")) +;; ;; (loop)))) +;; ;; (if (< (server:choose-server areapath 'count) 20) +;; ;; (server:run areapath)) +;; ;; #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? +;; ;; (let* ((lock-file (conc areapath "/logs/server-start.lock"))) +;; ;; (let* ((start-flag (conc areapath "/logs/server-start-last"))) +;; ;; (common:simple-file-lock-and-wait lock-file expire-time: 25) +;; ;; (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) +;; ;; (system (conc "touch " start-flag)) ;; lazy but safe +;; ;; (server:run areapath) +;; ;; (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". +;; ;; (common:simple-file-release-lock lock-file))) +;; ;; (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another."))) +;; ;; +;; ;; ;; this one seems to be the general entry point +;; ;; ;; +;; ;; (define (server:start-and-wait areapath #!key (timeout 60)) +;; ;; (let ((give-up-time (+ (current-seconds) timeout))) +;; ;; (let loop ((server-info (server:check-if-running areapath)) +;; ;; (try-num 0)) +;; ;; (if (or server-info +;; ;; (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. +;; ;; (server:record->url server-info) +;; ;; (let* ( (servers (server:choose-server areapath 'all-valid)) +;; ;; (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0))) +;; ;; (if (and (> try-num 0) ;; first time through simply wait a little while then try again +;; ;; (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one +;; ;; (server:run areapath)) +;; ;; (thread-sleep! 5) +;; ;; (loop (server:check-if-running areapath) +;; ;; (+ try-num 1))))))) +;; ;; +;; ;; (define (server:get-num-servers #!key (numservers 2)) +;; ;; (let ((ns (string->number +;; ;; (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) +;; ;; (or ns numservers))) +;; ;; +;; ;; ;; no longer care if multiple servers are started by accident. older servers will drop off in time. +;; ;; ;; +;; ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) +;; ;; (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed +;; ;; (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath)))) +;; ;; (if (or (and servers +;; ;; (null? servers)) +;; ;; (not servers)) +;; ;; ;; (and (list? servers) +;; ;; ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers +;; ;; #f +;; ;; (let loop ((hed (car servers)) +;; ;; (tal (cdr servers))) +;; ;; (let ((res (server:check-server hed))) +;; ;; (if res +;; ;; hed +;; ;; (if (null? tal) +;; ;; #f +;; ;; (loop (car tal)(cdr tal))))))))) +;; ;; +;; ;; ;; ping the given server +;; ;; ;; +;; ;; (define (server:check-server server-record) +;; ;; (let* ((server-url (server:record->url server-record)) +;; ;; (server-id (server:record->id server-record)) +;; ;; (res (server:ping server-url server-id))) +;; ;; (if res +;; ;; server-url +;; ;; #f))) +;; ;; +;; ;; (define (server:kill servr) +;; ;; (handle-exceptions +;; ;; exn +;; ;; (begin +;; ;; (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn) +;; ;; #f) +;; ;; (match-let (((mod-time hostname port start-time server-id pid) +;; ;; servr)) +;; ;; (tasks:kill-server hostname pid)))) +;; ;; +;; ;; ;; called in megatest.scm, host-port is string hostname:port +;; ;; ;; +;; ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running +;; ;; ;; in the same process as the server. +;; ;; ;; +;; ;; (define (server:ping host:port server-id #!key (do-exit #f)) +;; ;; (let* ((host-port (cond +;; ;; ((string? host:port) +;; ;; (let ((slst (string-split host:port ":"))) +;; ;; (if (eq? (length slst) 2) +;; ;; (list (car slst)(string->number (cadr slst))) +;; ;; #f))) +;; ;; (else +;; ;; #f)))) ;; ;; (cond -;; ;; (have-lock? #t) -;; ;; ((>lock-age -;; ;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180))) -;; ;; (server:release-sync-lock) -;; ;; (server:have-sync-lock?)) -;; ;; (else #f)))) -;; -;; ;; moving this here as it needs access to db and cannot be in common. -;; ;; -;; -;; (define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f)) -;; (debug:print "WARNING: bruteforce-syncer is called but has been disabled!") -;; (lambda () -;; (debug:print "WARNING: bruteforce-syncer is called but has been disabled!")) -;; #;(let* ((sqlite-exe (or (get-environment-variable "MT_SQLITE3_EXE"))) ;; defined in cfg.sh -;; (sync-log (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log"))) -;; (tmp-area (common:get-db-tmp-area)) -;; (tmp-db (conc tmp-area "/megatest.db")) -;; (staging-file (conc *toppath* "/.megatest.db")) -;; (mtdbfile (conc *toppath* "/megatest.db")) -;; (lockfile (common:get-sync-lock-filepath)) -;; (sync-cmd-core (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log)) -;; (sync-cmd (if fork-to-background -;; (conc "/usr/bin/env NBFAKE_LOG="*toppath*"/logs/last-server-sync-"(current-process-id)".log nbfake \""sync-cmd-core" && /bin/mv -f " staging-file " " mtdbfile" \"") -;; sync-cmd-core)) -;; (default-min-intersync-delay 2) -;; (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: default-min-intersync-delay)) -;; (default-duty-cycle 0.1) -;; (duty-cycle (configf:lookup-number *configdat* "server" "sync-duty-cycle" default: default-duty-cycle)) -;; (last-sync-seconds 10) ;; we will adjust this to a measurement and delay last-sync-seconds * (1 - duty-cycle) -;; (calculate-off-time (lambda (work-duration duty-cycle) -;; (* (/ (- 1 duty-cycle) duty-cycle) last-sync-seconds))) -;; (off-time min-intersync-delay) ;; adjusted in closure below. -;; (do-a-sync -;; (lambda () -;; (BB> "Start do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync) -;; (let* ((finalres -;; (let retry-loop ((num-tries 0)) -;; (if (common:simple-file-lock lockfile) -;; (begin -;; (cond -;; ((not (or fork-to-background persist-until-sync)) -;; (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for max of (server.minimum-intersync-delay="min-intersync-delay -;; " , off-time="off-time" seconds ]") -;; (thread-sleep! (max off-time min-intersync-delay))) -;; (else -;; (debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit..."))) -;; -;; (if (not (configf:lookup *configdat* "server" "disable-db-snapshot")) -;; (common:snapshot-file mtdbfile subdir: ".db-snapshot")) -;; (delete-file* staging-file) -;; (let* ((start-time (current-milliseconds)) -;; (res (system sync-cmd)) -;; (dbbackupfile (conc mtdbfile ".backup")) -;; (res2 -;; (cond -;; ((eq? 0 res ) -;; (handle-exceptions -;; exn -;; #f -;; (if (file-exists? dbbackupfile) -;; (delete-file* dbbackupfile) -;; ) -;; (if (eq? 0 (file-size sync-log)) -;; (delete-file* sync-log)) -;; (system (conc "/bin/mv " staging-file " " mtdbfile)) -;; -;; (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) -;; (set! off-time (calculate-off-time -;; last-sync-seconds -;; (cond -;; ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1)) -;; duty-cycle) -;; (else -;; (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid. Should be a number between 0 and 1, but "duty-cycle" was specified. Using default value: "default-duty-cycle) -;; default-duty-cycle)))) -;; -;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec") -;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time) -;; 'sync-completed)) -;; (else -;; (system (conc "/bin/cp "sync-log" "sync-log".fail")) -;; (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") -;; (if (file-exists? (conc mtdbfile ".backup")) -;; (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile))) -;; #f)))) -;; (common:simple-file-release-lock lockfile) -;; (BB> "released lockfile: " lockfile) -;; (when (common:file-exists? lockfile) -;; (BB> "DID NOT ACTUALLY RELEASE LOCKFILE")) -;; res2) ;; end let -;; );; end begin -;; ;; else -;; (cond -;; (persist-until-sync -;; (thread-sleep! 1) -;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; we're in a fork-to-background so we need to succeed. Let's wait a jiffy and and try again. num-tries="num-tries" (waiting for lockfile="lockfile" to disappear)") -;; (retry-loop (add1 num-tries))) -;; (else -;; (thread-sleep! (max off-time (+ last-sync-seconds min-intersync-delay))) -;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.") -;; 'parallel-sync-in-progress)) -;; ) ;; end if got lockfile -;; ) -;; )) -;; (BB> "End do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync" and result="finalres) -;; finalres) -;; ) ;; end lambda -;; )) -;; do-a-sync)) -;; -;; +;; ;; ((and (list? host-port) +;; ;; (eq? (length host-port) 2)) +;; ;; (let* ((myrunremote (make-remote)) +;; ;; (iface (car host-port)) +;; ;; (port (cadr host-port)) +;; ;; (server-dat (client:connect iface port server-id myrunremote)) +;; ;; (login-res (rmt:login-no-auto-client-setup myrunremote))) +;; ;; (if (and (list? login-res) +;; ;; (car login-res)) +;; ;; (begin +;; ;; ;; (print "LOGIN_OK") +;; ;; (if do-exit (exit 0)) +;; ;; #t) +;; ;; (begin +;; ;; ;; (print "LOGIN_FAILED") +;; ;; (if do-exit (exit 1)) +;; ;; #f)))) +;; ;; (else +;; ;; (if host:port +;; ;; (debug:print 0 *default-log-port* "ERROR: bad host:port "host:port)) +;; ;; (if do-exit +;; ;; (exit 1) +;; ;; #f))))) +;; ;; +;; ;; ;; run ping in separate process, safest way in some cases +;; ;; ;; +;; ;; (define (server:ping-server ifaceport) +;; ;; (with-input-from-pipe +;; ;; (conc (common:get-megatest-exe) " -ping " ifaceport) +;; ;; (lambda () +;; ;; (let loop ((inl (read-line)) +;; ;; (res "NOREPLY")) +;; ;; (if (eof-object? inl) +;; ;; (case (string->symbol res) +;; ;; ((NOREPLY) #f) +;; ;; ((LOGIN_OK) #t) +;; ;; (else #f)) +;; ;; (loop (read-line) inl)))))) +;; ;; +;; ;; ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). +;; ;; ;; +;; ;; (define (server:login toppath) +;; ;; (lambda (toppath) +;; ;; (set! *db-last-access* (current-seconds)) ;; might not be needed. +;; ;; (if (equal? *toppath* toppath) +;; ;; #t +;; ;; #f))) +;; ;; +;; ;; ;; timeout is hms string: 1h 5m 3s, default is 1 minute +;; ;; ;; This is currently broken. Just use the number of hours with no unit. +;; ;; ;; Default is 60 seconds. +;; ;; ;; +;; ;; (define (server:expiration-timeout) +;; ;; (let ((tmo (configf:lookup *configdat* "server" "timeout"))) +;; ;; (if (and (string? tmo) +;; ;; (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below +;; ;; (* 3600 (string->number tmo)) +;; ;; 600))) +;; ;; +;; ;; (define (server:get-best-guess-address hostname) +;; ;; (let ((res #f)) +;; ;; (for-each +;; ;; (lambda (adr) +;; ;; (if (not (eq? (u8vector-ref adr 0) 127)) +;; ;; (set! res adr))) +;; ;; ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME +;; ;; (vector->list (hostinfo-addresses (hostname->hostinfo hostname)))) +;; ;; (string-intersperse +;; ;; (map number->string +;; ;; (u8vector->list +;; ;; (if res res (hostname->ip hostname)))) "."))) +;; ;; +;; ;; ;; (define server:sync-lock-token "SERVER_SYNC_LOCK") +;; ;; ;; (define (server:release-sync-lock) +;; ;; ;; (db:no-sync-del! *no-sync-db* server:sync-lock-token)) +;; ;; ;; (define (server:have-sync-lock?) +;; ;; ;; (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token)) +;; ;; ;; (have-lock? (car have-lock-pair)) +;; ;; ;; (lock-time (cdr have-lock-pair)) +;; ;; ;; (lock-age (- (current-seconds) lock-time))) +;; ;; ;; (cond +;; ;; ;; (have-lock? #t) +;; ;; ;; ((>lock-age +;; ;; ;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180))) +;; ;; ;; (server:release-sync-lock) +;; ;; ;; (server:have-sync-lock?)) +;; ;; ;; (else #f)))) +;; ;; +;; ;; ;; moving this here as it needs access to db and cannot be in common. +;; ;; ;; +;; ;; +;; ;; (define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f)) +;; ;; (debug:print "WARNING: bruteforce-syncer is called but has been disabled!") +;; ;; (lambda () +;; ;; (debug:print "WARNING: bruteforce-syncer is called but has been disabled!")) +;; ;; #;(let* ((sqlite-exe (or (get-environment-variable "MT_SQLITE3_EXE"))) ;; defined in cfg.sh +;; ;; (sync-log (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log"))) +;; ;; (tmp-area (common:get-db-tmp-area)) +;; ;; (tmp-db (conc tmp-area "/megatest.db")) +;; ;; (staging-file (conc *toppath* "/.megatest.db")) +;; ;; (mtdbfile (conc *toppath* "/megatest.db")) +;; ;; (lockfile (common:get-sync-lock-filepath)) +;; ;; (sync-cmd-core (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log)) +;; ;; (sync-cmd (if fork-to-background +;; ;; (conc "/usr/bin/env NBFAKE_LOG="*toppath*"/logs/last-server-sync-"(current-process-id)".log nbfake \""sync-cmd-core" && /bin/mv -f " staging-file " " mtdbfile" \"") +;; ;; sync-cmd-core)) +;; ;; (default-min-intersync-delay 2) +;; ;; (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: default-min-intersync-delay)) +;; ;; (default-duty-cycle 0.1) +;; ;; (duty-cycle (configf:lookup-number *configdat* "server" "sync-duty-cycle" default: default-duty-cycle)) +;; ;; (last-sync-seconds 10) ;; we will adjust this to a measurement and delay last-sync-seconds * (1 - duty-cycle) +;; ;; (calculate-off-time (lambda (work-duration duty-cycle) +;; ;; (* (/ (- 1 duty-cycle) duty-cycle) last-sync-seconds))) +;; ;; (off-time min-intersync-delay) ;; adjusted in closure below. +;; ;; (do-a-sync +;; ;; (lambda () +;; ;; (BB> "Start do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync) +;; ;; (let* ((finalres +;; ;; (let retry-loop ((num-tries 0)) +;; ;; (if (common:simple-file-lock lockfile) +;; ;; (begin +;; ;; (cond +;; ;; ((not (or fork-to-background persist-until-sync)) +;; ;; (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for max of (server.minimum-intersync-delay="min-intersync-delay +;; ;; " , off-time="off-time" seconds ]") +;; ;; (thread-sleep! (max off-time min-intersync-delay))) +;; ;; (else +;; ;; (debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit..."))) +;; ;; +;; ;; (if (not (configf:lookup *configdat* "server" "disable-db-snapshot")) +;; ;; (common:snapshot-file mtdbfile subdir: ".db-snapshot")) +;; ;; (delete-file* staging-file) +;; ;; (let* ((start-time (current-milliseconds)) +;; ;; (res (system sync-cmd)) +;; ;; (dbbackupfile (conc mtdbfile ".backup")) +;; ;; (res2 +;; ;; (cond +;; ;; ((eq? 0 res ) +;; ;; (handle-exceptions +;; ;; exn +;; ;; #f +;; ;; (if (file-exists? dbbackupfile) +;; ;; (delete-file* dbbackupfile) +;; ;; ) +;; ;; (if (eq? 0 (file-size sync-log)) +;; ;; (delete-file* sync-log)) +;; ;; (system (conc "/bin/mv " staging-file " " mtdbfile)) +;; ;; +;; ;; (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) +;; ;; (set! off-time (calculate-off-time +;; ;; last-sync-seconds +;; ;; (cond +;; ;; ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1)) +;; ;; duty-cycle) +;; ;; (else +;; ;; (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid. Should be a number between 0 and 1, but "duty-cycle" was specified. Using default value: "default-duty-cycle) +;; ;; default-duty-cycle)))) +;; ;; +;; ;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec") +;; ;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time) +;; ;; 'sync-completed)) +;; ;; (else +;; ;; (system (conc "/bin/cp "sync-log" "sync-log".fail")) +;; ;; (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") +;; ;; (if (file-exists? (conc mtdbfile ".backup")) +;; ;; (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile))) +;; ;; #f)))) +;; ;; (common:simple-file-release-lock lockfile) +;; ;; (BB> "released lockfile: " lockfile) +;; ;; (when (common:file-exists? lockfile) +;; ;; (BB> "DID NOT ACTUALLY RELEASE LOCKFILE")) +;; ;; res2) ;; end let +;; ;; );; end begin +;; ;; ;; else +;; ;; (cond +;; ;; (persist-until-sync +;; ;; (thread-sleep! 1) +;; ;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; we're in a fork-to-background so we need to succeed. Let's wait a jiffy and and try again. num-tries="num-tries" (waiting for lockfile="lockfile" to disappear)") +;; ;; (retry-loop (add1 num-tries))) +;; ;; (else +;; ;; (thread-sleep! (max off-time (+ last-sync-seconds min-intersync-delay))) +;; ;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.") +;; ;; 'parallel-sync-in-progress)) +;; ;; ) ;; end if got lockfile +;; ;; ) +;; ;; )) +;; ;; (BB> "End do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync" and result="finalres) +;; ;; finalres) +;; ;; ) ;; end lambda +;; ;; )) +;; ;; do-a-sync)) +;; ;; +;; ;; )