Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -141,11 +141,11 @@ (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) - (server-url (if *toppath* (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*) #f)) + (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -99,10 +99,14 @@ (if (args:get-arg "-h") (begin (print help) (exit))) +(if (not (common:on-homehost?)) + (begin + (debug:print 0 "ERROR: Current policy requires running dashboard on homehost: " (common:get-homehost)))) + ;; TODO: Move this inside (main) ;; (if (not (launch:setup)) (begin (print "Failed to find megatest.config, exiting") Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -14,11 +14,11 @@ ;;====================================================================== ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc (use (srfi 18) extras tcp stack) ;; RADT => use of require-extension? -(use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest base64 format dot-locking z3 typed-records) +(use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest base64 format dot-locking z3 typed-records matchable) (import (prefix sqlite3 sqlite3:)) (import (prefix base64 base64:)) ;; RADT => prefix?? (declare (unit db)) (declare (uses common)) @@ -825,20 +825,21 @@ (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.") (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) (tmpdb (db:get-db dbstruct)) (refndb (dbr:dbstruct-refndb dbstruct)) (allow-cleanup #t) ;; (if run-ids #f #t)) - (tdbdat (tasks:open-db)) - (servers (tasks:get-all-servers (db:delay-if-busy tdbdat))) + ;; (tdbdat (tasks:open-db)) + (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat))) (data-synced 0)) ;; count of changed records (I hope) ;; kill servers (if (member 'killservers options) (for-each (lambda (server) - (tasks:server-delete-record (db:delay-if-busy tdbdat) (vector-ref server 0) "dbmigration") - (tasks:kill-server (vector-ref server 2)(vector-ref server 1))) + (match-let (((mod-time host port start-time pid) server)) + (if (and host pid) + (tasks:kill-server host pid)))) servers)) ;; clear out junk records ;; (if (member 'dejunk options) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -47,11 +47,11 @@ ;; Call this to start the actual server ;; (define *db:process-queue-mutex* (make-mutex)) -(define (http-transport:run hostn run-id server-id) +(define (http-transport:run hostn) (debug:print 2 *default-log-port* "Attempting to start the server ...") (let* ((db #f) ;; (open-db)) ;; we don't want the server to be opening and closing the db unnecesarily (hostname (get-host-name)) (ipaddrstr (let ((ipstr (if (string=? "-" hostn) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") @@ -104,18 +104,17 @@ ((equal? (uri-path (request-uri (current-request))) '(/ "hey")) (send-response body: "hey there!\n" headers: '((content-type text/plain)))) (else (continue)))))))) - (http-transport:try-start-server run-id ipaddrstr start-port server-id))) + (http-transport:try-start-server ipaddrstr start-port))) ;; This is recursively run by http-transport:run until sucessful ;; -(define (http-transport:try-start-server run-id ipaddrstr portnum server-id) - (let ((config-hostname (configf:lookup *configdat* "server" "hostname")) - (tdbdat (tasks:open-db))) - (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " run-id=" run-id " ipaddrsstr=" ipaddrstr " portnum=" portnum " server-id=" server-id " config-hostname=" config-hostname) +(define (http-transport:try-start-server ipaddrstr portnum) + (let ((config-hostname (configf:lookup *configdat* "server" "hostname"))) + (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " ipaddrsstr=" ipaddrstr " portnum=" portnum " config-hostname=" config-hostname) (handle-exceptions exn (begin (print-error-message exn) (if (< portnum 64000) @@ -126,34 +125,26 @@ (portlogger:open-run-close portlogger:set-failed portnum) (debug:print 0 *default-log-port* "WARNING: failed to start on portnum: " portnum ", trying next port") (thread-sleep! 0.1) ;; get_next_port goes here - (http-transport:try-start-server run-id - ipaddrstr - (portlogger:open-run-close portlogger:find-port) - server-id)) + (http-transport:try-start-server ipaddrstr + (portlogger:open-run-close portlogger:find-port))) (begin - (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server") (print "ERROR: Tried and tried but could not start the server")))) ;; any error in following steps will result in a retry (set! *server-info* (list ipaddrstr portnum)) - (tasks:server-set-interface-port - (db:delay-if-busy tdbdat) - server-id - ipaddrstr portnum) (debug:print 0 *default-log-port* "INFO: Trying to start server on " ipaddrstr ":" portnum) ;; This starts the spiffy server ;; NEED WAY TO SET IP TO #f TO BIND ALL ;; (start-server bind-address: ipaddrstr port: portnum) (if config-hostname ;; this is a hint to bind directly (start-server port: portnum bind-address: (if (equal? config-hostname "-") ipaddrstr config-hostname)) (start-server port: portnum)) - ;; (portlogger:open-run-close portlogger:set-port portnum "released") - (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server") + (portlogger:open-run-close portlogger:set-port portnum "released") (debug:print 1 *default-log-port* "INFO: server has been stopped")))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== @@ -341,17 +332,16 @@ server-dat)) ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; -(define (http-transport:keep-running server-id run-id) +(define (http-transport:keep-running) ;; if none running or if > 20 seconds since ;; server last used then start shutdown ;; This thread waits for the server to come alive - (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server for run-id=" run-id) - (let* ((tdbdat (tasks:open-db)) - (server-start-time (current-seconds)) + (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server") + (let* ((server-start-time (current-seconds)) (server-info (let loop ((start-time (current-seconds)) (changed #t) (last-sdat "not this")) (let ((sdat #f)) (thread-sleep! 0.01) @@ -368,12 +358,11 @@ (begin (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes (begin - (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server " server-id " for run " run-id) - (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature") + (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") (exit)) (loop start-time (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) @@ -383,33 +372,18 @@ (server-going #f)) (let loop ((count 0) (server-state 'available) (bad-sync-count 0) (start-time (current-milliseconds))) - ;;(BB> "http-transport: top of loop; count="count" server-state="server-state" bad-sync-count="bad-sync-count" server-going="server-going) ;; Use this opportunity to sync the tmp db to megatest.db (if (not server-going) ;; *dbstruct-db* - ;; Removed code is pasted below (keeping it around until we are clear it is not needed). - ;; no *dbstruct-db* yet, set running after our first pass through and start the db - (if (eq? server-state 'available) - (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers - (if (equal? new-server-id server-id) - (begin - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") - ;;(BB> "http-transport: ->dbprep") - (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access - (set! *dbstruct-db* (db:setup)) ;; run-id)) - (set! server-going #t) - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") - ;;(BB> "http-transport: ->running") - (server:write-dotserver *toppath* iface port (current-process-id) 'http) - (thread-start! *watchdog*) - (server:complete-attempt *toppath*)) - (begin ;; gotta exit nicely - ;;(BB> "http-transport: ->collision") - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") - (http-transport:server-shutdown server-id port)))))) + (begin + (debug:print 0 *default-log-port* "SERVER: dbprep") + (set! *dbstruct-db* (db:setup)) ;; run-id)) + (set! server-going #t) + (debug:print 0 *default-log-port* "SERVER: running") ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. + (thread-start! *watchdog*))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) @@ -423,183 +397,108 @@ ;; Check that iface and port have not changed (can happen if server port collides) (mutex-lock! *heartbeat-mutex*) (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) - (if (or (not (equal? sdat (list iface port))) - (not server-id)) - (begin - (debug:print-info 0 *default-log-port* "interface changed, refreshing iface and port info") - (set! iface (car sdat)) - (set! port (cadr sdat)) - (server:write-dotserver *toppath* iface port (current-process-id) 'http))) + (if (not (equal? sdat (list iface port))) + (let ((new-iface (car sdat)) + (new-port (cadr sdat))) + (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info") + (set! iface new-iface) + (set! port new-port) + (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) + (flush-output *default-log-port*))) ;; Transfer *db-last-access* to last-access to use in checking that we are still alive (mutex-lock! *heartbeat-mutex*) (set! last-access *db-last-access*) (mutex-unlock! *heartbeat-mutex*) + + (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) + (begin + (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) + (flush-output *default-log-port*))) - ;; (debug:print 11 *default-log-port* "last-access=" last-access ", server-timeout=" server-timeout) - ;; - ;; no_traffic, no running tests, if server 0, no running servers - ;; - ;; (let ((wait-on-running (configf:lookup *configdat* "server" b"wait-on-running"))) ;; wait on running tasks (if not true then exit on time out) - ;; (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond - ((not (server:confirm-dotserver *toppath* iface port (current-process-id) 'http)) - (debug:print-info 0 *default-log-port* "Server .server file does not exist or contents do not match. Initiate server shutdown.") - (http-transport:server-shutdown server-id port)) ((and *server-run* (> (+ last-access server-timeout) (current-seconds))) (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))) - ;; - ;; Consider implementing some smarts here to re-insert the record or kill self is - ;; the db indicates so - ;; - ;; (if (tasks:server-am-i-the-server? tdb run-id) - ;; (tasks:server-set-state! tdb server-id "running")) - ;; (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timeed out. seconds since last db access: " (- (current-seconds) last-access)) - (http-transport:server-shutdown server-id port))))))) - -;; code cut out from above -;; -;; (condition-case -;; ;; (if (and (member (mutex-state *db-sync-mutex*) '(abandoned not-abandoned)) -;; ;; (> (- (current-seconds) *db-last-sync*) 5)) ;; if not currently being synced nor recently synced -;; (db:sync-touched *dbstruct-db* *run-id* force-sync: #t) ;; usually done in the watchdog, not here. -;; ((sync-failed)(cond -;; ((> bad-sync-count 10) ;; time to give up -;; (http-transport:server-shutdown server-id port)) -;; (else ;; (> bad-sync-count 0) ;; we've had a fail or two, delay and loop -;; (thread-sleep! 5) -;; (loop count server-state (+ bad-sync-count 1))))) -;; ((exn) -;; (debug:print-error 0 *default-log-port* "error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server") -;; (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running crashed") -;; (exit))) -;; (set! sync-time (- (current-milliseconds) start-time)) -;; (set! rem-time (quotient (- 4000 sync-time) 1000)) -;; (debug:print 4 *default-log-port* "SYNC: time= " sync-time ", rem-time=" rem-time) -;; -;; (if (and (<= rem-time 4) -;; (> rem-time 0)) -;; (thread-sleep! rem-time) -;; (thread-sleep! 4))) ;; fallback for if the math is changed ... - -(define (http-transport:server-shutdown server-id port) + (http-transport:server-shutdown port))))))) + +(define (http-transport:server-shutdown port) (let ((tdbdat (tasks:open-db))) ;;(BB> "http-transport:server-shutdown called") (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id)) ;; ;; start_shutdown ;; - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") + ;; (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up (portlogger:open-run-close portlogger:set-port port "released") - (thread-sleep! 5) -;; (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*) -;; (debug:print-info 0 *default-log-port* "Number of cached writes " *number-of-writes*) -;; (debug:print-info 0 *default-log-port* "Average cached write time " -;; (if (eq? *number-of-writes* 0) -;; "n/a (no writes)" -;; (/ *writes-total-delay* -;; *number-of-writes*)) -;; " ms") -;; (debug:print-info 0 *default-log-port* "Number non-cached queries " *number-non-write-queries*) -;; (debug:print-info 0 *default-log-port* "Average non-cached time " -;; (if (eq? *number-non-write-queries* 0) -;; "n/a (no queries)" -;; (/ *total-non-write-delay* -;; *number-non-write-queries*)) - ;; " ms") - + (thread-sleep! 1) + + ;; (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*) + ;; (debug:print-info 0 *default-log-port* "Number of cached writes " *number-of-writes*) + ;; (debug:print-info 0 *default-log-port* "Average cached write time " + ;; (if (eq? *number-of-writes* 0) + ;; "n/a (no writes)" + ;; (/ *writes-total-delay* + ;; *number-of-writes*)) + ;; " ms") + ;; (debug:print-info 0 *default-log-port* "Number non-cached queries " *number-non-write-queries*) + ;; (debug:print-info 0 *default-log-port* "Average non-cached time " + ;; (if (eq? *number-non-write-queries* 0) + ;; "n/a (no queries)" + ;; (/ *total-non-write-delay* + ;; *number-non-write-queries*)) + ;; " ms") + (db:print-current-query-stats) (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") - (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete") - ;; if the .server file contained :myport then we can remove it - (server:remove-dotserver-file *toppath* port) - ;;(BB> "http-transport:server-shutdown -> exit") (exit))) ;; all routes though here end in exit ... ;; ;; start_server? ;; -(define (http-transport:launch run-id) - (server:attempting-start *toppath*) - (let* ((tdbdat (tasks:open-db))) - (set! *run-id* run-id) - (if (args:get-arg "-daemonize") - (begin - (daemon:ize) - (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it - (begin - (current-error-port *alt-log-file*) - (current-output-port *alt-log-file*))))) - (if (and (server:read-dotserver *toppath*) - (server:check-if-running run-id)) - (begin - (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") - (exit 0)) - (begin ;; ok, no server detected, clean out any lingering records - (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id "notresponding"))) - (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)) - (remtries 4)) - (if (not server-id) - (if (> remtries 0) - (begin - (thread-sleep! 2) - (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id) - (- remtries 1))) - (begin - ;; since we didn't get the server lock we are going to clean up and bail out - (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") - (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch") - (server:complete-attempt *toppath*) - )) - (let* ((th2 (make-thread (lambda () - (debug:print-info 0 *default-log-port* "Server run thread started") - (http-transport:run - (if (args:get-arg "-server") - (args:get-arg "-server") - "-") - run-id - server-id)) "Server run")) - (th3 (make-thread (lambda () - (debug:print-info 0 *default-log-port* "Server monitor thread started") - (http-transport:keep-running server-id run-id)) - "Keep running"))) - (thread-start! th2) - (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. - (thread-start! th3) - (set! *didsomething* #t) - (thread-join! th2) - (exit)))))) - -;; (define (http:ping run-id host-port) -;; (let* ((server-dat (http-transport:client-connect (car host-port)(cadr host-port))) -;; (login-res (rmt:login-no-auto-client-setup server-dat run-id))) -;; (if (and (list? login-res) -;; (car login-res)) -;; (begin -;; (print "LOGIN_OK") -;; (exit 0)) -;; (begin -;; (print "LOGIN_FAILED") -;; (exit 1))))) +(define (http-transport:launch) + (if (args:get-arg "-daemonize") + (begin + (daemon:ize) + (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it + (begin + (current-error-port *alt-log-file*) + (current-output-port *alt-log-file*))))) + (let* ((th2 (make-thread (lambda () + (debug:print-info 0 *default-log-port* "Server run thread started") + (http-transport:run + (if (args:get-arg "-server") + (args:get-arg "-server") + "-") + )) "Server run")) + (th3 (make-thread (lambda () + (debug:print-info 0 *default-log-port* "Server monitor thread started") + (http-transport:keep-running) + "Keep running")))) + (thread-start! th2) + (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. + (thread-start! th3) + (set! *didsomething* #t) + (thread-join! th2) + (exit))) (define (http-transport:server-signal-handler signum) (signal-mask! signum) (handle-exceptions exn Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -352,15 +352,19 @@ ;; (define *watchdog* (make-thread common:watchdog "Watchdog thread")) (if (not (args:get-arg "-server")) (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog -;;(BB> "thread-start! watchdog") -(if (args:get-arg "-log") - (let ((oup (open-output-file (args:get-arg "-log")))) - (debug:print-info 0 *default-log-port* "Sending log output to " (args:get-arg "-log")) +(if (or (args:get-arg "-log")(args:get-arg "-server")) ;; redirect the log always when a server + (let* ((tl (or (args:get-arg "-log")(launch:setup))) ;; run launch:setup if -server + (logf (or (args:get-arg "-log") ;; use -log unless we are a server, then craft a logfile name + (conc tl "/logs/server-" (current-process-id) "-" (get-host-name) ".log"))) + (oup (open-output-file logf))) + (if (not (args:get-arg "-log")) + (hash-table-set! args:arg-hash "-log" logf)) ;; fake out future queries of -log + (debug:print-info 0 *default-log-port* "Sending log output to " logf) (set! *default-log-port* oup))) (if (or (args:get-arg "-h") (args:get-arg "-help") (args:get-arg "--help")) @@ -699,50 +703,17 @@ ;;====================================================================== ;; Start the server - can be done in conjunction with -runall or -runtests (one day...) ;; we start the server if not running else start the client thread ;;====================================================================== +;; Server? Start up here. +;; (if (args:get-arg "-server") - - ;; Server? Start up here. - ;; (let ((tl (launch:setup)) - ;; (run-id (and (args:get-arg "-run-id") - ;; (string->number (args:get-arg "-run-id")))) (transport-type (string->symbol (or (args:get-arg "-transport") "http")))) - ;; (if run-id - ;; (begin (server:launch 0 transport-type) (set! *didsomething* #t))) -;; ;; (debug:print-error 0 *default-log-port* "server requires run-id be specified with -run-id"))) -;; -;; ;; Not a server? This section will decide how to communicate -;; ;; -;; ;; Setup client for all expect listed here -;; (if (null? (lset-intersection -;; equal? -;; (hash-table-keys args:arg-hash) -;; '("-list-servers" -;; "-stop-server" -;; "-kill-server" -;; "-show-cmdinfo" -;; "-list-runs" -;; "-ping"))) -;; (if (launch:setup) -;; (let ((run-id (and (args:get-arg "-run-id") -;; (string->number (args:get-arg "-run-id"))))) -;; ;; (set! *fdb* (filedb:open-db (conc *toppath* "/db/paths.db"))) -;; ;; if not list or kill then start a client (if appropriate) -;; (if (or (args-defined? "-h" "-version" "-create-megatest-area" "-create-test") -;; (eq? (length (hash-table-keys args:arg-hash)) 0)) -;; (debug:print-info 1 *default-log-port* "Server connection not needed") -;; (begin -;; ;; (if run-id -;; ;; (client:launch run-id) -;; ;; (client:launch 0) ;; without run-id we'll start a server for "0" -;; #t -;; )))))) (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server") (args:get-arg "-kill-server")) (let ((tl (launch:setup))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -92,11 +92,11 @@ ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*) ;; have a server - (not (server:read-dotserver *toppath*))) ;; server has died. + (not (server:check-if-running *toppath*))) ;; server has died. (set! *runremote* #f) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) @@ -106,54 +106,22 @@ (remote-server-url *runremote*)) ;; have a server (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") (rmt:open-qry-close-locally cmd 0 params)) - ;; commented by bb; this was blocking server passive start on write on homehost (case 5) - ;; ;; on homehost and this is a write, we have a server (we know because case 4 checked) - ;; ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost - ;; (not (member cmd api:read-only-queries))) - ;; (mutex-unlock! *rmt-mutex*) - ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") - ;; (rmt:open-qry-close-locally cmd 0 params)) - - ;; on homehost, no server contact made and this is a write, passively start a server ((and (cdr (remote-hh-dat *runremote*)) ; new (not (remote-server-url *runremote*)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") - (let ((server-url (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call + (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if server-url (remote-server-url-set! *runremote* server-url) ;; the string can be consumed by the client setup if needed - (if (not (server:start-attempted? *toppath*)) - (server:kind-run *toppath*)))) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") - (rmt:open-qry-close-locally cmd 0 params)) - - - - ;;; - ;; (begin ;; not on homehost, start server and wait - ;; (mutex-unlock! *rmt-mutex*) - ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.2") - ;; (tasks:start-and-wait-for-server (tasks:open-db) 0 15) - ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;) ;) -;;;; - - ;; if not on homehost ensure we have a connection to a live server - ;; NOTE: we *have* a homehost record by now - - ;; ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost - ;; (not (remote-conndat *runremote*)) ;; and no connection - ;; (server:read-dotserver *toppath*)) ;; .server file exists - ;; ;; something caused the server entry in tdb to disappear, but the server is still running - ;; (server:remove-dotserver-file *toppath* ".*") - ;; (mutex-unlock! *rmt-mutex*) - ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 20") - ;; (rmt:send-receive cmd rid params attemptnum: (add1 attemptnum))) + (server:kind-run *toppath*))) + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") + (rmt:open-qry-close-locally cmd 0 params)) ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost (not (remote-conndat *runremote*))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -8,11 +8,11 @@ ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. (require-extension (srfi 18) extras tcp s11n) -(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils posix-extras) +(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils posix-extras matchable) ;; (use zmq) (use spiffy uri-common intarweb http-client spiffy-request-vars) (declare (unit server)) @@ -47,35 +47,15 @@ ;; all routes though here end in exit ... ;; ;; start_server ;; (define (server:launch run-id transport-type) - ;;(BB> "server:launch fired for run-id="run-id" transport-type="transport-type) - - (let ((attempt-in-progress (server:start-attempted? *toppath*))) ; check for .server-starting - (when attempt-in-progress - (debug:print-info 0 *default-log-port* "Server start attempt in progress in other process (=> "attempt-in-progress"<=). Aborting server launch attempt in this process ("(current-process-id)")") - (exit))) - - (let ((dotserver-url (server:check-if-running *toppath*))) ;; check for .server - (when dotserver-url - (debug:print-info 0 *default-log-port* "Server already running (=> "dotserver-url"<=). Aborting server launch attempt in this process ("(current-process-id)")") - (exit) - )) - (case transport-type - ((http)(http-transport:launch run-id)) + ((http)(http-transport:launch)) ;;((nmsg)(nmsg-transport:launch run-id)) ((rpc) (rpc-transport:launch run-id)) - (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type))) - - ;; is this a good place to print server exit stats? - (debug:print 0 "SERVER: max parallel api requests: " *max-api-process-requests*) - - ) -;; (else (debug:print-error 0 *default-log-port* "No known transport set, transport=" transport ", using rpc") -;; (rpc-transport:launch run-id))))) + (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type)))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== @@ -122,53 +102,124 @@ ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area (let* ((curr-host (get-host-name)) - (attempt-in-progress (server:start-attempted? areapath)) - (dot-server-url (server:check-if-running areapath)) + ;; (attempt-in-progress (server:start-attempted? areapath)) + ;; (dot-server-url (server:check-if-running areapath)) (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) (target-host (car homehost)) (testsuite (common:get-testsuite-name)) - (logfile (conc areapath "/logs/server.log")) + (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) (cmdln (conc (common:get-megatest-exe) - " -server " (or target-host "-") " -run-id " 0 (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") - (conc " -daemonize -log " logfile) - "") + " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") + " -daemonize " + "") + ;; " -log " logfile " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) - (cond - (attempt-in-progress - (debug:print 0 *default-log-port* "INFO: Not trying to start server because attempt is in progress: "attempt-in-progress)) - (dot-server-url - (debug:print 0 *default-log-port* "INFO: Not trying to start server because one is already running : "dot-server-url)) - (else - (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") - (thread-start! log-rotate) - - ;; host.domain.tld match host? - (if (and target-host - ;; look at target host, is it host.domain.tld or ip address and does it - ;; match current ip or hostname - (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) - (not (equal? curr-ip target-host))) - (begin - (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) - (setenv "TARGETHOST" target-host))) - - (setenv "TARGETHOST_LOGF" logfile) - (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever - (system (conc "nbfake " cmdln)) - (unsetenv "TARGETHOST_LOGF") - (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) - (thread-join! log-rotate) - (pop-directory))))) - + (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") + (thread-start! log-rotate) + + ;; host.domain.tld match host? + (if (and target-host + ;; look at target host, is it host.domain.tld or ip address and does it + ;; match current ip or hostname + (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) + (not (equal? curr-ip target-host))) + (begin + (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) + (setenv "TARGETHOST" target-host))) + + (setenv "TARGETHOST_LOGF" "server.log") ;; logfile) + (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever + (system (conc "nbfake " cmdln)) + (unsetenv "TARGETHOST_LOGF") + (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) + (thread-join! log-rotate) + (pop-directory))) + +;; given a path to a server log return: host port startseconds +;; +(define (server:logf-get-start-info logf) + (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs + (with-input-from-file + logf + (lambda () + (let loop ((inl (read-line)) + (lnum 0)) + (if (not (eof-object? inl)) + (let ((mlst (string-match rx inl))) + (if (not mlst) + (if (< lnum 500) ;; give up if more than 500 lines of server log read + (loop (read-line)(+ lnum 1)) + (list #f #f #f)) + (let ((dat (cdr mlst))) + (list (car dat) ;; host + (string->number (cadr dat)) ;; port + (string->number (caddr dat)))))) + (list #f #f #f))))))) + +;; get a list of servers with all relevant data +;; ( mod-time host port start-time ) +;; +(define (server:get-list areapath) + (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))) + (if (directory-exists? areapath) + (let ((server-logs (glob (conc areapath "/logs/server-*.log")))) + (if (null? server-logs) + '() + (let loop ((hed (car server-logs)) + (tal (cdr server-logs)) + (res '())) + (let* ((mod-time (file-modification-time hed)) + (serv-dat (server:logf-get-start-info hed)) + (serv-rec (cons mod-time serv-dat)) + (fmatch (string-match fname-rx hed)) + (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) + (new-res (cons (append serv-rec (list pid)) res))) + (if (null? tal) + new-res + (loop (car tal)(cdr tal) new-res))))))))) + +;; given a list of servers get a list of valid servers, i.e. at least +;; 10 seconds old, has started and is less than 1 hour old and is +;; active (i.e. mod-time < 10 seconds +;; +;; mod-time host port start-time pid +;; +;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off +;; and servers should stick around for about two hours or so. +;; +(define (server:get-best srvlst) + (let ((now (current-seconds))) + (sort + (filter (lambda (rec) + (let ((start-time (list-ref rec 3)) + (mod-time (list-ref rec 0))) + ;; (print "start-time: " start-time " mod-time: " mod-time) + (and start-time mod-time + (> (- now start-time) 1) ;; been running at least 1 seconds + (< (- now mod-time) 10) ;; still alive - file touched in last 10 seconds + (< (- now start-time) 3600) ;; under one hour running time + ))) + srvlst) + (lambda (a b) + (< (list-ref a 3) + (list-ref b 3)))))) + +(define (server:record->url servr) + (match-let (((mod-time host port start-time pid) + servr)) + (if (and host port) + (conc host ":" port) + #f))) + (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) *my-client-signature*))) @@ -183,116 +234,116 @@ (server:run areapath) (hash-table-set! *server-kind-run* areapath (current-seconds)))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. -(define (server:attempting-start areapath) - (with-output-to-file - (conc areapath "/.starting-server") - (lambda () - (print (current-process-id) " on " (get-host-name))))) - -(define (server:complete-attempt areapath) - (delete-file* (conc areapath "/.starting-server"))) - -(define (server:start-attempted? areapath) - (let ((flagfile (conc areapath "/.starting-server"))) - (handle-exceptions - exn - #f ;; if things go wrong pretend we can't see the file - (cond - ((and (file-exists? flagfile) - (< (- (current-seconds) - (file-modification-time flagfile)) - 15)) ;; exists and less than 15 seconds old - (with-input-from-file flagfile (lambda () (read-line)))) - ((file-exists? flagfile) ;; it is stale. - (server:complete-attempt areapath) - #f) - (else #f))))) - -(define (server:read-dotserver areapath) - (let ((dotfile (conc areapath "/.server"))) - (handle-exceptions - exn - #f ;; if things go wrong pretend we can't see the file - (cond - ((not (file-exists? dotfile)) - #f) - ((not (file-read-access? dotfile)) - #f) - ((> (server:dotserver-age-seconds areapath) (+ 5 (server:get-timeout))) - (server:remove-dotserver-file areapath ".*") - #f) - (else - (let* ((line - (with-input-from-file - dotfile - (lambda () - (read-line)))) - (tokens (if (string? line) (string-split line ":") #f))) - (cond - ((eq? 4 (length tokens)) - tokens) - (else #f)))))))) - -(define (server:read-dotserver->url areapath) - (let ((dotserver-tokens (server:read-dotserver areapath))) - (if dotserver-tokens - (conc (list-ref dotserver-tokens 0) ":" (list-ref dotserver-tokens 1)) - #f))) - -;; write a .server file in *toppath* with hostport -;; return #t on success, #f otherwise -;; -(define (server:write-dotserver areapath host port pid transport) - (let ((lock-file (conc areapath "/.server.lock")) - (server-file (conc areapath "/.server"))) - (if (common:simple-file-lock lock-file) - (let ((res (handle-exceptions - exn - #f ;; failed for some reason, for the moment simply return #f - (with-output-to-file server-file - (lambda () - (print (conc host ":" port ":" pid ":" transport)))) - #t))) - (debug:print-info 0 *default-log-port* "server file " server-file " for " host ":" port " created pid="pid) - (common:simple-file-release-lock lock-file) - res) - #f))) - - -;; this will check that the .server file present matches the server calling this procedure. -;; if parameters match (this-pid and transport) the file will be touched and #t returned -;; otherwise #f will be returned. -(define (server:confirm-dotserver areapath this-iface this-port this-pid this-transport) - (let* ((tokens (server:read-dotserver areapath))) - (cond - ((not tokens) - (debug:print-info 0 *default-log-port* "INFO: .server file does not exist.") - #f) - ((not (eq? 4 (length tokens))) - (debug:print-info 0 *default-log-port* "INFO: .server file is corrupt. There are not 4 tokens as expeted; there are "(length tokens)".") - #f) - ((not (equal? this-iface (list-ref tokens 0))) - (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for iface, server has value >"(list-ref tokens 0)"< but this server's value is >"this-iface"<") - #f) - ((not (equal? (->string this-port) (list-ref tokens 1))) - (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for port, .server has value >"(list-ref tokens 1)"< but this server's value is >"(->string this-port)"<") - #f) - ((not (equal? (->string this-pid) (list-ref tokens 2))) - (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for pid, .server has value >"(list-ref tokens 2)"< but this server's value is >"(->string this-pid)"<") - #f) - ((not (equal? (->string this-transport) (->string (list-ref tokens 3)))) - (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for transport, .server has value >"(list-ref tokens 3)"< but this server's value is >"this-transport"<") - #f) - (else (server:touch-dotserver areapath) - #t)))) - -(define (server:touch-dotserver areapath) - (let ((server-file (conc areapath "/.server"))) - (change-file-times server-file (current-seconds) (current-seconds)))) +;; (define (server:attempting-start areapath) +;; (with-output-to-file +;; (conc areapath "/.starting-server") +;; (lambda () +;; (print (current-process-id) " on " (get-host-name))))) +;; +;; (define (server:complete-attempt areapath) +;; (delete-file* (conc areapath "/.starting-server"))) +;; +;; (define (server:start-attempted? areapath) +;; (let ((flagfile (conc areapath "/.starting-server"))) +;; (handle-exceptions +;; exn +;; #f ;; if things go wrong pretend we can't see the file +;; (cond +;; ((and (file-exists? flagfile) +;; (< (- (current-seconds) +;; (file-modification-time flagfile)) +;; 15)) ;; exists and less than 15 seconds old +;; (with-input-from-file flagfile (lambda () (read-line)))) +;; ((file-exists? flagfile) ;; it is stale. +;; (server:complete-attempt areapath) +;; #f) +;; (else #f))))) +;; +;; (define (server:read-dotserver areapath) +;; (let ((dotfile (conc areapath "/.server"))) +;; (handle-exceptions +;; exn +;; #f ;; if things go wrong pretend we can't see the file +;; (cond +;; ((not (file-exists? dotfile)) +;; #f) +;; ((not (file-read-access? dotfile)) +;; #f) +;; ((> (server:dotserver-age-seconds areapath) (+ 5 (server:get-timeout))) +;; (server:remove-dotserver-file areapath ".*") +;; #f) +;; (else +;; (let* ((line +;; (with-input-from-file +;; dotfile +;; (lambda () +;; (read-line)))) +;; (tokens (if (string? line) (string-split line ":") #f))) +;; (cond +;; ((eq? 4 (length tokens)) +;; tokens) +;; (else #f)))))))) +;; +;; (define (server:read-dotserver->url areapath) +;; (let ((dotserver-tokens (server:read-dotserver areapath))) +;; (if dotserver-tokens +;; (conc (list-ref dotserver-tokens 0) ":" (list-ref dotserver-tokens 1)) +;; #f))) +;; +;; ;; write a .server file in *toppath* with hostport +;; ;; return #t on success, #f otherwise +;; ;; +;; (define (server:write-dotserver areapath host port pid transport) +;; (let ((lock-file (conc areapath "/.server.lock")) +;; (server-file (conc areapath "/.server"))) +;; (if (common:simple-file-lock lock-file) +;; (let ((res (handle-exceptions +;; exn +;; #f ;; failed for some reason, for the moment simply return #f +;; (with-output-to-file server-file +;; (lambda () +;; (print (conc host ":" port ":" pid ":" transport)))) +;; #t))) +;; (debug:print-info 0 *default-log-port* "server file " server-file " for " host ":" port " created pid="pid) +;; (common:simple-file-release-lock lock-file) +;; res) +;; #f))) +;; +;; +;; ;; this will check that the .server file present matches the server calling this procedure. +;; ;; if parameters match (this-pid and transport) the file will be touched and #t returned +;; ;; otherwise #f will be returned. +;; (define (server:confirm-dotserver areapath this-iface this-port this-pid this-transport) +;; (let* ((tokens (server:read-dotserver areapath))) +;; (cond +;; ((not tokens) +;; (debug:print-info 0 *default-log-port* "INFO: .server file does not exist.") +;; #f) +;; ((not (eq? 4 (length tokens))) +;; (debug:print-info 0 *default-log-port* "INFO: .server file is corrupt. There are not 4 tokens as expeted; there are "(length tokens)".") +;; #f) +;; ((not (equal? this-iface (list-ref tokens 0))) +;; (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for iface, server has value >"(list-ref tokens 0)"< but this server's value is >"this-iface"<") +;; #f) +;; ((not (equal? (->string this-port) (list-ref tokens 1))) +;; (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for port, .server has value >"(list-ref tokens 1)"< but this server's value is >"(->string this-port)"<") +;; #f) +;; ((not (equal? (->string this-pid) (list-ref tokens 2))) +;; (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for pid, .server has value >"(list-ref tokens 2)"< but this server's value is >"(->string this-pid)"<") +;; #f) +;; ((not (equal? (->string this-transport) (->string (list-ref tokens 3)))) +;; (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for transport, .server has value >"(list-ref tokens 3)"< but this server's value is >"this-transport"<") +;; #f) +;; (else (server:touch-dotserver areapath) +;; #t)))) +;; +;; (define (server:touch-dotserver areapath) +;; (let ((server-file (conc areapath "/.server"))) +;; (change-file-times server-file (current-seconds) (current-seconds)))) (define (server:dotserver-age-seconds areapath) (let ((server-file (conc areapath "/.server"))) (begin (handle-exceptions @@ -299,50 +350,59 @@ exn #f (- (current-seconds) (file-modification-time server-file)))))) -(define (server:remove-dotserver-file areapath hostport) - (let ((dotserver-url (server:read-dotserver->url areapath)) - (server-file (conc areapath "/.server")) - (lock-file (conc areapath "/.server.lock"))) - (if (and dotserver-url (string-match (conc ".*:" hostport "$") dotserver-url)) ;; port matches, good enough info to decide to remove the file - (if (common:simple-file-lock lock-file) - (begin - (handle-exceptions - exn - #f - (delete-file* server-file)) - (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " removed") - (common:simple-file-release-lock lock-file)) - (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - could not get lock.")) - (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - dotserver-url("dotserver-url") did not match hostport pattern ("hostport")")))) +;; (define (server:remove-dotserver-file areapath hostport) +;; (let ((dotserver-url (server:read-dotserver->url areapath)) +;; (server-file (conc areapath "/.server")) +;; (lock-file (conc areapath "/.server.lock"))) +;; (if (and dotserver-url (string-match (conc ".*:" hostport "$") dotserver-url)) ;; port matches, good enough info to decide to remove the file +;; (if (common:simple-file-lock lock-file) +;; (begin +;; (handle-exceptions +;; exn +;; #f +;; (delete-file* server-file)) +;; (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " removed") +;; (common:simple-file-release-lock lock-file)) +;; (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - could not get lock.")) +;; (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - dotserver-url("dotserver-url") did not match hostport pattern ("hostport")")))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) - (let* ((dotserver-url (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db))) + (let* ((servers (server:get-best (server:get-list areapath))) + (best-server (if (null? servers) #f (car servers))) + (dotserver-url (if best-server + (server:record->url best-server) + #f))) ;; (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db))) (if dotserver-url (let* ((res (case *transport-type* ((http)(server:ping-server dotserver-url)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res dotserver-url (begin - (server:remove-dotserver-file areapath ".*") ;; remove stale dotserver + (server:kill best-server) #f))) #f))) +(define (server:kill servr) + (match-let (((mod-time hostname port start-time pid) + servr)) + (tasks:kill-server hostname pid))) + ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; (define (server:ping host-port-in #!key (do-exit #f)) (let ((host:port (if (not host-port-in) ;; use read-dotserver to find - (server:read-dotserver->url *toppath*) + (server:check-if-running *toppath*) (if (number? host-port-in) ;; we were handed a server-id (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) ;; (print "srec: " srec " host-port-in: " host-port-in) (if srec (conc (vector-ref srec 3) ":" (vector-ref srec 4)) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -170,315 +170,24 @@ (define (tasks:hostinfo-get-pubport vec) (vector-ref vec 3)) (define (tasks:hostinfo-get-transport vec) (vector-ref vec 4)) (define (tasks:hostinfo-get-pid vec) (vector-ref vec 5)) (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) -(define (tasks:server-lock-slot mdb run-id) - (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") - (if (< (tasks:num-in-available-state mdb run-id) 4) - (begin - (tasks:server-set-available mdb run-id) - (thread-sleep! (/ (random 1500) 1000)) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed. - (tasks:server-am-i-the-server? mdb run-id)) - #f)) - -;; register that this server may come online (first to register goes though with the process) -(define (tasks:server-set-available mdb run-id) - (sqlite3:execute - mdb - "INSERT INTO servers (pid,hostname,port,pubport,start_time, priority,state,mt_version,heartbeat, interface,transport,run_id) - VALUES(?, ?, ?, ?, strftime('%s','now'), ?, ?, ?,-1,?, ?, ?);" - (current-process-id) ;; pid - (get-host-name) ;; hostname - -1 ;; port - -1 ;; pubport - (random 1000) ;; priority (used a tiebreaker on get-available) - "available" ;; state - (common:version-signature) ;; mt_version - -1 ;; interface - ;; (conc (server:get-transport)) ;; transport - (conc *transport-type*) ;; transport - run-id - )) - -(define (tasks:num-in-available-state mdb run-id) - (let ((res 0)) - (sqlite3:for-each-row - (lambda (num-in-queue) - (set! res num-in-queue)) - mdb - "SELECT count(id) FROM servers WHERE run_id=? AND state = 'available' AND (strftime('%s','now') - start_time) < 30 ;" - run-id) - res)) - -(define (tasks:num-servers-non-zero-running mdb) - (let ((res 0)) - (sqlite3:for-each-row - (lambda (num-running) - (set! res num-running)) - mdb - "SELECT count(id) FROM servers WHERE run_id != 0 AND state = 'running';") - res)) - -(define (tasks:server-clean-out-old-records-for-run-id mdb run-id tag) - (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state in ('available','dbprep','shutting-down') AND (strftime('%s','now') - start_time) > 50 AND run_id=?;" - (conc "defunct" tag) run-id)) - -(define (tasks:server-force-clean-running-records-for-run-id mdb run-id tag) - (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=?;" - (conc "defunct" tag) run-id)) - -(define (tasks:server-force-clean-run-record mdb run-id iface port tag) - (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE state = 'running' AND run_id=? AND interface=? AND port=?;" - (conc "defunct" tag) run-id iface port)) - - -;; BB> adding missing func for --list-servers -(define (tasks:server-deregister mdb hostname #!key (pullport #f) (pid #f) (action #f)) ;;pullport pid: pid action: 'delete)) - (if (eq? action 'delete) - (sqlite3:execute mdb "DELETE FROM servers WHERE pid=? AND port=? AND hostname=?;" pid pullport hostname) - (sqlite3:execute mdb "UPDATE servers SET state='defunct', heartbeat=strftime('%s','now') WHERE hostname=? AND pid=?;" - hostname pid))) - -(define (tasks:server-delete-records-for-this-pid mdb tag) - (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE hostname=? AND pid=?;" - (conc "defunct" tag) (get-host-name) (current-process-id))) - -(define (tasks:server-delete-record mdb server-id tag) - (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;" - (conc "defunct" tag) server-id) - ;; use this opportuntity to clean out records over one month old or over 10 minutes old with port = -1 (i.e. a never used placeholder) - (sqlite3:execute mdb "DELETE FROM servers WHERE state not in ('running','shutting-down','dbprep') AND (strftime('%s','now') - start_time) > 2628000;") - (sqlite3:execute mdb "DELETE FROM servers WHERE state like 'defunct%' AND port=-1 AND (strftime('%s','now') - start_time) > 600;") - ) - -(define (tasks:server-set-state! mdb server-id state) - (sqlite3:execute mdb "UPDATE servers SET state=?,heartbeat=strftime('%s','now') WHERE id=?;" state server-id)) - -(define (tasks:server-set-interface-port mdb server-id interface port) - (sqlite3:execute mdb "UPDATE servers SET interface=?,port=?,heartbeat=strftime('%s','now') WHERE id=?;" interface port server-id)) - -;; Get random port not used in long time -;; -(define (tasks:server-get-next-port mdb) - (let* ((lownum 30000) - (highnum 64000) - (used-ports '()) - (get-rand-port (lambda () - (+ lownum (random (- highnum lownum))))) - (port-param (if (and (args:get-arg "-port") - (string->number (args:get-arg "-port"))) - (string->number (args:get-arg "-port")) - #f)) - ;; (config-port (if (and (config-lookup *configdat* "server" "port") - ;; (string->number (config-lookup *configdat* "server" "port"))) - ;; (string->number (config-lookup *configdat* "server" "port")) - ;; #f)) - ) - (sqlite3:for-each-row - (lambda (port) - (set! used-ports (cons port used-ports))) - mdb - "SELECT port FROM servers;") - (cond - ((and port-param res) (if (> res port-param) res port-param)) - (port-param port-param) - ;; ((and config-port res) (if (> res config-port) res config-port)) - ;; (config-port config-port) - (else - (let loop ((port (get-rand-port)) - (remtries 100)) - (if (member port used-ports) - (if (> remtries 0) - (loop (get-rand-port)(- remtries 1)) - (get-rand-port)) - port)))))) - -(define (tasks:server-am-i-the-server? mdb run-id) - (let* ((all (tasks:server-get-servers-vying-for-run-id mdb run-id)) - (first (if (null? all) - #f;; (begin (debug:print-error 0 *default-log-port* "no servers listed, should be at least one by now.") - ;; (sqlite3:finalize! mdb) - ;; (exit 1)) - (car (db:get-rows all))))) - (if first - (let* ((header (db:get-header all)) - (id (db:get-value-by-header first header "id")) - (hostname (db:get-value-by-header first header "hostname")) - (pid (db:get-value-by-header first header "pid")) - (priority (db:get-value-by-header first header "priority"))) - ;; (debug:print 0 *default-log-port* "INFO: am-i-the-server got record " first) - ;; for now a basic check. add tiebreaking by priority later - (if (and (equal? hostname (get-host-name)) - (equal? pid (current-process-id))) - id - #f)) - #f))) - -;; Use: (db:get-value-by-header (car (db:get-rows dat)) (db:get-header dat) "fieldname") -;; to extract info from the structure returned -;; -(define (tasks:server-get-servers-vying-for-run-id mdb run-id) - (let* ((header (list "id" "hostname" "pid" "interface" "port" "pubport" "state" "run_id" "priority" "start_time")) - (selstr (string-intersperse header ",")) - (res '())) - (sqlite3:for-each-row - (lambda (a . b) - (set! res (cons (apply vector a b) res))) - mdb - (conc "SELECT " selstr " FROM servers WHERE state in ('available','running','dbprep') ORDER BY start_time DESC;") - ) - (vector header res))) - -(define (tasks:get-server mdb run-id #!key (retries 10)) - (let ((res #f) - (best #f)) - (handle-exceptions - exn - (begin - (print-call-chain (current-error-port)) - (debug:print 0 *default-log-port* "WARNING: tasks:get-server db access error.") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 0 *default-log-port* " for run " run-id) - (print-call-chain (current-error-port)) - (if (> retries 0) - (begin - (debug:print 0 *default-log-port* " trying call to tasks:get-server again in 10 seconds") - (thread-sleep! 10) - (tasks:get-server mdb run-id retries: (- retries 0))) - (debug:print 0 *default-log-port* "10 tries of tasks:get-server all crashed and burned. Giving up and returning \"no server found\""))) - (sqlite3:for-each-row - (lambda (id interface port pubport transport pid hostname) - (set! res (vector id interface port pubport transport pid hostname))) - mdb - ;; removed: - ;; strftime('%s','now')-heartbeat < 10 AND mt_version = ? - "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers - WHERE run_id=? AND state='running' - ORDER BY start_time DESC LIMIT 1;" run-id) ;; (common:version-signature) run-id) - res))) - -(define (tasks:server-running-or-starting? mdb run-id) - (let ((res #f)) - (sqlite3:for-each-row - (lambda (id) - (set! res id)) - mdb ;; NEEDS dbprep ADDED - "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND (strftime('%s','now') - start_time) < 60));" run-id) - res)) - -(define (tasks:server-running? mdb run-id) - (let ((res #f)) - (sqlite3:for-each-row - (lambda (id) - (set! res id)) - mdb ;; NEEDS dbprep ADDED - "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id) - res)) - (define (tasks:need-server run-id) (equal? (configf:lookup *configdat* "server" "required") "yes")) -;; (maxqry (cdr (rmt:get-max-query-average run-id))) -;; (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) -;; (cond -;; (forced -;; (if (common:low-noise-print 60 run-id "server required is set") -;; (debug:print-info 0 *default-log-port* "Server required is set, starting server for run-id " run-id ".")) -;; #t) -;; ((> maxqry threshold) -;; (if (common:low-noise-print 60 run-id "Max query time execeeded") -;; (debug:print-info 0 *default-log-port* "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, server needed for run-id " run-id ".")) -;; #t) -;; (else -;; #f)))) - -;; try to start a server and wait for it to be available -;; -(define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries) - ;; ensure a server is running for this run - (let loop ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)) - (delay-time 0)) - (if (and (not server-dat) - (< delay-time delay-max-tries)) - (begin - (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id) - (debug:print 0 *default-log-port* "Try starting server for run-id " run-id)) - (thread-sleep! (/ (random 2000) 1000)) - (server:kind-run *toppath*) - (thread-sleep! (min delay-time 1)) - (if (not (or (server:start-attempted? *toppath*) - (server:read-dotserver *toppath*))) ;; no point in trying - (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1)) - #f)) - #f))) - -(define (tasks:get-all-servers mdb) - (let ((res '())) - (sqlite3:for-each-row - (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) - ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 - (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res))) - mdb - "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport,run_id - FROM servers WHERE state NOT LIKE 'defunct%' ORDER BY start_time DESC;") - res)) - -(define (tasks:get-server-by-id mdb id) - (let ((res #f)) - (sqlite3:for-each-row - (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) - ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 - (set! res (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id))) - mdb - "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport,run_id - FROM servers WHERE id=?;" - id) - res)) - -(define (tasks:get-server-records mdb run-id) - (let ((res '())) - (sqlite3:for-each-row - (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) - ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 - (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res))) - mdb - "SELECT id,pid,hostname,interface,port,pubport,start_time,priority,state,mt_version,strftime('%s','now')-heartbeat AS last_update,transport,run_id - FROM servers WHERE run_id=? AND state NOT LIKE 'defunct%' ORDER BY start_time DESC;" - run-id) - (reverse res))) - ;; no elegance here ... ;; (define (tasks:kill-server hostname pid #!key (kill-switch "")) - (server:remove-dotserver-file *toppath* ".*") (debug:print-info 0 *default-log-port* "Attempting to kill server process " pid " on host " hostname) (setenv "TARGETHOST" hostname) (setenv "TARGETHOST_LOGF" "server-kills.log") (system (conc "nbfake kill "kill-switch" "pid)) (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")) -;; look up a server by run-id and send it a kill, also delete the record for that server -;; -(define (tasks:kill-server-run-id run-id #!key (tag "default")) - (let* ((tdbdat (tasks:open-db)) - (sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) - (if sdat - (let ((hostname (vector-ref sdat 6)) - (pid (vector-ref sdat 5)) - (server-id (vector-ref sdat 0))) - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed") - (debug:print-info 0 *default-log-port* "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) - (server:remove-dotserver-file *toppath* ".*") - (tasks:kill-server hostname pid) - (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) - (debug:print-info 0 *default-log-port* "No server found for run-id " run-id ", nothing to kill")) - ;; (sqlite3:finalize! tdb) - )) - ;;====================================================================== ;; M O N I T O R S ;;====================================================================== (define (tasks:remove-monitor-record mdb) Index: utils/mk_wrapper ================================================================== --- utils/mk_wrapper +++ utils/mk_wrapper @@ -28,21 +28,21 @@ echo "#!/bin/bash" > $target if [[ $cmd =~ dboard ]]; then cat >> $target <<'EOF' -# disable if not running on homehost -if [[ -e .homehost ]]; then - homehostname=$( host `cat .homehost` | awk '{print $NF}' | sed 's/\.$//' ) - hostname=$( hostname -f ) - - if [[ ! ($homehostname == $hostname) ]]; then - echo "ERROR: this host ($hostname) is not the homehost ($homehostname) for this megatest run area. Cannot start dashboard." - echo " Please log into homehost before launching dashboard." - exit 1 - fi -fi +# # disable if not running on homehost +# if [[ -e .homehost ]]; then +# homehostname=$( host `cat .homehost` | awk '{print $NF}' | sed 's/\.$//' ) +# hostname=$( hostname -f ) +# +# if [[ ! ($homehostname == $hostname) ]]; then +# echo "ERROR: this host ($hostname) is not the homehost ($homehostname) for this megatest run area. Cannot start dashboard." +# echo " Please log into homehost before launching dashboard." +# exit 1 +# fi +# fi # check that $DISPLAY is set if [[ -z $DISPLAY ]]; then echo 'ERROR: $DISPLAY environment variable is not set; megatest dashboard requires X display address to be set in $DISPLAY.' exit 1