Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -141,11 +141,11 @@ (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) - (server-url (if *toppath* (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*) #f)) + (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -99,10 +99,14 @@ (if (args:get-arg "-h") (begin (print help) (exit))) +(if (not (common:on-homehost?)) + (begin + (debug:print 0 "ERROR: Current policy requires running dashboard on homehost: " (common:get-homehost)))) + ;; TODO: Move this inside (main) ;; (if (not (launch:setup)) (begin (print "Failed to find megatest.config, exiting") Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -14,11 +14,11 @@ ;;====================================================================== ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc (use (srfi 18) extras tcp stack) ;; RADT => use of require-extension? -(use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest base64 format dot-locking z3 typed-records) +(use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest base64 format dot-locking z3 typed-records matchable) (import (prefix sqlite3 sqlite3:)) (import (prefix base64 base64:)) ;; RADT => prefix?? (declare (unit db)) (declare (uses common)) @@ -825,20 +825,21 @@ (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.") (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) (tmpdb (db:get-db dbstruct)) (refndb (dbr:dbstruct-refndb dbstruct)) (allow-cleanup #t) ;; (if run-ids #f #t)) - (tdbdat (tasks:open-db)) - (servers (tasks:get-all-servers (db:delay-if-busy tdbdat))) + ;; (tdbdat (tasks:open-db)) + (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat))) (data-synced 0)) ;; count of changed records (I hope) ;; kill servers (if (member 'killservers options) (for-each (lambda (server) - (tasks:server-delete-record (db:delay-if-busy tdbdat) (vector-ref server 0) "dbmigration") - (tasks:kill-server (vector-ref server 2)(vector-ref server 1))) + (match-let (((mod-time host port start-time pid) server)) + (if (and host pid) + (tasks:kill-server host pid)))) servers)) ;; clear out junk records ;; (if (member 'dejunk options) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -352,15 +352,19 @@ ;; (define *watchdog* (make-thread common:watchdog "Watchdog thread")) (if (not (args:get-arg "-server")) (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog -;;(BB> "thread-start! watchdog") -(if (args:get-arg "-log") - (let ((oup (open-output-file (args:get-arg "-log")))) - (debug:print-info 0 *default-log-port* "Sending log output to " (args:get-arg "-log")) +(if (or (args:get-arg "-log")(args:get-arg "-server")) ;; redirect the log always when a server + (let* ((tl (or (args:get-arg "-log")(launch:setup))) ;; run launch:setup if -server + (logf (or (args:get-arg "-log") ;; use -log unless we are a server, then craft a logfile name + (conc tl "/logs/server-" (current-process-id) "-" (get-host-name) ".log"))) + (oup (open-output-file logf))) + (if (not (args:get-arg "-log")) + (hash-table-set! args:arg-hash "-log" logf)) ;; fake out future queries of -log + (debug:print-info 0 *default-log-port* "Sending log output to " logf) (set! *default-log-port* oup))) (if (or (args:get-arg "-h") (args:get-arg "-help") (args:get-arg "--help")) @@ -699,50 +703,17 @@ ;;====================================================================== ;; Start the server - can be done in conjunction with -runall or -runtests (one day...) ;; we start the server if not running else start the client thread ;;====================================================================== +;; Server? Start up here. +;; (if (args:get-arg "-server") - - ;; Server? Start up here. - ;; (let ((tl (launch:setup)) - ;; (run-id (and (args:get-arg "-run-id") - ;; (string->number (args:get-arg "-run-id")))) (transport-type (string->symbol (or (args:get-arg "-transport") "http")))) - ;; (if run-id - ;; (begin (server:launch 0 transport-type) (set! *didsomething* #t))) -;; ;; (debug:print-error 0 *default-log-port* "server requires run-id be specified with -run-id"))) -;; -;; ;; Not a server? This section will decide how to communicate -;; ;; -;; ;; Setup client for all expect listed here -;; (if (null? (lset-intersection -;; equal? -;; (hash-table-keys args:arg-hash) -;; '("-list-servers" -;; "-stop-server" -;; "-kill-server" -;; "-show-cmdinfo" -;; "-list-runs" -;; "-ping"))) -;; (if (launch:setup) -;; (let ((run-id (and (args:get-arg "-run-id") -;; (string->number (args:get-arg "-run-id"))))) -;; ;; (set! *fdb* (filedb:open-db (conc *toppath* "/db/paths.db"))) -;; ;; if not list or kill then start a client (if appropriate) -;; (if (or (args-defined? "-h" "-version" "-create-megatest-area" "-create-test") -;; (eq? (length (hash-table-keys args:arg-hash)) 0)) -;; (debug:print-info 1 *default-log-port* "Server connection not needed") -;; (begin -;; ;; (if run-id -;; ;; (client:launch run-id) -;; ;; (client:launch 0) ;; without run-id we'll start a server for "0" -;; #t -;; )))))) (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server") (args:get-arg "-kill-server")) (let ((tl (launch:setup))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -92,11 +92,11 @@ ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*) ;; have a server - (not (server:read-dotserver *toppath*))) ;; server has died. + (not (server:check-if-running *toppath*))) ;; server has died. (set! *runremote* #f) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) @@ -106,54 +106,22 @@ (remote-server-url *runremote*)) ;; have a server (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") (rmt:open-qry-close-locally cmd 0 params)) - ;; commented by bb; this was blocking server passive start on write on homehost (case 5) - ;; ;; on homehost and this is a write, we have a server (we know because case 4 checked) - ;; ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost - ;; (not (member cmd api:read-only-queries))) - ;; (mutex-unlock! *rmt-mutex*) - ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") - ;; (rmt:open-qry-close-locally cmd 0 params)) - - ;; on homehost, no server contact made and this is a write, passively start a server ((and (cdr (remote-hh-dat *runremote*)) ; new (not (remote-server-url *runremote*)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") - (let ((server-url (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call + (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if server-url (remote-server-url-set! *runremote* server-url) ;; the string can be consumed by the client setup if needed - (if (not (server:start-attempted? *toppath*)) - (server:kind-run *toppath*)))) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") - (rmt:open-qry-close-locally cmd 0 params)) - - - - ;;; - ;; (begin ;; not on homehost, start server and wait - ;; (mutex-unlock! *rmt-mutex*) - ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.2") - ;; (tasks:start-and-wait-for-server (tasks:open-db) 0 15) - ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;) ;) -;;;; - - ;; if not on homehost ensure we have a connection to a live server - ;; NOTE: we *have* a homehost record by now - - ;; ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost - ;; (not (remote-conndat *runremote*)) ;; and no connection - ;; (server:read-dotserver *toppath*)) ;; .server file exists - ;; ;; something caused the server entry in tdb to disappear, but the server is still running - ;; (server:remove-dotserver-file *toppath* ".*") - ;; (mutex-unlock! *rmt-mutex*) - ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 20") - ;; (rmt:send-receive cmd rid params attemptnum: (add1 attemptnum))) + (server:kind-run *toppath*))) + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") + (rmt:open-qry-close-locally cmd 0 params)) ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost (not (remote-conndat *runremote*))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -8,11 +8,11 @@ ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. (require-extension (srfi 18) extras tcp s11n) -(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils posix-extras) +(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils posix-extras matchable) ;; (use zmq) (use spiffy uri-common intarweb http-client spiffy-request-vars) (declare (unit server)) @@ -109,16 +109,16 @@ (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) (target-host (car homehost)) (testsuite (common:get-testsuite-name)) - (logfile (conc areapath "/logs/server-" curr-pid "-" target-host ".log")) + (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) (cmdln (conc (common:get-megatest-exe) " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") " -daemonize " "") - " -log " logfile + ;; " -log " logfile " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") @@ -165,30 +165,33 @@ ;; get a list of servers with all relevant data ;; ( mod-time host port start-time ) ;; (define (server:get-list areapath) - (if (directory-exists? areapath) - (let ((server-logs (glob (conc areapath "/logs/server-*.log")))) - (if (null? server-logs) - '() - (let loop ((hed (car server-logs)) - (tal (cdr server-logs)) - (res '())) - (let* ((mod-time (file-modification-time hed)) - (serv-dat (server:logf-get-start-info hed)) - (serv-rec (cons mod-time serv-dat)) - (new-res (cons serv-rec res))) - (if (null? tal) - new-res - (loop (car tal)(cdr tal) new-res)))))))) + (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))) + (if (directory-exists? areapath) + (let ((server-logs (glob (conc areapath "/logs/server-*.log")))) + (if (null? server-logs) + '() + (let loop ((hed (car server-logs)) + (tal (cdr server-logs)) + (res '())) + (let* ((mod-time (file-modification-time hed)) + (serv-dat (server:logf-get-start-info hed)) + (serv-rec (cons mod-time serv-dat)) + (fmatch (string-match fname-rx hed)) + (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) + (new-res (cons (append serv-rec (list pid)) res))) + (if (null? tal) + new-res + (loop (car tal)(cdr tal) new-res))))))))) ;; given a list of servers get a list of valid servers, i.e. at least ;; 10 seconds old, has started and is less than 1 hour old and is ;; active (i.e. mod-time < 10 seconds ;; -;; mod-time host port start-time +;; mod-time host port start-time pid ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; and servers should stick around for about two hours or so. ;; (define (server:get-best srvlst) @@ -195,21 +198,27 @@ (let ((now (current-seconds))) (sort (filter (lambda (rec) (let ((start-time (list-ref rec 3)) (mod-time (list-ref rec 0))) - (print "start-time: " start-time " mod-time: " mod-time) + ;; (print "start-time: " start-time " mod-time: " mod-time) (and start-time mod-time (> (- now start-time) 1) ;; been running at least 1 seconds (< (- now mod-time) 10) ;; still alive - file touched in last 10 seconds (< (- now start-time) 3600) ;; under one hour running time ))) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) - + +(define (server:record->url servr) + (match-let (((mod-time host port start-time pid) + servr)) + (if (and host port) + (conc host ":" port) + #f))) (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) @@ -360,31 +369,40 @@ ;; (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - dotserver-url("dotserver-url") did not match hostport pattern ("hostport")")))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) - (let* ((dotserver-url (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db))) + (let* ((servers (server:get-best (server:get-list areapath))) + (best-server (if (null? servers) #f (car servers))) + (dotserver-url (if best-server + (server:record->url best-server) + #f))) ;; (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db))) (if dotserver-url (let* ((res (case *transport-type* ((http)(server:ping-server dotserver-url)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res dotserver-url (begin - (server:remove-dotserver-file areapath ".*") ;; remove stale dotserver + (server:kill best-server) #f))) #f))) +(define (server:kill servr) + (match-let (((mod-time hostname port start-time pid) + servr)) + (tasks:kill-server hostname pid))) + ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; (define (server:ping host-port-in #!key (do-exit #f)) (let ((host:port (if (not host-port-in) ;; use read-dotserver to find - (server:read-dotserver->url *toppath*) + (server:check-if-running *toppath*) (if (number? host-port-in) ;; we were handed a server-id (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) ;; (print "srec: " srec " host-port-in: " host-port-in) (if srec (conc (vector-ref srec 3) ":" (vector-ref srec 4)) Index: utils/mk_wrapper ================================================================== --- utils/mk_wrapper +++ utils/mk_wrapper @@ -28,21 +28,21 @@ echo "#!/bin/bash" > $target if [[ $cmd =~ dboard ]]; then cat >> $target <<'EOF' -# disable if not running on homehost -if [[ -e .homehost ]]; then - homehostname=$( host `cat .homehost` | awk '{print $NF}' | sed 's/\.$//' ) - hostname=$( hostname -f ) - - if [[ ! ($homehostname == $hostname) ]]; then - echo "ERROR: this host ($hostname) is not the homehost ($homehostname) for this megatest run area. Cannot start dashboard." - echo " Please log into homehost before launching dashboard." - exit 1 - fi -fi +# # disable if not running on homehost +# if [[ -e .homehost ]]; then +# homehostname=$( host `cat .homehost` | awk '{print $NF}' | sed 's/\.$//' ) +# hostname=$( hostname -f ) +# +# if [[ ! ($homehostname == $hostname) ]]; then +# echo "ERROR: this host ($hostname) is not the homehost ($homehostname) for this megatest run area. Cannot start dashboard." +# echo " Please log into homehost before launching dashboard." +# exit 1 +# fi +# fi # check that $DISPLAY is set if [[ -z $DISPLAY ]]; then echo 'ERROR: $DISPLAY environment variable is not set; megatest dashboard requires X display address to be set in $DISPLAY.' exit 1