Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -233,11 +233,12 @@ ;;====================================================================== ;; Create the sqlite db for the individual test(s) (define (open-test-db testpath) (debug:print-info 11 "open-test-db " testpath) - (if (and (directory? testpath) + (if (and testpath + (directory? testpath) (file-read-access? testpath)) (let* ((dbpath (conc testpath "/testdat.db")) (dbexists (file-exists? dbpath)) (db (sqlite3:open-database dbpath)) ;; (never-give-up-open-db dbpath)) (handler (make-busy-timeout (if (args:get-arg "-override-timeout") Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -1,7 +1,7 @@ ;; Always use two digit decimal ;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.5105) +(define megatest-version 1.5106) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -272,27 +272,28 @@ (if (or (args:get-arg "-listservers") (args:get-arg "-killserver")) (let ((tl (setup-for-run))) (if tl (let ((servers (open-run-close tasks:get-all-servers tasks:open-db)) - (fmtstr "~5a~8a~8a~20a~5a~20a~9a~20a\n") + (fmtstr "~5a~8a~8a~20a~20a~10a~20a~10a~10a\n") (servers-to-kill '())) - (format #t fmtstr "Id" "MTver" "Pid" "Host" "Port" "Time" "Priority" "State") - (format #t fmtstr "==" "=====" "===" "====" "====" "====" "========" "=====") + (format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface" "Port" "Time" "Priority" "State") + (format #t fmtstr "==" "=====" "===" "====" "=========" "====" "====" "========" "=====") (for-each (lambda (server) (let* ((killinfo (args:get-arg "-killserver")) (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f)) (kpid (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f)) (id (vector-ref server 0)) (pid (vector-ref server 1)) (hostname (vector-ref server 2)) - (port (vector-ref server 3)) - (start-time (vector-ref server 4)) - (priority (vector-ref server 5)) - (state (vector-ref server 6)) - (mt-ver (vector-ref server 7)) + (interface (vector-ref server 3)) + (port (vector-ref server 4)) + (start-time (vector-ref server 5)) + (priority (vector-ref server 6)) + (state (vector-ref server 7)) + (mt-ver (vector-ref server 8)) (status (open-run-close tasks:server-alive? tasks:open-db hostname port: port)) (killed #f) (zmq-socket (if status (server:client-connect hostname port) #f))) ;; no need to login as status of #t indicates we are connecting to correct ;; server @@ -302,29 +303,35 @@ (equal? port (string->number (cadr khost-port))))) (begin (open-run-close tasks:server-deregister tasks:open-db hostname port: port) (if status ;; #t means alive (begin - (cdb:kill-server zmq-socket) + (if (equal? hostname (get-host-name)) + (process-signal pid signal/term) + (cdb:kill-server zmq-socket)) (debug:print-info 1 "Killed server by host:port at " hostname ":" port)) (debug:print-info 1 "Removing defunct server record for " hostname ":" port)) (set! killed #t))) (if (and kpid ;; (equal? hostname (car khost-port)) (equal? kpid pid)) ;;; YEP, ALL WITH PID WILL BE KILLED!!! (begin (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid) (set! killed #t) - (if status (cdb:kill-server zmq-socket)) + (if status + (if (equal? hostname (get-host-name)) + (process-signal pid signal/term) + (debug:print 0 "WARNING: Can't kill a dead server on host " hostname))) (debug:print-info 1 "Killed server by pid at " hostname ":" port))) ;; (if zmq-socket (close-socket zmq-socket)) - (format #t fmtstr id mt-ver pid hostname port start-time priority - status))) + (format #t fmtstr id mt-ver pid hostname interface port start-time priority + (if status "alive" "dead")))) servers) (debug:print-info 1 "Done with listservers") + (set! *didsomething* #t) (exit) ;; must do, would have to add checks to many/all calls below - (set! *didsomething* #t)) + ) (exit))) ;; if not list or kill then start a client (if appropriate) (if (or (args-defined? "-h" "-version" "-gen-megatest-area" "-gen-megatest-test") (eq? (length (hash-table-keys args:arg-hash)) 0)) (debug:print-info 1 "Server connection not needed") Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -35,26 +35,28 @@ (if (not (setup-for-run)) (begin (debug:print 0 "ERROR: cannot find megatest.config, cannot start server, exiting") (exit)))) (let* ((zmq-socket #f) - (hostname (if (string=? "-" hostn) - (get-host-name) + (iface (if (string=? "-" hostn) + "*" ;; (get-host-name) hostn)) + (hostname (get-host-name)) (ipaddrstr (let ((ipstr (if (string=? "-" hostn) (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") #f))) (if ipstr ipstr hostname)))) + ;; (set! zmq-socket (server:find-free-port-and-open iface zmq-socket 5555 0)) (set! zmq-socket (server:find-free-port-and-open ipaddrstr zmq-socket 5555 0)) (set! *cache-on* #t) ;; what to do when we quit ;; (on-exit (lambda () (if (and *toppath* *server-id*) (begin - (open-run-close tasks:server-deregister-self tasks:open-db #f)) + (open-run-close tasks:server-deregister-self tasks:open-db ipaddrstr)) (let loop () (let ((queue-len 0)) (thread-sleep! (random 5)) (mutex-lock! *incoming-mutex*) (set! queue-len (length *incoming-data*)) @@ -109,28 +111,31 @@ (thread-sleep! 1) (debug:print-info 0 "Max cached queries was " *max-cache-size*) (debug:print-info 0 "Server shutdown complete. Exiting") (exit))))))) -(define (server:find-free-port-and-open host s port #!key (trynum 50)) +(define (server:find-free-port-and-open iface s port #!key (trynum 50)) (let ((s (if s s (make-socket 'rep))) - (p (if (number? port) port 5555))) + (p (if (number? port) port 5555)) + (old-handler (current-exception-handler))) (handle-exceptions exn (begin (debug:print 0 "Failed to bind to port " p ", trying next port") (debug:print 0 " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn)) + ;; (old-handler) + ;; (print-call-chain) (if (> trynum 0) - (server:find-free-port-and-open host s (+ p 1) trynum: (- trynum 1)) - (debug:print-info 0 "Tried ports from " (- p trynum) " to " p + (server:find-free-port-and-open iface s (+ p 1) trynum: (- trynum 1)) + (debug:print-info 0 "Tried ports up to " p " but all were in use. Please try a different port range by starting the server with parameter \" -port N\" where N is the starting port number to use"))) - (let ((zmq-url (conc "tcp://" host ":" p))) + (let ((zmq-url (conc "tcp://" iface ":" p))) (print "Trying to start server on " zmq-url) (bind-socket s zmq-url) (set! *runremote* #f) (debug:print 0 "Server started on " zmq-url) - (set! *server-id* (open-run-close tasks:server-register tasks:open-db (current-process-id) host p 0 'live)) + (set! *server-id* (open-run-close tasks:server-register tasks:open-db (current-process-id) iface p 0 'live)) s)))) (define (server:mk-signature) (message-digest-string (md5-primitive) (with-output-to-string @@ -143,17 +148,17 @@ (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) *my-client-signature*))) ;; -(define (server:client-connect host port #!key (context #f)) - (debug:print 3 "client-connect " host ":" port) +(define (server:client-connect iface port #!key (context #f)) + (debug:print 3 "client-connect " iface ":" port) (let ((connect-ok #f) (zmq-socket (if context (make-socket 'req context) (make-socket 'req))) - (conurl (server:make-server-url (list host port)))) + (conurl (server:make-server-url (list iface port)))) (if (socket? zmq-socket) (begin (connect-socket zmq-socket conurl) zmq-socket) #f))) @@ -175,38 +180,35 @@ (begin (debug:print 0 "ERROR: failed to find megatest.config, exiting") (exit)))) (let ((hostinfo (open-run-close tasks:get-best-server tasks:open-db do-ping: do-ping))) (if hostinfo - (let ((host (car hostinfo)) - (port (cadr hostinfo))) - ;; (zsocket (caddr hostinfo))) - ;; (set! *runremote* zsocket)) - (let* ((host (car hostinfo)) - (port (cadr hostinfo))) - (debug:print-info 2 "Setting up to connect to " hostinfo) - (handle-exceptions - exn - (begin - (debug:print 0 "ERROR: Failed to open a connection to the server at: " hostinfo) - (debug:print 0 " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 0 " perhaps jobs killed with -9? Removing server records") - (open-run-close tasks:server-deregister tasks:open-db host port: port) - #f) - (let* ((zmq-socket (server:client-connect host port)) - (login-res (server:client-login zmq-socket)) - (connect-ok (if (null? login-res) #f (car login-res))) - (conurl (server:make-server-url hostinfo))) - (if connect-ok - (begin - (debug:print-info 2 "Logged in and connected to " conurl) - (set! *runremote* zmq-socket) - #t) - (begin - (debug:print-info 2 "Failed to login or connect to " conurl) - (set! *runremote* #f) - #f)))))) + (let ((host (car hostinfo)) + (iface (cadr hostinfo)) + (port (caddr hostinfo))) + (debug:print-info 2 "Setting up to connect to " hostinfo) + (handle-exceptions + exn + (begin + (debug:print 0 "ERROR: Failed to open a connection to the server at: " hostinfo) + (debug:print 0 " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn)) + (debug:print 0 " perhaps jobs killed with -9? Removing server records") + (open-run-close tasks:server-deregister tasks:open-db host port: port) + #f) + (let* ((zmq-socket (server:client-connect iface port)) + (login-res (server:client-login zmq-socket)) + (connect-ok (if (null? login-res) #f (car login-res))) + (conurl (server:make-server-url (list iface port)))) + (if connect-ok + (begin + (debug:print-info 2 "Logged in and connected to " conurl) + (set! *runremote* zmq-socket) + #t) + (begin + (debug:print-info 2 "Failed to login or connect to " conurl) + (set! *runremote* #f) + #f))))) (if (> numtries 0) (let ((exe (car (argv)))) (debug:print-info 1 "No server available, attempting to start one...") (process-run exe (list "-server" "-" "-debug" (conc *verbosity*))) (sleep 2) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -49,10 +49,11 @@ hostname TEXT, username TEXT, CONSTRAINT monitors_constraint UNIQUE (pid,hostname));") (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS servers (id INTEGER PRIMARY KEY, pid INTEGER, + interface TEXT, hostname TEXT, port INTEGER, start_time TIMESTAMP, priority INTEGER, state TEXT, @@ -74,16 +75,16 @@ ;;====================================================================== ;; Server and client management ;;====================================================================== ;; state: 'live, 'shutting-down, 'dead -(define (tasks:server-register mdb pid hostname port priority state) +(define (tasks:server-register mdb pid interface port priority state) (sqlite3:execute mdb - "INSERT OR REPLACE INTO servers (pid,hostname,port,start_time,priority,state,mt_version,heartbeat) VALUES(?,?,?,strftime('%s','now'),?,?,?,strftime('%s','now'));" - pid hostname port priority (conc state) megatest-version) - (tasks:server-get-server-id mdb hostname port pid)) + "INSERT OR REPLACE INTO servers (pid,hostname,port,start_time,priority,state,mt_version,heartbeat,interface) VALUES(?,?,?,strftime('%s','now'),?,?,?,strftime('%s','now'),?);" + pid (get-host-name) port priority (conc state) megatest-version interface) + (tasks:server-get-server-id mdb (get-host-name) port pid)) ;; NB// two servers with same pid on different hosts will be removed from the list if pid: is used! (define (tasks:server-deregister mdb hostname #!key (port #f)(pid #f)) (debug:print-info 11 "server-deregister " hostname ", port " port ", pid " pid) (if pid @@ -93,20 +94,20 @@ (debug:print 0 "ERROR: tasks:server-deregister called with neither pid nor port specified")))) (define (tasks:server-deregister-self mdb hostname) (tasks:server-deregister mdb hostname pid: (current-process-id))) -(define (tasks:server-get-server-id mdb host port pid) +(define (tasks:server-get-server-id mdb hostname port pid) (let ((res #f)) (sqlite3:for-each-row (lambda (id) (set! res id)) mdb - (if (and host pid) + (if (and hostname pid) "SELECT id FROM servers WHERE hostname=? AND pid=?;" "SELECT id FROM servers WHERE hostname=? AND port=?;") - host (if pid pid port)) + hostname (if pid pid port)) res)) (define (tasks:server-update-heartbeat mdb server-id) (sqlite3:execute mdb "UPDATE servers SET heartbeat=strftime('%s','now') WHERE id=?;" server-id)) @@ -118,11 +119,11 @@ (heartbeat-delta 99e9)) (sqlite3:for-each-row (lambda (delta) (set! heartbeat-delta delta)) mdb "SELECT strftime('%s','now')-heartbeat FROM servers WHERE id=?;" server-id) - (< (- (current-seconds) heartbeat-delta) 10))) + (> heartbeat-delta 10))) (define (tasks:client-register mdb pid hostname cmdline) (sqlite3:execute mdb "INSERT OR REPLACE INTO clients (server_id,pid,hostname,cmdline,login_time) VALUES(?,?,?,?,strftime('%s','now'));") @@ -151,28 +152,41 @@ ;; remove any others. will not necessarily remove all! (define (tasks:get-best-server mdb #!key (do-ping #f)) (let ((res '()) (best #f)) (sqlite3:for-each-row - (lambda (id hostname port) - (set! res (cons (list hostname port) res)) + (lambda (id hostname interface port pid) + (set! res (cons (list hostname interface port pid) res)) (debug:print-info 1 "Found " hostname ":" port)) mdb - "SELECT id,hostname,port FROM servers WHERE state='live' AND mt_version=? ORDER BY start_time DESC LIMIT 1;" megatest-version) + "SELECT id,hostname,interface,port,pid FROM servers WHERE state='live' AND mt_version=? ORDER BY start_time DESC LIMIT 1;" megatest-version) ;; (print "res=" res) (if (null? res) #f (let loop ((hed (car res)) (tal (cdr res))) ;; (print "hed=" hed ", tal=" tal) - (let* ((host (car hed)) - (port (cadr hed)) + (let* ((host (car hed)) + (iface (cadr hed)) + (port (caddr hed)) + (pid (cadddr hed)) ;; (ping-res (if do-ping (server:ping host port return-socket: #f) '(#t "NO PING" #f))) (alive (open-run-close tasks:server-alive? tasks:open-db host port: port)) ;; (car ping-res)) ;; (reason (cadr ping-res)) ;; (zsocket (caddr ping-res)) ) - (if alive (list host port) + (if alive + ;; (if (server:ping iface port) + (list host iface port) + ;; ;; not actually alive, destroy! + ;; (begin + ;; (if (equal? host (get-host-name)) + ;; (begin + ;; (debug:print-info 0 "Killing process " pid " on host " host " with signal/term") + ;; (send-signal pid signal/term)) + ;; (debug:print 0 "WARNING: Can't kill process " pid " on host " host)) + ;; (open-run-close tasks:server-deregister tasks:open-db host port: port) + ;; #f)) ;; remove defunct server from table (begin (open-run-close tasks:server-deregister tasks:open-db host port: port) (if (null? tal) #f @@ -179,14 +193,14 @@ (loop (car tal)(cdr tal)))))))))) (define (tasks:get-all-servers mdb) (let ((res '())) (sqlite3:for-each-row - (lambda (id pid hostname port start-time priority state mt-version) - (set! res (cons (vector id pid hostname port start-time priority state mt-version) res))) + (lambda (id pid hostname interface port start-time priority state mt-version) + (set! res (cons (vector id pid hostname interface port start-time priority state mt-version) res))) mdb - "SELECT id,pid,hostname,port,start_time,priority,state,mt_version FROM servers ORDER BY start_time DESC;") + "SELECT id,pid,hostname,interface,port,start_time,priority,state,mt_version FROM servers ORDER BY start_time DESC;") res)) ;;====================================================================== ;; Tasks and Task monitors