Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -46,10 +46,12 @@ (define *runremote* #f) ;; if set up for server communication this will hold (define *last-db-access* (current-seconds)) ;; update when db is accessed via server (define *max-cache-size* 0) (define *logged-in-clients* (make-hash-table)) (define *client-non-blocking-mode* #f) +(define *server-id* #f) +(define *time-to-exit* #f) (define *target* (make-hash-table)) ;; cache the target here; target is keyval1/keyval2/.../keyvalN (define *keys* (make-hash-table)) ;; cache the keys here (define *keyvals* (make-hash-table)) (define *toptest-paths* (make-hash-table)) ;; cache toptest path settings here Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1104,16 +1104,18 @@ (debug:print-info 12 "cdb:cached-access qry-name=" qry-name " params=" params) (if (not cached?)(db:write-cached-data)) ;; Any special calls are dispatched here. ;; Remainder are put in the db queue (case qry-name - ((login) ;; login checks that the megatest path matches - (if (< (length remparam) 2) ;; should get toppath and signature + ((login) ;; login checks that the megatest path and version matches + (if (< (length remparam) 3) ;; should get toppath, version and signature '(#f "login failed due to missing params") ;; missing params - (let ((calling-path (car remparam)) - (client-key (cadr remparam))) - (if (equal? calling-path *toppath*) + (let ((calling-path (car remparam)) + (calling-vers (cadr remparam)) + (client-key (caddr remparam))) + (if (and (equal? calling-path *toppath*) + (equal? megatest-version calling-vers)) (begin (hash-table-set! *logged-in-clients* client-key (current-seconds)) '(#t "successful login")) ;; path matches - pass! Should vet the caller at this time ... (list #f (conc "Login failed due to mismatch paths: " calling-path ", " *toppath*)))))) ((logout) @@ -1184,11 +1186,11 @@ (define (cdb:set-verbosity zmq-socket val) (cdb:client-call zmq-socket 'set-verbosity #f val)) (define (cdb:login zmq-socket keyval signature) - (cdb:client-call zmq-socket 'login #t keyval signature)) + (cdb:client-call zmq-socket 'login #t keyval megatest-version signature)) (define (cdb:logout zmq-socket keyval signature) (cdb:client-call zmq-socket 'logout #t keyval signature)) (define (cdb:num-clients zmq-socket) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -272,14 +272,14 @@ (if (or (args:get-arg "-listservers") (args:get-arg "-killserver")) (let ((tl (setup-for-run))) (if tl (let ((servers (open-run-close tasks:get-all-servers tasks:open-db)) - (fmtstr "~5a~8a~20a~5a~20a~9a~20a~5a\n") + (fmtstr "~5a~8a~8a~20a~5a~20a~9a~20a\n") (servers-to-kill '())) - (format #t fmtstr "Id" "Pid" "Host" "Port" "Time" "Priority" "State" "Num Clients") - (format #t fmtstr "==" "===" "====" "====" "====" "========" "=====" "===========") + (format #t fmtstr "Id" "MTver" "Pid" "Host" "Port" "Time" "Priority" "State") + (format #t fmtstr "==" "=====" "===" "====" "====" "====" "========" "=====") (for-each (lambda (server) (let* ((killinfo (args:get-arg "-killserver")) (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f)) (kpid (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f)) @@ -288,13 +288,12 @@ (hostname (vector-ref server 2)) (port (vector-ref server 3)) (start-time (vector-ref server 4)) (priority (vector-ref server 5)) (state (vector-ref server 6)) - (stat-numc (server:ping hostname port)) - (status (car stat-numc)) - (numclients (cadr stat-numc)) + (mt-ver (vector-ref server 7)) + (status (open-run-close tasks:server-alive? tasks:open-db hostname port: port)) (killed #f) (zmq-socket (if status (server:client-connect hostname port) #f))) ;; no need to login as status of #t indicates we are connecting to correct ;; server (if (or (not status) ;; no point in keeping dead records in the db @@ -316,12 +315,12 @@ (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid) (set! killed #t) (if status (cdb:kill-server zmq-socket)) (debug:print-info 1 "Killed server by pid at " hostname ":" port))) ;; (if zmq-socket (close-socket zmq-socket)) - (format #t fmtstr id pid hostname port start-time priority - status numclients))) + (format #t fmtstr id mt-ver pid hostname port start-time priority + status))) servers) (debug:print-info 1 "Done with listservers") (exit) ;; must do, would have to add checks to many/all calls below (set! *didsomething* #t)))) ;; if not list or kill then start a client (if appropriate) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -26,11 +26,10 @@ (define (server:make-server-url hostport) (if (not hostport) #f (conc "tcp://" (car hostport) ":" (cadr hostport)))) -(define *time-to-exit* #f) (define (server:run hostn) (debug:print 0 "Attempting to start the server ...") (if (not *toppath*)(setup-for-run)) (let* ((zmq-socket #f) @@ -85,23 +84,24 @@ ;; server last used then start shutdown (let loop ((count 0)) (thread-sleep! 3) ;; no need to do this very often (db:write-cached-data) ;; (print "Server running, count is " count) - (if (< count 10) + (if (< count 2) ;; 3x3 = 9 secs aprox (loop (+ count 1)) (let ((numrunning (open-run-close db:get-count-tests-running #f))) + (open-run-close tasks:server-update-heartbeat tasks:open-db *server-id*) (if (or (> numrunning 0) ;; stay alive for two days after last access (> (+ *last-db-access* (* 48 60 60))(current-seconds))) (begin (debug:print-info 2 "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*)) (loop 0)) (begin (debug:print-info 0 "Starting to shutdown the server.") ;; need to delete only *my* server entry (future use) (set! *time-to-exit* #t) - (open-run-close tasks:server-deregister-self tasks:open-db #f) + (open-run-close tasks:server-deregister-self tasks:open-db) (thread-sleep! 1) (debug:print-info 0 "Max cached queries was " *max-cache-size*) (debug:print-info 0 "Server shutdown complete. Exiting") (exit))))))) @@ -120,11 +120,11 @@ (let ((zmq-url (conc "tcp://" host ":" p))) (print "Trying to start server on " zmq-url) (bind-socket s zmq-url) (set! *runremote* #f) (debug:print 0 "Server started on " zmq-url) - (open-run-close tasks:server-register tasks:open-db (current-process-id) host p 0 'live) + (set! *server-id* (open-run-close tasks:server-register tasks:open-db (current-process-id) host p 0 'live)) s)))) (define (server:mk-signature) (message-digest-string (md5-primitive) (with-output-to-string @@ -166,12 +166,12 @@ (define (server:client-setup #!key (numtries 10)(do-ping #f)) (if (not *toppath*)(setup-for-run)) (let ((hostinfo (open-run-close tasks:get-best-server tasks:open-db do-ping: do-ping))) (if hostinfo (let ((host (car hostinfo)) - (port (cadr hostinfo)) - (zsocket (caddr hostinfo))) + (port (cadr hostinfo))) + ;; (zsocket (caddr hostinfo))) ;; (set! *runremote* zsocket)) (let* ((host (car hostinfo)) (port (cadr hostinfo))) (debug:print-info 2 "Setting up to connect to " hostinfo) (handle-exceptions @@ -221,11 +221,11 @@ (thread-join! th3)) (debug:print 0 "ERROR: Failed to setup for megatest")))))) (define (server:client-launch #!key (do-ping #f)) (if (server:client-setup do-ping: do-ping) - (debug:print-info 0 "connected as client") + (debug:print-info 2 "connected as client") (begin (debug:print 0 "ERROR: Failed to connect as client") (exit)))) ;; ping a server and return number of clients or #f (if no response) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -54,11 +54,13 @@ hostname TEXT, port INTEGER, start_time TIMESTAMP, priority INTEGER, state TEXT, - CONSTRAINT servers_constraint UNIQUE (pid,hostname));") + mt_version TEXT, + heartbeat TIMESTAMP, + CONSTRAINT servers_constraint UNIQUE (pid,hostname,port));") (sqlite3:execute mdb "CREATE TABLE IF NOT EXISTS clients (id INTEGER PRIMARY KEY, server_id INTEGER, pid INTEGER, hostname TEXT, cmdline TEXT, @@ -75,12 +77,13 @@ ;; state: 'live, 'shutting-down, 'dead (define (tasks:server-register mdb pid hostname port priority state) (sqlite3:execute mdb - "INSERT OR REPLACE INTO servers (pid,hostname,port,start_time,priority,state) VALUES(?,?,?,strftime('%s','now'),?,?);" - pid hostname port priority (conc state))) + "INSERT OR REPLACE INTO servers (pid,hostname,port,start_time,priority,state,mt_version,heartbeat) VALUES(?,?,?,strftime('%s','now'),?,?,?,strftime('%s','now'));" + pid hostname port priority (conc state) megatest-version) + (tasks:server-get-server-id mdb hostname port pid)) ;; NB// two servers with same pid on different hosts will be removed from the list if pid: is used! (define (tasks:server-deregister mdb hostname #!key (port #f)(pid #f)) (debug:print-info 11 "server-deregister " hostname ", port " port ", pid " pid) (if pid @@ -90,13 +93,36 @@ (debug:print 0 "ERROR: tasks:server-deregister called with neither pid nor port specified")))) (define (tasks:server-deregister-self mdb hostname) (tasks:server-deregister mdb hostname pid: (current-process-id))) -(define (tasks:server-get-server-id mdb) - ;; dunno yet - 0) +(define (tasks:server-get-server-id mdb host port pid) + (let ((res #f)) + (sqlite3:for-each-row + (lambda (id) + (set! res id)) + mdb + (if (and host pid) + "SELECT id FROM servers WHERE hostname=? AND pid=?;" + "SELECT id FROM servers WHERE hostname=? AND port=?;") + host (if pid pid port)) + res)) + +(define (tasks:server-update-heartbeat mdb server-id) + (sqlite3:execute mdb "UPDATE servers SET heartbeat=strftime('%s','now') WHERE id=?;" server-id)) + +;; alive servers keep the heartbeat field upto date with seconds every 6 or so seconds +(define (tasks:server-alive? mdb server-id #!key (hostname #f)(port #f)(pid #f)) + (let* ((server-id (if server-id + server-id + (tasks:server-get-server-id mdb hostname port pid))) + (heartbeat-delta 99e9)) + (sqlite3:for-each-row + (lambda (delta) + (set! heartbeat-delta delta)) + mdb "SELECT strftime('%s','now')-heartbeat FROM servers WHERE id=?;" server-id) + (< (- (current-seconds) heartbeat-delta) 10))) (define (tasks:client-register mdb pid hostname cmdline) (sqlite3:execute mdb "INSERT OR REPLACE INTO clients (server_id,pid,hostname,cmdline,login_time) VALUES(?,?,?,?,strftime('%s','now'));") @@ -129,23 +155,24 @@ (sqlite3:for-each-row (lambda (id hostname port) (set! res (cons (list hostname port) res)) (debug:print-info 1 "Found " hostname ":" port)) mdb - "SELECT id,hostname,port FROM servers WHERE state='live' ORDER BY start_time DESC LIMIT 1;") + "SELECT id,hostname,port FROM servers WHERE state='live' AND mt_version=? ORDER BY start_time DESC LIMIT 1;" megatest-version) ;; (print "res=" res) (if (null? res) #f (let loop ((hed (car res)) (tal (cdr res))) ;; (print "hed=" hed ", tal=" tal) (let* ((host (car hed)) (port (cadr hed)) - (ping-res (if do-ping (server:ping host port return-socket: #f) '(#t "NO PING" #f))) - (alive (car ping-res)) - (reason (cadr ping-res)) - (zsocket (caddr ping-res))) - (if alive (list host port zsocket) + ;; (ping-res (if do-ping (server:ping host port return-socket: #f) '(#t "NO PING" #f))) + (alive (open-run-close tasks:server-alive? tasks:open-db host port: port)) ;; (car ping-res)) + ;; (reason (cadr ping-res)) + ;; (zsocket (caddr ping-res)) + ) + (if alive (list host port) ;; remove defunct server from table (begin (open-run-close tasks:server-deregister tasks:open-db host port: port) (if (null? tal) #f @@ -152,14 +179,14 @@ (loop (car tal)(cdr tal)))))))))) (define (tasks:get-all-servers mdb) (let ((res '())) (sqlite3:for-each-row - (lambda (id pid hostname port start-time priority state) - (set! res (cons (vector id pid hostname port start-time priority state) res))) + (lambda (id pid hostname port start-time priority state mt-version) + (set! res (cons (vector id pid hostname port start-time priority state mt-version) res))) mdb - "SELECT id,pid,hostname,port,start_time,priority,state FROM servers ORDER BY start_time DESC;") + "SELECT id,pid,hostname,port,start_time,priority,state,mt_version FROM servers ORDER BY start_time DESC;") res)) ;;====================================================================== ;; Tasks and Task monitors Index: tests/Makefile ================================================================== --- tests/Makefile +++ tests/Makefile @@ -85,7 +85,10 @@ killall -v mtest dboard || true hardkill : kill sleep 5;killall -v mtest main.sh dboard -9 +listservers : + cd fullrun;$(MEGATEST) -listservers + runforever : while(ls); do runname=`date +%F-%R:%S`;(cd fullrun;$(MEGATEST) -runall -target ubuntu/nfs/none :runname $$runname;/home/matt/data/megatest/megatest -runall -target ubuntu/nfs/none :runname $$runname;/home/matt/data/megatest/megatest -runall -target ubuntu/nfs/none :runname $$runname);done