Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -1,7 +1,7 @@ ;; Always use two digit decimal ;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.5106) +(define megatest-version 1.5107) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -156,10 +156,11 @@ ":tol" ":units" ;; misc "-server" "-killserver" + "-port" "-extract-ods" "-pathmod" "-env2file" "-setvars" "-set-state-status" @@ -304,12 +305,12 @@ (begin (open-run-close tasks:server-deregister tasks:open-db hostname port: port) (if status ;; #t means alive (begin (if (equal? hostname (get-host-name)) - (process-signal pid signal/term) - (cdb:kill-server zmq-socket)) + (process-signal pid signal/term) ;; local machine, send sig term + (cdb:kill-server zmq-socket)) ;; remote machine, try telling server to commit suicide (debug:print-info 1 "Killed server by host:port at " hostname ":" port)) (debug:print-info 1 "Removing defunct server record for " hostname ":" port)) (set! killed #t))) (if (and kpid ;; (equal? hostname (car khost-port)) @@ -317,11 +318,11 @@ (begin (open-run-close tasks:server-deregister tasks:open-db hostname pid: pid) (set! killed #t) (if status (if (equal? hostname (get-host-name)) - (process-signal pid signal/term) + (process-signal pid signal/term) ;; local machine, send sig term (debug:print 0 "WARNING: Can't kill a dead server on host " hostname))) (debug:print-info 1 "Killed server by pid at " hostname ":" port))) ;; (if zmq-socket (close-socket zmq-socket)) (format #t fmtstr id mt-ver pid hostname interface port start-time priority (if status "alive" "dead")))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -27,10 +27,12 @@ (define (server:make-server-url hostport) (if (not hostport) #f (conc "tcp://" (car hostport) ":" (cadr hostport)))) +(define *server-loop-heart-beat* (list 'start (current-seconds))) + (define (server:run hostn) (debug:print 0 "Attempting to start the server ...") (if (not *toppath*) (if (not (setup-for-run)) (begin @@ -44,11 +46,14 @@ (ipaddrstr (let ((ipstr (if (string=? "-" hostn) (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") #f))) (if ipstr ipstr hostname)))) ;; (set! zmq-socket (server:find-free-port-and-open iface zmq-socket 5555 0)) - (set! zmq-socket (server:find-free-port-and-open ipaddrstr zmq-socket 5555 0)) + (set! zmq-socket (server:find-free-port-and-open ipaddrstr zmq-socket (if (args:get-arg "-port") + (string->number (args:get-arg "-port")) + 5555) + 0)) (set! *cache-on* #t) ;; what to do when we quit ;; (on-exit (lambda () @@ -67,13 +72,21 @@ (loop)))))))) ;; The heavy lifting ;; (let loop () + ;; Ugly yuk. + (mutex-lock! *incoming-mutex*) + (set! *server-loop-heart-beat* (list 'waiting (current-seconds))) + (mutex-unlock! *incoming-mutex*) (let* ((rawmsg (receive-message* zmq-socket)) (params (db:string->obj rawmsg)) ;; (with-input-from-string rawmsg (lambda ()(deserialize)))) (res #f)) + ;;; Ugly yuk. + (mutex-lock! *incoming-mutex*) + (set! *server-loop-heart-beat* (list 'working (current-seconds))) + (mutex-unlock! *incoming-mutex*) (debug:print-info 12 "server=> received params=" params) (set! res (cdb:cached-access params)) (debug:print-info 12 "server=> processed res=" res) (send-message zmq-socket (db:obj->string res)) (if (not *time-to-exit*) @@ -94,12 +107,24 @@ (thread-sleep! 3) ;; no need to do this very often (db:write-cached-data) ;; (print "Server running, count is " count) (if (< count 2) ;; 3x3 = 9 secs aprox (loop (+ count 1)) - (let ((numrunning (open-run-close db:get-count-tests-running #f))) - (open-run-close tasks:server-update-heartbeat tasks:open-db *server-id*) + (let ((numrunning (open-run-close db:get-count-tests-running #f)) + (server-loop-heartbeat #f)) + ;;; Ugly yuk. + (mutex-lock! *incoming-mutex*) + (set! server-loop-heartbeat *server-loop-heart-beat*) + (mutex-unlock! *incoming-mutex*) + ;; The logic here is that if the server loop gets stuck blocked in working + ;; we don't want to update our heartbeat + (let ((server-state (car server-loop-heartbeat)) + (server-update (cadr server-loop-heartbeat))) + (if (or (eq? server-state 'waiting) + (< (- (current-seconds) server-update) 10)) + (open-run-close tasks:server-update-heartbeat tasks:open-db *server-id*) + (debug:print "ERROR: No heartbeat update, server appears stuck"))) (if (or (> numrunning 0) ;; stay alive for two days after last access (> (+ *last-db-access* (* 48 60 60))(current-seconds))) (begin (debug:print-info 2 "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*)) (loop 0)) @@ -172,11 +197,11 @@ (cdb:logout zmq-socket *toppath* (server:get-client-signature))))) ;; (close-socket zmq-socket) ok)) ;; Do all the connection work, start a server if not already running -(define (server:client-setup #!key (numtries 10)(do-ping #f)) +(define (server:client-setup #!key (numtries 50)(do-ping #f)) (if (not *toppath*) (if (not (setup-for-run)) (begin (debug:print 0 "ERROR: failed to find megatest.config, exiting") (exit)))) @@ -212,11 +237,11 @@ (debug:print-info 1 "No server available, attempting to start one...") (process-run exe (list "-server" "-" "-debug" (conc *verbosity*))) (sleep 2) ;; not doing ping, assume the server started and registered itself (server:client-setup numtries: (- numtries 1) do-ping: #f)) - (debug:print-info 1 "Too many retries, giving up"))))) + (debug:print-info 1 "Too many attempts, giving up"))))) (define (server:launch) (if (not *toppath*) (if (not (setup-for-run)) (begin Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -119,11 +119,11 @@ (heartbeat-delta 99e9)) (sqlite3:for-each-row (lambda (delta) (set! heartbeat-delta delta)) mdb "SELECT strftime('%s','now')-heartbeat FROM servers WHERE id=?;" server-id) - (> heartbeat-delta 10))) + (< heartbeat-delta 10))) (define (tasks:client-register mdb pid hostname cmdline) (sqlite3:execute mdb "INSERT OR REPLACE INTO clients (server_id,pid,hostname,cmdline,login_time) VALUES(?,?,?,?,strftime('%s','now'));")