Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -160,11 +160,11 @@ (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) - (server-timeout (server:get-timeout)) ;; default from server:get-timeout + (server-timeout (server:expiration-timeout)) (force-server #f) (ro-mode #f) (ro-mode-checked #f)) ;; flag that indicates we have checked for ro-mode ;; launching and hosts Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -383,11 +383,11 @@ (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) (port (cadr server-info)) (last-access 0) - (server-timeout (server:get-timeout)) + (server-timeout (server:expiration-timeout)) (server-going #f) (server-log-file (args:get-arg "-log"))) ;; always set when we are a server (with-output-to-file started-file (lambda ()(print (current-process-id)))) @@ -440,21 +440,15 @@ (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) - (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) - (adjusted-timeout (if (> hrs-since-start 1) - (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour - server-timeout))) - (if (common:low-noise-print 120 "server timeout") - (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) + (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) (cond ((and *server-run* (> (+ last-access server-timeout) - (current-seconds)) - (< (- (current-seconds) server-start-time) (configf:lookup-number *configdat* "server" "time-to-die-seconds" default: (* 3600 700 2) ) )) ;; do not update log or touch log if we've been running for more than one hour. + (current-seconds))) (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -467,58 +467,58 @@ (setenv "MT_TESTSUITENAME" areaname) (setenv "MT_RUN_AREA_HOME" top-path) (set! *toppath* top-path) (setenv "MT_TEST_RUN_DIR" work-area) - ;; On NFS it can be slow and unreliable to get needed startup information. - ;; i. Check if we are on the homehost, if so, proceed - ;; ii. Check if host and port passed in via CMDINFO are valid and if - ;; possible use them. - (let ((bestadrs (server:get-best-guess-address (get-host-name))) - (needcare #f)) - (if (equal? homehost bestadrs) ;; we are likely on the homehost - (debug:print-info 0 *default-log-port* "test " test-name " appears to be running on the homehost " homehost) - (let ((host-port (if serverurl (string-split serverurl ":") #f))) - (if (not *runremote*)(set! *runremote* (make-remote))) ;; init *runremote* - (if (string? homehost) - (if (and host-port - (> (length host-port) 1)) - (let* ((host (car host-port)) - (port (cadr host-port)) - (start-res (http-transport:client-connect host port)) - (ping-res (rmt:login-no-auto-client-setup start-res))) - (if (and start-res - ping-res) - ;; (begin ;; let ((url (http-transport:server-dat-make-url start-res))) - (begin - (remote-conndat-set! *runremote* start-res) - ;; (remote-server-url-set! *runremote* url) - ;; (if (server:ping url) - (debug:print-info 0 *default-log-port* "connected to " host ":" port " using CMDINFO data.")) - (begin - (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " host ":" port) - (set! *runremote* #f)) - ;; (remote-conndat-set! *runremote* #f)) - )) - (begin - (set! *runremote* #f) - (debug:print-info 0 *default-log-port* (if host-port - (conc "received invalid host-port information " host-port) - "no host-port information received")) - ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare. - (set! needcare #t))) - (begin - (set! *runremote* #f) - (debug:print-info 0 *default-log-port* "received no homehost information. Please report this to support as it should not happen.") - (set! needcare #t))))) - (if needcare ;; due to very slow NFS we will do a brute force mkdir to ensure that the directory inode it truly available on this host - (let ((logdir (conc top-path "/logs"))) ;; we'll try to create this directory - (handle-exceptions - exn - (debug:print 0 *default-log-port* "Failed to create directory " logdir " expect problems, message: " ((condition-property-accessor 'exn 'message) exn)) - (create-directory logdir #t))))) - + ;; ;; On NFS it can be slow and unreliable to get needed startup information. + ;; ;; i. Check if we are on the homehost, if so, proceed + ;; ;; ii. Check if host and port passed in via CMDINFO are valid and if + ;; ;; possible use them. + ;; (let ((bestadrs (server:get-best-guess-address (get-host-name))) + ;; (needcare #f)) + ;; (if (equal? homehost bestadrs) ;; we are likely on the homehost + ;; (debug:print-info 0 *default-log-port* "test " test-name " appears to be running on the homehost " homehost) + ;; (let ((host-port (if serverurl (string-split serverurl ":") #f))) + ;; (if (not *runremote*)(set! *runremote* (make-remote))) ;; init *runremote* + ;; (if (string? homehost) + ;; (if (and host-port + ;; (> (length host-port) 1)) + ;; (let* ((host (car host-port)) + ;; (port (cadr host-port)) + ;; (start-res (http-transport:client-connect host port)) + ;; (ping-res (rmt:login-no-auto-client-setup start-res))) + ;; (if (and start-res + ;; ping-res) + ;; ;; (begin ;; let ((url (http-transport:server-dat-make-url start-res))) + ;; (begin + ;; (remote-conndat-set! *runremote* start-res) + ;; ;; (remote-server-url-set! *runremote* url) + ;; ;; (if (server:ping url) + ;; (debug:print-info 0 *default-log-port* "connected to " host ":" port " using CMDINFO data.")) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " host ":" port) + ;; (set! *runremote* #f)) + ;; ;; (remote-conndat-set! *runremote* #f)) + ;; )) + ;; (begin + ;; (set! *runremote* #f) + ;; (debug:print-info 0 *default-log-port* (if host-port + ;; (conc "received invalid host-port information " host-port) + ;; "no host-port information received")) + ;; ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare. + ;; (set! needcare #t))) + ;; (begin + ;; (set! *runremote* #f) + ;; (debug:print-info 0 *default-log-port* "received no homehost information. Please report this to support as it should not happen.") + ;; (set! needcare #t))))) + ;; (if needcare ;; due to very slow NFS we will do a brute force mkdir to ensure that the directory inode it truly available on this host + ;; (let ((logdir (conc top-path "/logs"))) ;; we'll try to create this directory + ;; (handle-exceptions + ;; exn + ;; (debug:print 0 *default-log-port* "Failed to create directory " logdir " expect problems, message: " ((condition-property-accessor 'exn 'message) exn)) + ;; (create-directory logdir #t))))) + ;; ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) (if (or (common:file-exists? top-path) (> count 10)) (change-directory top-path) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -128,24 +128,25 @@ ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout) ;; - ;; ;;DOT CASE4 [label="reset\nconnection"]; - ;; ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} - ;; ;;DOT CASE4 -> "rmt:send-receive"; - ;; ;; reset the connection if it has been unused too long - ;; ((and runremote - ;; (remote-conndat runremote) - ;; (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #.\n message: Server closed connection before sending response" - ;; (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) - ;; (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") - ;; (http-transport:close-connections area-dat: runremote) - ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. - ;; (mutex-unlock! *rmt-mutex*) - ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) - + ;;DOT CASE4 [label="reset\nconnection"]; + ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} + ;;DOT CASE4 -> "rmt:send-receive"; + ;; reset the connection if it has been unused too long + ((and runremote + (remote-conndat runremote) + (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on + (+ (http-transport:server-dat-get-last-access (remote-conndat runremote)) + (remote-server-timeout runremote)))) + (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") + (http-transport:close-connections area-dat: runremote) + (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. + (mutex-unlock! *rmt-mutex*) + (rmt:send-receive cmd rid params attemptnum: attemptnum)) + ;;DOT CASE5 [label="local\nread"]; ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; ;;DOT CASE5 -> "rmt:open-qry-close-locally"; ;; on homehost and this is a read ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required @@ -244,12 +245,16 @@ (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (and (vector? conninfo) (> 5 (vector-length conninfo))) + + (http-transport:server-dat-update-last-access conninfo) ;; refresh access time - (begin + + + (begin (set! conninfo #f) (remote-conndat-set! runremote #f))) ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) (mutex-unlock! *rmt-mutex*) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -439,20 +439,18 @@ (set! *db-last-access* (current-seconds)) ;; might not be needed. (if (equal? *toppath* toppath) #t #f))) -;; timeout is in hours -(define (server:get-timeout) - (let ((tmo (configf:lookup *configdat* "server" "timeout"))) +;; timeout is hms string: 1h 5m 3s, default is 1 minute +;; +(define (server:expiration-timeout) + (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) - (string->number tmo)) - (* 60 60 (string->number tmo)) - ;; (* 3 24 60 60) ;; default to three days - ;;(* 60 60 1) ;; default to one hour - (* 60 5) ;; default to five minutes - ))) + (common:hms-string->seconds tmo)) + (string->number tmo) + 60))) ;; moving this here as it needs access to db and cannot be in common. ;; (define (server:writable-watchdog dbstruct) (thread-sleep! 0.05) ;; delay for startup