Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -1019,22 +1019,34 @@ (else (let* ((currhost (get-host-name)) (bestadrs (server:get-best-guess-address currhost)) ;; first look in config, then look in file .homehost, create it if not found (homehost (or (configf:lookup *configdat* "server" "homehost" ) - (let ((hhf (conc *toppath* "/.homehost"))) - (if (file-exists? hhf) - (with-input-from-file hhf read-line) - (if (file-write-access? *toppath*) - (begin - (with-output-to-file hhf - (lambda () - (print bestadrs))) - (begin - (mutex-unlock! *homehost-mutex*) - (car (common:get-homehost)))) - #f))))) + (handle-exceptions + exn + (if (> trynum 0) + (let ((delay-time (* (- 5 trynum) 5))) + (mutex-unlock! *homehost-mutex*) + (debug:print 0 *default-log-port* "ERROR: Failed to read .homehost file, delaying " delay-time " seconds and trying again, message: " ((condition-property-accessor 'exn 'message) exn)) + (thread-sleep! delay-time) + (common:get-homehost trynum: (- trynum 1))) + (begin + (mutex-unlock! *homehost-mutex*) + (debug:print 0 *default-log-port* "ERROR: Failed to read .homehost file after trying five times. Giving up and exiting, message: " ((condition-property-accessor 'exn 'message) exn)) + (exit 1))) + (let ((hhf (conc *toppath* "/.homehost"))) + (if (file-exists? hhf) + (with-input-from-file hhf read-line) + (if (file-write-access? *toppath*) + (begin + (with-output-to-file hhf + (lambda () + (print bestadrs))) + (begin + (mutex-unlock! *homehost-mutex*) + (car (common:get-homehost)))) + #f)))))) (at-home (or (equal? homehost currhost) (equal? homehost bestadrs)))) (set! *home-host* (cons homehost at-home)) (mutex-unlock! *homehost-mutex*) *home-host*)))) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -12,11 +12,11 @@ ;; launch a task - this runs on the originating host, tests themselves ;; ;;====================================================================== (use regex regex-case base64 sqlite3 srfi-18 directory-utils posix-extras z3 call-with-environment-variables csv) -(use typed-records pathname-expand) +(use typed-records pathname-expand matchable) (import (prefix base64 base64:)) (import (prefix sqlite3 sqlite3:)) (declare (unit launch)) @@ -413,13 +413,15 @@ (work-area (assoc/default 'work-area cmdinfo)) ;; work-area is the test run area (test-name (assoc/default 'test-name cmdinfo)) (runscript (assoc/default 'runscript cmdinfo)) (ezsteps (assoc/default 'ezsteps cmdinfo)) ;; (runremote (assoc/default 'runremote cmdinfo)) - (transport (assoc/default 'transport cmdinfo)) + ;; (transport (assoc/default 'transport cmdinfo)) ;; not used ;; (serverinf (assoc/default 'serverinf cmdinfo)) - (port (assoc/default 'port cmdinfo)) + ;; (port (assoc/default 'port cmdinfo)) + (serverurl (assoc/default 'serverurl cmdinfo)) + (homehost (assoc/default 'homehost cmdinfo)) (run-id (assoc/default 'run-id cmdinfo)) (test-id (assoc/default 'test-id cmdinfo)) (target (assoc/default 'target cmdinfo)) (itemdat (assoc/default 'itemdat cmdinfo)) (env-ovrd (assoc/default 'env-ovrd cmdinfo)) @@ -443,10 +445,51 @@ runscript))))) ;; assume it is on the path ) ;; (rollup-status 0) (if contour (setenv "MT_CONTOUR" contour)) + ;; On NFS it can be slow and unreliable to get needed startup information. + ;; i. Check if we are on the homehost, if so, proceed + ;; ii. Check if host and port passed in via CMDINFO are valid and if + ;; possible use them. + (let ((bestadrs (server:get-best-guess-address (get-host-name))) + (needcare #f)) + (if (equal? homehost bestadrs) ;; we are likely on the homehost + (debug:print-info 0 *default-log-port* "test " test-name " appears to be running on the homehost " homehost) + (let ((host-port (if serverurl (string-split serverurl ":") #f))) + (if (not *runremote*)(set! *runremote* (make-remote))) ;; init *runremote* + (if (string? homehost) + (if (and host-port + (> (length host-port) 1)) + (let* ((host (car host-port)) + (port (cadr host-port)) + (start-res (http-transport:client-connect host port)) + (ping-res (rmt:login-no-auto-client-setup start-res))) + (if (and start-res + ping-res) + (let ((url (http-transport:server-dat-make-url start-res))) + (remote-conndat-set! *runremote* start-res) + (remote-server-url-set! *runremote* url) + (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data.")) + (debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.") + )) + (begin + (debug:print-info 0 *default-log-port* (if host-port + (conc "received invalid host-port information " host-port) + "no host-port information received")) + ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare. + (set! needcare #t))) + (begin + (debug:print-info 0 *default-log-port* "received no homehost information. Please report this to support as it should not happen.") + (set! needcare #t))))) + (if needcare ;; due to very slow NFS we will do a brute force mkdir to ensure that the directory inode it truly available on this host + (let ((logdir (conc top-path "/logs"))) ;; we'll try to create this directory + (handle-exceptions + exn + (debug:print 0 *default-log-port* "Failed to create directory " logdir " expect problems, message: " ((condition-property-accessor 'exn 'message) exn)) + (create-directory logdir #t))))) + ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) (if (or (file-exists? top-path) (> count 10)) (change-directory top-path) @@ -1189,12 +1232,19 @@ (set! cmdparms (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) - (list 'transport (conc *transport-type*)) + ;; (list 'transport (conc *transport-type*)) ;; (list 'serverinf *server-info*) + (list 'homehost (let* ((hhdat (common:get-homehost))) + (if hhdat + (car hhdat) + #f))) + (list 'serverurl (if *runremote* + (remote-server-url *runremote*) + #f)) ;; (list 'toppath *toppath*) (list 'work-area work-area) (list 'test-name test-name) (list 'runscript runscript) (list 'run-id run-id ) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -36,15 +36,15 @@ (define (rmt:get-connection-info areapath #!key (area-dat #f)) ;; TODO: push areapath down. (let* ((runremote (or area-dat *runremote*)) (cinfo (if (remote? runremote) (remote-conndat runremote) #f))) - (if cinfo - cinfo - (if (server:check-if-running areapath) - (client:setup areapath) - #f)))) + (if cinfo + cinfo + (if (server:check-if-running areapath) + (client:setup areapath) + #f)))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;;