Megatest

Check-in [047e6962b4]
Login
Overview
Comment: Added messages for all possible paths in the launch:execute initial connection and NFS dir check section
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | robust-test-execute
Files: files | file ages | folders
SHA1: 047e6962b480b7d443d0dffb9be566c02a6a583b
User & Date: matt on 2017-03-08 22:37:20
Other Links: branch diff | manifest | tags
Context
2017-03-09
20:10
Added defense against an unreadable .homehost file (shouldn't happen!) Closed-Leaf check-in: 08cf77140c user: matt tags: homehost-protection
2017-03-08
22:37
Added messages for all possible paths in the launch:execute initial connection and NFS dir check section Closed-Leaf check-in: 047e6962b4 user: matt tags: robust-test-execute
09:58
fixed issues with Baishali last evening check-in: fb43245d3c user: bjbarcla tags: v1.63, v1.6311
Changes

Modified launch.scm from [946add6293] to [5c08c0419f].

444
445
446
447
448
449
450
451

452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
















471
472
473
474
475
476
477
                                              runscript))))) ;; assume it is on the path
	       ) ;; (rollup-status 0)

	  ;; On NFS it can be slow and unreliable to get needed startup information.
	  ;;  i. Check if we are on the homehost, if so, proceed
	  ;; ii. Check if host and port passed in via CMDINFO are valid and if
	  ;;     possible use them.
	  ;; Initial connection check.
	  ;; If homehost equals the best-guess address for this host, only a message is logged.
	  ;; Otherwise *runremote* is initialised if unset and, when homehost is a string and
	  ;; serverurl splits on ":" into at least two fields, those fields are used as host
	  ;; and port for a connection attempt.
	  ;; NOTE: the one-armed ifs below mean that a non-string homehost, or a missing or
	  ;; too-short host-port list, falls through silently with no message.
	  (let ((bestadrs (server:get-best-guess-address (get-host-name))))

	    (if (equal? homehost bestadrs) ;; we are likely on the homehost
		(debug:print-info 0 *default-log-port* "test " test-name " appears to be running on the homehost " homehost)
		;; host-port is the list of ":"-separated fields of serverurl, or #f when serverurl is false
		(let ((host-port (if serverurl (string-split serverurl ":") #f)))
		  (if (not *runremote*)(set! *runremote* (make-remote))) ;; init *runremote*
		  (if (string? homehost)
		      (if (and host-port
			       (> (length host-port) 1))
			  ;; let* binds sequentially, so ping-res is computed from start-res before
			  ;; start-res is tested below.
			  ;; NOTE(review): confirm rmt:login-no-auto-client-setup tolerates a false start-res
			  (let* ((host      (car host-port))
                                 (port      (cadr host-port))
                                 (start-res (http-transport:client-connect host port))
                                 (ping-res  (rmt:login-no-auto-client-setup start-res)))
			    (if (and start-res
				     ping-res)
				;; both calls returned non-false: record the connection data and url in *runremote*
				(let ((url  (http-transport:server-dat-make-url start-res)))
				  (remote-conndat-set! *runremote* start-res)
				  (remote-server-url-set! *runremote* url)
				  (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data."))
				(debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.")
				)))))))
















	  ;; NFS might not have propagated the directory meta data to the run host - give it time if needed
	  (let loop ((count 0))
	    (if (or (file-exists? top-path)
		    (> count 10))
		(change-directory top-path)
		(begin
		  (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " top-path " not found")







|
>


















|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
                                              runscript))))) ;; assume it is on the path
	       ) ;; (rollup-status 0)

	  ;; On NFS it can be slow and unreliable to get needed startup information.
	  ;;  i. Check if we are on the homehost, if so, proceed
	  ;; ii. Check if host and port passed in via CMDINFO are valid and if
	  ;;     possible use them.
	  ;; Initial connection and directory sanity check.
	  ;; If homehost equals the best-guess address for this host, only a message is logged.
	  ;; Otherwise *runremote* is initialised if unset and, when homehost is a string and
	  ;; serverurl splits on ":" into at least two fields, those fields are used as host
	  ;; and port for a connection attempt. Branches that lack a string homehost or usable
	  ;; host-port data log a message and set needcare, which triggers the mkdir at the end.
	  (let ((bestadrs (server:get-best-guess-address (get-host-name)))
		(needcare #f)) ;; becomes #t when homehost or host-port information is missing or invalid
	    (if (equal? homehost bestadrs) ;; we are likely on the homehost
		(debug:print-info 0 *default-log-port* "test " test-name " appears to be running on the homehost " homehost)
		;; host-port is the list of ":"-separated fields of serverurl, or #f when serverurl is false
		(let ((host-port (if serverurl (string-split serverurl ":") #f)))
		  (if (not *runremote*)(set! *runremote* (make-remote))) ;; init *runremote*
		  (if (string? homehost)
		      (if (and host-port
			       (> (length host-port) 1))
			  ;; let* binds sequentially, so ping-res is computed from start-res before
			  ;; start-res is tested below.
			  ;; NOTE(review): confirm rmt:login-no-auto-client-setup tolerates a false start-res
			  (let* ((host      (car host-port))
                                 (port      (cadr host-port))
                                 (start-res (http-transport:client-connect host port))
                                 (ping-res  (rmt:login-no-auto-client-setup start-res)))
			    (if (and start-res
				     ping-res)
				;; both calls returned non-false: record the connection data and url in *runremote*
				(let ((url  (http-transport:server-dat-make-url start-res)))
				  (remote-conndat-set! *runremote* start-res)
				  (remote-server-url-set! *runremote* url)
				  (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data."))
				;; NOTE(review): this failure branch only logs; needcare is left unchanged - confirm that is intended
				(debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.")
				))
			  ;; serverurl was false, or it split into fewer than two fields
			  (begin
			    (debug:print-info 0 *default-log-port* (if host-port
								       (conc "received invalid host-port information " host-port)
								       "no host-port information received"))
			    ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare.
			    (set! needcare #t)))
		      ;; homehost is not a string
		      (begin
			(debug:print-info 0 *default-log-port* "received no homehost information. Please report this to support as it should not happen.")
			(set! needcare #t)))))
	    (if needcare  ;; due to very slow NFS we will do a brute force mkdir to ensure that the directory inode is truly available on this host
		(let ((logdir (conc top-path "/logs"))) ;; we'll try to create this directory
		  ;; any exception raised by create-directory is caught here: its message is logged and execution continues
		  (handle-exceptions
		      exn
		      (debug:print 0 *default-log-port* "Failed to create directory " logdir " expect problems, message: " ((condition-property-accessor 'exn 'message) exn))
		    ;; the #t argument presumably asks for missing parent directories to be created too - confirm against the CHICKEN posix docs
		    (create-directory logdir #t)))))
		  
	  ;; NFS might not have propagated the directory meta data to the run host - give it time if needed
	  (let loop ((count 0))
	    (if (or (file-exists? top-path)
		    (> count 10))
		(change-directory top-path)
		(begin
		  (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " top-path " not found")