Megatest

Check-in [b13bf213d4]
Login
Overview
Comment:Improved auto server reconnect in client.scm but it is still flakey under extreme load
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: b13bf213d405d1446f3ff22faac32c69e02499c0
User & Date: matt on 2014-02-18 21:16:23
Other Links: branch diff | manifest | tags
Context
2014-02-19
23:30
Partial fix for issues with $MT_MEGATEST -test-files lookithome.log -target $MT_TARGET :runname $MT_RUNNAME -testpatt ez_fail check-in: 845c375e3f user: matt tags: v1.60
2014-02-18
21:16
Improved auto server reconnect in client.scm but it is still flakey under extreme load check-in: b13bf213d4 user: matt tags: v1.60
19:41
Clean up, removed old references to *runremote* check-in: 7590c8479a user: matt tags: v1.60
Changes

Modified client.scm from [67235b3676] to [4e17077a64].

51
52
53
54
55
56
57
58

59
60
61
62
63
64
65
66
67
68


69
70


71
72
73
74
75
76
77
78
79












80
81
82
83
84
85
86


87
88
89
90
91
92
93
94
95
96
97














98
99
100
101
102
103
104
51
52
53
54
55
56
57

58
59
60
61
62
63
64
65
66


67
68
69
70
71
72









73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93











94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114







-
+








-
-
+
+


+
+
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+







+
+
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+







;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup run-id #!key (remaining-tries 10))
(define (client:setup run-id #!key (remaining-tries 20) (failed-connects 0))
  (debug:print 0 "INFO: client:setup remaining-tries=" remaining-tries)
  (if (<= remaining-tries 0)
      (begin
	(debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	(exit 1))
      (let ((server-dat (and run-id (hash-table-ref/default *runremote* run-id #f))))
	(if server-dat
	    (let ((start-res (http-transport:client-connect run-id ;; NB// confusion over server-dat and connection result!
							    (tasks:hostinfo-get-interface server-dat)
							    (tasks:hostinfo-get-port      server-dat))))
							    (car  server-dat)
							    (cadr server-dat))))
	      (if start-res ;; sucessful login?
		  start-res
		  (if (and (< remaining-tries 10)
			   (odd? remaining-tries))
		  (begin    ;; login failed
		    (hash-table-delete! *runremote* run-id)
		    (open-run-close tasks:server-force-clean-run-record
				    tasks:open-db
				    run-id 
				    (tasks:hostinfo-get-interface server-dat)
				    (tasks:hostinfo-get-port      server-dat))
		    (thread-sleep! 5)
		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))
		      (begin    ;; login failed
			(hash-table-delete! *runremote* run-id)
			(open-run-close tasks:server-force-clean-run-record
					tasks:open-db
					run-id 
					(car  server-dat)
					(cadr server-dat))
			(thread-sleep! 5)
			(client:setup run-id remaining-tries: (- remaining-tries 1)))
		      (begin
			(thread-sleep! 5)
			(client:setup run-id remaining-tries: (- remaining-tries 1))))))
	    (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id)))
	      (if server-dat
		  (let ((start-res (http-transport:client-connect run-id
								  (tasks:hostinfo-get-interface server-dat)
								  (tasks:hostinfo-get-port      server-dat))))
		    (if start-res
			start-res
			(if (and (< remaining-tries 10)
				 (odd? remaining-tries))
			(begin    ;; login failed
			  (hash-table-delete! *runremote* run-id)
			  (open-run-close tasks:server-force-clean-run-record
					  tasks:open-db
					  run-id 
					  (tasks:hostinfo-get-interface server-dat)
					  (tasks:hostinfo-get-port      server-dat))
			  (thread-sleep! 2)
			  (server:try-running run-id)
			  (thread-sleep! 5) ;; give server a little time to start up
			  (client:setup run-id remaining-tries: (- remaining-tries 1)))))
			    (begin    ;; login failed
			      (hash-table-delete! *runremote* run-id)
			      (open-run-close tasks:server-force-clean-run-record
					      tasks:open-db
					      run-id 
					      (tasks:hostinfo-get-interface server-dat)
					      (tasks:hostinfo-get-port      server-dat))
			      (thread-sleep! 2)
			      (server:try-running run-id)
			      (thread-sleep! 5) ;; give server a little time to start up
			      (client:setup run-id remaining-tries: (- remaining-tries 1)))
			    (begin
			      (thread-sleep! 5)
			      (client:setup run-id remaining-tries: (- remaining-tries 1))))))
		  (begin    ;; no server registered
		    (thread-sleep! 2)
		    (server:try-running run-id)
		    (thread-sleep! 5) ;; give server a little time to start up
		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))

;; keep this as a function to ease future 

Modified server.scm from [8eb4730569] to [3ef5cd2281].

105
106
107
108
109
110
111
112


113
114
115
116
117
118
119
105
106
107
108
109
110
111

112
113
114
115
116
117
118
119
120







-
+
+







(define (server:reply return-addr query-sig success/fail result)
  (db:obj->string (vector success/fail query-sig result)))

;; > file 2>&1 
(define (server:try-running run-id)
  (let* ((rand-name (random 100))
	 (cmdln (conc (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest")
		     " -server - -run-id " run-id " name=" rand-name " > " *toppath* "/db/" run-id "-" rand-name ".log 2>&1 &")))
		    ;; " -server - -run-id " run-id " name=" rand-name " > " *toppath* "/db/" run-id "-" rand-name ".log 2>&1 &")))
		      " -server - -run-id " run-id " > " *toppath* "/db/" run-id ".log 2>&1 &")))
    (debug:print 0 "INFO: Starting server (" cmdln ") as none running ...")
    (push-directory *toppath*)
    (system cmdln)
    (pop-directory)))

(define (server:check-if-running run-id)
  (let loop ((server (open-run-close tasks:get-server tasks:open-db run-id))