Megatest

Check-in [f64f2f8ca8]
Login
Overview
Comment:Cleaned up client:start
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | try-nanomsg
Files: files | file ages | folders
SHA1: f64f2f8ca80622ad2f123a7a1e5fc9ab63270ebe
User & Date: matt on 2014-11-26 22:49:33
Other Links: branch diff | manifest | tags
Context
2014-11-26
23:09
nanomsg transport fixed. check-in: 675b89e392 user: matt tags: try-nanomsg
22:49
Cleaned up client:start check-in: f64f2f8ca8 user: matt tags: try-nanomsg
21:29
Add timeout on all remote calls check-in: 245a3a0b6d user: matt tags: try-nanomsg
Changes

Modified client.scm from [ec1ead835f] to [0a4fff7154].

51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
































133
134
135
136
137
138
139
;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup run-id #!key (remaining-tries 100) (failed-connects 0))
  (debug:print-info 2 "client:setup remaining-tries=" remaining-tries)
  (let* ((tdbdat (tasks:open-db)))
    (if (<= remaining-tries 0)
	(begin
	  (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	  (exit 1))
	(let ((host-info (hash-table-ref/default *runremote* run-id #f)))
	  (if host-info ;; this is a bit circular. the host-info *is* the start-res FIXME
	      (let* ((iface     (http-transport:server-dat-get-iface host-info))
		     (port      (http-transport:server-dat-get-port  host-info))
		     (start-res (case *transport-type* 
				  ((http)(http-transport:client-connect iface port))
				  ((nmsg) host-info) ;; (http-transport:server-dat-get-socket host-info))
				  (else #f)))
		     (ping-res  (case *transport-type*
				  ((http)(rmt:login-no-auto-client-setup start-res run-id))
				  ((nmsg)(nmsg-transport:ping iface port timeout: 2 socket: #t))
				  (else #f))))
		(if ping-res   ;; sucessful login?
		    (begin
		      (debug:print-info 2 "client:setup, ping is good using host-info=" host-info ", remaining-tries=" remaining-tries)
		      start-res)  ;; return the server info
		    ;; have host info but no ping. shutdown the current connection and try again
		    (begin    ;; login failed
		      (debug:print-info 1 "client:setup, ping is bad for start-res=" start-res " and *runremote*=" host-info)
		      (case *transport-type*
			((http)(http-transport:close-connections run-id)))
		      (hash-table-delete! *runremote* run-id)
		      (if (< remaining-tries 8)
			  (thread-sleep! 5)
			  (thread-sleep! 1))
		      (client:setup run-id remaining-tries: (- remaining-tries 1)))))
	      ;; YUK: rename server-dat here
	      (let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
		(debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
		(if server-dat
		    (let* ((iface     (tasks:hostinfo-get-interface server-dat))
			   (hostname  (tasks:hostinfo-get-hostname  server-dat))
			   (port      (tasks:hostinfo-get-port      server-dat))
			   (start-res (case *transport-type*
					((http)(http-transport:client-connect iface port))
					((nmsg)(nmsg-transport:client-connect hostname port))))
			   (ping-res  (case *transport-type* 
					((http)(rmt:login-no-auto-client-setup start-res run-id))
					((nmsg)(http-transport:server-dat-get-socket start-res))))) ;; socket is the result of a ping



		      (if (and start-res
			       ping-res)
			  (begin
			    (hash-table-set! *runremote* run-id start-res)
			    (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res))
			    start-res)
			  (begin    ;; login failed but have a server record, clean out the record and try again
			    (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
			    (case *transport-type* 
			      ((http)(http-transport:close-connections run-id)))
			    (hash-table-delete! *runremote* run-id)
			    (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
								 run-id 
								 (tasks:hostinfo-get-interface server-dat)
								 (tasks:hostinfo-get-port      server-dat)
								 " client:setup (server-dat = #t)")
			    (thread-sleep! 2)
			    (server:try-running run-id)
			    (thread-sleep! 10) ;; give server a little time to start up
			    (client:setup run-id remaining-tries: (- remaining-tries 1)))))
		    (begin    ;; no server registered
		      (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id)))
			(debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
			(thread-sleep! 2) 
			(if (< num-available 2)
			    (begin
			      (server:try-running run-id)))
			(thread-sleep! 10) ;; give server a little time to start up
			(client:setup run-id remaining-tries: (- remaining-tries 1)))))))))))

































;; keep this as a function to ease future 
(define (client:start run-id server-info)
  (http-transport:client-connect (tasks:hostinfo-get-interface server-info)
				 (tasks:hostinfo-get-port server-info)))

;; client:signal-handler







|






<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
|
|
|
|
|
|
|
|
|
|
|
>
>
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<
|
|
|
|
|
|
<
|
<
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







51
52
53
54
55
56
57
58
59
60
61
62
63
64



























65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95

96
97
98
99
100
101

102

103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup run-id #!key (remaining-tries 2) (failed-connects 0))
  (debug:print-info 2 "client:setup remaining-tries=" remaining-tries)
  (let* ((tdbdat (tasks:open-db)))
    (if (<= remaining-tries 0)
	(begin
	  (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	  (exit 1))



























	(let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
	  (debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	  (if server-dat
	      (let* ((iface     (tasks:hostinfo-get-interface server-dat))
		     (hostname  (tasks:hostinfo-get-hostname  server-dat))
		     (port      (tasks:hostinfo-get-port      server-dat))
		     (start-res (case *transport-type*
				  ((http)(http-transport:client-connect iface port))
				  ((nmsg)(nmsg-transport:client-connect hostname port))))
		     (ping-res  (case *transport-type* 
				  ((http)(rmt:login-no-auto-client-setup start-res run-id))
				  ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id)))
 					   (if logininfo
 					       (vector-ref (vector-ref logininfo 1) 1)
 					       #f))))))
		(if (and start-res
			 ping-res)
		    (begin
		      (hash-table-set! *runremote* run-id start-res)
		      (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res))
		      start-res)
		    (begin    ;; login failed but have a server record, clean out the record and try again
		      (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
		      (case *transport-type* 
			((http)(http-transport:close-connections run-id)))
		      (hash-table-delete! *runremote* run-id)
		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
							   run-id 
							   (tasks:hostinfo-get-interface server-dat)
							   (tasks:hostinfo-get-port      server-dat)
							   " client:setup (server-dat = #t)")

		      (server:try-running run-id)
		      (thread-sleep! 5) ;; give server a little time to start up
		      (client:setup run-id remaining-tries: (- remaining-tries 1)))))
	      (begin    ;; no server registered
		(let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id)))
		  (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)

		  (if (< num-available 2)

		      (server:try-running run-id))
		  (thread-sleep! 5) ;; give server a little time to start up
		  (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))

;; 	(let ((host-info (hash-table-ref/default *runremote* run-id #f)))
;; 	  (if host-info ;; this is a bit circular. the host-info *is* the start-res FIXME
;; 	      (let* ((iface     (http-transport:server-dat-get-iface host-info))
;; 		     (port      (http-transport:server-dat-get-port  host-info))
;; 		     (start-res (case *transport-type* 
;; 				  ((http)(http-transport:client-connect iface port))
;; 				  ((nmsg)(nmsg-transport:client-connect iface port)) ;; (http-transport:server-dat-get-socket host-info))
;; 				  (else #f)))
;; 		     (ping-res  (case *transport-type*
;; 				  ((http)(rmt:login-no-auto-client-setup start-res run-id))
;; 				  ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id)))
;; 					   (if logininfo
;; 					       (vector-ref (vector-ref logininfo 1) 1)
;; 					       #f)))
;; 				  (else #f))))
;; 		(if ping-res   ;; sucessful login?
;; 		    (begin
;; 		      (debug:print-info 2 "client:setup, ping is good using host-info=" host-info ", remaining-tries=" remaining-tries)
;; 		      start-res)  ;; return the server info
;; 		    ;; have host info but no ping. shutdown the current connection and try again
;; 		    (begin    ;; login failed
;; 		      (debug:print-info 1 "client:setup, ping is bad for start-res=" start-res " and *runremote*=" host-info)
;; 		      (case *transport-type*
;; 			((http)(http-transport:close-connections run-id)))
;; 		      (hash-table-delete! *runremote* run-id)
;; 		      (if (< remaining-tries 8)
;; 			  (thread-sleep! 5)
;; 			  (thread-sleep! 1))
;; 		      (client:setup run-id remaining-tries: (- remaining-tries 1)))))
;; 	      ;; YUK: rename server-dat here
;; 

;; keep this as a function to ease future 
(define (client:start run-id server-info)
  (http-transport:client-connect (tasks:hostinfo-get-interface server-info)
				 (tasks:hostinfo-get-port server-info)))

;; client:signal-handler

Modified rmt.scm from [ac5c069146] to [e68219c025].

93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119


120
121
122
123
124
125
126
127
			  (else  (exit))))
	       (success (if (and dat (vector? dat)) (vector-ref dat 0) #f))
	       (res     (if (and dat (vector? dat)) (vector-ref dat 1) #f)))
	  (http-transport:server-dat-update-last-access connection-info)
	  (if success
	      (case *transport-type* 
		((http)(db:string->obj res))
		((nmsg) res))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
		(case *transport-type*
		  ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info))))
		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection
		(tasks:kill-server-run-id run-id tag: "api-send-receive-failed")
		(tasks:start-and-wait-for-server (tasks:open-db) run-id 15)
		;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1))))))

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
		;; (thread-sleep! 2)
		(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))))))
    (if (and (< attemptnum 10)
	     (tasks:need-server run-id))
	(begin
	  (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)


	  (rmt:send-receive cmd rid params (+ attemptnum 1)))
	(rmt:open-qry-close-locally cmd run-id params))))

(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions
   exn
   (begin







|


|
|















>
>
|







93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
			  (else  (exit))))
	       (success (if (and dat (vector? dat)) (vector-ref dat 0) #f))
	       (res     (if (and dat (vector? dat)) (vector-ref dat 1) #f)))
	  (http-transport:server-dat-update-last-access connection-info)
	  (if success
	      (case *transport-type* 
		((http)(db:string->obj res))
		((nmsg)(vector-ref res 1)))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
		;; (case *transport-type*
		;;   ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info))))
		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection
		(tasks:kill-server-run-id run-id tag: "api-send-receive-failed")
		(tasks:start-and-wait-for-server (tasks:open-db) run-id 15)
		;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1))))))

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
		;; (thread-sleep! 2)
		(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))))))
    (if (and (< attemptnum 10)
	     (tasks:need-server run-id))
	(begin
	  (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
	  (hash-table-delete! *runremote* run-id)
	  (client:setup run-id)
	  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
	(rmt:open-qry-close-locally cmd run-id params))))

(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions
   exn
   (begin
244
245
246
247
248
249
250

251

252
253
254
255
256
257
258
259

(define (rmt:login run-id)
  (rmt:send-receive 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))

;; This login does no retries under the hood - it acts a bit like a ping.
;;
(define (rmt:login-no-auto-client-setup connection-info run-id)

  (rmt:send-receive-no-auto-client-setup connection-info 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))

  
;; hand off a call to one of the db:queries statements
;; added run-id to make looking up the correct db possible 
;;
(define (rmt:general-call stmtname run-id . params)
  (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params)))

(define (rmt:sync-inmem->db run-id)







>
|
>
|







246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263

(define (rmt:login run-id)
  (rmt:send-receive 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))

;; This login does no retries under the hood - it acts a bit like a ping.
;;
(define (rmt:login-no-auto-client-setup connection-info run-id)
  (case *transport-type*
    ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))
    ((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*)))))

;; hand off a call to one of the db:queries statements
;; added run-id to make looking up the correct db possible 
;;
(define (rmt:general-call stmtname run-id . params)
  (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params)))

(define (rmt:sync-inmem->db run-id)

Modified tests/fullrun/megatest.config from [7518de8980] to [26d5518426].

131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# If the server can't be started on this port it will try the next port until
# it succeeds
port 8080

# This server will keep running this number of hours after last access. 
# Three minutes is 0.05 hours
# timeout 0.025
timeout 0.1

# Server is required - slower but more resistant to Sqlite issues.
required yes

# Start server when average query takes longer than this
# server-query-threshold 55500
server-query-threshold -1







|







131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# If the server can't be started on this port it will try the next port until
# it succeeds
port 8080

# This server will keep running this number of hours after last access. 
# Three minutes is 0.05 hours
# timeout 0.025
timeout 0.061

# Server is required - slower but more resistant to Sqlite issues.
required yes

# Start server when average query takes longer than this
# server-query-threshold 55500
server-query-threshold -1