Megatest

Changes On Branch a827c0e1f82fe9c3
Login

Changes In Branch rpc-transport Through [a827c0e1f8] Excluding Merge-Ins

This is equivalent to a diff from cfb9ac119d to a827c0e1f8

2016-11-15
14:02
wrapped access to *runremote* has with mutes check-in: 3dffa0e4f9 user: bjbarcla tags: rpc-transport
2016-11-14
17:55
caught another unimplemented area. check-in: a827c0e1f8 user: bjbarcla tags: rpc-transport
15:58
removed -daemonize when starting server; removed local fallback in rmt:send-receive check-in: 48bb95d322 user: bjbarcla tags: rpc-transport
2016-10-27
15:21
Run tab resize fixed check-in: 5ff16368ff user: ritikaag tags: v1.62
2016-10-26
14:48
branch for rpc support check-in: d306d8dea0 user: bjbarcla tags: rpc-transport
14:21
added support for -kill-servers and -transport switches on megatest check-in: cfb9ac119d user: bjbarcla tags: v1.62
2016-10-25
21:02
fixed -list-servers and -stop-server switches on megatest check-in: 73502a1626 user: bjbarcla tags: v1.62

Modified client.scm from [c5821d20e2] to [93c404c342].

39
40
41
42
43
44
45


46
47
48
49
50
51
52
53




54













55






56








57






















58
59
60
61
62
63
64

;; Not currently used! But, I think it *should* be used!!!
(define (client:logout serverdat)
  (let ((ok (and (socket? serverdat)
		 (cdb:logout serverdat *toppath* (client:get-signature)))))
    ok))



(define (client:connect iface port)
  (case (server:get-transport)
    ((rpc)  (rpc:client-connect  iface port))
    ((http) (http:client-connect iface port))
    ((zmq)  (zmq:client-connect  iface port))
    (else   (rpc:client-connect  iface port))))

(define (client:setup  run-id #!key (remaining-tries 10) (failed-connects 0))




  (case (server:get-transport)













    ((rpc) (rpc-transport:client-setup run-id)) ;;(client:setup-rpc run-id))






    ((http)(client:setup-http run-id))








    (else  (rpc-transport:client-setup run-id)))) ;; (client:setup-rpc run-id))))























;; (define (client:login-no-auto-setup server-info run-id)
;;   (case (server:get-transport)
;;     ((rpc)  (rpc:login-no-auto-client-setup server-info run-id))
;;     ((http) (rmt:login-no-auto-client-setup server-info run-id))
;;     (else   (rpc:login-no-auto-client-setup server-info run-id))))
;; 







>
>
|
|
|
|
|
|

|
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

;; Not currently used! But, I think it *should* be used!!!
(define (client:logout serverdat)
  (let ((ok (and (socket? serverdat)
		 (cdb:logout serverdat *toppath* (client:get-signature)))))
    ok))

;; BB: commenting out orphan code
;;;;;
;; (define (client:connect iface port)
;;   (case (server:get-transport)
;;     ((rpc)  (rpc:client-connect  iface port))
;;     ((http) (http:client-connect iface port))
;;     ((zmq)  (zmq:client-connect  iface port))
;;     (else   (rpc:client-connect  iface port))))

(define (client:setup run-id #!key (remaining-tries 10))
  (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries)
  (let* ((server-dat (tasks:bb-get-server-info run-id))
         (transport (if server-dat (string->symbol (tasks:hostinfo-get-transport server-dat)) 'noserver)))

    (case transport
      ((noserver) ;; no server registered
       (if (<= remaining-tries 0)
           (begin
             (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id)
             (exit 1))
           (begin    
             (let ((num-available (tasks:bb-num-in-available-state run-id)))
               (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
               (if (< num-available 2)
                   (server:try-running run-id))
               (thread-sleep! (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
               (client:setup run-id remaining-tries: (- remaining-tries 1))))))
      ((http)(client:setup-http run-id server-dat remaining-tries))
      ((rpc) (rpc-transport:client-setup run-id server-dat remtries: remaining-tries)) 
      (else
       (debug:print-error 0 *default-log-port* "(6) Transport ["
                          transport "] specified for run-id [" run-id "] is not implemented in client:setup.  Cannot proceed.")
       (exit 1)))))


;; client:setup-http
;;
;; For http transport, robustly ensure an advertised-running server is actually working and responding, and
;; establish tcp connection to server.  For servers marked running but not responding, kill them and clear from mdb
;; 
(define (client:setup-http run-id server-dat remaining-tries)
  (let* ((iface     (tasks:hostinfo-get-interface server-dat))
         (hostname  (tasks:hostinfo-get-hostname  server-dat))
         (port      (tasks:hostinfo-get-port      server-dat))

         (start-res (http-transport:client-connect iface port))
         (ping-res  (rmt:login-no-auto-client-setup start-res run-id)))
    (if (and start-res ping-res)
        (begin
          (hash-table-set! *runremote* run-id start-res) ;; side-effect - *runremote* cache init fpr rmt:*
          (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
          start-res)
        (begin    ;; login failed but have a server record, clean out the record and try again
          (debug:print-info 0 *default-log-port* "client:setup-http, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
          (http-transport:close-connections run-id)
          (hash-table-delete! *runremote* run-id) ;; BB: suspect there is nothing to delete ...
          (tasks:kill-server-run-id run-id) ;; -9 so the hung processes dont eat 100% when not responding to sigterm.
          (tasks:bb-server-force-clean-run-record  run-id iface port
                                                   " client:setup-http (server-dat = #t)")
          (if (> remaining-tries 8)
              (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
              (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time
          (server:try-running run-id)
          (thread-sleep! 5)   ;; give server a little time to start up
          (client:setup run-id remaining-tries: (- remaining-tries 1))
          ))))


;; (define (client:login-no-auto-setup server-info run-id)
;;   (case (server:get-transport)
;;     ((rpc)  (rpc:login-no-auto-client-setup server-info run-id))
;;     ((http) (rmt:login-no-auto-client-setup server-info run-id))
;;     (else   (rpc:login-no-auto-client-setup server-info run-id))))
;; 
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
;; 			(thread-sleep! 5)
;; 			(client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1)))
;; 		      (begin
;; 			(debug:print 25 *default-log-port* "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info)
;; 			(thread-sleep! 5)
;; 			(client:setup run-id remaining-tries: (- remaining-tries 1))))))
;; 	    ;; YUK: rename server-dat here
;; 	    (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id)))
;; 	      (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
;; 	      (if server-dat
;; 		  (let* ((iface     (tasks:hostinfo-get-interface server-dat))
;; 			 (port      (tasks:hostinfo-get-port      server-dat))
;; 			 (start-res (http-transport:client-connect iface port))
;; 			 ;; (ping-res  (server:ping-server run-id iface port))
;; 			 (ping-res  (rmt:login-no-auto-client-setup start-res run-id)))







|







148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
;; 			(thread-sleep! 5)
;; 			(client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1)))
;; 		      (begin
;; 			(debug:print 25 *default-log-port* "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info)
;; 			(thread-sleep! 5)
;; 			(client:setup run-id remaining-tries: (- remaining-tries 1))))))
;; 	    ;; YUK: rename server-dat here
;; 	    (let* ((server-dat (open-run-close tasks:get-server-info tasks:open-db run-id)))
;; 	      (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
;; 	      (if server-dat
;; 		  (let* ((iface     (tasks:hostinfo-get-interface server-dat))
;; 			 (port      (tasks:hostinfo-get-port      server-dat))
;; 			 (start-res (http-transport:client-connect iface port))
;; 			 ;; (ping-res  (server:ping-server run-id iface port))
;; 			 (ping-res  (rmt:login-no-auto-client-setup start-res run-id)))
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup-http run-id #!key (remaining-tries 10) (failed-connects 0))
  (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries)
  (let* ((tdbdat (tasks:open-db)))
    (if (<= remaining-tries 0)
	(begin
	  (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id)
	  (exit 1))
	(let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
	  (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	  (if server-dat
	      (let* ((iface     (tasks:hostinfo-get-interface server-dat))
		     (hostname  (tasks:hostinfo-get-hostname  server-dat))
		     (port      (tasks:hostinfo-get-port      server-dat))
		     (start-res (case *transport-type*
				  ((http)(http-transport:client-connect iface port))
				  ;;((nmsg)(nmsg-transport:client-connect hostname port))
                                  ))
		     (ping-res  (case *transport-type* 
				  ((http)(rmt:login-no-auto-client-setup start-res run-id))
				  ;; ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id)))
 				  ;;          (if logininfo
 				  ;;              (car (vector-ref logininfo 1))
 				  ;;              #f)))

                                  )))
		(if (and start-res
			 ping-res)
		    (begin
		      (hash-table-set! *runremote* run-id start-res)
		      (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
		      start-res)
		    (begin    ;; login failed but have a server record, clean out the record and try again
		      (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
		      (case *transport-type* 
			((http)(http-transport:close-connections run-id)))
		      (hash-table-delete! *runremote* run-id)
		      (tasks:kill-server-run-id run-id)
		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
							   run-id 
							   (tasks:hostinfo-get-interface server-dat)
							   (tasks:hostinfo-get-port      server-dat)
							   " client:setup (server-dat = #t)")
		      (if (> remaining-tries 8)
			  (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
			  (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time
		      (server:try-running run-id)
		      (thread-sleep! 5)   ;; give server a little time to start up
		      (client:setup run-id remaining-tries: (- remaining-tries 1))
		      )))
	      (begin    ;; no server registered
		(let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id)))
		  (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
		  (if (< num-available 2)
		      (server:try-running run-id))
		  (thread-sleep! (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
		  (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))

;; keep this as a function to ease future 
(define (client:start run-id server-info)
  (http-transport:client-connect (tasks:hostinfo-get-interface server-info)
				 (tasks:hostinfo-get-port server-info)))

;; ;; client:signal-handler
;; (define (client:signal-handler signum)
;;   (signal-mask! signum)
;;   (set! *time-to-exit* #t)
;;   (handle-exceptions
;;    exn







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<

<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
|
|
|
|







205
206
207
208
209
210
211























212


213





























214
215
216
217
218
219
220
221
222
223
224
225
;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;


























;; BB: commenting out orphan code.  





























;;
;; ;; keep this as a function to ease future 
;; (define (client:start run-id server-info)
;;   (http-transport:client-connect (tasks:hostinfo-get-interface server-info)
;; 				 (tasks:hostinfo-get-port server-info)))

;; ;; client:signal-handler
;; (define (client:signal-handler signum)
;;   (signal-mask! signum)
;;   (set! *time-to-exit* #t)
;;   (handle-exceptions
;;    exn

Modified common.scm from [87c3dc36b9] to [86bc64eb87].

75
76
77
78
79
80
81



82
83
84
85
86
87
88
89
90
(define *inmemdb*             #f)
(define *task-db*             #f) ;; (vector db path-to-db)
(define *db-access-allowed*   #t) ;; flag to allow access
(define *db-access-mutex*     (make-mutex))

;; SERVER
(define *my-client-signature* #f)



(define *transport-type*    'http)
(define *transport-type*    'http)             ;; override with [server] transport http|rpc|nmsg
(define *runremote*         (make-hash-table)) ;; if set up for server communication this will hold <host port>
(define *max-cache-size*    0)
(define *logged-in-clients* (make-hash-table))
(define *client-non-blocking-mode* #f)
(define *server-id*         #f)
(define *server-info*       #f)
(define *time-to-exit*      #f)







>
>
>
|
|







75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
(define *inmemdb*             #f)
(define *task-db*             #f) ;; (vector db path-to-db)
(define *db-access-allowed*   #t) ;; flag to allow access
(define *db-access-mutex*     (make-mutex))

;; SERVER
(define *my-client-signature* #f)

;; default preference for transport-type is set here
;;
(define *transport-type*    'rpc)             ;; override with [server] transport http|rpc 

(define *runremote*         (make-hash-table)) ;; if set up for server communication this will hold <host port>
(define *max-cache-size*    0)
(define *logged-in-clients* (make-hash-table))
(define *client-non-blocking-mode* #f)
(define *server-id*         #f)
(define *server-info*       #f)
(define *time-to-exit*      #f)

Modified fs-transport.scm from [59920959a9] to [de64f5b4d6].

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
(require-extension (srfi 18) extras tcp s11n)

(use sqlite3 srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest)
(import (prefix sqlite3 sqlite3:))

(use spiffy uri-common intarweb http-client spiffy-request-vars)

(tcp-buffer-size 2048)

(declare (unit fs-transport))

(declare (uses common))
(declare (uses db))
(declare (uses tests))
(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running.







|







11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
(require-extension (srfi 18) extras tcp s11n)

(use sqlite3 srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest)
(import (prefix sqlite3 sqlite3:))

(use spiffy uri-common intarweb http-client spiffy-request-vars)

;;(tcp-buffer-size 2048)

(declare (unit fs-transport))

(declare (uses common))
(declare (uses db))
(declare (uses tests))
(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running.

Modified http-transport.scm from [13883e3b0d] to [b16e6277ce].

12
13
14
15
16
17
18

19



20
21
22
23
24
25
26

(use  srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) ;; sqlite3
;; (import (prefix sqlite3 sqlite3:))

(use spiffy uri-common intarweb http-client spiffy-request-vars intarweb spiffy-directory-listing)

;; Configurations for server

(tcp-buffer-size 2048)



(max-connections 2048) 

(declare (unit http-transport))

(declare (uses common))
(declare (uses db))
(declare (uses tests))







>
|
>
>
>







12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30

(use  srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) ;; sqlite3
;; (import (prefix sqlite3 sqlite3:))

(use spiffy uri-common intarweb http-client spiffy-request-vars intarweb spiffy-directory-listing)

;; Configurations for server

(tcp-buffer-size 2048) ;; this interferes with rpc ; compensating in rpc-transport... so far so good



(max-connections 2048) 

(declare (unit http-transport))

(declare (uses common))
(declare (uses db))
(declare (uses tests))
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
(define (make-http-transport:server-dat)(make-vector 6))
(define (http-transport:server-dat-get-iface         vec)    (vector-ref  vec 0))
(define (http-transport:server-dat-get-port          vec)    (vector-ref  vec 1))
(define (http-transport:server-dat-get-api-uri       vec)    (vector-ref  vec 2))
(define (http-transport:server-dat-get-api-url       vec)    (vector-ref  vec 3))
(define (http-transport:server-dat-get-api-req       vec)    (vector-ref  vec 4))
(define (http-transport:server-dat-get-last-access   vec)    (vector-ref  vec 5))
(define (http-transport:server-dat-get-socket        vec)    (vector-ref  vec 6))

(define (http-transport:server-dat-make-url vec)
  (if (and (http-transport:server-dat-get-iface vec)
	   (http-transport:server-dat-get-port  vec))
      (conc "http://" 
	    (http-transport:server-dat-get-iface vec)
	    ":"







|







325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
(define (make-http-transport:server-dat)(make-vector 6))
(define (http-transport:server-dat-get-iface         vec)    (vector-ref  vec 0))
(define (http-transport:server-dat-get-port          vec)    (vector-ref  vec 1))
(define (http-transport:server-dat-get-api-uri       vec)    (vector-ref  vec 2))
(define (http-transport:server-dat-get-api-url       vec)    (vector-ref  vec 3))
(define (http-transport:server-dat-get-api-req       vec)    (vector-ref  vec 4))
(define (http-transport:server-dat-get-last-access   vec)    (vector-ref  vec 5))
(define (http-transport:server-dat-get-transport     vec)    (vector-ref  vec 6))

(define (http-transport:server-dat-make-url vec)
  (if (and (http-transport:server-dat-get-iface vec)
	   (http-transport:server-dat-get-port  vec))
      (conc "http://" 
	    (http-transport:server-dat-get-iface vec)
	    ":"
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
;;
;; connect
;;
(define (http-transport:client-connect iface port)
  (let* ((api-url      (conc "http://" iface ":" port "/api"))
	 (api-uri      (uri-reference (conc "http://" iface ":" port "/api")))
	 (api-req      (make-request method: 'POST uri: api-uri))
	 (server-dat   (vector iface port api-uri api-url api-req (current-seconds))))
    server-dat))

;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (http-transport:keep-running server-id run-id)
  ;; if none running or if > 20 seconds since 







|







350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
;;
;; connect
;;
(define (http-transport:client-connect iface port)
  (let* ((api-url      (conc "http://" iface ":" port "/api"))
	 (api-uri      (uri-reference (conc "http://" iface ":" port "/api")))
	 (api-req      (make-request method: 'POST uri: api-uri))
	 (server-dat   (vector iface port api-uri api-url api-req (current-seconds) 'http)))
    server-dat))

;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (http-transport:keep-running server-id run-id)
  ;; if none running or if > 20 seconds since 
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
				    (begin
				      (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server " server-id " for run " run-id)
				      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature")
				      (exit))
				    (loop start-time
					  (equal? sdat last-sdat)
					  sdat)))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (server-timeout (server:get-timeout)))
    (let loop ((count         0)
	       (server-state 'available)
	       (bad-sync-count 0))








|







387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
				    (begin
				      (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server " server-id " for run " run-id)
				      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature")
				      (exit))
				    (loop start-time
					  (equal? sdat last-sdat)
					  sdat)))))))
         (iface       (car server-info)) ;; BB> this represents ip address, not interface (like eth0 as I would expect from the term)
         (port        (cadr server-info))
         (last-access 0)
	 (server-timeout (server:get-timeout)))
    (let loop ((count         0)
	       (server-state 'available)
	       (bad-sync-count 0))

533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
	      (begin
		(current-error-port *alt-log-file*)
		(current-output-port *alt-log-file*)))))
    (if (server:check-if-running run-id)
	(begin
	  (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running")
	  (exit 0)))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
	       (remtries  4))
      (if (not server-id)
	  (if (> remtries 0)
	      (begin
		(thread-sleep! 2)
		(loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)
		      (- remtries 1)))
	      (begin
		;; since we didn't get the server lock we are going to clean up and bail out
		(debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
		(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch")
		))
	  (let* ((th2 (make-thread (lambda ()







|





|







537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
	      (begin
		(current-error-port *alt-log-file*)
		(current-output-port *alt-log-file*)))))
    (if (server:check-if-running run-id)
	(begin
	  (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running")
	  (exit 0)))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'http))
	       (remtries  4))
      (if (not server-id)
	  (if (> remtries 0)
	      (begin
		(thread-sleep! 2)
		(loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'http)
		      (- remtries 1)))
	      (begin
		;; since we didn't get the server lock we are going to clean up and bail out
		(debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
		(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch")
		))
	  (let* ((th2 (make-thread (lambda ()

Modified launch.scm from [a58a11e1e1] to [5f9f026431].

66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
    (if (file-exists? cname)
	(let* ((dat  (read-config cname #f #f))
	       (csvr (db:logpro-dat->csv dat stepname))
	       (csvt (let-values (( (fmt-cell fmt-record fmt-csv) (make-format ",")))
				 (fmt-csv (map list->csv-record csvr))))
	       (status (configf:lookup dat "final" "exit-status"))
	       (msg     (configf:lookup dat "final" "message")))
          ;;(if csvt  ;; this if blocked stack dump caused by .dat file from logpro being 0-byte.  fixed by upgrading logpro
              (rmt:csv->test-data run-id test-id csvt)
            ;;  (BB> "Error: run-id/test-id/stepname="run-id"/"test-id"/"stepname" => bad csvr="csvr)
            ;;  )
	  (cond
	   ((equal? status "PASS") "PASS") ;; skip the message part if status is pass
	   (status (conc (configf:lookup dat "final" "exit-status") ": " (if msg msg "no message")))
	   (else #f)))
	#f)))

(define (launch:runstep ezstep run-id test-id exit-info m tal testconfig)







<
|
<
<







66
67
68
69
70
71
72

73


74
75
76
77
78
79
80
    (if (file-exists? cname)
	(let* ((dat  (read-config cname #f #f))
	       (csvr (db:logpro-dat->csv dat stepname))
	       (csvt (let-values (( (fmt-cell fmt-record fmt-csv) (make-format ",")))
				 (fmt-csv (map list->csv-record csvr))))
	       (status (configf:lookup dat "final" "exit-status"))
	       (msg     (configf:lookup dat "final" "message")))

          (rmt:csv->test-data run-id test-id csvt)


	  (cond
	   ((equal? status "PASS") "PASS") ;; skip the message part if status is pass
	   (status (conc (configf:lookup dat "final" "exit-status") ": " (if msg msg "no message")))
	   (else #f)))
	#f)))

(define (launch:runstep ezstep run-id test-id exit-info m tal testconfig)
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
;;     megatest.config     (cache if all vars avail)
;;   returns:
;;     *toppath*
;;   side effects:
;;     sets; *configdat*    (megatest.config info)
;;           *runconfigdat* (runconfigs.config info)
;;           *configstatus* (status of the read data)
;;
(define (launch:setup #!key (force #f))
  (let* ((toppath  (or *toppath* (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath
	 (runname  (common:args-get-runname))
	 (target   (common:args-get-target))
	 (linktree (common:get-linktree))
	 (sections (if target (list "default" target) #f)) ;; for runconfigs
	 (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config 







|







695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
;;     megatest.config     (cache if all vars avail)
;;   returns:
;;     *toppath*
;;   side effects:
;;     sets; *configdat*    (megatest.config info)
;;           *runconfigdat* (runconfigs.config info)
;;           *configstatus* (status of the read data)
;;           *transport-type*
(define (launch:setup #!key (force #f))
  (let* ((toppath  (or *toppath* (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath
	 (runname  (common:args-get-runname))
	 (target   (common:args-get-target))
	 (linktree (common:get-linktree))
	 (sections (if target (list "default" target) #f)) ;; for runconfigs
	 (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config 
825
826
827
828
829
830
831

832
833
834
835
836
837
838
	    (debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config")
	    )))
    (if (and *toppath*
	     (directory-exists? *toppath*))
	(setenv "MT_RUN_AREA_HOME" *toppath*)
	(begin
	  (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.")))

    *toppath*))

(define (get-best-disk confdat testconfig)
  (let* ((disks   (or (and testconfig (hash-table-ref/default testconfig "disks" #f))
		      (hash-table-ref/default confdat "disks" #f)))
	 (minspace (let ((m (configf:lookup confdat "setup" "minspace")))
		     (string->number (or m "10000")))))







>







822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
	    (debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config")
	    )))
    (if (and *toppath*
	     (directory-exists? *toppath*))
	(setenv "MT_RUN_AREA_HOME" *toppath*)
	(begin
	  (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.")))
    (server:set-transport)
    *toppath*))

(define (get-best-disk confdat testconfig)
  (let* ((disks   (or (and testconfig (hash-table-ref/default testconfig "disks" #f))
		      (hash-table-ref/default confdat "disks" #f)))
	 (minspace (let ((m (configf:lookup confdat "setup" "minspace")))
		     (string->number (or m "10000")))))
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
	  (create-directory work-area #t)
	  (debug:print 0 *default-log-port* "WARNING: No disk work area specified - running in the test directory under tmp_run")))
    (set! cmdparms (base64:base64-encode 
		    (z3:encode-buffer 
		     (with-output-to-string
		       (lambda () ;; (list 'hosts     hosts)
			 (write (list (list 'testpath  test-path)
				      (list 'transport (conc *transport-type*))
				      ;; (list 'serverinf *server-info*)
				      (list 'toppath   *toppath*)
				      (list 'work-area work-area)
				      (list 'test-name test-name) 
				      (list 'runscript runscript) 
				      (list 'run-id    run-id   )
				      (list 'test-id   test-id  )







|







1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
	  (create-directory work-area #t)
	  (debug:print 0 *default-log-port* "WARNING: No disk work area specified - running in the test directory under tmp_run")))
    (set! cmdparms (base64:base64-encode 
		    (z3:encode-buffer 
		     (with-output-to-string
		       (lambda () ;; (list 'hosts     hosts)
			 (write (list (list 'testpath  test-path)
				      (list 'transport (conc (rmt:run-id->transport-type run-id)))
				      ;; (list 'serverinf *server-info*)
				      (list 'toppath   *toppath*)
				      (list 'work-area work-area)
				      (list 'test-name test-name) 
				      (list 'runscript runscript) 
				      (list 'run-id    run-id   )
				      (list 'test-id   test-id  )

Modified megatest.scm from [7f46162c88] to [56597888dd].

731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757

;;======================================================================
;; Start the server - can be done in conjunction with -runall or -runtests (one day...)
;;   we start the server if not running else start the client thread
;;======================================================================

(if (args:get-arg "-server")

    ;; Server? Start up here.
    ;;
    (let ((tl        (launch:setup))
	  (run-id    (and (args:get-arg "-run-id")
			  (string->number (args:get-arg "-run-id"))))
          (transport-type (string->symbol (or (args:get-arg "-transport") "http"))))
      (if run-id
	  (begin
	    (server:launch run-id transport-type)
	    (set! *didsomething* #t))
	  (debug:print-error 0 *default-log-port* "server requires run-id be specified with -run-id")))

    ;; Not a server? This section will decide how to communicate
    ;;
    ;;  Setup client for all expect listed here
    (if (null? (lset-intersection 
		equal?
		(hash-table-keys args:arg-hash)
		'("-list-servers"







|


|
|
|
<

|
|


|







731
732
733
734
735
736
737
738
739
740
741
742
743

744
745
746
747
748
749
750
751
752
753
754
755
756

;;======================================================================
;; Start the server - can be done in conjunction with -runall or -runtests (one day...)
;;   we start the server if not running else start the client thread
;;======================================================================

(if (args:get-arg "-server")
    
    ;; Server? Start up here.
    ;;
    (let* ((tl        (launch:setup))
           (run-id    (and (args:get-arg "-run-id")
                           (string->number (args:get-arg "-run-id")))))

      (if run-id
          (begin
	    (server:launch run-id (->string *transport-type*))
	    (set! *didsomething* #t))
	  (debug:print-error 0 *default-log-port* "server requires run-id be specified with -run-id")))
    
    ;; Not a server? This section will decide how to communicate
    ;;
    ;;  Setup client for all expect listed here
    (if (null? (lset-intersection 
		equal?
		(hash-table-keys args:arg-hash)
		'("-list-servers"

Modified rmt.scm from [bb562bf1d7] to [f14f777e62].

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

19
20
21
22
23
24
25
;;======================================================================
;; Copyright 2006-2013, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.
;;======================================================================

(use json format) ;; RADT => purpose of json format??

(declare (unit rmt))
(declare (uses api))
(declare (uses tdb))
(declare (uses http-transport))
;;(declare (uses nmsg-transport))

;;
;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!!
;;

;; ;; For debugging add the following to ~/.megatestrc
;;
;; (require-library trace)
|
|















|
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

;; Copyright 2006-2016, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.
;;======================================================================

(use json format) ;; RADT => purpose of json format??

(declare (unit rmt))
(declare (uses api))
(declare (uses tdb))
(declare (uses http-transport))
(declare (uses rpc-transport))

;;
;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!!
;;

;; ;; For debugging add the following to ~/.megatestrc
;;
;; (require-library trace)
66
67
68
69
70
71
72












73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

103




104
105
106
107
108
109





110
111
112
113
114
115
116
117
118
119
120




121


122
123
124
125

126
127
128
129
130
131
132
133
134
135
136
137
138
139






140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
    (if cinfo
	cinfo
	;; NB// can cache the answer for server running for 10 seconds ...
	;;  ;; (and (not (rmt:write-frequency-over-limit? cmd run-id))
	(if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id)
	    (client:setup run-id)
	    #f))))













(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected
  ;; clean out old connections
  ;; (mutex-lock! *db-multi-sync-mutex*)
  (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin
    (for-each 
     (lambda (run-id)
       (let ((connection (hash-table-ref/default *runremote* run-id #f)))
         (if (and (vector? connection)
        	  (< (http-transport:server-dat-get-last-access connection) expire-time))
             (begin
               (debug:print-info 0 *default-log-port* "Discarding connection to server for run-id " run-id ", too long between accesses")
               ;; bb- disabling nanomsg
               ;; SHOULD CLOSE THE CONNECTION HERE 
	       ;; (case *transport-type*
	       ;;   ((nmsg)(nn-close (http-transport:server-dat-get-socket 
	       ;;  		   (hash-table-ref *runremote* run-id)))))
               (hash-table-delete! *runremote* run-id)))))
     (hash-table-keys *runremote*)))
  ;; (mutex-unlock! *db-multi-sync-mutex*)
  ;; (mutex-lock! *send-receive-mutex*)
  (let* ((run-id          (if rid rid 0))
	 (connection-info (rmt:get-connection-info run-id)))
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info

	(let* ((dat     (case *transport-type*




			  ((http)(condition-case
				  (http-transport:client-api-send-receive run-id connection-info cmd params)
				  ((commfail)(vector #f "communications fail"))
				  ((exn)(vector #f "other fail"))))
			  ;; ((nmsg)(condition-case
			  ;;         (nmsg-transport:client-api-send-receive run-id connection-info cmd params)





			  ;;         ((timeout)(vector #f "timeout talking to server"))))
			  (else  (exit))))
	       (success (if (vector? dat) (vector-ref dat 0) #f))
	       (res     (if (vector? dat) (vector-ref dat 1) #f)))
	  (if (vector? connection-info)(http-transport:server-dat-update-last-access connection-info))
	  (if success
	      (begin
		;; (mutex-unlock! *send-receive-mutex*)
		(case *transport-type* 
		  ((http) res) ;; (db:string->obj res))
		  ;; ((nmsg) res)




                  )) ;; (vector-ref res 1)))


	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 *default-log-port* "WARNING: Communication failed, trying call to rmt:send-receive again.")
		;; (case *transport-type*
		;;   ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info))))

		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection
		;; NOTE: killing server causes this process to block forever. No idea why. Dec 2. 
		;; (if (eq? (modulo attemptnum 5) 0)
		;;     (tasks:kill-server-run-id run-id tag: "api-send-receive-failed"))
		;; (mutex-unlock! *send-receive-mutex*) ;; close the mutex here to allow other threads access to communications
		(tasks:start-and-wait-for-server (tasks:open-db) run-id 15)
		;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1))))))

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
		;; (thread-sleep! 2)
		(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))))






	;; no connection info? try to start a server, or access locally if no
	;; server and the query is read-only
	;;
	;; Note: The tasks db was checked for a server in starting mode in the rmt:get-connection-info call
	;;
	(if (and (< attemptnum 15)
		 (member cmd api:write-queries))
	    (let ((faststart (configf:lookup *configdat* "server" "faststart")))
	      (hash-table-delete! *runremote* run-id)
	      ;; (mutex-unlock! *send-receive-mutex*)
	      (if (and faststart (equal? faststart "no"))
		  (begin
		    (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
		    (thread-sleep! (random 5)) ;; give some time to settle and minimize collison?
		    (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
		  (let ((start-time (current-milliseconds))
			(max-query  (string->number (or (configf:lookup *configdat* "server" "server-query-threshold")
							"300")))
			(newres     (rmt:open-qry-close-locally cmd run-id params)))
		    (let ((delta (- (current-milliseconds) start-time)))
		      (if (> delta max-query)
			  (begin
			    (debug:print-info 0 *default-log-port* "Starting server as query time " delta " is over the limit of " max-query)
			    (server:kind-run run-id)))
		      ;; return the result!
		      newres)
		    )))
	    (begin
	      ;; (debug:print-error 0 *default-log-port* "Communication failed!")
	      ;; (mutex-unlock! *send-receive-mutex*)
	      ;; (exit)
	      (rmt:open-qry-close-locally cmd run-id params)
	      )))))

(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 *default-log-port* "WARNING: stats collection failed in update-db-stats")







>
>
>
>
>
>
>
>
>
>
>
>






|
<





|


<
<
<
<
<


|
<
|




>
|
>
>
>
>




<
|
>
>
>
>
>
|
|


|



|
|
|
>
>
>
>

>
>


|
|
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>
>
|
<



<
<
<
<
<
<
|
|
|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92

93
94
95
96
97
98
99
100





101
102
103

104
105
106
107
108
109
110
111
112
113
114
115
116
117
118

119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168

169
170
171






172
173
174
175
176

















177
178
179
180
181
182
183
    (if cinfo
	cinfo
	;; NB// can cache the answer for server running for 10 seconds ...
	;;  ;; (and (not (rmt:write-frequency-over-limit? cmd run-id))
	(if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id)
	    (client:setup run-id)
	    #f))))

(define (rmt:run-id->transport-type rid)
  (let* ((run-id          (if rid rid 0))
	 (connection-info (hash-table-ref/default *runremote* run-id #f)))
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)

    (if connection-info ;; if we already have a connection for this run-id, use that precendent
	;; use the server if have connection info
	(let* ((transport-type (vector-ref connection-info 6))) ;; BB: assumes all transport-type'-servertdat vector's item 6 ids transport type
          transport-type)
        ;; otherwise pick the global default as preference. (set in common.scm)
        *transport-type*)))

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected
  ;; side-effect: clean out old connections

  (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin
    (for-each 
     (lambda (run-id)
       (let ((connection (hash-table-ref/default *runremote* run-id #f)))
         (if (and (vector? connection)
        	  (< (http-transport:server-dat-get-last-access connection) expire-time)) ;; BB> BBTODO: make this generic, not http transport specific.
             (begin
               (debug:print-info 0 *default-log-port* "Discarding connection to server for run-id " run-id ", too long between accesses")





               (hash-table-delete! *runremote* run-id)))))
     (hash-table-keys *runremote*)))
  

  (let* ((run-id     (if rid rid 0))
	 (connection-info (rmt:get-connection-info run-id)))
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((transport-type (rmt:run-id->transport-type run-id))
               
               ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
               ;;  Here, we make request to remote server
               ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
               (dat     (case transport-type ;; BB: replaced *transport-type* global with run-id specific transport-type
			  ((http)(condition-case
				  (http-transport:client-api-send-receive run-id connection-info cmd params)
				  ((commfail)(vector #f "communications fail"))
				  ((exn)(vector #f "other fail"))))

                          ((rpc) (rpc-transport:client-api-send-receive run-id connection-info cmd params)) ;; BB: let us error out for now
			  (else  
                           (debug:print-error 0 *default-log-port* "(1) Transport [" transport-type
                                              "] specified for run-id [" run-id
                                              "] is not implemented in rmt:send-receive.  Cannot proceed." (symbol? transport-type))
                           (vector #f (conc "transport ["transport-type"] unimplemented")))))
               
               
	       (success (if (vector? dat) (vector-ref dat 0) #f))
	       (res     (if (vector? dat) (vector-ref dat 1) #f)))
	  (if (vector? connection-info)(http-transport:server-dat-update-last-access connection-info)) ;; BB> BBTODO: make this generic, not http transport specific.
	  (if success
	      (begin
		;; (mutex-unlock! *send-receive-mutex*)
		(case transport-type 
		  ((http rpc) res) ;; (db:string->obj res))
                  (else
                   (debug:print-error 0 *default-log-port* "(2) Transport [" transport-type
                                      "] specified for run-id [" run-id
                                      "] is not implemented in rmt:send-receive.  Cannot proceed. Also unexpected since this branch follows success which would follow a suported transport...")
                   #f)
                  )) ;; (vector-ref res 1)))
              
              ;; no success...
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 *default-log-port* "WARNING: Communication failed, trying call to rmt:send-receive again.")
                (case transport-type
                  
                  ((http rpc)
                   (hash-table-delete! *runremote* run-id) ;; don't keep using the same connection
                   ;; NOTE: killing server causes this process to block forever. No idea why. Dec 2. 
                   ;; (if (eq? (modulo attemptnum 5) 0)
                   ;;     (tasks:kill-server-run-id run-id tag: "api-send-receive-failed"))
                   ;; (mutex-unlock! *send-receive-mutex*) ;; close the mutex here to allow other threads access to communications
                   (tasks:start-and-wait-for-server (tasks:open-db) run-id 15)
                   ;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1))))))
                   
                   ;; no longer killing the server in http-transport:client-api-send-receive
                   ;; may kill it here but what are the criteria?
                   ;; start with three calls then kill server
                   ;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
                   ;; (thread-sleep! 2)
                   (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))
                  (else
                   (debug:print-error 0 *default-log-port* "(3) Transport [" transport-type
                                      "] specified for run-id [" run-id
                                      "] is not implemented in rmt:send-receive.  Cannot proceed.")
                   #f)))))
        
	;; no connection info; try to start a server

	;;
	;; Note: The tasks db was checked for a server in starting mode in the rmt:get-connection-info call
	;;






        (begin
          (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
          (thread-sleep! (random 5)) ;; give some time to settle and minimize collison?
          (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))
  


















(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 *default-log-port* "WARNING: stats collection failed in update-db-stats")
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268


269
270
271
272












273
274
275
276
277
278
279
280
			     (if (> tot 10)
				 (cons newmax-cmd currmax)
				 (cons 'none 0))
			     (loop (car tal)(cdr tal) newmax-cmd currmax)))))))
    (mutex-unlock! *db-stats-mutex*)
    res))
	  
(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5))
  (let* ((dbstruct-local (if *dbstruct-db*
			     *dbstruct-db*
			     (let* ((dbdir (db:dbfile-path #f)) ;;  (conc    (configf:lookup *configdat* "setup" "linktree") "/.db"))
				    (db (make-dbr:dbstruct path:  dbdir local: #t)))
			       (set! *dbstruct-db* db)
			       db)))
	 (db-file-path   (db:dbfile-path 0))
	 ;; (read-only      (not (file-read-access? db-file-path)))
	 (start          (current-milliseconds))
	 (resdat         (api:execute-requests dbstruct-local (vector (symbol->string cmd) params)))
	 (success        (vector-ref resdat 0))
	 (res            (vector-ref resdat 1))
	 (duration       (- (current-milliseconds) start)))
    (if (not success)
	(if (> remretries 0)
	    (begin
	      (debug:print-error 0 *default-log-port* "local query failed. Trying again.")
	      (thread-sleep! (/ (random 5000) 1000)) ;; some random delay 
	      (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1)))
	    (begin
	      (debug:print-error 0 *default-log-port* "too many retries in rmt:open-qry-close-locally, giving up")
	      #f))
	(begin
	  ;; (rmt:update-db-stats run-id cmd params duration)
	  ;; mark this run as dirty if this was a write
	  (if (not (member cmd api:read-only-queries))
	      (let ((start-time (current-seconds)))
		(mutex-lock! *db-multi-sync-mutex*)
		;; (if (not (hash-table-ref/default *db-local-sync* run-id #f))
		;; just set it every time. Is a write more expensive than a read and does it matter?
		(hash-table-set! *db-local-sync* (or run-id 0) start-time) ;; the oldest "write"
		(mutex-unlock! *db-multi-sync-mutex*)))
	  res))))

(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
  (let* ((run-id   (if run-id run-id 0))
	 ;; (jparams  (db:obj->string params)) ;; (rmt:dat->json-str params))


	 (res  	   (handle-exceptions
		    exn
		    #f
		    (http-transport:client-api-send-receive run-id connection-info cmd params))))












;;		    ((commfail) (vector #f "communications fail")))))
    (if (and res (vector-ref res 0))
	(vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!!
	#f)))
;; 	(db:string->obj (vector-ref dat 1))
;; 	(begin
;; 	  (debug:print-error 0 *default-log-port* "rmt:send-receive-no-auto-client-setup failed, attempting to continue. Got " dat)
;; 	  dat))))







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<



>
>
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
|







228
229
230
231
232
233
234



































235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
			     (if (> tot 10)
				 (cons newmax-cmd currmax)
				 (cons 'none 0))
			     (loop (car tal)(cdr tal) newmax-cmd currmax)))))))
    (mutex-unlock! *db-stats-mutex*)
    res))
	  



































(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
  (let* ((run-id   (if run-id run-id 0))
	 ;; (jparams  (db:obj->string params)) ;; (rmt:dat->json-str params))
	 (res (case (rmt:run-id->transport-type run-id)
                ((http) 
                 (handle-exceptions
                  exn
                  #f
                  (http-transport:client-api-send-receive run-id connection-info cmd params)))
                ((rpc)
                 (handle-exceptions
                  exn
                  #f
                  (rpc-transport:client-api-send-receive run-id connection-info cmd params)))
                (else  
                 (debug:print-error 0 *default-log-port* "(4) Transport [" *transport-type*
                                    "] specified for run-id [" run-id
                                    "] is not implemented in rmt:send-receive-no-auto-client-setup.  Cannot proceed.")
                 (exit 1)))))

              
              ;;		    ((commfail) (vector #f "communications fail")))))
    (if (and res (vector-ref res 0))
	(vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!!
	#f)))
;; 	(db:string->obj (vector-ref dat 1))
;; 	(begin
;; 	  (debug:print-error 0 *default-log-port* "rmt:send-receive-no-auto-client-setup failed, attempting to continue. Got " dat)
;; 	  dat))))
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
(define (rmt:login run-id)
  (rmt:send-receive 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))

;; This login does no retries under the hood - it acts a bit like a ping.
;; Deprecated for nmsg-transport.
;;
(define (rmt:login-no-auto-client-setup connection-info run-id)
  (case *transport-type*
    ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))
    ;;((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*)))
    ))

;; hand off a call to one of the db:queries statements
;; added run-id to make looking up the correct db possible 
;;
(define (rmt:general-call stmtname run-id . params)
  (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params)))








<
|
<
<







296
297
298
299
300
301
302

303


304
305
306
307
308
309
310
(define (rmt:login run-id)
  (rmt:send-receive 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))

;; This login does no retries under the hood - it acts a bit like a ping.
;; Deprecated for nmsg-transport.
;;
(define (rmt:login-no-auto-client-setup connection-info run-id)

  (rmt:send-receive-no-auto-client-setup connection-info 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))



;; hand off a call to one of the db:queries statements
;; added run-id to make looking up the correct db possible 
;;
(define (rmt:general-call stmtname run-id . params)
  (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params)))

Modified rpc-transport.scm from [62a65daa58] to [678adcae88].

1
2
3
4
5
6
7
8
9

;; Copyright 2006-2012, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.

|







1
2
3
4
5
6
7
8
9

;; Copyright 2006-2016, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.
19
20
21
22
23
24
25




26
27
28
























29
30
31



















































32













33







34








35








36


































37
38
39
40
41
42
43
44
45


46
47





48
49
50
51
52





53
















































54
55
56





























57



58












59
60
61

62
63






64

65
66
67
68
69
70
71



72
73



74











75
76


77
78
79
80
81

82

83





84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99













100
101
102
103
104
105
106
107
108



109
110


111
112
113
114
115
116

117





118






119
120
121
122
123







124













125







126
127
128


129
130

131









132
133


134





135











136





137












138
139

140
141
142
143
144
145
146


147

148

149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193

194
195

196


197
198
199
200
201
202
203
204
205
206
207
208
209
210

211
212
213
214

215
216
217
218
219
220



221
222
223
224
225
226







227
228


(declare (uses common))
(declare (uses db))
(declare (uses tests))
(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running.

(include "common_records.scm")
(include "db_records.scm")





;; procstr is the name of the procedure to be called as a string
(define (rpc-transport:autoremote procstr params)
























  (handle-exceptions
   exn
   (begin



















































     (debug:print 1 *default-log-port* "Remote failed for " proc " " params)













     (apply (eval (string->symbol procstr)) params))







   ;; (if *runremote*








   ;;    (apply (eval (string->symbol (conc "remote:" procstr))) params)








   (apply (eval (string->symbol procstr)) params)))



































;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (rpc-transport:launch run-id)
  (let* ((tdbdat (tasks:open-db)))
    (BB> "rpc-transport:launch fired for run-id="run-id)
    (set! *run-id*   run-id)


    (if (args:get-arg "-daemonize")
        (daemon:ize))





    (if (server:check-if-running run-id)
        (begin
          (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running")
          (exit 0)))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))





               (remtries  4))
















































      (if (not server-id)
          (if (> remtries 0)
              (begin





























                (thread-sleep! 2)



                (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)












                      (- remtries 1)))
              (begin
                ;; since we didn't get the server lock we are going to clean up and bail out

                (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
                (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " rpc-transport:launch")))






          (begin

            (rpc-transport:run (if (args:get-arg "-server")(args:get-arg "-server") "-") run-id server-id)
            (exit))))))

(define (rpc-transport:run hostn run-id server-id)
  (debug:print 2 *default-log-port* "Attempting to start the rpc server ...")
   ;; (trace rpc:publish-procedure!)




  (rpc:publish-procedure! 'server:login server:login)
  (rpc:publish-procedure! 'testing (lambda () "Just testing"))















  (let* ((db              #f)
	 (hostname        (get-host-name))


	 (ipaddrstr       (let ((ipstr (if (string=? "-" hostn)
					   ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
					   (server:get-best-guess-address hostname)
					   #f)))
			    (if ipstr ipstr hostn))) ;; hostname))) 

	 (start-port      (open-run-close tasks:server-get-next-port tasks:open-db))

	 (link-tree-path  (configf:lookup *configdat* "setup" "linktree"))





	 (rpc:listener    (rpc-transport:find-free-port-and-open (rpc:default-server-port)))
	 (th1             (make-thread
			   (lambda ()
			     ((rpc:make-server rpc:listener) #t))
			   "rpc:server"))
			   ;; (cute (rpc:make-server rpc:listener) "rpc:server")
			   ;; 'rpc:server))
	 (hostname        (if (string=? "-" hostn)
			      (get-host-name) 
			      hostn))
	 (ipaddrstr       (if (string=? "-" hostn)
			      (server:get-best-guess-address hostname) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
			      #f))
	 (portnum         (rpc:default-server-port))
	 (host:port       (conc (if ipaddrstr ipaddrstr hostname) ":" portnum))
	 (tdb             (tasks:open-db)))













    (thread-start! th1)
    (set! db *inmemdb*)
    (open-run-close tasks:server-set-interface-port 
		    tasks:open-db 
		    server-id 
		    ipaddrstr portnum)
    (debug:print 0 *default-log-port* "Server started on " host:port)
    
    ;; (trace rpc:publish-procedure!)



    ;; (rpc:publish-procedure! 'server:login server:login)
    ;; (rpc:publish-procedure! 'testing (lambda () "Just testing"))



    ;;======================================================================
    ;;	  ;; end of publish-procedure section
    ;;======================================================================
    ;;
    (on-exit (lambda ()

	       (open-run-close tasks:server-set-state! tasks:open-db server-id "stopped")))












    (set! *rpc:listener* rpc:listener)
    (tasks:server-set-state! tdb server-id "running")
    (set! *inmemdb*  (db:setup run-id))
    ;; if none running or if > 20 seconds since 
    ;; server last used then start shutdown







    (let loop ((count 0))













      (thread-sleep! 5) ;; no need to do this very often







      (let ((numrunning -1)) ;; (db:get-count-tests-running db)))
	(if (or (> numrunning 0)
		(> (+ *last-db-access* 60)(current-seconds)))


	    (begin
	      (debug:print-info 0 *default-log-port* "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*))

	      (loop (+ 1 count)))









	    (begin
	      (debug:print-info 0 *default-log-port* "Starting to shutdown the server side")


	      (open-run-close tasks:server-delete-record tasks:open-db server-id " rpc-transport:try-start-server stop")





	      (thread-sleep! 10)











	      (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*)





	      (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting")












	      ))))))


(define (rpc-transport:find-free-port-and-open port)
  (handle-exceptions
   exn
	  (begin
     (print "Failed to bind to port " (rpc:default-server-port) ", trying next port")
     (rpc-transport:find-free-port-and-open (+ port 1)))
   (rpc:default-server-port port)


   (tcp-read-timeout 240000)

   (tcp-listen (rpc:default-server-port) 10000)))


(define (rpc-transport:ping run-id host port)
  (handle-exceptions
   exn
   (begin
     (print "SERVER_NOT_FOUND")
     (exit 1))
   (let ((login-res ((rpc:procedure 'server:login host port) *toppath*)))
     (if (and (list? login-res)
	      (car login-res))
	 (begin
	   (print "LOGIN_OK")
	   (exit 0))
	 (begin
	   (print "LOGIN_FAILED")
	   (exit 1))))))

(define (rpc-transport:client-setup run-id #!key (remtries 10))
  (if *runremote*
      (begin
	(debug:print-error 0 *default-log-port* "Attempt to connect to server but already connected")
	#f)
      (let* ((host-info (hash-table-ref/default *runremote* run-id #f))) ;; (open-run-close db:get-var #f "SERVER"))
	(if host-info
	    (let ((iface    (car host-info))
		  (port     (cadr host-info))
		  (ping-res ((rpc:procedure 'server:login host port) *toppath*)))
	      (if ping-res
		  (let ((server-dat (list iface port #f #f #f)))
		    (hash-table-set! *runremote* run-id server-dat)
		    server-dat)
		  (begin
		    (server:try-running run-id)
		    (thread-sleep! 2)
		    (rpc-transport:client-setup run-id (- remtries 1)))))
 	    (let* ((server-db-info (open-run-close tasks:get-server tasks:open-db run-id)))
 	      (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	      (if server-db-info
 		  (let* ((iface     (tasks:hostinfo-get-interface server-db-info))
 			 (port      (tasks:hostinfo-get-port      server-db-info))
			 (server-dat (list iface port #f #f #f))
 			 (ping-res  ((rpc:procedure 'server:login host port) *toppath*)))
 		    (if start-res
 			(begin
 			  (hash-table-set! *runremote* run-id server-dat)

			  server-dat)
			(begin

			  (server:try-running run-id)


			  (thread-sleep! 2)
			  (rpc-transport:client-setup run-id (- remtries 1)))))
		  (begin
		    (server:try-running run-id)
		    (thread-sleep! 2)
		    (rpc-transport:client-setup run-id (- remtries 1)))))))))
;; 
;; 	     (port     (if (and hostinfo (> (length hostdat) 1))(cadr hostdat) #f)))
;; 	(if (and port
;; 		 (string->number port))
;; 	    (let ((portn (string->number port)))
;; 	      (debug:print-info 2 *default-log-port* "Setting up to connect to host " host ":" port)
;; 	      (handle-exceptions
;; 	       exn

;; 	       (begin
;; 		 (debug:print-error 0 *default-log-port* "Failed to open a connection to the server at host: " host " port: " port)
;; 		 (debug:print 0 *default-log-port* "   EXCEPTION: " ((condition-property-accessor 'exn 'message) exn))
;; 		 ;; (open-run-close 

;; 		 ;;  (lambda (db . param) 
;; 		 ;;    (sqlite3:execute db "DELETE FROM metadat WHERE var='SERVER'"))
;; 		 ;;  #f)
;; 		 (set! *runremote* #f))
;; 	       (if (and (not (args:get-arg "-server")) ;; no point in the server using the server using the server
;; 			((rpc:procedure 'server:login host portn) *toppath*))



;; 		   (begin
;; 		     (debug:print-info 2 *default-log-port* "Logged in and connected to " host ":" port)
;; 		     (set! *runremote* (vector host portn)))
;; 		   (begin
;; 		     (debug:print-info 2 *default-log-port* "Failed to login or connect to " host ":" port)
;; 		     (set! *runremote* #f)))))







;; 	    (debug:print-info 2 *default-log-port* "no server available")))))










>
>
>
>


|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>






<
<
|
>
>
|
|
>
>
>
>
>
|
<
|
|
|
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
|
|
<
>
|
<
>
>
>
>
>
>
|
>
|
|





>
>
>
|
|
>
>
>

>
>
>
>
>
>
>
>
>
>
>

|
>
>
|


|
|
>
|
>

>
>
>
>
>
|


|

|
|
|





|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>


|
<
<
<


|
>
>
>
|
<
>
>
|
<
<
<
<

>
|
>
>
>
>
>

>
>
>
>
>
>
|
|
|
|
|
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
|
|
|
>
>
|
<
>
|
>
>
>
>
>
>
>
>
>
|
|
>
>
|
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
|

>
|


|

|

>
>

>
|
>
|




|


<
|







|
|
<
<
<
<
|
<
<
|
|
|
<
<
|
<
<
<
<
<
<
<
<
<
<
<
|
<
>
|
|
>
|
>
>
|
|
<
<
|
|
<
<
<
<
<
<
<
|
>
|
<
<
<
>
|
<
<
<
<
|
>
>
>
|
|
|
<
<
|
>
>
>
>
>
>
>
|
|
>
>
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191


192
193
194
195
196
197
198
199
200
201
202

203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260

261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309

310
311

312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396



397
398
399
400
401
402
403

404
405
406




407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461

462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537

538
539
540
541
542
543
544
545
546
547




548


549
550
551


552











553

554
555
556
557
558
559
560
561
562


563
564







565
566
567



568
569




570
571
572
573
574
575
576


577
578
579
580
581
582
583
584
585
586
587
588
(declare (uses common))
(declare (uses db))
(declare (uses tests))
(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running.

(include "common_records.scm")
(include "db_records.scm")

(define *heartbeat-mutex* (make-mutex))
(define *server-loop-heart-beat* (current-seconds))


;; procstr is the name of the procedure to be called as a string
(define (rpc-transport:autoremote procstr params)  ;; may be unused, I think api-exec deprecates this one.
  (let* ((procsym (if (symbol? procstr)
                     procstr
                     (string->symbol (->string procstr))))
        (res
         (begin
           (apply (eval procsym) params))))
    res))


;; rpc receiver
(define (rpc-transport:api-exec cmd params)
  (let* ( (resdat  (api:execute-requests *inmemdb* (vector cmd params))) ;; #( flag result )
          (flag    (vector-ref resdat 0))
          (res     (vector-ref resdat 1)))

    (mutex-lock! *heartbeat-mutex*)

    (set! *last-db-access* (current-seconds)) ;; bump *last-db-access*; this will renew keep-running thread's lease on life for another (server:get-timeout) seconds
    (BB> "in api-exec; last-db-access updated to "*last-db-access*)
    (mutex-unlock! *heartbeat-mutex*)

    res))


  ;; (handle-exceptions
  ;;  exn
  ;;  (begin
  ;;    (debug:print 0 *default-log-port* "Remote failed for " proc " " params " exn="exn)
  ;;    (apply (eval (string->symbol procstr)) params))
  ;;  ;; (if *runremote*
  ;;  ;;    (apply (eval (string->symbol (conc "remote:" procstr))) params)
  ;;  (apply (eval (string->symbol procstr)) params)))

;; retry an operation (depends on srfi-18)
;; ==================
;; idea here is to avoid spending time on coding retrying something.  Trying to be generic here.
;;
;; Exception handling:
;; -------------------
;; if evaluating the thunk results in exception, it will be retried.
;; on last try, if final-failure-returns-actual is true, the exception will be re-thrown to caller.
;;
;; look at options below #!key to see how to configure behavior
;;
;;
(define (retry-thunk
         the-thunk
         #!key ;;;; options below
         (accept-result?   (lambda (x) x)) ;; retry if predicate applied to thunk's result is false 
         (retries                       4) ;; how many tries
         (failure-value                #f) ;; return this on final failure, unless following option is enabled:
         (final-failure-returns-actual #f) ;; on failure, on the last try, just return the result, not failure-value

         (retry-delay                 0.1) ;; delay between tries
         (back-off-factor               1) ;; multiply retry-delay by this factor on retry
         (random-delay                0.1) ;; add a random portion of this value to wait

         (chatty                       #f) ;; print status as we go, for debugging.
         )
  
  (when chatty (print) (print "Entered retry-thunk") (print "-=-=-=-=-=-"))
  (let* ((guarded-thunk ;; we are guarding the thunk against exceptions.  We will record whether result of evaluation is an exception or a regular result.
          (lambda ()
           (let* ((EXCEPTION (gensym)) ;; using gensym to avoid potential collision
                  (res
                   (condition-case
                    (the-thunk) ;; this is what we are guarding the execution of
                    [x () (cons EXCEPTION x)]
                    )))
             (cond
              ((and (pair? res) (eq? (car res) EXCEPTION))
               (if chatty
                   (print " - the-thunk threw exception >"(cdr res)"<"))
               (cons 'exception (cdr res)))
               (else
                (if chatty
                    (print " - the-thunk returned result >"res"<"))
                (cons 'regular-result res)))))))
    
    (let loop ((guarded-res (guarded-thunk))
               (retries-left retries)
               (fail-wait retry-delay))
      (if chatty (print "   =========="))
      (let* ((wait-time (+ fail-wait (+ (* fail-wait back-off-factor)
                                        (* random-delay
                                           (/ (random 1024) 1024) ))))
             (res-type (car guarded-res))
             (res-value (cdr guarded-res)))
        (cond
         ((and (eq? res-type 'regular-result) (accept-result? res-value))
                   (if chatty (print " + return result that satisfied accept-result? >"res-value"<"))
                   res-value)

         ((> retries-left 0)
          (if chatty (print " - sleep "wait-time))
          (thread-sleep! wait-time)
          (if chatty (print " + retry ["retries-left" tries left]"))
          (loop (guarded-thunk)
                (sub1 retries-left)
                wait-time))
         
         ((eq? res-type 'regular-result)
          (if final-failure-returns-actual
              (begin
                (if chatty (print " + last try failed- return the result >"res-value"<"))
                res-value)
              (begin
                (if chatty (print " + last try failed- return canned failure value >"failure-value"<"))
              failure-value)))
         
         (else ;; no retries left; result was not accepted and res-type can only be 'exception
          (if final-failure-returns-actual 
              (begin
                (if chatty (print " + last try failed with exception- re-throw it >"res-value"<"))
                (abort res-value)); re-raise the exception. TODO: find a way for call-history to show as though from entry to this function
              (begin
                (if chatty (print " + last try failed with exception- return canned failure value >"failure-value"<"))
                failure-value))))))))


(define (rpc-transport:server-shutdown server-id rpc:listener #!key (from-on-exit #f))
  (on-exit (lambda () #t)) ;; turn off on-exit stuff
  ;;(tcp-close rpc:listener) ;; gotta exit nicely
  ;;(tasks:bb-server-set-state! server-id "stopped")


  ;; TODO: (low) the following is extraordinaritly slow.  Maybe we don't even need portlogger for rpc anyway??  the exception-based failover when ports are taken is fast!
  ;;(portlogger:open-run-close portlogger:set-port (rpc:default-server-port) "released")
  
  (set! *time-to-exit* #t)
  (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t))
  (tasks:bb-server-delete-record server-id " rpc-transport:keep-running complete")
  (BB> "Before (exit) (from-on-exit="from-on-exit")")
  (unless from-on-exit (exit))  ;; sometimes we hang (around) here with 100% cpu.
  (BB> "After")
  ;; strace reveals endless:
  ;; getrusage(RUSAGE_SELF, {ru_utime={413, 917868}, ru_stime={0, 60003}, ...}) = 0
  ;; getrusage(RUSAGE_SELF, {ru_utime={414, 9874}, ru_stime={0, 60003}, ...}) = 0
  ;; getrusage(RUSAGE_SELF, {ru_utime={414, 13874}, ru_stime={0, 60003}, ...}) = 0
  ;; getrusage(RUSAGE_SELF, {ru_utime={414, 105880}, ru_stime={0, 60003}, ...}) = 0
  ;; getrusage(RUSAGE_SELF, {ru_utime={414, 109880}, ru_stime={0, 60003}, ...}) = 0
  ;; getrusage(RUSAGE_SELF, {ru_utime={414, 201886}, ru_stime={0, 60003}, ...}) = 0
  ;; getrusage(RUSAGE_SELF, {ru_utime={414, 205886}, ru_stime={0, 60003}, ...}) = 0
  ;; getrusage(RUSAGE_SELF, {ru_utime={414, 297892}, ru_stime={0, 60003}, ...}) = 0
  ;; getrusage(RUSAGE_SELF, {ru_utime={414, 301892}, ru_stime={0, 60003}, ...}) = 0
  ;; getrusage(RUSAGE_SELF, {ru_utime={414, 393898}, ru_stime={0, 60003}, ...}) = 0
  ;; getrusage(RUSAGE_SELF, {ru_utime={414, 397898}, ru_stime={0, 60003}, ...}) = 0
  ;; make a post to chicken-users w/ http://paste.call-cc.org/paste?id=60a4b66a29ccf7d11359ea866db642c970735978
  (if from-on-exit
      ;; avoid above condition!  End current process externally  since 1 in 20 (exit)'s result in hung, 100% cpu zombies. (see above)
      (system (conc  "kill -9  "(current-process-id))))
  )


;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (rpc-transport:launch run-id)


  (set! *run-id*   run-id)

  ;; ;; send to background if requested
  ;; (when (args:get-arg "-daemonize")
  ;;     (daemon:ize)
  ;;     (when *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
  ;;       (current-error-port *alt-log-file*)
  ;;       (current-output-port *alt-log-file*)))

  ;; double check we dont alrady have a running server for this run-id
  (when (server:check-if-running run-id)

    (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running")
    (exit 0))

  ;; let's get a server-id for this server
  ;;   if at first we do not suceed, try 3 more times.
  (let ((server-id (retry-thunk
                    (lambda () (tasks:bb-server-lock-slot run-id 'rpc))
                    chatty: #f
                    retries: 4)))
    (when (not server-id) ;; dang we couldn't get a server-id.
      ;; since we didn't get the server lock we are going to clean up and bail out
      (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
      (tasks:bb-server-delete-records-for-this-pid " rpc-transport:launch")
      (exit 1))

    ;; we got a server-id (and a corresponding entry in servers table in globally shared mdb)
    ;; all systems go.  Proceed to setup rpc server.  
    (rpc-transport:run
     (if (args:get-arg "-server")
         (args:get-arg "-server")
         "-")
     run-id
     server-id)
    (exit)))

(define *rpc-listener-port* #f)
(define *rpc-listener-port-bind-timestamp* #f)

(define *on-exit-flag #f)

(define (rpc-transport:server-dat-get-iface         vec)    (vector-ref  vec 0))
(define (rpc-transport:server-dat-get-port          vec)    (vector-ref  vec 1))
(define (rpc-transport:server-dat-get-last-access   vec)    (vector-ref  vec 5))
(define (rpc-transport:server-dat-get-transport     vec)    (vector-ref  vec 6))
(define (rpc-transport:server-dat-update-last-access vec)
  (if (vector? vec)
      (vector-set! vec 5 (current-seconds))
      (begin
	(print-call-chain (current-error-port))
	(debug:print-error 0 *default-log-port* "call to rpc-transport:server-dat-update-last-access with non-vector!!"))))


(define *api-exec-ht* (make-hash-table))

;; let's see if caching the rpc stub curbs thread-profusion on server side
(define (rpc-transport:get-api-exec iface port)
  (let* ((lu (hash-table-ref/default *api-exec-ht* '(iface . port) #f)))
    (if lu
        lu
        (let ((res (rpc:procedure 'api-exec iface port)))
          (hash-table-set! *api-exec-ht* '(iface . port) res)
          res))))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; this client-side procedure makes rpc call to server and returns result
;;
(define (rpc-transport:client-api-send-receive run-id serverdat cmd params #!key (numretries 3))
  (if (not (vector? serverdat))

      (begin
        (BB> "WHAT?? for run-id="run-id", serverdat="serverdat)
        (print-call-chain)
        (exit 1)))
  (let* ((iface (rpc-transport:server-dat-get-iface serverdat))
         (port  (rpc-transport:server-dat-get-port serverdat))
         (res #f)
         (api-exec (rpc-transport:get-api-exec iface port))  
         (send-receive (lambda ()
                         (tcp-buffer-size 0)
                         (set! res (retry-thunk
                                    (lambda ()
                                      (condition-case
                                       ;;(vector #t (run-remote cmd params))
                                       (vector 'success (api-exec cmd params))
                                       [x (exn i/o net) (vector 'comms-fail (conc "communications fail ["(->string x)"]") x)]
                                       [x () (vector 'other-fail "other fail ["(->string x)"]" x)]))
                                    chatty: #f
                                    accept-result?: (lambda(x)
                                                      (and (vector? x) (vector-ref x 0)))
                                    retries: 4
                                    back-off-factor: 1.5
                                    random-wait: 0.2
                                    retry-delay: 0.1
                                    final-failure-returns-actual: #t))
                         res
                         ))
         (th1 (make-thread send-receive "send-receive"))
         (time-out-reached #f)
         (time-out     (lambda ()
			      (thread-sleep! 45)
                              (set! time-out-reached #t)
                              (thread-terminate! th1)
			      #f))

         (th2 (make-thread time-out     "time out")))
	 (thread-start! th1)
	 (thread-start! th2)
	 (thread-join! th1)
	 (thread-terminate! th2)
	 (debug:print-info 11 *default-log-port* "got res=" res)
	 (if (vector? res)
             (case (vector-ref res 0)
               ((success) (vector #t (vector-ref res 1)))
               ((comms-fail)
                (debug:print 0 *default-log-port* "WARNING: comms failure for rpc request")
                ;;(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
                (vector #f (vector-ref res 1)))
               (else

                (debug:print-error 0 *default-log-port* "error occured at server, info=" (vector-ref res 1))
                (debug:print 0 *default-log-port* " client call chain:")

                (print-call-chain (current-error-port))
                (debug:print 0 *default-log-port* " server call chain:")
                (pp (vector-ref res 1) (current-error-port))
                (signal (vector-ref res 2))))
             (signal (make-composite-condition
                      (make-property-condition 
             	       'timeout
             	       'message "nmsg-transport:client-api-send-receive-raw timed out talking to server"))))))
        


(define (rpc-transport:run hostn run-id server-id)
  (debug:print 2 *default-log-port* "Attempting to start the rpc server ...")
   ;; (trace rpc:publish-procedure!)

  ;;======================================================================
  ;;	  start of publish-procedure section
  ;;======================================================================
  (rpc:publish-procedure! 'server:login server:login) ;; this allows client to validate it is the same megatest instance as the server.  No security here, just making sure we're in the right room.
  (rpc:publish-procedure!
   'testing
   (lambda ()
     "Just testing"))

  ;; procedure to receive arbitrary API request from client's rpc:send-receive/rpc-transport:client-api-send-receive 
  (rpc:publish-procedure! 'rpc-transport:autoremote rpc-transport:autoremote)
  ;; can use this to run most anything at the remote
  (rpc:publish-procedure! 'api-exec rpc-transport:api-exec)
  
  
  ;;======================================================================
  ;;	  end of publish-procedure section
  ;;======================================================================


  (let* ((db              #f)
	 (hostname        (let ((res (get-host-name)))  res))
         (server-start-time (current-seconds))
         (server-timeout (server:get-timeout))
	 (ipaddrstr       (let* ((ipstr (if (string=? "-" hostn)
					   ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
					   (server:get-best-guess-address hostname)
					   #f))
                                 (res (if ipstr ipstr hostn)))
                            res)) ;; hostname))) 
	 (start-port      (let ((res (portlogger:open-run-close portlogger:find-port)))      ;; BB> TODO: remove portlogger!
                            res))
	 (link-tree-path  (configf:lookup *configdat* "setup" "linktree"))

         ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
         ;; rpc:listener is the tcp-listen result from inside the find-free-port-and-open complex.
         ;;   It is our handle on the listening tcp port
         ;;   We will attach this to our rpc server with rpc:make-server in thread th1 .
	 (rpc:listener    (rpc-transport:find-free-port-and-open start-port)) 
	 (th1             (make-thread
			   (lambda ()
			     ((rpc:make-server rpc:listener) #t) )
			   "rpc:server"))


         (hostname        (if (string=? "-" hostn)
			      (get-host-name) 
			      hostn))
	 (ipaddrstr       (if (string=? "-" hostn)
			      (server:get-best-guess-address hostname) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
			      #f))
	 (portnum         (let ((res (rpc:default-server-port)))  res))
	 (host:port       (conc (if ipaddrstr ipaddrstr hostname) ":" portnum)))


    ;; BB> TODO: remove portlogger!
    ;; if rpc found it needed a different port than portlogger provided, keep portlogger in the loop.
    ;; (when (not (equal? start-port portnum))
    ;;   (BB> "portlogger proffered "start-port" but rpc grabbed "portnum)
    ;;   (portlogger:open-run-close portlogger:set-port start-port "released")
    ;;   (portlogger:open-run-close portlogger:take-port portnum))

    (tasks:bb-server-set-interface-port server-id ipaddrstr portnum)

    ;;============================================================
    ;;  activate thread th1 to attach opened tcp port to rpc server
    ;;=============================================================
    (thread-start! th1)
    (set! db *inmemdb*)




    (debug:print 0 *default-log-port* "Server started on " host:port)
    

    (thread-sleep! 5)
    (if (rpc-transport:self-test run-id ipaddrstr portnum)
        (debug:print 0 *default-log-port* "INFO: rpc self test passed!")
        (begin

          (debug:print 0 *default-log-port* "Error: rpc listener did not pass self test.  Shutting down.  On: " host:port)
          (exit)))
    




    (on-exit (lambda ()
               (rpc-transport:server-shutdown server-id rpc:listener from-on-exit: #t)))
    
    ;; check again for running servers for this run-id in case one has snuck in since we checked last in rpc-transport:launch
    (if (not (equal? server-id (tasks:bb-server-am-i-the-server? run-id)));; try to ensure no double registering of servers
        (begin ;; i am not the server, another server snuck in and beat this one to the punch
          (tcp-close rpc:listener) ;; gotta exit nicely and free up that tcp port
          (tasks:bb-server-set-state! server-id "collision"))

        (begin ;; i am the server
          ;; setup the in-memory db
          (set! *inmemdb*  (db:setup run-id))
          (db:get-db *inmemdb* run-id)

          ;; let's make it official
          (set! *rpc:listener* rpc:listener) 
          (tasks:bb-server-set-state! server-id "running") ;; update our mdb servers entry

          
          
          ;; this let loop will hold open this thread until we want the server to shut down.
          ;;   if no requests received within the last 20 seconds :
          ;;   database hasnt changed in ??
          ;;

          ;; begin new loop
          ;; keep-running loop: polls last-db-access to see if we have timed out.  
          (let loop ((count          0)
                     (bad-sync-count 0))

            ;; Use this opportunity to sync the inmemdb to db
            (let ((start-time (current-milliseconds))
                  (sync-time  #f)
                  (rem-time   #f))
              ;; inmemdb is a dbstruct
              (condition-case
               (db:sync-touched *inmemdb* *run-id* force-sync: #t)
               ((sync-failed)(cond
                              ((> bad-sync-count 10) ;; time to give up
                               (rpc-transport:server-shutdown server-id rpc:listener))
                              (else ;; (> bad-sync-count 0)  ;; we've had a fail or two, delay and loop
                               (thread-sleep! 5)
                               (loop count (+ bad-sync-count 1)))))
               ((exn)
                (debug:print-error 0 *default-log-port* "error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server")
                (rpc-transport:server-shutdown server-id rpc:listener)))
              (set! sync-time  (- (current-milliseconds) start-time))
              (set! rem-time (quotient (- 4000 sync-time) 1000))
              (debug:print 4 *default-log-port* "SYNC: time= " sync-time ", rem-time=" rem-time)
              
              (if (and (<= rem-time 4)
                       (> rem-time 0))
                  (thread-sleep! rem-time)
                  (thread-sleep! 4))) ;; fallback for if the math is changed ...
            

            (if (< count 1) ;; 3x3 = 9 secs aprox
                (loop (+ count 1) bad-sync-count))

            ;; BB: don't see how this is possible with RPC
            ;; ;; Check that iface and port have not changed (can happen if server port collides)
            ;; (mutex-lock! *heartbeat-mutex*)
            ;; (set! sdat *server-info*)
            ;; (mutex-unlock! *heartbeat-mutex*)
            
            ;; (if (or (not (equal? sdat (list iface port)))
            ;;         (not server-id))
            ;;     (begin 
            ;;       (debug:print-info 0 *default-log-port* "interface changed, refreshing iface and port info")
            ;;       (set! iface (car sdat))
            ;;       (set! port  (cadr sdat))))
            
            ;; Transfer *last-db-access* to last-access to use in checking that we are still alive
            (mutex-lock! *heartbeat-mutex*)
            (set! last-access *last-db-access*)
            (BB> "in rpc-transport:run ; last-access="last-access)
            (mutex-unlock! *heartbeat-mutex*)
            
            ;; (debug:print 11 *default-log-port* "last-access=" last-access ", server-timeout=" server-timeout)
            ;;
            ;; no_traffic, no running tests, if server 0, no running servers
            ;;
            ;; (let ((wait-on-running (configf:lookup *configdat* "server" b"wait-on-running"))) ;; wait on running tasks (if not true then exit on time out)
            ;;
            (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600))
                   (adjusted-timeout (if (> hrs-since-start 1)
                                         (- server-timeout (inexact->exact (round (* hrs-since-start 60))))  ;; subtract 60 seconds per hour
                                         server-timeout)))
              (if (common:low-noise-print 120 "server timeout")
                  (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout))
              (if (and *server-run*
                       (> (+ last-access server-timeout)
                          (current-seconds)))
                  (begin
                    (if (common:low-noise-print 120 "server continuing")
                        (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)))
                    ;;
                    ;; Consider implementing some smarts here to re-insert the record or kill self is
                    ;; the db indicates so
                    ;;
                    (if (tasks:bb-server-am-i-the-server? run-id)
                        (tasks:bb-server-set-state! server-id "running"))
                    ;;
                    (loop 0 bad-sync-count))
                  (begin
                    (BB> "SERVER SHUTDOWN CALLED!  last-access="last-access" current-seconds="(current-seconds)" server-timeout="server-timeout)
                    (rpc-transport:server-shutdown server-id rpc:listener)))))
          ;; end new loop
          ))))


(define (rpc-transport:find-free-port-and-open port #!key )
  (handle-exceptions
   exn
   (begin
     (print "Failed to bind to port " (rpc:default-server-port) ", trying next port")
     (rpc-transport:find-free-port-and-open (add1 port)))
   (rpc:default-server-port port)
   (set! *rpc-listener-port* port) ;; a bit paranoid about rpc:default-server-port parameter not changing across threads (as params are wont to do).  keeping this global in my back pocket in case this causes problems
   (set! *rpc-listener-port-bind-timestamp* (current-milliseconds)) ;; may want to test how long it has been since the last bind attempt happened...
   (tcp-read-timeout 240000)
   (tcp-buffer-size 0) ;; gotta do this because http-transport undoes it.
   (tcp-listen (rpc:default-server-port) 10000)
   ))
  
(define (rpc-transport:ping run-id host port)
  (handle-exceptions
   exn
   (begin
     (print "SERVER_NOT_FOUND exn="exn)
     (exit 1))
   (let ((login-res ((rpc:procedure 'server:login host port) *toppath*)))

     (if login-res
	 (begin
	   (print "LOGIN_OK")
	   (exit 0))
	 (begin
	   (print "LOGIN_FAILED")
	   (exit 1))))))

(define (rpc-transport:self-test run-id host port)
  (tcp-buffer-size 0) ;; gotta do this because http-transport undoes it.




  (let* ((testing-res ((rpc:procedure 'testing host port)))


         (login-res ((rpc:procedure 'server:login host port) *toppath*))
         (res (and login-res (equal? testing-res "Just testing"))))
    


    (if login-res











        (begin

          (BB> "Self test PASS.  login-res="login-res" testing-res="testing-res" *toppath*="*toppath*)
          #t)
        (begin
          (BB> "Self test fail.  login-res="login-res" testing-res="testing-res" *toppath*="*toppath*)
           
          #f))
    res))

(define (rpc-transport:client-setup run-id server-dat #!key (remtries 10))


  (tcp-buffer-size 0)
  (debug:print-info 0 *default-log-port* "rpc-transport:client-setup run-id="run-id" server-dat=" server-dat ", remaining-tries=" remtries)







  (let* ((iface     (tasks:hostinfo-get-interface server-dat))
         (hostname  (tasks:hostinfo-get-hostname  server-dat))
         (port      (tasks:hostinfo-get-port      server-dat))



         (runremote-server-dat (vector iface port #f #f #f (current-seconds) 'rpc)) ;; http version := (vector iface port api-uri api-url api-req (current-seconds) 'http  )
         (ping-res (retry-thunk (lambda ()  ;; make 3 attempts to ping.




                                  ((rpc:procedure 'server:login iface port) *toppath*))
                                retries: 3)))
    ;; we got here from rmt:get-connection-info on the condition that *runremote* has no entry for run-id...
    (if ping-res
        (begin
          (debug:print-info 0 *default-log-port* "rpc-transport:client-setup CONNECTION ESTABLISHED run-id="run-id" server-dat=" server-dat)
          (hash-table-set! *runremote* run-id runremote-server-dat)  ;; side-effect - *runremote* cache init fpr rmt:*


          runremote-server-dat)
        (begin ;; login failed but have a server record, clean out the record and try again
          (tasks:kill-server-run-id run-id)
          (tasks:bb-server-force-clean-run-record  run-id iface port
                                                   " rpc-transport:client-setup (server-dat = #t)")
          (if (> remtries 2)
              (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
              (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time
          (server:try-running run-id)
          (thread-sleep! 5)   ;; give server a little time to start up
          (client:setup run-id remaining-tries: (sub1 remtries))
          " rpc-transport:client-setup (server-dat = #t)"))))

Modified server.scm from [19061b35b0] to [262f305ebc].

1
2
3
4
5
6
7
8
9

;; Copyright 2006-2012, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.

|







1
2
3
4
5
6
7
8
9

;; Copyright 2006-2016, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

66
67
68
69
70
71
72
73
74
75
76







77
78
79
80
81
82
83
;;

;; all routes though here end in exit ...
;;
;; start_server
;;
(define (server:launch run-id transport-type)
  (BB> "server:launch fired for run-id="run-id" transport-type="transport-type)
  (case transport-type
    ((http)(http-transport:launch run-id))
    ;;((nmsg)(nmsg-transport:launch run-id))
    ((rpc)  (rpc-transport:launch run-id))
    (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type))))
;;       (else   (debug:print-error 0 *default-log-port* "No known transport set, transport=" transport ", using rpc")
;; 	      (rpc-transport:launch run-id)))))

;;======================================================================
;; S E R V E R   U T I L I T I E S 
;;======================================================================


;; Get the transport
(define (server:get-transport)
  (if *transport-type*
      *transport-type*
      (let ((ttype (string->symbol
		    (or (args:get-arg "-transport")
			(configf:lookup *configdat* "server" "transport")
			"rpc"))))
	(set! *transport-type* ttype)
	ttype)))
	    







;; Generate a unique signature for this server
(define (server:mk-signature)
  (message-digest-string (md5-primitive) 
			 (with-output-to-string
			   (lambda ()
			     (write (list (current-directory)
					  (argv)))))))







|
|
|
|
|
|
|
|





>
|
|
<
<
|
|
|
|
|
|
|
>
>
>
>
>
>
>







46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68


69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
;;

;; all routes though here end in exit ...
;;
;; start_server
;;
(define (server:launch run-id transport-type)
  (let ((ttype (if (symbol? transport-type) transport-type (string->symbol (->string transport-type)))))
    (case ttype
      ((http)(http-transport:launch run-id))
      ;;((nmsg)(nmsg-transport:launch run-id))
      ((rpc)  (rpc-transport:launch run-id))
      (else (debug:print-error 0 *default-log-port* "unknown server type " ttype)))))
  ;;       (else   (debug:print-error 0 *default-log-port* "No known transport set, transport=" transport ", using rpc")
  ;; 	      (rpc-transport:launch run-id)))))

;;======================================================================
;; S E R V E R   U T I L I T I E S 
;;======================================================================

;; set global *transport-type* based on -transport switch and serer/transport configuration.  default http otherwise.
;; called by launch:setup
(define (server:set-transport)


  (let ((ttype (string->symbol
                (or (args:get-arg "-transport")
                    (configf:lookup *configdat* "server" "transport")
                    "rpc"))))
    (set! *transport-type* ttype)
    ttype))

;; Get the transport  -- DO NOT call this from client code.  In client code, this is run-id sensitive and not a global
;;   For code communicating with existing run-id with a server, use: (rmt:run-id->transport-type run-id)
 (define (server:get-transport)
   (if *transport-type*
       *transport-type*
       (server:set-transport)))

;; Generate a unique signature for this server
(define (server:mk-signature)
  (message-digest-string (md5-primitive) 
			 (with-output-to-string
			   (lambda ()
			     (write (list (current-directory)
					  (argv)))))))
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
(define  (server:run run-id)
  (let* ((curr-host   (get-host-name))
	 (curr-ip     (server:get-best-guess-address curr-host))
	 (target-host (configf:lookup *configdat* "server" "homehost" ))
	 (testsuite   (common:get-testsuite-name))
	 (logfile     (conc *toppath* "/logs/" run-id ".log"))
	 (cmdln (conc (common:get-megatest-exe)
		      " -server " (or target-host "-") " -run-id " run-id (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
									      (conc " -daemonize -log " logfile)
									      "")
		      " -m testsuite:" testsuite))) ;; (conc " >> " logfile " 2>&1 &")))))
    (debug:print 0 *default-log-port* "INFO: Starting server (" cmdln ") as none running ...")
    (push-directory *toppath*)
    (if (not (directory-exists? "logs"))(create-directory "logs"))
    
    ;; Rotate logs, logic: 
    ;;                 if > 500k and older than 1 week:
    ;;                     remove previous compressed log and compress this log







|
|
<
|







115
116
117
118
119
120
121
122
123

124
125
126
127
128
129
130
131
(define  (server:run run-id)
  (let* ((curr-host   (get-host-name))
	 (curr-ip     (server:get-best-guess-address curr-host))
	 (target-host (configf:lookup *configdat* "server" "homehost" ))
	 (testsuite   (common:get-testsuite-name))
	 (logfile     (conc *toppath* "/logs/" run-id ".log"))
	 (cmdln (conc (common:get-megatest-exe)
		      " -server " (or target-host "-")
                      " -run-id " run-id " -log " logfile

		      " -m testsuite:" testsuite))) 
    (debug:print 0 *default-log-port* "INFO: Starting server (" cmdln ") as none running ...")
    (push-directory *toppath*)
    (if (not (directory-exists? "logs"))(create-directory "logs"))
    
    ;; Rotate logs, logic: 
    ;;                 if > 500k and older than 1 week:
    ;;                     remove previous compressed log and compress this log
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192

193
194
195
196

197
198
199




200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
(define (server:try-running run-id)
  (if (eq? run-id 0)
      (server:run run-id)
      (rmt:start-server run-id)))

(define (server:check-if-running run-id)
  (let ((tdbdat (tasks:open-db)))
    (let loop ((server (tasks:get-server (db:delay-if-busy tdbdat) run-id))
	       (trycount 0))
    (if server
	;; note: client:start will set *runremote*. this needs to be changed
	;;       also, client:start will login to the server, also need to change that.
	;;
	;; client:start returns #t if login was successful.
	;;

	(let ((res (case *transport-type*
		     ((http)(server:ping-server run-id 
						(tasks:hostinfo-get-interface server)
						(tasks:hostinfo-get-port      server)))

		     ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
		     ;;    			 (tasks:hostinfo-get-port      server)
		     ;;    			 timeout: 2))




                     )))
	  ;; if the server didn't respond we must remove the record
	  (if res
	      #t
	      (begin
		(debug:print-info 0 *default-log-port* "server at " server " not responding, removing record")
		(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id 
				" server:check-if-running")
		res)))
	#f))))

;; called in megatest.scm, host-port is string hostname:port
;;
(define (server:ping run-id host:port)
  (let ((tdbdat (tasks:open-db)))
    (let* ((host-port (let ((slst (string-split   host:port ":")))
			(if (eq? (length slst) 2)
			    (list (car slst)(string->number (cadr slst)))
			    #f)))
	   (toppath       (launch:setup))
	   (server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat) run-id) #f)))
      (if (not run-id)
	  (begin
	    (debug:print-error 0 *default-log-port* "must specify run-id when doing ping, -run-id n")
	    (print "ERROR: No run-id")
	    (exit 1))
	  (if (and (not host-port)
		   (not server-db-dat))







|







>
|



>
|
|
|
>
>
>
>
|



















|







183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
(define (server:try-running run-id)
  (if (eq? run-id 0)
      (server:run run-id)
      (rmt:start-server run-id)))

(define (server:check-if-running run-id)
  (let ((tdbdat (tasks:open-db)))
    (let loop ((server (tasks:get-server-info (db:delay-if-busy tdbdat) run-id))
	       (trycount 0))
    (if server
	;; note: client:start will set *runremote*. this needs to be changed
	;;       also, client:start will login to the server, also need to change that.
	;;
	;; client:start returns #t if login was successful.
	;;
	(let* ((transport-type (rmt:run-id->transport-type run-id))
               (res (case transport-type
		     ((http)(server:ping-server run-id 
						(tasks:hostinfo-get-interface server)
						(tasks:hostinfo-get-port      server)))
                     ((rpc) (rpc-transport:ping  run-id 
                                                 (tasks:hostinfo-get-interface server)
                                                 (tasks:hostinfo-get-port      server)))
                    
                     (else  
                      (debug:print-error 0 *default-log-port* "(5) Transport [" transport-type
                                         "] specified for run-id [" run-id
                                         "] is not implemented in rmt:send-receive.  Cannot proceed.")
                      (exit 1)))))
	  ;; if the server didn't respond we must remove the record
	  (if res
	      #t
	      (begin
		(debug:print-info 0 *default-log-port* "server at " server " not responding, removing record")
		(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id 
				" server:check-if-running")
		res)))
	#f))))

;; called in megatest.scm, host-port is string hostname:port
;;
(define (server:ping run-id host:port)
  (let ((tdbdat (tasks:open-db)))
    (let* ((host-port (let ((slst (string-split   host:port ":")))
			(if (eq? (length slst) 2)
			    (list (car slst)(string->number (cadr slst)))
			    #f)))
	   (toppath       (launch:setup))
	   (server-db-dat (if (not host-port)(tasks:get-server-info (db:delay-if-busy tdbdat) run-id) #f)))
      (if (not run-id)
	  (begin
	    (debug:print-error 0 *default-log-port* "must specify run-id when doing ping, -run-id n")
	    (print "ERROR: No run-id")
	    (exit 1))
	  (if (and (not host-port)
		   (not server-db-dat))
252
253
254
255
256
257
258



259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
       (if (eof-object? inl)
	   (case (string->symbol res)
	     ((NOREPLY)  #f)
	     ((LOGIN_OK) #t)
	     (else       #f))
	   (loop (read-line) inl))))))




(define (server:login toppath)
  (lambda (toppath)
    (set! *last-db-access* (current-seconds))
    (if (equal? *toppath* toppath)
	(begin
	  ;; (debug:print-info 2 *default-log-port* "login successful")
	  #t)
	(begin
	  ;; (debug:print-info 2 *default-log-port* "login failed")
	  #f))))

(define (server:get-timeout)
  (let ((tmo (configf:lookup  *configdat* "server" "timeout")))
    (if (and (string? tmo)
	     (string->number tmo))
	(* 60 60 (string->number tmo))
	;; (* 3 24 60 60) ;; default to three days
	(* 60 1)         ;; default to one minute
	;; (* 60 60 25)      ;; default to 25 hours
	)))








>
>
>

<
|
|
|
|
|
|
|
|











263
264
265
266
267
268
269
270
271
272
273

274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
       (if (eof-object? inl)
	   (case (string->symbol res)
	     ((NOREPLY)  #f)
	     ((LOGIN_OK) #t)
	     (else       #f))
	   (loop (read-line) inl))))))


;; Client will call this procedure on the server via the low-level transport (http/rpc/etc) to verify its toppath matches the server's toppath.
;; A true result means client and server are associated with same megatest instance, share the same megatest.config, etc...)  A false result means the client should not talk to this server.
(define (server:login toppath)

  (set! *last-db-access* (current-seconds))
  (if (equal? *toppath* toppath)
      (begin
        ;; (debug:print-info 2 *default-log-port* "login successful")
        #t)
      (begin
        ;; (debug:print-info 2 *default-log-port* "login failed")
        #f)))

(define (server:get-timeout)
  (let ((tmo (configf:lookup  *configdat* "server" "timeout")))
    (if (and (string? tmo)
	     (string->number tmo))
	(* 60 60 (string->number tmo))
	;; (* 3 24 60 60) ;; default to three days
	(* 60 1)         ;; default to one minute
	;; (* 60 60 25)      ;; default to 25 hours
	)))

Modified tasks.scm from [a06114a2ac] to [736f8afe48].

168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
(define (tasks:hostinfo-get-interface   vec)    (vector-ref  vec 1))
(define (tasks:hostinfo-get-port        vec)    (vector-ref  vec 2))
(define (tasks:hostinfo-get-pubport     vec)    (vector-ref  vec 3))
(define (tasks:hostinfo-get-transport   vec)    (vector-ref  vec 4))
(define (tasks:hostinfo-get-pid         vec)    (vector-ref  vec 5))
(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot")
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id)
	(thread-sleep! (/ (random 1500) 1000)) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id)
  (sqlite3:execute 
   mdb 
   "INSERT INTO servers (pid,hostname,port,pubport,start_time,      priority,state,mt_version,heartbeat,   interface,transport,run_id)
                   VALUES(?, ?,       ?,   ?, strftime('%s','now'), ?,       ?,    ?,-1,?,        ?,        ?);"
   (current-process-id)          ;; pid
   (get-host-name)               ;; hostname
   -1                            ;; port
   -1                            ;; pubport
   (random 1000)                 ;; priority (used a tiebreaker on get-available)
   "available"                   ;; state
   (common:version-signature)    ;; mt_version
   -1                            ;; interface
   ;; (conc (server:get-transport)) ;; transport
   (conc *transport-type*)    ;; transport
   run-id
   ))

(define (tasks:num-in-available-state mdb run-id)
  (let ((res 0))
    (sqlite3:for-each-row
     (lambda (num-in-queue)







|



|





|













|







168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
(define (tasks:hostinfo-get-interface   vec)    (vector-ref  vec 1))
(define (tasks:hostinfo-get-port        vec)    (vector-ref  vec 2))
(define (tasks:hostinfo-get-pubport     vec)    (vector-ref  vec 3))
(define (tasks:hostinfo-get-transport   vec)    (vector-ref  vec 4))
(define (tasks:hostinfo-get-pid         vec)    (vector-ref  vec 5))
(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id transport-type)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot")
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id transport-type)
	(thread-sleep! (/ (random 1500) 1000)) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id transport-type)
  (sqlite3:execute 
   mdb 
   "INSERT INTO servers (pid,hostname,port,pubport,start_time,      priority,state,mt_version,heartbeat,   interface,transport,run_id)
                   VALUES(?, ?,       ?,   ?, strftime('%s','now'), ?,       ?,    ?,-1,?,        ?,        ?);"
   (current-process-id)          ;; pid
   (get-host-name)               ;; hostname
   -1                            ;; port
   -1                            ;; pubport
   (random 1000)                 ;; priority (used a tiebreaker on get-available)
   "available"                   ;; state
   (common:version-signature)    ;; mt_version
   -1                            ;; interface
   ;; (conc (server:get-transport)) ;; transport
   (conc transport-type)    ;; transport
   run-id
   ))

(define (tasks:num-in-available-state mdb run-id)
  (let ((res 0))
    (sqlite3:for-each-row
     (lambda (num-in-queue)
289
290
291
292
293
294
295




296
297
298
299
300
301
302
		 (remtries 100))
	(if (member port used-ports)
	    (if (> remtries 0)
		(loop (get-rand-port)(- remtries 1))
		(get-rand-port))
	    port))))))





(define (tasks:server-am-i-the-server? mdb run-id)
  (let* ((all    (tasks:server-get-servers-vying-for-run-id mdb run-id))
	 (first  (if (null? all)
		     #f;; (begin (debug:print-error 0 *default-log-port* "no servers listed, should be at least one by now.") 
		       ;;      (sqlite3:finalize! mdb)
		       ;;      (exit 1))
		     (car (db:get-rows all)))))







>
>
>
>







289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
		 (remtries 100))
	(if (member port used-ports)
	    (if (> remtries 0)
		(loop (get-rand-port)(- remtries 1))
		(get-rand-port))
	    port))))))

;; there can be multiple servers spawned for the same runid.  we want exactly zero or one servers per runid.  The caller is a nascent server.  It wants to know if it should proceed or if it is redundant.  this function chooses a winner and tells me if I am the winner.  Alternative is lots of runaway servers.  Nobody wants that, trust me.
;;
;; algo: get all server info entries for this runid.  Each nascent server will insert an entry for its runid before getting here.  Entries are visible globally.  If current hostname and current processid match first entry, then yes I am the server;  return server-id as my prize for winning.  Otherwise, I am not the server;  return #f.
;;
(define (tasks:server-am-i-the-server? mdb run-id)
  (let* ((all    (tasks:server-get-servers-vying-for-run-id mdb run-id))
	 (first  (if (null? all)
		     #f;; (begin (debug:print-error 0 *default-log-port* "no servers listed, should be at least one by now.") 
		       ;;      (sqlite3:finalize! mdb)
		       ;;      (exit 1))
		     (car (db:get-rows all)))))
325
326
327
328
329
330
331



































332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
     (lambda (a . b)
       (set! res (cons (apply vector a b) res)))
     mdb
     (conc "SELECT " selstr " FROM servers WHERE run_id=? AND state in ('available','running','dbprep') ORDER BY start_time DESC;")
     run-id)
    (vector header res)))




































(define (tasks:get-server mdb run-id #!key (retries 10))
  (let ((res  #f)
	(best #f))
    (handle-exceptions
     exn
     (begin
       (print-call-chain (current-error-port))
       (debug:print 0 *default-log-port* "WARNING: tasks:get-server db access error.")
       (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 *default-log-port* " for run " run-id)
       (print-call-chain (current-error-port))
       (if (> retries 0)
	   (begin
	     (debug:print 0 *default-log-port* " trying call to tasks:get-server again in 10 seconds")
	     (thread-sleep! 10)
	     (tasks:get-server mdb run-id retries: (- retries 0)))
	   (debug:print 0 *default-log-port* "10 tries of tasks:get-server all crashed and burned. Giving up and returning \"no server found\"")))
     (sqlite3:for-each-row
      (lambda (id interface port pubport transport pid hostname)
	(set! res (vector id interface port pubport transport pid hostname)))
      mdb
      ;; removed:
      ;; strftime('%s','now')-heartbeat < 10 AND mt_version = ?
      "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|






|





|

|
|







329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
     (lambda (a . b)
       (set! res (cons (apply vector a b) res)))
     mdb
     (conc "SELECT " selstr " FROM servers WHERE run_id=? AND state in ('available','running','dbprep') ORDER BY start_time DESC;")
     run-id)
    (vector header res)))


;; BB> bb opinion - want to push responsibility into api (encapsulation), like waiting if db is busy and finding the db handle in the first place.  why should the caller need to be concerned??  If my opinion carries, we'll remove the bb- and make other needful adjustments.
(define (bb-mdb-inserter mdb-expecting-proc mdbless-args)
  (let ((mdb (db:delay-if-busy (tasks:open-db))))
    (apply mdb-expecting-proc (cons mdb mdbless-args))))

(define (tasks:bb-server-force-clean-run-record . args)
  (bb-mdb-inserter tasks:server-force-clean-run-record args))

(define (tasks:bb-server-lock-slot . args)
  (bb-mdb-inserter tasks:server-lock-slot args))

(define (tasks:bb-server-set-interface-port . args)
  (bb-mdb-inserter tasks:server-set-interface-port args))

(define (tasks:bb-server-am-i-the-server? . args)
  (bb-mdb-inserter tasks:server-am-i-the-server? args))

(define (tasks:bb-server-set-state! . args)
  (bb-mdb-inserter tasks:server-set-state! args))
  
(define (tasks:bb-get-server-info . args)
  (bb-mdb-inserter tasks:get-server-info args))

(define (tasks:bb-num-in-available-state . args)
  (bb-mdb-inserter tasks:num-in-available-state args))

(define (tasks:bb-server-delete-records-for-this-pid . args)
  (bb-mdb-inserter tasks:server-delete-records-for-this-pid args))

(define (tasks:bb-server-delete-record . args)
  (bb-mdb-inserter tasks:server-delete-record args))


;; BB: renaming tasks:get-server to get-server-info to make clear we aren't creating servers here
(define (tasks:get-server-info mdb run-id #!key (retries 10))
  (let ((res  #f)
	(best #f))
    (handle-exceptions
     exn
     (begin
       (print-call-chain (current-error-port))
       (debug:print 0 *default-log-port* "WARNING: tasks:get-server-info db access error.")
       (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 *default-log-port* " for run " run-id)
       (print-call-chain (current-error-port))
       (if (> retries 0)
	   (begin
	     (debug:print 0 *default-log-port* " trying call to tasks:get-server-info again in 10 seconds")
	     (thread-sleep! 10)
	     (tasks:get-server-info mdb run-id retries: (- retries 0)))
	   (debug:print 0 *default-log-port* "10 tries of tasks:get-server-info all crashed and burned. Giving up and returning \"no server found\"")))
     (sqlite3:for-each-row
      (lambda (id interface port pubport transport pid hostname)
	(set! res (vector id interface port pubport transport pid hostname)))
      mdb
      ;; removed:
      ;; strftime('%s','now')-heartbeat < 10 AND mt_version = ?
      "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
;;      (else
;;       #f))))

;; try to start a server and wait for it to be available
;;
(define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries)
  ;; ensure a server is running for this run
  (let loop ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))
	     (delay-time 0))
      (if (and (not server-dat)
	       (< delay-time delay-max-tries))
	  (begin
	    (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)
		(debug:print 0 *default-log-port* "Try starting server for run-id " run-id))
	    (thread-sleep! (/ (random 2000) 1000))
	    (server:kind-run run-id)
	    (thread-sleep! (min delay-time 1))
	    (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1))))))

(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row
     (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id)
       ;;                       0  1     2         3      4     5          6        7     8          9          10        11     12
       (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res)))







|









|







431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
;;      (else
;;       #f))))

;; try to start a server and wait for it to be available
;;
(define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries)
  ;; ensure a server is running for this run
  (let loop ((server-dat (tasks:get-server-info (db:delay-if-busy tdbdat) run-id))
	     (delay-time 0))
      (if (and (not server-dat)
	       (< delay-time delay-max-tries))
	  (begin
	    (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)
		(debug:print 0 *default-log-port* "Try starting server for run-id " run-id))
	    (thread-sleep! (/ (random 2000) 1000))
	    (server:kind-run run-id)
	    (thread-sleep! (min delay-time 1))
	    (loop (tasks:get-server-info (db:delay-if-busy tdbdat) run-id)(+ delay-time 1))))))

(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row
     (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id)
       ;;                       0  1     2         3      4     5          6        7     8          9          10        11     12
       (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res)))
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
  (unsetenv "TARGETHOST_LOGF")
  (unsetenv "TARGETHOST"))
 
;; look up a server by run-id and send it a kill, also delete the record for that server
;;
(define (tasks:kill-server-run-id run-id #!key (tag "default"))
  (let* ((tdbdat  (tasks:open-db))
	 (sdat    (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
    (if sdat
	(let ((hostname (vector-ref sdat 6))
	      (pid      (vector-ref sdat 5))
	      (server-id (vector-ref sdat 0)))
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed")
	  (debug:print-info 0 *default-log-port* "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
	  (tasks:kill-server hostname pid)
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	(debug:print-info 0 *default-log-port* "No server found for run-id " run-id ", nothing to kill"))
    ;; (sqlite3:finalize! tdb)
    ))
    
;;======================================================================
;; M O N I T O R S







|






|







480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
  (unsetenv "TARGETHOST_LOGF")
  (unsetenv "TARGETHOST"))
 
;; look up a server by run-id and send it a kill, also delete the record for that server
;;
(define (tasks:kill-server-run-id run-id #!key (tag "default"))
  (let* ((tdbdat  (tasks:open-db))
	 (sdat    (tasks:get-server-info (db:delay-if-busy tdbdat) run-id)))
    (if sdat
	(let ((hostname (vector-ref sdat 6))
	      (pid      (vector-ref sdat 5))
	      (server-id (vector-ref sdat 0)))
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed")
	  (debug:print-info 0 *default-log-port* "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
	  (tasks:kill-server hostname pid kill-switch: "-9") ;; BB: added -9, let's not be kind here.  we need it to die so it isn't a 100% cpu zombie
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	(debug:print-info 0 *default-log-port* "No server found for run-id " run-id ", nothing to kill"))
    ;; (sqlite3:finalize! tdb)
    ))
    
;;======================================================================
;; M O N I T O R S