Megatest

Diff
Login

Differences From Artifact [42471562b8]:

To Artifact [50f24c44ca]:


15
16
17
18
19
20
21



22
23
24
25
26
27
28
29
30
31





32

33
34
35




36
37





38
39
40
41
42
43




















































































































































































































































































44
45
46
;; 
;;     You should have received a copy of the GNU General Public License
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;;======================================================================

(declare (unit servermod))




(module servermod
	*
	
(import scheme
	chicken.base
	chicken.string
	chicken.process
	chicken.io
	chicken.time





	

	(prefix sqlite3 sqlite3:)

	typed-records




	srfi-18
	srfi-69





	)

(define (server:make-server-url hostport)
  (if (not hostport)
      #f
      (conc "http://" (car hostport) ":" (cadr hostport))))






















































































































































































































































































)







>
>
>










>
>
>
>
>

>

<

>
>
>
>


>
>
>
>
>






>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42

43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
;; 
;;     You should have received a copy of the GNU General Public License
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;;======================================================================

(declare (unit servermod))
(declare (uses commonmod))
(declare (uses debugprint))
(declare (uses configfmod))

(module servermod
	*
	
(import scheme
	chicken.base
	chicken.string
	chicken.process
	chicken.io
	chicken.time
	chicken.condition
	chicken.file
	chicken.process-context
	chicken.process-context.posix
	chicken.random
	
	system-information
	(prefix sqlite3 sqlite3:)

	typed-records
	regex
	directory-utils
	matchable
	
	srfi-18
	srfi-69

	commonmod
	debugprint
	configfmod
	
	)

(define (server:make-server-url hostport)
  (if (not hostport)
      #f
      (conc "http://" (car hostport) ":" (cadr hostport))))

;;======================================================================
;; logic for getting homehost. Returns (host . at-home)
;; IF *toppath* is not set, wait up to five seconds trying every two seconds
;; (this is to accomodate the watchdog)
;;
(define (common:get-homehost #!key (trynum 5))
  (assert *toppath* "ERROR: common:get-homehost called before launch:setup. This is fatal.")
  ;; called often especially at start up. use mutex to eliminate collisions
  (mutex-lock! *homehost-mutex*)
  (cond
   (*home-host*
    (mutex-unlock! *homehost-mutex*)
    *home-host*)
   ((not *toppath*)
    (mutex-unlock! *homehost-mutex*)
    ;; (launch:setup) ;; safely mutexed now
    (if (> trynum 0)
	(begin
	  (thread-sleep! 2)
	  (common:get-homehost trynum: (- trynum 1)))
	#f))
   (else
    (let* ((currhost (get-host-name))
	   (bestadrs (server:get-best-guess-address currhost))
	   ;; first look in config, then look in file .homehost, create it if not found
	   (homehost (or (configf:lookup *configdat* "server" "homehost" )
			 (handle-exceptions
			     exn
			     (if (> trynum 0)
				 (let ((delay-time (* (- 5 trynum) 5)))
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying "
						delay-time " seconds and trying again, message: "  ((condition-property-accessor 'exn 'message) exn)
						", exn=" exn)
				   (thread-sleep! delay-time)
				   (common:get-homehost trynum: (- trynum 1)))
				 (begin
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)
						"] Failed to read .homehost file after trying five times. Giving up and exiting, message: "
						((condition-property-accessor 'exn 'message) exn))
				   (exit 1)))
			   (let ((hhf (conc *toppath* "/.homehost")))
			     (if (common:file-exists? hhf)
				 (with-input-from-file hhf read-line)
				 (if (file-writable? *toppath*)
				     (begin
				       (with-output-to-file hhf
					 (lambda ()
					   (print bestadrs)))
				       (begin
					 (mutex-unlock! *homehost-mutex*)
					 (car (common:get-homehost))))
				     #f))))))
	   (at-home  (or (equal? homehost currhost)
			 (equal? homehost bestadrs))))
      (set! *home-host* (cons homehost at-home))
      (mutex-unlock! *homehost-mutex*)
      *home-host*))))

;;======================================================================
;; am I on the homehost?
;;
(define (common:on-homehost?)
  (let ((hh (common:get-homehost)))
    (if hh
	(cdr hh)
	#f)))

(define (common:wait-for-homehost-load maxnormload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (common:get-homehost)))
         (hh     (if hh-dat (car hh-dat) #f)))
    (common:wait-for-normalized-load maxnormload msg hh)))


;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least 3 seconds old
  (server:wait-for-server-start-last-flag areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
	     (call-num     (car last-run-dat))
	     (when-run     (cadr last-run-dat))
	     (run-delay    (+ (case call-num
				((0)    0)
				((1)   20)
				((2)  300)
				(else 600))
			      (pseudo-random-integer 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	(if	(> (- (current-seconds) when-run) run-delay)
		(let* ((start-flag (conc areapath "/logs/server-start-last")))
		  (common:simple-file-lock-and-wait lock-file expire-time: 15)
                  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
		  (system (conc "touch " start-flag)) ;; lazy but safe
		  (server:run areapath)
		  (thread-sleep! 2) ;; don't release the lock for at least a few seconds
		  (common:simple-file-release-lock lock-file)))
	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))
;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
	 (curr-ip     (server:get-best-guess-address curr-host))
	 (curr-pid    (current-process-id))
	 (homehost    (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
	 (target-host (car homehost))
	 (testsuite   (common:get-testsuite-name))
	 (logfile     (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
	 (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
			   ""))
	 (cmdln (conc (common:get-megatest-exe)
		      " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
							   " -daemonize "
							   "")
		      ;; " -log " logfile
		      " -m testsuite:" testsuite
		      " " profile-mode
		      )) ;; (conc " >> " logfile " 2>&1 &")))))
	 (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
         (load-limit  (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
    ;; we want the remote server to start in *toppath* so push there
    (push-directory areapath)
    (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
    (thread-start! log-rotate)
    
    ;; host.domain.tld match host?
    (if (and target-host 
	     ;; look at target host, is it host.domain.tld or ip address and does it 
	     ;; match current ip or hostname
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (pseudo-random-integer 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
    #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath) ;;  #!key (numservers "2"))
  (let* ((ns            (server:get-num-servers))
	 (servers       (server:get-best (server:get-list areapath))))
    (if (or (and servers
		 (null? servers))
	    (not servers)
	    (and (list? servers)
		 (< (length servers) (pseudo-random-integer ns)))) ;; somewhere between 0 and numservers
        #f
        (let loop ((hed (car servers))
                   (tal (cdr servers)))
          (let ((res (server:check-server hed)))
            (if res
                hed
                (if (null? tal)
                    #f
                    (loop (car tal)(cdr tal)))))))))


;; ping the given server
;;
(define (server:check-server server-record)
  (let* ((server-url (server:record->url server-record))
         (server-id (server:record->id server-record)) 
         (res        (case *transport-type*
                       ((http)(server:ping server-url server-id))
                       ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
                       )))
    (if res
        server-url
	#f)))

(define (server:record->url servr)
  (handle-exceptions
   exn
   (begin 
     (debug:print-info 0 *default-log-port*  "Unable to get server url from " servr ", exn=" exn)     
   #f)
  (match-let (((mod-time host port start-time server-id pid)
	       servr))
    (if (and host port)
	(conc host ":" port)
	#f))))

;; wait for server=start-last to be three seconds old
;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (reftime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
	       (all-go   (> delta reftime)))
	  (if (and all-go
		   (begin
                     (debug:print-info 0 *default-log-port* "Writing " start-flag)
		     (with-output-to-file start-flag
		       (lambda ()
			 (print server-key)))
		     (thread-sleep! 0.254)
		     (let ((res (with-input-from-file start-flag
				  (lambda ()
				    (read-line)))))
		       (equal? server-key res))))
	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe
	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
		(thread-sleep! reftime)
		(server:wait-for-server-start-last-flag areapath)))))))

;; called in megatest.scm, host-port is string hostname:port
;;
;; NOTE: This is NOT called directly from clients as not all transports support a client running
;;       in the same process as the server.
;;
(define (server:ping host-port-in server-id #!key (do-exit #f))
  (let ((host:port (if (not host-port-in) ;; use read-dotserver to find
		       #f ;; (server:check-if-running *toppath*)
		;; (if (number? host-port-in) ;; we were handed a server-id
		;; 	   (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in)))
		;; 	     ;; (print "srec: " srec " host-port-in: " host-port-in)
		;; 	     (if srec
		;; 		 (conc (vector-ref srec 3) ":" (vector-ref srec 4))
		;; 		 (conc "no such server-id " host-port-in)))
		       host-port-in))) ;; )
    (let* ((host-port (if host:port
			  (let ((slst (string-split   host:port ":")))
			    (if (eq? (length slst) 2)
				(list (car slst)(string->number (cadr slst)))
				#f))
			  #f)))
;;	   (toppath       (launch:setup)))
      ;; (print "host-port=" host-port)
      (if (not host-port)
	  (begin
	    (if host-port-in
		(debug:print 0 *default-log-port*  "ERROR: bad host:port"))
	    (if do-exit (exit 1))
	    #f)
	  (let* ((iface      (car host-port))
		 (port       (cadr host-port))
		 (server-dat (http-transport:client-connect iface port server-id))
		 (login-res  (rmt:login-no-auto-client-setup server-dat)))
	    (if (and (list? login-res)
		     (car login-res))
		(begin
		  ;; (print "LOGIN_OK")
		  (if do-exit (exit 0))
		  #t)
		(begin
		  ;; (print "LOGIN_FAILED")
		  (if do-exit (exit 1))
		  #f)))))))



)