Megatest

Diff
Login

Differences From Artifact [19fd15cc14]:

To Artifact [c7b3629e26]:


109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
    ((http) (db:obj->string (vector success/fail query-sig result)))
    ((fs)   result)
    (else 
     (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
     result)))

;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))







|
|







109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
    ((http) (db:obj->string (vector success/fail query-sig result)))
    ((fs)   result)
    (else 
     (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
     result)))

;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169


170
171
172
173
174
175
176
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
    #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let 



(define (server:logf-get-start-info logf)
  (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx (regexp "^SERVER: dbprep"))
        (dbprep-found 0)) 
    (handle-exceptions
	exn







|
<






|
|
>
>







153
154
155
156
157
158
159
160

161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))

    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds server-id
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let 
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 

(define (server:logf-get-start-info logf)
  (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx (regexp "^SERVER: dbprep"))
        (dbprep-found 0)) 
    (handle-exceptions
	exn
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
			(list (car dat) ;; host
			      (string->number (cadr dat)) ;; port
			      (string->number (caddr dat))
                              (cadr (cddr dat))))))
                (begin 
                   (if dbprep-found
                      (begin
                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds))
                         (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting?
                      )
                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds))
                   )
		    (list #f #f #f #f)))))))))

;; get a list of servers with all relevant data
;; ( mod-time host port start-time pid )
;;
(define (server:get-list areapath #!key (limit #f))
  (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
	(day-seconds (* 24 60 60)))
    ;; if the directory exists continue to get the list
    ;; otherwise attempt to create the logs dir and then







|


|



|







200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
			(list (car dat) ;; host
			      (string->number (cadr dat)) ;; port
			      (string->number (caddr dat))
                              (cadr (cddr dat))))))
                (begin 
                   (if dbprep-found
                      (begin
                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (common:human-time))
                         (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting?
                      )
                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))
                   )
		    (list #f #f #f #f)))))))))

;; get a list of servers from the log files, with all relevant data
;; ( mod-time host port start-time pid )
;;
(define (server:get-list areapath #!key (limit #f))
  (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
	(day-seconds (* 24 60 60)))
    ;; if the directory exists continue to get the list
    ;; otherwise attempt to create the logs dir and then
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246

        ;; Get the list of server logs that do not contain "exiting". Ignore logs for servers that have exited.
	(let* ((server-logs-cmd  (conc "grep -iL exiting " areapath "/logs/server-*-*.log"))
               (server-logs   (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all))))
	       (num-serv-logs (length server-logs)))
	  (if (or (null? server-logs) (= num-serv-logs 0))
              (let ()
                 (debug:print 1  *default-log-port* "There are no servers running")
	         '()
              )
	      (let loop ((hed  (string-chomp (car server-logs)))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
				   exn







|







233
234
235
236
237
238
239
240
241
242
243
244
245
246
247

        ;; Get the list of server logs that do not contain "exiting". Ignore logs for servers that have exited.
	(let* ((server-logs-cmd  (conc "grep -iL exiting " areapath "/logs/server-*-*.log"))
               (server-logs   (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all))))
	       (num-serv-logs (length server-logs)))
	  (if (or (null? server-logs) (= num-serv-logs 0))
              (let ()
                 (debug:print 1  *default-log-port* "There are no servers running at " (common:human-time))
	         '()
              )
	      (let loop ((hed  (string-chomp (car server-logs)))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
				   exn
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380





381
382
383
384
385
386
387
388
389
390
391

392


393
394


395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426

427
428


429
430
431
432
433
434
435

(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))

;; wait for server=start-last to be three seconds old
;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (reftime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
	       (all-go   (> delta reftime)))





	  (if (and all-go
		   (begin
                     (debug:print-info 0 *default-log-port* "Writing " start-flag)
		     (with-output-to-file start-flag
		       (lambda ()
			 (print server-key)))
		     (thread-sleep! 0.25)
		     (let ((res (with-input-from-file start-flag
				  (lambda ()
				    (read-line)))))
		       (equal? server-key res))))

	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe


	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "


				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
		(thread-sleep! reftime)
		(server:wait-for-server-start-last-flag areapath)))))))

;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least 3 seconds old
  (server:wait-for-server-start-last-flag areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
	     (call-num     (car last-run-dat))
	     (when-run     (cadr last-run-dat))
	     (run-delay    (+ (case call-num ;; NOT USED. Waiting is handled by wait-for-server
				((0)    0)
				((1)   20)
				((2)  300)
				(else 600))
			      (random 5)
                              0))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously

	     (lock-file    (conc areapath "/logs/server-start.lock")))
	;; (if	(> (- (current-seconds) when-run) run-delay)
	(let* ((start-flag (conc areapath "/logs/server-start-last")))
	  (common:simple-file-lock-and-wait lock-file expire-time: 15)
	  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
	  (system (conc "touch " start-flag)) ;; lazy but safe
	  (server:run areapath)
	  (thread-sleep! 2) ;; don't release the lock for at least a few seconds
	  (common:simple-file-release-lock lock-file)))

      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")))
	;; (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))



;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-info (server:check-if-running areapath))
	       (try-num    0))







|




|




|
>
>
>
>
>










|
>

>
>


>
>
|
|


|
|



|


<
<
<
<
<
<
<
<
<
<
|

<





|

>
|
<
>
>







364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417










418
419

420
421
422
423
424
425
426
427
428

429
430
431
432
433
434
435
436
437

(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))

;; if server-start-last exists, overwrite it. Otherwise loop recursively until it is old enough.
;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
	       (all-go   (> delta idletime))
               (old-server-key (with-input-from-file start-flag (lambda () (read-line))))
              )

          ;; write a new start-flag file, wait 0.25s, then if the previous start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t.
          ;; 
	  (if (and all-go
		   (begin
                     (debug:print-info 0 *default-log-port* "Writing " start-flag)
		     (with-output-to-file start-flag
		       (lambda ()
			 (print server-key)))
		     (thread-sleep! 0.25)
		     (let ((res (with-input-from-file start-flag
				  (lambda ()
				    (read-line)))))
		       (equal? server-key res)))
                )
	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe

           ;; If either of the above conditions are not true, print a "Gating server start" message, wait <idle-time>, then call this function recursively. 
	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if all-go "server key does not match" "too soon to start another server"))
		(debug:print-info 0 *default-log-port* "server keys: from file: " old-server-key " needed: " server-key)

		(thread-sleep! idletime)
		(server:wait-for-server-start-last-flag areapath)))))))

;; kind start up of servers, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least <server idletime> seconds old
  (server:wait-for-server-start-last-flag areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?










      (let* (
	     (lock-file    (conc areapath "/logs/server-start.lock")))

	(let* ((start-flag (conc areapath "/logs/server-start-last")))
	  (common:simple-file-lock-and-wait lock-file expire-time: 15)
	  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
	  (system (conc "touch " start-flag)) ;; lazy but safe
	  (server:run areapath)
	  (thread-sleep! 18) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
	  (common:simple-file-release-lock lock-file)))

      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")

   )
)

;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-info (server:check-if-running areapath))
	       (try-num    0))