Megatest

Diff
Login

Differences From Artifact [6d2db21290]:

To Artifact [d261c5be21]:


109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
    ((http) (db:obj->string (vector success/fail query-sig result)))
    ((fs)   result)
    (else 
     (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
     result)))

;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))







|
|







109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
    ((http) (db:obj->string (vector success/fail query-sig result)))
    ((fs)   result)
    (else 
     (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
     result)))

;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169


170
171
172
173
174
175
176
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
    #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let 



(define (server:logf-get-start-info logf)
  (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx (regexp "^SERVER: dbprep"))
        (dbprep-found 0)) 
    (handle-exceptions
	exn







|
|
<






|
|
>
>







152
153
154
155
156
157
158
159
160

161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))

    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds server-id
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let 
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 

(define (server:logf-get-start-info logf)
  (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx (regexp "^SERVER: dbprep"))
        (dbprep-found 0)) 
    (handle-exceptions
	exn
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233



234
235
236
237
238
239
240
241
242
243
244
245
246
			(list (car dat) ;; host
			      (string->number (cadr dat)) ;; port
			      (string->number (caddr dat))
                              (cadr (cddr dat))))))
                (begin 
                   (if dbprep-found
                      (begin
                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds))
                         (thread-sleep! 25)
                      )
                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds))
                   )
		    (list #f #f #f #f)))))))))

;; get a list of servers with all relevant data
;; ( mod-time host port start-time pid )
;;
(define (server:get-list areapath #!key (limit #f))
  (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
	(day-seconds (* 24 60 60)))
    ;; if the directory exists continue to get the list
    ;; otherwise attempt to create the logs dir and then
    ;; continue
    (if (if (directory-exists? (conc areapath "/logs"))
	    '()
	    (if (file-write-access? areapath)
		(begin
		  (condition-case
		   (create-directory (conc areapath "/logs") #t)
		   (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
		   (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
		  (directory-exists? (conc areapath "/logs")))
		'()))

        ;; Get the list of server logs that do not contain "exiting". Ignore logs for servers that have exited.



	(let* ((server-logs-cmd  (conc "grep -iL exiting " areapath "/logs/server-*-*.log"))
               (server-logs   (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all))))
	       (num-serv-logs (length server-logs)))
	  (if (or (null? server-logs) (= num-serv-logs 0))
              (let ()
                 (debug:print 1  *default-log-port* "There are no servers running")
	         '()
              )
	      (let loop ((hed  (string-chomp (car server-logs)))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
				   exn







|
|

|



|



















|
>
>
>
|
<



|







200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238

239
240
241
242
243
244
245
246
247
248
249
			(list (car dat) ;; host
			      (string->number (cadr dat)) ;; port
			      (string->number (caddr dat))
                              (cadr (cddr dat))))))
                (begin 
                   (if dbprep-found
                      (begin
                         (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time))
                         (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting?
                      )
                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))
                   )
		    (list #f #f #f #f)))))))))

;; get a list of servers from the log files, with all relevant data
;; ( mod-time host port start-time pid )
;;
(define (server:get-list areapath #!key (limit #f))
  (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
	(day-seconds (* 24 60 60)))
    ;; if the directory exists continue to get the list
    ;; otherwise attempt to create the logs dir and then
    ;; continue
    (if (if (directory-exists? (conc areapath "/logs"))
	    '()
	    (if (file-write-access? areapath)
		(begin
		  (condition-case
		   (create-directory (conc areapath "/logs") #t)
		   (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
		   (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
		  (directory-exists? (conc areapath "/logs")))
		'()))

        ;; Get the list of server logs. First remove logs for servers that have exited.
	(let* (
               ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers.
               ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'")))
               (server-logs   (glob (conc areapath "/logs/server-*-*.log")))

	       (num-serv-logs (length server-logs)))
	  (if (or (null? server-logs) (= num-serv-logs 0))
              (let ()
                 (debug:print 2  *default-log-port* "There are no servers running at " (common:human-time))
	         '()
              )
	      (let loop ((hed  (string-chomp (car server-logs)))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
				   exn
363
364
365
366
367
368
369
370



371
372
373
374
375
376
377
378
379
380





381
382
383
384
385
386
387
388
389
390
391


392

393
394

395
396
397
398


399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425



426
427
428
429
430
431
432

(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))

;; wait for server=start-last to be three seconds old



;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (reftime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
	       (all-go   (> delta reftime)))





	  (if (and all-go
		   (begin
                     (debug:print-info 0 *default-log-port* "Writing " start-flag)
		     (with-output-to-file start-flag
		       (lambda ()
			 (print server-key)))
		     (thread-sleep! 0.25)
		     (let ((res (with-input-from-file start-flag
				  (lambda ()
				    (read-line)))))
		       (equal? server-key res))))


	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe

	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "

				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
		(thread-sleep! reftime)
		(server:wait-for-server-start-last-flag areapath)))))))



;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least 3 seconds old
  (server:wait-for-server-start-last-flag areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
	     (call-num     (car last-run-dat))
	     (when-run     (cadr last-run-dat))
	     (run-delay    (+ (case call-num
				((0)    0)
				((1)   20)
				((2)  300)
				(else 600))
			      (random 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	(if	(> (- (current-seconds) when-run) run-delay)
		(let* ((start-flag (conc areapath "/logs/server-start-last")))
		  (common:simple-file-lock-and-wait lock-file expire-time: 15)
                  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
		  (system (conc "touch " start-flag)) ;; lazy but safe
		  (server:run areapath)
		  (thread-sleep! 2) ;; don't release the lock for at least a few seconds
		  (common:simple-file-release-lock lock-file)))
	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))




;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-info (server:check-if-running areapath))
	       (try-num    0))







|
>
>
>




|




|
>
>
>
>
>
|


|
<
<

|
<
<
|
>
>
|
>


>
|
|


>
>
|
|



|


|
<
<
<
<
<
<
<
<

<
|
|
|
|
|
|
|
|
>
>
>







366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395


396
397


398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420








421

422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439

(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))


;; if server-start-last exists, and wasn't old enough, wait <idle time> + 1, then call this function recursively until it is old enough.
;; if it is old enough, overwrite it and wait 0.25 seconds.
;; if it then has the wrong server key, wait <idle time> + 1 and call this function recursively.
;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
	       (old-enough   (> delta idletime))
               (new-server-key "")
              )

          ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t.
	  ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process.
           (if (and old-enough
		   (begin
                     (debug:print-info 0 *default-log-port* "Writing " start-flag)
		     (with-output-to-file start-flag (lambda () (print server-key)))


		     (thread-sleep! 0.25)
		     (set! new-server-key (with-input-from-file start-flag (lambda () (read-line))))


		     (equal? server-key new-server-key))
                )
	      #t

           ;; If either of the above conditions is not true, print a "Gating server start" message, wait <idle-time> + 1, then call this function recursively. 
	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))

		(thread-sleep! ( + 1 idletime))
		(server:wait-for-server-start-last-flag areapath)))))))


        
;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least <server idletime> seconds old
  (server:wait-for-server-start-last-flag areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* (








	     (lock-file    (conc areapath "/logs/server-start.lock")))

	(let* ((start-flag (conc areapath "/logs/server-start-last")))
	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
	  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
	  (system (conc "touch " start-flag)) ;; lazy but safe
	  (server:run areapath)
	  (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
	  (common:simple-file-release-lock lock-file)))

      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")
   )
)

;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-info (server:check-if-running areapath))
	       (try-num    0))
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
  (let ((ns (string->number
	     (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
    (or ns numservers)))

;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath) ;;  #!key (numservers "2"))
  (let* ((ns            (server:get-num-servers))
	 (servers       (server:get-best (server:get-list areapath))))
    (if (or (and servers
		 (null? servers))
	    (not servers)
	    (and (list? servers)
		 (< (length servers) (random ns)))) ;; somewhere between 0 and numservers
        #f
        (let loop ((hed (car servers))
                   (tal (cdr servers)))
          (let ((res (server:check-server hed)))
            (if res
                hed
                (if (null? tal)







|





|







454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
  (let ((ns (string->number
	     (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
    (or ns numservers)))

;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath) ;;  #!key (numservers "2"))
  (let* ((ns            (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed
	 (servers       (server:get-best (server:get-list areapath))))
    (if (or (and servers
		 (null? servers))
	    (not servers)
	    (and (list? servers)
		 (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers
        #f
        (let loop ((hed (car servers))
                   (tal (cdr servers)))
          (let ((res (server:check-server hed)))
            (if res
                hed
                (if (null? tal)