Megatest

Check-in [eb064be6a3]
Login
Overview
Comment:Automated merge of server-log-handshaking/a05b1e5025/integ into integ-home
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | integ-home
Files: files | file ages | folders
SHA1: eb064be6a3184379d41aae343a457c79f5c5de7e
User & Date: matt on 2017-02-01 11:07:31
Other Links: branch diff | manifest | tags
Context
2017-02-02
15:20
Automated merge of html-gen/ae20a8a286/integ into integ-home check-in: 729025ee8c user: matt tags: integ-home
2017-02-01
11:07
Automated merge of server-log-handshaking/a05b1e5025/integ into integ-home check-in: eb064be6a3 user: matt tags: integ-home
10:50
Fixed an issue with log-rotate that was causing some run-away scenarios. check-in: a05b1e5025 user: mrwellan tags: server-log-handshaking
2017-01-29
16:48
Automated merge of server-log-handshaking/a642f429b1/integ into integ-home check-in: ad7df3e1eb user: matt tags: integ-home
Changes

Modified common.scm from [3380145d50] to [1694e7ccde].

240
241
242
243
244
245
246





247
248



249
250
251
252
253
254
255





256
257
258
259
260
261
262
;; Compress oversized log files found in ./logs.
;;
;; WARNING: This proc operates assuming that it is in the directory above the
;;          logs directory you wish to log-rotate.
;;
;; Creates "logs" if it does not exist, then visits every entry of it with
;; directory-fold. An entry whose name matches "^.*.log" and whose file-size
;; exceeds 200000 is compressed by running an external gzip; an already
;; existing "<name>.gz" is deleted first. The fold seed is '() and the
;; accumulator argument (rem) is never used.
;;
;; NOTE(review): the "." in front of "log" in the pattern is unescaped, so it
;; matches any character rather than only a literal dot.
;;
(define (common:rotate-logs)
  (if (not (directory-exists? "logs"))(create-directory "logs"))
  (directory-fold 
   (lambda (file rem)





     (if (and (string-match "^.*.log" file)
	      (> (file-size (conc "logs/" file)) 200000))



	 (let ((gzfile (conc "logs/" file ".gz")))
	   ;; remove a stale archive of the same name before compressing
	   (if (file-exists? gzfile)
	       (begin
		 (debug:print-info 0 *default-log-port* "removing " gzfile)
		 (delete-file gzfile)))
	   (debug:print-info 0 *default-log-port* "compressing " file)
	   (system (conc "gzip logs/" file)))))





   '()
   "logs"))

;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
;;
(define (common:exit-on-version-changed)
  (if (common:version-changed?)







>
>
>
>
>
|
|
>
>
>
|
|
|
|
|
|
|
>
>
>
>
>







240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
;; Rotate the contents of ./logs.
;;
;; WARNING: This proc operates assuming that it is in the directory above the
;;          logs directory you wish to log-rotate.
;;
;; Creates "logs" if it does not exist, then visits every entry of it:
;;   - a *.log entry whose file-size exceeds 200000, or a server-*.log entry
;;     not modified for more than 8 hours, is compressed with an external
;;     gzip (an already existing "<name>.gz" is deleted first);
;;   - any other entry whose modification time is older than the [setup]
;;     log-expire-days setting (default 30 days) is deleted, best effort.
;; An exception while handling one entry is logged and does not stop the
;; processing of the remaining entries.
;;
(define (common:rotate-logs)
  (if (not (directory-exists? "logs"))(create-directory "logs"))
  (directory-fold 
   (lambda (file rem)
     (handle-exceptions
      exn
      (debug:print-info 0 *default-log-port* "failed to rotate log " file ", probably handled by another process.")
      (let* ((fullname (conc "logs/" file))
             (file-age (- (current-seconds)(file-modification-time fullname))))
        ;; the dot before "log" is escaped so that only a literal ".log" matches
        (if (or (and (string-match "^.*\\.log" file)
                     (> (file-size fullname) 200000))
                (and (string-match "^server-.*\\.log" file)
                     (> file-age (* 8 60 60))))
            (let ((gzfile (conc fullname ".gz")))
              ;; gzip refuses to overwrite, so clear any previous archive first
              (if (file-exists? gzfile)
                  (begin
                    (debug:print-info 0 *default-log-port* "removing " gzfile)
                    (delete-file gzfile)))
              (debug:print-info 0 *default-log-port* "compressing " file)
              (system (conc "gzip " fullname)))
            ;; a non-numeric log-expire-days makes string->number return #f;
            ;; fall back to 30 days instead of raising inside the arithmetic
            (let ((expire-days (or (string->number
                                    (or (configf:lookup *configdat* "setup" "log-expire-days") "30"))
                                   30)))
              (if (> file-age (* expire-days 24 3600))
                  ;; deletion is best effort: ignore any failure
                  (handle-exceptions
                   exn
                   #f
                   (delete-file fullname))))))))
   '()
   "logs"))

;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
;;
(define (common:exit-on-version-changed)
  (if (common:version-changed?)

Modified dashboard.scm from [bb7acd661f] to [9bbb1ee284].

99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
;; Top-level start up checks, evaluated in order when this code is loaded.

;; -h: print the help text and exit.
(if (args:get-arg "-h")
    (begin
      (print help)
      (exit)))

;; Homehost policy check: only reports, it does not exit.
;; NOTE(review): no port argument follows the level here, unlike the
;; debug:print-info calls that pass *default-log-port* -- confirm against
;; the signature of debug:print.
(if (not (common:on-homehost?))
    (begin
      (debug:print 0 "ERROR: Current policy requires running dashboard on homehost: " (common:get-homehost))))
    
;; TODO: Move this inside (main)
;;
;; Exit with status 1 when launch:setup returns #f.
(if (not (launch:setup))
    (begin
      (print "Failed to find megatest.config, exiting") 
      (exit 1)))







|







99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
;; -h: show the help text and quit.
(when (args:get-arg "-h")
  (print help)
  (exit))

;; Homehost policy check: the violation is only reported, start up continues.
(unless (common:on-homehost?)
  (debug:print 0 *default-log-port* "ERROR: Current policy requires running dashboard on homehost: " (common:get-homehost)))

;; TODO: Move this inside (main)
;;
;; Give up with exit status 1 when launch:setup returns #f.
(unless (launch:setup)
  (print "Failed to find megatest.config, exiting")
  (exit 1))

Modified server.scm from [b68dac663e] to [5c1183db18].

217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
  (let ((now (current-seconds)))
    (sort
     (filter (lambda (rec)
	       (let ((start-time (list-ref rec 3))
		     (mod-time   (list-ref rec 0)))
		 ;; (print "start-time: " start-time " mod-time: " mod-time)
		 (and start-time mod-time
		      (> (- now start-time) 1)    ;; been running at least 1 seconds
		      (< (- now mod-time)   16)   ;; still alive - file touched in last 16 seconds
		      (< (- now start-time) 3600) ;; under one hour running time
		      )))
	     srvlst)
     (lambda (a b)
       (< (list-ref a 3)
	  (list-ref b 3))))))

(define (server:get-first-best areapath)







|

|







217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
  (let ((now (current-seconds)))
    (sort
     (filter (lambda (rec)
	       (let ((start-time (list-ref rec 3))
		     (mod-time   (list-ref rec 0)))
		 ;; (print "start-time: " start-time " mod-time: " mod-time)
		 (and start-time mod-time
		      (> (- now start-time) 0)    ;; been running at least 0 seconds
		      (< (- now mod-time)   16)   ;; still alive - file touched in last 16 seconds
		      (< (- now start-time) (string->number (or (configf:lookup *configdat* "server" "runtime") "3600"))) ;; under one hour running time
		      )))
	     srvlst)
     (lambda (a b)
       (< (list-ref a 3)
	  (list-ref b 3))))))

(define (server:get-first-best areapath)
249
250
251
252
253
254
255
256

257






258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290


291
292





293
294



295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))

;; kind start up of servers: call server:run for areapath only when no launch
;; time is recorded for it in *server-kind-run*, or when the recorded time is
;; more than 30 seconds old. The launch time is recorded after each launch.
(define (server:kind-run areapath)
  (let ((last-run-time (hash-table-ref/default *server-kind-run* areapath #f)))

    (if (or (not last-run-time)






	    (> (- (current-seconds) last-run-time) 30))
	(begin
	  (server:run areapath)
	  (hash-table-set! *server-kind-run* areapath (current-seconds))))))

;; Poll server:check-if-running for areapath until it returns a true value or
;; until timeout seconds (default 60) have passed, and return the value seen
;; at that point (#f when giving up without a server). Between polls it may
;; request a new server through server:kind-run and then sleeps 5 seconds.
;;
;; NOTE(review): num-ok is bound to the result of server:get-best, which
;; server:check-if-running treats as a list (null? / car), yet it is compared
;; with `<` against 2 below -- this looks like a missing `length`.
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-url (server:check-if-running areapath)))
      (if (or server-url
	      (> (current-seconds) give-up-time))
	  server-url
	  (let ((num-ok (server:get-best (server:get-list areapath))))
	    (if (< num-ok 2) ;; if there are no decent candidates for servers then try starting a new one
		(server:kind-run areapath))
	    (thread-sleep! 5)
	    (loop (server:check-if-running areapath)))))))

;; Alias: server:try-running is bound to the same procedure as server:run.
(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG.

;; Return the number of seconds since <areapath>/.server was last modified,
;; or #f when reading its modification time raises an exception.
(define (server:dotserver-age-seconds areapath)
  (let ((server-file (conc areapath "/.server")))
    (begin
      (handle-exceptions
       exn
       #f
       (- (current-seconds)
          (file-modification-time server-file))))))
    
;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
;; Takes the first record returned by server:get-best for areapath, converts
;; it to a url and pings it. Returns the url when the ping answers with a
;; true value, otherwise #f. Only the first candidate is ever tried.
;;
;; NOTE(review): the case on *transport-type* has no else clause, so for any
;; transport other than http the value of res is whatever the Scheme
;; implementation yields for an unmatched case -- confirm the intent.
(define (server:check-if-running areapath)
  (let* ((servers       (server:get-best (server:get-list areapath)))
	 (best-server   (if (null? servers) #f (car servers)))


	 (dotserver-url (if best-server
			    (server:record->url best-server)





			    #f))) ;; (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db)))
    (if dotserver-url



	(let* ((res (case *transport-type*
		      ((http)(server:ping dotserver-url))
		      ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
		      )))
	  (if res
	      dotserver-url
	      (begin
		;; (server:kill best-server)
                #f)))
	#f)))

;; Destructure the server record servr as the five element list
;; (mod-time hostname port start-time pid) and pass its hostname and pid to
;; tasks:kill-server.
(define (server:kill servr)
  (match-let (((mod-time hostname port start-time pid)
	       servr))
    (tasks:kill-server hostname pid)))








|
>
|
>
>
>
>
>
>
|
<
|
|







|



















|
|
>
>
|
|
>
>
>
>
>
|
|
>
>
>
|
|
|
|
|
|
<
<
<







249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265

266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316



317
318
319
320
321
322
323
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))

;; kind start up of servers: throttle calls to server:run for a given areapath.
;;
;; *server-kind-run* maps areapath to a list (call-num when-run); a missing
;; entry reads as (0 0). server:run is called only when more than run-delay
;; seconds have passed since when-run. run-delay is 0, 20, 300 and then 600
;; seconds for call-num 0, 1, 2 and above, plus (random 5).
;;
;; NOTE(review): the table entry is rewritten on every call, whether or not
;; server:run was invoked, so call-num counts calls rather than launches and
;; when-run holds the time of the previous call -- confirm this is intended.
(define (server:kind-run areapath)
  (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
         (call-num     (car last-run-dat))
         (when-run     (cadr last-run-dat))
         (run-delay    (+ (case call-num
                            ((0)    0)
                            ((1)   20)
                            ((2)  300)
                            (else 600))
                          (random 5)))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
    (if	(> (- (current-seconds) when-run) run-delay)

        (server:run areapath))
    (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))

;; Poll server:check-if-running for areapath until it yields a true value or
;; until timeout seconds (default 60) have passed; the value seen at that
;; point is returned, so the result is #f when the wait is abandoned.
;; While waiting, a new server is requested through server:kind-run whenever
;; server:get-best reports fewer than 2 candidates, then the loop sleeps for
;; 5 seconds before polling again.
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((deadline (+ (current-seconds) timeout)))
    (let retry ((url (server:check-if-running areapath)))
      (cond
       ((or url (> (current-seconds) deadline))
        url)
       (else
        (let ((candidate-count (length (server:get-best (server:get-list areapath)))))
          (if (< candidate-count 2)
              (server:kind-run areapath)))
        (thread-sleep! 5)
        (retry (server:check-if-running areapath)))))))

;; Alias: server:try-running is bound to the same procedure as server:run.
(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG.

;; Age of <areapath>/.server in seconds, measured from its modification time.
;; Returns #f when the modification time cannot be read.
(define (server:dotserver-age-seconds areapath)
  (let ((dotserver (conc areapath "/.server")))
    (handle-exceptions
     exn
     #f
     (let ((age (- (current-seconds)
                   (file-modification-time dotserver))))
       age))))
    
;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
;; Try the candidates returned by server:get-best for areapath in order and
;; return the first true result of server:check-server; #f when the list is
;; empty or no candidate answers.
(define (server:check-if-running areapath)
  (let try-next ((candidates (server:get-best (server:get-list areapath))))
    (and (pair? candidates)
         (or (server:check-server (car candidates))
             (try-next (cdr candidates))))))

;; ping the given server
;;
;; Converts server-record to a url with server:record->url and, for the http
;; transport, pings it with server:ping. Returns the url when the result is
;; a true value, otherwise #f.
;;
;; NOTE(review): the case on *transport-type* has no else clause, so for any
;; transport other than http the value of res is whatever the Scheme
;; implementation yields for an unmatched case; if that value is not #f the
;; url is returned without any ping -- confirm the intent.
(define (server:check-server server-record)
  (let* ((server-url (server:record->url server-record))
         (res        (case *transport-type*
                       ((http)(server:ping server-url))
                       ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
                       )))
    (if res
        server-url



	#f)))

;; Kill the server described by servr, a five element record of the form
;; (mod-time hostname port start-time pid). Only the hostname and the pid
;; are used; they are handed to tasks:kill-server.
(define (server:kill servr)
  (match-let (((_ host _ _ process-id) servr))
    (tasks:kill-server host process-id)))

Modified utils/lock-stats.sh from [3f061e6171] to [84d255afaf].

1
2
3
4
5
6
7
8
9
10
11
12
13
#!/bin/bash

# Print the paths of megatest database files that appear in /proc/locks.
#
# Every line of /proc/locks is split on ':' and blanks; the 5th field is kept
# in $p and the 8th in $i, all other fields are discarded into x.
# $p is used as a pid (/proc/$p/fd) and $i as an inode number (find -inum).
while IFS=': ' read x x x x p x x i x; do
    # skip lines that leave the 8th field empty
    if ! [[ ${i}x == "x" ]];then
	# skip lines whose 8th field contains EOF
	if ! $(echo $i|grep EOF >/dev/null);then
	    # resolve the inode to a path through the fd links of process $p
	    # (run with sudo); stop at the first hit
	    fname=$(sudo find -L "/proc/$p/fd" -maxdepth 1 -inum "$i" -exec readlink {} \; -quit)
	    # NOTE(review): egrep is POSIX ERE, where \d is not a digit class --
	    # confirm that the second pattern can match numbered db files at all
	    if $(echo $fname | grep megatest.db > /dev/null) || \
	       $(echo $fname | egrep '.db/\d+.db' > /dev/null);then
		echo $fname
	    fi
	fi
    fi
done < /proc/locks





|







1
2
3
4
5
6
7
8
9
10
11
12
13
#!/bin/bash

# Print the paths of megatest database files that appear in /proc/locks.
#
# Every line of /proc/locks is split on ':' and blanks; the 5th field is kept
# in $p and the 8th in $i, the remaining fields are discarded.
# $p is used as a pid (/proc/$p/fd) and $i as an inode number (find -inum).
# A resolved path is printed when it contains "megatest.db" or a numbered
# database of the form ".db/<digits>.db".

# Extended regular expression for the numbered database files. ERE has no \d
# class, so the digits are spelled out as [0-9]; the dots are escaped so they
# only match a literal ".".
numbered_db_re='\.db/[0-9]+\.db'

while IFS=': ' read -r _ _ _ _ p _ _ i _; do
    # skip lines that leave the 8th field empty
    [[ -n $i ]] || continue
    # skip lines whose 8th field contains EOF
    [[ $i == *EOF* ]] && continue

    # resolve the inode to a path through the fd links of process $p;
    # -quit stops at the first hit
    fname=$(find -L "/proc/$p/fd" -maxdepth 1 -inum "$i" -exec readlink {} \; -quit)
    if [[ $fname == *megatest.db* || $fname =~ $numbered_db_re ]]; then
        echo "$fname"
    fi
done < /proc/locks