Megatest

Check-in [c5f5a4ad19]
Login
Overview
Comment:Remove -O4, fixed the force server switch.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | runaway-servers-fix
Files: files | file ages | folders
SHA1: c5f5a4ad1973b77052bd8d7cadef4db593dbe020
User & Date: matt on 2017-03-23 13:27:38
Other Links: branch diff | manifest | tags
Context
2017-03-23
13:56
Added message when server is forced Closed-Leaf check-in: 6a476e9ca7 user: matt tags: runaway-servers-fix
13:38
merged lockfile fix check-in: f25594cb64 user: bjbarcla tags: v1.63
13:27
Remove -O4, fixed the force server switch. check-in: c5f5a4ad19 user: matt tags: runaway-servers-fix
12:49
adding some file-exists? protections and a fix to force-server? from 1.63 check-in: 211ecbabeb user: bjbarcla tags: runaway-servers-fix
Changes

Modified Makefile from [daa4306c4d] to [693a5b2d45].

1
2
3
4
5
6
7
8
9
10
11
12
# make install CSCOPTS='-accumulate-profile -profile-name $(PWD)/profile-ww$(shell date +%V.%u)'
# rm <files>.o ; make install CSCOPTS='-profile' ; ... ;  chicken-profile | less

PREFIX=$(PWD)
CSCOPTS=-O4
INSTALL=install
SRCFILES = common.scm items.scm launch.scm \
   ods.scm runconfig.scm server.scm configf.scm \
   db.scm keys.scm margs.scm megatest-version.scm \
   process.scm runs.scm tasks.scm tests.scm genexample.scm \
   http-transport.scm filedb.scm \
   client.scm synchash.scm daemon.scm mt.scm \




|







1
2
3
4
5
6
7
8
9
10
11
12
# make install CSCOPTS='-accumulate-profile -profile-name $(PWD)/profile-ww$(shell date +%V.%u)'
# rm <files>.o ; make install CSCOPTS='-profile' ; ... ;  chicken-profile | less

PREFIX=$(PWD)
CSCOPTS=
INSTALL=install
SRCFILES = common.scm items.scm launch.scm \
   ods.scm runconfig.scm server.scm configf.scm \
   db.scm keys.scm margs.scm megatest-version.scm \
   process.scm runs.scm tasks.scm tests.scm genexample.scm \
   http-transport.scm filedb.scm \
   client.scm synchash.scm daemon.scm mt.scm \

Modified common.scm from [fe39965c84] to [73b38805aa].

428
429
430
431
432
433
434
435









436
437
438
439
440
441
442
	    (print key-string)))
	(thread-sleep! 0.25)
	(if (file-exists? fname)
	    (with-input-from-file fname
	      (lambda ()
		(equal? key-string (read-line))))
	    #f))))
	









(define (common:simple-file-release-lock fname)
  (delete-file* fname))

;;======================================================================
;; S T A T E S   A N D   S T A T U S E S
;;======================================================================








|
>
>
>
>
>
>
>
>
>







428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
	    (print key-string)))
	(thread-sleep! 0.25)
	(if (file-exists? fname)
	    (with-input-from-file fname
	      (lambda ()
		(equal? key-string (read-line))))
	    #f))))

(define (common:simple-file-lock-and-wait fname #!key (expire-time 300))
  (let ((end-time (+ expire-time (current-seconds))))
    (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time)))
      (if got-lock
	  #t
	  (if (> end-time (current-seconds))
	      (loop (common:simple-file-lock fname expire-time: expire-time))
	      #f)))))

(define (common:simple-file-release-lock fname)
  (delete-file* fname))

;;======================================================================
;; S T A T E S   A N D   S T A T U S E S
;;======================================================================

1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056



1057
1058
1059
1060
1061
1062
1063
  (not (or (args:get-arg "-no-cache")
	   (and *configdat*
		(equal? (configf:lookup *configdat* "setup" "use-cache") "no")))))

;; force use of server?
;;
(define (common:force-server?)
  (let* ((force-setting (configf:lookup "server" "force"))
         (force-type    (if force-setting (string->symbol force-setting) #f)))
    (case force-type
      ((#f)     #f)
      ((always) #t)
      ((test)   (if (args:get-arg "-execute") ;; we are in a test
		    #t
		    #f)))))




;;======================================================================
;; M I S C   L I S T S
;;======================================================================

;; items in lista are matched value and position in listb
;; return the remaining items in listb or #f







|
|





|
>
>
>







1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
  (not (or (args:get-arg "-no-cache")
	   (and *configdat*
		(equal? (configf:lookup *configdat* "setup" "use-cache") "no")))))

;; force use of server?
;;
(define (common:force-server?)
  (let* ((force-setting (configf:lookup *configdat* "server" "force"))
	 (force-type    (if force-setting (string->symbol force-setting) #f)))
    (case force-type
      ((#f)     #f)
      ((always) #t)
      ((test)   (if (args:get-arg "-execute") ;; we are in a test
		    #t
		    #f))
      (else
       (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.")
       #t)))) ;; default to requiring server 

;;======================================================================
;; M I S C   L I S T S
;;======================================================================

;; items in lista are matched value and position in listb
;; return the remaining items in listb or #f

Modified rmt.scm from [9187b44a8c] to [1adf35b1f4].

93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
      (remote-conndat-set! runremote #f)
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a record for our connection for given area
     ((not runremote)                     
      (set! *runremote* (make-remote))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  1")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a homehost record
     ((not (pair? (remote-hh-dat runremote)))  ;; not on homehost
      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
      (remote-hh-dat-set! runremote (common:get-homehost))







|







93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
      (remote-conndat-set! runremote #f)
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a record for our connection for given area
     ((not runremote)                     
      (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  1")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a homehost record
     ((not (pair? (remote-hh-dat runremote)))  ;; not on homehost
      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
      (remote-hh-dat-set! runremote (common:get-homehost))
140
141
142
143
144
145
146


147

148
149
150
151


152
153
154
155
156
157
158
159
160
161
162
163
164
	   (cdr (remote-hh-dat runremote))       ;; new
           (not (remote-server-url runremote))
	   (not (member cmd api:read-only-queries)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
      (let ((server-url  (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
	(if server-url
	    (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed


	    (server:kind-run *toppath*)))

      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.1")
      (rmt:open-qry-close-locally cmd 0 params))



     ((and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
           (not (remote-conndat runremote)))            ;; and no connection
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6  hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
      (mutex-unlock! *rmt-mutex*)
      (server:start-and-wait *toppath*)
      (if (common:force-server?)(remote-force-server-set! runremote #t))
      (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
     ;; all set up if get this far, dispatch the query
     ((and (not (remote-force-server runremote))
	   (cdr (remote-hh-dat runremote))) ;; we are on homehost
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  7")







>
>
|
>




>
>
|
|



|







140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
	   (cdr (remote-hh-dat runremote))       ;; new
           (not (remote-server-url runremote))
	   (not (member cmd api:read-only-queries)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
      (let ((server-url  (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
	(if server-url
	    (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed
	    (if (common:force-server?)
		(server:start-and-wait *toppath*)
		(server:kind-run *toppath*))))
      (remote-force-server-set! runremote (common:force-server?))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.1")
      (rmt:open-qry-close-locally cmd 0 params))

     ((or (and (remote-force-server runremote)              ;; we are forcing a server and don't yet have a connection to one
	       (not (remote-conndat runremote)))
	  (and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
	       (not (remote-conndat runremote))))           ;; and no connection
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6  hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
      (mutex-unlock! *rmt-mutex*)
      (server:start-and-wait *toppath*)
      (remote-force-server-set! runremote (common:force-server?))
      (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
     ;; all set up if get this far, dispatch the query
     ((and (not (remote-force-server runremote))
	   (cdr (remote-hh-dat runremote))) ;; we are on homehost
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  7")

Modified server.scm from [948061a1f4] to [ba8be5ee9a].

268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
				((1)   20)
				((2)  300)
				(else 600))
			      (random 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	(if	(> (- (current-seconds) when-run) run-delay)
		(begin
		  (common:simple-file-lock lock-file expire-time: 15)
		  (server:run areapath)
		  (thread-sleep! 5) ;; don't release the lock for at least a few seconds
		  (common:simple-file-release-lock lock-file)))
	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))

(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))







|







268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
				((1)   20)
				((2)  300)
				(else 600))
			      (random 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	(if	(> (- (current-seconds) when-run) run-delay)
		(begin
		  (common:simple-file-lock-and-wait lock-file expire-time: 15)
		  (server:run areapath)
		  (thread-sleep! 5) ;; don't release the lock for at least a few seconds
		  (common:simple-file-release-lock lock-file)))
	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))

(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))

Modified tests/fdktestqa/testqa/megatest.config from [d32541500d] to [200e742890].

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
[setup]
testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log
# launchwait no
launch-delay 0

[server]
runtime 180

# All these are overridden in ../fdk.config
# [jobtools]
# launcher nbfake
# launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log 

[include ../fdk.config]






<
<
<







1
2
3
4
5



6
7
8
9
10
11
12
[setup]
testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log
# launchwait no
launch-delay 0




# All these are overridden in ../fdk.config
# [jobtools]
# launcher nbfake
# launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log 

[include ../fdk.config]