Megatest

Check-in [f25594cb64]
Login
Overview
Comment:merged lockfile fix
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.63
Files: files | file ages | folders
SHA1: f25594cb6459f0a49773f663da969bb9c52e4608
User & Date: bjbarcla on 2017-03-23 13:38:11
Other Links: branch diff | manifest | tags
Context
2017-03-24
11:19
Added some error handling on the locking calls. Silenced a dashboard message. check-in: a5dbcdd2ac user: matt tags: v1.63
2017-03-23
13:38
merged lockfile fix check-in: f25594cb64 user: bjbarcla tags: v1.63
13:27
Remove -O4, fixed the force server switch. check-in: c5f5a4ad19 user: matt tags: runaway-servers-fix
12:23
merged in matts fix for testsuite; resolved conflicts with common:file-exists? additions check-in: 474e1543fa user: bjbarcla tags: v1.63
Changes

Modified Makefile from [daa4306c4d] to [693a5b2d45].

1
2
3
4
5

6
7
8
9
10
11
12
1
2
3
4

5
6
7
8
9
10
11
12




-
+







# make install CSCOPTS='-accumulate-profile -profile-name $(PWD)/profile-ww$(shell date +%V.%u)'
# rm <files>.o ; make install CSCOPTS='-profile' ; ... ;  chicken-profile | less

PREFIX=$(PWD)
CSCOPTS=-O4
CSCOPTS=
INSTALL=install
SRCFILES = common.scm items.scm launch.scm \
   ods.scm runconfig.scm server.scm configf.scm \
   db.scm keys.scm margs.scm megatest-version.scm \
   process.scm runs.scm tasks.scm tests.scm genexample.scm \
   http-transport.scm filedb.scm \
   client.scm synchash.scm daemon.scm mt.scm \

Modified common.scm from [fe39965c84] to [73b38805aa].

428
429
430
431
432
433
434
435










436
437
438
439
440
441
442
428
429
430
431
432
433
434

435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451







-
+
+
+
+
+
+
+
+
+
+







	    (print key-string)))
	(thread-sleep! 0.25)
	(if (file-exists? fname)
	    (with-input-from-file fname
	      (lambda ()
		(equal? key-string (read-line))))
	    #f))))
	

(define (common:simple-file-lock-and-wait fname #!key (expire-time 300))
  (let ((end-time (+ expire-time (current-seconds))))
    (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time)))
      (if got-lock
	  #t
	  (if (> end-time (current-seconds))
	      (loop (common:simple-file-lock fname expire-time: expire-time))
	      #f)))))

(define (common:simple-file-release-lock fname)
  (delete-file* fname))

;;======================================================================
;; S T A T E S   A N D   S T A T U S E S
;;======================================================================

1042
1043
1044
1045
1046
1047
1048
1049
1050


1051
1052
1053
1054
1055
1056




1057
1058
1059
1060
1061
1062
1063
1051
1052
1053
1054
1055
1056
1057


1058
1059
1060
1061
1062
1063
1064

1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075







-
-
+
+





-
+
+
+
+







  (not (or (args:get-arg "-no-cache")
	   (and *configdat*
		(equal? (configf:lookup *configdat* "setup" "use-cache") "no")))))

;; force use of server?
;;
(define (common:force-server?)
  (let* ((force-setting (configf:lookup "server" "force"))
         (force-type    (if force-setting (string->symbol force-setting) #f)))
  (let* ((force-setting (configf:lookup *configdat* "server" "force"))
	 (force-type    (if force-setting (string->symbol force-setting) #f)))
    (case force-type
      ((#f)     #f)
      ((always) #t)
      ((test)   (if (args:get-arg "-execute") ;; we are in a test
		    #t
		    #f)))))
		    #f))
      (else
       (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.")
       #t)))) ;; default to requiring server 

;;======================================================================
;; M I S C   L I S T S
;;======================================================================

;; items in lista are matched value and position in listb
;; return the remaining items in listb or #f

Modified rmt.scm from [9187b44a8c] to [1adf35b1f4].

93
94
95
96
97
98
99
100

101
102
103
104
105
106
107
93
94
95
96
97
98
99

100
101
102
103
104
105
106
107







-
+







	     (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
      (remote-conndat-set! runremote #f)
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a record for our connection for given area
     ((not runremote)                     
      (set! *runremote* (make-remote))
      (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  1")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a homehost record
     ((not (pair? (remote-hh-dat runremote)))  ;; not on homehost
      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
      (remote-hh-dat-set! runremote (common:get-homehost))
140
141
142
143
144
145
146


147


148
149
150
151


152
153


154
155
156
157

158
159
160
161
162
163
164
140
141
142
143
144
145
146
147
148

149
150
151
152
153
154
155
156


157
158
159
160
161

162
163
164
165
166
167
168
169







+
+
-
+
+




+
+
-
-
+
+



-
+







	   (cdr (remote-hh-dat runremote))       ;; new
           (not (remote-server-url runremote))
	   (not (member cmd api:read-only-queries)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
      (let ((server-url  (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
	(if server-url
	    (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed
	    (if (common:force-server?)
		(server:start-and-wait *toppath*)
	    (server:kind-run *toppath*)))
		(server:kind-run *toppath*))))
      (remote-force-server-set! runremote (common:force-server?))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.1")
      (rmt:open-qry-close-locally cmd 0 params))

     ((or (and (remote-force-server runremote)              ;; we are forcing a server and don't yet have a connection to one
	       (not (remote-conndat runremote)))
     ((and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
           (not (remote-conndat runremote)))            ;; and no connection
	  (and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
	       (not (remote-conndat runremote))))           ;; and no connection
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6  hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
      (mutex-unlock! *rmt-mutex*)
      (server:start-and-wait *toppath*)
      (if (common:force-server?)(remote-force-server-set! runremote #t))
      (remote-force-server-set! runremote (common:force-server?))
      (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
     ;; all set up if get this far, dispatch the query
     ((and (not (remote-force-server runremote))
	   (cdr (remote-hh-dat runremote))) ;; we are on homehost
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  7")

Modified server.scm from [948061a1f4] to [ba8be5ee9a].

268
269
270
271
272
273
274
275

276
277
278
279
280
281
282
268
269
270
271
272
273
274

275
276
277
278
279
280
281
282







-
+







				((1)   20)
				((2)  300)
				(else 600))
			      (random 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	(if	(> (- (current-seconds) when-run) run-delay)
		(begin
		  (common:simple-file-lock lock-file expire-time: 15)
		  (common:simple-file-lock-and-wait lock-file expire-time: 15)
		  (server:run areapath)
		  (thread-sleep! 5) ;; don't release the lock for at least a few seconds
		  (common:simple-file-release-lock lock-file)))
	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))

(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))

Modified tests/fdktestqa/testqa/megatest.config from [d32541500d] to [200e742890].

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
1
2
3
4
5



6
7
8
9
10
11
12





-
-
-







[setup]
testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log
# launchwait no
launch-delay 0

[server]
runtime 180

# All these are overridden in ../fdk.config
# [jobtools]
# launcher nbfake
# launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log 

[include ../fdk.config]