Megatest

Check-in [85565c63a0]
Login
Overview
Comment:speculative fixes for lockdb issues and dead server connections (bug d33ba94)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60 | v1.6005_ww45.2
Files: files | file ages | folders
SHA1: 85565c63a070d7ea7f3a639e1efdffb97b955b23
User & Date: matt on 2014-11-05 03:18:40
Other Links: branch diff | manifest | tags
Context
2014-11-05
11:05
Added exception handler to lock-queue call. Added transaction to create db in tdb check-in: 343a1c8579 user: mrwellan tags: v1.60
03:18
speculative fixes for lockdb issues and dead server connections (bug d33ba94) check-in: 85565c63a0 user: matt tags: v1.60, v1.6005_ww45.2
2014-11-04
22:44
Improvements to sharing access to lock db in lock-queue check-in: 96e9934cfa user: matt tags: v1.60
Changes

Modified lock-queue.scm from [9d2a7c3661] to [f53d3994cd].

141
142
143
144
145
146
147
148

149
150
151
152








153

154
155
156
157
158
159
160
141
142
143
144
145
146
147

148
149
150
151
152
153
154
155
156
157
158
159
160

161
162
163
164
165
166
167
168







-
+




+
+
+
+
+
+
+
+
-
+







(define (lock-queue:release-lock fname test-id #!key (count 10))
  (let* ((dbdat (lock-queue:open-db fname)))
    (handle-exceptions
     exn
     (begin
       (debug:print 0 "WARNING: Failed to release queue lock. Will try again in few seconds")
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (thread-sleep! 10)
       (thread-sleep! (/ count 10))
       (if (> count 0)
	   (begin
	     (sqlite3:finalize! (lock-queue:db-dat-get-db dbdat))
	     (lock-queue:release-lock fname test-id count: (- count 1)))
	   (let ((journal (conc fname "-journal")))
	     ;; If we've tried ten times and failed there is a serious problem
	     ;; try to remove the lock db and allow it to be recreated
	     (handle-exceptions
	      exn
	      #f
	      (if (file-exists? journal)(delete-file journal))
	      (if (file-exists? fname)  (delete-file fname))
	   #f))
	      #f))))
     (sqlite3:execute (lock-queue:db-dat-get-db dbdat) "DELETE FROM runlocks WHERE test_id=?;" test-id)
     (sqlite3:finalize! (lock-queue:db-dat-get-db dbdat)))))

(define (lock-queue:steal-lock dbdat test-id #!key (count 10))
  (debug:print-info 0 "Attempting to steal lock at " (lock-queue:db-dat-get-path dbdat))
  (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 "lock-queue:steal-lock; waiting on journal")
  (handle-exceptions

Modified rmt.scm from [ef41e52830] to [f191ba6eba].

88
89
90
91
92
93
94

95
96
97
98
99
100
101
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102







+







    (if connection-info
	(let ((res             (http-transport:client-api-send-receive run-id connection-info cmd jparams)))
	  (http-transport:server-dat-update-last-access connection-info)
	  (if res
	      (db:string->obj res)
	      (let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
		(hash-table-delete! *runremote* run-id)
		(rmt:send-receive cmd run-id params))))
	(let ((max-avg-qry (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "-1"))))
	  (debug:print-info 4 "no server and read-only query, bypassing normal channel")
	  ;; (if (rmt:write-frequency-over-limit? cmd run-id)(server:kind-run run-id))
	  (let ((curr-max (rmt:get-max-query-average)))
	    (if (> (cdr curr-max) max-avg-qry)
		(begin