Megatest

Check-in [85565c63a0]
Login
Overview
Comment:speculative fixes for lockdb issues and dead server connections (bug d33ba94)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60 | v1.6005_ww45.2
Files: files | file ages | folders
SHA1: 85565c63a070d7ea7f3a639e1efdffb97b955b23
User & Date: matt on 2014-11-05 03:18:40
Other Links: branch diff | manifest | tags
Context
2014-11-05
11:05
Added exception handler to lock-queue call. Added transaction to create db in tdb check-in: 343a1c8579 user: mrwellan tags: v1.60
03:18
speculative fixes for lockdb issues and dead server connections (bug d33ba94) check-in: 85565c63a0 user: matt tags: v1.60, v1.6005_ww45.2
2014-11-04
22:44
Improvements to sharing access to lock db in lock-queue check-in: 96e9934cfa user: matt tags: v1.60
Changes

Modified lock-queue.scm from [9d2a7c3661] to [f53d3994cd].

141
142
143
144
145
146
147
148
149
150
151
152








153
154
155
156
157
158
159
160
(define (lock-queue:release-lock fname test-id #!key (count 10))
  (let* ((dbdat (lock-queue:open-db fname)))
    (handle-exceptions
     exn
     (begin
       (debug:print 0 "WARNING: Failed to release queue lock. Will try again in few seconds")
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (thread-sleep! 10)
       (if (> count 0)
	   (begin
	     (sqlite3:finalize! (lock-queue:db-dat-get-db dbdat))
	     (lock-queue:release-lock fname test-id count: (- count 1)))








	   #f))
     (sqlite3:execute (lock-queue:db-dat-get-db dbdat) "DELETE FROM runlocks WHERE test_id=?;" test-id)
     (sqlite3:finalize! (lock-queue:db-dat-get-db dbdat)))))

(define (lock-queue:steal-lock dbdat test-id #!key (count 10))
  (debug:print-info 0 "Attempting to steal lock at " (lock-queue:db-dat-get-path dbdat))
  (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 "lock-queue:steal-lock; waiting on journal")
  (handle-exceptions







|




>
>
>
>
>
>
>
>
|







141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
(define (lock-queue:release-lock fname test-id #!key (count 10))
  (let* ((dbdat (lock-queue:open-db fname)))
    (handle-exceptions
     exn
     (begin
       (debug:print 0 "WARNING: Failed to release queue lock. Will try again in few seconds")
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (thread-sleep! (/ count 10))
       (if (> count 0)
	   (begin
	     (sqlite3:finalize! (lock-queue:db-dat-get-db dbdat))
	     (lock-queue:release-lock fname test-id count: (- count 1)))
	   (let ((journal (conc fname "-journal")))
	     ;; If we've tried ten times and failed there is a serious problem
	     ;; try to remove the lock db and allow it to be recreated
	     (handle-exceptions
	      exn
	      #f
	      (if (file-exists? journal)(delete-file journal))
	      (if (file-exists? fname)  (delete-file fname))
	      #f))))
     (sqlite3:execute (lock-queue:db-dat-get-db dbdat) "DELETE FROM runlocks WHERE test_id=?;" test-id)
     (sqlite3:finalize! (lock-queue:db-dat-get-db dbdat)))))

(define (lock-queue:steal-lock dbdat test-id #!key (count 10))
  (debug:print-info 0 "Attempting to steal lock at " (lock-queue:db-dat-get-path dbdat))
  (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 "lock-queue:steal-lock; waiting on journal")
  (handle-exceptions

Modified rmt.scm from [ef41e52830] to [f191ba6eba].

88
89
90
91
92
93
94

95
96
97
98
99
100
101
    (if connection-info
	(let ((res             (http-transport:client-api-send-receive run-id connection-info cmd jparams)))
	  (http-transport:server-dat-update-last-access connection-info)
	  (if res
	      (db:string->obj res)
	      (let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")

		(rmt:send-receive cmd run-id params))))
	(let ((max-avg-qry (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "-1"))))
	  (debug:print-info 4 "no server and read-only query, bypassing normal channel")
	  ;; (if (rmt:write-frequency-over-limit? cmd run-id)(server:kind-run run-id))
	  (let ((curr-max (rmt:get-max-query-average)))
	    (if (> (cdr curr-max) max-avg-qry)
		(begin







>







88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
    (if connection-info
	(let ((res             (http-transport:client-api-send-receive run-id connection-info cmd jparams)))
	  (http-transport:server-dat-update-last-access connection-info)
	  (if res
	      (db:string->obj res)
	      (let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
		(hash-table-delete! *runremote* run-id)
		(rmt:send-receive cmd run-id params))))
	(let ((max-avg-qry (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "-1"))))
	  (debug:print-info 4 "no server and read-only query, bypassing normal channel")
	  ;; (if (rmt:write-frequency-over-limit? cmd run-id)(server:kind-run run-id))
	  (let ((curr-max (rmt:get-max-query-average)))
	    (if (> (cdr curr-max) max-avg-qry)
		(begin