Overview
Comment: | Corrected issue with server overload handling. Switch to using a less burdensome server ping instead of server:check-if-running |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.64 |
Files: | files | file ages | folders |
SHA1: |
fd6bcc8b75450d820420f86aa3bd4122 |
User & Date: | matt on 2017-06-01 00:21:31 |
Other Links: | branch diff | manifest | tags |
Context
2017-06-01
| ||
22:08 | Fixed sync to megatest.db, it was not respecting last_update and thus syncing everything. Added hard storage of last update in a no-sync db on /tmp check-in: b665288f56 user: matt tags: v1.64 | |
00:21 | Corrected issue with server overload handling. Switch to using a less burdensome server ping instead of server:check-if-running check-in: fd6bcc8b75 user: matt tags: v1.64 | |
2017-05-31
| ||
23:05 | Improved messages in couple noisy cases. Add flag for turning off touching of server logs when max api calls situation occurs check-in: bc2a1779bc user: matt tags: v1.64 | |
Changes
Modified http-transport.scm from [2c955425b7] to [e6367359e1].
︙ | |||
293 294 295 296 297 298 299 | 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 | + + + + + - - + + | (define (http-transport:close-connections #!key (area-dat #f)) (let* ((runremote (or area-dat *runremote*)) (server-dat (if runremote (remote-conndat runremote) #f))) ;; (hash-table-ref/default *runremote* run-id #f))) (if (vector? server-dat) (let ((api-dat (http-transport:server-dat-get-api-uri server-dat))) (handle-exceptions exn (begin (print-call-chain *default-log-port*) (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn))) |
︙ | |||
432 433 434 435 436 437 438 | 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 | - + - + | (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond ((and *server-run* |
︙ |
Modified megatest-version.scm from [132e4528f7] to [397d84c9d5].
1 2 3 4 5 | 1 2 3 4 5 6 7 | - + | ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. (declare (unit megatest-version)) |
Modified rmt.scm from [f39fa419cf] to [7ffa1b548f].
︙ | |||
131 132 133 134 135 136 137 | 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | - - - - - - - + | ;;DOT CASE4 -> "rmt:send-receive"; ;; reset the connection if it has been unused too long ((and runremote (remote-conndat runremote) (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #<request>.\n message: Server closed connection before sending response" (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") |
︙ | |||
254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 | 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 | + + - - + + | (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end (if (and (vector? res) (eq? (vector-length res) 2) (eq? (vector-ref res 1) 'overloaded)) ;; since we are looking at the data to carry the error we'll use a fairly obtuse combo to minimise the chances of some sort of collision. (let ((wait-delay (+ attemptnum (* attemptnum 10)))) (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") (mutex-lock! *rmt-mutex*) (http-transport:close-connections area-dat: runremote) (set! *runremote* #f) ;; force starting over (mutex-unlock! *rmt-mutex*) (thread-sleep! wait-delay) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) res) ;; All good, return res (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! runremote #f) (http-transport:close-connections area-dat: runremote) (remote-server-url-set! runremote #f) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") |
︙ |
Modified runs.scm from [180cc9d9e4] to [50559e5a2a].
︙ | |||
1267 1268 1269 1270 1271 1272 1273 | 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 | + + + - + | itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) ;; every 15 minutes verify the server is there for this run (if (and (common:low-noise-print 240 "try start server" run-id) (not (or (and *runremote* (remote-server-url *runremote*) (server:ping (remote-server-url *runremote*))) |
︙ |
Modified tasks.scm from [7b85a80157] to [5e421a88af].
︙ | |||
193 194 195 196 197 198 199 | 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | - + | (setenv "TARGETHOST_LOGF" (conc logdir "server-kills.log")) (system (conc "nbfake kill "kill-switch" "pid)) (when logfile (thread-sleep! 0.5) (if (file-exists? gzfile) (delete-file gzfile)) |
︙ |