Megatest

Check-in [0b6b35ab5b]
Login
Overview
Comment:Many tweaks to improve reliability under stress
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: 0b6b35ab5b13f49ea80ac027def9815d312c8af3
User & Date: mrwellan on 2014-11-25 16:39:28
Other Links: branch diff | manifest | tags
Context
2014-11-25
21:10
Add big delay and take a break when system is clearly overloaded. check-in: ec50f4ac00 user: matt tags: v1.60
16:39
Many tweaks to improve reliability under stress check-in: 0b6b35ab5b user: mrwellan tags: v1.60
2014-11-24
12:45
Fixed import-megatest.db bug check-in: 398c48390d user: mrwellan tags: v1.60, v1.6006
Changes

Modified db.scm from [dec2e76730] to [7251c124d5].

193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
    (if (or rdb
	    do-not-open)
	rdb
	(let* ((dbpath       (db:dbfile-path run-id)) ;; (conc toppath "/db/" run-id ".db"))
	       (dbexists     (file-exists? dbpath))
	       (inmem        (if local #f (db:open-inmem-db)))
	       (refdb        (if local #f (db:open-inmem-db)))
	       (db           (db:lock-create-open dbpath 
						  (lambda (db)
						    (handle-exceptions
						     exn
						     (begin
						       (release-dot-lock dbpath)
						       (if (> attemptnum 2)
							   (debug:print 0 "ERROR: tried twice, cannot create/initialize db for run-id " run-id ", at path " dbpath)







|







193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
    (if (or rdb
	    do-not-open)
	rdb
	(let* ((dbpath       (db:dbfile-path run-id)) ;; (conc toppath "/db/" run-id ".db"))
	       (dbexists     (file-exists? dbpath))
	       (inmem        (if local #f (db:open-inmem-db)))
	       (refdb        (if local #f (db:open-inmem-db)))
	       (db           (db:lock-create-open dbpath ;; this is the database physically on disk
						  (lambda (db)
						    (handle-exceptions
						     exn
						     (begin
						       (release-dot-lock dbpath)
						       (if (> attemptnum 2)
							   (debug:print 0 "ERROR: tried twice, cannot create/initialize db for run-id " run-id ", at path " dbpath)
232
233
234
235
236
237
238

239
240
241
242



243
244
245
246
247
248
249
	  ;; (dbr:dbstruct-set-run-id! dbstruct run-id)
	  (if local
	      (begin
		(dbr:dbstruct-set-localdb! dbstruct run-id db) ;; (dbr:dbstruct-set-inmem! dbstruct db) ;; direct access ...
		db)
	      (begin
		(dbr:dbstruct-set-inmem!  dbstruct inmem)

		(db:sync-tables db:sync-tests-only db inmem)
		(db:delay-if-busy dbpath: (db:dbdat-get-path refdb))
		(dbr:dbstruct-set-refdb!  dbstruct refdb)
		(db:sync-tables db:sync-tests-only db refdb)



		inmem))))))

;; This routine creates the db. It is only called if the db is not already ls opened
;;
(define (db:open-main dbstruct) ;;  (conc *toppath* "/megatest.db") (car *configinfo*)))
  (let ((mdb (dbr:dbstruct-get-main dbstruct)))
    (if mdb







>

|


>
>
>







232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
	  ;; (dbr:dbstruct-set-run-id! dbstruct run-id)
	  (if local
	      (begin
		(dbr:dbstruct-set-localdb! dbstruct run-id db) ;; (dbr:dbstruct-set-inmem! dbstruct db) ;; direct access ...
		db)
	      (begin
		(dbr:dbstruct-set-inmem!  dbstruct inmem)
		(sqlite3:execute db "DELETE FROM tests WHERE state='DELETED';") ;; they just slow us down in this context
		(db:sync-tables db:sync-tests-only db inmem)
		(db:delay-if-busy refdb) ;; dbpath: (db:dbdat-get-path refdb))
		(dbr:dbstruct-set-refdb!  dbstruct refdb)
		(db:sync-tables db:sync-tests-only db refdb)
		;; sync once more to deal with delays
		(db:sync-tables db:sync-tests-only db inmem)
		(db:sync-tables db:sync-tests-only db refdb)
		inmem))))))

;; This routine creates the db. It is only called if the db is not already ls opened
;;
(define (db:open-main dbstruct) ;;  (conc *toppath* "/megatest.db") (car *configinfo*)))
  (let ((mdb (dbr:dbstruct-get-main dbstruct)))
    (if mdb
290
291
292
293
294
295
296

297
298
299
300
301
302
303
	(inmem  (dbr:dbstruct-get-inmem dbstruct))
	(maindb (dbr:dbstruct-get-main  dbstruct))
	(refdb  (dbr:dbstruct-get-refdb dbstruct))
	(olddb  (dbr:dbstruct-get-olddb dbstruct))
	;; (runid  (dbr:dbstruct-get-run-id dbstruct))
	)
    (debug:print-info 4 "Syncing for run-id: " run-id)

    (if (eq? run-id 0)
	;; runid equal to 0 is main.db
	(if maindb
	    (if (or (not (number? mtime))
		    (not (number? stime))
		    (> mtime stime)
		    force-sync)







>







294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
	(inmem  (dbr:dbstruct-get-inmem dbstruct))
	(maindb (dbr:dbstruct-get-main  dbstruct))
	(refdb  (dbr:dbstruct-get-refdb dbstruct))
	(olddb  (dbr:dbstruct-get-olddb dbstruct))
	;; (runid  (dbr:dbstruct-get-run-id dbstruct))
	)
    (debug:print-info 4 "Syncing for run-id: " run-id)
    (mutex-lock! *http-mutex*)
    (if (eq? run-id 0)
	;; runid equal to 0 is main.db
	(if maindb
	    (if (or (not (number? mtime))
		    (not (number? stime))
		    (> mtime stime)
		    force-sync)
320
321
322
323
324
325
326

327


328
329
330
331
332
333
334
335
		(> mtime stime)
		force-sync)
	    (begin
	      (db:delay-if-busy rundb)
	      (db:delay-if-busy olddb)
	      (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb)))
		(dbr:dbstruct-set-stime! dbstruct (current-milliseconds))

		num-synced)


	      0)))))

(define (db:close-main dbstruct)
  (let ((maindb (dbr:dbstruct-get-main dbstruct)))
    (if maindb
	(begin
	  (sqlite3:finalize! (db:dbdat-get-db maindb))
	  (dbr:dbstruct-set-main! dbstruct #f)))))







>

>
>
|







325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
		(> mtime stime)
		force-sync)
	    (begin
	      (db:delay-if-busy rundb)
	      (db:delay-if-busy olddb)
	      (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb)))
		(dbr:dbstruct-set-stime! dbstruct (current-milliseconds))
		(mutex-unlock! *http-mutex*)
		num-synced)
	      (begin
		(mutex-unlock! *http-mutex*)
		0))))))

(define (db:close-main dbstruct)
  (let ((maindb (dbr:dbstruct-get-main dbstruct)))
    (if maindb
	(begin
	  (sqlite3:finalize! (db:dbdat-get-db maindb))
	  (dbr:dbstruct-set-main! dbstruct #f)))))

Modified http-transport.scm from [d5c0bd2a5f] to [907ced71b2].

491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
    (debug:print-info 0 "Average non-cached time   "
		      (if (eq? *number-non-write-queries* 0)
			  "n/a (no queries)"
			  (/ *total-non-write-delay* 
			     *number-non-write-queries*))
		      " ms")
    (debug:print-info 0 "Server shutdown complete. Exiting")
    (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running")
    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id)







|







491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
    (debug:print-info 0 "Average non-cached time   "
		      (if (eq? *number-non-write-queries* 0)
			  "n/a (no queries)"
			  (/ *total-non-write-delay* 
			     *number-non-write-queries*))
		      " ms")
    (debug:print-info 0 "Server shutdown complete. Exiting")
    (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete")
    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id)
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552

553
554
555
556
557
558
559
					  "-")
				      run-id
				      server-id)) "Server run"))
		 (th3 (make-thread (lambda ()
				     (debug:print-info 0 "Server monitor thread started")
				     (http-transport:keep-running server-id run-id))
				   "Keep running")))
	    ;; Database connection


	    ;; don't start the db here

	    ;; (set! *inmemdb*  (db:setup run-id))


	    (thread-start! th2)

	    (thread-start! th3)
	    (set! *didsomething* #t)
	    (thread-join! th2)
	    (exit))))))

(define (http-transport:server-signal-handler signum)
  (signal-mask! signum)







<
<
<
<
<
<
<
<

>







537
538
539
540
541
542
543








544
545
546
547
548
549
550
551
552
					  "-")
				      run-id
				      server-id)) "Server run"))
		 (th3 (make-thread (lambda ()
				     (debug:print-info 0 "Server monitor thread started")
				     (http-transport:keep-running server-id run-id))
				   "Keep running")))








	    (thread-start! th2)
	    (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor.
	    (thread-start! th3)
	    (set! *didsomething* #t)
	    (thread-join! th2)
	    (exit))))))

(define (http-transport:server-signal-handler signum)
  (signal-mask! signum)

Modified launch.scm from [e538ecba1a] to [34882953c1].

97
98
99
100
101
102
103

104

105
106
107
108
109
110
111
					    
	  ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART,
	  ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY*
	  ;;
	  (let ((test-info (rmt:get-testinfo-state-status run-id test-id)))
	    (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")))
		(tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")

		(debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed")))

	  
	  (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name))
	  (set! keys       (rmt:get-keys))
	  ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process
	  ;; one of these is defunct/redundant ...
	  (if (not (launch:setup-for-run force: #t))
	      (begin







>
|
>







97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
					    
	  ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART,
	  ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY*
	  ;;
	  (let ((test-info (rmt:get-testinfo-state-status run-id test-id)))
	    (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")))
		(tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
		(begin
		  (debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed")
		  (exit))))
	  
	  (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name))
	  (set! keys       (rmt:get-keys))
	  ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process
	  ;; one of these is defunct/redundant ...
	  (if (not (launch:setup-for-run force: #t))
	      (begin

Modified rmt.scm from [a4cf4136f4] to [4e95475281].

84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
  (mutex-unlock! *db-multi-sync-mutex*)
  (let* ((run-id          (if rid rid 0))
	 (connection-info (rmt:get-connection-info run-id))
	 (jparams         (db:obj->string params)))
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (http-transport:client-api-send-receive run-id connection-info cmd jparams))
	       (res     (if (vector? dat) (vector-ref dat 1) #f))
	       (success (if (vector? dat) (vector-ref dat 0) #f)))
	  (http-transport:server-dat-update-last-access connection-info)
	  (if success
	      (db:string->obj res)
	      ;; (if (< attemptnum 100)
	      ;;     (begin
	      ;;       (hash-table-delete! *runremote* run-id)
	      ;;       (thread-sleep! 0.5)
	      ;;       (rmt:send-receive cmd rid params attempnum: (+ attemptnum 1)))
	      ;;     (begin
	      ;;       (print-call-chain (current-error-port))
	      ;;       (debug:print 0 "ERROR: too many attempts to communicate have failed. Giving up. Kill your mtest processes and start over")
	      ;;       (exit 1)))))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		(if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))

		(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))))
	(if (and (< attemptnum 10)
		 (tasks:need-server run-id))
	    (begin
	      (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
	      (rmt:send-receive cmd rid params (+ attemptnum 1)))
	    (rmt:open-qry-close-locally cmd run-id params)))))







|
|




















|







84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
  (mutex-unlock! *db-multi-sync-mutex*)
  (let* ((run-id          (if rid rid 0))
	 (connection-info (rmt:get-connection-info run-id))
	 (jparams         (db:obj->string params)))
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (http-transport:client-api-send-receive run-id connection-info cmd jparams))
	       (res     (if (and dat (vector? dat)) (vector-ref dat 1) #f))
	       (success (if (and dat (vector? dat)) (vector-ref dat 0) #f)))
	  (http-transport:server-dat-update-last-access connection-info)
	  (if success
	      (db:string->obj res)
	      ;; (if (< attemptnum 100)
	      ;;     (begin
	      ;;       (hash-table-delete! *runremote* run-id)
	      ;;       (thread-sleep! 0.5)
	      ;;       (rmt:send-receive cmd rid params attempnum: (+ attemptnum 1)))
	      ;;     (begin
	      ;;       (print-call-chain (current-error-port))
	      ;;       (debug:print 0 "ERROR: too many attempts to communicate have failed. Giving up. Kill your mtest processes and start over")
	      ;;       (exit 1)))))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		(if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
		(thread-sleep! 2)
		(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))))
	(if (and (< attemptnum 10)
		 (tasks:need-server run-id))
	    (begin
	      (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
	      (rmt:send-receive cmd rid params (+ attemptnum 1)))
	    (rmt:open-qry-close-locally cmd run-id params)))))

Modified runs.scm from [6ed325fc14] to [7667be3cc8].

552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
			(runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull)
			reruns)))
	      (list (car newtal)(append (cdr newtal) reg) '() reruns))))

     ((and (null? fails)
	   (null? prereq-fails)
	   (null? non-completed))
      (if  (runs:can-keep-running? hed 5)
	  (begin
	    (runs:inc-cant-run-tests hed)
	    (debug:print-info 1 "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0))
	    ;; num-retries code was here
	    ;; we use this opportunity to move contents of reg to tal
	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
	  (begin







|







552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
			(runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull)
			reruns)))
	      (list (car newtal)(append (cdr newtal) reg) '() reruns))))

     ((and (null? fails)
	   (null? prereq-fails)
	   (null? non-completed))
      (if  (runs:can-keep-running? hed 20)
	  (begin
	    (runs:inc-cant-run-tests hed)
	    (debug:print-info 1 "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0))
	    ;; num-retries code was here
	    ;; we use this opportunity to move contents of reg to tal
	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
	  (begin
671
672
673
674
675
676
677

678

679
680



681
682
683
684
685
686
687
	  #f))
     
     ;; Register tests 
     ;;
     ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f))
      (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" )
      ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs

      (rmt:general-call 'register-test run-id run-id test-name item-path)

      (if (rmt:get-test-id run-id test-name item-path)
	  (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done))



      (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done))
	  (begin
	    (rmt:general-call 'register-test run-id run-id test-name "")
	    (if (rmt:get-test-id run-id test-name "")
		(hash-table-set! test-registry (runs:make-full-test-name test-name "") 'done))))
      (runs:shrink-can-run-more-tests-count)   ;; DELAY TWEAKER (still needed?)
      (if (and (null? tal)(null? reg))







>
|
>
|
|
>
>
>







671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
	  #f))
     
     ;; Register tests 
     ;;
     ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f))
      (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" )
      ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs
      (let register-loop ((numtries 15))
	(rmt:general-call 'register-test run-id run-id test-name item-path)
	(thread-sleep! 0.5)
	(if (rmt:get-test-id run-id test-name item-path)
	    (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)
	    (if (> numtries 0)
		(register-loop (- numtries 1))
		(debug:print 0 "ERROR: failed to register test " (runs:make-full-test-name test-name item-path)))))
      (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done))
	  (begin
	    (rmt:general-call 'register-test run-id run-id test-name "")
	    (if (rmt:get-test-id run-id test-name "")
		(hash-table-set! test-registry (runs:make-full-test-name test-name "") 'done))))
      (runs:shrink-can-run-more-tests-count)   ;; DELAY TWEAKER (still needed?)
      (if (and (null? tal)(null? reg))

Modified tasks.scm from [7c5174d4f3] to [af4bc3dbb1].

165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot")
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id)
	(thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id)
  (sqlite3:execute 
   mdb 







|







165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot")
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id)
	;; (thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id)
  (sqlite3:execute 
   mdb 
342
343
344
345
346
347
348









349
350
351
352
353
354
355
  (let ((res #f))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     mdb ;; NEEDS dbprep ADDED
     "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND  (strftime('%s','now') - start_time) < 60));" run-id)
    res))










(define (tasks:need-server run-id)
  (let ((forced (configf:lookup *configdat* "server" "required"))
	(maxqry (cdr (rmt:get-max-query-average run-id)))
	(threshold   (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
    (cond
     (forced 







>
>
>
>
>
>
>
>
>







342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
  (let ((res #f))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     mdb ;; NEEDS dbprep ADDED
     "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND  (strftime('%s','now') - start_time) < 60));" run-id)
    res))

(define (tasks:server-running? mdb run-id)
  (let ((res #f))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     mdb ;; NEEDS dbprep ADDED
     "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id)
    res))

(define (tasks:need-server run-id)
  (let ((forced (configf:lookup *configdat* "server" "required"))
	(maxqry (cdr (rmt:get-max-query-average run-id)))
	(threshold   (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
    (cond
     (forced 
403
404
405
406
407
408
409

410
411
412
413
414
415
416
(define (tasks:kill-server-run-id run-id #!key (tag "default"))
  (let* ((tdbdat  (tasks:open-db))
	 (sdat    (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
    (if sdat
	(let ((hostname (vector-ref sdat 6))
	      (pid      (vector-ref sdat 5))
	      (server-id (vector-ref sdat 0)))

	  (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
	  (tasks:kill-server hostname pid)
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	(debug:print-info 0 "No server found for run-id " run-id ", nothing to kill"))
    ;; (sqlite3:finalize! tdb)
    ))
    







>







412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
(define (tasks:kill-server-run-id run-id #!key (tag "default"))
  (let* ((tdbdat  (tasks:open-db))
	 (sdat    (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
    (if sdat
	(let ((hostname (vector-ref sdat 6))
	      (pid      (vector-ref sdat 5))
	      (server-id (vector-ref sdat 0)))
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed")
	  (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
	  (tasks:kill-server hostname pid)
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	(debug:print-info 0 "No server found for run-id " run-id ", nothing to kill"))
    ;; (sqlite3:finalize! tdb)
    ))
    

Modified tests/fullrun/megatest.config from [79e8e68f6b] to [a6f800861f].

131
132
133
134
135
136
137
138
139
140
141
142
143
144

145
146
147
148
149
150
151
# If the server can't be started on this port it will try the next port until
# it succeeds
port 8080

# This server will keep running this number of hours after last access. 
# Three minutes is 0.05 hours
# timeout 0.025
timeout 0.01

# Server is required - slower but more resistant to Sqlite issues.
# required yes

# Start server when average query takes longer than this
server-query-threshold 55500


# daemonize yes
# hostname #{scheme (get-host-name)}

## disks are:
## name host:/path/to/area
## -or-







|





|
>







131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# If the server can't be started on this port it will try the next port until
# it succeeds
port 8080

# This server will keep running this number of hours after last access. 
# Three minutes is 0.05 hours
# timeout 0.025
timeout 0.1

# Server is required - slower but more resistant to Sqlite issues.
# required yes

# Start server when average query takes longer than this
server-query-threshold 100
# 55500

# daemonize yes
# hostname #{scheme (get-host-name)}

## disks are:
## name host:/path/to/area
## -or-