Megatest

Check-in [0b3a6d8aa9]
Login
Overview
Comment:Merged v1.60 changes into try-nanomsg
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | try-nanomsg
Files: files | file ages | folders
SHA1: 0b3a6d8aa92b0c1bee6199cd6a43aff8c5eb60a5
User & Date: matt on 2014-11-26 08:59:53
Other Links: branch diff | manifest | tags
Context
2014-11-26
16:09
Added debug support in newdashboard check-in: b79afa463f user: matt tags: try-nanomsg
08:59
Merged v1.60 changes into try-nanomsg check-in: 0b3a6d8aa9 user: matt tags: try-nanomsg
2014-11-25
21:10
Add big delay and take a break when system is clearly overloaded. check-in: ec50f4ac00 user: matt tags: v1.60
2014-11-24
23:36
several little fixes check-in: 308a5d65b6 user: matt tags: try-nanomsg
Changes

Modified db.scm from [f9ca78a55d] to [75e6e604f1].

193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
    (if (or rdb
	    do-not-open)
	rdb
	(let* ((dbpath       (db:dbfile-path run-id)) ;; (conc toppath "/db/" run-id ".db"))
	       (dbexists     (file-exists? dbpath))
	       (inmem        (if local #f (db:open-inmem-db)))
	       (refdb        (if local #f (db:open-inmem-db)))
	       (db           (db:lock-create-open dbpath 
						  (lambda (db)
						    (handle-exceptions
						     exn
						     (begin
						       (release-dot-lock dbpath)
						       (if (> attemptnum 2)
							   (debug:print 0 "ERROR: tried twice, cannot create/initialize db for run-id " run-id ", at path " dbpath)







|







193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
    (if (or rdb
	    do-not-open)
	rdb
	(let* ((dbpath       (db:dbfile-path run-id)) ;; (conc toppath "/db/" run-id ".db"))
	       (dbexists     (file-exists? dbpath))
	       (inmem        (if local #f (db:open-inmem-db)))
	       (refdb        (if local #f (db:open-inmem-db)))
	       (db           (db:lock-create-open dbpath ;; this is the database physically on disk
						  (lambda (db)
						    (handle-exceptions
						     exn
						     (begin
						       (release-dot-lock dbpath)
						       (if (> attemptnum 2)
							   (debug:print 0 "ERROR: tried twice, cannot create/initialize db for run-id " run-id ", at path " dbpath)
232
233
234
235
236
237
238

239
240
241
242



243
244
245
246
247
248
249
	  ;; (dbr:dbstruct-set-run-id! dbstruct run-id)
	  (if local
	      (begin
		(dbr:dbstruct-set-localdb! dbstruct run-id db) ;; (dbr:dbstruct-set-inmem! dbstruct db) ;; direct access ...
		db)
	      (begin
		(dbr:dbstruct-set-inmem!  dbstruct inmem)

		(db:sync-tables db:sync-tests-only db inmem)
		(db:delay-if-busy dbpath: (db:dbdat-get-path refdb))
		(dbr:dbstruct-set-refdb!  dbstruct refdb)
		(db:sync-tables db:sync-tests-only db refdb)



		inmem))))))

;; This routine creates the db. It is only called if the db is not already ls opened
;;
(define (db:open-main dbstruct) ;;  (conc *toppath* "/megatest.db") (car *configinfo*)))
  (let ((mdb (dbr:dbstruct-get-main dbstruct)))
    (if mdb







>

|


>
>
>







232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
	  ;; (dbr:dbstruct-set-run-id! dbstruct run-id)
	  (if local
	      (begin
		(dbr:dbstruct-set-localdb! dbstruct run-id db) ;; (dbr:dbstruct-set-inmem! dbstruct db) ;; direct access ...
		db)
	      (begin
		(dbr:dbstruct-set-inmem!  dbstruct inmem)
		(sqlite3:execute db "DELETE FROM tests WHERE state='DELETED';") ;; they just slow us down in this context
		(db:sync-tables db:sync-tests-only db inmem)
		(db:delay-if-busy refdb) ;; dbpath: (db:dbdat-get-path refdb))
		(dbr:dbstruct-set-refdb!  dbstruct refdb)
		(db:sync-tables db:sync-tests-only db refdb)
		;; sync once more to deal with delays
		(db:sync-tables db:sync-tests-only db inmem)
		(db:sync-tables db:sync-tests-only db refdb)
		inmem))))))

;; This routine creates the db. It is only called if the db is not already ls opened
;;
(define (db:open-main dbstruct) ;;  (conc *toppath* "/megatest.db") (car *configinfo*)))
  (let ((mdb (dbr:dbstruct-get-main dbstruct)))
    (if mdb
290
291
292
293
294
295
296

297
298
299
300
301
302
303
	(inmem  (dbr:dbstruct-get-inmem dbstruct))
	(maindb (dbr:dbstruct-get-main  dbstruct))
	(refdb  (dbr:dbstruct-get-refdb dbstruct))
	(olddb  (dbr:dbstruct-get-olddb dbstruct))
	;; (runid  (dbr:dbstruct-get-run-id dbstruct))
	)
    (debug:print-info 4 "Syncing for run-id: " run-id)

    (if (eq? run-id 0)
	;; runid equal to 0 is main.db
	(if maindb
	    (if (or (not (number? mtime))
		    (not (number? stime))
		    (> mtime stime)
		    force-sync)







>







294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
	(inmem  (dbr:dbstruct-get-inmem dbstruct))
	(maindb (dbr:dbstruct-get-main  dbstruct))
	(refdb  (dbr:dbstruct-get-refdb dbstruct))
	(olddb  (dbr:dbstruct-get-olddb dbstruct))
	;; (runid  (dbr:dbstruct-get-run-id dbstruct))
	)
    (debug:print-info 4 "Syncing for run-id: " run-id)
    (mutex-lock! *http-mutex*)
    (if (eq? run-id 0)
	;; runid equal to 0 is main.db
	(if maindb
	    (if (or (not (number? mtime))
		    (not (number? stime))
		    (> mtime stime)
		    force-sync)
320
321
322
323
324
325
326

327


328
329
330
331
332
333
334
335
		(> mtime stime)
		force-sync)
	    (begin
	      (db:delay-if-busy rundb)
	      (db:delay-if-busy olddb)
	      (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb)))
		(dbr:dbstruct-set-stime! dbstruct (current-milliseconds))

		num-synced)


	      0)))))

(define (db:close-main dbstruct)
  (let ((maindb (dbr:dbstruct-get-main dbstruct)))
    (if maindb
	(begin
	  (sqlite3:finalize! (db:dbdat-get-db maindb))
	  (dbr:dbstruct-set-main! dbstruct #f)))))







>

>
>
|







325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
		(> mtime stime)
		force-sync)
	    (begin
	      (db:delay-if-busy rundb)
	      (db:delay-if-busy olddb)
	      (let ((num-synced (db:sync-tables db:sync-tests-only inmem refdb rundb olddb)))
		(dbr:dbstruct-set-stime! dbstruct (current-milliseconds))
		(mutex-unlock! *http-mutex*)
		num-synced)
	      (begin
		(mutex-unlock! *http-mutex*)
		0))))))

(define (db:close-main dbstruct)
  (let ((maindb (dbr:dbstruct-get-main dbstruct)))
    (if maindb
	(begin
	  (sqlite3:finalize! (db:dbdat-get-db maindb))
	  (dbr:dbstruct-set-main! dbstruct #f)))))
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
	  (for-each 
	   (lambda (run-id)
	     (db:delay-if-busy mtdb)
	     (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))
		   (dbstruct (if toppath (make-dbr:dbstruct path: toppath local: #t) #f)))
	       (debug:print 0 "INFO: Propagating " (length testrecs) " records for run-id=" run-id " to run specific db")
	       (db:replace-test-records dbstruct run-id testrecs)
	       (sqlite3:finalize! (dbr:dbstruct-get-rundb dbstruct))))
	   run-ids)))

    ;; now ensure all newdb data are synced to megatest.db
    (if (member 'new2old options)
	(for-each
	 (lambda (run-id)
	   (let* ((fromdb (if toppath (make-dbr:dbstruct path: toppath local: #t) #f))
		  (frundb (db:dbdat-get-db (db:get-db fromdb run-id))))
	     ;; (db:delay-if-busy frundb)
	     ;; (db:delay-if-busy mtdb)
	     (if (eq? run-id 0)
		 (db:sync-tables (db:sync-main-list dbstruct) (db:get-db fromdb #f) mtdb)
		 (db:sync-tables db:sync-tests-only (db:get-db fromdb run-id) mtdb))))
	 run-ids))
    ;; (db:close-all dbstruct)
    ;; (sqlite3:finalize! mdb)
    ))

;; keeping it around for debugging purposes only
(define (open-run-close-no-exception-handling  proc idb . params)
  (debug:print-info 11 "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)







|













|







657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
	  (for-each 
	   (lambda (run-id)
	     (db:delay-if-busy mtdb)
	     (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))
		   (dbstruct (if toppath (make-dbr:dbstruct path: toppath local: #t) #f)))
	       (debug:print 0 "INFO: Propagating " (length testrecs) " records for run-id=" run-id " to run specific db")
	       (db:replace-test-records dbstruct run-id testrecs)
	       (sqlite3:finalize! (db:dbdat-get-db (dbr:dbstruct-get-rundb dbstruct)))))
	   run-ids)))

    ;; now ensure all newdb data are synced to megatest.db
    (if (member 'new2old options)
	(for-each
	 (lambda (run-id)
	   (let* ((fromdb (if toppath (make-dbr:dbstruct path: toppath local: #t) #f))
		  (frundb (db:dbdat-get-db (db:get-db fromdb run-id))))
	     ;; (db:delay-if-busy frundb)
	     ;; (db:delay-if-busy mtdb)
	     (if (eq? run-id 0)
		 (db:sync-tables (db:sync-main-list dbstruct) (db:get-db fromdb #f) mtdb)
		 (db:sync-tables db:sync-tests-only (db:get-db fromdb run-id) mtdb))))
	 (cons 0 run-ids)))
    ;; (db:close-all dbstruct)
    ;; (sqlite3:finalize! mdb)
    ))

;; keeping it around for debugging purposes only
(define (open-run-close-no-exception-handling  proc idb . params)
  (debug:print-info 11 "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params)
1942
1943
1944
1945
1946
1947
1948

1949

1950
1951
1952
1953
1954
1955
1956
1957
1958

(define db:test-record-qry-selector (string-intersperse db:test-record-fields ","))


;; NOTE: Use db:test-get* to access records
;; NOTE: This needs rundir decoding? Decide, decode here or where used? For the moment decode where used.
(define (db:get-all-tests-info-by-run-id dbstruct run-id)

  (let ((dbdat (db:get-db dbstruct run-id))

	(db    (db:dbdat-get-db dbdat))
	(res '()))
    (db:delay-if-busy dbdat)
    (sqlite3:for-each-row
     (lambda (id run-id testname state status event-time host cpuload diskfree uname rundir item-path run-duration final-logf comment shortdir attemptnum)
       ;;                 0    1       2      3      4        5       6      7        8     9     10      11          12          13       14     15
       (set! res (cons (vector id run-id testname state status event-time host cpuload diskfree uname rundir item-path run-duration final-logf comment shortdir attemptnum)
		       res)))
     db







>
|
>
|
|







1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968

(define db:test-record-qry-selector (string-intersperse db:test-record-fields ","))


;; NOTE: Use db:test-get* to access records
;; NOTE: This needs rundir decoding? Decide, decode here or where used? For the moment decode where used.
(define (db:get-all-tests-info-by-run-id dbstruct run-id)
  (let* ((dbdat (if (vector? dbstruct)
		    (db:get-db dbstruct run-id)
		    dbstruct)) ;; still settling on when to use dbstruct or dbdat
	 (db    (db:dbdat-get-db dbdat))
	 (res '()))
    (db:delay-if-busy dbdat)
    (sqlite3:for-each-row
     (lambda (id run-id testname state status event-time host cpuload diskfree uname rundir item-path run-duration final-logf comment shortdir attemptnum)
       ;;                 0    1       2      3      4        5       6      7        8     9     10      11          12          13       14     15
       (set! res (cons (vector id run-id testname state status event-time host cpuload diskfree uname rundir item-path run-duration final-logf comment shortdir attemptnum)
		       res)))
     db

Modified http-transport.scm from [8c622aa861] to [445c83d4cb].

486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
    (debug:print-info 0 "Average non-cached time   "
		      (if (eq? *number-non-write-queries* 0)
			  "n/a (no queries)"
			  (/ *total-non-write-delay* 
			     *number-non-write-queries*))
		      " ms")
    (debug:print-info 0 "Server shutdown complete. Exiting")
    (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running")
    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id)







|







486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
    (debug:print-info 0 "Average non-cached time   "
		      (if (eq? *number-non-write-queries* 0)
			  "n/a (no queries)"
			  (/ *total-non-write-delay* 
			     *number-non-write-queries*))
		      " ms")
    (debug:print-info 0 "Server shutdown complete. Exiting")
    (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete")
    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id)
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547

548
549
550
551
552
553
554
					  "-")
				      run-id
				      server-id)) "Server run"))
		 (th3 (make-thread (lambda ()
				     (debug:print-info 0 "Server monitor thread started")
				     (http-transport:keep-running server-id run-id))
				   "Keep running")))
	    ;; Database connection


	    ;; don't start the db here

	    ;; (set! *inmemdb*  (db:setup run-id))


	    (thread-start! th2)

	    (thread-start! th3)
	    (set! *didsomething* #t)
	    (thread-join! th2)
	    (exit))))))

(define (http-transport:server-signal-handler signum)
  (signal-mask! signum)







<
<
<
<
<
<
<
<

>







532
533
534
535
536
537
538








539
540
541
542
543
544
545
546
547
					  "-")
				      run-id
				      server-id)) "Server run"))
		 (th3 (make-thread (lambda ()
				     (debug:print-info 0 "Server monitor thread started")
				     (http-transport:keep-running server-id run-id))
				   "Keep running")))








	    (thread-start! th2)
	    (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor.
	    (thread-start! th3)
	    (set! *didsomething* #t)
	    (thread-join! th2)
	    (exit))))))

(define (http-transport:server-signal-handler signum)
  (signal-mask! signum)

Modified launch.scm from [e538ecba1a] to [34882953c1].

97
98
99
100
101
102
103

104

105
106
107
108
109
110
111
					    
	  ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART,
	  ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY*
	  ;;
	  (let ((test-info (rmt:get-testinfo-state-status run-id test-id)))
	    (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")))
		(tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")

		(debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed")))

	  
	  (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name))
	  (set! keys       (rmt:get-keys))
	  ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process
	  ;; one of these is defunct/redundant ...
	  (if (not (launch:setup-for-run force: #t))
	      (begin







>
|
>







97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
					    
	  ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART,
	  ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY*
	  ;;
	  (let ((test-info (rmt:get-testinfo-state-status run-id test-id)))
	    (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")))
		(tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
		(begin
		  (debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed")
		  (exit))))
	  
	  (debug:print 2 "Exectuing " test-name " (id: " test-id ") on " (get-host-name))
	  (set! keys       (rmt:get-keys))
	  ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process
	  ;; one of these is defunct/redundant ...
	  (if (not (launch:setup-for-run force: #t))
	      (begin

Modified rmt.scm from [e6ea17e36c] to [2b271743b3].

87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (case *transport-type*
			  ((http)(http-transport:client-api-send-receive run-id connection-info cmd jparams))
			  ((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info cmd params))
			  (else  (exit))))
	       (res     (if (vector? dat) (vector-ref dat 1) #f))
	       (success (if (vector? dat) (vector-ref dat 0) #f)))
	  (http-transport:server-dat-update-last-access connection-info)
	  (if success
	      (case *transport-type* 
		((http)(db:string->obj res))
		((nmsg) res))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		(if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))

		(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))))
	(if (and (< attemptnum 10)
		 (tasks:need-server run-id))
	    (begin
	      (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
	      (rmt:send-receive cmd rid params (+ attemptnum 1)))
	    (rmt:open-qry-close-locally cmd run-id params)))))







|
|













|







87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (case *transport-type*
			  ((http)(http-transport:client-api-send-receive run-id connection-info cmd jparams))
			  ((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info cmd params))
			  (else  (exit))))
	       (res     (if (and dat (vector? dat)) (vector-ref dat 1) #f))
	       (success (if (and dat (vector? dat)) (vector-ref dat 0) #f)))
	  (http-transport:server-dat-update-last-access connection-info)
	  (if success
	      (case *transport-type* 
		((http)(db:string->obj res))
		((nmsg) res))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to http-transport:client-api-send-receive again.")
		(hash-table-delete! *runremote* run-id) ;; don't keep using the same connection

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		(if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
		(thread-sleep! 2)
		(rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1)))))
	(if (and (< attemptnum 10)
		 (tasks:need-server run-id))
	    (begin
	      (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
	      (rmt:send-receive cmd rid params (+ attemptnum 1)))
	    (rmt:open-qry-close-locally cmd run-id params)))))

Modified runs.scm from [3205d6c7db] to [f2976f1213].

552
553
554
555
556
557
558
559
560
561
562


563
564
565
566
567
568
569
			(runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull)
			reruns)))
	      (list (car newtal)(append (cdr newtal) reg) '() reruns))))

     ((and (null? fails)
	   (null? prereq-fails)
	   (null? non-completed))
      (if  (runs:can-keep-running? hed 5)
	  (begin
	    (runs:inc-cant-run-tests hed)
	    (debug:print-info 1 "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0))


	    ;; num-retries code was here
	    ;; we use this opportunity to move contents of reg to tal
	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
	  (begin
	    (debug:print-info 1 "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
	    (let ((test-id (rmt:get-test-id run-id hed "")))
	      (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))







|



>
>







552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
			(runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull)
			reruns)))
	      (list (car newtal)(append (cdr newtal) reg) '() reruns))))

     ((and (null? fails)
	   (null? prereq-fails)
	   (null? non-completed))
      (if  (runs:can-keep-running? hed 20)
	  (begin
	    (runs:inc-cant-run-tests hed)
	    (debug:print-info 1 "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0))
	    ;; getting here likely means the system is way overloaded, kill a full minute before continuing
	    (thread-sleep! 60)
	    ;; num-retries code was here
	    ;; we use this opportunity to move contents of reg to tal
	    (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
	  (begin
	    (debug:print-info 1 "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
	    (let ((test-id (rmt:get-test-id run-id hed "")))
	      (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))
671
672
673
674
675
676
677

678

679
680



681
682
683
684
685
686
687
	  #f))
     
     ;; Register tests 
     ;;
     ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f))
      (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" )
      ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs

      (rmt:general-call 'register-test run-id run-id test-name item-path)

      (if (rmt:get-test-id run-id test-name item-path)
	  (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done))



      (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done))
	  (begin
	    (rmt:general-call 'register-test run-id run-id test-name "")
	    (if (rmt:get-test-id run-id test-name "")
		(hash-table-set! test-registry (runs:make-full-test-name test-name "") 'done))))
      (runs:shrink-can-run-more-tests-count)   ;; DELAY TWEAKER (still needed?)
      (if (and (null? tal)(null? reg))







>
|
>
|
|
>
>
>







673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
	  #f))
     
     ;; Register tests 
     ;;
     ((not (hash-table-ref/default test-registry (runs:make-full-test-name test-name item-path) #f))
      (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" )
      ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs
      (let register-loop ((numtries 15))
	(rmt:general-call 'register-test run-id run-id test-name item-path)
	(thread-sleep! 0.5)
	(if (rmt:get-test-id run-id test-name item-path)
	    (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done)
	    (if (> numtries 0)
		(register-loop (- numtries 1))
		(debug:print 0 "ERROR: failed to register test " (runs:make-full-test-name test-name item-path)))))
      (if (not (eq? (hash-table-ref/default test-registry (runs:make-full-test-name test-name "") #f) 'done))
	  (begin
	    (rmt:general-call 'register-test run-id run-id test-name "")
	    (if (rmt:get-test-id run-id test-name "")
		(hash-table-set! test-registry (runs:make-full-test-name test-name "") 'done))))
      (runs:shrink-can-run-more-tests-count)   ;; DELAY TWEAKER (still needed?)
      (if (and (null? tal)(null? reg))

Modified tasks.scm from [84082aa618] to [42971aa928].

165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot")
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id)
	(thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id)
  (sqlite3:execute 
   mdb 







|







165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot")
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id)
	;; (thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id)
  (sqlite3:execute 
   mdb 
342
343
344
345
346
347
348









349
350
351
352
353
354
355
  (let ((res #f))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     mdb ;; NEEDS dbprep ADDED
     "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND  (strftime('%s','now') - start_time) < 60));" run-id)
    res))










(define (tasks:need-server run-id)
  (let ((forced (configf:lookup *configdat* "server" "required"))
	(maxqry (cdr (rmt:get-max-query-average run-id)))
	(threshold   (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
    (cond
     (forced 







>
>
>
>
>
>
>
>
>







342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
  (let ((res #f))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     mdb ;; NEEDS dbprep ADDED
     "SELECT id FROM servers WHERE run_id=? AND (state = 'running' OR (state = 'dbprep' AND  (strftime('%s','now') - start_time) < 60));" run-id)
    res))

(define (tasks:server-running? mdb run-id)
  (let ((res #f))
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     mdb ;; NEEDS dbprep ADDED
     "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id)
    res))

(define (tasks:need-server run-id)
  (let ((forced (configf:lookup *configdat* "server" "required"))
	(maxqry (cdr (rmt:get-max-query-average run-id)))
	(threshold   (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10"))))
    (cond
     (forced 
403
404
405
406
407
408
409

410
411
412
413
414
415
416
(define (tasks:kill-server-run-id run-id #!key (tag "default"))
  (let* ((tdbdat  (tasks:open-db))
	 (sdat    (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
    (if sdat
	(let ((hostname (vector-ref sdat 6))
	      (pid      (vector-ref sdat 5))
	      (server-id (vector-ref sdat 0)))

	  (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
	  (tasks:kill-server hostname pid)
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	(debug:print-info 0 "No server found for run-id " run-id ", nothing to kill"))
    ;; (sqlite3:finalize! tdb)
    ))
    







>







412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
(define (tasks:kill-server-run-id run-id #!key (tag "default"))
  (let* ((tdbdat  (tasks:open-db))
	 (sdat    (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
    (if sdat
	(let ((hostname (vector-ref sdat 6))
	      (pid      (vector-ref sdat 5))
	      (server-id (vector-ref sdat 0)))
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed")
	  (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
	  (tasks:kill-server hostname pid)
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	(debug:print-info 0 "No server found for run-id " run-id ", nothing to kill"))
    ;; (sqlite3:finalize! tdb)
    ))
    

Modified tests/fullrun/megatest.config from [558689506a] to [7518de8980].

131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# If the server can't be started on this port it will try the next port until
# it succeeds
port 8080

# This server will keep running this number of hours after last access. 
# Three minutes is 0.05 hours
# timeout 0.025
timeout 0.01

# Server is required - slower but more resistant to Sqlite issues.
required yes

# Start server when average query takes longer than this
# server-query-threshold 55500
server-query-threshold -1







|







131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# If the server can't be started on this port it will try the next port until
# it succeeds
port 8080

# This server will keep running this number of hours after last access. 
# Three minutes is 0.05 hours
# timeout 0.025
timeout 0.1

# Server is required - slower but more resistant to Sqlite issues.
required yes

# Start server when average query takes longer than this
# server-query-threshold 55500
server-query-threshold -1