Megatest

Check-in [da7f14af0e]
Login
Overview
Comment:Untested cleanup
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | multi-area
Files: files | file ages | folders
SHA1: da7f14af0ef7f1b6ffccea82a0c210cb3d65f881
User & Date: matt on 2015-04-05 23:02:59
Other Links: branch diff | manifest | tags
Context
2015-04-05
23:16
more untested cleanup check-in: c9180c4d63 user: matt tags: multi-area
23:02
Untested cleanup check-in: da7f14af0e user: matt tags: multi-area
22:04
More cleanup after big tear up check-in: 2b14c9b4a7 user: matt tags: multi-area
Changes

Modified client.scm from [671f590b03] to [141f2f00f0].

46
47
48
49
50
51
52
53

54
55
56
57



58
59
60
61
62
63
64
46
47
48
49
50
51
52

53
54



55
56
57
58
59
60
61
62
63
64







-
+

-
-
-
+
+
+







(define (client:connect iface port)
  (case (server:get-transport)
    ((rpc)  (rpc:client-connect  iface port))
    ((http) (http:client-connect iface port))
    ((zmq)  (zmq:client-connect  iface port))
    (else   (rpc:client-connect  iface port))))

(define (client:setup  run-id #!key (remaining-tries 10) (failed-connects 0))
(define (client:setup run-id area-dat #!key (remaining-tries 10) (failed-connects 0))
  (case (server:get-transport)
    ((rpc) (rpc-transport:client-setup run-id)) ;;(client:setup-rpc run-id))
    ((http)(client:setup-http run-id))
    (else  (rpc-transport:client-setup run-id)))) ;; (client:setup-rpc run-id))))
    ((rpc) (rpc-transport:client-setup run-id area-dat))
    ((http)(client:setup-http run-id area-dat))
    (else  (rpc-transport:client-setup run-id area-dat))))

;; (define (client:login-no-auto-setup server-info run-id)
;;   (case (server:get-transport)
;;     ((rpc)  (rpc:login-no-auto-client-setup server-info run-id))
;;     ((http) (rmt:login-no-auto-client-setup server-info run-id))
;;     (else   (rpc:login-no-auto-client-setup server-info run-id))))
;; 
81
82
83
84
85
86
87
88

89
90
91
92
93
94
95
96
97
98
99
100

101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133

134
135
136
137
138

139
140

141
142
143
144
145
146
147
148
149

150
151

152
153
154
155
156
157

158
159


160
161
162
163
164
165
166
167
168
169
170

171
172
173

174
175
176
177
178
179
180
181
182
183
184
185
186
187

188
189
190
191
192
193
194
195
196
197
198
199
200
201

202
203
204
205
206
207
208
209

210
211
212
213
214
215
216
81
82
83
84
85
86
87

88
89
90
91
92
93
94
95
96
97
98
99

100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116

117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132

133
134
135
136
137

138
139

140
141
142
143
144
145
146
147
148

149
150

151
152
153
154
155
156

157
158

159
160
161
162
163
164
165
166
167
168
169
170

171
172
173

174
175
176
177
178
179
180
181
182
183
184
185
186
187

188
189
190
191
192
193
194
195
196
197
198
199
200
201

202
203
204
205
206
207
208
209

210
211
212
213
214
215
216
217







-
+











-
+
















-
+















-
+




-
+

-
+








-
+

-
+





-
+

-
+
+










-
+


-
+













-
+













-
+







-
+







;; 		    (hash-table-set! (common:get-remote remote) run-id start-res)
;; 		    start-res)  ;; return the server info
;; 		  (if (member remaining-tries '(3 4 6))
;; 		      (begin    ;; login failed
;; 			(debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" host-info)
;; 			(hash-table-delete! (common:get-remote remote) run-id)
;; 			(open-run-close tasks:server-force-clean-run-record
;; 			 		tasks:open-db
;; 			 		(lambda ()(tasks:open-db area-dat))
;; 			 		run-id 
;; 			 		(car  host-info)
;; 			 		(cadr host-info)
;; 					" client:setup (host-info=#t)")
;; 			(thread-sleep! 5)
;; 			(client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1)))
;; 		      (begin
;; 			(debug:print 25 "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info)
;; 			(thread-sleep! 5)
;; 			(client:setup run-id remaining-tries: (- remaining-tries 1))))))
;; 	    ;; YUK: rename server-dat here
;; 	    (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id)))
;; 	    (let* ((server-dat (open-run-close tasks:get-server (lambda ()(tasks:open-db area-dat)) run-id)))
;; 	      (debug:print-info 0 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
;; 	      (if server-dat
;; 		  (let* ((iface     (tasks:hostinfo-get-interface server-dat))
;; 			 (port      (tasks:hostinfo-get-port      server-dat))
;; 			 (start-res (http-transport:client-connect iface port))
;; 			 ;; (ping-res  (server:ping-server run-id iface port))
;; 			 (ping-res  (rmt:login-no-auto-client-setup start-res run-id)))
;; 		    (if start-res
;; 			(begin
;; 			  (hash-table-set! (common:get-remote remote) run-id start-res)
;; 			  start-res)
;; 			(if (member remaining-tries '(2 5))
;; 			    (begin    ;; login failed
;; 			      (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
;; 			      (hash-table-delete! (common:get-remote remote) run-id)
;; 			      (open-run-close tasks:server-force-clean-run-record
;; 					      tasks:open-db
;; 					      (lambda ()(tasks:open-db area-dat))
;; 					      run-id 
;; 					      (tasks:hostinfo-get-interface server-dat)
;; 					      (tasks:hostinfo-get-port      server-dat)
;; 					      " client:setup (server-dat = #t)")
;; 			      (thread-sleep! 2)
;; 			      (server:try-running run-id)
;; 			      (thread-sleep! 10) ;; give server a little time to start up
;; 			      (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1)))
;; 			    (begin
;; 			      (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
;; 			      (thread-sleep! 5)
;; 			      (client:setup run-id remaining-tries: (- remaining-tries 1))))))
;; 		  (begin    ;; no server registered
;; 		    (if (eq? remaining-tries 2)
;; 			(begin
;; 			  ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)")
;; 			  ;; (open-run-close tasks:server-clean-out-old-records-for-run-id (lambda ()(tasks:open-db area-dat)) run-id " client:setup (server-dat=#f)")
;; 			  (client:setup run-id remaining-tries: 10))
;; 			(begin
;; 			  (thread-sleep! 2) 
;; 			  (debug:print 25 "INFO: client:setup start-res (not defined here), run-id=" run-id ", server-dat=" server-dat)
;; 			  (if (< (open-run-close tasks:num-in-available-state tasks:open-db run-id) 3)
;; 			  (if (< (open-run-close tasks:num-in-available-state (lambda ()(tasks:open-db area-dat)) run-id) 3)
;; 			      (begin
;; 				;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)")
;; 				;; (open-run-close tasks:server-clean-out-old-records-for-run-id (lambda ()(tasks:open-db area-dat)) run-id " client:setup (server-dat=#f)")
;; 				(server:try-running run-id)))
;; 			  (thread-sleep! 10) ;; give server a little time to start up
;; 			  (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))))

;; Do all the connection work, look up the transport type and set up the
;; connection if required.
;;
;; There are two scenarios. 
;;   1. We are a test manager and we received *transport-type* and (common:get-remote remote) via cmdline
;;   1. We are a test manager and we received transport-type and (common:get-remote remote) via cmdline
;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and (common:get-remote remote) from the monitor.db
;;      transport-type and (common:get-remote remote) from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove (common:get-remote remote) stuff
;;
(define (client:setup-http run-id #!key (remaining-tries 10) (failed-connects 0)(remote #f))
(define (client:setup-http run-id area-dat #!key (remaining-tries 10) (failed-connects 0)(remote #f))
  (debug:print-info 2 "client:setup remaining-tries=" remaining-tries)
  (let* ((tdbdat (tasks:open-db)))
  (let* ((tdbdat         (tasks:open-db area-dat))
	 (transport-type (megatest:area-transport area-dat)))
    (if (<= remaining-tries 0)
	(begin
	  (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	  (exit 1))
	(let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
	  (debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	  (if server-dat
	      (let* ((iface     (tasks:hostinfo-get-interface server-dat))
		     (hostname  (tasks:hostinfo-get-hostname  server-dat))
		     (port      (tasks:hostinfo-get-port      server-dat))
		     (start-res (case *transport-type*
		     (start-res (case transport-type
				  ((http)(http-transport:client-connect iface port))
				  ((nmsg)(nmsg-transport:client-connect hostname port))))
		     (ping-res  (case *transport-type* 
		     (ping-res  (case transport-type 
				  ((http)(rmt:login-no-auto-client-setup start-res run-id))
				  ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id)))
 					   (if logininfo
 					       (car (vector-ref logininfo 1))
 					       #f))))))
		(if (and start-res
			 ping-res)
		    (begin
		      (common:set-remote! remote run-id start-res)
		      (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res))
		      start-res)
		    (begin    ;; login failed but have a server record, clean out the record and try again
		      (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
		      (case *transport-type* 
		      (case transport-type 
			((http)(http-transport:close-connections run-id)))
		      (common:del-remote! remote run-id)
		      (tasks:kill-server-run-id run-id)
		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
							   run-id 
							   (tasks:hostinfo-get-interface server-dat)
							   (tasks:hostinfo-get-port      server-dat)
							   " client:setup (server-dat = #t)")
		      (if (> remaining-tries 8)
			  (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
			  (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time
		      (server:try-running run-id)
		      (thread-sleep! 5)   ;; give server a little time to start up
		      (client:setup run-id remaining-tries: (- remaining-tries 1))
		      (client:setup run-id area-dat remaining-tries: (- remaining-tries 1))
		      )))
	      (begin    ;; no server registered
		(let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id)))
		  (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
		  (if (< num-available 2)
		      (server:try-running run-id))
		  (thread-sleep! (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
		  (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))
		  (client:setup run-id area-dat remaining-tries: (- remaining-tries 1)))))))))

;; keep this as a function to ease future 
(define (client:start run-id server-info)
  (http-transport:client-connect (tasks:hostinfo-get-interface server-info)
				 (tasks:hostinfo-get-port server-info)))

;; client:signal-handler

Modified dashboard-guimonitor.scm from [886a02dbe2] to [c023768ca3].

173
174
175
176
177
178
179
180

181
182
183
184
185

186
187

188
189
190
191
192
193
173
174
175
176
177
178
179

180
181
182
183
184

185
186

187
188
189
190
191
192
193







-
+




-
+

-
+






                 (iup:attribute-set! tabtop "TABTITLE1" "Collateral")
                 (iup:attribute-set! tabtop "TABTITLE2" "Fossil")
                 (iup:attribute-set! tabtop "TABTITLE3" "Tools")
                 tabtop))))

;; BUG: Remember to re-instate this!!!!
;; (on-exit (lambda ()
;; 	   (let ((tdb (tasks:open-db)))
;; 	   (let ((tdb (tasks:open-db area-dat)))
;; 	     ;; (print "On-exit called")
;; 	     (tasks:remove-monitor-record tdb)
;; 	     (sqlite3:finalize! tdb))))

(define (gui-monitor db)
(define (gui-monitor db area-dat)
  (let ((keys (db:get-keys db))
	(tdb  (tasks:open-db)))
	(tdb  (tasks:open-db area-dat)))
    (tasks:register-monitor db tdb) ;;; let the other monitors know we are here
    (control-panel db tdb keys)
    ;(tasks:remove-monitor-record db)
    ;(sqlite3:finalize! db)
   ))
	

Modified dashboard-tests.scm from [8072d2165f] to [d0c66f198a].

416
417
418
419
420
421
422
423

424
425
426
427
428
429
430
416
417
418
419
420
421
422

423
424
425
426
427
428
429
430







-
+







	  (exit 1))
	(let* (;; (run-id        (if testdat (db:test-get-run_id testdat) #f))
	       (keydat        (if testdat (db:get-key-val-pairs dbstruct run-id) #f))
	       (rundat        (if testdat (db:get-run-info dbstruct run-id) #f))
	       (runname       (if testdat (db:get-value-by-header (db:get-rows rundat)
								  (db:get-header rundat)
								  "runname") #f))
	       (tdb           (tdb:open-test-db-by-test-id-local dbstruct run-id test-id))
	       (tdb           (tdb:open-test-db-by-test-id-local dbstruct area-dat run-id test-id))
	       ;; These next two are intentional bad values to ensure errors if they should not
	       ;; get filled in properly.
	       (logfile       "/this/dir/better/not/exist")
	       (rundir        (if testdat 
				  (db:test-get-rundir testdat)
				  logfile))
	       (testdat-path  (conc rundir "/testdat.db")) ;; this gets recalculated until found 
693
694
695
696
697
698
699
700

701
702
703
704
705
706
707
693
694
695
696
697
698
699

700
701
702
703
704
705
706
707







-
+







											      (db:test-data-get-value    x)
											      (db:test-data-get-expected x)
											      (db:test-data-get-tol      x)
											      (db:test-data-get-status   x)
											      (db:test-data-get-units    x)
											      (db:test-data-get-type     x)
											      (db:test-data-get-comment  x)))
										    (tdb:open-run-close-db-by-test-id-local dbstruct run-id test-id #f tdb:read-test-data test-id "%")))
										    (tdb:open-run-close-db-by-test-id-local dbstruct area-dat run-id test-id #f tdb:read-test-data test-id "%")))
									      "\n")))
							       (if (not (equal? currval newval))
								   (iup:attribute-set! test-data "VALUE" newval ))))) ;; "TITLE" newval)))))
					  test-data))
				       ;;(dashboard:run-controls)
				       )))
				 (iup:attribute-set! tabs "TABTITLE0" "Steps")

Modified db.scm from [69b8627abb] to [4816354319].

647
648
649
650
651
652
653
654

655
656
657
658
659
660
661
647
648
649
650
651
652
653

654
655
656
657
658
659
660
661







-
+







	 (mtdb     (if toppath (db:open-megatest-db area-dat)))
	 (allow-cleanup (if run-ids #f #t))
	 (run-ids  (if run-ids 
		       run-ids
		       (if toppath (begin
				     (db:delay-if-busy mtdb area-dat)
				     (db:get-all-run-ids mtdb)))))
	 (tdbdat  (tasks:open-db))
	 (tdbdat  (tasks:open-db area-dat))
	 (servers (tasks:get-all-servers (db:delay-if-busy tdbdat area-dat))))
    
    ;; kill servers
    (if (member 'killservers options)
	(for-each
	 (lambda (server)
	   (tasks:server-delete-record (db:delay-if-busy tdbdat area-dat) (vector-ref server 0) "dbmigration")

Modified dcommon.scm from [203653f6f0] to [1f00e0f1a9].

510
511
512
513
514
515
516
517
518


519
520
521
522
523
524
525
510
511
512
513
514
515
516


517
518
519
520
521
522
523
524
525







-
-
+
+







    (updater)
    (set! dashboard:update-summary-tab updater)
    (iup:attribute-set! stats-matrix "WIDTHDEF" "40")
    (iup:vbox
     ;; (iup:label "Run statistics"  #:expand "HORIZONTAL")
     stats-matrix)))

(define (dcommon:servers-table)
  (let* ((tdbdat         (tasks:open-db))
(define (dcommon:servers-table area-dat)
  (let* ((tdbdat         (tasks:open-db area-dat))
	 (colnum         0)
	 (rownum         0)
	 (servers-matrix (iup:matrix #:expand "YES"
				     #:numcol 7
				     #:numcol-visible 7
				     #:numlin-visible 5
				     ))

Modified http-transport.scm from [6df4b07013] to [d8101c25db].

67
68
69
70
71
72
73
74

75
76
77
78
79
80
81
67
68
69
70
71
72
73

74
75
76
77
78
79
80
81







-
+







	 (db              #f) ;;        (open-db)) ;; we don't want the server to be opening and closing the db unnecesarily
	 (hostname        (get-host-name))
	 (ipaddrstr       (let ((ipstr (if (string=? "-" hostn)
					   ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
					   (server:get-best-guess-address hostname)
					   #f)))
			    (if ipstr ipstr hostn))) ;; hostname))) 
	 (start-port      (portlogger:open-run-close portlogger:find-port))
	 (start-port      (portlogger:open-run-close portlogger:find-port area-dat))
	 (link-tree-path  (configf:lookup configdat "setup" "linktree")))
    ;; (set! db *inmemdb*)
    (debug:print-info 0 "portlogger recommended port: " start-port)
    (root-path     (if link-tree-path 
		       link-tree-path
		       (current-directory))) ;; WARNING: SECURITY HOLE. FIX ASAP!
    (handle-directory spiffy-directory-listing)
130
131
132
133
134
135
136
137

138
139
140
141
142
143
144

145
146
147
148
149
150
151
130
131
132
133
134
135
136

137
138
139
140
141
142
143

144
145
146
147
148
149
150
151







-
+






-
+







     (begin
       (print-error-message exn)
       (if (< portnum 64000)
	   (begin 
	     (debug:print 0 "WARNING: attempt to start server failed. Trying again ...")
	     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	     (debug:print 0 "exn=" (condition->list exn))
	     (portlogger:open-run-close portlogger:set-failed portnum)
	     (portlogger:open-run-close portlogger:set-failed area-dat portnum)
	     (debug:print 0 "WARNING: failed to start on portnum: " portnum ", trying next port")
	     (thread-sleep! 0.1)

	     ;; get_next_port goes here
	     (http-transport:try-start-server run-id
					      ipaddrstr
					      (portlogger:open-run-close portlogger:find-port)
					      (portlogger:open-run-close portlogger:find-port area-dat)
					      server-id
					      area-dat))
	   (begin
	     (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server")
	     (print "ERROR: Tried and tried but could not start the server"))))
     ;; any error in following steps will result in a retry
     (set! *server-info* (list ipaddrstr portnum))
363
364
365
366
367
368
369
370

371
372
373
374
375

376
377
378
379
380
381
382
363
364
365
366
367
368
369

370
371
372
373
374

375
376
377
378
379
380
381
382







-
+




-
+







	 (api-req      (make-request method: 'POST uri: api-uri))
	 (server-dat   (vector iface port api-uri api-url api-req (current-seconds))))
    server-dat))

;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (http-transport:keep-running server-id run-id)
(define (http-transport:keep-running server-id run-id area-dat)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (debug:print-info 0 "Starting the sync-back, keep alive thread in server for run-id=" run-id)
  (let* ((tdbdat      (tasks:open-db))
  (let* ((tdbdat      (tasks:open-db area-dat))
	 (server-info (let loop ((start-time (current-seconds))
				 (changed    #t)
				 (last-sdat  "not this"))
                        (let ((sdat #f))
			  (thread-sleep! 0.01)
			  (debug:print-info 0 "Waiting for server alive signature")
                          (mutex-lock! *heartbeat-mutex*)
411
412
413
414
415
416
417
418

419
420
421
422
423
424
425
411
412
413
414
415
416
417

418
419
420
421
422
423
424
425







-
+







		(sync-time  #f)
		(rem-time   #f))
	    ;; inmemdb is a dbstruct
	    (condition-case
	     (db:sync-touched *inmemdb* *run-id* force-sync: #t)
	     ((sync-failed)(cond
			    ((> bad-sync-count 10) ;; time to give up
			     (http-transport:server-shutdown server-id port))
			     (http-transport:server-shutdown server-id port area-dat))
			    (else ;; (> bad-sync-count 0)  ;; we've had a fail or two, delay and loop
			     (thread-sleep! 5)
			     (loop count server-state (+ bad-sync-count 1)))))
	     ((exn)
	      (debug:print 0 "ERROR: error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server")
	      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running crashed")
	      (exit)))
444
445
446
447
448
449
450
451

452
453
454
455
456
457
458
444
445
446
447
448
449
450

451
452
453
454
455
456
457
458







-
+







		      (set! *inmemdb*  (db:setup run-id))
		      ;; force initialization
		      ;; (db:get-db *inmemdb* #t)
		      (db:get-db *inmemdb* run-id)
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running"))
		    (begin ;; gotta exit nicely
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision")
		      (http-transport:server-shutdown server-id port))))))
		      (http-transport:server-shutdown server-id port area-dat))))))
      
      (if (< count 1) ;; 3x3 = 9 secs aprox
	  (loop (+ count 1) 'running bad-sync-count))
      
      ;; Check that iface and port have not changed (can happen if server port collides)
      (mutex-lock! *heartbeat-mutex*)
      (set! sdat *server-info*)
485
486
487
488
489
490
491
492

493
494
495


496
497
498
499
500
501
502
503
504

505
506
507
508
509
510
511
485
486
487
488
489
490
491

492
493


494
495
496
497
498
499
500
501
502
503

504
505
506
507
508
509
510
511







-
+

-
-
+
+








-
+







	    ;; Consider implementing some smarts here to re-insert the record or kill self is
	    ;; the db indicates so
	    ;;
	    ;; (if (tasks:server-am-i-the-server? tdb run-id)
	    ;;     (tasks:server-set-state! tdb server-id "running"))
	    ;;
	    (loop 0 server-state bad-sync-count))
	  (http-transport:server-shutdown server-id port)))))
	  (http-transport:server-shutdown server-id port area-dat)))))

(define (http-transport:server-shutdown server-id port)
  (let ((tdbdat (tasks:open-db)))
(define (http-transport:server-shutdown server-id port area-dat)
  (let ((tdbdat (tasks:open-db area-dat)))
    (debug:print-info 0 "Starting to shutdown the server.")
    ;; need to delete only *my* server entry (future use)
    (set! *time-to-exit* #t)
    (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t))
    ;;
    ;; start_shutdown
    ;;
    (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down")
    (portlogger:open-run-close portlogger:set-port port "released")
    (portlogger:open-run-close portlogger:set-port area-dat port "released")
    (thread-sleep! 5)
    (debug:print-info 0 "Max cached queries was    " *max-cache-size*)
    (debug:print-info 0 "Number of cached writes   " *number-of-writes*)
    (debug:print-info 0 "Average cached write time "
		      (if (eq? *number-of-writes* 0)
			  "n/a (no writes)"
			  (/ *writes-total-delay*
523
524
525
526
527
528
529
530

531
532
533
534
535
536
537
538
539

540
541
542
543

544
545
546
547
548
549

550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567

568
569
570
571
572
573
574
523
524
525
526
527
528
529

530
531
532
533
534
535
536
537
538

539
540
541
542

543
544
545
546
547
548

549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566

567
568
569
570
571
572
573
574







-
+








-
+



-
+





-
+

















-
+







    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id area-dat)
  (let* ((tdbdat (tasks:open-db)))
  (let* ((tdbdat (tasks:open-db area-dat)))
    (set! *run-id*   run-id)
    (if (args:get-arg "-daemonize")
	(begin
	  (daemon:ize)
	  (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
	      (begin
		(current-error-port *alt-log-file*)
		(current-output-port *alt-log-file*)))))
    (if (server:check-if-running run-id)
    (if (server:check-if-running run-id area-dat)
	(begin
	  (debug:print 0 "INFO: Server for run-id " run-id " already running")
	  (exit 0)))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id area-dat))
	       (remtries  4))
      (if (not server-id)
	  (if (> remtries 0)
	      (begin
		(thread-sleep! 2)
		(loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)
		(loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id area-dat)
		      (- remtries 1)))
	      (begin
		;; since we didn't get the server lock we are going to clean up and bail out
		(debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
		(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch")
		))
	  (let* ((th2 (make-thread (lambda ()
				     (debug:print-info 0 "Server run thread started")
				     (http-transport:run 
				      (if (args:get-arg "-server")
					  (args:get-arg "-server")
					  "-")
				      run-id
				      server-id
				      area-dat)) "Server run"))
		 (th3 (make-thread (lambda ()
				     (debug:print-info 0 "Server monitor thread started")
				     (http-transport:keep-running server-id run-id))
				     (http-transport:keep-running server-id run-id area-dat))
				   "Keep running")))
	    (thread-start! th2)
	    (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor.
	    (thread-start! th3)
	    (set! *didsomething* #t)
	    (thread-join! th2)
	    (exit))))))

Modified launch.scm from [c48e97b3ce] to [b6be267d56].

872
873
874
875
876
877
878
879

880
881
882
883
884
885
886
872
873
874
875
876
877
878

879
880
881
882
883
884
885
886







-
+







	  (create-directory work-area #t)
	  (debug:print 0 "WARNING: No disk work area specified - running in the test directory under tmp_run")))
    (set! cmdparms (base64:base64-encode 
		    (z3:encode-buffer 
		     (with-output-to-string
		       (lambda () ;; (list 'hosts     hosts)
			 (write (list (list 'testpath  test-path)
				      (list 'transport (conc (megatest:area-transport area-dat))) ;;; *transport-type*))
				      (list 'transport (conc (megatest:area-transport area-dat))) ;;
				      ;; (list 'serverinf *server-info*)
				      (list 'toppath   toppath)
				      (list 'work-area work-area)
				      (list 'test-name test-name) 
				      (list 'runscript runscript) 
				      (list 'run-id    run-id   )
				      (list 'test-id   test-id  )

Modified megatest.scm from [74462f253a] to [f59b483f32].

649
650
651
652
653
654
655
656

657
658
659
660
661
662
663
649
650
651
652
653
654
655

656
657
658
659
660
661
662
663







-
+







    ;; Server? Start up here.
    ;;
    (let ((tl        (launch:setup-for-run *area-dat*))
	  (run-id    (and (args:get-arg "-run-id")
			  (string->number (args:get-arg "-run-id")))))
      (if run-id
	  (begin
	    (server:launch run-id)
	    (server:launch run-id *area-dat*)
	    (set! *didsomething* #t))
	  (debug:print 0 "ERROR: server requires run-id be specified with -run-id")))

    ;; Not a server? This section will decide how to communicate
    ;;
    ;;  Setup client for all expect listed here
    (if (null? (lset-intersection 
686
687
688
689
690
691
692
693

694
695
696
697
698
699
700
686
687
688
689
690
691
692

693
694
695
696
697
698
699
700







-
+







;; MAY STILL NEED THIS
;;		       (set! *megatest-db* (make-dbr:dbstruct path: toppath local: #t))))))))))

(if (or (args:get-arg "-list-servers")
	(args:get-arg "-stop-server"))
    (let ((tl (launch:setup-for-run *area-dat*)))
      (if tl 
	  (let* ((tdbdat  (tasks:open-db))
	  (let* ((tdbdat  (tasks:open-db *area-dat*))
		 (servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))
		 (fmtstr  "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n")
		 (servers-to-kill '())
		 (killinfo   (args:get-arg "-stop-server"))
		 (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f))
		 (sid        (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f)))
	    (format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface:OutPort" "InPort" "LastBeat" "State" "Transport")
1049
1050
1051
1052
1053
1054
1055
1056

1057
1058
1059
1060
1061
1062
1063
1049
1050
1051
1052
1053
1054
1055

1056
1057
1058
1059
1060
1061
1062
1063







-
+







   "-runtests" 
   "run a test" 
   (lambda (target runname keys keyvals)
     ;;
     ;; May or may not implement it this way ...
     ;;
     ;; Insert this run into the tasks queue
     ;; (open-run-close tasks:add tasks:open-db 
     ;; (open-run-close tasks:add (lambda ()(tasks:open-db *area-dat*))
     ;;    	     "runtests" 
     ;;    	     user
     ;;    	     target
     ;;    	     runname
     ;;    	     (args:get-arg "-runtests")
     ;;    	     #f))))
     (runs:run-tests target

Modified nmsg-transport.scm from [43df6f5458] to [fa92993bb8].

61
62
63
64
65
66
67
68

69
70
71
72

73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91

92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111

112
113
114
115
116
117
118
119
120
121
122
123
124
125

126
127
128
129
130
131
132
133
134
135

136
137
138
139
140
141
142
61
62
63
64
65
66
67

68
69
70
71

72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90

91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110

111
112
113
114
115
116
117
118
119
120
121
122
123
124

125
126
127
128
129
130
131
132
133
134

135
136
137
138
139
140
141
142







-
+



-
+


















-
+



















-
+













-
+









-
+








;;======================================================================
;; S E R V E R
;;======================================================================

(define (nmsg-transport:run dbstruct area-dat hostn run-id server-id #!key (retrynum 1000))
  (debug:print 2 "Attempting to start the server ...")
  (let* ((start-port      (portlogger:open-run-close portlogger:find-port))
  (let* ((start-port      (portlogger:open-run-close portlogger:find-port area-dat))
	 (server-thread   (make-thread (lambda ()
					 (nmsg-transport:try-start-server dbstruct run-id start-port server-id))
				       "server thread"))
	 (tdbdat          (tasks:open-db)))
	 (tdbdat          (tasks:open-db area-dat)))
    (thread-start! server-thread)
    (thread-sleep! 0.1)
    (if (nmsg-transport:ping hostn start-port timeout: 2 expected-key: (current-process-id))
	(let ((interface (if (equal? hostn "-")(get-host-name) hostn)))
	  (tasks:server-set-interface-port (db:delay-if-busy tdbdat) server-id interface start-port)
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
	  (set! *server-info* (list hostn start-port)) ;; probably not needed anymore? currently used by keep-running
	  (thread-sleep! 3) ;; give some margin for queries to complete before switching from file based access to server based access
	  ;; (set! *inmemdb*  dbstruct)
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")
	  (thread-start! (make-thread
			  (lambda ()(nmsg-transport:keep-running server-id run-id area-dat))
			  "keep running"))
	  (thread-join! server-thread))
	(if (> retrynum 0)
	    (begin
	      (debug:print 0 "WARNING: Failed to connect to server (self) on host " hostn ":" start-port ", trying again.")
	      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature")
	      (portlogger:open-run-close portlogger:set-failed start-port)
	      (portlogger:open-run-close portlogger:set-failed area-dat start-port)
	      (nmsg-transport:run dbstruct area-dat hostn run-id server-id))
	    (begin
	      (debug:print 0 "ERROR: could not find an open port to start server on. Giving up")
	      (exit 1))))))

(define (nmsg-transport:try-start-server dbstruct run-id portnum server-id)
  (let ((repsoc (nn-socket 'rep)))
    (nn-bind repsoc (conc "tcp://*:" portnum))
    (let loop ((msg-in (nn-recv repsoc)))
      (let* ((dat    (db:string->obj msg-in transport: 'nmsg)))
	(debug:print 0 "server, received: " dat)
	(let ((result (api:execute-requests dbstruct dat)))
	  (debug:print 0 "server, sending: " result)
	  (nn-send repsoc (db:obj->string result  transport: 'nmsg)))
	(loop (nn-recv repsoc))))))

;; all routes though here end in exit ...
;;
(define (nmsg-transport:launch run-id area-dat)
  (let* ((tdbdat   (tasks:open-db))
  (let* ((tdbdat   (tasks:open-db area-dat))
	 (dbstruct (db:setup run-id))
	 (hostn    (or (args:get-arg "-server") "-")))
    (set! *run-id*   run-id)
    (set! *inmemdb* dbstruct)
    ;; with nbfake daemonize isn't really needed
    ;;
    ;; (if (args:get-arg "-daemonize")
    ;;     (begin
    ;;       (daemon:ize)
    ;;       (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
    ;;           (begin
    ;;     	(current-error-port *alt-log-file*)
    ;;     	(current-output-port *alt-log-file*)))))
    (if (server:check-if-running run-id)
    (if (server:check-if-running run-id area-dat)
	(begin
	  (debug:print-info 0 "Server for run-id " run-id " already running")
	  (exit 0)))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
	       (remtries  4))
      (if (not server-id)
	  (if (> remtries 0)
	      (begin
		(thread-sleep! 2)
		(if (not (server:check-if-running run-id))
		(if (not (server:check-if-running run-id area-dat))
		    (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)
			  (- remtries 1))
		    (begin
		      (debug:print-info 0 "Another server took the slot, exiting")
		      (exit 0))))
	      (begin
		;; since we didn't get the server lock we are going to clean up and bail out
269
270
271
272
273
274
275
276

277
278
279
280
281
282
283
269
270
271
272
273
274
275

276
277
278
279
280
281
282
283







-
+







				sdat)
                              (begin
                                (thread-sleep! 0.5)
                                (loop))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (tdbdat      (tasks:open-db))
	 (tdbdat      (tasks:open-db area-dat))
	 (server-timeout (let ((tmo (configf:lookup  (megatest:area-configdat area-dat) "server" "timeout")))
			   (if (and (string? tmo)
				    (string->number tmo))
			       (* 60 60 (string->number tmo))
			       ;; (* 3 24 60 60) ;; default to three days
			       (* 60 1)         ;; default to one minute
			       ;; (* 60 60 25)      ;; default to 25 hours

Modified olddashboard.scm from [ad835aa81d] to [af152b63b7].

472
473
474
475
476
477
478
479
480


481
482
483
484
485
486
487
472
473
474
475
476
477
478


479
480
481
482
483
484
485
486
487







-
-
+
+







    (updater)
    (set! dashboard:update-summary-tab updater)
    (iup:attribute-set! stats-matrix "WIDTHDEF" "40")
    (iup:vbox
     ;; (iup:label "Run statistics"  #:expand "HORIZONTAL")
     stats-matrix)))

(define (dcommon:servers-table)
  (let* ((tdbdat         (tasks:open-db))
(define (dcommon:servers-table area-dat)
  (let* ((tdbdat         (tasks:open-db area-dat))
	 (colnum         0)
	 (rownum         0)
	 (servers-matrix (iup:matrix #:expand "YES"
				     #:numcol 7
				     #:numcol-visible 7
				     #:numlin-visible 5
				     ))
2124
2125
2126
2127
2128
2129
2130
2131

2132
2133
2134
2135
2136
2137
2138
2124
2125
2126
2127
2128
2129
2130

2131
2132
2133
2134
2135
2136
2137
2138







-
+







	   (> modtime last-db-update-time)
	   (> (current-seconds)(+ last-db-update-time 1)))))

(define *monitor-db-path* (conc *dbdir* "/monitor.db"))
(define *last-monitor-update-time* 0)

;; Force creation of the db in case it isn't already there.
(tasks:open-db)
(tasks:open-db area-dat)

(define (dashboard:get-youngest-run-db-mod-time)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn))
     (current-seconds)) ;; something went wrong - just print an error and return current-seconds

Modified portlogger.scm from [bdca7fc363] to [58d53e15b4].

15
16
17
18
19
20
21
22

23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

42
43
44
45
46
47
48
49
50
51
52

53
54
55
56
57
58
59
60
61
62
63
64
65

66
67
68
69
70
71
72
15
16
17
18
19
20
21

22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40

41
42
43
44
45
46
47
48
49
50
51

52
53
54
55
56
57
58
59
60
61
62
63
64

65
66
67
68
69
70
71
72







-
+


















-
+










-
+












-
+








(declare (unit portlogger))
(declare (uses db))

;; lsof -i


(define (portlogger:open-db fname)
(define (portlogger:open-db fname area-dat)
  (let* ((avail    (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away
	 (exists   (file-exists? fname))
	 (db       (if avail 
		       (sqlite3:open-database fname)
		       (begin
			 (system (conc "rm -f " fname))
			 (sqlite3:open-database fname))))
	 (handler  (make-busy-timeout 136000))
	 (canwrite (file-write-access? fname)))
	 ;; (db-init  (lambda ()
	 ;;             (sqlite3:execute 
	 ;;              db
	 ;;              "CREATE TABLE IF NOT EXISTS ports (
         ;;                 port INTEGER PRIMARY KEY,
         ;;                 state TEXT DEFAULT 'not-used',
         ;;                 fail_count INTEGER DEFAULT 0,
         ;;                 update_time TIMESTAMP DEFAULT (strftime('%s','now')) );"))))
    (sqlite3:set-busy-handler! db handler)
    (db:set-sync db) ;; (sqlite3:execute db "PRAGMA synchronous = 0;")
    (db:set-sync db area-dat) ;; (sqlite3:execute db "PRAGMA synchronous = 0;")
    ;; (if (not exists) ;; needed with IF NOT EXISTS?
    (sqlite3:execute 
     db
     "CREATE TABLE IF NOT EXISTS ports (
            port INTEGER PRIMARY KEY,
            state TEXT DEFAULT 'not-used',
            fail_count INTEGER DEFAULT 0,
            update_time TIMESTAMP DEFAULT (strftime('%s','now')) );")
    db))

(define (portlogger:open-run-close proc . params)
(define (portlogger:open-run-close proc area-dat . params)
  (let* ((fname  (conc "/tmp/." (current-user-name) "-portlogger.db"))
	 (avail  (tasks:wait-on-journal fname 10))) ;; wait up to about 10 seconds for the journal to go away
    (handle-exceptions
     exn
     (begin
       ;; (release-dot-lock fname)
       (debug:print 0 "ERROR: portlogger:open-run-close failed. " proc " " params)
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 "exn=" (condition->list exn))
       (if (file-exists? fname)(delete-file fname)) ;; brutally get rid of it
       (print-call-chain (current-error-port)))
     (let* (;; (lock   (obtain-dot-lock fname 2 9 10))
	    (db     (portlogger:open-db fname))
	    (db     (portlogger:open-db fname area-dat))
	    (res    (apply proc db params)))
       (sqlite3:finalize! db)
       ;; (release-dot-lock fname)
       res))))

;; (fold-row PROC INIT DATABASE SQL . PARAMETERS) 
(define (portlogger:take-port db portnum)

Modified rmt.scm from [3e0b8a2905] to [9d5589428d].

74
75
76
77
78
79
80
81
82


83
84
85
86
87
88
89
74
75
76
77
78
79
80


81
82
83
84
85
86
87
88
89







-
-
+
+







;;
(define (rmt:get-connection-info run-id area-dat #!key (remote #f))
  (let ((cinfo (common:get-remote remote run-id)))
    (if cinfo
	cinfo
	;; NB// can cache the answer for server running for 10 seconds ...
	;;  ;; (and (not (rmt:write-frequency-over-limit? cmd run-id))
	(if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id)
	    (client:setup run-id remote: remote)
	(if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db area-dat)) run-id)
	    (client:setup run-id area-dat remote: remote)
	    #f))))

(define (rmt:discard-old-connections area-dat)
  ;; clean out old connections
  (mutex-lock! *db-multi-sync-mutex*)
  (let ((remote      (megatest:area-remote area-dat))
	(expire-time (- (current-seconds) (server:get-timeout area-dat) 10))) ;; don't forget the 10 second margin
103
104
105
106
107
108
109

110

111
112
113
114
115
116

117
118
119
120
121
122
123
124
125
126
127
128
129
130
131

132
133
134
135
136
137
138
139
140
141
142
143

144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164

165
166
167
168
169
170
171
103
104
105
106
107
108
109
110

111
112
113
114
115
116

117
118
119
120
121
122
123
124
125
126
127
128
129
130
131

132
133
134
135
136
137
138
139
140
141
142
143

144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164

165
166
167
168
169
170
171
172







+
-
+





-
+














-
+











-
+




















-
+







  (mutex-unlock! *db-multi-sync-mutex*))

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

(define (rmt:send-receive cmd rid params area-dat #!key (attemptnum 1)(remote #f)) ;; start attemptnum at 1 so the modulo below works as expected
  (rmt:discard-old-connections area-dat)
  ;; (mutex-lock! *send-receive-mutex*)
  (let* ((transport-type  (megatest:area-transport area-dat))
  (let* ((run-id          (if rid rid 0))
	 (run-id          (if rid rid 0))
	 (configdat       (megatest:area-configdat area-dat))
	 (connection-info (rmt:get-connection-info run-id area-dat)))
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (case *transport-type*
	(let* ((dat     (case transport-type
			  ((http)(condition-case
				  (http-transport:client-api-send-receive run-id connection-info cmd params)
				  ((commfail)(vector #f "communications fail"))
				  ((exn)(vector #f "other fail"))))
			  ((nmsg)(condition-case
				  (nmsg-transport:client-api-send-receive run-id connection-info cmd params)
				  ((timeout)(vector #f "timeout talking to server"))))
			  (else  (exit))))
	       (success (if (vector? dat) (vector-ref dat 0) #f))
	       (res     (if (vector? dat) (vector-ref dat 1) #f)))
	  (if (vector? connection-info)(http-transport:server-dat-update-last-access connection-info))
	  (if success
	      (begin
		;; (mutex-unlock! *send-receive-mutex*)
		(case *transport-type* 
		(case 
		  ((http) res) ;; (db:string->obj res))
		  ((nmsg) res))) ;; (vector-ref res 1)))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to rmt:send-receive again.")
		;; (case *transport-type*
		;;   ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info))))
		(common:del-remote! remote run-id) ;; don't keep using the same connection
		;; NOTE: killing server causes this process to block forever. No idea why. Dec 2. 
		;; (if (eq? (modulo attemptnum 5) 0)
		;;     (tasks:kill-server-run-id run-id tag: "api-send-receive-failed"))
		;; (mutex-unlock! *send-receive-mutex*) ;; close the mutex here to allow other threads access to communications
		(tasks:start-and-wait-for-server (tasks:open-db) run-id 15)
		(tasks:start-and-wait-for-server (tasks:open-db area-dat) run-id 15)
		;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1))))))

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
		;; (thread-sleep! 2)
		(rmt:send-receive cmd run-id params area-dat attemptnum: (+ attemptnum 1)))))
	;; no connection info? try to start a server, or access locally if no
	;; server and the query is read-only
	;;
	;; Note: The tasks db was checked for a server in starting mode in the rmt:get-connection-info call
	;;
	(if (and (< attemptnum 15)
		 (member cmd api:write-queries))
	    (let ((faststart (configf:lookup configdat "server" "faststart")))
	      (common:del-remote! remote run-id)
	      ;; (mutex-unlock! *send-receive-mutex*)
	      (if (and faststart (equal? faststart "no"))
		  (begin
		    (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
		    (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db area-dat)) run-id 10)
		    (thread-sleep! (random 5)) ;; give some time to settle and minimize collison?
		    (rmt:send-receive cmd rid params area-dat attemptnum: (+ attemptnum 1)))
		  (begin
		    (server:kind-run run-id area-dat)
		    (rmt:open-qry-close-locally cmd run-id params area-dat))))
	    (begin
	      ;; (debug:print 0 "ERROR: Communication failed!")
374
375
376
377
378
379
380
381

382
383
384
385
386
387
388
375
376
377
378
379
380
381

382
383
384
385
386
387
388
389







-
+







  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id) area-dat))

(define (rmt:open-test-db-by-test-id run-id test-id area-dat #!key (work-area #f))
  (let* ((test-path (if (string? work-area)
			work-area
			(rmt:test-get-rundir-from-test-id run-id test-id area-dat))))
    (debug:print 3 "TEST PATH: " test-path)
    (open-test-db test-path)))
    (open-test-db test-path area-dat)))

;; WARNING: This currently bypasses the transaction wrapped writes system
(define (rmt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment area-dat)
  (rmt:send-receive 'test-set-state-status-by-id run-id (list run-id test-id newstate newstatus newcomment) area-dat))

(define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus area-dat)
  (rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus) area-dat))

Modified rpc-transport.scm from [9f0167ff5f] to [75fd7447c7].

35
36
37
38
39
40
41
42

43
44
45
46

47
48
49
50

51
52
53
54
55
56

57
58
59
60
61

62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81

82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98

99
100
101
102

103
104
105
106
107
108
109
110
111
112
113
114
115
116

117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133

134
135
136
137
138
139
140
35
36
37
38
39
40
41

42
43
44
45

46
47
48
49

50
51
52
53
54
55

56
57
58
59
60

61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80

81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97

98
99
100
101

102
103
104
105
106
107
108
109
110
111
112
113
114
115

116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132

133
134
135
136
137
138
139
140







-
+



-
+



-
+





-
+




-
+



















-
+
















-
+



-
+













-
+
















-
+







   ;;    (apply (eval (string->symbol (conc "remote:" procstr))) params)
   (apply (eval (string->symbol procstr)) params)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (rpc-transport:launch run-id)
(define (rpc-transport:launch run-id area-dat)
  (set! *run-id*   run-id)
  (if (args:get-arg "-daemonize")
      (daemon:ize))
  (if (server:check-if-running run-id)
  (if (server:check-if-running run-id area-dat)
      (begin
	(debug:print 0 "INFO: Server for run-id " run-id " already running")
	(exit 0)))
  (let loop ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id))
  (let loop ((server-id (open-run-close tasks:server-lock-slot (lambda ()(tasks:open-db area-dat))run-id))
	     (remtries  4))
    (if (not server-id)
	(if (> remtries 0)
	    (begin
	      (thread-sleep! 2)
	      (loop (open-run-close tasks:server-lock-slot tasks:open-db run-id)
	      (loop (open-run-close tasks:server-lock-slot (lambda ()(tasks:open-db area-dat)) run-id)
		    (- remtries 1)))
	    (begin
	      ;; since we didn't get the server lock we are going to clean up and bail out
	      (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
	      (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db " rpc-transport:launch")))
	      (open-run-close tasks:server-delete-records-for-this-pid (lambda ()(tasks:open-db area-dat)) " rpc-transport:launch")))
	(begin
	  (rpc-transport:run (if (args:get-arg "-server")(args:get-arg "-server") "-") run-id server-id)
	  (exit)))))

(define (rpc-transport:run hostn run-id server-id area-dat)
  (debug:print 2 "Attempting to start the rpc server ...")
   ;; (trace rpc:publish-procedure!)

  (rpc:publish-procedure! 'server:login server:login)
  (rpc:publish-procedure! 'testing (lambda () "Just testing"))

  (let* ((configdat       (megatest:area-configdat area-dat))
	 (db              #f)
	 (hostname        (get-host-name))
	 (ipaddrstr       (let ((ipstr (if (string=? "-" hostn)
					   ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
					   (server:get-best-guess-address hostname)
					   #f)))
			    (if ipstr ipstr hostn))) ;; hostname))) 
	 (start-port      (open-run-close tasks:server-get-next-port tasks:open-db))
	 (start-port      (open-run-close tasks:server-get-next-port (lambda ()(tasks:open-db area-dat))))
	 (link-tree-path  (configf:lookup configdat "setup" "linktree"))
	 (rpc:listener    (rpc-transport:find-free-port-and-open (rpc:default-server-port)))
	 (th1             (make-thread
			   (lambda ()
			     ((rpc:make-server rpc:listener) #t))
			   "rpc:server"))
			   ;; (cute (rpc:make-server rpc:listener) "rpc:server")
			   ;; 'rpc:server))
	 (hostname        (if (string=? "-" hostn)
			      (get-host-name) 
			      hostn))
	 (ipaddrstr       (if (string=? "-" hostn)
			      (server:get-best-guess-address hostname) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
			      #f))
	 (portnum         (rpc:default-server-port))
	 (host:port       (conc (if ipaddrstr ipaddrstr hostname) ":" portnum))
	 (tdb             (tasks:open-db)))
	 (tdb             (tasks:open-db area-dat)))
    (thread-start! th1)
    (set! db *inmemdb*)
    (open-run-close tasks:server-set-interface-port 
		    tasks:open-db 
		    (lambda ()(tasks:open-db area-dat))
		    server-id 
		    ipaddrstr portnum)
    (debug:print 0 "Server started on " host:port)
    
    ;; (trace rpc:publish-procedure!)
    ;; (rpc:publish-procedure! 'server:login server:login)
    ;; (rpc:publish-procedure! 'testing (lambda () "Just testing"))

    ;;======================================================================
    ;;	  ;; end of publish-procedure section
    ;;======================================================================
    ;;
    (on-exit (lambda ()
	       (open-run-close tasks:server-set-state! tasks:open-db server-id "stopped")))
	       (open-run-close tasks:server-set-state! (lambda ()(tasks:open-db area-dat)) server-id "stopped")))

    (set! *rpc:listener* rpc:listener)
    (tasks:server-set-state! tdb server-id "running")
    (set! *inmemdb*  (db:setup run-id))
    ;; if none running or if > 20 seconds since 
    ;; server last used then start shutdown
    (let loop ((count 0))
      (thread-sleep! 5) ;; no need to do this very often
      (let ((numrunning -1)) ;; (db:get-count-tests-running db)))
	(if (or (> numrunning 0)
		(> (+ *last-db-access* 60)(current-seconds)))
	    (begin
	      (debug:print-info 0 "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*))
	      (loop (+ 1 count)))
	    (begin
	      (debug:print-info 0 "Starting to shutdown the server side")
	      (open-run-close tasks:server-delete-record tasks:open-db server-id " rpc-transport:try-start-server stop")
	      (open-run-close tasks:server-delete-record (lambda ()(tasks:open-db area-dat)) server-id " rpc-transport:try-start-server stop")
	      (thread-sleep! 10)
	      (debug:print-info 0 "Max cached queries was " *max-cache-size*)
	      (debug:print-info 0 "Server shutdown complete. Exiting")
	      ))))))

(define (rpc-transport:find-free-port-and-open port)
  (handle-exceptions
177
178
179
180
181
182
183
184

185
186
187
188
189
190
191
177
178
179
180
181
182
183

184
185
186
187
188
189
190
191







-
+







		  (let ((server-dat (list iface port #f #f #f)))
		    (common:set-remote! remote run-id server-dat)
		    server-dat)
		  (begin
		    (server:try-running run-id)
		    (thread-sleep! 2)
		    (rpc-transport:client-setup run-id area-dat remtries: (- remtries 1)))))
 	    (let* ((server-db-info (open-run-close tasks:get-server tasks:open-db run-id)))
 	    (let* ((server-db-info (open-run-close tasks:get-server (lambda ()(tasks:open-db area-dat)) run-id)))
 	      (debug:print-info 0 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	      (if server-db-info
 		  (let* ((iface     (tasks:hostinfo-get-interface server-db-info))
 			 (port      (tasks:hostinfo-get-port      server-db-info))
			 (server-dat (list iface port #f #f #f))
 			 (ping-res  ((rpc:procedure 'server:login host port) toppath)))
 		    (if start-res

Modified runs.scm from [6f4d489558] to [45c9c10cd7].

220
221
222
223
224
225
226
227

228
229
230
231
232
233
234
235

236
237
238
239
240
241
242
220
221
222
223
224
225
226

227
228
229
230
231
232
233
234

235
236
237
238
239
240
241
242







-
+







-
+







	 (test-records       (make-hash-table))
	 ;; need to process runconfigs before generating these lists
	 (all-tests-registry #f)  ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names
	 (all-test-names     #f)  ;; (hash-table-keys all-tests-registry))
	 (test-names         #f)  ;; (tests:filter-test-names all-test-names test-patts))
	 (required-tests     #f)  ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work
	 (task-key           (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
	 (tdbdat             (tasks:open-db)))
	 (tdbdat             (tasks:open-db area-dat)))

    (if (tasks:need-server run-id area-dat)(tasks:start-and-wait-for-server tdbdat run-id 10))

    (set-signal-handler! signal/int
			 (lambda (signum)
			   (signal-mask! signum)
			   (print "Received signal " signum ", cleaning up before exit. Please wait...")
			   (let ((tdbdat (tasks:open-db)))
			   (let ((tdbdat (tasks:open-db area-dat)))
			     (rmt:tasks-set-state-given-param-key task-key "killed"))
			   (print "Killed by signal " signum ". Exiting")
			   (exit)))

    ;; register this run in monitor.db
    (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params)
    (rmt:tasks-set-state-given-param-key task-key "running")
917
918
919
920
921
922
923
924

925
926
927
928
929
930
931
917
918
919
920
921
922
923

924
925
926
927
928
929
930
931







-
+







	(max-concurrent-jobs   (let ((mcj (config-lookup configdat "setup"     "max_concurrent_jobs")))
				 (if (and mcj (string->number mcj))
				     (string->number mcj)
				     1))) ;; length of the register queue ahead
	(reglen                (if (number? reglen-in) reglen-in 1))
	(last-time-incomplete  (- (current-seconds) 900)) ;; force at least one clean up cycle
	(last-time-some-running (current-seconds))
	(tdbdat                (tasks:open-db)))
	(tdbdat                (tasks:open-db area-dat)))

    ;; Initialize the test-registery hash with tests that already have a record
    ;; convert state to symbol and use that as the hash value
    (for-each (lambda (trec)
		(let ((id (db:test-get-id        trec))
		      (tn (db:test-get-testname  trec))
		      (ip (db:test-get-item-path trec))
1426
1427
1428
1429
1430
1431
1432
1433

1434
1435
1436
1437
1438
1439
1440
1426
1427
1428
1429
1430
1431
1432

1433
1434
1435
1436
1437
1438
1439
1440







-
+







;;    'set-state-status
;;
;; NB// should pass in keys?
;;
(define (runs:operate-on action target runnamepatt testpatt area-dat #!key (state #f)(status #f)(new-state-status #f)(mode 'remove-all)(options '()))
  (common:clear-caches) ;; clear all caches
  (let* ((db           #f)
	 (tdbdat       (tasks:open-db))
	 (tdbdat       (tasks:open-db area-dat))
	 (keys         (rmt:get-keys area-dat))
	 (rundat       (mt:get-runs-by-patt keys runnamepatt target))
	 (header       (vector-ref rundat 0))
	 (runs         (vector-ref rundat 1))
	 (states       (if state  (string-split state  ",") '()))
	 (statuses     (if status (string-split status ",") '()))
	 (state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f))))

Modified server.scm from [f70ac864c6] to [7ddbaf8b7c].

46
47
48
49
50
51
52
53

54
55
56
57

58
59
60
61
62
63
64
46
47
48
49
50
51
52

53
54
55
56

57
58
59
60
61
62
63
64







-
+



-
+







;;

;; all routes though here end in exit ...
;;
;; start_server
;;
(define (server:launch run-id area-dat)
  (case *transport-type*
  (case (megatest:area-transport area-dat)
    ((http)(http-transport:launch run-id area-dat))
    ((nmsg)(nmsg-transport:launch run-id area-dat))
    ((rpc)  (rpc-transport:launch run-id area-dat))
    (else (debug:print 0 "ERROR: unknown server type " *transport-type*))))
    (else (debug:print 0 "ERROR: unknown server type " (megatest:area-transport area-dat)))))
;;       (else   (debug:print 0 "ERROR: No known transport set, transport=" transport ", using rpc")
;; 	      (rpc-transport:launch run-id)))))

;;======================================================================
;; S E R V E R   U T I L I T I E S 
;;======================================================================

80
81
82
83
84
85
86
87

88
89
90
91
92
93
94
95
96
97
98
99
100

101
102
103
104
105
106
107
80
81
82
83
84
85
86

87
88
89
90
91
92
93
94
95
96
97
98
99

100
101
102
103
104
105
106
107







-
+












-
+







			   (lambda ()
			     (write (list (current-directory)
					  (argv)))))))

;; When using zmq this would send the message back (two step process)
;; with spiffy or rpc this simply returns the return data to be returned
;; 
(define (server:reply return-addr query-sig success/fail result #!key (remote #f))
(define (server:reply return-addr query-sig success/fail result area-dat #!key (remote #f))
  (debug:print-info 11 "server:reply return-addr=" return-addr ", result=" result)
  ;; (send-message pubsock target send-more: #t)
  ;; (send-message pubsock 
  (case (server:get-transport)
    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
    ((http) (db:obj->string (vector success/fail query-sig result)))
    ((zmq)
     (let ((pub-socket (vector-ref (common:get-remote remote #f) 1)))
       (send-message pub-socket return-addr send-more: #t)
       (send-message pub-socket (db:obj->string (vector success/fail query-sig result)))))
    ((fs)   result)
    (else 
     (debug:print 0 "ERROR: unrecognised transport type: " *transport-type*)
     (debug:print 0 "ERROR: unrecognised transport type: " (megatest:area-transport area-dat))
     result)))

;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; try running on that host
;;
(define  (server:run run-id area-dat)
155
156
157
158
159
160
161
162
163


164
165
166
167
168
169
170
171
172

173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192


193
194
195
196
197
198
199
155
156
157
158
159
160
161


162
163
164
165
166
167
168
169
170
171

172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190


191
192
193
194
195
196
197
198
199







-
-
+
+








-
+


















-
-
+
+







;; The generic run a server command. Dispatches the call to server 0 if run-id != 0
;; 
(define (server:try-running run-id area-dat)
  (if (eq? run-id 0)
      (server:run run-id area-dat)
      (rmt:start-server run-id)))

(define (server:check-if-running run-id)
  (let ((tdbdat (tasks:open-db)))
(define (server:check-if-running run-id area-dat)
  (let ((tdbdat (tasks:open-db area-dat)))
    (let loop ((server (tasks:get-server (db:delay-if-busy tdbdat) run-id))
	       (trycount 0))
    (if server
	;; note: client:start will set (common:get-remote remote). this needs to be changed
	;;       also, client:start will login to the server, also need to change that.
	;;
	;; client:start returns #t if login was successful.
	;;
	(let ((res (case *transport-type*
	(let ((res (case (megatest:area-transport area-dat)
		     ((http)(server:ping-server run-id 
						(tasks:hostinfo-get-interface server)
						(tasks:hostinfo-get-port      server)))
		     ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
						 (tasks:hostinfo-get-port      server)
						 timeout: 2)))))
	  ;; if the server didn't respond we must remove the record
	  (if res
	      #t
	      (begin
		(debug:print-info 0 "server at " server " not responding, removing record")
		(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id 
				" server:check-if-running")
		res)))
	#f))))

;; called in megatest.scm, host-port is string hostname:port
;;
(define (server:ping run-id host:port)
  (let ((tdbdat (tasks:open-db)))
(define (server:ping run-id host:port area-dat)
  (let ((tdbdat (tasks:open-db area-dat)))
    (let* ((host-port (let ((slst (string-split   host:port ":")))
			(if (eq? (length slst) 2)
			    (list (car slst)(string->number (cadr slst)))
			    #f)))
	   (toppath       (launch:setup-for-run))
	   (server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat) run-id) #f)))
      (if (not run-id)

Modified tasks.scm from [af8f785c6d] to [f2e471c669].

98
99
100
101
102
103
104
105

106
107
108
109
110
111
112
98
99
100
101
102
103
104

105
106
107
108
109
110
111
112







-
+







			     ((file-read-access? dbpath)    (sqlite3:open-database dbfile))
			     (else (sqlite3:open-database ":memory:")))) ;; (never-give-up-open-db dbpath))
	      (handler      (make-busy-timeout 36000)))
	 (if (and exists
		  (not write-access))
	     (set! *db-write-access* write-access)) ;; only unset so other db's also can use this control
	 (sqlite3:set-busy-handler! mdb handler)
	 (db:set-sync mdb) ;; (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;"))
	 (db:set-sync mdb area-dat) ;; (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;"))
	 ;;  (if (or (and (not exists)
	 ;; 	      (file-write-access? toppath))
	 ;; 	 (not (file-read-access? dbpath)))
	 ;;      (begin
	 ;; 
	 ;; TASKS QUEUE MOVED TO main.db
	 ;;
164
165
166
167
168
169
170
171

172
173
174
175

176
177
178
179
180
181

182
183
184
185
186
187
188
189
190
191
192
193
194
195

196
197
198
199
200
201
202
164
165
166
167
168
169
170

171
172
173
174

175
176
177
178
179
180

181
182
183
184
185
186
187
188
189
190
191
192
193
194

195
196
197
198
199
200
201
202







-
+



-
+





-
+













-
+







(define (tasks:hostinfo-get-interface   vec)    (vector-ref  vec 1))
(define (tasks:hostinfo-get-port        vec)    (vector-ref  vec 2))
(define (tasks:hostinfo-get-pubport     vec)    (vector-ref  vec 3))
(define (tasks:hostinfo-get-transport   vec)    (vector-ref  vec 4))
(define (tasks:hostinfo-get-pid         vec)    (vector-ref  vec 5))
(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id)
(define (tasks:server-lock-slot mdb run-id area-dat)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot")
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id)
	(tasks:server-set-available mdb run-id area-dat)
	(thread-sleep! (/ (random 1500) 1000)) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id)
(define (tasks:server-set-available mdb run-id area-dat)
  (sqlite3:execute 
   mdb 
   "INSERT INTO servers (pid,hostname,port,pubport,start_time,      priority,state,mt_version,heartbeat,   interface,transport,run_id)
                   VALUES(?, ?,       ?,   ?, strftime('%s','now'), ?,       ?,    ?,-1,?,        ?,        ?);"
   (current-process-id)          ;; pid
   (get-host-name)               ;; hostname
   -1                            ;; port
   -1                            ;; pubport
   (random 1000)                 ;; priority (used a tiebreaker on get-available)
   "available"                   ;; state
   (common:version-signature)    ;; mt_version
   -1                            ;; interface
   ;; (conc (server:get-transport)) ;; transport
   (conc *transport-type*)    ;; transport
   (conc (megatest:area-transport area-dat))    ;; transport
   run-id
   ))

(define (tasks:num-in-available-state mdb run-id)
  (let ((res 0))
    (sqlite3:for-each-row
     (lambda (num-in-queue)

Modified tdb.scm from [6d49ce2a57] to [8118e8579d].

38
39
40
41
42
43
44
45

46
47
48
49
50
51
52
38
39
40
41
42
43
44

45
46
47
48
49
50
51
52







-
+







;;======================================================================

;;======================================================================
;; T E S T   S P E C I F I C   D B 
;;======================================================================

;; Create the sqlite db for the individual test(s)
(define (open-test-db work-area) 
(define (open-test-db work-area area-dat) 
  (debug:print-info 11 "open-test-db " work-area)
  (if (and work-area 
	   (directory? work-area)
	   (file-read-access? work-area))
      (let* ((dbpath              (conc work-area "/testdat.db"))
	     (dbexists            (file-exists? dbpath))
	     (work-area-writeable (file-write-access? work-area))
69
70
71
72
73
74
75
76

77
78
79
80
81
82
83
69
70
71
72
73
74
75

76
77
78
79
80
81
82
83







-
+







					       136000))))
	
	(if (and tdb-writeable
		 *db-write-access*)
	    (sqlite3:set-busy-handler! db handler))
	(if (not dbexists)
	    (begin
	      (db:set-sync db) ;; (sqlite3:execute db "PRAGMA synchronous = FULL;")
	      (db:set-sync db area-dat) ;; (sqlite3:execute db "PRAGMA synchronous = FULL;")
	      (debug:print-info 11 "Initialized test database " dbpath)
	      (tdb:testdb-initialize db)))
	;; (sqlite3:execute db "PRAGMA synchronous = 0;")
	(debug:print-info 11 "open-test-db END (sucessful)" work-area)
	;; now let's test that everything is correct
	(handle-exceptions
	 exn
95
96
97
98
99
100
101
102

103
104
105
106
107

108
109
110

111
112
113
114
115

116
117
118

119
120
121
122

123
124
125
126
127
128
129
95
96
97
98
99
100
101

102
103
104
105
106

107
108
109

110
111
112
113
114

115
116
117

118
119
120
121

122
123
124
125
126
127
128
129







-
+




-
+


-
+




-
+


-
+



-
+







      (let ((baddb (sqlite3:open-database ":memory:")))
 	(debug:print-info 11 "open-test-db END (unsucessful)" work-area)
 	;; provide an in-mem db (this is dangerous!)
 	(tdb:testdb-initialize baddb)
 	baddb)))

;; find and open the testdat.db file for an existing test
(define (tdb:open-test-db-by-test-id test-id #!key (work-area #f))
(define (tdb:open-test-db-by-test-id test-id area-dat #!key (work-area #f))
  (let* ((test-path (if work-area
			work-area
			(rmt:test-get-rundir-from-test-id test-id))))
    (debug:print 3 "TEST PATH: " test-path)
    (open-test-db test-path)))
    (open-test-db test-path area-dat)))

;; find and open the testdat.db file for an existing test
(define (tdb:open-test-db-by-test-id-local dbstruct run-id test-id #!key (work-area #f))
(define (tdb:open-test-db-by-test-id-local dbstruct area-dat run-id test-id #!key (work-area #f))
  (let* ((test-path (if work-area
			work-area
			(db:test-get-rundir-from-test-id dbstruct run-id test-id))))
    (debug:print 3 "TEST PATH: " test-path)
    (open-test-db test-path)))
    (open-test-db test-path area-dat)))

;; find and open the testdat.db file for an existing test
(define (tdb:open-run-close-db-by-test-id-local dbstruct run-id test-id work-area proc . params)
(define (tdb:open-run-close-db-by-test-id-local dbstruct area-dat run-id test-id work-area proc . params)
  (let* ((test-path (if work-area
			work-area
			(db:test-get-rundir-from-test-id dbstruct run-id test-id)))
	 (tdb        (open-test-db test-path)))
	 (tdb        (open-test-db test-path area-dat)))
    (apply proc tdb params)))

(define (tdb:testdb-initialize db)
  (debug:print 11 "db:testdb-initialize START")
  (sqlite3:with-transaction
   db
   (lambda ()
366
367
368
369
370
371
372
373
374


375
376
377
378
379
380
381
366
367
368
369
370
371
372


373
374
375
376
377
378
379
380
381







-
-
+
+







			 (if (eq? time-a time-b)
			     (string<? (conc (vector-ref a 2))
				       (conc (vector-ref b 2)))
			     #f))
		     (string<? (conc time-a)(conc time-b))))))))

;; 
(define (tdb:remote-update-testdat-meta-info run-id test-id work-area cpuload diskfree minutes)
  (let ((tdb         (rmt:open-test-db-by-test-id run-id test-id work-area: work-area)))
(define (tdb:remote-update-testdat-meta-info run-id test-id work-area cpuload diskfree minutes area-dat)
  (let ((tdb         (rmt:open-test-db-by-test-id run-id test-id area-dat work-area: work-area)))
    (if (sqlite3:database? tdb)
	(begin
	  (sqlite3:execute tdb "INSERT INTO test_rundat (update_time,cpuload,diskfree,run_duration) VALUES (strftime('%s','now'),?,?,?);"
			   cpuload diskfree minutes)
	  (sqlite3:finalize! tdb))
	(debug:print 2 "Can't update testdat.db for test " test-id " read-only or non-existant"))))
    

Modified tests/unittests/basicserver.scm from [eb22c48242] to [28cf57e0e6].

19
20
21
22
23
24
25
26

27
28
29
30
31
32

33
34
35
36
37
38
39
40
41

42
43
44
45
46

47
48
49
50
51
52
53

54
55
56
57
58
59
60
19
20
21
22
23
24
25

26
27
28
29
30
31

32
33
34
35
36
37
38
39
40

41
42
43
44
45

46
47
48
49
50
51
52

53
54
55
56
57
58
59
60







-
+





-
+








-
+




-
+






-
+







;; (test #f '("SYSTEM" "RELEASE") (rmt:get-keys))

;; (exit)

;; Server tests go here 
(for-each
 (lambda (run-id)
   (test #f #f (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id))
   (test #f #f (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db *area-dat*) *area-dat*) run-id))
   (server:kind-run run-id *area-dat*)
   (test "did server start within 20 seconds?"
	 #t
	 (let loop ((remtries 20)
		    (running (tasks:server-running-or-starting? (db:delay-if-busy
								 (tasks:open-db))
								 (tasks:open-db *area-dat*) *area-dat*)
								run-id)))
	   (if running 
	       (> running 0)
	       (if (> remtries 0)
		   (begin
		     (thread-sleep! 1)
		     (loop (- remtries 1)
			   (tasks:server-running-or-starting? (db:delay-if-busy
							       (tasks:open-db))
							       (tasks:open-db *area-dat*) *area-dat*)
							      run-id)))))))
   
   (test "did server become available" #t
	 (let loop ((remtries 10)
		    (res      (tasks:get-server (db:delay-if-busy (tasks:open-db)) run-id)))
		    (res      (tasks:get-server (db:delay-if-busy (tasks:open-db *area-dat*) *area-dat*) run-id)))
	   (if res
	       (vector? res)
	       (begin
		 (if (> remtries 0)
		     (begin
		       (thread-sleep! 1.1)
		       (loop (- remtries 1)(tasks:get-server (db:delay-if-busy (tasks:open-db)) run-id)))
		       (loop (- remtries 1)(tasks:get-server (db:delay-if-busy (tasks:open-db *area-dat*) *area-dat*) run-id)))
		     res)))))
   )
 (list 0 1))

(define user    (current-user-name))
(define runname "mytestrun")
(define keys    (rmt:get-keys))
115
116
117
118
119
120
121
122

123
124
125
126
127
128
129

130
131
132
133
134
135
136
115
116
117
118
119
120
121

122
123
124
125
126
127
128

129
130
131
132
133
134
135
136







-
+






-
+







				"runname")))

(for-each (lambda (run-id)
;; test killing server
;;
(tasks:kill-server-run-id run-id)

(test #f #f (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id))
(test #f #f (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db *area-dat*) *area-dat*) run-id))
)
(list 0 1))

;; Tests to assess reading/writing while servers are starting/stopping
(define start-time (current-seconds))
(let loop ((test-state 'start))
  (let* ((server-dats (tasks:get-server-records (db:delay-if-busy (tasks:open-db)) run-id))
  (let* ((server-dats (tasks:get-server-records (db:delay-if-busy (tasks:open-db *area-dat*) *area-dat*) run-id))
	 (first-dat   (if (not (null? server-dats))
			  (car server-dats)
			  #f)))
    (map (lambda (dat)
	   (apply print (intersperse (vector->list dat) ", ")))
	 server-dats)
    (test #f test-one-rec (rmt:get-test-info-by-id run-id test-one-id))
161
162
163
164
165
166
167
168
169


170
171
172
173
174


175
176
177
178
179
180
181
182
183
184
185
186
187
188

189
190
191
192
193
194
195
196

197
198
199
200
201
202
203
161
162
163
164
165
166
167


168
169
170
171
172


173
174
175
176
177
178
179
180
181
182
183
184
185
186
187

188
189
190
191
192
193
194
195

196
197
198
199
200
201
202
203







-
-
+
+



-
-
+
+













-
+







-
+








;; (set! *transport-type* 'http)
;; 
;; (test "setup for run" #t (begin (launch:setup-for-run)
;; 				(string? (getenv "MT_RUN_AREA_HOME"))))
;; 
;; (test "server-register, get-best-server" #t (let ((res #f))
;; 					      (open-run-close tasks:server-register tasks:open-db 1 "bob" 1234 100 'live 'http)
;; 					      (set! res (open-run-close tasks:get-best-server tasks:open-db))
;; 					      (open-run-close tasks:server-register tasks:open-db *area-dat* 1 "bob" 1234 100 'live 'http)
;; 					      (set! res (open-run-close tasks:get-best-server tasks:open-db *area-dat*))
;; 					      (number? (vector-ref res 3))))
;; 
;; (test "de-register server" #f (let ((res #f))
;; 				(open-run-close tasks:server-deregister tasks:open-db "bob" port: 1234)
;; 				(vector? (open-run-close tasks:get-best-server tasks:open-db))))
;; 				(open-run-close tasks:server-deregister tasks:open-db *area-dat* "bob" port: 1234)
;; 				(vector? (open-run-close tasks:get-best-server tasks:open-db *area-dat*))))
;; 
;; (define server-pid #f)
;; 
;; ;; Not sure how the following should work, replacing it with system of megatest -server
;; ;; (test "launch server" #t (let ((pid (process-fork (lambda ()
;; ;; 						    ;; (daemon:ize)
;; ;; 						    (server:launch 'http)))))
;; ;; 			   (set! server-pid pid)
;; ;; 			   (number? pid)))
;; (system "../../bin/megatest -server - -debug 22 > server.log 2> server.log &")
;; 
;; (let loop ((n 10))
;;   (thread-sleep! 1) ;; need to wait for server to start.
;;   (let ((res (open-run-close tasks:get-best-server tasks:open-db)))
;;   (let ((res (open-run-close tasks:get-best-server tasks:open-db *area-dat*)))
;;     (print "tasks:get-best-server returned " res)
;;     (if (and (not res)
;; 	     (> n 0))
;; 	(loop (- n 1)))))
;; 
;; (test "get-best-server" #t (begin 
;; 			     (client:launch)
;; 			     (let ((dat (open-run-close tasks:get-best-server tasks:open-db)))
;; 			     (let ((dat (open-run-close tasks:get-best-server tasks:open-db *area-dat*)))
;; 			       (vector? dat))))
;; 
;; (define *keys*               (keys:config-get-fields *configdat*))
;; (define *keyvals*            (keys:target->keyval *keys* "a/b/c"))
;; 
;; (test #f #t                       (string? (car *runremote*)))
;; (test #f '(#t "successful login") (rmt:login)) ;;  *runremote* *toppath* *my-client-signature*)))

Modified zmq-transport.scm from [fa40b1e46e] to [46e65cf05a].

108
109
110
111
112
113
114
115

116
117
118
119
120
121
122
108
109
110
111
112
113
114

115
116
117
118
119
120
121
122







-
+








    (set! *runremote* (vector pull-socket pub-socket)) ;; overloading the use of (common:get-remote remote) BUG!?

    ;; what to do when we quit
    ;;
;;     (on-exit (lambda ()
;; 	       (if (and toppath *server-info*)
;; 		   (open-run-close tasks:server-deregister-self tasks:open-db (car *server-info*))
;; 		   (open-run-close tasks:server-deregister-self (lambda ()(tasks:open-db area-dat)) (car *server-info*))
;; 		   (let loop () 
;; 		     (let ((queue-len 0))
;; 		       (thread-sleep! (random 5))
;; 		       (mutex-lock! *incoming-mutex*)
;; 		       (set! queue-len (length *incoming-data*))
;; 		       (mutex-unlock! *incoming-mutex*)
;; 		       (if (> queue-len 0)
146
147
148
149
150
151
152
153

154
155
156
157
158
159
160
146
147
148
149
150
151
152

153
154
155
156
157
158
159
160







-
+







	      
	      (loop '()))
	    (loop (cons packet queue-lst)))))))

;; run zmq-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (zmq-transport:keep-running)
(define (zmq-transport:keep-running area-dat)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (let* ((server-info (let loop ()
			(let ((sdat #f))
			  (mutex-lock! *heartbeat-mutex*)
			  (set! sdat *server-info*)
176
177
178
179
180
181
182
183




184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203




204
205
206
207
208
209
210
176
177
178
179
180
181
182

183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205

206
207
208
209
210
211
212
213
214
215
216







-
+
+
+
+



















-
+
+
+
+







      ;; GET REAL QUEUE LENGTH FROM THE VARIABLE
      (let ((queue-len 0)) ;; FOR NOW DO NOT DO THIS (cdb:client-call zmq-sockets 'sync #t 1)))
      ;; (print "Server running, count is " count)
	(if (< count 1) ;; 3x3 = 9 secs aprox
	    (loop (+ count 1)))

	;; NOTE: Get rid of this mechanism! It really is not needed...
	(open-run-close tasks:server-update-heartbeat tasks:open-db (car server-info))
	(open-run-close tasks:server-update-heartbeat
			(lambda ()
			  (tasks:open-db area-dat))
			(car server-info))

	;; (if ;; (or (> numrunning 0) ;; stay alive for two days after last access
	(mutex-lock! *heartbeat-mutex*)
	(set! last-access *last-db-access*)
	(mutex-unlock! *heartbeat-mutex*)
	(if (> (+ last-access
		  ;; (* 50 60 60)    ;; 48 hrs
		  ;; 60              ;; one minute
		  ;; (* 60 60)       ;; one hour
		  (* 45 60)          ;; 45 minutes, until the db deletion bug is fixed.
		  )
	       (current-seconds))
	    (begin
	      (debug:print-info 2 "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	      (loop 0))
	    (begin
	      (debug:print-info 0 "Starting to shutdown the server.")
	      ;; need to delete only *my* server entry (future use)
	      (set! *time-to-exit* #t)
	      (open-run-close tasks:server-deregister-self tasks:open-db (get-host-name))
	      (open-run-close tasks:server-deregister-self
			      (lambda ()
				(tasks:open-db area-dat))
			      (get-host-name))
	      (thread-sleep! 1)
	      (debug:print-info 0 "Max cached queries was " *max-cache-size*)
	      (debug:print-info 0 "Server shutdown complete. Exiting")
	      (exit)))))))

(define (zmq-transport:find-free-port-and-open iface s port stype #!key (trynum 50))
  (let ((s (if s s (make-socket stype)))
232
233
234
235
236
237
238
239

240
241
242
243
244
245
246
238
239
240
241
242
243
244

245
246
247
248
249
250
251
252







-
+







         (p1 (caddr s1))
         (s2 (zmq-transport:find-free-port-and-open ipaddrstr #f (+ 1 (if p1 p1 (+ startport 1))) 'pub))
         (p2 (caddr s2)))
    (set! *runremote* #f)
    (debug:print 0 "Server started on " ipaddrstr " ports " p1 " and " p2)
    (mutex-lock! *heartbeat-mutex*)
    (set! *server-info* (open-run-close tasks:server-register 
					tasks:open-db 
					(lambda ()(tasks:open-db area-dat))
					(current-process-id) 
					ipaddrstr p1 
					0 
					'live
					'zmq
					pubport: p2))
    (debug:print-info 11 "*server-info* set to " *server-info*)
302
303
304
305
306
307
308
309

310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325

326
327
328
329
330
331
332
308
309
310
311
312
313
314

315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330

331
332
333
334
335
336
337
338







-
+















-
+







	  (debug:print-info 2 "Failed to login or connect to " conurl)
	  (set! *runremote* #f)
	  #f))))

;; run zmq-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (zmq-transport:keep-running)
(define (zmq-transport:keep-running area-dat)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (let* ((server-info (let loop ()
                        (let ((sdat #f))
                          (mutex-lock! *heartbeat-mutex*)
                          (set! sdat *runremote*)
                          (mutex-unlock! *heartbeat-mutex*)
                          (if sdat sdat
                              (begin
                                (sleep 4)
                                (loop))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (tdb         (tasks:open-db))
	 (tdb         (tasks:open-db area-dat))
	 (spid        (tasks:server-get-server-id tdb #f iface port #f)))
    (print "Keep-running got server pid " spid ", using iface " iface " and port " port)
    (let loop ((count 0))
      (thread-sleep! 4) ;; no need to do this very often
      ;; NB// sync currently does NOT return queue-length
      (let () ;; (queue-len (cdb:client-call server-info 'sync #t 1)))
      ;; (print "Server running, count is " count)