Megatest

Check-in [da7f14af0e]
Login
Overview
Comment:Untested cleanup
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | multi-area
Files: files | file ages | folders
SHA1: da7f14af0ef7f1b6ffccea82a0c210cb3d65f881
User & Date: matt on 2015-04-05 23:02:59
Other Links: branch diff | manifest | tags
Context
2015-04-05
23:16
more untested cleanup check-in: c9180c4d63 user: matt tags: multi-area
23:02
Untested cleanup check-in: da7f14af0e user: matt tags: multi-area
22:04
More cleanup after big tear up check-in: 2b14c9b4a7 user: matt tags: multi-area
Changes

Modified client.scm from [671f590b03] to [141f2f00f0].

46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
(define (client:connect iface port)
  (case (server:get-transport)
    ((rpc)  (rpc:client-connect  iface port))
    ((http) (http:client-connect iface port))
    ((zmq)  (zmq:client-connect  iface port))
    (else   (rpc:client-connect  iface port))))

(define (client:setup  run-id #!key (remaining-tries 10) (failed-connects 0))
  (case (server:get-transport)
    ((rpc) (rpc-transport:client-setup run-id)) ;;(client:setup-rpc run-id))
    ((http)(client:setup-http run-id))
    (else  (rpc-transport:client-setup run-id)))) ;; (client:setup-rpc run-id))))

;; (define (client:login-no-auto-setup server-info run-id)
;;   (case (server:get-transport)
;;     ((rpc)  (rpc:login-no-auto-client-setup server-info run-id))
;;     ((http) (rmt:login-no-auto-client-setup server-info run-id))
;;     (else   (rpc:login-no-auto-client-setup server-info run-id))))
;; 







|

|
|
|







46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
(define (client:connect iface port)
  (case (server:get-transport)
    ((rpc)  (rpc:client-connect  iface port))
    ((http) (http:client-connect iface port))
    ((zmq)  (zmq:client-connect  iface port))
    (else   (rpc:client-connect  iface port))))

(define (client:setup run-id area-dat #!key (remaining-tries 10) (failed-connects 0))
  (case (server:get-transport)
    ((rpc) (rpc-transport:client-setup run-id area-dat))
    ((http)(client:setup-http run-id area-dat))
    (else  (rpc-transport:client-setup run-id area-dat))))

;; (define (client:login-no-auto-setup server-info run-id)
;;   (case (server:get-transport)
;;     ((rpc)  (rpc:login-no-auto-client-setup server-info run-id))
;;     ((http) (rmt:login-no-auto-client-setup server-info run-id))
;;     (else   (rpc:login-no-auto-client-setup server-info run-id))))
;; 
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159

160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
;; 		    (hash-table-set! (common:get-remote remote) run-id start-res)
;; 		    start-res)  ;; return the server info
;; 		  (if (member remaining-tries '(3 4 6))
;; 		      (begin    ;; login failed
;; 			(debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" host-info)
;; 			(hash-table-delete! (common:get-remote remote) run-id)
;; 			(open-run-close tasks:server-force-clean-run-record
;; 			 		tasks:open-db
;; 			 		run-id 
;; 			 		(car  host-info)
;; 			 		(cadr host-info)
;; 					" client:setup (host-info=#t)")
;; 			(thread-sleep! 5)
;; 			(client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1)))
;; 		      (begin
;; 			(debug:print 25 "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info)
;; 			(thread-sleep! 5)
;; 			(client:setup run-id remaining-tries: (- remaining-tries 1))))))
;; 	    ;; YUK: rename server-dat here
;; 	    (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id)))
;; 	      (debug:print-info 0 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
;; 	      (if server-dat
;; 		  (let* ((iface     (tasks:hostinfo-get-interface server-dat))
;; 			 (port      (tasks:hostinfo-get-port      server-dat))
;; 			 (start-res (http-transport:client-connect iface port))
;; 			 ;; (ping-res  (server:ping-server run-id iface port))
;; 			 (ping-res  (rmt:login-no-auto-client-setup start-res run-id)))
;; 		    (if start-res
;; 			(begin
;; 			  (hash-table-set! (common:get-remote remote) run-id start-res)
;; 			  start-res)
;; 			(if (member remaining-tries '(2 5))
;; 			    (begin    ;; login failed
;; 			      (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
;; 			      (hash-table-delete! (common:get-remote remote) run-id)
;; 			      (open-run-close tasks:server-force-clean-run-record
;; 					      tasks:open-db
;; 					      run-id 
;; 					      (tasks:hostinfo-get-interface server-dat)
;; 					      (tasks:hostinfo-get-port      server-dat)
;; 					      " client:setup (server-dat = #t)")
;; 			      (thread-sleep! 2)
;; 			      (server:try-running run-id)
;; 			      (thread-sleep! 10) ;; give server a little time to start up
;; 			      (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1)))
;; 			    (begin
;; 			      (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
;; 			      (thread-sleep! 5)
;; 			      (client:setup run-id remaining-tries: (- remaining-tries 1))))))
;; 		  (begin    ;; no server registered
;; 		    (if (eq? remaining-tries 2)
;; 			(begin
;; 			  ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)")
;; 			  (client:setup run-id remaining-tries: 10))
;; 			(begin
;; 			  (thread-sleep! 2) 
;; 			  (debug:print 25 "INFO: client:setup start-res (not defined here), run-id=" run-id ", server-dat=" server-dat)
;; 			  (if (< (open-run-close tasks:num-in-available-state tasks:open-db run-id) 3)
;; 			      (begin
;; 				;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)")
;; 				(server:try-running run-id)))
;; 			  (thread-sleep! 10) ;; give server a little time to start up
;; 			  (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))))

;; Do all the connection work, look up the transport type and set up the
;; connection if required.
;;
;; There are two scenarios. 
;;   1. We are a test manager and we received *transport-type* and (common:get-remote remote) via cmdline
;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      *transport-type* and (common:get-remote remote) from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove (common:get-remote remote) stuff
;;
(define (client:setup-http run-id #!key (remaining-tries 10) (failed-connects 0)(remote #f))
  (debug:print-info 2 "client:setup remaining-tries=" remaining-tries)
  (let* ((tdbdat (tasks:open-db)))

    (if (<= remaining-tries 0)
	(begin
	  (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	  (exit 1))
	(let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
	  (debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	  (if server-dat
	      (let* ((iface     (tasks:hostinfo-get-interface server-dat))
		     (hostname  (tasks:hostinfo-get-hostname  server-dat))
		     (port      (tasks:hostinfo-get-port      server-dat))
		     (start-res (case *transport-type*
				  ((http)(http-transport:client-connect iface port))
				  ((nmsg)(nmsg-transport:client-connect hostname port))))
		     (ping-res  (case *transport-type* 
				  ((http)(rmt:login-no-auto-client-setup start-res run-id))
				  ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id)))
 					   (if logininfo
 					       (car (vector-ref logininfo 1))
 					       #f))))))
		(if (and start-res
			 ping-res)
		    (begin
		      (common:set-remote! remote run-id start-res)
		      (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res))
		      start-res)
		    (begin    ;; login failed but have a server record, clean out the record and try again
		      (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
		      (case *transport-type* 
			((http)(http-transport:close-connections run-id)))
		      (common:del-remote! remote run-id)
		      (tasks:kill-server-run-id run-id)
		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
							   run-id 
							   (tasks:hostinfo-get-interface server-dat)
							   (tasks:hostinfo-get-port      server-dat)
							   " client:setup (server-dat = #t)")
		      (if (> remaining-tries 8)
			  (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
			  (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time
		      (server:try-running run-id)
		      (thread-sleep! 5)   ;; give server a little time to start up
		      (client:setup run-id remaining-tries: (- remaining-tries 1))
		      )))
	      (begin    ;; no server registered
		(let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id)))
		  (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
		  (if (< num-available 2)
		      (server:try-running run-id))
		  (thread-sleep! (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
		  (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))

;; keep this as a function to ease future 
(define (client:start run-id server-info)
  (http-transport:client-connect (tasks:hostinfo-get-interface server-info)
				 (tasks:hostinfo-get-port server-info)))

;; client:signal-handler







|











|
















|















|




|

|








|

|





|

|
>










|


|













|













|







|







81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
;; 		    (hash-table-set! (common:get-remote remote) run-id start-res)
;; 		    start-res)  ;; return the server info
;; 		  (if (member remaining-tries '(3 4 6))
;; 		      (begin    ;; login failed
;; 			(debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" host-info)
;; 			(hash-table-delete! (common:get-remote remote) run-id)
;; 			(open-run-close tasks:server-force-clean-run-record
;; 			 		(lambda ()(tasks:open-db area-dat))
;; 			 		run-id 
;; 			 		(car  host-info)
;; 			 		(cadr host-info)
;; 					" client:setup (host-info=#t)")
;; 			(thread-sleep! 5)
;; 			(client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1)))
;; 		      (begin
;; 			(debug:print 25 "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info)
;; 			(thread-sleep! 5)
;; 			(client:setup run-id remaining-tries: (- remaining-tries 1))))))
;; 	    ;; YUK: rename server-dat here
;; 	    (let* ((server-dat (open-run-close tasks:get-server (lambda ()(tasks:open-db area-dat)) run-id)))
;; 	      (debug:print-info 0 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
;; 	      (if server-dat
;; 		  (let* ((iface     (tasks:hostinfo-get-interface server-dat))
;; 			 (port      (tasks:hostinfo-get-port      server-dat))
;; 			 (start-res (http-transport:client-connect iface port))
;; 			 ;; (ping-res  (server:ping-server run-id iface port))
;; 			 (ping-res  (rmt:login-no-auto-client-setup start-res run-id)))
;; 		    (if start-res
;; 			(begin
;; 			  (hash-table-set! (common:get-remote remote) run-id start-res)
;; 			  start-res)
;; 			(if (member remaining-tries '(2 5))
;; 			    (begin    ;; login failed
;; 			      (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
;; 			      (hash-table-delete! (common:get-remote remote) run-id)
;; 			      (open-run-close tasks:server-force-clean-run-record
;; 					      (lambda ()(tasks:open-db area-dat))
;; 					      run-id 
;; 					      (tasks:hostinfo-get-interface server-dat)
;; 					      (tasks:hostinfo-get-port      server-dat)
;; 					      " client:setup (server-dat = #t)")
;; 			      (thread-sleep! 2)
;; 			      (server:try-running run-id)
;; 			      (thread-sleep! 10) ;; give server a little time to start up
;; 			      (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1)))
;; 			    (begin
;; 			      (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
;; 			      (thread-sleep! 5)
;; 			      (client:setup run-id remaining-tries: (- remaining-tries 1))))))
;; 		  (begin    ;; no server registered
;; 		    (if (eq? remaining-tries 2)
;; 			(begin
;; 			  ;; (open-run-close tasks:server-clean-out-old-records-for-run-id (lambda ()(tasks:open-db area-dat)) run-id " client:setup (server-dat=#f)")
;; 			  (client:setup run-id remaining-tries: 10))
;; 			(begin
;; 			  (thread-sleep! 2) 
;; 			  (debug:print 25 "INFO: client:setup start-res (not defined here), run-id=" run-id ", server-dat=" server-dat)
;; 			  (if (< (open-run-close tasks:num-in-available-state (lambda ()(tasks:open-db area-dat)) run-id) 3)
;; 			      (begin
;; 				;; (open-run-close tasks:server-clean-out-old-records-for-run-id (lambda ()(tasks:open-db area-dat)) run-id " client:setup (server-dat=#f)")
;; 				(server:try-running run-id)))
;; 			  (thread-sleep! 10) ;; give server a little time to start up
;; 			  (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))))

;; Do all the connection work, look up the transport type and set up the
;; connection if required.
;;
;; There are two scenarios. 
;;   1. We are a test manager and we received transport-type and (common:get-remote remote) via cmdline
;;   2. We are a run tests, list runs or other interactive process and we must figure out
;;      transport-type and (common:get-remote remote) from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove (common:get-remote remote) stuff
;;
(define (client:setup-http run-id area-dat #!key (remaining-tries 10) (failed-connects 0)(remote #f))
  (debug:print-info 2 "client:setup remaining-tries=" remaining-tries)
  (let* ((tdbdat         (tasks:open-db area-dat))
	 (transport-type (megatest:area-transport area-dat)))
    (if (<= remaining-tries 0)
	(begin
	  (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	  (exit 1))
	(let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
	  (debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	  (if server-dat
	      (let* ((iface     (tasks:hostinfo-get-interface server-dat))
		     (hostname  (tasks:hostinfo-get-hostname  server-dat))
		     (port      (tasks:hostinfo-get-port      server-dat))
		     (start-res (case transport-type
				  ((http)(http-transport:client-connect iface port))
				  ((nmsg)(nmsg-transport:client-connect hostname port))))
		     (ping-res  (case transport-type 
				  ((http)(rmt:login-no-auto-client-setup start-res run-id))
				  ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id)))
 					   (if logininfo
 					       (car (vector-ref logininfo 1))
 					       #f))))))
		(if (and start-res
			 ping-res)
		    (begin
		      (common:set-remote! remote run-id start-res)
		      (debug:print-info 2 "connected to " (http-transport:server-dat-make-url start-res))
		      start-res)
		    (begin    ;; login failed but have a server record, clean out the record and try again
		      (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
		      (case transport-type 
			((http)(http-transport:close-connections run-id)))
		      (common:del-remote! remote run-id)
		      (tasks:kill-server-run-id run-id)
		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
							   run-id 
							   (tasks:hostinfo-get-interface server-dat)
							   (tasks:hostinfo-get-port      server-dat)
							   " client:setup (server-dat = #t)")
		      (if (> remaining-tries 8)
			  (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
			  (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time
		      (server:try-running run-id)
		      (thread-sleep! 5)   ;; give server a little time to start up
		      (client:setup run-id area-dat remaining-tries: (- remaining-tries 1))
		      )))
	      (begin    ;; no server registered
		(let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id)))
		  (debug:print-info 0 "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
		  (if (< num-available 2)
		      (server:try-running run-id))
		  (thread-sleep! (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
		  (client:setup run-id area-dat remaining-tries: (- remaining-tries 1)))))))))

;; keep this as a function to ease future 
(define (client:start run-id server-info)
  (http-transport:client-connect (tasks:hostinfo-get-interface server-info)
				 (tasks:hostinfo-get-port server-info)))

;; client:signal-handler

Modified dashboard-guimonitor.scm from [886a02dbe2] to [c023768ca3].

173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
                 (iup:attribute-set! tabtop "TABTITLE1" "Collateral")
                 (iup:attribute-set! tabtop "TABTITLE2" "Fossil")
                 (iup:attribute-set! tabtop "TABTITLE3" "Tools")
                 tabtop))))

;; BUG: Remember to re-instate this!!!!
;; (on-exit (lambda ()
;; 	   (let ((tdb (tasks:open-db)))
;; 	     ;; (print "On-exit called")
;; 	     (tasks:remove-monitor-record tdb)
;; 	     (sqlite3:finalize! tdb))))

(define (gui-monitor db)
  (let ((keys (db:get-keys db))
	(tdb  (tasks:open-db)))
    (tasks:register-monitor db tdb) ;;; let the other monitors know we are here
    (control-panel db tdb keys)
    ;(tasks:remove-monitor-record db)
    ;(sqlite3:finalize! db)
   ))
	







|




|

|






173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
                 (iup:attribute-set! tabtop "TABTITLE1" "Collateral")
                 (iup:attribute-set! tabtop "TABTITLE2" "Fossil")
                 (iup:attribute-set! tabtop "TABTITLE3" "Tools")
                 tabtop))))

;; BUG: Remember to re-instate this!!!!
;; (on-exit (lambda ()
;; 	   (let ((tdb (tasks:open-db area-dat)))
;; 	     ;; (print "On-exit called")
;; 	     (tasks:remove-monitor-record tdb)
;; 	     (sqlite3:finalize! tdb))))

(define (gui-monitor db area-dat)
  (let ((keys (db:get-keys db))
	(tdb  (tasks:open-db area-dat)))
    (tasks:register-monitor db tdb) ;;; let the other monitors know we are here
    (control-panel db tdb keys)
    ;(tasks:remove-monitor-record db)
    ;(sqlite3:finalize! db)
   ))
	

Modified dashboard-tests.scm from [8072d2165f] to [d0c66f198a].

416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
	  (exit 1))
	(let* (;; (run-id        (if testdat (db:test-get-run_id testdat) #f))
	       (keydat        (if testdat (db:get-key-val-pairs dbstruct run-id) #f))
	       (rundat        (if testdat (db:get-run-info dbstruct run-id) #f))
	       (runname       (if testdat (db:get-value-by-header (db:get-rows rundat)
								  (db:get-header rundat)
								  "runname") #f))
	       (tdb           (tdb:open-test-db-by-test-id-local dbstruct run-id test-id))
	       ;; These next two are intentional bad values to ensure errors if they should not
	       ;; get filled in properly.
	       (logfile       "/this/dir/better/not/exist")
	       (rundir        (if testdat 
				  (db:test-get-rundir testdat)
				  logfile))
	       (testdat-path  (conc rundir "/testdat.db")) ;; this gets recalculated until found 







|







416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
	  (exit 1))
	(let* (;; (run-id        (if testdat (db:test-get-run_id testdat) #f))
	       (keydat        (if testdat (db:get-key-val-pairs dbstruct run-id) #f))
	       (rundat        (if testdat (db:get-run-info dbstruct run-id) #f))
	       (runname       (if testdat (db:get-value-by-header (db:get-rows rundat)
								  (db:get-header rundat)
								  "runname") #f))
	       (tdb           (tdb:open-test-db-by-test-id-local dbstruct area-dat run-id test-id))
	       ;; These next two are intentional bad values to ensure errors if they should not
	       ;; get filled in properly.
	       (logfile       "/this/dir/better/not/exist")
	       (rundir        (if testdat 
				  (db:test-get-rundir testdat)
				  logfile))
	       (testdat-path  (conc rundir "/testdat.db")) ;; this gets recalculated until found 
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
											      (db:test-data-get-value    x)
											      (db:test-data-get-expected x)
											      (db:test-data-get-tol      x)
											      (db:test-data-get-status   x)
											      (db:test-data-get-units    x)
											      (db:test-data-get-type     x)
											      (db:test-data-get-comment  x)))
										    (tdb:open-run-close-db-by-test-id-local dbstruct run-id test-id #f tdb:read-test-data test-id "%")))
									      "\n")))
							       (if (not (equal? currval newval))
								   (iup:attribute-set! test-data "VALUE" newval ))))) ;; "TITLE" newval)))))
					  test-data))
				       ;;(dashboard:run-controls)
				       )))
				 (iup:attribute-set! tabs "TABTITLE0" "Steps")







|







693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
											      (db:test-data-get-value    x)
											      (db:test-data-get-expected x)
											      (db:test-data-get-tol      x)
											      (db:test-data-get-status   x)
											      (db:test-data-get-units    x)
											      (db:test-data-get-type     x)
											      (db:test-data-get-comment  x)))
										    (tdb:open-run-close-db-by-test-id-local dbstruct area-dat run-id test-id #f tdb:read-test-data test-id "%")))
									      "\n")))
							       (if (not (equal? currval newval))
								   (iup:attribute-set! test-data "VALUE" newval ))))) ;; "TITLE" newval)))))
					  test-data))
				       ;;(dashboard:run-controls)
				       )))
				 (iup:attribute-set! tabs "TABTITLE0" "Steps")

Modified db.scm from [69b8627abb] to [4816354319].

647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
	 (mtdb     (if toppath (db:open-megatest-db area-dat)))
	 (allow-cleanup (if run-ids #f #t))
	 (run-ids  (if run-ids 
		       run-ids
		       (if toppath (begin
				     (db:delay-if-busy mtdb area-dat)
				     (db:get-all-run-ids mtdb)))))
	 (tdbdat  (tasks:open-db))
	 (servers (tasks:get-all-servers (db:delay-if-busy tdbdat area-dat))))
    
    ;; kill servers
    (if (member 'killservers options)
	(for-each
	 (lambda (server)
	   (tasks:server-delete-record (db:delay-if-busy tdbdat area-dat) (vector-ref server 0) "dbmigration")







|







647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
	 (mtdb     (if toppath (db:open-megatest-db area-dat)))
	 (allow-cleanup (if run-ids #f #t))
	 (run-ids  (if run-ids 
		       run-ids
		       (if toppath (begin
				     (db:delay-if-busy mtdb area-dat)
				     (db:get-all-run-ids mtdb)))))
	 (tdbdat  (tasks:open-db area-dat))
	 (servers (tasks:get-all-servers (db:delay-if-busy tdbdat area-dat))))
    
    ;; kill servers
    (if (member 'killservers options)
	(for-each
	 (lambda (server)
	   (tasks:server-delete-record (db:delay-if-busy tdbdat area-dat) (vector-ref server 0) "dbmigration")

Modified dcommon.scm from [203653f6f0] to [1f00e0f1a9].

510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
    (updater)
    (set! dashboard:update-summary-tab updater)
    (iup:attribute-set! stats-matrix "WIDTHDEF" "40")
    (iup:vbox
     ;; (iup:label "Run statistics"  #:expand "HORIZONTAL")
     stats-matrix)))

(define (dcommon:servers-table)
  (let* ((tdbdat         (tasks:open-db))
	 (colnum         0)
	 (rownum         0)
	 (servers-matrix (iup:matrix #:expand "YES"
				     #:numcol 7
				     #:numcol-visible 7
				     #:numlin-visible 5
				     ))







|
|







510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
    (updater)
    (set! dashboard:update-summary-tab updater)
    (iup:attribute-set! stats-matrix "WIDTHDEF" "40")
    (iup:vbox
     ;; (iup:label "Run statistics"  #:expand "HORIZONTAL")
     stats-matrix)))

(define (dcommon:servers-table area-dat)
  (let* ((tdbdat         (tasks:open-db area-dat))
	 (colnum         0)
	 (rownum         0)
	 (servers-matrix (iup:matrix #:expand "YES"
				     #:numcol 7
				     #:numcol-visible 7
				     #:numlin-visible 5
				     ))

Modified http-transport.scm from [6df4b07013] to [d8101c25db].

67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
	 (db              #f) ;;        (open-db)) ;; we don't want the server to be opening and closing the db unnecesarily
	 (hostname        (get-host-name))
	 (ipaddrstr       (let ((ipstr (if (string=? "-" hostn)
					   ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
					   (server:get-best-guess-address hostname)
					   #f)))
			    (if ipstr ipstr hostn))) ;; hostname))) 
	 (start-port      (portlogger:open-run-close portlogger:find-port))
	 (link-tree-path  (configf:lookup configdat "setup" "linktree")))
    ;; (set! db *inmemdb*)
    (debug:print-info 0 "portlogger recommended port: " start-port)
    (root-path     (if link-tree-path 
		       link-tree-path
		       (current-directory))) ;; WARNING: SECURITY HOLE. FIX ASAP!
    (handle-directory spiffy-directory-listing)







|







67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
	 (db              #f) ;;        (open-db)) ;; we don't want the server to be opening and closing the db unnecesarily
	 (hostname        (get-host-name))
	 (ipaddrstr       (let ((ipstr (if (string=? "-" hostn)
					   ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
					   (server:get-best-guess-address hostname)
					   #f)))
			    (if ipstr ipstr hostn))) ;; hostname))) 
	 (start-port      (portlogger:open-run-close portlogger:find-port area-dat))
	 (link-tree-path  (configf:lookup configdat "setup" "linktree")))
    ;; (set! db *inmemdb*)
    (debug:print-info 0 "portlogger recommended port: " start-port)
    (root-path     (if link-tree-path 
		       link-tree-path
		       (current-directory))) ;; WARNING: SECURITY HOLE. FIX ASAP!
    (handle-directory spiffy-directory-listing)
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
     (begin
       (print-error-message exn)
       (if (< portnum 64000)
	   (begin 
	     (debug:print 0 "WARNING: attempt to start server failed. Trying again ...")
	     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	     (debug:print 0 "exn=" (condition->list exn))
	     (portlogger:open-run-close portlogger:set-failed portnum)
	     (debug:print 0 "WARNING: failed to start on portnum: " portnum ", trying next port")
	     (thread-sleep! 0.1)

	     ;; get_next_port goes here
	     (http-transport:try-start-server run-id
					      ipaddrstr
					      (portlogger:open-run-close portlogger:find-port)
					      server-id
					      area-dat))
	   (begin
	     (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server")
	     (print "ERROR: Tried and tried but could not start the server"))))
     ;; any error in following steps will result in a retry
     (set! *server-info* (list ipaddrstr portnum))







|






|







130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
     (begin
       (print-error-message exn)
       (if (< portnum 64000)
	   (begin 
	     (debug:print 0 "WARNING: attempt to start server failed. Trying again ...")
	     (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
	     (debug:print 0 "exn=" (condition->list exn))
	     (portlogger:open-run-close portlogger:set-failed area-dat portnum)
	     (debug:print 0 "WARNING: failed to start on portnum: " portnum ", trying next port")
	     (thread-sleep! 0.1)

	     ;; get_next_port goes here
	     (http-transport:try-start-server run-id
					      ipaddrstr
					      (portlogger:open-run-close portlogger:find-port area-dat)
					      server-id
					      area-dat))
	   (begin
	     (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server")
	     (print "ERROR: Tried and tried but could not start the server"))))
     ;; any error in following steps will result in a retry
     (set! *server-info* (list ipaddrstr portnum))
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
	 (api-req      (make-request method: 'POST uri: api-uri))
	 (server-dat   (vector iface port api-uri api-url api-req (current-seconds))))
    server-dat))

;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (http-transport:keep-running server-id run-id)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (debug:print-info 0 "Starting the sync-back, keep alive thread in server for run-id=" run-id)
  (let* ((tdbdat      (tasks:open-db))
	 (server-info (let loop ((start-time (current-seconds))
				 (changed    #t)
				 (last-sdat  "not this"))
                        (let ((sdat #f))
			  (thread-sleep! 0.01)
			  (debug:print-info 0 "Waiting for server alive signature")
                          (mutex-lock! *heartbeat-mutex*)







|




|







363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
	 (api-req      (make-request method: 'POST uri: api-uri))
	 (server-dat   (vector iface port api-uri api-url api-req (current-seconds))))
    server-dat))

;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (http-transport:keep-running server-id run-id area-dat)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (debug:print-info 0 "Starting the sync-back, keep alive thread in server for run-id=" run-id)
  (let* ((tdbdat      (tasks:open-db area-dat))
	 (server-info (let loop ((start-time (current-seconds))
				 (changed    #t)
				 (last-sdat  "not this"))
                        (let ((sdat #f))
			  (thread-sleep! 0.01)
			  (debug:print-info 0 "Waiting for server alive signature")
                          (mutex-lock! *heartbeat-mutex*)
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
		(sync-time  #f)
		(rem-time   #f))
	    ;; inmemdb is a dbstruct
	    (condition-case
	     (db:sync-touched *inmemdb* *run-id* force-sync: #t)
	     ((sync-failed)(cond
			    ((> bad-sync-count 10) ;; time to give up
			     (http-transport:server-shutdown server-id port))
			    (else ;; (> bad-sync-count 0)  ;; we've had a fail or two, delay and loop
			     (thread-sleep! 5)
			     (loop count server-state (+ bad-sync-count 1)))))
	     ((exn)
	      (debug:print 0 "ERROR: error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server")
	      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running crashed")
	      (exit)))







|







411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
		(sync-time  #f)
		(rem-time   #f))
	    ;; inmemdb is a dbstruct
	    (condition-case
	     (db:sync-touched *inmemdb* *run-id* force-sync: #t)
	     ((sync-failed)(cond
			    ((> bad-sync-count 10) ;; time to give up
			     (http-transport:server-shutdown server-id port area-dat))
			    (else ;; (> bad-sync-count 0)  ;; we've had a fail or two, delay and loop
			     (thread-sleep! 5)
			     (loop count server-state (+ bad-sync-count 1)))))
	     ((exn)
	      (debug:print 0 "ERROR: error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server")
	      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running crashed")
	      (exit)))
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
		      (set! *inmemdb*  (db:setup run-id))
		      ;; force initialization
		      ;; (db:get-db *inmemdb* #t)
		      (db:get-db *inmemdb* run-id)
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running"))
		    (begin ;; gotta exit nicely
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision")
		      (http-transport:server-shutdown server-id port))))))
      
      (if (< count 1) ;; 3x3 = 9 secs aprox
	  (loop (+ count 1) 'running bad-sync-count))
      
      ;; Check that iface and port have not changed (can happen if server port collides)
      (mutex-lock! *heartbeat-mutex*)
      (set! sdat *server-info*)







|







444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
		      (set! *inmemdb*  (db:setup run-id))
		      ;; force initialization
		      ;; (db:get-db *inmemdb* #t)
		      (db:get-db *inmemdb* run-id)
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running"))
		    (begin ;; gotta exit nicely
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision")
		      (http-transport:server-shutdown server-id port area-dat))))))
      
      (if (< count 1) ;; 3x3 = 9 secs aprox
	  (loop (+ count 1) 'running bad-sync-count))
      
      ;; Check that iface and port have not changed (can happen if server port collides)
      (mutex-lock! *heartbeat-mutex*)
      (set! sdat *server-info*)
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
	    ;; Consider implementing some smarts here to re-insert the record or kill self is
	    ;; the db indicates so
	    ;;
	    ;; (if (tasks:server-am-i-the-server? tdb run-id)
	    ;;     (tasks:server-set-state! tdb server-id "running"))
	    ;;
	    (loop 0 server-state bad-sync-count))
	  (http-transport:server-shutdown server-id port)))))

(define (http-transport:server-shutdown server-id port)
  (let ((tdbdat (tasks:open-db)))
    (debug:print-info 0 "Starting to shutdown the server.")
    ;; need to delete only *my* server entry (future use)
    (set! *time-to-exit* #t)
    (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t))
    ;;
    ;; start_shutdown
    ;;
    (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down")
    (portlogger:open-run-close portlogger:set-port port "released")
    (thread-sleep! 5)
    (debug:print-info 0 "Max cached queries was    " *max-cache-size*)
    (debug:print-info 0 "Number of cached writes   " *number-of-writes*)
    (debug:print-info 0 "Average cached write time "
		      (if (eq? *number-of-writes* 0)
			  "n/a (no writes)"
			  (/ *writes-total-delay*







|

|
|








|







485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
	    ;; Consider implementing some smarts here to re-insert the record or kill self is
	    ;; the db indicates so
	    ;;
	    ;; (if (tasks:server-am-i-the-server? tdb run-id)
	    ;;     (tasks:server-set-state! tdb server-id "running"))
	    ;;
	    (loop 0 server-state bad-sync-count))
	  (http-transport:server-shutdown server-id port area-dat)))))

(define (http-transport:server-shutdown server-id port area-dat)
  (let ((tdbdat (tasks:open-db area-dat)))
    (debug:print-info 0 "Starting to shutdown the server.")
    ;; need to delete only *my* server entry (future use)
    (set! *time-to-exit* #t)
    (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t))
    ;;
    ;; start_shutdown
    ;;
    (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down")
    (portlogger:open-run-close portlogger:set-port area-dat port "released")
    (thread-sleep! 5)
    (debug:print-info 0 "Max cached queries was    " *max-cache-size*)
    (debug:print-info 0 "Number of cached writes   " *number-of-writes*)
    (debug:print-info 0 "Average cached write time "
		      (if (eq? *number-of-writes* 0)
			  "n/a (no writes)"
			  (/ *writes-total-delay*
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id area-dat)
  (let* ((tdbdat (tasks:open-db)))
    (set! *run-id*   run-id)
    (if (args:get-arg "-daemonize")
	(begin
	  (daemon:ize)
	  (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
	      (begin
		(current-error-port *alt-log-file*)
		(current-output-port *alt-log-file*)))))
    (if (server:check-if-running run-id)
	(begin
	  (debug:print 0 "INFO: Server for run-id " run-id " already running")
	  (exit 0)))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
	       (remtries  4))
      (if (not server-id)
	  (if (> remtries 0)
	      (begin
		(thread-sleep! 2)
		(loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)
		      (- remtries 1)))
	      (begin
		;; since we didn't get the server lock we are going to clean up and bail out
		(debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
		(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch")
		))
	  (let* ((th2 (make-thread (lambda ()
				     (debug:print-info 0 "Server run thread started")
				     (http-transport:run 
				      (if (args:get-arg "-server")
					  (args:get-arg "-server")
					  "-")
				      run-id
				      server-id
				      area-dat)) "Server run"))
		 (th3 (make-thread (lambda ()
				     (debug:print-info 0 "Server monitor thread started")
				     (http-transport:keep-running server-id run-id))
				   "Keep running")))
	    (thread-start! th2)
	    (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor.
	    (thread-start! th3)
	    (set! *didsomething* #t)
	    (thread-join! th2)
	    (exit))))))







|








|



|





|

















|







523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id area-dat)
  (let* ((tdbdat (tasks:open-db area-dat)))
    (set! *run-id*   run-id)
    (if (args:get-arg "-daemonize")
	(begin
	  (daemon:ize)
	  (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
	      (begin
		(current-error-port *alt-log-file*)
		(current-output-port *alt-log-file*)))))
    (if (server:check-if-running run-id area-dat)
	(begin
	  (debug:print 0 "INFO: Server for run-id " run-id " already running")
	  (exit 0)))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id area-dat))
	       (remtries  4))
      (if (not server-id)
	  (if (> remtries 0)
	      (begin
		(thread-sleep! 2)
		(loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id area-dat)
		      (- remtries 1)))
	      (begin
		;; since we didn't get the server lock we are going to clean up and bail out
		(debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
		(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch")
		))
	  (let* ((th2 (make-thread (lambda ()
				     (debug:print-info 0 "Server run thread started")
				     (http-transport:run 
				      (if (args:get-arg "-server")
					  (args:get-arg "-server")
					  "-")
				      run-id
				      server-id
				      area-dat)) "Server run"))
		 (th3 (make-thread (lambda ()
				     (debug:print-info 0 "Server monitor thread started")
				     (http-transport:keep-running server-id run-id area-dat))
				   "Keep running")))
	    (thread-start! th2)
	    (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor.
	    (thread-start! th3)
	    (set! *didsomething* #t)
	    (thread-join! th2)
	    (exit))))))

Modified launch.scm from [c48e97b3ce] to [b6be267d56].

872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
	  (create-directory work-area #t)
	  (debug:print 0 "WARNING: No disk work area specified - running in the test directory under tmp_run")))
    (set! cmdparms (base64:base64-encode 
		    (z3:encode-buffer 
		     (with-output-to-string
		       (lambda () ;; (list 'hosts     hosts)
			 (write (list (list 'testpath  test-path)
				      (list 'transport (conc (megatest:area-transport area-dat))) ;;; *transport-type*))
				      ;; (list 'serverinf *server-info*)
				      (list 'toppath   toppath)
				      (list 'work-area work-area)
				      (list 'test-name test-name) 
				      (list 'runscript runscript) 
				      (list 'run-id    run-id   )
				      (list 'test-id   test-id  )







|







872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
	  (create-directory work-area #t)
	  (debug:print 0 "WARNING: No disk work area specified - running in the test directory under tmp_run")))
    (set! cmdparms (base64:base64-encode 
		    (z3:encode-buffer 
		     (with-output-to-string
		       (lambda () ;; (list 'hosts     hosts)
			 (write (list (list 'testpath  test-path)
				      (list 'transport (conc (megatest:area-transport area-dat))) ;;
				      ;; (list 'serverinf *server-info*)
				      (list 'toppath   toppath)
				      (list 'work-area work-area)
				      (list 'test-name test-name) 
				      (list 'runscript runscript) 
				      (list 'run-id    run-id   )
				      (list 'test-id   test-id  )

Modified megatest.scm from [74462f253a] to [f59b483f32].

649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
    ;; Server? Start up here.
    ;;
    (let ((tl        (launch:setup-for-run *area-dat*))
	  (run-id    (and (args:get-arg "-run-id")
			  (string->number (args:get-arg "-run-id")))))
      (if run-id
	  (begin
	    (server:launch run-id)
	    (set! *didsomething* #t))
	  (debug:print 0 "ERROR: server requires run-id be specified with -run-id")))

    ;; Not a server? This section will decide how to communicate
    ;;
    ;;  Setup client for all expect listed here
    (if (null? (lset-intersection 







|







649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
    ;; Server? Start up here.
    ;;
    (let ((tl        (launch:setup-for-run *area-dat*))
	  (run-id    (and (args:get-arg "-run-id")
			  (string->number (args:get-arg "-run-id")))))
      (if run-id
	  (begin
	    (server:launch run-id *area-dat*)
	    (set! *didsomething* #t))
	  (debug:print 0 "ERROR: server requires run-id be specified with -run-id")))

    ;; Not a server? This section will decide how to communicate
    ;;
    ;;  Setup client for all expect listed here
    (if (null? (lset-intersection 
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
;; MAY STILL NEED THIS
;;		       (set! *megatest-db* (make-dbr:dbstruct path: toppath local: #t))))))))))

(if (or (args:get-arg "-list-servers")
	(args:get-arg "-stop-server"))
    (let ((tl (launch:setup-for-run *area-dat*)))
      (if tl 
	  (let* ((tdbdat  (tasks:open-db))
		 (servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))
		 (fmtstr  "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n")
		 (servers-to-kill '())
		 (killinfo   (args:get-arg "-stop-server"))
		 (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f))
		 (sid        (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f)))
	    (format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface:OutPort" "InPort" "LastBeat" "State" "Transport")







|







686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
;; MAY STILL NEED THIS
;;		       (set! *megatest-db* (make-dbr:dbstruct path: toppath local: #t))))))))))

(if (or (args:get-arg "-list-servers")
	(args:get-arg "-stop-server"))
    (let ((tl (launch:setup-for-run *area-dat*)))
      (if tl 
	  (let* ((tdbdat  (tasks:open-db *area-dat*))
		 (servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))
		 (fmtstr  "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n")
		 (servers-to-kill '())
		 (killinfo   (args:get-arg "-stop-server"))
		 (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f))
		 (sid        (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f)))
	    (format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface:OutPort" "InPort" "LastBeat" "State" "Transport")
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
   "-runtests" 
   "run a test" 
   (lambda (target runname keys keyvals)
     ;;
     ;; May or may not implement it this way ...
     ;;
     ;; Insert this run into the tasks queue
     ;; (open-run-close tasks:add tasks:open-db 
     ;;    	     "runtests" 
     ;;    	     user
     ;;    	     target
     ;;    	     runname
     ;;    	     (args:get-arg "-runtests")
     ;;    	     #f))))
     (runs:run-tests target







|







1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
   "-runtests" 
   "run a test" 
   (lambda (target runname keys keyvals)
     ;;
     ;; May or may not implement it this way ...
     ;;
     ;; Insert this run into the tasks queue
     ;; (open-run-close tasks:add (lambda ()(tasks:open-db *area-dat*))
     ;;    	     "runtests" 
     ;;    	     user
     ;;    	     target
     ;;    	     runname
     ;;    	     (args:get-arg "-runtests")
     ;;    	     #f))))
     (runs:run-tests target

Modified nmsg-transport.scm from [43df6f5458] to [fa92993bb8].

61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142

;;======================================================================
;; S E R V E R
;;======================================================================

(define (nmsg-transport:run dbstruct area-dat hostn run-id server-id #!key (retrynum 1000))
  (debug:print 2 "Attempting to start the server ...")
  (let* ((start-port      (portlogger:open-run-close portlogger:find-port))
	 (server-thread   (make-thread (lambda ()
					 (nmsg-transport:try-start-server dbstruct run-id start-port server-id))
				       "server thread"))
	 (tdbdat          (tasks:open-db)))
    (thread-start! server-thread)
    (thread-sleep! 0.1)
    (if (nmsg-transport:ping hostn start-port timeout: 2 expected-key: (current-process-id))
	(let ((interface (if (equal? hostn "-")(get-host-name) hostn)))
	  (tasks:server-set-interface-port (db:delay-if-busy tdbdat) server-id interface start-port)
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
	  (set! *server-info* (list hostn start-port)) ;; probably not needed anymore? currently used by keep-running
	  (thread-sleep! 3) ;; give some margin for queries to complete before switching from file based access to server based access
	  ;; (set! *inmemdb*  dbstruct)
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")
	  (thread-start! (make-thread
			  (lambda ()(nmsg-transport:keep-running server-id run-id area-dat))
			  "keep running"))
	  (thread-join! server-thread))
	(if (> retrynum 0)
	    (begin
	      (debug:print 0 "WARNING: Failed to connect to server (self) on host " hostn ":" start-port ", trying again.")
	      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature")
	      (portlogger:open-run-close portlogger:set-failed start-port)
	      (nmsg-transport:run dbstruct area-dat hostn run-id server-id))
	    (begin
	      (debug:print 0 "ERROR: could not find an open port to start server on. Giving up")
	      (exit 1))))))

(define (nmsg-transport:try-start-server dbstruct run-id portnum server-id)
  (let ((repsoc (nn-socket 'rep)))
    (nn-bind repsoc (conc "tcp://*:" portnum))
    (let loop ((msg-in (nn-recv repsoc)))
      (let* ((dat    (db:string->obj msg-in transport: 'nmsg)))
	(debug:print 0 "server, received: " dat)
	(let ((result (api:execute-requests dbstruct dat)))
	  (debug:print 0 "server, sending: " result)
	  (nn-send repsoc (db:obj->string result  transport: 'nmsg)))
	(loop (nn-recv repsoc))))))

;; all routes though here end in exit ...
;;
(define (nmsg-transport:launch run-id area-dat)
  (let* ((tdbdat   (tasks:open-db))
	 (dbstruct (db:setup run-id))
	 (hostn    (or (args:get-arg "-server") "-")))
    (set! *run-id*   run-id)
    (set! *inmemdb* dbstruct)
    ;; with nbfake daemonize isn't really needed
    ;;
    ;; (if (args:get-arg "-daemonize")
    ;;     (begin
    ;;       (daemon:ize)
    ;;       (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
    ;;           (begin
    ;;     	(current-error-port *alt-log-file*)
    ;;     	(current-output-port *alt-log-file*)))))
    (if (server:check-if-running run-id)
	(begin
	  (debug:print-info 0 "Server for run-id " run-id " already running")
	  (exit 0)))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
	       (remtries  4))
      (if (not server-id)
	  (if (> remtries 0)
	      (begin
		(thread-sleep! 2)
		(if (not (server:check-if-running run-id))
		    (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)
			  (- remtries 1))
		    (begin
		      (debug:print-info 0 "Another server took the slot, exiting")
		      (exit 0))))
	      (begin
		;; since we didn't get the server lock we are going to clean up and bail out







|



|


















|



















|













|









|







61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142

;;======================================================================
;; S E R V E R
;;======================================================================

(define (nmsg-transport:run dbstruct area-dat hostn run-id server-id #!key (retrynum 1000))
  (debug:print 2 "Attempting to start the server ...")
  (let* ((start-port      (portlogger:open-run-close portlogger:find-port area-dat))
	 (server-thread   (make-thread (lambda ()
					 (nmsg-transport:try-start-server dbstruct run-id start-port server-id))
				       "server thread"))
	 (tdbdat          (tasks:open-db area-dat)))
    (thread-start! server-thread)
    (thread-sleep! 0.1)
    (if (nmsg-transport:ping hostn start-port timeout: 2 expected-key: (current-process-id))
	(let ((interface (if (equal? hostn "-")(get-host-name) hostn)))
	  (tasks:server-set-interface-port (db:delay-if-busy tdbdat) server-id interface start-port)
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
	  (set! *server-info* (list hostn start-port)) ;; probably not needed anymore? currently used by keep-running
	  (thread-sleep! 3) ;; give some margin for queries to complete before switching from file based access to server based access
	  ;; (set! *inmemdb*  dbstruct)
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")
	  (thread-start! (make-thread
			  (lambda ()(nmsg-transport:keep-running server-id run-id area-dat))
			  "keep running"))
	  (thread-join! server-thread))
	(if (> retrynum 0)
	    (begin
	      (debug:print 0 "WARNING: Failed to connect to server (self) on host " hostn ":" start-port ", trying again.")
	      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature")
	      (portlogger:open-run-close portlogger:set-failed area-dat start-port)
	      (nmsg-transport:run dbstruct area-dat hostn run-id server-id))
	    (begin
	      (debug:print 0 "ERROR: could not find an open port to start server on. Giving up")
	      (exit 1))))))

(define (nmsg-transport:try-start-server dbstruct run-id portnum server-id)
  (let ((repsoc (nn-socket 'rep)))
    (nn-bind repsoc (conc "tcp://*:" portnum))
    (let loop ((msg-in (nn-recv repsoc)))
      (let* ((dat    (db:string->obj msg-in transport: 'nmsg)))
	(debug:print 0 "server, received: " dat)
	(let ((result (api:execute-requests dbstruct dat)))
	  (debug:print 0 "server, sending: " result)
	  (nn-send repsoc (db:obj->string result  transport: 'nmsg)))
	(loop (nn-recv repsoc))))))

;; all routes though here end in exit ...
;;
(define (nmsg-transport:launch run-id area-dat)
  (let* ((tdbdat   (tasks:open-db area-dat))
	 (dbstruct (db:setup run-id))
	 (hostn    (or (args:get-arg "-server") "-")))
    (set! *run-id*   run-id)
    (set! *inmemdb* dbstruct)
    ;; with nbfake daemonize isn't really needed
    ;;
    ;; (if (args:get-arg "-daemonize")
    ;;     (begin
    ;;       (daemon:ize)
    ;;       (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
    ;;           (begin
    ;;     	(current-error-port *alt-log-file*)
    ;;     	(current-output-port *alt-log-file*)))))
    (if (server:check-if-running run-id area-dat)
	(begin
	  (debug:print-info 0 "Server for run-id " run-id " already running")
	  (exit 0)))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
	       (remtries  4))
      (if (not server-id)
	  (if (> remtries 0)
	      (begin
		(thread-sleep! 2)
		(if (not (server:check-if-running run-id area-dat))
		    (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)
			  (- remtries 1))
		    (begin
		      (debug:print-info 0 "Another server took the slot, exiting")
		      (exit 0))))
	      (begin
		;; since we didn't get the server lock we are going to clean up and bail out
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
				sdat)
                              (begin
                                (thread-sleep! 0.5)
                                (loop))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (tdbdat      (tasks:open-db))
	 (server-timeout (let ((tmo (configf:lookup  (megatest:area-configdat area-dat) "server" "timeout")))
			   (if (and (string? tmo)
				    (string->number tmo))
			       (* 60 60 (string->number tmo))
			       ;; (* 3 24 60 60) ;; default to three days
			       (* 60 1)         ;; default to one minute
			       ;; (* 60 60 25)      ;; default to 25 hours







|







269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
				sdat)
                              (begin
                                (thread-sleep! 0.5)
                                (loop))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (tdbdat      (tasks:open-db area-dat))
	 (server-timeout (let ((tmo (configf:lookup  (megatest:area-configdat area-dat) "server" "timeout")))
			   (if (and (string? tmo)
				    (string->number tmo))
			       (* 60 60 (string->number tmo))
			       ;; (* 3 24 60 60) ;; default to three days
			       (* 60 1)         ;; default to one minute
			       ;; (* 60 60 25)      ;; default to 25 hours

Modified olddashboard.scm from [ad835aa81d] to [af152b63b7].

472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
    (updater)
    (set! dashboard:update-summary-tab updater)
    (iup:attribute-set! stats-matrix "WIDTHDEF" "40")
    (iup:vbox
     ;; (iup:label "Run statistics"  #:expand "HORIZONTAL")
     stats-matrix)))

(define (dcommon:servers-table)
  (let* ((tdbdat         (tasks:open-db))
	 (colnum         0)
	 (rownum         0)
	 (servers-matrix (iup:matrix #:expand "YES"
				     #:numcol 7
				     #:numcol-visible 7
				     #:numlin-visible 5
				     ))







|
|







472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
    (updater)
    (set! dashboard:update-summary-tab updater)
    (iup:attribute-set! stats-matrix "WIDTHDEF" "40")
    (iup:vbox
     ;; (iup:label "Run statistics"  #:expand "HORIZONTAL")
     stats-matrix)))

(define (dcommon:servers-table area-dat)
  (let* ((tdbdat         (tasks:open-db area-dat))
	 (colnum         0)
	 (rownum         0)
	 (servers-matrix (iup:matrix #:expand "YES"
				     #:numcol 7
				     #:numcol-visible 7
				     #:numlin-visible 5
				     ))
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
	   (> modtime last-db-update-time)
	   (> (current-seconds)(+ last-db-update-time 1)))))

(define *monitor-db-path* (conc *dbdir* "/monitor.db"))
(define *last-monitor-update-time* 0)

;; Force creation of the db in case it isn't already there.
(tasks:open-db)

(define (dashboard:get-youngest-run-db-mod-time)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn))
     (current-seconds)) ;; something went wrong - just print an error and return current-seconds







|







2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
	   (> modtime last-db-update-time)
	   (> (current-seconds)(+ last-db-update-time 1)))))

(define *monitor-db-path* (conc *dbdir* "/monitor.db"))
(define *last-monitor-update-time* 0)

;; Force creation of the db in case it isn't already there.
(tasks:open-db area-dat)

(define (dashboard:get-youngest-run-db-mod-time)
  (handle-exceptions
   exn
   (begin
     (debug:print 0 "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn))
     (current-seconds)) ;; something went wrong - just print an error and return current-seconds

Modified portlogger.scm from [bdca7fc363] to [58d53e15b4].

15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72

(declare (unit portlogger))
(declare (uses db))

;; lsof -i


(define (portlogger:open-db fname)
  (let* ((avail    (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away
	 (exists   (file-exists? fname))
	 (db       (if avail 
		       (sqlite3:open-database fname)
		       (begin
			 (system (conc "rm -f " fname))
			 (sqlite3:open-database fname))))
	 (handler  (make-busy-timeout 136000))
	 (canwrite (file-write-access? fname)))
	 ;; (db-init  (lambda ()
	 ;;             (sqlite3:execute 
	 ;;              db
	 ;;              "CREATE TABLE IF NOT EXISTS ports (
         ;;                 port INTEGER PRIMARY KEY,
         ;;                 state TEXT DEFAULT 'not-used',
         ;;                 fail_count INTEGER DEFAULT 0,
         ;;                 update_time TIMESTAMP DEFAULT (strftime('%s','now')) );"))))
    (sqlite3:set-busy-handler! db handler)
    (db:set-sync db) ;; (sqlite3:execute db "PRAGMA synchronous = 0;")
    ;; (if (not exists) ;; needed with IF NOT EXISTS?
    (sqlite3:execute 
     db
     "CREATE TABLE IF NOT EXISTS ports (
            port INTEGER PRIMARY KEY,
            state TEXT DEFAULT 'not-used',
            fail_count INTEGER DEFAULT 0,
            update_time TIMESTAMP DEFAULT (strftime('%s','now')) );")
    db))

(define (portlogger:open-run-close proc . params)
  (let* ((fname  (conc "/tmp/." (current-user-name) "-portlogger.db"))
	 (avail  (tasks:wait-on-journal fname 10))) ;; wait up to about 10 seconds for the journal to go away
    (handle-exceptions
     exn
     (begin
       ;; (release-dot-lock fname)
       (debug:print 0 "ERROR: portlogger:open-run-close failed. " proc " " params)
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 "exn=" (condition->list exn))
       (if (file-exists? fname)(delete-file fname)) ;; brutally get rid of it
       (print-call-chain (current-error-port)))
     (let* (;; (lock   (obtain-dot-lock fname 2 9 10))
	    (db     (portlogger:open-db fname))
	    (res    (apply proc db params)))
       (sqlite3:finalize! db)
       ;; (release-dot-lock fname)
       res))))

;; (fold-row PROC INIT DATABASE SQL . PARAMETERS) 
(define (portlogger:take-port db portnum)







|


















|










|












|







15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72

(declare (unit portlogger))
(declare (uses db))

;; lsof -i


(define (portlogger:open-db fname area-dat)
  (let* ((avail    (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away
	 (exists   (file-exists? fname))
	 (db       (if avail 
		       (sqlite3:open-database fname)
		       (begin
			 (system (conc "rm -f " fname))
			 (sqlite3:open-database fname))))
	 (handler  (make-busy-timeout 136000))
	 (canwrite (file-write-access? fname)))
	 ;; (db-init  (lambda ()
	 ;;             (sqlite3:execute 
	 ;;              db
	 ;;              "CREATE TABLE IF NOT EXISTS ports (
         ;;                 port INTEGER PRIMARY KEY,
         ;;                 state TEXT DEFAULT 'not-used',
         ;;                 fail_count INTEGER DEFAULT 0,
         ;;                 update_time TIMESTAMP DEFAULT (strftime('%s','now')) );"))))
    (sqlite3:set-busy-handler! db handler)
    (db:set-sync db area-dat) ;; (sqlite3:execute db "PRAGMA synchronous = 0;")
    ;; (if (not exists) ;; needed with IF NOT EXISTS?
    (sqlite3:execute 
     db
     "CREATE TABLE IF NOT EXISTS ports (
            port INTEGER PRIMARY KEY,
            state TEXT DEFAULT 'not-used',
            fail_count INTEGER DEFAULT 0,
            update_time TIMESTAMP DEFAULT (strftime('%s','now')) );")
    db))

(define (portlogger:open-run-close proc area-dat . params)
  (let* ((fname  (conc "/tmp/." (current-user-name) "-portlogger.db"))
	 (avail  (tasks:wait-on-journal fname 10))) ;; wait up to about 10 seconds for the journal to go away
    (handle-exceptions
     exn
     (begin
       ;; (release-dot-lock fname)
       (debug:print 0 "ERROR: portlogger:open-run-close failed. " proc " " params)
       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
       (debug:print 0 "exn=" (condition->list exn))
       (if (file-exists? fname)(delete-file fname)) ;; brutally get rid of it
       (print-call-chain (current-error-port)))
     (let* (;; (lock   (obtain-dot-lock fname 2 9 10))
	    (db     (portlogger:open-db fname area-dat))
	    (res    (apply proc db params)))
       (sqlite3:finalize! db)
       ;; (release-dot-lock fname)
       res))))

;; (fold-row PROC INIT DATABASE SQL . PARAMETERS) 
(define (portlogger:take-port db portnum)

Modified rmt.scm from [3e0b8a2905] to [9d5589428d].

74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
;;
(define (rmt:get-connection-info run-id area-dat #!key (remote #f))
  (let ((cinfo (common:get-remote remote run-id)))
    (if cinfo
	cinfo
	;; NB// can cache the answer for server running for 10 seconds ...
	;;  ;; (and (not (rmt:write-frequency-over-limit? cmd run-id))
	(if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id)
	    (client:setup run-id remote: remote)
	    #f))))

(define (rmt:discard-old-connections area-dat)
  ;; clean out old connections
  (mutex-lock! *db-multi-sync-mutex*)
  (let ((remote      (megatest:area-remote area-dat))
	(expire-time (- (current-seconds) (server:get-timeout area-dat) 10))) ;; don't forget the 10 second margin







|
|







74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
;;
(define (rmt:get-connection-info run-id area-dat #!key (remote #f))
  (let ((cinfo (common:get-remote remote run-id)))
    (if cinfo
	cinfo
	;; NB// can cache the answer for server running for 10 seconds ...
	;;  ;; (and (not (rmt:write-frequency-over-limit? cmd run-id))
	(if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db area-dat)) run-id)
	    (client:setup run-id area-dat remote: remote)
	    #f))))

(define (rmt:discard-old-connections area-dat)
  ;; clean out old connections
  (mutex-lock! *db-multi-sync-mutex*)
  (let ((remote      (megatest:area-remote area-dat))
	(expire-time (- (current-seconds) (server:get-timeout area-dat) 10))) ;; don't forget the 10 second margin
103
104
105
106
107
108
109

110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
  (mutex-unlock! *db-multi-sync-mutex*))

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

(define (rmt:send-receive cmd rid params area-dat #!key (attemptnum 1)(remote #f)) ;; start attemptnum at 1 so the modulo below works as expected
  (rmt:discard-old-connections area-dat)
  ;; (mutex-lock! *send-receive-mutex*)

  (let* ((run-id          (if rid rid 0))
	 (configdat       (megatest:area-configdat area-dat))
	 (connection-info (rmt:get-connection-info run-id area-dat)))
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (case *transport-type*
			  ((http)(condition-case
				  (http-transport:client-api-send-receive run-id connection-info cmd params)
				  ((commfail)(vector #f "communications fail"))
				  ((exn)(vector #f "other fail"))))
			  ((nmsg)(condition-case
				  (nmsg-transport:client-api-send-receive run-id connection-info cmd params)
				  ((timeout)(vector #f "timeout talking to server"))))
			  (else  (exit))))
	       (success (if (vector? dat) (vector-ref dat 0) #f))
	       (res     (if (vector? dat) (vector-ref dat 1) #f)))
	  (if (vector? connection-info)(http-transport:server-dat-update-last-access connection-info))
	  (if success
	      (begin
		;; (mutex-unlock! *send-receive-mutex*)
		(case *transport-type* 
		  ((http) res) ;; (db:string->obj res))
		  ((nmsg) res))) ;; (vector-ref res 1)))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to rmt:send-receive again.")
		;; (case *transport-type*
		;;   ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info))))
		(common:del-remote! remote run-id) ;; don't keep using the same connection
		;; NOTE: killing server causes this process to block forever. No idea why. Dec 2. 
		;; (if (eq? (modulo attemptnum 5) 0)
		;;     (tasks:kill-server-run-id run-id tag: "api-send-receive-failed"))
		;; (mutex-unlock! *send-receive-mutex*) ;; close the mutex here to allow other threads access to communications
		(tasks:start-and-wait-for-server (tasks:open-db) run-id 15)
		;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1))))))

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
		;; (thread-sleep! 2)
		(rmt:send-receive cmd run-id params area-dat attemptnum: (+ attemptnum 1)))))
	;; no connection info? try to start a server, or access locally if no
	;; server and the query is read-only
	;;
	;; Note: The tasks db was checked for a server in starting mode in the rmt:get-connection-info call
	;;
	(if (and (< attemptnum 15)
		 (member cmd api:write-queries))
	    (let ((faststart (configf:lookup configdat "server" "faststart")))
	      (common:del-remote! remote run-id)
	      ;; (mutex-unlock! *send-receive-mutex*)
	      (if (and faststart (equal? faststart "no"))
		  (begin
		    (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10)
		    (thread-sleep! (random 5)) ;; give some time to settle and minimize collison?
		    (rmt:send-receive cmd rid params area-dat attemptnum: (+ attemptnum 1)))
		  (begin
		    (server:kind-run run-id area-dat)
		    (rmt:open-qry-close-locally cmd run-id params area-dat))))
	    (begin
	      ;; (debug:print 0 "ERROR: Communication failed!")







>
|





|














|











|




















|







103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
  (mutex-unlock! *db-multi-sync-mutex*))

(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

(define (rmt:send-receive cmd rid params area-dat #!key (attemptnum 1)(remote #f)) ;; start attemptnum at 1 so the modulo below works as expected
  (rmt:discard-old-connections area-dat)
  ;; (mutex-lock! *send-receive-mutex*)
  (let* ((transport-type  (megatest:area-transport area-dat))
	 (run-id          (if rid rid 0))
	 (configdat       (megatest:area-configdat area-dat))
	 (connection-info (rmt:get-connection-info run-id area-dat)))
    ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also)
    (if connection-info
	;; use the server if have connection info
	(let* ((dat     (case transport-type
			  ((http)(condition-case
				  (http-transport:client-api-send-receive run-id connection-info cmd params)
				  ((commfail)(vector #f "communications fail"))
				  ((exn)(vector #f "other fail"))))
			  ((nmsg)(condition-case
				  (nmsg-transport:client-api-send-receive run-id connection-info cmd params)
				  ((timeout)(vector #f "timeout talking to server"))))
			  (else  (exit))))
	       (success (if (vector? dat) (vector-ref dat 0) #f))
	       (res     (if (vector? dat) (vector-ref dat 1) #f)))
	  (if (vector? connection-info)(http-transport:server-dat-update-last-access connection-info))
	  (if success
	      (begin
		;; (mutex-unlock! *send-receive-mutex*)
		(case 
		  ((http) res) ;; (db:string->obj res))
		  ((nmsg) res))) ;; (vector-ref res 1)))
	      (begin ;; let ((new-connection-info (client:setup run-id)))
		(debug:print 0 "WARNING: Communication failed, trying call to rmt:send-receive again.")
		;; (case *transport-type*
		;;   ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info))))
		(common:del-remote! remote run-id) ;; don't keep using the same connection
		;; NOTE: killing server causes this process to block forever. No idea why. Dec 2. 
		;; (if (eq? (modulo attemptnum 5) 0)
		;;     (tasks:kill-server-run-id run-id tag: "api-send-receive-failed"))
		;; (mutex-unlock! *send-receive-mutex*) ;; close the mutex here to allow other threads access to communications
		(tasks:start-and-wait-for-server (tasks:open-db area-dat) run-id 15)
		;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1))))))

		;; no longer killing the server in http-transport:client-api-send-receive
		;; may kill it here but what are the criteria?
		;; start with three calls then kill server
		;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id))
		;; (thread-sleep! 2)
		(rmt:send-receive cmd run-id params area-dat attemptnum: (+ attemptnum 1)))))
	;; no connection info? try to start a server, or access locally if no
	;; server and the query is read-only
	;;
	;; Note: The tasks db was checked for a server in starting mode in the rmt:get-connection-info call
	;;
	(if (and (< attemptnum 15)
		 (member cmd api:write-queries))
	    (let ((faststart (configf:lookup configdat "server" "faststart")))
	      (common:del-remote! remote run-id)
	      ;; (mutex-unlock! *send-receive-mutex*)
	      (if (and faststart (equal? faststart "no"))
		  (begin
		    (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db area-dat)) run-id 10)
		    (thread-sleep! (random 5)) ;; give some time to settle and minimize collison?
		    (rmt:send-receive cmd rid params area-dat attemptnum: (+ attemptnum 1)))
		  (begin
		    (server:kind-run run-id area-dat)
		    (rmt:open-qry-close-locally cmd run-id params area-dat))))
	    (begin
	      ;; (debug:print 0 "ERROR: Communication failed!")
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id) area-dat))

(define (rmt:open-test-db-by-test-id run-id test-id area-dat #!key (work-area #f))
  (let* ((test-path (if (string? work-area)
			work-area
			(rmt:test-get-rundir-from-test-id run-id test-id area-dat))))
    (debug:print 3 "TEST PATH: " test-path)
    (open-test-db test-path)))

;; WARNING: This currently bypasses the transaction wrapped writes system
(define (rmt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment area-dat)
  (rmt:send-receive 'test-set-state-status-by-id run-id (list run-id test-id newstate newstatus newcomment) area-dat))

(define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus area-dat)
  (rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus) area-dat))







|







375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id) area-dat))

(define (rmt:open-test-db-by-test-id run-id test-id area-dat #!key (work-area #f))
  (let* ((test-path (if (string? work-area)
			work-area
			(rmt:test-get-rundir-from-test-id run-id test-id area-dat))))
    (debug:print 3 "TEST PATH: " test-path)
    (open-test-db test-path area-dat)))

;; WARNING: This currently bypasses the transaction wrapped writes system
(define (rmt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment area-dat)
  (rmt:send-receive 'test-set-state-status-by-id run-id (list run-id test-id newstate newstatus newcomment) area-dat))

(define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus area-dat)
  (rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus) area-dat))

Modified rpc-transport.scm from [9f0167ff5f] to [75fd7447c7].

35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
   ;;    (apply (eval (string->symbol (conc "remote:" procstr))) params)
   (apply (eval (string->symbol procstr)) params)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (rpc-transport:launch run-id)
  (set! *run-id*   run-id)
  (if (args:get-arg "-daemonize")
      (daemon:ize))
  (if (server:check-if-running run-id)
      (begin
	(debug:print 0 "INFO: Server for run-id " run-id " already running")
	(exit 0)))
  (let loop ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id))
	     (remtries  4))
    (if (not server-id)
	(if (> remtries 0)
	    (begin
	      (thread-sleep! 2)
	      (loop (open-run-close tasks:server-lock-slot tasks:open-db run-id)
		    (- remtries 1)))
	    (begin
	      ;; since we didn't get the server lock we are going to clean up and bail out
	      (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
	      (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db " rpc-transport:launch")))
	(begin
	  (rpc-transport:run (if (args:get-arg "-server")(args:get-arg "-server") "-") run-id server-id)
	  (exit)))))

(define (rpc-transport:run hostn run-id server-id area-dat)
  (debug:print 2 "Attempting to start the rpc server ...")
   ;; (trace rpc:publish-procedure!)

  (rpc:publish-procedure! 'server:login server:login)
  (rpc:publish-procedure! 'testing (lambda () "Just testing"))

  (let* ((configdat       (megatest:area-configdat area-dat))
	 (db              #f)
	 (hostname        (get-host-name))
	 (ipaddrstr       (let ((ipstr (if (string=? "-" hostn)
					   ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
					   (server:get-best-guess-address hostname)
					   #f)))
			    (if ipstr ipstr hostn))) ;; hostname))) 
	 (start-port      (open-run-close tasks:server-get-next-port tasks:open-db))
	 (link-tree-path  (configf:lookup configdat "setup" "linktree"))
	 (rpc:listener    (rpc-transport:find-free-port-and-open (rpc:default-server-port)))
	 (th1             (make-thread
			   (lambda ()
			     ((rpc:make-server rpc:listener) #t))
			   "rpc:server"))
			   ;; (cute (rpc:make-server rpc:listener) "rpc:server")
			   ;; 'rpc:server))
	 (hostname        (if (string=? "-" hostn)
			      (get-host-name) 
			      hostn))
	 (ipaddrstr       (if (string=? "-" hostn)
			      (server:get-best-guess-address hostname) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
			      #f))
	 (portnum         (rpc:default-server-port))
	 (host:port       (conc (if ipaddrstr ipaddrstr hostname) ":" portnum))
	 (tdb             (tasks:open-db)))
    (thread-start! th1)
    (set! db *inmemdb*)
    (open-run-close tasks:server-set-interface-port 
		    tasks:open-db 
		    server-id 
		    ipaddrstr portnum)
    (debug:print 0 "Server started on " host:port)
    
    ;; (trace rpc:publish-procedure!)
    ;; (rpc:publish-procedure! 'server:login server:login)
    ;; (rpc:publish-procedure! 'testing (lambda () "Just testing"))

    ;;======================================================================
    ;;	  ;; end of publish-procedure section
    ;;======================================================================
    ;;
    (on-exit (lambda ()
	       (open-run-close tasks:server-set-state! tasks:open-db server-id "stopped")))

    (set! *rpc:listener* rpc:listener)
    (tasks:server-set-state! tdb server-id "running")
    (set! *inmemdb*  (db:setup run-id))
    ;; if none running or if > 20 seconds since 
    ;; server last used then start shutdown
    (let loop ((count 0))
      (thread-sleep! 5) ;; no need to do this very often
      (let ((numrunning -1)) ;; (db:get-count-tests-running db)))
	(if (or (> numrunning 0)
		(> (+ *last-db-access* 60)(current-seconds)))
	    (begin
	      (debug:print-info 0 "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*))
	      (loop (+ 1 count)))
	    (begin
	      (debug:print-info 0 "Starting to shutdown the server side")
	      (open-run-close tasks:server-delete-record tasks:open-db server-id " rpc-transport:try-start-server stop")
	      (thread-sleep! 10)
	      (debug:print-info 0 "Max cached queries was " *max-cache-size*)
	      (debug:print-info 0 "Server shutdown complete. Exiting")
	      ))))))

(define (rpc-transport:find-free-port-and-open port)
  (handle-exceptions







|



|



|





|




|



















|
















|



|













|
















|







35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
   ;;    (apply (eval (string->symbol (conc "remote:" procstr))) params)
   (apply (eval (string->symbol procstr)) params)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (rpc-transport:launch run-id area-dat)
  (set! *run-id*   run-id)
  (if (args:get-arg "-daemonize")
      (daemon:ize))
  (if (server:check-if-running run-id area-dat)
      (begin
	(debug:print 0 "INFO: Server for run-id " run-id " already running")
	(exit 0)))
  (let loop ((server-id (open-run-close tasks:server-lock-slot (lambda ()(tasks:open-db area-dat))run-id))
	     (remtries  4))
    (if (not server-id)
	(if (> remtries 0)
	    (begin
	      (thread-sleep! 2)
	      (loop (open-run-close tasks:server-lock-slot (lambda ()(tasks:open-db area-dat)) run-id)
		    (- remtries 1)))
	    (begin
	      ;; since we didn't get the server lock we are going to clean up and bail out
	      (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
	      (open-run-close tasks:server-delete-records-for-this-pid (lambda ()(tasks:open-db area-dat)) " rpc-transport:launch")))
	(begin
	  (rpc-transport:run (if (args:get-arg "-server")(args:get-arg "-server") "-") run-id server-id)
	  (exit)))))

(define (rpc-transport:run hostn run-id server-id area-dat)
  (debug:print 2 "Attempting to start the rpc server ...")
   ;; (trace rpc:publish-procedure!)

  (rpc:publish-procedure! 'server:login server:login)
  (rpc:publish-procedure! 'testing (lambda () "Just testing"))

  (let* ((configdat       (megatest:area-configdat area-dat))
	 (db              #f)
	 (hostname        (get-host-name))
	 (ipaddrstr       (let ((ipstr (if (string=? "-" hostn)
					   ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
					   (server:get-best-guess-address hostname)
					   #f)))
			    (if ipstr ipstr hostn))) ;; hostname))) 
	 (start-port      (open-run-close tasks:server-get-next-port (lambda ()(tasks:open-db area-dat))))
	 (link-tree-path  (configf:lookup configdat "setup" "linktree"))
	 (rpc:listener    (rpc-transport:find-free-port-and-open (rpc:default-server-port)))
	 (th1             (make-thread
			   (lambda ()
			     ((rpc:make-server rpc:listener) #t))
			   "rpc:server"))
			   ;; (cute (rpc:make-server rpc:listener) "rpc:server")
			   ;; 'rpc:server))
	 (hostname        (if (string=? "-" hostn)
			      (get-host-name) 
			      hostn))
	 (ipaddrstr       (if (string=? "-" hostn)
			      (server:get-best-guess-address hostname) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
			      #f))
	 (portnum         (rpc:default-server-port))
	 (host:port       (conc (if ipaddrstr ipaddrstr hostname) ":" portnum))
	 (tdb             (tasks:open-db area-dat)))
    (thread-start! th1)
    (set! db *inmemdb*)
    (open-run-close tasks:server-set-interface-port 
		    (lambda ()(tasks:open-db area-dat))
		    server-id 
		    ipaddrstr portnum)
    (debug:print 0 "Server started on " host:port)
    
    ;; (trace rpc:publish-procedure!)
    ;; (rpc:publish-procedure! 'server:login server:login)
    ;; (rpc:publish-procedure! 'testing (lambda () "Just testing"))

    ;;======================================================================
    ;;	  ;; end of publish-procedure section
    ;;======================================================================
    ;;
    (on-exit (lambda ()
	       (open-run-close tasks:server-set-state! (lambda ()(tasks:open-db area-dat)) server-id "stopped")))

    (set! *rpc:listener* rpc:listener)
    (tasks:server-set-state! tdb server-id "running")
    (set! *inmemdb*  (db:setup run-id))
    ;; if none running or if > 20 seconds since 
    ;; server last used then start shutdown
    (let loop ((count 0))
      (thread-sleep! 5) ;; no need to do this very often
      (let ((numrunning -1)) ;; (db:get-count-tests-running db)))
	(if (or (> numrunning 0)
		(> (+ *last-db-access* 60)(current-seconds)))
	    (begin
	      (debug:print-info 0 "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*))
	      (loop (+ 1 count)))
	    (begin
	      (debug:print-info 0 "Starting to shutdown the server side")
	      (open-run-close tasks:server-delete-record (lambda ()(tasks:open-db area-dat)) server-id " rpc-transport:try-start-server stop")
	      (thread-sleep! 10)
	      (debug:print-info 0 "Max cached queries was " *max-cache-size*)
	      (debug:print-info 0 "Server shutdown complete. Exiting")
	      ))))))

(define (rpc-transport:find-free-port-and-open port)
  (handle-exceptions
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
		  (let ((server-dat (list iface port #f #f #f)))
		    (common:set-remote! remote run-id server-dat)
		    server-dat)
		  (begin
		    (server:try-running run-id)
		    (thread-sleep! 2)
		    (rpc-transport:client-setup run-id area-dat remtries: (- remtries 1)))))
 	    (let* ((server-db-info (open-run-close tasks:get-server tasks:open-db run-id)))
 	      (debug:print-info 0 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	      (if server-db-info
 		  (let* ((iface     (tasks:hostinfo-get-interface server-db-info))
 			 (port      (tasks:hostinfo-get-port      server-db-info))
			 (server-dat (list iface port #f #f #f))
 			 (ping-res  ((rpc:procedure 'server:login host port) toppath)))
 		    (if start-res







|







177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
		  (let ((server-dat (list iface port #f #f #f)))
		    (common:set-remote! remote run-id server-dat)
		    server-dat)
		  (begin
		    (server:try-running run-id)
		    (thread-sleep! 2)
		    (rpc-transport:client-setup run-id area-dat remtries: (- remtries 1)))))
 	    (let* ((server-db-info (open-run-close tasks:get-server (lambda ()(tasks:open-db area-dat)) run-id)))
 	      (debug:print-info 0 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	      (if server-db-info
 		  (let* ((iface     (tasks:hostinfo-get-interface server-db-info))
 			 (port      (tasks:hostinfo-get-port      server-db-info))
			 (server-dat (list iface port #f #f #f))
 			 (ping-res  ((rpc:procedure 'server:login host port) toppath)))
 		    (if start-res

Modified runs.scm from [6f4d489558] to [45c9c10cd7].

220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
	 (test-records       (make-hash-table))
	 ;; need to process runconfigs before generating these lists
	 (all-tests-registry #f)  ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names
	 (all-test-names     #f)  ;; (hash-table-keys all-tests-registry))
	 (test-names         #f)  ;; (tests:filter-test-names all-test-names test-patts))
	 (required-tests     #f)  ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work
	 (task-key           (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
	 (tdbdat             (tasks:open-db)))

    (if (tasks:need-server run-id area-dat)(tasks:start-and-wait-for-server tdbdat run-id 10))

    (set-signal-handler! signal/int
			 (lambda (signum)
			   (signal-mask! signum)
			   (print "Received signal " signum ", cleaning up before exit. Please wait...")
			   (let ((tdbdat (tasks:open-db)))
			     (rmt:tasks-set-state-given-param-key task-key "killed"))
			   (print "Killed by signal " signum ". Exiting")
			   (exit)))

    ;; register this run in monitor.db
    (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params)
    (rmt:tasks-set-state-given-param-key task-key "running")







|







|







220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
	 (test-records       (make-hash-table))
	 ;; need to process runconfigs before generating these lists
	 (all-tests-registry #f)  ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names
	 (all-test-names     #f)  ;; (hash-table-keys all-tests-registry))
	 (test-names         #f)  ;; (tests:filter-test-names all-test-names test-patts))
	 (required-tests     #f)  ;;(lset-intersection equal? (string-split test-patts ",") test-names))) ;; test-names)) ;; Added test-names as initial for required-tests but that failed to work
	 (task-key           (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
	 (tdbdat             (tasks:open-db area-dat)))

    (if (tasks:need-server run-id area-dat)(tasks:start-and-wait-for-server tdbdat run-id 10))

    (set-signal-handler! signal/int
			 (lambda (signum)
			   (signal-mask! signum)
			   (print "Received signal " signum ", cleaning up before exit. Please wait...")
			   (let ((tdbdat (tasks:open-db area-dat)))
			     (rmt:tasks-set-state-given-param-key task-key "killed"))
			   (print "Killed by signal " signum ". Exiting")
			   (exit)))

    ;; register this run in monitor.db
    (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params)
    (rmt:tasks-set-state-given-param-key task-key "running")
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
	(max-concurrent-jobs   (let ((mcj (config-lookup configdat "setup"     "max_concurrent_jobs")))
				 (if (and mcj (string->number mcj))
				     (string->number mcj)
				     1))) ;; length of the register queue ahead
	(reglen                (if (number? reglen-in) reglen-in 1))
	(last-time-incomplete  (- (current-seconds) 900)) ;; force at least one clean up cycle
	(last-time-some-running (current-seconds))
	(tdbdat                (tasks:open-db)))

    ;; Initialize the test-registery hash with tests that already have a record
    ;; convert state to symbol and use that as the hash value
    (for-each (lambda (trec)
		(let ((id (db:test-get-id        trec))
		      (tn (db:test-get-testname  trec))
		      (ip (db:test-get-item-path trec))







|







917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
	(max-concurrent-jobs   (let ((mcj (config-lookup configdat "setup"     "max_concurrent_jobs")))
				 (if (and mcj (string->number mcj))
				     (string->number mcj)
				     1))) ;; length of the register queue ahead
	(reglen                (if (number? reglen-in) reglen-in 1))
	(last-time-incomplete  (- (current-seconds) 900)) ;; force at least one clean up cycle
	(last-time-some-running (current-seconds))
	(tdbdat                (tasks:open-db area-dat)))

    ;; Initialize the test-registery hash with tests that already have a record
    ;; convert state to symbol and use that as the hash value
    (for-each (lambda (trec)
		(let ((id (db:test-get-id        trec))
		      (tn (db:test-get-testname  trec))
		      (ip (db:test-get-item-path trec))
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
;;    'set-state-status
;;
;; NB// should pass in keys?
;;
(define (runs:operate-on action target runnamepatt testpatt area-dat #!key (state #f)(status #f)(new-state-status #f)(mode 'remove-all)(options '()))
  (common:clear-caches) ;; clear all caches
  (let* ((db           #f)
	 (tdbdat       (tasks:open-db))
	 (keys         (rmt:get-keys area-dat))
	 (rundat       (mt:get-runs-by-patt keys runnamepatt target))
	 (header       (vector-ref rundat 0))
	 (runs         (vector-ref rundat 1))
	 (states       (if state  (string-split state  ",") '()))
	 (statuses     (if status (string-split status ",") '()))
	 (state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f))))







|







1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
;;    'set-state-status
;;
;; NB// should pass in keys?
;;
(define (runs:operate-on action target runnamepatt testpatt area-dat #!key (state #f)(status #f)(new-state-status #f)(mode 'remove-all)(options '()))
  (common:clear-caches) ;; clear all caches
  (let* ((db           #f)
	 (tdbdat       (tasks:open-db area-dat))
	 (keys         (rmt:get-keys area-dat))
	 (rundat       (mt:get-runs-by-patt keys runnamepatt target))
	 (header       (vector-ref rundat 0))
	 (runs         (vector-ref rundat 1))
	 (states       (if state  (string-split state  ",") '()))
	 (statuses     (if status (string-split status ",") '()))
	 (state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f))))

Modified server.scm from [f70ac864c6] to [7ddbaf8b7c].

46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
;;

;; all routes though here end in exit ...
;;
;; start_server
;;
(define (server:launch run-id area-dat)
  (case *transport-type*
    ((http)(http-transport:launch run-id area-dat))
    ((nmsg)(nmsg-transport:launch run-id area-dat))
    ((rpc)  (rpc-transport:launch run-id area-dat))
    (else (debug:print 0 "ERROR: unknown server type " *transport-type*))))
;;       (else   (debug:print 0 "ERROR: No known transport set, transport=" transport ", using rpc")
;; 	      (rpc-transport:launch run-id)))))

;;======================================================================
;; S E R V E R   U T I L I T I E S 
;;======================================================================








|



|







46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
;;

;; all routes though here end in exit ...
;;
;; start_server
;;
(define (server:launch run-id area-dat)
  (case (megatest:area-transport area-dat)
    ((http)(http-transport:launch run-id area-dat))
    ((nmsg)(nmsg-transport:launch run-id area-dat))
    ((rpc)  (rpc-transport:launch run-id area-dat))
    (else (debug:print 0 "ERROR: unknown server type " (megatest:area-transport area-dat)))))
;;       (else   (debug:print 0 "ERROR: No known transport set, transport=" transport ", using rpc")
;; 	      (rpc-transport:launch run-id)))))

;;======================================================================
;; S E R V E R   U T I L I T I E S 
;;======================================================================

80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
			   (lambda ()
			     (write (list (current-directory)
					  (argv)))))))

;; When using zmq this would send the message back (two step process)
;; with spiffy or rpc this simply returns the return data to be returned
;; 
(define (server:reply return-addr query-sig success/fail result #!key (remote #f))
  (debug:print-info 11 "server:reply return-addr=" return-addr ", result=" result)
  ;; (send-message pubsock target send-more: #t)
  ;; (send-message pubsock 
  (case (server:get-transport)
    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
    ((http) (db:obj->string (vector success/fail query-sig result)))
    ((zmq)
     (let ((pub-socket (vector-ref (common:get-remote remote #f) 1)))
       (send-message pub-socket return-addr send-more: #t)
       (send-message pub-socket (db:obj->string (vector success/fail query-sig result)))))
    ((fs)   result)
    (else 
     (debug:print 0 "ERROR: unrecognised transport type: " *transport-type*)
     result)))

;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; try running on that host
;;
(define  (server:run run-id area-dat)







|












|







80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
			   (lambda ()
			     (write (list (current-directory)
					  (argv)))))))

;; When using zmq this would send the message back (two step process)
;; with spiffy or rpc this simply returns the return data to be returned
;; 
(define (server:reply return-addr query-sig success/fail result area-dat #!key (remote #f))
  (debug:print-info 11 "server:reply return-addr=" return-addr ", result=" result)
  ;; (send-message pubsock target send-more: #t)
  ;; (send-message pubsock 
  (case (server:get-transport)
    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
    ((http) (db:obj->string (vector success/fail query-sig result)))
    ((zmq)
     (let ((pub-socket (vector-ref (common:get-remote remote #f) 1)))
       (send-message pub-socket return-addr send-more: #t)
       (send-message pub-socket (db:obj->string (vector success/fail query-sig result)))))
    ((fs)   result)
    (else 
     (debug:print 0 "ERROR: unrecognised transport type: " (megatest:area-transport area-dat))
     result)))

;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; try running on that host
;;
(define  (server:run run-id area-dat)
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
;; The generic run a server command. Dispatches the call to server 0 if run-id != 0
;; 
(define (server:try-running run-id area-dat)
  (if (eq? run-id 0)
      (server:run run-id area-dat)
      (rmt:start-server run-id)))

(define (server:check-if-running run-id)
  (let ((tdbdat (tasks:open-db)))
    (let loop ((server (tasks:get-server (db:delay-if-busy tdbdat) run-id))
	       (trycount 0))
    (if server
	;; note: client:start will set (common:get-remote remote). this needs to be changed
	;;       also, client:start will login to the server, also need to change that.
	;;
	;; client:start returns #t if login was successful.
	;;
	(let ((res (case *transport-type*
		     ((http)(server:ping-server run-id 
						(tasks:hostinfo-get-interface server)
						(tasks:hostinfo-get-port      server)))
		     ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
						 (tasks:hostinfo-get-port      server)
						 timeout: 2)))))
	  ;; if the server didn't respond we must remove the record
	  (if res
	      #t
	      (begin
		(debug:print-info 0 "server at " server " not responding, removing record")
		(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id 
				" server:check-if-running")
		res)))
	#f))))

;; called in megatest.scm, host-port is string hostname:port
;;
(define (server:ping run-id host:port)
  (let ((tdbdat (tasks:open-db)))
    (let* ((host-port (let ((slst (string-split   host:port ":")))
			(if (eq? (length slst) 2)
			    (list (car slst)(string->number (cadr slst)))
			    #f)))
	   (toppath       (launch:setup-for-run))
	   (server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat) run-id) #f)))
      (if (not run-id)







|
|








|


















|
|







155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
;; The generic run a server command. Dispatches the call to server 0 if run-id != 0
;; 
(define (server:try-running run-id area-dat)
  (if (eq? run-id 0)
      (server:run run-id area-dat)
      (rmt:start-server run-id)))

(define (server:check-if-running run-id area-dat)
  (let ((tdbdat (tasks:open-db area-dat)))
    (let loop ((server (tasks:get-server (db:delay-if-busy tdbdat) run-id))
	       (trycount 0))
    (if server
	;; note: client:start will set (common:get-remote remote). this needs to be changed
	;;       also, client:start will login to the server, also need to change that.
	;;
	;; client:start returns #t if login was successful.
	;;
	(let ((res (case (megatest:area-transport area-dat)
		     ((http)(server:ping-server run-id 
						(tasks:hostinfo-get-interface server)
						(tasks:hostinfo-get-port      server)))
		     ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
						 (tasks:hostinfo-get-port      server)
						 timeout: 2)))))
	  ;; if the server didn't respond we must remove the record
	  (if res
	      #t
	      (begin
		(debug:print-info 0 "server at " server " not responding, removing record")
		(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id 
				" server:check-if-running")
		res)))
	#f))))

;; called in megatest.scm, host-port is string hostname:port
;;
(define (server:ping run-id host:port area-dat)
  (let ((tdbdat (tasks:open-db area-dat)))
    (let* ((host-port (let ((slst (string-split   host:port ":")))
			(if (eq? (length slst) 2)
			    (list (car slst)(string->number (cadr slst)))
			    #f)))
	   (toppath       (launch:setup-for-run))
	   (server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat) run-id) #f)))
      (if (not run-id)

Modified tasks.scm from [af8f785c6d] to [f2e471c669].

98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
			     ((file-read-access? dbpath)    (sqlite3:open-database dbfile))
			     (else (sqlite3:open-database ":memory:")))) ;; (never-give-up-open-db dbpath))
	      (handler      (make-busy-timeout 36000)))
	 (if (and exists
		  (not write-access))
	     (set! *db-write-access* write-access)) ;; only unset so other db's also can use this control
	 (sqlite3:set-busy-handler! mdb handler)
	 (db:set-sync mdb) ;; (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;"))
	 ;;  (if (or (and (not exists)
	 ;; 	      (file-write-access? toppath))
	 ;; 	 (not (file-read-access? dbpath)))
	 ;;      (begin
	 ;; 
	 ;; TASKS QUEUE MOVED TO main.db
	 ;;







|







98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
			     ((file-read-access? dbpath)    (sqlite3:open-database dbfile))
			     (else (sqlite3:open-database ":memory:")))) ;; (never-give-up-open-db dbpath))
	      (handler      (make-busy-timeout 36000)))
	 (if (and exists
		  (not write-access))
	     (set! *db-write-access* write-access)) ;; only unset so other db's also can use this control
	 (sqlite3:set-busy-handler! mdb handler)
	 (db:set-sync mdb area-dat) ;; (sqlite3:execute mdb (conc "PRAGMA synchronous = 0;"))
	 ;;  (if (or (and (not exists)
	 ;; 	      (file-write-access? toppath))
	 ;; 	 (not (file-read-access? dbpath)))
	 ;;      (begin
	 ;; 
	 ;; TASKS QUEUE MOVED TO main.db
	 ;;
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
(define (tasks:hostinfo-get-interface   vec)    (vector-ref  vec 1))
(define (tasks:hostinfo-get-port        vec)    (vector-ref  vec 2))
(define (tasks:hostinfo-get-pubport     vec)    (vector-ref  vec 3))
(define (tasks:hostinfo-get-transport   vec)    (vector-ref  vec 4))
(define (tasks:hostinfo-get-pid         vec)    (vector-ref  vec 5))
(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot")
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id)
	(thread-sleep! (/ (random 1500) 1000)) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id)
  (sqlite3:execute 
   mdb 
   "INSERT INTO servers (pid,hostname,port,pubport,start_time,      priority,state,mt_version,heartbeat,   interface,transport,run_id)
                   VALUES(?, ?,       ?,   ?, strftime('%s','now'), ?,       ?,    ?,-1,?,        ?,        ?);"
   (current-process-id)          ;; pid
   (get-host-name)               ;; hostname
   -1                            ;; port
   -1                            ;; pubport
   (random 1000)                 ;; priority (used a tiebreaker on get-available)
   "available"                   ;; state
   (common:version-signature)    ;; mt_version
   -1                            ;; interface
   ;; (conc (server:get-transport)) ;; transport
   (conc *transport-type*)    ;; transport
   run-id
   ))

(define (tasks:num-in-available-state mdb run-id)
  (let ((res 0))
    (sqlite3:for-each-row
     (lambda (num-in-queue)







|



|





|













|







164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
(define (tasks:hostinfo-get-interface   vec)    (vector-ref  vec 1))
(define (tasks:hostinfo-get-port        vec)    (vector-ref  vec 2))
(define (tasks:hostinfo-get-pubport     vec)    (vector-ref  vec 3))
(define (tasks:hostinfo-get-transport   vec)    (vector-ref  vec 4))
(define (tasks:hostinfo-get-pid         vec)    (vector-ref  vec 5))
(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id area-dat)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot")
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id area-dat)
	(thread-sleep! (/ (random 1500) 1000)) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id area-dat)
  (sqlite3:execute 
   mdb 
   "INSERT INTO servers (pid,hostname,port,pubport,start_time,      priority,state,mt_version,heartbeat,   interface,transport,run_id)
                   VALUES(?, ?,       ?,   ?, strftime('%s','now'), ?,       ?,    ?,-1,?,        ?,        ?);"
   (current-process-id)          ;; pid
   (get-host-name)               ;; hostname
   -1                            ;; port
   -1                            ;; pubport
   (random 1000)                 ;; priority (used a tiebreaker on get-available)
   "available"                   ;; state
   (common:version-signature)    ;; mt_version
   -1                            ;; interface
   ;; (conc (server:get-transport)) ;; transport
   (conc (megatest:area-transport area-dat))    ;; transport
   run-id
   ))

(define (tasks:num-in-available-state mdb run-id)
  (let ((res 0))
    (sqlite3:for-each-row
     (lambda (num-in-queue)

Modified tdb.scm from [6d49ce2a57] to [8118e8579d].

38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
;;======================================================================

;;======================================================================
;; T E S T   S P E C I F I C   D B 
;;======================================================================

;; Create the sqlite db for the individual test(s)
(define (open-test-db work-area) 
  (debug:print-info 11 "open-test-db " work-area)
  (if (and work-area 
	   (directory? work-area)
	   (file-read-access? work-area))
      (let* ((dbpath              (conc work-area "/testdat.db"))
	     (dbexists            (file-exists? dbpath))
	     (work-area-writeable (file-write-access? work-area))







|







38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
;;======================================================================

;;======================================================================
;; T E S T   S P E C I F I C   D B 
;;======================================================================

;; Create the sqlite db for the individual test(s)
(define (open-test-db work-area area-dat) 
  (debug:print-info 11 "open-test-db " work-area)
  (if (and work-area 
	   (directory? work-area)
	   (file-read-access? work-area))
      (let* ((dbpath              (conc work-area "/testdat.db"))
	     (dbexists            (file-exists? dbpath))
	     (work-area-writeable (file-write-access? work-area))
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
					       136000))))
	
	(if (and tdb-writeable
		 *db-write-access*)
	    (sqlite3:set-busy-handler! db handler))
	(if (not dbexists)
	    (begin
	      (db:set-sync db) ;; (sqlite3:execute db "PRAGMA synchronous = FULL;")
	      (debug:print-info 11 "Initialized test database " dbpath)
	      (tdb:testdb-initialize db)))
	;; (sqlite3:execute db "PRAGMA synchronous = 0;")
	(debug:print-info 11 "open-test-db END (sucessful)" work-area)
	;; now let's test that everything is correct
	(handle-exceptions
	 exn







|







69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
					       136000))))
	
	(if (and tdb-writeable
		 *db-write-access*)
	    (sqlite3:set-busy-handler! db handler))
	(if (not dbexists)
	    (begin
	      (db:set-sync db area-dat) ;; (sqlite3:execute db "PRAGMA synchronous = FULL;")
	      (debug:print-info 11 "Initialized test database " dbpath)
	      (tdb:testdb-initialize db)))
	;; (sqlite3:execute db "PRAGMA synchronous = 0;")
	(debug:print-info 11 "open-test-db END (sucessful)" work-area)
	;; now let's test that everything is correct
	(handle-exceptions
	 exn
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
      (let ((baddb (sqlite3:open-database ":memory:")))
 	(debug:print-info 11 "open-test-db END (unsucessful)" work-area)
 	;; provide an in-mem db (this is dangerous!)
 	(tdb:testdb-initialize baddb)
 	baddb)))

;; find and open the testdat.db file for an existing test
(define (tdb:open-test-db-by-test-id test-id #!key (work-area #f))
  (let* ((test-path (if work-area
			work-area
			(rmt:test-get-rundir-from-test-id test-id))))
    (debug:print 3 "TEST PATH: " test-path)
    (open-test-db test-path)))

;; find and open the testdat.db file for an existing test
(define (tdb:open-test-db-by-test-id-local dbstruct run-id test-id #!key (work-area #f))
  (let* ((test-path (if work-area
			work-area
			(db:test-get-rundir-from-test-id dbstruct run-id test-id))))
    (debug:print 3 "TEST PATH: " test-path)
    (open-test-db test-path)))

;; find and open the testdat.db file for an existing test
(define (tdb:open-run-close-db-by-test-id-local dbstruct run-id test-id work-area proc . params)
  (let* ((test-path (if work-area
			work-area
			(db:test-get-rundir-from-test-id dbstruct run-id test-id)))
	 (tdb        (open-test-db test-path)))
    (apply proc tdb params)))

(define (tdb:testdb-initialize db)
  (debug:print 11 "db:testdb-initialize START")
  (sqlite3:with-transaction
   db
   (lambda ()







|




|


|




|


|



|







95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
      (let ((baddb (sqlite3:open-database ":memory:")))
 	(debug:print-info 11 "open-test-db END (unsucessful)" work-area)
 	;; provide an in-mem db (this is dangerous!)
 	(tdb:testdb-initialize baddb)
 	baddb)))

;; find and open the testdat.db file for an existing test
(define (tdb:open-test-db-by-test-id test-id area-dat #!key (work-area #f))
  (let* ((test-path (if work-area
			work-area
			(rmt:test-get-rundir-from-test-id test-id))))
    (debug:print 3 "TEST PATH: " test-path)
    (open-test-db test-path area-dat)))

;; find and open the testdat.db file for an existing test
(define (tdb:open-test-db-by-test-id-local dbstruct area-dat run-id test-id #!key (work-area #f))
  (let* ((test-path (if work-area
			work-area
			(db:test-get-rundir-from-test-id dbstruct run-id test-id))))
    (debug:print 3 "TEST PATH: " test-path)
    (open-test-db test-path area-dat)))

;; find and open the testdat.db file for an existing test
(define (tdb:open-run-close-db-by-test-id-local dbstruct area-dat run-id test-id work-area proc . params)
  (let* ((test-path (if work-area
			work-area
			(db:test-get-rundir-from-test-id dbstruct run-id test-id)))
	 (tdb        (open-test-db test-path area-dat)))
    (apply proc tdb params)))

(define (tdb:testdb-initialize db)
  (debug:print 11 "db:testdb-initialize START")
  (sqlite3:with-transaction
   db
   (lambda ()
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
			 (if (eq? time-a time-b)
			     (string<? (conc (vector-ref a 2))
				       (conc (vector-ref b 2)))
			     #f))
		     (string<? (conc time-a)(conc time-b))))))))

;; 
(define (tdb:remote-update-testdat-meta-info run-id test-id work-area cpuload diskfree minutes)
  (let ((tdb         (rmt:open-test-db-by-test-id run-id test-id work-area: work-area)))
    (if (sqlite3:database? tdb)
	(begin
	  (sqlite3:execute tdb "INSERT INTO test_rundat (update_time,cpuload,diskfree,run_duration) VALUES (strftime('%s','now'),?,?,?);"
			   cpuload diskfree minutes)
	  (sqlite3:finalize! tdb))
	(debug:print 2 "Can't update testdat.db for test " test-id " read-only or non-existant"))))
    







|
|







366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
			 (if (eq? time-a time-b)
			     (string<? (conc (vector-ref a 2))
				       (conc (vector-ref b 2)))
			     #f))
		     (string<? (conc time-a)(conc time-b))))))))

;; 
(define (tdb:remote-update-testdat-meta-info run-id test-id work-area cpuload diskfree minutes area-dat)
  (let ((tdb         (rmt:open-test-db-by-test-id run-id test-id area-dat work-area: work-area)))
    (if (sqlite3:database? tdb)
	(begin
	  (sqlite3:execute tdb "INSERT INTO test_rundat (update_time,cpuload,diskfree,run_duration) VALUES (strftime('%s','now'),?,?,?);"
			   cpuload diskfree minutes)
	  (sqlite3:finalize! tdb))
	(debug:print 2 "Can't update testdat.db for test " test-id " read-only or non-existant"))))
    

Modified tests/unittests/basicserver.scm from [eb22c48242] to [28cf57e0e6].

19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
;; (test #f '("SYSTEM" "RELEASE") (rmt:get-keys))

;; (exit)

;; Server tests go here 
(for-each
 (lambda (run-id)
   (test #f #f (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id))
   (server:kind-run run-id *area-dat*)
   (test "did server start within 20 seconds?"
	 #t
	 (let loop ((remtries 20)
		    (running (tasks:server-running-or-starting? (db:delay-if-busy
								 (tasks:open-db))
								run-id)))
	   (if running 
	       (> running 0)
	       (if (> remtries 0)
		   (begin
		     (thread-sleep! 1)
		     (loop (- remtries 1)
			   (tasks:server-running-or-starting? (db:delay-if-busy
							       (tasks:open-db))
							      run-id)))))))
   
   (test "did server become available" #t
	 (let loop ((remtries 10)
		    (res      (tasks:get-server (db:delay-if-busy (tasks:open-db)) run-id)))
	   (if res
	       (vector? res)
	       (begin
		 (if (> remtries 0)
		     (begin
		       (thread-sleep! 1.1)
		       (loop (- remtries 1)(tasks:get-server (db:delay-if-busy (tasks:open-db)) run-id)))
		     res)))))
   )
 (list 0 1))

(define user    (current-user-name))
(define runname "mytestrun")
(define keys    (rmt:get-keys))







|





|








|




|






|







19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
;; (test #f '("SYSTEM" "RELEASE") (rmt:get-keys))

;; (exit)

;; Server tests go here 
(for-each
 (lambda (run-id)
   (test #f #f (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db *area-dat*) *area-dat*) run-id))
   (server:kind-run run-id *area-dat*)
   (test "did server start within 20 seconds?"
	 #t
	 (let loop ((remtries 20)
		    (running (tasks:server-running-or-starting? (db:delay-if-busy
								 (tasks:open-db *area-dat*) *area-dat*)
								run-id)))
	   (if running 
	       (> running 0)
	       (if (> remtries 0)
		   (begin
		     (thread-sleep! 1)
		     (loop (- remtries 1)
			   (tasks:server-running-or-starting? (db:delay-if-busy
							       (tasks:open-db *area-dat*) *area-dat*)
							      run-id)))))))
   
   (test "did server become available" #t
	 (let loop ((remtries 10)
		    (res      (tasks:get-server (db:delay-if-busy (tasks:open-db *area-dat*) *area-dat*) run-id)))
	   (if res
	       (vector? res)
	       (begin
		 (if (> remtries 0)
		     (begin
		       (thread-sleep! 1.1)
		       (loop (- remtries 1)(tasks:get-server (db:delay-if-busy (tasks:open-db *area-dat*) *area-dat*) run-id)))
		     res)))))
   )
 (list 0 1))

(define user    (current-user-name))
(define runname "mytestrun")
(define keys    (rmt:get-keys))
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
				"runname")))

(for-each (lambda (run-id)
;; test killing server
;;
(tasks:kill-server-run-id run-id)

(test #f #f (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id))
)
(list 0 1))

;; Tests to assess reading/writing while servers are starting/stopping
(define start-time (current-seconds))
(let loop ((test-state 'start))
  (let* ((server-dats (tasks:get-server-records (db:delay-if-busy (tasks:open-db)) run-id))
	 (first-dat   (if (not (null? server-dats))
			  (car server-dats)
			  #f)))
    (map (lambda (dat)
	   (apply print (intersperse (vector->list dat) ", ")))
	 server-dats)
    (test #f test-one-rec (rmt:get-test-info-by-id run-id test-one-id))







|






|







115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
				"runname")))

(for-each (lambda (run-id)
;; test killing server
;;
(tasks:kill-server-run-id run-id)

(test #f #f (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db *area-dat*) *area-dat*) run-id))
)
(list 0 1))

;; Tests to assess reading/writing while servers are starting/stopping
(define start-time (current-seconds))
(let loop ((test-state 'start))
  (let* ((server-dats (tasks:get-server-records (db:delay-if-busy (tasks:open-db *area-dat*) *area-dat*) run-id))
	 (first-dat   (if (not (null? server-dats))
			  (car server-dats)
			  #f)))
    (map (lambda (dat)
	   (apply print (intersperse (vector->list dat) ", ")))
	 server-dats)
    (test #f test-one-rec (rmt:get-test-info-by-id run-id test-one-id))
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203

;; (set! *transport-type* 'http)
;; 
;; (test "setup for run" #t (begin (launch:setup-for-run)
;; 				(string? (getenv "MT_RUN_AREA_HOME"))))
;; 
;; (test "server-register, get-best-server" #t (let ((res #f))
;; 					      (open-run-close tasks:server-register tasks:open-db 1 "bob" 1234 100 'live 'http)
;; 					      (set! res (open-run-close tasks:get-best-server tasks:open-db))
;; 					      (number? (vector-ref res 3))))
;; 
;; (test "de-register server" #f (let ((res #f))
;; 				(open-run-close tasks:server-deregister tasks:open-db "bob" port: 1234)
;; 				(vector? (open-run-close tasks:get-best-server tasks:open-db))))
;; 
;; (define server-pid #f)
;; 
;; ;; Not sure how the following should work, replacing it with system of megatest -server
;; ;; (test "launch server" #t (let ((pid (process-fork (lambda ()
;; ;; 						    ;; (daemon:ize)
;; ;; 						    (server:launch 'http)))))
;; ;; 			   (set! server-pid pid)
;; ;; 			   (number? pid)))
;; (system "../../bin/megatest -server - -debug 22 > server.log 2> server.log &")
;; 
;; (let loop ((n 10))
;;   (thread-sleep! 1) ;; need to wait for server to start.
;;   (let ((res (open-run-close tasks:get-best-server tasks:open-db)))
;;     (print "tasks:get-best-server returned " res)
;;     (if (and (not res)
;; 	     (> n 0))
;; 	(loop (- n 1)))))
;; 
;; (test "get-best-server" #t (begin 
;; 			     (client:launch)
;; 			     (let ((dat (open-run-close tasks:get-best-server tasks:open-db)))
;; 			       (vector? dat))))
;; 
;; (define *keys*               (keys:config-get-fields *configdat*))
;; (define *keyvals*            (keys:target->keyval *keys* "a/b/c"))
;; 
;; (test #f #t                       (string? (car *runremote*)))
;; (test #f '(#t "successful login") (rmt:login)) ;;  *runremote* *toppath* *my-client-signature*)))







|
|



|
|













|







|







161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203

;; (set! *transport-type* 'http)
;; 
;; (test "setup for run" #t (begin (launch:setup-for-run)
;; 				(string? (getenv "MT_RUN_AREA_HOME"))))
;; 
;; (test "server-register, get-best-server" #t (let ((res #f))
;; 					      (open-run-close tasks:server-register tasks:open-db *area-dat* 1 "bob" 1234 100 'live 'http)
;; 					      (set! res (open-run-close tasks:get-best-server tasks:open-db *area-dat*))
;; 					      (number? (vector-ref res 3))))
;; 
;; (test "de-register server" #f (let ((res #f))
;; 				(open-run-close tasks:server-deregister tasks:open-db *area-dat* "bob" port: 1234)
;; 				(vector? (open-run-close tasks:get-best-server tasks:open-db *area-dat*))))
;; 
;; (define server-pid #f)
;; 
;; ;; Not sure how the following should work, replacing it with system of megatest -server
;; ;; (test "launch server" #t (let ((pid (process-fork (lambda ()
;; ;; 						    ;; (daemon:ize)
;; ;; 						    (server:launch 'http)))))
;; ;; 			   (set! server-pid pid)
;; ;; 			   (number? pid)))
;; (system "../../bin/megatest -server - -debug 22 > server.log 2> server.log &")
;; 
;; (let loop ((n 10))
;;   (thread-sleep! 1) ;; need to wait for server to start.
;;   (let ((res (open-run-close tasks:get-best-server tasks:open-db *area-dat*)))
;;     (print "tasks:get-best-server returned " res)
;;     (if (and (not res)
;; 	     (> n 0))
;; 	(loop (- n 1)))))
;; 
;; (test "get-best-server" #t (begin 
;; 			     (client:launch)
;; 			     (let ((dat (open-run-close tasks:get-best-server tasks:open-db *area-dat*)))
;; 			       (vector? dat))))
;; 
;; (define *keys*               (keys:config-get-fields *configdat*))
;; (define *keyvals*            (keys:target->keyval *keys* "a/b/c"))
;; 
;; (test #f #t                       (string? (car *runremote*)))
;; (test #f '(#t "successful login") (rmt:login)) ;;  *runremote* *toppath* *my-client-signature*)))

Modified zmq-transport.scm from [fa40b1e46e] to [46e65cf05a].

108
109
110
111
112
113
114
115
116
117
118
119
120
121
122

    (set! *runremote* (vector pull-socket pub-socket)) ;; overloading the use of (common:get-remote remote) BUG!?

    ;; what to do when we quit
    ;;
;;     (on-exit (lambda ()
;; 	       (if (and toppath *server-info*)
;; 		   (open-run-close tasks:server-deregister-self tasks:open-db (car *server-info*))
;; 		   (let loop () 
;; 		     (let ((queue-len 0))
;; 		       (thread-sleep! (random 5))
;; 		       (mutex-lock! *incoming-mutex*)
;; 		       (set! queue-len (length *incoming-data*))
;; 		       (mutex-unlock! *incoming-mutex*)
;; 		       (if (> queue-len 0)







|







108
109
110
111
112
113
114
115
116
117
118
119
120
121
122

    (set! *runremote* (vector pull-socket pub-socket)) ;; overloading the use of (common:get-remote remote) BUG!?

    ;; what to do when we quit
    ;;
;;     (on-exit (lambda ()
;; 	       (if (and toppath *server-info*)
;; 		   (open-run-close tasks:server-deregister-self (lambda ()(tasks:open-db area-dat)) (car *server-info*))
;; 		   (let loop () 
;; 		     (let ((queue-len 0))
;; 		       (thread-sleep! (random 5))
;; 		       (mutex-lock! *incoming-mutex*)
;; 		       (set! queue-len (length *incoming-data*))
;; 		       (mutex-unlock! *incoming-mutex*)
;; 		       (if (> queue-len 0)
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
	      
	      (loop '()))
	    (loop (cons packet queue-lst)))))))

;; run zmq-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (zmq-transport:keep-running)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (let* ((server-info (let loop ()
			(let ((sdat #f))
			  (mutex-lock! *heartbeat-mutex*)
			  (set! sdat *server-info*)







|







146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
	      
	      (loop '()))
	    (loop (cons packet queue-lst)))))))

;; run zmq-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (zmq-transport:keep-running area-dat)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (let* ((server-info (let loop ()
			(let ((sdat #f))
			  (mutex-lock! *heartbeat-mutex*)
			  (set! sdat *server-info*)
176
177
178
179
180
181
182
183



184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203



204
205
206
207
208
209
210
      ;; GET REAL QUEUE LENGTH FROM THE VARIABLE
      (let ((queue-len 0)) ;; FOR NOW DO NOT DO THIS (cdb:client-call zmq-sockets 'sync #t 1)))
      ;; (print "Server running, count is " count)
	(if (< count 1) ;; 3x3 = 9 secs aprox
	    (loop (+ count 1)))

	;; NOTE: Get rid of this mechanism! It really is not needed...
	(open-run-close tasks:server-update-heartbeat tasks:open-db (car server-info))




	;; (if ;; (or (> numrunning 0) ;; stay alive for two days after last access
	(mutex-lock! *heartbeat-mutex*)
	(set! last-access *last-db-access*)
	(mutex-unlock! *heartbeat-mutex*)
	(if (> (+ last-access
		  ;; (* 50 60 60)    ;; 48 hrs
		  ;; 60              ;; one minute
		  ;; (* 60 60)       ;; one hour
		  (* 45 60)          ;; 45 minutes, until the db deletion bug is fixed.
		  )
	       (current-seconds))
	    (begin
	      (debug:print-info 2 "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	      (loop 0))
	    (begin
	      (debug:print-info 0 "Starting to shutdown the server.")
	      ;; need to delete only *my* server entry (future use)
	      (set! *time-to-exit* #t)
	      (open-run-close tasks:server-deregister-self tasks:open-db (get-host-name))



	      (thread-sleep! 1)
	      (debug:print-info 0 "Max cached queries was " *max-cache-size*)
	      (debug:print-info 0 "Server shutdown complete. Exiting")
	      (exit)))))))

(define (zmq-transport:find-free-port-and-open iface s port stype #!key (trynum 50))
  (let ((s (if s s (make-socket stype)))







|
>
>
>



















|
>
>
>







176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
      ;; GET REAL QUEUE LENGTH FROM THE VARIABLE
      (let ((queue-len 0)) ;; FOR NOW DO NOT DO THIS (cdb:client-call zmq-sockets 'sync #t 1)))
      ;; (print "Server running, count is " count)
	(if (< count 1) ;; 3x3 = 9 secs aprox
	    (loop (+ count 1)))

	;; NOTE: Get rid of this mechanism! It really is not needed...
	(open-run-close tasks:server-update-heartbeat
			(lambda ()
			  (tasks:open-db area-dat))
			(car server-info))

	;; (if ;; (or (> numrunning 0) ;; stay alive for two days after last access
	(mutex-lock! *heartbeat-mutex*)
	(set! last-access *last-db-access*)
	(mutex-unlock! *heartbeat-mutex*)
	(if (> (+ last-access
		  ;; (* 50 60 60)    ;; 48 hrs
		  ;; 60              ;; one minute
		  ;; (* 60 60)       ;; one hour
		  (* 45 60)          ;; 45 minutes, until the db deletion bug is fixed.
		  )
	       (current-seconds))
	    (begin
	      (debug:print-info 2 "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	      (loop 0))
	    (begin
	      (debug:print-info 0 "Starting to shutdown the server.")
	      ;; need to delete only *my* server entry (future use)
	      (set! *time-to-exit* #t)
	      (open-run-close tasks:server-deregister-self
			      (lambda ()
				(tasks:open-db area-dat))
			      (get-host-name))
	      (thread-sleep! 1)
	      (debug:print-info 0 "Max cached queries was " *max-cache-size*)
	      (debug:print-info 0 "Server shutdown complete. Exiting")
	      (exit)))))))

(define (zmq-transport:find-free-port-and-open iface s port stype #!key (trynum 50))
  (let ((s (if s s (make-socket stype)))
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
         (p1 (caddr s1))
         (s2 (zmq-transport:find-free-port-and-open ipaddrstr #f (+ 1 (if p1 p1 (+ startport 1))) 'pub))
         (p2 (caddr s2)))
    (set! *runremote* #f)
    (debug:print 0 "Server started on " ipaddrstr " ports " p1 " and " p2)
    (mutex-lock! *heartbeat-mutex*)
    (set! *server-info* (open-run-close tasks:server-register 
					tasks:open-db 
					(current-process-id) 
					ipaddrstr p1 
					0 
					'live
					'zmq
					pubport: p2))
    (debug:print-info 11 "*server-info* set to " *server-info*)







|







238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
         (p1 (caddr s1))
         (s2 (zmq-transport:find-free-port-and-open ipaddrstr #f (+ 1 (if p1 p1 (+ startport 1))) 'pub))
         (p2 (caddr s2)))
    (set! *runremote* #f)
    (debug:print 0 "Server started on " ipaddrstr " ports " p1 " and " p2)
    (mutex-lock! *heartbeat-mutex*)
    (set! *server-info* (open-run-close tasks:server-register 
					(lambda ()(tasks:open-db area-dat))
					(current-process-id) 
					ipaddrstr p1 
					0 
					'live
					'zmq
					pubport: p2))
    (debug:print-info 11 "*server-info* set to " *server-info*)
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
	  (debug:print-info 2 "Failed to login or connect to " conurl)
	  (set! *runremote* #f)
	  #f))))

;; run zmq-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (zmq-transport:keep-running)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (let* ((server-info (let loop ()
                        (let ((sdat #f))
                          (mutex-lock! *heartbeat-mutex*)
                          (set! sdat *runremote*)
                          (mutex-unlock! *heartbeat-mutex*)
                          (if sdat sdat
                              (begin
                                (sleep 4)
                                (loop))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (tdb         (tasks:open-db))
	 (spid        (tasks:server-get-server-id tdb #f iface port #f)))
    (print "Keep-running got server pid " spid ", using iface " iface " and port " port)
    (let loop ((count 0))
      (thread-sleep! 4) ;; no need to do this very often
      ;; NB// sync currently does NOT return queue-length
      (let () ;; (queue-len (cdb:client-call server-info 'sync #t 1)))
      ;; (print "Server running, count is " count)







|















|







308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
	  (debug:print-info 2 "Failed to login or connect to " conurl)
	  (set! *runremote* #f)
	  #f))))

;; run zmq-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (zmq-transport:keep-running area-dat)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (let* ((server-info (let loop ()
                        (let ((sdat #f))
                          (mutex-lock! *heartbeat-mutex*)
                          (set! sdat *runremote*)
                          (mutex-unlock! *heartbeat-mutex*)
                          (if sdat sdat
                              (begin
                                (sleep 4)
                                (loop))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
	 (tdb         (tasks:open-db area-dat))
	 (spid        (tasks:server-get-server-id tdb #f iface port #f)))
    (print "Keep-running got server pid " spid ", using iface " iface " and port " port)
    (let loop ((count 0))
      (thread-sleep! 4) ;; no need to do this very often
      ;; NB// sync currently does NOT return queue-length
      (let () ;; (queue-len (cdb:client-call server-info 'sync #t 1)))
      ;; (print "Server running, count is " count)