Megatest

Check-in [10af298b33]
Login
Overview
Comment:Added support to switch between various methods of handling call loops
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v2.0001
Files: files | file ages | folders
SHA1: 10af298b33d32d14beac3a1160f7bccd8187b48b
User & Date: matt on 2022-01-10 06:42:39
Other Links: branch diff | manifest | tags
Context
2022-01-10
07:55
Use ulex-simple to explore using tcp-server egg check-in: f885e8c541 user: matt tags: v2.0001
06:42
Added support to switch between various methods of handling call loops check-in: 10af298b33 user: matt tags: v2.0001
2022-01-08
20:46
Switch to using threads instead of mailbox for worker calls. Seems to not block as much. check-in: ba5884c651 user: matt tags: v2.0001
Changes

Modified launchmod.scm from [26d43d79a0] to [a3891cff5d].

1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here

(define (runs:end-of-run-check run-id )
  (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
	 (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
	 (all-test-launched (rmt:get-var run-id (conc "lunch-complete-" run-id)))
	 (current-state (rmt:get-run-state run-id))
	 (current-status (rmt:get-run-status run-id)))
    ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
    (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)                      
    (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
    (runs:update-junit-test-reporter-xml run-id) 
    (cond 







|







1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here

(define (runs:end-of-run-check run-id )
  (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
	 (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
	 (all-test-launched (rmt:get-var run-id (conc "launch-complete-" run-id)))
	 (current-state (rmt:get-run-state run-id))
	 (current-status (rmt:get-run-status run-id)))
    ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
    (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)                      
    (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
    (runs:update-junit-test-reporter-xml run-id) 
    (cond 

Modified megatest.scm from [be96ddd230] to [b7fe71f476].

154
155
156
157
158
159
160
161

162
163








164
165
166
167
168
169
170
	  launchmod
	  processmod
	  rmtmod
	  runsmod
	  servermod
	  tasksmod
	  testsmod
	  

	  )
	








;; fake out readline usage of toplevel-command
(define (toplevel-command . a) #f)
(define *didsomething* #f)  
(define *db* #f) ;; this is only for the repl, do not use in general!!!!

;; (include "common_records.scm")
;; (include "key_records.scm")







|
>

|
>
>
>
>
>
>
>
>







154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
	  launchmod
	  processmod
	  rmtmod
	  runsmod
	  servermod
	  tasksmod
	  testsmod

	  ulex
	  )

;;   ;; ulex parameters
;;   (work-method 'direct)
;;   (return-method 'direct)
  
  ;; ulex parameters
  (work-method 'mailbox)
  (return-method 'mailbox)
  
;; fake out readline usage of toplevel-command
(define (toplevel-command . a) #f)
(define *didsomething* #f)  
(define *db* #f) ;; this is only for the repl, do not use in general!!!!

;; (include "common_records.scm")
;; (include "key_records.scm")

Modified rmtmod.scm from [941de4f2ec] to [a8f42f4480].

1790
1791
1792
1793
1794
1795
1796



1797
1798
1799
1800
1801
1802
1803
1804
  (let* ((effective-toppath (or *toppath* apath)))
    (assert effective-toppath
	    "ERROR: get-pkts-dir called without *toppath* set. Exiting.")
    (let* ((pdir (conc effective-toppath "/.meta/srvpkts")))
      (if (file-exists? pdir)
	  pdir
	  (begin



	    (create-directory pdir #t)
	    pdir)))))

;; given a pkts dir read 
;;
(define (get-all-server-pkts pktsdir-in pktspec)
  (let* ((pktsdir  (if (file-exists? pktsdir-in)
		       pktsdir-in







>
>
>
|







1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
  (let* ((effective-toppath (or *toppath* apath)))
    (assert effective-toppath
	    "ERROR: get-pkts-dir called without *toppath* set. Exiting.")
    (let* ((pdir (conc effective-toppath "/.meta/srvpkts")))
      (if (file-exists? pdir)
	  pdir
	  (begin
	    (handle-exceptions ;; this exception handler should NOT be needed but ...
		exn
		pdir
	      (create-directory pdir #t))
	    pdir)))))

;; given a pkts dir read 
;;
(define (get-all-server-pkts pktsdir-in pktspec)
  (let* ((pktsdir  (if (file-exists? pktsdir-in)
		       pktsdir-in

Modified runsmod.scm from [7c62c7e318] to [b535942743].

497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
    ;; Ensure all tests are registered in the test_meta table
    (runs:update-all-test_meta #f)

    ;; run the run prehook if there are no tests yet run for this run:
    ;;
    (runs:run-pre-hook run-id)
    ;; mark all test launched flag as false in the meta table 
    (rmt:set-var run-id (conc "lunch-complete-" run-id) "no")
    (debug:print-info 1 *default-log-port* "Setting end-of-run to no")
    (let* ((config-reruns      (let ((x (configf:lookup *configdat* "setup" "reruns")))
			       (if x (string->number x) #f)))
	  (config-rerun-cnt (if config-reruns
			config-reruns
			1)))
    (if (eq? config-rerun-cnt run-count)







|







497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
    ;; Ensure all tests are registered in the test_meta table
    (runs:update-all-test_meta #f)

    ;; run the run prehook if there are no tests yet run for this run:
    ;;
    (runs:run-pre-hook run-id)
    ;; mark all test launched flag as false in the meta table 
    (rmt:set-var run-id (conc "launch-complete-" run-id) "no")
    (debug:print-info 1 *default-log-port* "Setting end-of-run to no")
    (let* ((config-reruns      (let ((x (configf:lookup *configdat* "setup" "reruns")))
			       (if x (string->number x) #f)))
	  (config-rerun-cnt (if config-reruns
			config-reruns
			1)))
    (if (eq? config-rerun-cnt run-count)
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
	  (debug:print-info 0 *default-log-port* "Have leftovers!")
	  (loop (car reg)(cdr reg) '() reruns))
	 (else
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-9")
	  (debug:print-info 4 *default-log-port* "Exiting loop with...\n  hed=" hed "\n  tal=" tal "\n  reruns=" reruns))
	 ))) ;; end loop on sorted test names
    ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched 
    (rmt:set-var run-id (conc "lunch-complete-" run-id) "yes")  
        
    ;; now *if* -run-wait we wait for all tests to be done
    ;; Now wait for any RUNNING tests to complete (if in run-wait mode)
    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
    (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle
    
    (let wait-loop ((num-running      (rmt:get-count-tests-running-for-run-id run-id))







|







1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
	  (debug:print-info 0 *default-log-port* "Have leftovers!")
	  (loop (car reg)(cdr reg) '() reruns))
	 (else
          (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-9")
	  (debug:print-info 4 *default-log-port* "Exiting loop with...\n  hed=" hed "\n  tal=" tal "\n  reruns=" reruns))
	 ))) ;; end loop on sorted test names
    ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched 
    (rmt:set-var run-id (conc "launch-complete-" run-id) "yes")  
        
    ;; now *if* -run-wait we wait for all tests to be done
    ;; Now wait for any RUNNING tests to complete (if in run-wait mode)
    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
    (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle
    
    (let wait-loop ((num-running      (rmt:get-count-tests-running-for-run-id run-id))

Modified tests/simplerun/Makefile from [18ac57f19f] to [115e15e0c2].

1
2
3
4
5

cleanup :
	killall mtest dboard -v -9 || true
	rm -rf *.log *.bak NB* logs/* .meta .db




|

1
2
3
4
5

cleanup :
	killall mtest dboard -v -9 || true
	rm -rf *.log *.bak NB* logs/* .meta .db ../simpleruns/* lt

Modified tests/simplerun/debug.scm from [d176a07199] to [6634dce456].

20
21
22
23
24
25
26
27
28

29

30
31


32
33
34
35
36
37
38
39

40
41

42
43
44
45
46
47
48
49
50
	 (l (string-length s)))
    (string->number (substring s (- l 3) l))
    ))

;; Stress driver for rmt:register-test (pre-change version).
;;
;; Creates one thread, starts it and joins it. Inside the thread a named
;; loop carries r (run id, seeded from make-run-id) and i (item counter):
;;   - each pass registers test "test1" with item path "item_<i>" for run r
;;     and prints a warning when that single call took more than 500 ms;
;;   - every 100th i a progress line is printed;
;;   - while i < 100000 the loop continues with i+1; after that, while
;;     r < 100, the tests for run r are fetched with rmt:get-tests-for-run
;;     and the loop restarts on run r+1 with i reset to 0.
;; The loop (and therefore the thread) ends once r >= 100.
(define (run)
  (let* ((th1 (make-thread
	       (lambda ()
		 (let loop ((r (make-run-id))
			    (i 1))

		   (let ((start-time (current-milliseconds)))

		      (rmt:register-test r "test1" (conc "item_" i))
		      (let ((qry-time (- (current-milliseconds) start-time)))


			(if (> qry-time 500)
			    (print "WARNING: rmt:register-test took more than 500ms, "qry-time"ms"))))
		   (if (eq? (modulo i 100) 0)
		       (print "For run-id="r", num tests registered="i))
		   (if (< i 100000)
		       (loop r (+ i 1))
		       (if (< r 100)
			   (begin

			     (print "get-tests-for-run "r)
			     (rmt:get-tests-for-run r "%" '() '() 0 #f #f #f #f #f 0 #f)

			     (loop (+ r 1) 0)))))
		 ))))
    (thread-start! th1)
    (thread-join! th1)))

)

(import junk)
(run)







|
|
>
|
>
|
|
>
>
|
|
|
|
|
|
|
<
>
|
|
>
|
|







20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42

43
44
45
46
47
48
49
50
51
52
53
54
55
	 (l (string-length s)))
    (string->number (substring s (- l 3) l))
    ))

;; Stress driver for rmt:register-test.
;;
;; Runs a single thread (started and joined here) that registers items
;; "item_<i>" of test "test1" against run-id = r + (make-run-id), timing
;; every call.
;;
;; Loop state:
;;   r - offset added to (make-run-id) to form the run id
;;   i - item index within the current run (restarts at 0 on later runs,
;;       exactly as before)
;;   s - cumulative rmt:register-test time in ms over all runs
;;   n - cumulative number of rmt:register-test calls, this one included
;;
;; The average is computed as s/n. Previously it was s/i, which divided by
;; zero right after a run change (i is reset to 0) and, because s is never
;; reset, mixed a cross-run total with a per-run count.
;;
;; After 500 items, and while r < 100, the keys and the tests of the run are
;; fetched and reported and the loop moves on to offset r+1.
(define (run)
  (let* ((th1 (make-thread
               (lambda ()
                 (let loop ((r 0)
                            (i 1)
                            (s 0)
                            (n 1))
                   (let ((start-time (current-milliseconds))
                         (run-id     (+ r (make-run-id))))
                     (rmt:register-test run-id "test1" (conc "item_" i))
                     (let* ((qry-time       (- (current-milliseconds) start-time))
                            (tot-query-time (+ qry-time s))
                            (avg-query-time (* 1.0 (/ tot-query-time n))))
                       (if (> qry-time 500)
                           (print "WARNING: rmt:register-test took more than 500ms, "qry-time"ms, i="i", avg-query-time="avg-query-time))
                       (if (zero? (modulo i 100))
                           (print "For run-id="run-id", "(rmt:get-keys-write)" num tests registered="i))
                       (if (< i 500)
                           (loop r (+ i 1) tot-query-time (+ n 1))
                           (if (< r 100)
                               ;; Bind sequentially: argument evaluation order is
                               ;; unspecified, so timing the call inside the print
                               ;; argument list could read the clock before the
                               ;; query had run.
                               (let* ((keys-start (current-milliseconds))
                                      (keys       (rmt:get-keys))
                                      (keys-time  (- (current-milliseconds) keys-start)))
                                 (print "rmt:get-keys "keys" in "keys-time)
                                 (print "Got "(length (rmt:get-tests-for-run run-id "%" '() '() 0 #f #f #f #f #f 0 #f))" tests for run "run-id)
                                 (print "Average query time: "avg-query-time)
                                 (loop (+ r 1) 0 tot-query-time (+ n 1))))))))))))
    (thread-start! th1)
    (thread-join! th1)))

)

(import junk)
(run)

Modified tests/simplerun/megatest.config from [de09ea7f96] to [e6118ecde2].

36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

# Valid values for state and status for steps, NB// It is not recommended you use this
[validvalues]
state start end completed

# Job tools are more advanced ways to control how your jobs are launched
[jobtools]
useshell yes
launcher nbfake

# You can override environment variables for all your tests here
[env-override]
EXAMPLE_VAR example value

# As you run more tests you may need to add additional disks, the names are arbitrary but must be unique







|







36
37
38
39
40
41
42
43
44
45
46
47
48
49
50

# Valid values for state and status for steps, NB// It is not recommended you use this
[validvalues]
state start end completed

# Job tools are more advanced ways to control how your jobs are launched
[jobtools]
# useshell yes
launcher nbfake

# You can override environment variables for all your tests here
[env-override]
EXAMPLE_VAR example value

# As you run more tests you may need to add additional disks, the names are arbitrary but must be unique

Modified tests/tests.scm from [3fa28c6a70] to [be8860baa4].

20
21
22
23
24
25
26

27
28
29



30
31
32
33
34
35
36
(import srfi-18 
	test 
	chicken.string
	chicken.process-context
	chicken.file
	chicken.pretty-print
	commonmod

	)

(define test-work-dir (current-directory))




;; given list of lists
;;  ( ( msg expected param1 param2 ...)
;;    ( ... ) )
;; apply test to all
;;
(define (test-batch proc pname inlst #!key (post-proc #f))







>



>
>
>







20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
(import srfi-18 
	test 
	chicken.string
	chicken.process-context
	chicken.file
	chicken.pretty-print
	commonmod
	ulex
	)

(define test-work-dir (current-directory))

(work-method   'mailbox)   ;; threads, direct, mailbox
(return-method 'mailbox)   ;; polling, mailbox, direct

;; given list of lists
;;  ( ( msg expected param1 param2 ...)
;;    ( ... ) )
;; apply test to all
;;
(define (test-batch proc pname inlst #!key (post-proc #f))

Modified ulex/ulex.scm from [123c4c1081] to [ded9484f4d].

48
49
50
51
52
53
54


55

56
57
58
59
60
61
62
     
     ;; needed to get the interface:port that was automatically found
     udat-port
     udat-host-port
     
     ;; for testing only
     ;; pp-uconn


     work-method ;; parameter; 'threads, 'mailbox, 'limited, 'direct

     )

(import scheme
	chicken.base
	chicken.file
	chicken.time
	chicken.condition







>
>
|
>







48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
     
     ;; needed to get the interface:port that was automatically found
     udat-port
     udat-host-port
     
     ;; for testing only
     ;; pp-uconn
     
     ;; parameters
     work-method   ;; parameter; 'threads, 'mailbox, 'limited, 'direct
     return-method ;; parameter; 'mailbox, 'polling, 'direct
     )

(import scheme
	chicken.base
	chicken.file
	chicken.time
	chicken.condition
102
103
104
105
106
107
108

109
110
111
112
113
114


115





116
117
118
119
120
121
122
  (work-queue-thread #f)
  (num-threads-running 0)
  ) 

;; Parameters

;; work-method:

;;    mailbox - all rdat goes through mailbox
;;    threads - all rdat immediately executed in new thread
;;    limited - run rdats in immediately executed threads until NthreadsMax
;;              reached, then put in mailbox
;;    direct  - no queuing
;;


(define work-method (make-parameter 'threads)) 





;; ;; struct for keeping track of others we are talking to
;; ;;
;; (defstruct pdat
;;   (host-port  #f)
;;   (conns      '()) ;; list of pcon structs, pop one off when calling the peer
;;   )
;; 







>


<
<


>
>
|
>
>
>
>
>







105
106
107
108
109
110
111
112
113
114


115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
  (work-queue-thread #f)
  (num-threads-running 0)
  ) 

;; Parameters

;; work-method: parameter selecting how an incoming rdat is executed.
;; Default is 'direct; callers change it with (work-method 'mailbox) etc.
(define work-method (make-parameter 'direct))
;;    mailbox - all rdat goes through mailbox
;;    threads - all rdat immediately executed in new thread


;;    direct  - no queuing
;;

;; return-method: parameter selecting how a result is returned to the
;; waiting send-receive. Default is 'direct.
(define return-method (make-parameter 'direct))
;;    mailbox - create a mailbox and use it for passing returning results to send-receive
;;    polling - put the result in a hash table keyed by qrykey and send-receive can poll it for result
;;    direct  - no queuing, result is passed back in single tcp connection
;;

;; ;; struct for keeping track of others we are talking to
;; ;;
;; (defstruct pdat
;;   (host-port  #f)
;;   (conns      '()) ;; list of pcon structs, pop one off when calling the peer
;;   )
;; 
207
208
209
210
211
212
213
214
215
216
217

218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242



243























244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268



269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295






296
297
298
299
300
301
302
303

304
305
306
307
308
309
310
311
312
313
314
315
316
317
;;        - I believe (without substantial evidence) that re-using connections will
;;          be beneficial ...
;;
;; Send one work block to host-port and return the peer's reply.
;;
;; The work block is the list (my-host-port qrykey cmd params), where
;; my-host-port comes from (udat-host-port udata). A new connection is opened
;; with tcp-connect for every call; the block is written with serialize and
;; the reply read with deserialize, then both ports are closed.
;;
;; Returns the deserialized reply, or #f when any exception is raised inside
;; the handle-exceptions form (the exception itself is discarded).
;;
;; isme is hard-wired to #f - the equal? test is disabled with a #; datum
;; comment - so the local (ulex-handler udata dat) shortcut is never taken.
(define (send udata host-port qrykey cmd params)
  (let* ((my-host-port (udat-host-port udata))          ;; remote will return to this
	 (isme         #f #;(equal? host-port my-host-port)) ;; calling myself?
	 ;; dat is a self-contained work block that can be sent or handled locally
	 (dat          (list my-host-port qrykey cmd params))
	 )
    (if isme
	(ulex-handler udata dat) ;; no transmission needed

	(handle-exceptions ;; TODO - MAKE THIS EXCEPTION CMD SPECIFIC?
	    exn
	    #f
	  (begin
	    ; (mutex-lock! *send-mutex*)
	  (let-values (((inp oup)(tcp-connect host-port)))
	    (let ((res (if (and inp oup)
			   (begin
			     (serialize dat oup)
			     (deserialize inp)) ;; yes, we always want an ack
			   (begin
			     (print "ERROR: send called but no receiver has been setup. Please call setup first!")
			     #f))))
	      (close-input-port inp)
	      (close-output-port oup)
	      ; (mutex-unlock! *send-mutex*)
	      res))))))) ;; res will always be 'ack

;; send a request to the given host-port and register a mailbox in udata
;; wait for the mailbox data and return it
;;
;; Send cmd/data to host-port and wait for the answer (pre-change version,
;; mailbox return path only).
;;
;;  - ping and goodbye are sent with the fixed qrykey 'ping and the reply
;;    from send is returned as-is.
;;  - any other cmd takes a (qrykey . mbox) pair from get-cmbox, sends the
;;    request under that qrykey and, if the short reply is 'ack, waits on the
;;    mailbox with a timeout value of 120 and 'MBOX_TIMEOUT as the timeout
;;    marker. The qrykey entry is then removed from (udat-mboxes uconn).
;;
;; Returns the value received on the mailbox, or #f after printing a message
;; when the wait timed out or the short reply was not 'ack.
;; NOTE(review): mbox-time and mbox-receive-time are bound but never read.
(define (send-receive uconn host-port cmd data)
  (cond
   ((member cmd '(ping goodbye)) ;; these are immediate
    (send uconn host-port 'ping cmd data))



   (else























    (let* ((cmbox     (get-cmbox uconn)) ;; would it be better to keep a stack of mboxes to reuse?
	   (qrykey    (car cmbox))
	   (mbox      (cdr cmbox))
	   (mbox-time (current-milliseconds))
	   (sres      (send uconn host-port qrykey cmd data))) ;; short res
      (if (eq? sres 'ack)
	  (let* ((mbox-timeout-secs    120 #;(if (eq? 'primordial (thread-name (current-thread)))
					   #f
					   120)) ;; timeout)
		 (mbox-timeout-result 'MBOX_TIMEOUT)
		 (res                  (mailbox-receive! mbox mbox-timeout-secs mbox-timeout-result))
		 (mbox-receive-time    (current-milliseconds)))
	    ;; (put-cmbox uconn cmbox) ;; reuse mbox and cookie. is it worth it?
	    (hash-table-delete! (udat-mboxes uconn) qrykey)
	    (if (eq? res 'MBOX_TIMEOUT)
		(begin
		  (print "WARNING: mbox timed out for query "cmd", with data "data", waiting for response from "host-port".")

		  ;; here it might make sense to clean up connection records and force clean start?
		  ;; NO. The progam using ulex needs to do the reset. Right thing here is exception
		  
		  #f)  ;; convert to raising exception?
		res))
	  (begin
	    (print "ERROR: Communication failed? Got "sres)



	    #f))))))

;;======================================================================
;; responder side
;;======================================================================

;; take a request, rdat, and if not immediate put it in the work queue
;;
;; Reserved cmds; ack ping goodbye response
;;
;; Dispatch one received request (pre-change version).
;;
;; rdat must be a list (asserted); it is expected to have the shape
;; (rem-host-port qrykey cmd params). The mailbox registered under qrykey in
;; (udat-mboxes uconn) is looked up before dispatching on cmd:
;;   ping     - answered with 'ack, nothing queued
;;   goodbye  - handed to add-to-work-queue, answered with 'ack
;;   response - params (the remote result) is posted to the mailbox and 'ack
;;              returned; with no mailbox for qrykey an error is printed and
;;              #f returned
;;   other    - handed to add-to-work-queue, answered with 'ack
;; An rdat of any other shape is reported and still answered with 'ack.
(define (ulex-handler uconn rdat)
  (assert (list? rdat) "FATAL: ulex-handler give rdat as not list")
  (match rdat ;;  (string-split controldat)
    ((rem-host-port qrykey cmd params)
     ;; (print "ulex-handler got: "rem-host-port" qrykey: "qrykey" cmd: "cmd" params: "params)
     (let ((mbox (hash-table-ref/default (udat-mboxes uconn) qrykey #f)))
       (case cmd
	 ;; ((ack )(print "Got ack! But why? Should NOT get here.") 'ack)
	 ((ping)
	  ;; (print "Got Ping!")
	  ;; (add-to-work-queue uconn rdat)
	 'ack)
	 ((goodbye)
	  ;; just clear out references to the caller
	  (add-to-work-queue uconn rdat)
	  'ack)
	 ((response) ;; this is a result from remote processing, send it as mail ...






	  (if mbox
	      (begin
		(mailbox-send! mbox params) ;; params here is our result
		'ack)
	      (begin
		(print "ERROR: received result but no associated mbox for cookie "qrykey)
		#f)))
	 (else

	  ;; (print "Got generic request: "cmd)
	  (add-to-work-queue uconn rdat)
	  'ack))))
    (else
     (print "BAD DATA? controldat=" rdat)
     'ack) ;; send ack anyway?
    ))

;; given an already set up uconn start the cmd-loop
;;
(define (ulex-cmd-loop uconn)
  (let* ((serv-listener (udat-socket uconn))
	 (listener      (lambda ()
			  (let loop ((state 'start))







|
<
|
|
>
|
|
|
|
|




|





|
|
|







>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|
|
>
>
>
|














<
|
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>
>
|
|
|
|
|
|
|
|
>
|
|
|

|
<
|







216
217
218
219
220
221
222
223

224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321

322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352

353
354
355
356
357
358
359
360
;;        - I believe (without substantial evidence) that re-using connections will
;;          be beneficial ...
;;
;; Send one work block to host-port and return the peer's reply.
;;
;;   udata     - connection record; (udat-host-port udata) is put in the
;;               block so the remote knows where to return results
;;   host-port - destination passed to tcp-connect
;;   qrykey    - key/cookie identifying this request
;;   cmd       - command symbol
;;   params    - command payload
;;
;; The block (my-host-port qrykey cmd params) is written with serialize over
;; a freshly opened connection and the reply read back with deserialize.
;;
;; Returns the deserialized reply ('ack, or the actual result when the direct
;; method is in use), or #f if connecting or the exchange raised an exception.
;;
;; The exchange is guarded by its own handler so that both ports are closed
;; even when serialize/deserialize fails; previously such a failure jumped
;; straight to the outer handler and left the connection open.
(define (send udata host-port qrykey cmd params)
  (let* ((my-host-port (udat-host-port udata))               ;; remote will return to this
         ;; self-call shortcut is disabled: the equal? test is datum-commented
         (isme         #f #;(equal? host-port my-host-port)) ;; calling myself?
         ;; dat is a self-contained work block that can be sent or handled locally
         (dat          (list my-host-port qrykey cmd params)))
    (cond
     (isme (ulex-handler udata dat)) ;; no transmission needed
     (else
      (handle-exceptions ;; TODO - MAKE THIS EXCEPTION CMD SPECIFIC?
          exn
          #f
        (let-values (((inp oup) (tcp-connect host-port)))
          (let ((res (if (and inp oup)
                         (handle-exceptions ;; exchange failed: fall through to the closes
                             exn
                             #f
                           (serialize dat oup)
                           (deserialize inp))
                         (begin
                           (print "ERROR: send called but no receiver has been setup. Please call setup first!")
                           #f))))
            (if inp (close-input-port inp))
            (if oup (close-output-port oup))
            res)))))))
  
;; send a request to the given host-port and register a mailbox in udata
;; wait for the mailbox data and return it
;;
;; Send cmd/data to host-port and return the result of the remote call.
;;
;;   uconn     - connection record (owner of the udat-mboxes table)
;;   host-port - peer to call
;;   cmd, data - command symbol and its payload
;;
;; Behaviour by case:
;;   ping / goodbye        - sent with qrykey 'ping; the reply from send is
;;                           returned as-is
;;   (work-method) direct  - sent with qrykey 'direct; the reply from send is
;;                           the actual result
;;   otherwise, by (return-method):
;;     polling - a cookie from make-cookie is used as qrykey; after an 'ack
;;               the udat-mboxes table is polled every 0.01 s for up to ten
;;               seconds for an entry (status . result); its cdr is returned
;;     mailbox - a (qrykey . mbox) pair from get-cmbox is used; after an 'ack
;;               the mailbox is awaited with a timeout value of 120
;;
;; Returns #f (after printing a message) on timeout, on a non-'ack reply, or
;; for an unrecognised return-method.
;;
;; In the mailbox case the qrykey entry is now removed from udat-mboxes on the
;; failure path as well; before, it was only removed after an 'ack, so every
;; failed send left an entry behind. This presumes get-cmbox registers the
;; mailbox under qrykey in that table, which is what the removal on the ack
;; path implies; deleting a missing key is harmless.
(define (send-receive uconn host-port cmd data)
  (cond
   ((member cmd '(ping goodbye)) ;; these are immediate
    (send uconn host-port 'ping cmd data))
   ((eq? (work-method) 'direct)
    ;; the result from send will be the actual result, not an 'ack
    (send uconn host-port 'direct cmd data))
   (else
    (case (return-method)
      ((polling)
       (let* ((qrykey (make-cookie uconn))
              (sres   (send uconn host-port qrykey cmd data)))
         (case sres
           ((ack)
            (let ((deadline (+ (current-milliseconds) 10000))) ;; ten seconds timeout
              (let loop ()
                (if (> (current-milliseconds) deadline)
                    (begin
                      (print "ULEX ERROR: timed out waiting for response from "host-port", "cmd" "data)
                      #f)
                    ;; NOTE: the mboxes hash is re-used to hold polled results;
                    ;; an entry is (status . result-data), #f means nothing yet
                    (let ((result (hash-table-ref/default (udat-mboxes uconn) qrykey #f)))
                      (if result
                          (begin
                            (hash-table-delete! (udat-mboxes uconn) qrykey)
                            (cdr result))
                          (begin
                            (thread-sleep! 0.01)
                            (loop))))))))
           (else
            (print "ULEX ERROR: Communication failed? sres="sres)
            #f))))
      ((mailbox)
       (let* ((cmbox  (get-cmbox uconn)) ;; would it be better to keep a stack of mboxes to reuse?
              (qrykey (car cmbox))
              (mbox   (cdr cmbox))
              (sres   (send uconn host-port qrykey cmd data))) ;; short res
         (if (eq? sres 'ack)
             (let ((res (mailbox-receive! mbox 120 'MBOX_TIMEOUT)))
               ;; (put-cmbox uconn cmbox) ;; reuse mbox and cookie. is it worth it?
               (hash-table-delete! (udat-mboxes uconn) qrykey)
               (if (eq? res 'MBOX_TIMEOUT)
                   (begin
                     (print "WARNING: mbox timed out for query "cmd", with data "data", waiting for response from "host-port".")
                     ;; Cleaning up connection records is left to the program
                     ;; using ulex; raising an exception may be the right
                     ;; thing here eventually.
                     #f)
                   res))
             (begin
               ;; no response will ever arrive for this qrykey: drop its mailbox
               (hash-table-delete! (udat-mboxes uconn) qrykey)
               (print "ERROR: Communication failed? Got "sres)
               #f))))
      (else
       (print "ULEX ERROR: unrecognised return-method "(return-method)".")
       #f)))))

;;======================================================================
;; responder side
;;======================================================================

;; take a request, rdat, and if not immediate put it in the work queue
;;
;; Reserved cmds; ack ping goodbye response
;;
;; Dispatch one received request and produce the short reply for the caller.
;;
;;   uconn - connection record
;;   rdat  - must be a list (asserted), expected shape
;;           (rem-host-port qrykey cmd params)
;;
;; Reserved cmds:
;;   ping     - answered with 'ack, nothing queued
;;   goodbye  - handed to add-to-work-queue, answered with 'ack
;;   response - a result coming back from remote processing, delivered
;;              according to (return-method):
;;                polling - stored as (ok . params) under qrykey in
;;                          udat-mboxes, reply 'ack
;;                mailbox - posted to the mailbox registered under qrykey,
;;                          reply 'ack, or 'no-mbox-found when none exists
;;                other   - reported, reply 'bad-return-method
;; Any other cmd is a generic request and goes to add-to-work-queue.
;;
;; When (work-method) is direct, add-to-work-queue runs the work inline and
;; do-work returns the result itself, and send-receive treats the reply as
;; that result. The value is therefore passed through here instead of being
;; replaced by 'ack, which previously discarded it.
;; NOTE(review): assumes the caller of ulex-handler serializes this return
;; value back to the peer - confirm against ulex-cmd-loop.
;;
;; An rdat of any other shape is reported and answered with 'bad-rdat.
(define (ulex-handler uconn rdat)
  (assert (list? rdat) "FATAL: ulex-handler give rdat as not list")
  (match rdat ;;  (string-split controldat)
    ((rem-host-port qrykey cmd params)
     (case cmd
       ((ping)
        'ack)
       ((goodbye)
        ;; just clear out references to the caller. NOT COMPLETE
        (add-to-work-queue uconn rdat)
        'ack)
       ((response) ;; this is a result from remote processing
        (case (return-method)
          ((polling)
           (hash-table-set! (udat-mboxes uconn) qrykey (cons 'ok params))
           'ack)
          ((mailbox)
           (let ((mbox (hash-table-ref/default (udat-mboxes uconn) qrykey #f)))
             (if mbox
                 (begin
                   (mailbox-send! mbox params) ;; params here is our result
                   'ack)
                 (begin
                   (print "ERROR: received result but no associated mbox for cookie "qrykey)
                   'no-mbox-found))))
          (else
           (print "ULEX ERROR: unrecognised return-method "(return-method))
           'bad-return-method)))
       (else ;; generic request - hand it to the work queue
        (let ((reply (add-to-work-queue uconn rdat)))
          (if (eq? (work-method) 'direct)
              reply   ;; direct: the work already ran, reply with its result
              'ack)))))
    (else
     (print "ULEX ERROR: bad rdat "rdat)
     'bad-rdat)))

;; given an already set up uconn start the cmd-loop
;;
(define (ulex-cmd-loop uconn)
  (let* ((serv-listener (udat-socket uconn))
	 (listener      (lambda ()
			  (let loop ((state 'start))
355
356
357
358
359
360
361
362

363
364
365
366
367
368
369
370
371



372
373
374
375
376
377
378
379
380
381
382
     (mailbox-send! (udat-work-queue uconn) rdat))
    ((direct)
     (do-work uconn rdat))
    (else
     (print "ULEX ERROR: work-method "(work-method)" not recognised, using mailbox.")
     (mailbox-send! (udat-work-queue uconn) rdat))))
     


;; Execute one queued request and send its result back (pre-change version).
;;
;; The work procedure is read from (udat-work-proc uconn) on every call. For
;; an rdat of shape (rem-host-port qrykey cmd params) it is called with those
;; four values, the elapsed time is printed, and the result is sent back to
;; rem-host-port as a 'response carrying the same qrykey.
;;
;; NOTE(review): MBOX_TIMEOUT below is unquoted. In match a bare identifier
;; is presumably a pattern variable that matches anything, which would make
;; that clause return #f for every non-matching rdat and leave the else
;; clause unreachable - confirm against the matchable documentation.
(define (do-work uconn rdat)
  (let* ((proc (udat-work-proc uconn))) ;; get it each time - conceivebly it could change
    ;; put this following into a do-work procedure
    (match rdat
      ((rem-host-port qrykey cmd params)
       (let* ((start-time (current-milliseconds))
	      (result (proc rem-host-port qrykey cmd params))
	      (end-time (current-milliseconds))
	      (run-time (- end-time start-time)))



	 (print "ULEX: work "cmd", "params" done in "run-time" ms")
	 ;; send 'response as cmd and result as params
	 (send uconn rem-host-port qrykey 'response result) ;; could check for ack
	 (print "ULEX: response sent back to "rem-host-port" in "(- (current-milliseconds) end-time))))
      (MBOX_TIMEOUT #f)
      (else
       (print "ERROR: rdat "rdat", did not match rem-host-port qrykey cmd params")))))

;; NEW APPROACH:
;;   
(define (process-work-queue uconn) 







|
>









>
>
>
|
|
|
|







398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
     (mailbox-send! (udat-work-queue uconn) rdat))
    ((direct)
     (do-work uconn rdat))
    (else
     (print "ULEX ERROR: work-method "(work-method)" not recognised, using mailbox.")
     (mailbox-send! (udat-work-queue uconn) rdat))))
     
;; move the logic to return the result somewhere else?
;;
;; Execute one request with the registered work procedure.
;;
;;   uconn - connection record; its work procedure is read with
;;           udat-work-proc on every call, since it could conceivably change
;;   rdat  - expected shape (rem-host-port qrykey cmd params)
;;
;; The work procedure is called with the four rdat fields. What happens to
;; its result depends on (work-method):
;;   direct - the result is returned to the caller
;;   other  - timing is printed and the result is sent back to rem-host-port
;;            as a 'response carrying the same qrykey
;;
;; An rdat equal to the symbol MBOX_TIMEOUT yields #f. Anything else that
;; does not match is reported. The timeout pattern is quoted: unquoted it was
;; a pattern variable that matched every value, so malformed rdat was
;; silently dropped and the error clause could never run.
(define (do-work uconn rdat)
  (let ((proc (udat-work-proc uconn)))
    (match rdat
      ((rem-host-port qrykey cmd params)
       (let* ((start-time (current-milliseconds))
              (result     (proc rem-host-port qrykey cmd params))
              (end-time   (current-milliseconds))
              (run-time   (- end-time start-time)))
         (case (work-method)
           ((direct) result)
           (else
            (print "ULEX: work "cmd", "params" done in "run-time" ms")
            ;; send 'response as cmd and result as params
            (send uconn rem-host-port qrykey 'response result) ;; could check for ack
            (print "ULEX: response sent back to "rem-host-port" in "(- (current-milliseconds) end-time))))))
      ('MBOX_TIMEOUT #f)
      (else
       (print "ERROR: rdat "rdat", did not match rem-host-port qrykey cmd params")))))

;; NEW APPROACH:
;;   
(define (process-work-queue uconn)