Megatest

Check-in [be6a92a2af]
Login
Overview
Comment:merged v1.63
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.62-rpc
Files: files | file ages | folders
SHA1: be6a92a2afc651c2557d748af478899321f28f2a
User & Date: bjbarcla on 2016-12-05 15:39:36
Other Links: branch diff | manifest | tags
Context
2016-12-05
19:38
rpc server starts now check-in: 7ce5c6cfb0 user: bjbarcla tags: v1.62-rpc
15:39
merged v1.63 check-in: be6a92a2af user: bjbarcla tags: v1.62-rpc
13:11
Bumped version to v1.6301 check-in: fbf0a07a1d user: mrwellan tags: v1.63, v1.6301
2016-12-02
14:39
grafted rpc code check-in: cc303d5f43 user: bjbarcla tags: v1.62-rpc
Changes

Modified db.scm from [fef1f3f2e3] to [b74c06eb1c].

268
269
270
271
272
273
274

275
276
277
278
279
280
281
282
283
284
285
286




287
288
289
290
291
292
293
294
(define (db:open-db dbstruct #!key (areapath #f))
  (let ((tmpdb (dbr:dbstruct-tmpdb dbstruct))) ;; RA => Returns the first reference in dbstruct
    (if tmpdb
	tmpdb
        ;; (mutex-lock! *rundb-mutex*)
        (let* ((dbpath       (db:dbfile-path)) ;;  0))
               (dbexists     (file-exists? dbpath))

               (tmpdb        (db:open-megatest-db path: dbpath)) ;; lock-create-open dbpath db:initialize-main-db))
               (mtdb         (db:open-megatest-db))
               (refndb       (db:open-megatest-db path: dbpath name: "megatest_ref.db"))
               (write-access (file-write-access? dbpath)))
          (if (and dbexists (not write-access))
              (set! *db-write-access* #f))
          (dbr:dbstruct-mtdb-set!   dbstruct mtdb)
          (dbr:dbstruct-tmpdb-set!  dbstruct tmpdb) ;; olddb is already a (cons db path)
          (dbr:dbstruct-refndb-set! dbstruct refndb)
          ;;	    (mutex-unlock! *rundb-mutex*)
          (if (and (not dbexists)
                   *db-write-access*) ;; did not have a prior db and do have write access




              (db:multi-db-sync dbstruct 'old2new))  ;; migrate data from megatest.db automatically
          tmpdb))))

;; Make the dbstruct, setup up auxillary db's and call for main db at least once
;;
;; called in http-transport and replicated in rmt.scm for *local* access. 
;;
(define (db:setup #!key (areapath #f))







>










|
|
>
>
>
>
|







268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
(define (db:open-db dbstruct #!key (areapath #f))
  (let ((tmpdb (dbr:dbstruct-tmpdb dbstruct))) ;; RA => Returns the first reference in dbstruct
    (if tmpdb
	tmpdb
        ;; (mutex-lock! *rundb-mutex*)
        (let* ((dbpath       (db:dbfile-path)) ;;  0))
               (dbexists     (file-exists? dbpath))
	       (dbfexists    (file-exists? (conc dbpath "/megatest.db")))
               (tmpdb        (db:open-megatest-db path: dbpath)) ;; lock-create-open dbpath db:initialize-main-db))
               (mtdb         (db:open-megatest-db))
               (refndb       (db:open-megatest-db path: dbpath name: "megatest_ref.db"))
               (write-access (file-write-access? dbpath)))
          (if (and dbexists (not write-access))
              (set! *db-write-access* #f))
          (dbr:dbstruct-mtdb-set!   dbstruct mtdb)
          (dbr:dbstruct-tmpdb-set!  dbstruct tmpdb) ;; olddb is already a (cons db path)
          (dbr:dbstruct-refndb-set! dbstruct refndb)
          ;;	    (mutex-unlock! *rundb-mutex*)
          (if (and (not dbfexists)
                   write-access) ;; *db-write-access*) ;; did not have a prior db and do have write access
	      (begin
		(debug:print 0 *default-log-port* "filling db " (db:dbdat-get-path tmpdb) " with data from " (db:dbdat-get-path mtdb))
		(db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb refndb tmpdb))
	      (debug:print 0 *default-log-port* " db, " (db:dbdat-get-path tmpdb) " already exists, not propogating data from " (db:dbdat-get-path mtdb)))
	  ;; (db:multi-db-sync dbstruct 'old2new))  ;; migrate data from megatest.db automatically
          tmpdb))))

;; Make the dbstruct, setup up auxillary db's and call for main db at least once
;;
;; called in http-transport and replicated in rmt.scm for *local* access. 
;;
(define (db:setup #!key (areapath #f))
3109
3110
3111
3112
3113
3114
3115
3116

3117
3118
3119
3120
3121
3122
3123
	 (with-input-from-string 
	     (z3:decode-buffer
	      (base64:base64-decode
	       (string-substitute 
		(regexp "_") "=" msg #t)))
	   (lambda ()(deserialize)))
	 (begin
	   (debug:print-error 0 *default-log-port* "reception failed. Received " msg " but cannot translate it.")

	   msg))) ;; crude reply for when things go awry
    ((zmq nmsg)(with-input-from-string msg (lambda ()(deserialize))))
    (else msg))) ;; rpc

;; This is to be the big daddy call

(define (db:test-set-status-state dbstruct run-id test-id status state msg)







|
>







3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
	 (with-input-from-string 
	     (z3:decode-buffer
	      (base64:base64-decode
	       (string-substitute 
		(regexp "_") "=" msg #t)))
	   (lambda ()(deserialize)))
	 (begin
	   (debug:print-error 0 *default-log-port* "reception failed. Received \"" msg "\" but cannot translate it.")
           (print-call-chain (current-error-port))
	   msg))) ;; crude reply for when things go awry
    ((zmq nmsg)(with-input-from-string msg (lambda ()(deserialize))))
    (else msg))) ;; rpc

;; This is to be the big daddy call

(define (db:test-set-status-state dbstruct run-id test-id status state msg)

Modified http-transport.scm from [a2d6254d62] to [4d8eecbf3a].

254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280

281
282
283
284
285
286
287

288
289
290
291
292
293
294
					     ;;; "communications failed"
					     (db:obj->string #f))
					   (with-input-from-request ;; was dat
					    fullurl 
					    (list (cons 'key "thekey")
						  (cons 'cmd cmd)
						  (cons 'params sparams))
					    read-string)
                                           transport: 'http))
                                         0)) ;; added this speculatively
			      ;; Shouldn't this be a call to the managed call-all-connections stuff above?
			      (close-all-connections!)
			      (mutex-unlock! *http-mutex*)
			      ))
	      (time-out     (lambda ()
			      (thread-sleep! 45)
			      #f))
	      (th1 (make-thread send-recieve "with-input-from-request"))
	      (th2 (make-thread time-out     "time out")))
	 (thread-start! th1)
	 (thread-start! th2)
	 (thread-join! th1)
	 (thread-terminate! th2)
	 (debug:print-info 11 *default-log-port* "got res=" res)
	 (if (vector? res)
	     (if (vector-ref res 0)
		 res

		 (begin ;; note: this code also called in nmsg-transport - consider consolidating it
		   (debug:print-error 0 *default-log-port* "error occured at server, info=" (vector-ref res 2))
		   (debug:print 0 *default-log-port* " client call chain:")
		   (print-call-chain (current-error-port))
		   (debug:print 0 *default-log-port* " server call chain:")
		   (pp (vector-ref res 1) (current-error-port))
		   (signal (vector-ref res 0))))

	     (signal (make-composite-condition
		      (make-property-condition 
		       'timeout
		       'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))))

;; careful closing of connections stored in *runremote*
;;







|
|


















>
|
|
|
|
|
|
|
>







254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
					     ;;; "communications failed"
					     (db:obj->string #f))
					   (with-input-from-request ;; was dat
					    fullurl 
					    (list (cons 'key "thekey")
						  (cons 'cmd cmd)
						  (cons 'params sparams))
					    read-string))
					  transport: 'http)
                                         0)) ;; added this speculatively
			      ;; Shouldn't this be a call to the managed call-all-connections stuff above?
			      (close-all-connections!)
			      (mutex-unlock! *http-mutex*)
			      ))
	      (time-out     (lambda ()
			      (thread-sleep! 45)
			      #f))
	      (th1 (make-thread send-recieve "with-input-from-request"))
	      (th2 (make-thread time-out     "time out")))
	 (thread-start! th1)
	 (thread-start! th2)
	 (thread-join! th1)
	 (thread-terminate! th2)
	 (debug:print-info 11 *default-log-port* "got res=" res)
	 (if (vector? res)
	     (if (vector-ref res 0)
		 res
                 (if (debug:debug-mode 11)
                     (begin ;; note: this code also called in nmsg-transport - consider consolidating it
                       (debug:print-error 11 *default-log-port* "error occured at server, info=" (vector-ref res 2))
                       (debug:print 11 *default-log-port* " client call chain:")
                       (print-call-chain (current-error-port))
                       (debug:print 11 *default-log-port* " server call chain:")
                       (pp (vector-ref res 1) (current-error-port))
                       (signal (vector-ref res 0)))
                     res))
	     (signal (make-composite-condition
		      (make-property-condition 
		       'timeout
		       'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))))

;; careful closing of connections stored in *runremote*
;;
529
530
531
532
533
534
535

536
537
538
539
540
541
542
543
    (if (args:get-arg "-daemonize")
	(begin
	  (daemon:ize)
	  (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
	      (begin
		(current-error-port *alt-log-file*)
		(current-output-port *alt-log-file*)))))

    (if (server:check-if-running run-id)
	(begin
	  (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running")
	  (exit 0))
	(begin ;; ok, no server detected, clean out any lingering records
	   (tasks:server-force-clean-running-records-for-run-id  (db:delay-if-busy tdbdat) run-id "notresponding")))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
	       (remtries  4))







>
|







531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
    (if (args:get-arg "-daemonize")
	(begin
	  (daemon:ize)
	  (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
	      (begin
		(current-error-port *alt-log-file*)
		(current-output-port *alt-log-file*)))))
    (if (and (server:read-dotserver *toppath*)
             (server:check-if-running run-id))
	(begin
	  (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running")
	  (exit 0))
	(begin ;; ok, no server detected, clean out any lingering records
	   (tasks:server-force-clean-running-records-for-run-id  (db:delay-if-busy tdbdat) run-id "notresponding")))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
	       (remtries  4))
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
	    (thread-start! th2)
	    (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor.
	    (thread-start! th3)
	    (set! *didsomething* #t)
	    (thread-join! th2)
	    (exit))))))

(define (http:ping run-id host-port)
  (let* ((server-dat (http-transport:client-connect (car host-port)(cadr host-port)))
	 (login-res  (rmt:login-no-auto-client-setup server-dat run-id)))
    (if (and (list? login-res)
	     (car login-res))
	(begin
	  (print "LOGIN_OK")
	  (exit 0))
	(begin
	  (print "LOGIN_FAILED")
	  (exit 1)))))

(define (http-transport:server-signal-handler signum)
  (signal-mask! signum)
  (handle-exceptions
   exn
   (debug:print 0 *default-log-port* " ... exiting ...")
   (let ((th1 (make-thread (lambda ()







|
|
|
|
|
|
|
|
|
|
|







571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
	    (thread-start! th2)
	    (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor.
	    (thread-start! th3)
	    (set! *didsomething* #t)
	    (thread-join! th2)
	    (exit))))))

;; (define (http:ping run-id host-port)
;;   (let* ((server-dat (http-transport:client-connect (car host-port)(cadr host-port)))
;; 	 (login-res  (rmt:login-no-auto-client-setup server-dat run-id)))
;;     (if (and (list? login-res)
;; 	     (car login-res))
;; 	(begin
;; 	  (print "LOGIN_OK")
;; 	  (exit 0))
;; 	(begin
;; 	  (print "LOGIN_FAILED")
;; 	  (exit 1)))))

(define (http-transport:server-signal-handler signum)
  (signal-mask! signum)
  (handle-exceptions
   exn
   (debug:print 0 *default-log-port* " ... exiting ...")
   (let ((th1 (make-thread (lambda ()

Modified megatest-version.scm from [a6ad525294] to [b4a5a2c435].

1
2
3
4
5
6
7
;; Always use two or four digit decimal
;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..

(declare (unit megatest-version))

(define megatest-version 1.6208)






|

1
2
3
4
5
6
7
;; Always use two or four digit decimal
;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..

(declare (unit megatest-version))

(define megatest-version 1.6301)

Modified rmt.scm from [aa4e7565c6] to [4dbba4c6fb].

25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

;; generate entries for ~/.megatestrc with the following
;;
;;  grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u

(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        (if *toppath* (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f)
  (transport         *transport-type*)
  (server-timeout    (or (server:get-timeout) 100))) ;; default to 100 seconds

;;======================================================================
;;  S U P P O R T   F U N C T I O N S







|







25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

;; generate entries for ~/.megatestrc with the following
;;
;;  grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u

(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        (if *toppath* (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f)
  (transport         *transport-type*)
  (server-timeout    (or (server:get-timeout) 100))) ;; default to 100 seconds

;;======================================================================
;;  S U P P O R T   F U N C T I O N S
65
66
67
68
69
70
71









72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96






97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
  ;;
  (let* ((start-time (current-seconds))) ;; snapshot time so all use cases get same value
    (cond
     ;; give up if more than 15 attempts
     ((> attemptnum 15)
      (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.")
      (exit 1))









     ;; ensure we have a record for our connection for given area
     ((not *runremote*)                     
      (set! *runremote* (make-remote))
      (mutex-unlock! *rmt-mutex*)
      (print "case 1")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a homehost record
     ((not (pair? (remote-hh-dat *runremote*)))  ;; have a homehost record?
      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
      (remote-hh-dat-set! *runremote* (common:get-homehost))
      (mutex-unlock! *rmt-mutex*)
      (print "case 2")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; on homehost and this is a read
     ((and (cdr (remote-hh-dat *runremote*))   ;; on homehost
           (member cmd api:read-only-queries)) ;; this is a read
      (mutex-unlock! *rmt-mutex*)
      (print "case 3")
      (rmt:open-qry-close-locally cmd 0 params))
     ;; on homehost and this is a write, we already have a server
     ((and (cdr (remote-hh-dat *runremote*))         ;; on homehost
           (not (member cmd api:read-only-queries))  ;; this is a write
           (remote-server-url *runremote*))          ;; have a server
      (mutex-unlock! *rmt-mutex*)
      (print "case 4")






      (rmt:open-qry-close-locally cmd 0 params))
     ;; no server contact made and this is a write, passively start a server 
     ((and (not (remote-server-url *runremote*))
	   (not (member cmd api:read-only-queries)))
      (print "case 5")
      (let ((serverconn (server:check-if-running *toppath*)))
	(if serverconn
	    (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed
	    (if (not (server:start-attempted? *toppath*))
		(server:kind-run *toppath*))))
      (if (cdr (remote-hh-dat *runremote*)) ;; we are on the homehost, just do the call
          (begin
            (mutex-unlock! *rmt-mutex*)
	    (print "case 5.1")
            (rmt:open-qry-close-locally cmd 0 params))
          (begin
            (mutex-unlock! *rmt-mutex*)
	    (print "case 5.2")
	    (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
            (rmt:send-receive cmd rid params attemptnum: attemptnum))))
     ;; if not on homehost ensure we have a connection to a live server
     ;; NOTE: we *have* a homehost record by now
     ((and (not (cdr (remote-hh-dat *runremote*)))        ;; are we on a homehost?
           (not (remote-conndat *runremote*)))            ;; and no connection
      (print "case 6  hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*))
      (mutex-unlock! *rmt-mutex*)
      (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
      (remote-conndat-set! *runremote* (rmt:get-connection-info 0))
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; all set up if get this far, dispatch the query
     ((cdr (remote-hh-dat *runremote*)) ;; we are on homehost
      (mutex-unlock! *rmt-mutex*)
      (print "case 7")
      (rmt:open-qry-close-locally cmd (if rid rid 0) params))
     ;; reset the connection if it has been unused too long
     ((and (remote-conndat *runremote*)
	   (let ((expire-time (- start-time (remote-server-timeout *runremote*))))
	     (< (http-transport:server-dat-get-last-access (remote-conndat *runremote*)) expire-time)))
      (print "case 8")
      (remote-conndat-set! *runremote* #f))
     ;; not on homehost, do server query
     (else
      (mutex-unlock! *rmt-mutex*)
      (print "case 9")
      (let* ((conninfo (remote-conndat *runremote*))
	     (dat      (case (remote-transport *runremote*)
			 ((http) (condition-case ;; handling here has caused a lot of problems. However it is needed to deal with attemtped communication to servers that have gone away
                                  (http-transport:client-api-send-receive 0 conninfo cmd params)
                                  ((commfail)(vector #f "communications fail"))
                                  ((exn)(vector #f "other fail" (print-call-chain)))))
                         ((rpc) (condition-case ;; handling here has caused a lot of problems. However it is needed to deal with attemtped communication to servers that have gone away
                                  (rpc-transport:client-api-send-receive 0 conninfo cmd params)
                                  ((commfail)(vector #f "communications fail"))
                                  ((exn)(vector #f "other fail" (print-call-chain)))))
			 (else
			  (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported")
			  (exit))))
	     (success  (if (vector? dat) (vector-ref dat 0) #f))
	     (res      (if (vector? dat) (vector-ref dat 1) #f)))
	(if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time
        (print "case 9. conninfo=" conninfo " dat=" dat)
	(if success
	    (case (remote-transport *runremote*)
	      ((http rpc) res)
	      (else
	       (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " is unknown")
	       (exit 1)))
	    (begin
	      (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
	      (remote-conndat-set!    *runremote* #f)
	      (remote-server-url-set! *runremote* #f)
              (print "case 9.1")
	      (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
	      (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))))))))

(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions
   exn







>
>
>
>
>
>
>
>
>




|






|





|






|
>
>
>
>
>
>




|
|







|

|

|






|


|




|

<
<
<
<
<
<



|
















|










|







65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145






146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
  ;;
  (let* ((start-time (current-seconds))) ;; snapshot time so all use cases get same value
    (cond
     ;; give up if more than 15 attempts
     ((> attemptnum 15)
      (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.")
      (exit 1))
     ;; reset the connection if it has been unused too long
     ((and *runremote*
           (remote-conndat *runremote*)
	   (let ((expire-time (- start-time (remote-server-timeout *runremote*))))
	     (< (http-transport:server-dat-get-last-access (remote-conndat *runremote*)) expire-time)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
      (remote-conndat-set! *runremote* #f)
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a record for our connection for given area
     ((not *runremote*)                     
      (set! *runremote* (make-remote))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  1")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; ensure we have a homehost record
     ((not (pair? (remote-hh-dat *runremote*)))  ;; have a homehost record?
      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
      (remote-hh-dat-set! *runremote* (common:get-homehost))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  2")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; on homehost and this is a read
     ((and (cdr (remote-hh-dat *runremote*))   ;; on homehost
           (member cmd api:read-only-queries)) ;; this is a read
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  3")
      (rmt:open-qry-close-locally cmd 0 params))
     ;; on homehost and this is a write, we already have a server
     ((and (cdr (remote-hh-dat *runremote*))         ;; on homehost
           (not (member cmd api:read-only-queries))  ;; this is a write
           (remote-server-url *runremote*))          ;; have a server
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4")
      (rmt:open-qry-close-locally cmd 0 params))
     ;; on homehost and this is a write, we have a server (we know because case 4 checked)
     ((and (cdr (remote-hh-dat *runremote*))         ;; on homehost
	   (not (member cmd api:read-only-queries)))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
      (rmt:open-qry-close-locally cmd 0 params))
     ;; no server contact made and this is a write, passively start a server 
     ((and (not (remote-server-url *runremote*))
	   (not (member cmd api:read-only-queries)))
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
      (let ((serverconn (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
	(if serverconn
	    (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed
	    (if (not (server:start-attempted? *toppath*))
		(server:kind-run *toppath*))))
      (if (cdr (remote-hh-dat *runremote*)) ;; we are on the homehost, just do the call
          (begin
            (mutex-unlock! *rmt-mutex*)
	    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.1")
            (rmt:open-qry-close-locally cmd 0 params))
          (begin                            ;; not on homehost, start server and wait
            (mutex-unlock! *rmt-mutex*)
	    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5.2")
	    (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
            (rmt:send-receive cmd rid params attemptnum: attemptnum))))
     ;; if not on homehost ensure we have a connection to a live server
     ;; NOTE: we *have* a homehost record by now
     ((and (not (cdr (remote-hh-dat *runremote*)))        ;; are we on a homehost?
           (not (remote-conndat *runremote*)))            ;; and no connection
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6  hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*))
      (mutex-unlock! *rmt-mutex*)
      (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
      (remote-conndat-set! *runremote* (rmt:get-connection-info 0)) ;; calls client:setup which calls client:setup-http
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     ;; all set up if get this far, dispatch the query
     ((cdr (remote-hh-dat *runremote*)) ;; we are on homehost
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  7")
      (rmt:open-qry-close-locally cmd (if rid rid 0) params))






     ;; not on homehost, do server query
     (else
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9")
      (let* ((conninfo (remote-conndat *runremote*))
	     (dat      (case (remote-transport *runremote*)
			 ((http) (condition-case ;; handling here has caused a lot of problems. However it is needed to deal with attemtped communication to servers that have gone away
                                  (http-transport:client-api-send-receive 0 conninfo cmd params)
                                  ((commfail)(vector #f "communications fail"))
                                  ((exn)(vector #f "other fail" (print-call-chain)))))
                         ((rpc) (condition-case ;; handling here has caused a lot of problems. However it is needed to deal with attemtped communication to servers that have gone away
                                  (rpc-transport:client-api-send-receive 0 conninfo cmd params)
                                  ((commfail)(vector #f "communications fail"))
                                  ((exn)(vector #f "other fail" (print-call-chain)))))
			 (else
			  (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported")
			  (exit))))
	     (success  (if (vector? dat) (vector-ref dat 0) #f))
	     (res      (if (vector? dat) (vector-ref dat 1) #f)))
	(if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time
        (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9. conninfo=" conninfo " dat=" dat)
	(if success
	    (case (remote-transport *runremote*)
	      ((http rpc) res)
	      (else
	       (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " is unknown")
	       (exit 1)))
	    (begin
	      (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
	      (remote-conndat-set!    *runremote* #f)
	      (remote-server-url-set! *runremote* #f)
              (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9.1")
	      (tasks:start-and-wait-for-server (tasks:open-db) 0 15)
	      (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))))))))

(define (rmt:update-db-stats run-id rawcmd params duration)
  (mutex-lock! *db-stats-mutex*)
  (handle-exceptions
   exn
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
                    (case (remote-transport *runremote*)
                      ((http) (http-transport:client-api-send-receive run-id connection-info cmd params))
                      ((rpc) (rpc-transport:client-api-send-receive run-id connection-info cmd params))
                      (else
			  (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported (2)")
			  (exit))

                    )))
    (if (and res (vector-ref res 0))
	(vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!!
	#f)))

;; ;; Wrap json library for strings (why the ports crap in the first place?)
;; (define (rmt:dat->json-str dat)
;;   (with-output-to-string 







|







276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
                    (case (remote-transport *runremote*)
                      ((http) (http-transport:client-api-send-receive run-id connection-info cmd params))
                      ((rpc) (rpc-transport:client-api-send-receive run-id connection-info cmd params))
                      (else
			  (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported (2)")
			  (exit))

                    ))))
    (if (and res (vector-ref res 0))
	(vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!!
	#f)))

;; ;; Wrap json library for strings (why the ports crap in the first place?)
;; (define (rmt:dat->json-str dat)
;;   (with-output-to-string 

Modified server.scm from [43afd79d02] to [3b30831198].

238
239
240
241
242
243
244


245
246
247
248
249
250
251

252
253
254
255

256
257
258
259

260
261

262
263
264
265
266
267
268

;; called in megatest.scm, host-port is string hostname:port
;;
;; NOTE: This is NOT called directly from clients as not all transports support a client running
;;       in the same process as the server.
;;
(define (server:ping host-port-in #!key (do-exit #f))


  (let ((host:port (if (number? host-port-in) ;; we were handed a server-id
		       (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in)))
			 ;; (print "srec: " srec " host-port-in: " host-port-in)
			 (if srec
			     (conc (vector-ref srec 3) ":" (vector-ref srec 4))
			     (conc "no such server-id " host-port-in)))
		       host-port-in)))

    (let* ((host-port (let ((slst (string-split   host:port ":")))
			(if (eq? (length slst) 2)
			    (list (car slst)(string->number (cadr slst)))
			    #f)))

	   (toppath       (launch:setup)))
      ;; (print "host-port=" host-port)
      (if (not host-port)
	  (begin

	    (debug:print 0 *default-log-port*  "ERROR: bad host:port")
	    (if do-exit (exit 1)))

	  (let* ((iface      (car host-port))
		 (port       (cadr host-port))
		 (server-dat
                  (case (remote-transport *runremote*)
                    ((http) (http-transport:client-connect iface port))
                    ((rpc) (rpc-transport:client-connect iface port))
                    (else







>
>
|
|
|
|
|
|
|
>
|
|
|
|
>




>
|
|
>







238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274

;; called in megatest.scm, host-port is string hostname:port
;;
;; NOTE: This is NOT called directly from clients as not all transports support a client running
;;       in the same process as the server.
;;
(define (server:ping host-port-in #!key (do-exit #f))
  (let ((host:port (if (not host-port-in) ;; use read-dotserver to find
		       (server:read-dotserver *toppath*)
		       (if (number? host-port-in) ;; we were handed a server-id
			   (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in)))
			     ;; (print "srec: " srec " host-port-in: " host-port-in)
			     (if srec
				 (conc (vector-ref srec 3) ":" (vector-ref srec 4))
				 (conc "no such server-id " host-port-in)))
			   host-port-in))))
    (let* ((host-port (if host:port
			  (let ((slst (string-split   host:port ":")))
			    (if (eq? (length slst) 2)
				(list (car slst)(string->number (cadr slst)))
				#f))
			  #f))
	   (toppath       (launch:setup)))
      ;; (print "host-port=" host-port)
      (if (not host-port)
	  (begin
	    (if host-port-in
		(debug:print 0 *default-log-port*  "ERROR: bad host:port"))
	    (if do-exit (exit 1))
	    #f)
	  (let* ((iface      (car host-port))
		 (port       (cadr host-port))
		 (server-dat
                  (case (remote-transport *runremote*)
                    ((http) (http-transport:client-connect iface port))
                    ((rpc) (rpc-transport:client-connect iface port))
                    (else

Modified tasks.scm from [285ca22d6e] to [b8a3c2af2e].

402
403
404
405
406
407
408


409


410
411
412
413
414
415
416
	       (< delay-time delay-max-tries))
	  (begin
	    (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)
		(debug:print 0 *default-log-port* "Try starting server for run-id " run-id))
	    (thread-sleep! (/ (random 2000) 1000))
	    (server:kind-run run-id)
	    (thread-sleep! (min delay-time 1))


	    (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1))))))



(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row
     (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id)
       ;;                       0  1     2         3      4     5          6        7     8          9          10        11     12
       (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res)))







>
>
|
>
>







402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
	       (< delay-time delay-max-tries))
	  (begin
	    (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)
		(debug:print 0 *default-log-port* "Try starting server for run-id " run-id))
	    (thread-sleep! (/ (random 2000) 1000))
	    (server:kind-run run-id)
	    (thread-sleep! (min delay-time 1))
            (if (not (or (server:start-attempted? *toppath*)
                         (server:read-dotserver *toppath*))) ;; no point in trying
                (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1))
                #f))
          #f)))

(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row
     (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id)
       ;;                       0  1     2         3      4     5          6        7     8          9          10        11     12
       (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res)))

Modified tests/fullrun/megatest.config from [c5f81555e0] to [353b25ebc0].

247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
                         ((none) "nbfake") \
                         ((openlava) "bsub -o $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.log") \
                         ((sleeprunner) "sleeprunner") \
                         (else "nbfake"))}

# launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log 

# launcher #{shell if which bsub > /dev/null;then echo bsub -q priority -o openlava.log;else echo sleeprunner;fi}
# launcher nbfake

[configf:settings trim-trailing-spaces yes]

# Override the rollup for specific tests
[testrollup]
runfirst ls







|







247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
                         ((none) "nbfake") \
                         ((openlava) "bsub -o $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.log") \
                         ((sleeprunner) "sleeprunner") \
                         (else "nbfake"))}

# launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log 

# launcher #{ shell if which bsub > /dev/null;then echo bsub -q priority -o openlava.log;else echo sleeprunner;fi}
# launcher nbfake

[configf:settings trim-trailing-spaces yes]

# Override the rollup for specific tests
[testrollup]
runfirst ls