Megatest

Check-in [1d98834276]
Login
Overview
Comment:Removed references to homehost
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.6584-ck5
Files: files | file ages | folders
SHA1: 1d9883427627a2e998a5824ca303b7ea6ac308b2
User & Date: matt on 2021-04-25 05:14:25
Other Links: branch diff | manifest | tags
Context
2021-04-25
05:34
Added area key check-in: f160cf8d52 user: matt tags: v1.6584-ck5
05:14
Removed references to homehost check-in: 1d98834276 user: matt tags: v1.6584-ck5
2021-04-23
23:57
Basic initialization of dbstruct works. check-in: 1b388397ae user: matt tags: v1.6584-ck5
Changes

Modified archivemod.scm from [a7eb8f34d2] to [fd3e9e2a92].

51
52
53
54
55
56
57

58
59
60
61
62
63
64
65
	chicken.process
	chicken.process-context
	chicken.process-context.posix
	chicken.sort
	chicken.string
	chicken.time
	chicken.time.posix


	(prefix base64 base64:)
;; 	csv-xml
	directory-utils
	matchable
	regex
	s11n
	srfi-1







>
|







51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
	chicken.process
	chicken.process-context
	chicken.process-context.posix
	chicken.sort
	chicken.string
	chicken.time
	chicken.time.posix
	system-information
	
	(prefix base64 base64:)
;; 	csv-xml
	directory-utils
	matchable
	regex
	s11n
	srfi-1
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
        (archiver       (let ((s (configf:lookup *configdat* "archive" "archiver")))
			   (if s (string->symbol s) 'bup)))
        (rsync-exe (or (configf:lookup *configdat* "archive" "rsync") "rsync"))   
        (print-prefix      "Running: ") 
        (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target-patt run-patt "megatest-db"))
	(archive-dir  (if archive-info (cdr archive-info) #f))
	(archive-id   (if archive-info (car archive-info) -1))
        (home-host (common:get-homehost))
        (archive-time (seconds->std-time-str (current-seconds)))
        (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
        (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
        (dbfile             (conc  archive-staging-db "/megatest.db"))) 
        (create-directory archive-staging-db #t)
        (let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix)))
            (if (eq? exit-code 0)   







|







416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
        (archiver       (let ((s (configf:lookup *configdat* "archive" "archiver")))
			   (if s (string->symbol s) 'bup)))
        (rsync-exe (or (configf:lookup *configdat* "archive" "rsync") "rsync"))   
        (print-prefix      "Running: ") 
        (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target-patt run-patt "megatest-db"))
	(archive-dir  (if archive-info (cdr archive-info) #f))
	(archive-id   (if archive-info (car archive-info) -1))
   (home-host (get-host-name)) ;; common:get-homehost)) ;; TODO: Fix this.
        (archive-time (seconds->std-time-str (current-seconds)))
        (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
        (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
        (dbfile             (conc  archive-staging-db "/megatest.db"))) 
        (create-directory archive-staging-db #t)
        (let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix)))
            (if (eq? exit-code 0)   

Modified launchmod.scm from [032f2a94bb] to [6c6ba3f325].

369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
	       (ezsteps   (assoc/default 'ezsteps   cmdinfo))
	       (subrun    (assoc/default 'subrun    cmdinfo))
	       ;; (runremote (assoc/default 'runremote cmdinfo))
	       ;; (transport (assoc/default 'transport cmdinfo))  ;; not used
	       ;; (serverinf (assoc/default 'serverinf cmdinfo))
	       ;; (port      (assoc/default 'port      cmdinfo))
	       (serverurl (assoc/default 'serverurl cmdinfo))
	       (homehost  (assoc/default 'homehost  cmdinfo))
	       (run-id    (assoc/default 'run-id    cmdinfo))
	       (test-id   (assoc/default 'test-id   cmdinfo))
	       (target    (assoc/default 'target    cmdinfo))
	       (areaname  (assoc/default 'areaname  cmdinfo))
	       (itemdat   (assoc/default 'itemdat   cmdinfo))
	       (env-ovrd  (assoc/default 'env-ovrd  cmdinfo))
	       (set-vars  (assoc/default 'set-vars  cmdinfo)) ;; pre-overrides from -setvar







|







369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
	       (ezsteps   (assoc/default 'ezsteps   cmdinfo))
	       (subrun    (assoc/default 'subrun    cmdinfo))
	       ;; (runremote (assoc/default 'runremote cmdinfo))
	       ;; (transport (assoc/default 'transport cmdinfo))  ;; not used
	       ;; (serverinf (assoc/default 'serverinf cmdinfo))
	       ;; (port      (assoc/default 'port      cmdinfo))
	       (serverurl (assoc/default 'serverurl cmdinfo))
	       ;; (homehost  (assoc/default 'homehost  cmdinfo))
	       (run-id    (assoc/default 'run-id    cmdinfo))
	       (test-id   (assoc/default 'test-id   cmdinfo))
	       (target    (assoc/default 'target    cmdinfo))
	       (areaname  (assoc/default 'areaname  cmdinfo))
	       (itemdat   (assoc/default 'itemdat   cmdinfo))
	       (env-ovrd  (assoc/default 'env-ovrd  cmdinfo))
	       (set-vars  (assoc/default 'set-vars  cmdinfo)) ;; pre-overrides from -setvar
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
      (set! cmdparms (base64:base64-encode 
		      (z3:encode-buffer 
		       (with-output-to-string
			 (lambda () ;; (list 'hosts     hosts)
			   (write (list (list 'testpath  test-path)
					;; (list 'transport (conc *transport-type*))
					;; (list 'serverinf *server-info*)
					(list 'homehost  (let* ((hhdat (common:get-homehost)))
							   (if hhdat
							       (car hhdat)
							       #f)))
					(list 'serverurl (if *runremote*
							     (remote-server-url *runremote*)
							     #f)) ;;
					(list 'areaname  (common:get-area-name))







|







1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
      (set! cmdparms (base64:base64-encode 
		      (z3:encode-buffer 
		       (with-output-to-string
			 (lambda () ;; (list 'hosts     hosts)
			   (write (list (list 'testpath  test-path)
					;; (list 'transport (conc *transport-type*))
					;; (list 'serverinf *server-info*)
					#;(list 'homehost  (let* ((hhdat (common:get-homehost)))
							   (if hhdat
							       (car hhdat)
							       #f)))
					(list 'serverurl (if *runremote*
							     (remote-server-url *runremote*)
							     #f)) ;;
					(list 'areaname  (common:get-area-name))
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
		)) ;; we can safely cache megatest.config since we have a valid runconfig
	  data))))

;;======================================================================
;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
(define (common:watchdog)
  (debug:print-info 13 *default-log-port* "common:watchdog entered.")
  (if (launch:setup)
      (if (common:on-homehost?)
	  (let ((dbstruct (db:setup #t)))
	    (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct)
	    (cond
	     ((dbr:dbstruct-read-only dbstruct)
	      (debug:print-info 13 *default-log-port* "loading read-only watchdog")
	      (common:readonly-watchdog dbstruct))







|







2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
		)) ;; we can safely cache megatest.config since we have a valid runconfig
	  data))))

;;======================================================================
;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
(define (common:watchdog)
  (debug:print-info 13 *default-log-port* "common:watchdog entered.")
  #;(if (launch:setup)
      (if (common:on-homehost?)
	  (let ((dbstruct (db:setup #t)))
	    (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct)
	    (cond
	     ((dbr:dbstruct-read-only dbstruct)
	      (debug:print-info 13 *default-log-port* "loading read-only watchdog")
	      (common:readonly-watchdog dbstruct))

Modified megatest.scm from [ef10051c2f] to [58763bc3e0].

880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
     ;; for some switches always print the command to stderr
     ;;
     (if (args:any-defined? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun")
         (debug:print 0 *default-log-port* (string-intersperse (argv) " ")))
     
     ;; some switches imply homehost. Exit here if not on homehost
     ;;
     (let ((homehost-required  (list "-cleanup-db" "-server")))
       (if (apply args:any-defined? homehost-required)
           (if (not (common:on-homehost?))
     	  (for-each
     	   (lambda (switch)
     	     (if (args:get-arg switch)
     		 (begin
     		   (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch







|







880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
     ;; for some switches always print the command to stderr
     ;;
     (if (args:any-defined? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun")
         (debug:print 0 *default-log-port* (string-intersperse (argv) " ")))
     
     ;; some switches imply homehost. Exit here if not on homehost
     ;;
     #;(let ((homehost-required  (list "-cleanup-db" "-server")))
       (if (apply args:any-defined? homehost-required)
           (if (not (common:on-homehost?))
     	  (for-each
     	   (lambda (switch)
     	     (if (args:get-arg switch)
     		 (begin
     		   (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
         (exit 0)))
     
     (if (or (getenv "MT_RUNSCRIPT")
     	(args:get-arg "-repl")
     	(args:get-arg "-load"))
         (let* ((toppath (launch:setup))
     	   (dbstruct (if (and toppath
			      (common:on-homehost?))
			 (db:setup #f)
			 #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f)))
           (if *toppath*
     	  (cond
     	   ((getenv "MT_RUNSCRIPT")
     	    ;; How to run megatest scripts
     	    ;;







|







2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
         (exit 0)))
     
     (if (or (getenv "MT_RUNSCRIPT")
     	(args:get-arg "-repl")
     	(args:get-arg "-load"))
         (let* ((toppath (launch:setup))
     	   (dbstruct (if (and toppath
			      #;(common:on-homehost?))
			 (db:setup #f)
			 #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f)))
           (if *toppath*
     	  (cond
     	   ((getenv "MT_RUNSCRIPT")
     	    ;; How to run megatest scripts
     	    ;;

Modified rmtmod.scm from [06da62af7d] to [7416ecbdb6].

149
150
151
152
153
154
155
156


157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234

235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
		  (client:setup areapath)
		  #f))))

;;======================================================================

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected



  #;(common:telemetry-log (conc "rmt:"(->string cmd))
                        payload: `((rid . ,rid)
                                   (params . ,params)))
                          
  (if (> attemptnum 2)
      (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
    
  (cond
   ((> attemptnum 2) (thread-sleep! 0.053))
   ((> attemptnum 10) (thread-sleep! 0.5))
   ((> attemptnum 20) (thread-sleep! 1)))
  (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15)))  
    (begin (server:run *toppath*) (thread-sleep! 3))) 
  
  
  ;;DOT digraph megatest_state_status {
  ;;DOT   ranksep=0;
  ;;DOT   // rankdir=LR;
  ;;DOT   node [shape="box"];
  ;;DOT "rmt:send-receive" -> MUTEXLOCK;
  ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; }
  ;; do all the prep locked under the rmt-mutex
  (mutex-lock! *rmt-mutex*)
  
  ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
  ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
  ;; 3. do the query, if on homehost use local access
  ;;
  (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
         (areapath      *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
	 (runremote     (or area-dat
			    *runremote*))
         (attemptnum    (+ 1 attemptnum))
	 (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*)))

    ;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity
    ;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"];
    ;; DOT INIT_RUNREMOTE -> MUTEXLOCK;
    ;; ensure we have a record for our connection for given area
    (if (not runremote)                   ;; can remove this one. should never get here.         
	(begin
	  (set! *runremote* (make-and-init-remote))
          (let* ((server-info (remote-server-info *runremote*))) 
            (if server-info
		(begin
			(remote-server-url-set! *runremote* (server:record->url server-info))
			(remote-server-id-set! *runremote* (server:record->id server-info)))))  
	  (set! runremote   *runremote*))) ;; new runremote will come from this on next iteration
    
    ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
    ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
    ;; DOT SET_HOMEHOST -> MUTEXLOCK;
    ;; ensure we have a homehost record
    (if (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
	(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
	(remote-hh-dat-set! runremote (common:get-homehost)))
    
    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
    (cond
     ;;DOT EXIT;
     ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" }
     ;; give up if more than 150 attempts
     ((> attemptnum 150)
      (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.")
      (exit 1))

     ;;DOT CASE2 [label="local\nreadonly\nquery"];
     ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2}
     ;;DOT CASE2 -> "rmt:open-qry-close-locally";
     ;; readonly mode, read request-  handle it - case 2
     ((and readonly-mode
           (member cmd api:read-only-queries)) 
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2")
      (rmt:open-qry-close-locally cmd 0 params)
      )


     ;;DOT CASE3 [label="write in\nread-only mode"];
     ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3}
     ;;DOT CASE3 -> "#f";
     ;; readonly mode, write request.  Do nothing, return #f
     (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params))

     ;; This block was for pre-emptively resetting the connection if there had been no communication for some time.
     ;; I don't think it adds any value. If the server is not there, just fail and start a new connection.
     ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout)
     ;;
     ;;DOT CASE4 [label="reset\nconnection"];
     ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4}
     ;;DOT CASE4 -> "rmt:send-receive";
     ;; reset the connection if it has been unused too long
     ((and runremote
           (remote-conndat runremote)
	   (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on
	      (+ (http-transport:server-dat-get-last-access (remote-conndat runremote))
		 (remote-server-timeout runremote))))
      (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.")
      (http-transport:close-connections area-dat: runremote)
      (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
      (mutex-unlock! *rmt-mutex*)
      (rmt:send-receive cmd rid params attemptnum: attemptnum))
     
     ;;DOT CASE5 [label="local\nread"];
     ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5};
     ;;DOT CASE5 -> "rmt:open-qry-close-locally";

     ;; on homehost and this is a read
     ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
	   (cdr (remote-hh-dat runremote))       ;; on homehost
           (member cmd api:read-only-queries))   ;; this is a read
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
      (rmt:open-qry-close-locally cmd 0 params))

     ;;DOT CASE6 [label="init\nremote"];
     ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6};
     ;;DOT CASE6 -> "rmt:send-receive";
     ;; on homehost and this is a write, we already have a server, but server has died
     ((and (cdr (remote-hh-dat runremote))           ;; on homehost
           (not (member cmd api:read-only-queries))  ;; this is a write
           (remote-server-url runremote)             ;; have a server
           (not (server:ping (remote-server-url runremote) (remote-server-id runremote))))  ;; server has died. NOTE: this is not a cheap call! Need better approach.
      (set! *runremote* (make-and-init-remote))
      (let* ((server-info (remote-server-info *runremote*))) 
            (if server-info
		(begin
		  (remote-server-url-set! *runremote* (server:record->url server-info))
                  (remote-server-id-set! *runremote* (server:record->id server-info)))))
      (remote-force-server-set! runremote (common:force-server?))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6")
      (rmt:send-receive cmd rid params attemptnum: attemptnum))

     ;;DOT CASE7 [label="homehost\nwrite"];
     ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7};
     ;;DOT CASE7 -> "rmt:open-qry-close-locally";
     ;; on homehost and this is a write, we already have a server
     ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
	   (cdr (remote-hh-dat runremote))           ;; on homehost
           (not (member cmd api:read-only-queries))  ;; this is a write
           (remote-server-url runremote))            ;; have a server
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
      (rmt:open-qry-close-locally cmd 0 params))

     ;;DOT CASE8 [label="force\nserver"];
     ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8};
     ;;DOT CASE8 -> "rmt:open-qry-close-locally";
     ;;  on homehost, no server contact made and this is a write, passively start a server 
     ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
	   (cdr (remote-hh-dat runremote))           ;; have homehost
           (not (remote-server-url runremote))       ;; no connection yet
	   (not (member cmd api:read-only-queries))) ;; not a read-only query
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
      (let ((server-info  (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
	(if server-info
	    (begin
              (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed
              (remote-server-id-set! runremote (server:record->id server-info)))  
	    (if (common:force-server?)
		(server:start-and-wait *toppath*)
		(server:kind-run *toppath*)))
      (remote-force-server-set! runremote (common:force-server?))
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8.1")
      (rmt:open-qry-close-locally cmd 0 params)))

     ;;DOT CASE9 [label="force server\nnot on homehost"];
     ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9};
     ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive";
     ((or (and (remote-force-server runremote)              ;; we are forcing a server and don't yet have a connection to one
	       (not (remote-conndat runremote)))
	  (and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
	       (not (remote-conndat runremote))))           ;; and no connection
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
      (mutex-unlock! *rmt-mutex*)
      (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up?
	  (server:start-and-wait *toppath*))
      (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as

     ;;DOT CASE10 [label="on homehost"];
     ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10};
     ;;DOT CASE10 -> "rmt:open-qry-close-locally";
     ;; all set up if get this far, dispatch the query
     ((and (not (remote-force-server runremote))
	   (cdr (remote-hh-dat runremote))) ;; we are on homehost
      (mutex-unlock! *rmt-mutex*)
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10")
      (rmt:open-qry-close-locally cmd (if rid rid 0) params))

     ;;DOT CASE11 [label="send_receive"];
     ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11};
     ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"];
     ;;DOT CASE11 -> "RESULT" [label="call succeeded"];
     ;; not on homehost, do server query
     (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid)))))
    ;;DOT }

;; bunch of small functions factored out of send-receive to make debug easier
;;

;; No Title 
;; Error: (vector-ref) out of range
;; #(#<condition: (exn type)> (#("db.scm:3740: regex#regexp" #f #f) #("db.scm:3739: regex#string-substitute" #f #f) #("db.scm:3738: base64#base64-decode" #f #f) #("db.scm:3737: z3#z3:decode-buffer" #f #f) #("db.scm:3736: with-input-from-string" #f #f) #("db.scm:3741: s11n#deserialize" #f #f) #("api.scm:374: api:execute-requests" #f #f) #("api.scm:139: call-with-current-continuation" #f #f) #("api.scm:139: with-exception-handler" #f #f) #("api.scm:139: ##sys#call-with-values" #f #f) #("api.scm:158: string->symbol" #f #f) #("api.scm:160: current-milliseconds" #f #f) #("api.scm:161: dbr:dbstruct-read-only" #f #f) #("api.scm:139: k15" #f #f) #("api.scm:139: g19" #f #f) #("api.scm:142: get-call-chain" #f #f)) #("get-test-info-by-id" (1102 507299)))







|
>
>

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<
|
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|







149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234

235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
		  (client:setup areapath)
		  #f))))

;;======================================================================

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f))
  ;; start attemptnum at 1 so the modulo below works as expected
  #f)

;; ;;   #;(common:telemetry-log (conc "rmt:"(->string cmd))
;; ;;                         payload: `((rid . ,rid)
;; ;;                                    (params . ,params)))
;; ;;                           
;; ;;   (if (> attemptnum 2)
;; ;;       (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
;; ;;     
;; ;;   (cond
;; ;;    ((> attemptnum 2) (thread-sleep! 0.053))
;; ;;    ((> attemptnum 10) (thread-sleep! 0.5))
;; ;;    ((> attemptnum 20) (thread-sleep! 1)))
;; ;;   (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15)))  
;; ;;     (begin (server:run *toppath*) (thread-sleep! 3))) 
;; ;;   
;; ;;   
;; ;;   ;;DOT digraph megatest_state_status {
;; ;;   ;;DOT   ranksep=0;
;; ;;   ;;DOT   // rankdir=LR;
;; ;;   ;;DOT   node [shape="box"];
;; ;;   ;;DOT "rmt:send-receive" -> MUTEXLOCK;
;; ;;   ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; }
;; ;;   ;; do all the prep locked under the rmt-mutex
;; ;;   (mutex-lock! *rmt-mutex*)
;; ;;   
;; ;;   ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
;; ;;   ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
;; ;;   ;; 3. do the query, if on homehost use local access
;; ;;   ;;
;; ;;   (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
;; ;;          (areapath      *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
;; ;; 	 (runremote     (or area-dat
;; ;; 			    *runremote*))
;; ;;          (attemptnum    (+ 1 attemptnum))
;; ;; 	 (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*)))
;; ;; 
;; ;;     ;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity
;; ;;     ;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"];
;; ;;     ;; DOT INIT_RUNREMOTE -> MUTEXLOCK;
;; ;;     ;; ensure we have a record for our connection for given area
;; ;;     (if (not runremote)                   ;; can remove this one. should never get here.         
;; ;; 	(begin
;; ;; 	  (set! *runremote* (make-and-init-remote))
;; ;;           (let* ((server-info (remote-server-info *runremote*))) 
;; ;;             (if server-info
;; ;; 		(begin
;; ;; 			(remote-server-url-set! *runremote* (server:record->url server-info))
;; ;; 			(remote-server-id-set! *runremote* (server:record->id server-info)))))  
;; ;; 	  (set! runremote   *runremote*))) ;; new runremote will come from this on next iteration
;; ;;     
;; ;;     ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
;; ;;     ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
;; ;;     ;; DOT SET_HOMEHOST -> MUTEXLOCK;
;; ;;     ;; ensure we have a homehost record
;; ;;     (if (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
;; ;; 	(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
;; ;; 	(remote-hh-dat-set! runremote (common:get-homehost)))
;; ;;     
;; ;;     ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
;; ;;     (cond
;; ;;      ;;DOT EXIT;
;; ;;      ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" }
;; ;;      ;; give up if more than 150 attempts
;; ;;      ((> attemptnum 150)
;; ;;       (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.")
;; ;;       (exit 1))
;; ;; 
;; ;;      ;;DOT CASE2 [label="local\nreadonly\nquery"];
;; ;;      ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2}
;; ;;      ;;DOT CASE2 -> "rmt:open-qry-close-locally";
;; ;;      ;; readonly mode, read request-  handle it - case 2
;; ;;      ((and readonly-mode
;; ;;            (member cmd api:read-only-queries)) 
;; ;;       (mutex-unlock! *rmt-mutex*)
;; ;;       (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2")
;; ;;       (rmt:open-qry-close-locally cmd 0 params)

;; ;;       )
;; ;; 
;; ;;      ;;DOT CASE3 [label="write in\nread-only mode"];
;; ;;      ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3}
;; ;;      ;;DOT CASE3 -> "#f";
;; ;;      ;; readonly mode, write request.  Do nothing, return #f
;; ;;      (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params))
;; ;; 
;; ;;      ;; This block was for pre-emptively resetting the connection if there had been no communication for some time.
;; ;;      ;; I don't think it adds any value. If the server is not there, just fail and start a new connection.
;; ;;      ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout)
;; ;;      ;;
;; ;;      ;;DOT CASE4 [label="reset\nconnection"];
;; ;;      ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4}
;; ;;      ;;DOT CASE4 -> "rmt:send-receive";
;; ;;      ;; reset the connection if it has been unused too long
;; ;;      ((and runremote
;; ;;            (remote-conndat runremote)
;; ;; 	   (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on
;; ;; 	      (+ (http-transport:server-dat-get-last-access (remote-conndat runremote))
;; ;; 		 (remote-server-timeout runremote))))
;; ;;       (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.")
;; ;;       (http-transport:close-connections area-dat: runremote)
;; ;;       (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
;; ;;       (mutex-unlock! *rmt-mutex*)
;; ;;       (rmt:send-receive cmd rid params attemptnum: attemptnum))
;; ;;      
;; ;;      ;;DOT CASE5 [label="local\nread"];
;; ;;      ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5};
;; ;;      ;;DOT CASE5 -> "rmt:open-qry-close-locally";
;; ;; 
;; ;;      ;; on homehost and this is a read
;; ;;      ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
;; ;; 	   (cdr (remote-hh-dat runremote))       ;; on homehost
;; ;;            (member cmd api:read-only-queries))   ;; this is a read
;; ;;       (mutex-unlock! *rmt-mutex*)
;; ;;       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
;; ;;       (rmt:open-qry-close-locally cmd 0 params))
;; ;; 
;; ;;      ;;DOT CASE6 [label="init\nremote"];
;; ;;      ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6};
;; ;;      ;;DOT CASE6 -> "rmt:send-receive";
;; ;;      ;; on homehost and this is a write, we already have a server, but server has died
;; ;;      ((and (cdr (remote-hh-dat runremote))           ;; on homehost
;; ;;            (not (member cmd api:read-only-queries))  ;; this is a write
;; ;;            (remote-server-url runremote)             ;; have a server
;; ;;            (not (server:ping (remote-server-url runremote) (remote-server-id runremote))))  ;; server has died. NOTE: this is not a cheap call! Need better approach.
;; ;;       (set! *runremote* (make-and-init-remote))
;; ;;       (let* ((server-info (remote-server-info *runremote*))) 
;; ;;             (if server-info
;; ;; 		(begin
;; ;; 		  (remote-server-url-set! *runremote* (server:record->url server-info))
;; ;;                   (remote-server-id-set! *runremote* (server:record->id server-info)))))
;; ;;       (remote-force-server-set! runremote (common:force-server?))
;; ;;       (mutex-unlock! *rmt-mutex*)
;; ;;       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6")
;; ;;       (rmt:send-receive cmd rid params attemptnum: attemptnum))
;; ;; 
;; ;;      ;;DOT CASE7 [label="homehost\nwrite"];
;; ;;      ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7};
;; ;;      ;;DOT CASE7 -> "rmt:open-qry-close-locally";
;; ;;      ;; on homehost and this is a write, we already have a server
;; ;;      ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
;; ;; 	   (cdr (remote-hh-dat runremote))           ;; on homehost
;; ;;            (not (member cmd api:read-only-queries))  ;; this is a write
;; ;;            (remote-server-url runremote))            ;; have a server
;; ;;       (mutex-unlock! *rmt-mutex*)
;; ;;       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
;; ;;       (rmt:open-qry-close-locally cmd 0 params))
;; ;; 
;; ;;      ;;DOT CASE8 [label="force\nserver"];
;; ;;      ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8};
;; ;;      ;;DOT CASE8 -> "rmt:open-qry-close-locally";
;; ;;      ;;  on homehost, no server contact made and this is a write, passively start a server 
;; ;;      ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
;; ;; 	   (cdr (remote-hh-dat runremote))           ;; have homehost
;; ;;            (not (remote-server-url runremote))       ;; no connection yet
;; ;; 	   (not (member cmd api:read-only-queries))) ;; not a read-only query
;; ;;       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
;; ;;       (let ((server-info  (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
;; ;; 	(if server-info
;; ;; 	    (begin
;; ;;               (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed
;; ;;               (remote-server-id-set! runremote (server:record->id server-info)))  
;; ;; 	    (if (common:force-server?)
;; ;; 		(server:start-and-wait *toppath*)
;; ;; 		(server:kind-run *toppath*)))
;; ;;       (remote-force-server-set! runremote (common:force-server?))
;; ;;       (mutex-unlock! *rmt-mutex*)
;; ;;       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8.1")
;; ;;       (rmt:open-qry-close-locally cmd 0 params)))
;; ;; 
;; ;;      ;;DOT CASE9 [label="force server\nnot on homehost"];
;; ;;      ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9};
;; ;;      ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive";
;; ;;      ((or (and (remote-force-server runremote)              ;; we are forcing a server and don't yet have a connection to one
;; ;; 	       (not (remote-conndat runremote)))
;; ;; 	  (and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
;; ;; 	       (not (remote-conndat runremote))))           ;; and no connection
;; ;;       (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
;; ;;       (mutex-unlock! *rmt-mutex*)
;; ;;       (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up?
;; ;; 	  (server:start-and-wait *toppath*))
;; ;;       (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
;; ;;       (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
;; ;; 
;; ;;      ;;DOT CASE10 [label="on homehost"];
;; ;;      ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10};
;; ;;      ;;DOT CASE10 -> "rmt:open-qry-close-locally";
;; ;;      ;; all set up if get this far, dispatch the query
;; ;;      ((and (not (remote-force-server runremote))
;; ;; 	   (cdr (remote-hh-dat runremote))) ;; we are on homehost
;; ;;       (mutex-unlock! *rmt-mutex*)
;; ;;       (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10")
;; ;;       (rmt:open-qry-close-locally cmd (if rid rid 0) params))
;; ;; 
;; ;;      ;;DOT CASE11 [label="send_receive"];
;; ;;      ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11};
;; ;;      ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"];
;; ;;      ;;DOT CASE11 -> "RESULT" [label="call succeeded"];
;; ;;      ;; not on homehost, do server query
;; ;;      (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid)))))
;; ;;     ;;DOT }

;; bunch of small functions factored out of send-receive to make debug easier
;;

;; No Title 
;; Error: (vector-ref) out of range
;; #(#<condition: (exn type)> (#("db.scm:3740: regex#regexp" #f #f) #("db.scm:3739: regex#string-substitute" #f #f) #("db.scm:3738: base64#base64-decode" #f #f) #("db.scm:3737: z3#z3:decode-buffer" #f #f) #("db.scm:3736: with-input-from-string" #f #f) #("db.scm:3741: s11n#deserialize" #f #f) #("api.scm:374: api:execute-requests" #f #f) #("api.scm:139: call-with-current-continuation" #f #f) #("api.scm:139: with-exception-handler" #f #f) #("api.scm:139: ##sys#call-with-values" #f #f) #("api.scm:158: string->symbol" #f #f) #("api.scm:160: current-milliseconds" #f #f) #("api.scm:161: dbr:dbstruct-read-only" #f #f) #("api.scm:139: k15" #f #f) #("api.scm:139: g19" #f #f) #("api.scm:142: get-call-chain" #f #f)) #("get-test-info-by-id" (1102 507299)))
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
              (exit 1)))))))
;;======================================================================
;;      (begin
;;	(debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.")
;;	(exit 1))))

(define (common:run-sync?)
    (and (common:on-homehost?)
	 (args:get-arg "-server")))




;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))







|
|







1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
              (exit 1)))))))
;;======================================================================
;;      (begin
;;	(debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.")
;;	(exit 1))))

(define (common:run-sync?)
    ;; (and (common:on-homehost?)
  (args:get-arg "-server"))




;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
		     (< num-ok 1))  ;; if there are no decent candidates for servers then try starting a new one
		(server:kind-run areapath))
	    (thread-sleep! 5)
	    (loop (server:check-if-running areapath)
		  (+ try-num 1)))))))

(define (make-and-init-remote)
  (make-remote hh-dat:      (common:get-homehost)
	       server-info:  (if *toppath* (server:check-if-running *toppath*) #f)
	       server-timeout: (server:expiration-timeout)))




;; called in megatest.scm, host-port is string hostname:port







|







1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
		     (< num-ok 1))  ;; if there are no decent candidates for servers then try starting a new one
		(server:kind-run areapath))
	    (thread-sleep! 5)
	    (loop (server:check-if-running areapath)
		  (+ try-num 1)))))))

(define (make-and-init-remote)
  (make-remote ;; hh-dat:      (common:get-homehost)
	       server-info:  (if *toppath* (server:check-if-running *toppath*) #f)
	       server-timeout: (server:expiration-timeout)))




;; called in megatest.scm, host-port is string hostname:port

Modified runsmod.scm from [1219e25fbf] to [92c90befb4].

1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
    ;;
    (if (runs:dat-load-mgmt-function runsdat)
	((runs:dat-load-mgmt-function runsdat))
	(runs:dat-load-mgmt-function-set!
	 runsdat
	 (lambda ()
	   ;; jobtools maxload is useful for where the full Megatest run is done on one machine
	   (if (and (not (common:on-homehost?))
		    maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
	       (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
	   
	   ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
	   (if maxhomehostload
	       (common:wait-for-homehost-load maxhomehostload
					      (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))))))
    
 
    
    (if (and (not (null? prereqs-not-met))
	     (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))







|




|







1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
    ;;
    (if (runs:dat-load-mgmt-function runsdat)
	((runs:dat-load-mgmt-function runsdat))
	(runs:dat-load-mgmt-function-set!
	 runsdat
	 (lambda ()
	   ;; jobtools maxload is useful for where the full Megatest run is done on one machine
	   (if (and #;(not (common:on-homehost?))
		    maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
	       (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
	   
	   ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
	   #;(if maxhomehostload
	       (common:wait-for-homehost-load maxhomehostload
					      (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))))))
    
 
    
    (if (and (not (null? prereqs-not-met))
	     (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))

Modified servermod.scm from [4b44aabd8d] to [ea2cb26e2d].

65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
      (conc "http://" (car hostport) ":" (cadr hostport))))

;;======================================================================
;; logic for getting homehost. Returns (host . at-home)
;; IF *toppath* is not set, wait up to five seconds trying every two seconds
;; (this is to accomodate the watchdog)
;;
(define (common:get-homehost #!key (trynum 5))
  (assert *toppath* "ERROR: common:get-homehost called before launch:setup. This is fatal.")
  ;; called often especially at start up. use mutex to eliminate collisions
  (mutex-lock! *homehost-mutex*)
  (cond
   (*home-host*
    (mutex-unlock! *homehost-mutex*)
    *home-host*)







|







65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
      (conc "http://" (car hostport) ":" (cadr hostport))))

;;======================================================================
;; logic for getting homehost. Returns (host . at-home)
;; IF *toppath* is not set, wait up to five seconds trying every two seconds
;; (this is to accomodate the watchdog)
;;
#;(define (common:get-homehost #!key (trynum 5))
  (assert *toppath* "ERROR: common:get-homehost called before launch:setup. This is fatal.")
  ;; called often especially at start up. use mutex to eliminate collisions
  (mutex-lock! *homehost-mutex*)
  (cond
   (*home-host*
    (mutex-unlock! *homehost-mutex*)
    *home-host*)
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160

161

162

163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
      (set! *home-host* (cons homehost at-home))
      (mutex-unlock! *homehost-mutex*)
      *home-host*))))

;;======================================================================
;; am I on the homehost?
;;
(define (common:on-homehost?)
  (let ((hh (common:get-homehost)))
    (if hh
	(cdr hh)
	#f)))

(define (common:wait-for-homehost-load maxnormload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (common:get-homehost)))
         (hh     (if hh-dat (car hh-dat) #f)))
    (common:wait-for-normalized-load maxnormload msg hh)))


;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
	 (curr-ip     (server:get-best-guess-address curr-host))
	 (curr-pid    (current-process-id))
	 (homehost    (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
	 (target-host (car homehost))
	 (testsuite   (common:get-area-name))
	 (logfile     (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
	 (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
			   ""))

	 (cmdln (conc (common:get-megatest-exe)

		      " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")

							   " -daemonize "
							   "")
		      ;; " -log " logfile
		      " -m testsuite:" testsuite
		      " " profile-mode
		      )) ;; (conc " >> " logfile " 2>&1 &")))))
	 (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
         (load-limit  (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
    ;; we want the remote server to start in *toppath* so push there
    (push-directory areapath)
    (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
    (thread-start! log-rotate)
    
    ;; host.domain.tld match host?
    (if (and target-host 
	     ;; look at target host, is it host.domain.tld or ip address and does it 
	     ;; match current ip or hostname
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (pseudo-random-integer 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
    #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

(define (server:record->url servr)
  (handle-exceptions
   exn
   (begin 
     (debug:print-info 0 *default-log-port*  "Unable to get server url from " servr ", exn=" exn)     







|





|


















|
|




>
|
>
|
>
|
|
|
|
|
|
|




|


|








|
|



|
|
|







123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
      (set! *home-host* (cons homehost at-home))
      (mutex-unlock! *homehost-mutex*)
      *home-host*))))

;;======================================================================
;; am I on the homehost?
;;
#;(define (common:on-homehost?)
  (let ((hh (common:get-homehost)))
    (if hh
	(cdr hh)
	#f)))

#;(define (common:wait-for-homehost-load maxnormload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (common:get-homehost)))
         (hh     (if hh-dat (car hh-dat) #f)))
    (common:wait-for-normalized-load maxnormload msg hh)))


;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
	 (curr-ip     (server:get-best-guess-address curr-host))
	 (curr-pid    (current-process-id))
	 ;; (homehost    (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
	 ;; (target-host (car homehost))
	 (testsuite   (common:get-area-name))
	 (logfile     (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
	 (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
			   ""))
	 (cmdln (conc
		 (common:get-megatest-exe)
		 " -server " (or (get-host-name) "-")
		 (if (equal? (configf:lookup *configdat* "server" "daemonize")
			     "yes")
		     " -daemonize "
		     "")
		 ;; " -log " logfile
		 " -m testsuite:" testsuite
		 " " profile-mode
		 ))
	 ;; (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
         (load-limit  (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
    ;; we want the remote server to start in *toppath* so push there
    (push-directory areapath)
    (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
    ;; (thread-start! log-rotate)
    
    ;; host.domain.tld match host?
    #;(if (and target-host 
	     ;; look at target host, is it host.domain.tld or ip address and does it 
	     ;; match current ip or hostname
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "NBFAKE_LOG" logfile)
    ;; (thread-sleep! (/ (pseudo-random-integer 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
    #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
    (system (conc "nbfake " cmdln))
    (unsetenv "NBFAKE_LOG")
    ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    ;; (thread-join! log-rotate)
    (pop-directory)))

(define (server:record->url servr)
  (handle-exceptions
   exn
   (begin 
     (debug:print-info 0 *default-log-port*  "Unable to get server url from " servr ", exn=" exn)