Megatest

Changes On Branch 87b6d8cf0ee30ade
Login

Changes In Branch v1.70-nohomehost Through [87b6d8cf0e] Excluding Merge-Ins

This is equivalent to a diff from bf877ecde8 to 87b6d8cf0e

2022-11-20
19:44
Pulled in latest changes from v1.70 check-in: e966c3ef7e user: matt tags: v1.70-nohomehost
08:34
Couple minor changes to where client connections are cleared. check-in: ae21734c5e user: matt tags: v1.70
2022-11-18
12:53
rebased newdashboard branch forward on v1.70 check-in: 5820f690ed user: mrwellan tags: v1.70-ndboard
2022-11-13
04:53
wip check-in: 87b6d8cf0e user: matt tags: v1.70-nohomehost
2022-11-12
19:25
Most routines needed for no-homehost updated. check-in: ed25403d77 user: matt tags: v1.70-nohomehost
2022-11-11
09:42
Partial implementation of homehost-free check-in: ac69021df7 user: matt tags: v1.70-nohomehost
2022-11-10
13:37
Fixed server/client signature. I think. check-in: bf877ecde8 user: matt tags: v1.70
2022-11-08
22:15
Added stop-the-train, crowbar switch check-in: bf43672760 user: matt tags: v1.70

Modified archive.scm from [9231707c41] to [25e6383e3d].

344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
        (archiver       (let ((s (configf:lookup *configdat* "archive" "archiver")))
			   (if s (string->symbol s) 'bup)))
        (rsync-exe (or (configf:lookup *configdat* "archive" "rsync") "rsync"))   
        (print-prefix      "Running: ") 
        (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target-patt run-patt "megatest-db"))
	(archive-dir  (if archive-info (cdr archive-info) #f))
	(archive-id   (if archive-info (car archive-info) -1))
        (home-host (common:get-homehost))
        (archive-time (seconds->std-time-str (current-seconds)))
        (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
        (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
        (dbfile             (conc  archive-staging-db "/megatest.db"))) 
        (create-directory archive-staging-db #t)
        (let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix)))
            (if (eq? exit-code 0)   







|







344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
        (archiver       (let ((s (configf:lookup *configdat* "archive" "archiver")))
			   (if s (string->symbol s) 'bup)))
        (rsync-exe (or (configf:lookup *configdat* "archive" "rsync") "rsync"))   
        (print-prefix      "Running: ") 
        (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target-patt run-patt "megatest-db"))
	(archive-dir  (if archive-info (cdr archive-info) #f))
	(archive-id   (if archive-info (car archive-info) -1))
        (home-host    (server:choose-server *toppath* 'homehost))
        (archive-time (seconds->std-time-str (current-seconds)))
        (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
        (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
        (dbfile             (conc  archive-staging-db "/megatest.db"))) 
        (create-directory archive-staging-db #t)
        (let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix)))
            (if (eq? exit-code 0)   

Modified client.scm from [3f204dd646] to [e699a8b092].

87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140


141
      (begin
	(debug:print-error 0 *default-log-port* "failed to start or connect to server")
	(exit 1))
      ;;
      ;; Alternatively here, we can get the list of candidate servers and work our way
      ;; through them searching for a good one.
      ;;
      (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath))
	     (runremote  (or area-dat *runremote*)))
	(if (not server-dat) ;; no server found
	    (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))
	    (let ((host  (cadr  server-dat))
		  (port  (caddr server-dat))
                  (server-id (caddr (cddr server-dat))))
	      (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	      (if (and (not area-dat)
		       (not *runremote*))
                  (begin       
		    (set! *runremote* (make-remote))
                    (let* ((server-info (remote-server-info *runremote*))) 
                      (if server-info
                        (begin
                          (remote-server-url-set! *runremote* (server:record->url server-info))
                          (remote-server-id-set! *runremote* (server:record->id server-info)))))))
	      (if (and host port server-id)
		  (let* ((start-res (case *transport-type*
				      ((http)(http-transport:client-connect host port server-id))))
			 (ping-res  (case *transport-type* 
				      ((http)(rmt:login-no-auto-client-setup start-res)))))
		    (if (and start-res
			     ping-res)
			(let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago
			  (if runremote
			      (begin
				(remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res)
				(debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
				start-res)
			      (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))
			(begin    ;; login failed but have a server record, clean out the record and try again
			  (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid.  Fixes part of Randy;s ticket 1405717332
			  (case *transport-type* 
			    ((http)(http-transport:close-connections)))
                          (if *runremote* 
			    (remote-conndat-set! runremote #f)  ;; (hash-table-delete! runremote run-id)
                          )
			  (thread-sleep! 1)
			  (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))
			  )))
		  (begin    ;; no server registered
		    ;; (server:kind-run areapath)
		    (server:start-and-wait areapath)
		    (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries)
		    (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
		    (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))))))))










|



<
|
|
|
|
|
|
|
|
|
|
|
|
|
<
|
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>
>

87
88
89
90
91
92
93
94
95
96
97

98
99
100
101
102
103
104
105
106
107
108
109
110

111

112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
      (begin
	(debug:print-error 0 *default-log-port* "failed to start or connect to server")
	(exit 1))
      ;;
      ;; Alternatively here, we can get the list of candidate servers and work our way
      ;; through them searching for a good one.
      ;;
      (let* ((server-dat (server:choose-server areapath 'best))
	     (runremote  (or area-dat *runremote*)))
	(if (not server-dat) ;; no server found
	    (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))

	    (match server-dat
	      ((host port start-time server-id)
	       (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	       (if (and (not area-dat)
			(not *runremote*))
                   (begin       
		     (set! *runremote* (make-remote))
                     (let* ((server-info (remote-server-info *runremote*))) 
                       (if server-info
                           (begin
                             (remote-server-url-set! *runremote* (server:record->url server-info))
                             (remote-server-id-set! *runremote* (server:record->id server-info)))))))
	       (if (and host port server-id)

		   (let* ((start-res (http-transport:client-connect host port server-id))

			  (ping-res  (rmt:login-no-auto-client-setup start-res)))
		     (if (and start-res
			      ping-res)
			 (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago
			   (if runremote
			       (begin
				 (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res)
				 (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
				 start-res)
			       (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))
			 (begin    ;; login failed but have a server record, clean out the record and try again
			   (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid.  Fixes part of Randy;s ticket 1405717332
			   (case *transport-type* 
			     ((http)(http-transport:close-connections)))
                           (if *runremote* 
			       (remote-conndat-set! runremote #f)  ;; (hash-table-delete! runremote run-id)
                               )
			   (thread-sleep! 1)
			   (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))
			   )))
		   (begin    ;; no server registered
		     ;; (server:kind-run areapath)
		     (server:start-and-wait areapath)
		     (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries)
		     (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
		     (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))))
	      (else
	       (debug:print 0 *default-log-port* "ERROR: malformed server-dat="server-dat)))))))

Modified common.scm from [7a004393e6] to [c81106d159].

140
141
142
143
144
145
146

147
148
149
150
151
152
153
(define *test-meta-updated* (make-hash-table))
(define *globalexitstatus*  0) ;; attempt to work around possible thread issues
(define *passnum*           0) ;; when running track calls to run-tests or similar
;; (define *alt-log-file* #f)  ;; used by -log
;; (define *common:denoise*    (make-hash-table)) ;; for low noise printing
(define *default-log-port*  (current-error-port))
(define *time-zero* (current-seconds)) ;; for the watchdog

(define *default-area-tag* "local")

;; DATABASE
;; (define *dbstruct-dbs*         #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this.
;; db stats
(define *db-stats*            (make-hash-table)) ;; hash of vectors < count duration-total >
(define *db-stats-mutex*      (make-mutex))







>







140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
(define *test-meta-updated* (make-hash-table))
(define *globalexitstatus*  0) ;; attempt to work around possible thread issues
(define *passnum*           0) ;; when running track calls to run-tests or similar
;; (define *alt-log-file* #f)  ;; used by -log
;; (define *common:denoise*    (make-hash-table)) ;; for low noise printing
(define *default-log-port*  (current-error-port))
(define *time-zero* (current-seconds)) ;; for the watchdog
(define *on-exit-procs* '()) ;; add procs to this list to be executed on exit
(define *default-area-tag* "local")

;; DATABASE
;; (define *dbstruct-dbs*         #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this.
;; db stats
(define *db-stats*            (make-hash-table)) ;; hash of vectors < count duration-total >
(define *db-stats-mutex*      (make-mutex))
313
314
315
316
317
318
319
320

321
322
323
324
325
326
327
    ((skip) "SKIP")
    (else "FAIL")))

(define (common:logpro-exit-code->test-status exit-code)
  (status-sym->string (common:logpro-exit-code->status-sym exit-code)))

(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )

  (server-url        #f) ;; (server:check-if-running *toppath*) #f))
  (server-id         #f)
  (server-info       (if *toppath* (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (connect-time      (current-seconds))
  (conndat           #f)
  (transport         *transport-type*)







|
>







314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
    ((skip) "SKIP")
    (else "FAIL")))

(define (common:logpro-exit-code->test-status exit-code)
  (status-sym->string (common:logpro-exit-code->status-sym exit-code)))

(defstruct remote
  (hh-dat            (or (server:choose-server *toppath* 'homehost)
			 (cons #f #f)))
  (server-url        #f) ;; (server:check-if-running *toppath*) #f))
  (server-id         #f)
  (server-info       (if *toppath* (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (connect-time      (current-seconds))
  (conndat           #f)
  (transport         *transport-type*)
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
      #f))

;;======================================================================
;; logic for getting homehost. Returns (host . at-home)
;; IF *toppath* is not set, wait up to five seconds trying every two seconds
;; (this is to accomodate the watchdog)
;;
(define (common:get-homehost #!key (trynum 5))
  ;; called often especially at start up. use mutex to eliminate collisions
  (mutex-lock! *homehost-mutex*)
  (cond
   (*home-host*
    (mutex-unlock! *homehost-mutex*)
    *home-host*)
   ((not *toppath*)
    (mutex-unlock! *homehost-mutex*)
    (launch:setup) ;; safely mutexed now
    (if (> trynum 0)
	(begin
	  (thread-sleep! 2)
	  (common:get-homehost trynum: (- trynum 1)))
	#f))
   (else
    (let* ((currhost (get-host-name))
	   (bestadrs (server:get-best-guess-address currhost))
	   ;; first look in config, then look in file .homehost, create it if not found
	   (homehost (or (configf:lookup *configdat* "server" "homehost" )
			 (handle-exceptions
			     exn
			     (if (> trynum 0)
				 (let ((delay-time (* (- 5 trynum) 5)))
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying "
						delay-time " seconds and trying again, message: "  ((condition-property-accessor 'exn 'message) exn)
						", exn=" exn)
				   (thread-sleep! delay-time)
				   (common:get-homehost trynum: (- trynum 1)))
				 (begin
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)
						"] Failed to read .homehost file after trying five times. Giving up and exiting, message: "
						((condition-property-accessor 'exn 'message) exn))
				   (exit 1)))
			   (let ((hhf (conc *toppath* "/.homehost")))
			     (if (common:file-exists? hhf)
				 (with-input-from-file hhf read-line)
				 (if (file-write-access? *toppath*)
				     (begin
				       (with-output-to-file hhf
					 (lambda ()
					   (print bestadrs)))
				       (begin
					 (mutex-unlock! *homehost-mutex*)
					 (car (common:get-homehost))))
				     #f))))))
	   (at-home  (or (equal? homehost currhost)
			 (equal? homehost bestadrs))))
      (set! *home-host* (cons homehost at-home))
      (mutex-unlock! *homehost-mutex*)
      *home-host*))))

;;======================================================================
;; am I on the homehost?
;;
(define (common:on-homehost?)
  (let ((hh (common:get-homehost)))
    (if hh
	(cdr hh)
	#f)))

;;======================================================================
;; do we honor the caches of the config files?
;;
(define (common:use-cache?)
  (let ((res #t)) ;; priority by order of evaluation
    (if *configdat* ;; sillyness here. can't use setup/use-cache to know if we can use the cached files!







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







1305
1306
1307
1308
1309
1310
1311






























































1312
1313
1314
1315
1316
1317
1318
      #f))

;;======================================================================
;; logic for getting homehost. Returns (host . at-home)
;; IF *toppath* is not set, wait up to five seconds trying every two seconds
;; (this is to accomodate the watchdog)
;;































































;;======================================================================
;; do we honor the caches of the config files?
;;
(define (common:use-cache?)
  (let ((res #t)) ;; priority by order of evaluation
    (if *configdat* ;; sillyness here. can't use setup/use-cache to know if we can use the cached files!
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
		    (host-last-used-set! rec curr-time)
		    new-best)
		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))

(define (common:wait-for-homehost-load maxnormload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (common:get-homehost)))
         (hh     (if hh-dat (car hh-dat) #f)))
    (common:wait-for-normalized-load maxnormload msg hh)))

(define (common:get-num-cpus remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    ;; hosts had better not be changing the number of cpus too often!
    (or (hash-table-ref/default *numcpus-cache* actual-host #f)







|







1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
		    (host-last-used-set! rec curr-time)
		    new-best)
		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))

(define (common:wait-for-homehost-load maxnormload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (server:choose-server *toppath* 'homehost)))
         (hh     (if hh-dat (car hh-dat) #f)))
    (common:wait-for-normalized-load maxnormload msg hh)))

(define (common:get-num-cpus remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    ;; hosts had better not be changing the number of cpus too often!
    (or (hash-table-ref/default *numcpus-cache* actual-host #f)
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363

3364
3365
3366
3367
3368
3369
3370
			(string-split pktsdirs-str " ")
			#f)))
    pktsdirs))

;;======================================================================
;; use-lt is use linktree "lt" link to find pkts dir
(define (common:save-pkt pktalist-in mtconf use-lt #!key (add-only #f)) ;; add-only saves the pkt only if there is a parent already
  (if (or add-only
	  (hash-table-exists? *pkts-info* 'last-parent))
      (let* ((parent   (hash-table-ref/default *pkts-info* 'last-parent #f))
	     (pktalist (if parent
			   (cons `(parent . ,parent)
				 pktalist-in)
			   pktalist-in)))
	(let-values (((uuid pkt)
		      (alist->pkt pktalist common:pkts-spec)))
	  (hash-table-set! *pkts-info* 'last-parent uuid)
	  (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f)
			     (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt))
				    (pktsdir   (car pktsdirs))) ;; assume it is there
			       (hash-table-set! *pkts-info* 'pkts-dir pktsdir)
			       pktsdir))))

            (handle-exceptions
             exn
             (debug:print-info 0 "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!!
             (if (not (file-exists? pktsdir))
                 (create-directory pktsdir #t))
             (with-output-to-file
                 (conc pktsdir "/" uuid ".pkt")







|














>







3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
			(string-split pktsdirs-str " ")
			#f)))
    pktsdirs))

;;======================================================================
;; use-lt is use linktree "lt" link to find pkts dir
(define (common:save-pkt pktalist-in mtconf use-lt #!key (add-only #f)) ;; add-only saves the pkt only if there is a parent already
  (if (or (not add-only)
	  (hash-table-exists? *pkts-info* 'last-parent))
      (let* ((parent   (hash-table-ref/default *pkts-info* 'last-parent #f))
	     (pktalist (if parent
			   (cons `(parent . ,parent)
				 pktalist-in)
			   pktalist-in)))
	(let-values (((uuid pkt)
		      (alist->pkt pktalist common:pkts-spec)))
	  (hash-table-set! *pkts-info* 'last-parent uuid)
	  (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f)
			     (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt))
				    (pktsdir   (car pktsdirs))) ;; assume it is there
			       (hash-table-set! *pkts-info* 'pkts-dir pktsdir)
			       pktsdir))))
	    (debug:print 0 *default-log-port* "pktsdir: "pktsdir)
            (handle-exceptions
             exn
             (debug:print-info 0 "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!!
             (if (not (file-exists? pktsdir))
                 (create-directory pktsdir #t))
             (with-output-to-file
                 (conc pktsdir "/" uuid ".pkt")

Modified dashboard.scm from [6283f67b19] to [47d443cd41].

3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
    (if (not (launch:setup))
        (begin
          (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") 
          (exit 1)
        )
    )

    (if (not (common:on-homehost?))
    (begin
      (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (common:get-homehost))
      (debug:print 0 *default-log-port* "It will be slower.")
      ))


    (if (and (common:file-exists? mtdb-path)
	     (file-write-access? mtdb-path))
	(if (not (args:get-arg "-skip-version-check"))







|

|







3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
    (if (not (launch:setup))
        (begin
          (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") 
          (exit 1)
        )
    )

    #;(if (not (common:on-homehost?))
    (begin
      (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (server:get-homehost))
      (debug:print 0 *default-log-port* "It will be slower.")
      ))


    (if (and (common:file-exists? mtdb-path)
	     (file-write-access? mtdb-path))
	(if (not (args:get-arg "-skip-version-check"))

Modified db.scm from [d143149924] to [eca8c190a3].

4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599

4600
4601
4602





4603
4604
4605
4606
4607
4608
4609
	       (if (common:low-noise-print 30)
		   (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) )))) 
))


(define (std-exit-procedure)
  ;;(common:telemetry-log-close)
  (on-exit (lambda () 0))
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
  (let ((no-hurry  (if *time-to-exit* ;; hurry up
		       #f
		       (begin
			 (set! *time-to-exit* #t)
			 #t))))
    (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
    (if (and no-hurry (debug:debug-mode 18))

	(rmt:print-db-stats))
    (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds
                              (if *dbstruct-dbs* (db:close-all *dbstruct-dbs*)) ;; one second allocated





			      (if *task-db*    
				  (let ((db (cdr *task-db*)))
				    (if (sqlite3:database? db)
					(begin
					  (sqlite3:interrupt! db)
					  (sqlite3:finalize! db #t)
					  ;; (vector-set! *task-db* 0 #f)







|







|
>



>
>
>
>
>







4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
	       (if (common:low-noise-print 30)
		   (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) )))) 
))


(define (std-exit-procedure)
  ;;(common:telemetry-log-close)
  (on-exit (lambda () 0)) ;; why is this here?
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
  (let ((no-hurry  (if *time-to-exit* ;; hurry up
		       #f
		       (begin
			 (set! *time-to-exit* #t)
			 #t))))
    (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
    (if (and no-hurry
	     (debug:debug-mode 18))
	(rmt:print-db-stats))
    (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds
                              (if *dbstruct-dbs* (db:close-all *dbstruct-dbs*)) ;; one second allocated
			      (if (list? *on-exit-procs*)
				  (for-each
				   (lambda (proc)
				     (proc))
				   *on-exit-procs*))
			      (if *task-db*    
				  (let ((db (cdr *task-db*)))
				    (if (sqlite3:database? db)
					(begin
					  (sqlite3:interrupt! db)
					  (sqlite3:finalize! db #t)
					  ;; (vector-set! *task-db* 0 #f)

Modified http-transport.scm from [d2af089e7c] to [3205da4502].

424
425
426
427
428
429
430












431


432
433
434
435
436
437
438
439
440
441
442
443
444


445

446
447
448
449
450
451
452
453
			  (debug:print-info 0 *default-log-port* "Waiting for server alive signature")
                          (mutex-lock! *heartbeat-mutex*)
                          (set! sdat *server-info*)
                          (mutex-unlock! *heartbeat-mutex*)
                          (if (and sdat
				   (not changed)
				   (> (- (current-seconds) start-time) 2))












			      (begin


				(debug:print-info 0 *default-log-port* "Received server alive signature")
                                (common:save-pkt `((action . alive)
                                                   (T      . server)
                                                   (pid    . ,(current-process-id))
                                                   (ipaddr . ,(car sdat))
                                                   (port   . ,(cadr sdat)))
                                                 *configdat* #t)
				sdat)
                              (begin
				(debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat)
                                (sleep 4)
				(if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
				    (begin


				      (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server")

                                      (common:save-pkt `((action . died)
                                                         (T      . server)
                                                         (pid    . ,(current-process-id))
                                                         (ipaddr . ,(car sdat))
                                                         (port   . ,(cadr sdat))
                                                         (msg    . "Transport died?"))
						       *configdat* #t)
				      (exit))







>
>
>
>
>
>
>
>
>
>
>
>
|
>
>

|










|
>
>

>
|







424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
			  (debug:print-info 0 *default-log-port* "Waiting for server alive signature")
                          (mutex-lock! *heartbeat-mutex*)
                          (set! sdat *server-info*)
                          (mutex-unlock! *heartbeat-mutex*)
                          (if (and sdat
				   (not changed)
				   (> (- (current-seconds) start-time) 2))
			      (let* ((servinfodir (conc *toppath*"/.servinfo"))
				     (ipaddr      (car sdat))
				     (port        (cadr sdat))
				     (servinf     (conc servinfodir"/"ipaddr":"port)))
				(if (not (file-exists? servinfodir))
				    (create-directory servinfodir #t))
				(with-output-to-file servinf
				  (lambda ()
				    (print "SERVER STARTED: "ipaddr":"port" AT "(current-seconds)" server-id: "(server:get-client-signature))
				    (print "started: "(seconds->year-week/day-time (current-seconds)))))
				(set! *on-exit-procs* (cons
						       (lambda ()
							 (delete-file* servinf))
						       *on-exit-procs*))
				;; put data about this server into a simple flat file host.port
				(debug:print-info 0 *default-log-port* "Received server alive signature")
                                #;(common:save-pkt `((action . alive)
                                                   (T      . server)
                                                   (pid    . ,(current-process-id))
                                                   (ipaddr . ,(car sdat))
                                                   (port   . ,(cadr sdat)))
                                                 *configdat* #t)
				sdat)
                              (begin
				(debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat)
                                (sleep 4)
				(if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
				    (let* ((ipaddr  (car sdat))
					   (port    (cadr sdat))
					   (servinf (conc *toppath*"/.servinfo/"ipaddr":"port)))
				      (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server")
				      ;; (delete-file* servinf) ;; handled by on-exit, can be removed
                                      #;(common:save-pkt `((action . died)
                                                         (T      . server)
                                                         (pid    . ,(current-process-id))
                                                         (ipaddr . ,(car sdat))
                                                         (port   . ,(cadr sdat))
                                                         (msg    . "Transport died?"))
						       *configdat* #t)
				      (exit))
598
599
600
601
602
603
604
605
606
607
608



609
610
611
612
613
614
615
    ;; 		      (if (eq? *number-non-write-queries* 0)
    ;; 			  "n/a (no queries)"
    ;; 			  (/ *total-non-write-delay* 
    ;; 			     *number-non-write-queries*))
    ;; 		      " ms")
    
    (db:print-current-query-stats)
    (common:save-pkt `((action . exit)
                       (T      . server)
                       (pid    . ,(current-process-id)))
                     *configdat* #t)



    (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting")
    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;







|


|
>
>
>







615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
    ;; 		      (if (eq? *number-non-write-queries* 0)
    ;; 			  "n/a (no queries)"
    ;; 			  (/ *total-non-write-delay* 
    ;; 			     *number-non-write-queries*))
    ;; 		      " ms")
    
    (db:print-current-query-stats)
    #;(common:save-pkt `((action . exit)
                       (T      . server)
                       (pid    . ,(current-process-id)))
    *configdat* #t)

    ;; remove .servinfo file(s) here
    
    (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting")
    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
	  (exit)))
    ;; lets not even bother to start if there are already three or more server files ready to go
    #;(let* ((num-alive   (server:get-num-alive (server:get-list *toppath*))))
      (if (> num-alive 3)
          (begin
            (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up"))
            (exit))))
  (common:save-pkt `((action . start)
		     (T      . server)
		     (pid    . ,(current-process-id)))
		   *configdat* #t)
    (let* ((th2 (make-thread (lambda ()
                               (debug:print-info 0 *default-log-port* "Server run thread started")
                               (http-transport:run 
                                (if (args:get-arg "-server")
                                    (args:get-arg "-server")
                                    "-")
                                )) "Server run"))







|
|
|
|







658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
	  (exit)))
    ;; lets not even bother to start if there are already three or more server files ready to go
    #;(let* ((num-alive   (server:get-num-alive (server:get-list *toppath*))))
      (if (> num-alive 3)
          (begin
            (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up"))
            (exit))))
    #;(common:save-pkt `((action . start)
		       (T      . server)
		       (pid    . ,(current-process-id)))
		     *configdat* #t)
    (let* ((th2 (make-thread (lambda ()
                               (debug:print-info 0 *default-log-port* "Server run thread started")
                               (http-transport:run 
                                (if (args:get-arg "-server")
                                    (args:get-arg "-server")
                                    "-")
                                )) "Server run"))

Modified launch.scm from [60d380c61b] to [9881087e2c].

1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
      (set! cmdparms (base64:base64-encode 
		      (z3:encode-buffer 
		       (with-output-to-string
			 (lambda () ;; (list 'hosts     hosts)
			   (write (list (list 'testpath  test-path)
					;; (list 'transport (conc *transport-type*))
					;; (list 'serverinf *server-info*)
					(list 'homehost  (let* ((hhdat (common:get-homehost)))
							   (if hhdat
							       (car hhdat)
							       #f)))
					(list 'serverurl (if *runremote*
							     (remote-server-url *runremote*)
							     #f)) ;;
					(list 'areaname  (common:get-testsuite-name))







|







1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
      (set! cmdparms (base64:base64-encode 
		      (z3:encode-buffer 
		       (with-output-to-string
			 (lambda () ;; (list 'hosts     hosts)
			   (write (list (list 'testpath  test-path)
					;; (list 'transport (conc *transport-type*))
					;; (list 'serverinf *server-info*)
					#;(list 'homehost  (let* ((hhdat (server:get-homehost)))
							   (if hhdat
							       (car hhdat)
							       #f)))
					(list 'serverurl (if *runremote*
							     (remote-server-url *runremote*)
							     #f)) ;;
					(list 'areaname  (common:get-testsuite-name))

Modified megatest.scm from [7c70251ef1] to [b216f35a15].

654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
;; for some switches always print the command to stderr
;;
(if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun")
    (debug:print 0 *default-log-port* (string-intersperse (argv) " ")))

;; some switches imply homehost. Exit here if not on homehost
;;
(let ((homehost-required  (list "-cleanup-db" "-server")))
  (if (apply args:any? homehost-required)
      (if (not (common:on-homehost?))
	  (for-each
	   (lambda (switch)
	     (if (args:get-arg switch)
		 (begin
		   (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch
				", you can move homehost by removing the .homehost file but this will disrupt any runs in progress.")
		   (exit 1))))







|

|







654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
;; for some switches always print the command to stderr
;;
(if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun")
    (debug:print 0 *default-log-port* (string-intersperse (argv) " ")))

;; some switches imply homehost. Exit here if not on homehost
;;
(let ((homehost-required  (list "-cleanup-db")))
  (if (apply args:any? homehost-required)
      (if (not (server:choose-server *toppath* 'home?))
	  (for-each
	   (lambda (switch)
	     (if (args:get-arg switch)
		 (begin
		   (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch
				", you can move homehost by removing the .homehost file but this will disrupt any runs in progress.")
		   (exit 1))))
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
    (exit 0)))

(if (or (getenv "MT_RUNSCRIPT")
	(args:get-arg "-repl")
	(args:get-arg "-load"))
    (let* ((toppath (launch:setup))
	   (dbstructs (if (and toppath
                               (common:on-homehost?))
                          (db:setup #t)
                          #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f)))
      (if *toppath*
	  (cond
	   ((getenv "MT_RUNSCRIPT")
	    ;; How to run megatest scripts
	    ;;







|







2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
    (exit 0)))

(if (or (getenv "MT_RUNSCRIPT")
	(args:get-arg "-repl")
	(args:get-arg "-load"))
    (let* ((toppath (launch:setup))
	   (dbstructs (if (and toppath
                               (server:choose-server toppath 'home?))
                          (db:setup #t)
                          #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f)))
      (if *toppath*
	  (cond
	   ((getenv "MT_RUNSCRIPT")
	    ;; How to run megatest scripts
	    ;;

Modified rmt.scm from [4af82949dd] to [d9f771a22a].

114
115
116
117
118
119
120

121
122
123
124
125
126
127
128
    
    ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
    ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
    ;; DOT SET_HOMEHOST -> MUTEXLOCK;
    ;; ensure we have a homehost record
    (if (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
	(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little

	(remote-hh-dat-set! runremote (common:get-homehost)))
    
    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
    (cond
     #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds
      (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.")
      (set! *runremote* #f)
      ;; BUG: close-connections should go here?







>
|







114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
    
    ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
    ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
    ;; DOT SET_HOMEHOST -> MUTEXLOCK;
    ;; ensure we have a homehost record
    (if (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
	(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
	(let ((hh-data (server:choose-server areapath 'homehost)))
	  (remote-hh-dat-set! runremote (or hh-data (cons #f #f)))))
    
    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
    (cond
     #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds
      (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.")
      (set! *runremote* #f)
      ;; BUG: close-connections should go here?

Modified server.scm from [b0155e0a8d] to [237780917f].

119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137

138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174

;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
	 (curr-ip     (server:get-best-guess-address curr-host))
	 (curr-pid    (current-process-id))
	 (homehost    (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
	 (target-host (car homehost))
	 (testsuite   (common:get-testsuite-name))
	 (logfile     (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
	 (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
			   ""))
	 (cmdln (conc (common:get-megatest-exe)

		      " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
							   " -daemonize "
							   "")
		      ;; " -log " logfile
		      " -m testsuite:" testsuite
		      " " profile-mode
		      )) ;; (conc " >> " logfile " 2>&1 &")))))
	 (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
         (load-limit  (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
    ;; we want the remote server to start in *toppath* so push there
    (push-directory areapath)
    (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
    (thread-start! log-rotate)
    
    ;; host.domain.tld match host?
    (if (and target-host 
	     ;; look at target host, is it host.domain.tld or ip address and does it 
	     ;; match current ip or hostname
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds server-id
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let 
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 







|


|
|
|
|





>
|
|
|












|
|
|
|
|
|
|
|
|





|







119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175

;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* (;; (curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
	 ;; (curr-ip     (server:get-best-guess-address curr-host))
	 ;; (curr-pid    (current-process-id))
	 ;; (homehost    (server:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
	 ;; (target-host (car homehost))
	 (testsuite   (common:get-testsuite-name))
	 (logfile     (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
	 (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
			   ""))
	 (cmdln (conc (common:get-megatest-exe)
		      " -server - ";; (or target-host "-")
		      (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
			  " -daemonize "
			  "")
		      ;; " -log " logfile
		      " -m testsuite:" testsuite
		      " " profile-mode
		      )) ;; (conc " >> " logfile " 2>&1 &")))))
	 (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
         (load-limit  (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
    ;; we want the remote server to start in *toppath* so push there
    (push-directory areapath)
    (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
    (thread-start! log-rotate)
    
    ;; host.domain.tld match host?
    ;; (if (and target-host 
    ;; 	     ;; look at target host, is it host.domain.tld or ip address and does it 
    ;; 	     ;; match current ip or hostname
    ;; 	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
    ;; 	     (not (equal? curr-ip target-host)))
    ;; 	(begin
    ;; 	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
    ;; 	  (setenv "TARGETHOST" target-host)))
    ;;   
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds server-id
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let 
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
		  (if (null? tal)
		      (if (and limit
			       (> (length new-res) limit))
			  new-res ;; (take new-res limit)  <= need intelligent sorting before this will work
			  new-res)
		      (loop (string-chomp (car tal)) (cdr tal) new-res)))))))))

(define (server:get-num-alive srvlst)
  (let ((num-alive 0))
    (for-each
     (lambda (server)
        (handle-exceptions
          exn
         (begin 
          (debug:print-info 0 *default-log-port*  "Unable to get server start-time and/or mod-time from " server ", exn=" exn))







|







270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
		  (if (null? tal)
		      (if (and limit
			       (> (length new-res) limit))
			  new-res ;; (take new-res limit)  <= need intelligent sorting before this will work
			  new-res)
		      (loop (string-chomp (car tal)) (cdr tal) new-res)))))))))

#;(define (server:get-num-alive srvlst)
  (let ((num-alive 0))
    (for-each
     (lambda (server)
        (handle-exceptions
          exn
         (begin 
          (debug:print-info 0 *default-log-port*  "Unable to get server start-time and/or mod-time from " server ", exn=" exn))
324
325
326
327
328
329
330


331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
		(lambda (a b)
		  (< (list-ref a 3)
		     (list-ref b 3))))))
    (if (> (length slst) nums)
	(take slst nums)
	slst)))



(define (server:get-first-best areapath)
  (let ((srvrs (server:get-best (server:get-list areapath))))
    (if (and srvrs
	     (not (null? srvrs)))
	(car srvrs)
	#f)))

(define (server:get-rand-best areapath)
  (let ((srvrs (server:get-best (server:get-list areapath))))
    (if (and (list? srvrs)
	     (not (null? srvrs)))
	(let* ((len (length srvrs))
	       (idx (random len)))
	  (list-ref srvrs idx))
	#f)))

(define (server:record->id servr)
  (handle-exceptions
   exn
   (begin 
     (debug:print-info 0 *default-log-port*  "Unable to get server id from " servr ", exn=" exn)     
   #f)
  (match-let (((mod-time host port start-time server-id pid)
	       servr))
    (if server-id
	server-id
	#f))))

(define (server:record->url servr)
  (handle-exceptions
   exn
   (begin 
     (debug:print-info 0 *default-log-port*  "Unable to get server url from " servr ", exn=" exn)     
   #f)
  (match-let (((mod-time host port start-time server-id pid)
	       servr))
    (if (and host port)
	(conc host ":" port)
	#f))))

(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))


;; if server-start-last exists, and wasn't old enough, wait <idle time> + 1, then call this function recursively until it is old enough.
;; if it is old enough, overwrite it and wait 0.25 seconds.
;; if it then has the wrong server key, wait <idle time> + 1 and call this function recursively.
;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))







>
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|







|











|
















|







325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
		(lambda (a b)
		  (< (list-ref a 3)
		     (list-ref b 3))))))
    (if (> (length slst) nums)
	(take slst nums)
	slst)))

;; ;; switch from server:get-list to server:get-servers-info
;; ;;
;; (define (server:get-first-best areapath)
;;   (let ((srvrs (server:get-best (server:get-list areapath))))
;;     (if (and srvrs
;; 	     (not (null? srvrs)))
;; 	(car srvrs)
;; 	#f)))
;; 
;; (define (server:get-rand-best areapath)
;;   (let ((srvrs (server:get-best (server:get-list areapath))))
;;     (if (and (list? srvrs)
;; 	     (not (null? srvrs)))
;; 	(let* ((len (length srvrs))
;; 	       (idx (random len)))
;; 	  (list-ref srvrs idx))
;; 	#f)))

(define (server:record->id servr)
  (handle-exceptions
   exn
   (begin 
     (debug:print-info 0 *default-log-port*  "Unable to get server id from " servr ", exn=" exn)     
   #f)
  (match-let (((host port start-time server-id)
	       servr))
    (if server-id
	server-id
	#f))))

(define (server:record->url servr)
  (handle-exceptions
   exn
   (begin 
     (debug:print-info 0 *default-log-port*  "Unable to get server url from " servr ", exn=" exn)     
   #f)
  (match-let (((host port start-time server-id)
	       servr))
    (if (and host port)
	(conc host ":" port)
	#f))))

(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))


;; if server-start-last exists, and wasn't old enough, wait <idle time> + 1, then call this function recursively until it is old enough.
;; if it is old enough, overwrite it and wait 0.25 seconds.
;; if it then has the wrong server key, wait <idle time> + 1 and call this function recursively.
;;
#;(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
403
404
405
406
407
408
409














410




















































411






412
413
414
415
416
417
418

419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
	       (begin
		 (debug:print-info 0 *default-log-port* "Gating server start, last start: "
				   (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))
		 
		 (thread-sleep! ( + 1 idletime))
		 (server:wait-for-server-start-last-flag areapath)))))))




































































        






;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least <server idletime> seconds old
  (server:wait-for-server-start-last-flag areapath)

  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* ((lock-file    (conc areapath "/logs/server-start.lock")))
	(let* ((start-flag (conc areapath "/logs/server-start-last")))
	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
	  (debug:print-info  2 *default-log-port* "server:kind-run: touching " start-flag)
	  (system (conc "touch " start-flag)) ;; lazy but safe
	  (server:run areapath)
	  (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
	  (common:simple-file-release-lock lock-file)))
      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")))

;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-info (server:check-if-running areapath))
	       (try-num    0))
      (if (or server-info
	      (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
	  (server:record->url server-info)
	  (let ((num-ok (length (server:get-best (server:get-list areapath)))))
	    (if (and (> try-num 0)  ;; first time through simply wait a little while then try again
		     (< num-ok 1))  ;; if there are no decent candidates for servers then try starting a new one
		(server:kind-run areapath))
	    (thread-sleep! 5)
	    (loop (server:check-if-running areapath)
		  (+ try-num 1)))))))

(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG.

(define (server:get-num-servers #!key (numservers 2))
  (let ((ns (string->number
	     (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
    (or ns numservers)))

;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath) ;;  #!key (numservers "2"))
  (let* ((ns            (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed
	 (servers       (server:get-best (server:get-list areapath))))
    (if (or (and servers
		 (null? servers))
	    (not servers)
	    (and (list? servers)
		 (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers
        #f
        (let loop ((hed (car servers))
                   (tal (cdr servers)))
          (let ((res (server:check-server hed)))
            (if res
                hed
                (if (null? tal)
                    #f
                    (loop (car tal)(cdr tal)))))))))

;; ping the given server
;;
(define (server:check-server server-record)
  (let* ((server-url (server:record->url server-record))
         (server-id (server:record->id server-record)) 
         (res        (case *transport-type*
                       ((http)(server:ping server-url server-id))
                       ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
                       )))
    (if res
        server-url
	#f)))

(define (server:kill servr)
  (handle-exceptions
    exn







>
>
>
>
>
>
>
>
>
>
>
>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>






|
>
|



















|


|




<
<









|


|
|
|














|
<
|
<
<







406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522


523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552

553


554
555
556
557
558
559
560
	       (begin
		 (debug:print-info 0 *default-log-port* "Gating server start, last start: "
				   (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))
		 
		 (thread-sleep! ( + 1 idletime))
		 (server:wait-for-server-start-last-flag areapath)))))))

;; oldest server alive determines host then choose random of youngest
;; five servers on that host
;;
(define (server:get-servers-info areapath)
  (let* ((servinfodir (conc *toppath*"/.servinfo"))
	 (allfiles    (glob (conc servinfodir"/*")))
	 (res         (make-hash-table)))
    (for-each
     (lambda (f)
       (let* ((hostport  (pathname-strip-directory f))
	      (serverdat (server:logf-get-start-info f)))
	 (hash-table-set! res hostport serverdat)))
     allfiles)
    res))

;; oldest server alive determines host then choose random of youngest
;; five servers on that host
;;
;; mode:
;;   best - get best server (random of newest five)
;;   home - get home host based on oldest server
;;   info - print info
(define (server:choose-server areapath #!optional (mode 'best))
  ;; age is current-starttime
  ;; find oldest alive
  ;;   1. sort by age ascending and ping until good
  ;; find alive rand from youngest
  ;;   1. sort by age descending
  ;;   2. take five
  ;;   3. check alive, discard if not and repeat
  (let* ((serversdat  (server:get-servers-info areapath))
	 (by-time-asc (sort (hash-table-keys serversdat) ;; list of "host:port"
			    (lambda (a b)
			      (>= (list-ref (hash-table-ref serversdat a) 2)
				  (list-ref (hash-table-ref serversdat b) 2))))))
    (if (not (null? by-time-asc))
	(let* ((oldest     (last by-time-asc))
	       (oldest-dat (hash-table-ref serversdat oldest))
	       (host       (list-ref oldest-dat 0))
	       (all-valid  (filter (lambda (x)
				     (equal? host (list-ref (hash-table-ref serversdat x) 0)))
				   by-time-asc))
	       (best-five  (lambda ()
			     (if (> (length all-valid) 5)
				 (map (lambda (x)
					(hash-table-ref serversdat x))
				      (take all-valid 5))
				 all-valid)))
	       (names->dats (lambda (names)
			      (map (lambda (x)(hash-table-ref serversdat x)) names)))
	       (am-home?    (lambda ()
			      (let* ((currhost (get-host-name))
				     (bestadrs (server:get-best-guess-address currhost)))
				(or (equal? host currhost)
				    (equal? host bestadrs))))))
	  (case mode
	    ((info)
	     (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
	     (print "youngest: "(hash-table-ref serversdat (car all-valid))))
	    ((home)     host)
	    ((homehost) (cons host (am-home?))) ;; shut up old code
	    ((home?)    (am-home?))
	    ((best-five)(names->dats (best-five)))
	    ((all-valid)(names->dats all-valid))
	    ((best)     (let* ((best-five (best-five))
			       (len       (length best-five)))
			  (list-ref best-five (random len))))
			  
	    (else
	     (debug:print 0 *default-log-port* "ERROR: invalid command "mode)
	     #f)))
	#f)))

	  
;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least <server idletime> seconds old
  ;; (server:wait-for-server-start-last-flag areapath)
  (server:run areapath)
  #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* ((lock-file    (conc areapath "/logs/server-start.lock")))
	(let* ((start-flag (conc areapath "/logs/server-start-last")))
	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
	  (debug:print-info  2 *default-log-port* "server:kind-run: touching " start-flag)
	  (system (conc "touch " start-flag)) ;; lazy but safe
	  (server:run areapath)
	  (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
	  (common:simple-file-release-lock lock-file)))
      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")))

;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-info (server:check-if-running areapath))
	       (try-num    0))
      (if (or server-info
	      (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
	  (server:record->url server-info)
	  (let ((num-ok (length (server:choose-server areapath 'all-valid))))
	    (if (and (> try-num 0)  ;; first time through simply wait a little while then try again
		     (< num-ok 1))  ;; if there are no decent candidates for servers then try starting a new one
		(server:run areapath))
	    (thread-sleep! 5)
	    (loop (server:check-if-running areapath)
		  (+ try-num 1)))))))



(define (server:get-num-servers #!key (numservers 2))
  (let ((ns (string->number
	     (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
    (or ns numservers)))

;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath) ;;  #!key (numservers "2"))
  (let* ((ns            (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed
	 (servers       (server:choose-server areapath 'best-five))) ;; (server:get-best (server:get-list areapath))))
    (if (or (and servers
		 (null? servers))
	    (not servers))
	    ;; (and (list? servers)
	    ;;	 (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers
        #f
        (let loop ((hed (car servers))
                   (tal (cdr servers)))
          (let ((res (server:check-server hed)))
            (if res
                hed
                (if (null? tal)
                    #f
                    (loop (car tal)(cdr tal)))))))))

;; ping the given server
;;
(define (server:check-server server-record)
  (let* ((server-url (server:record->url server-record))
         (server-id  (server:record->id server-record)) 

         (res        (server:ping server-url server-id)))


    (if res
        server-url
	#f)))

(define (server:kill servr)
  (handle-exceptions
    exn