Megatest

Check-in [87b6d8cf0e]
Login
Overview
Comment:wip
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.70-nohomehost
Files: files | file ages | folders
SHA1: 87b6d8cf0ee30adeede40cc35530519b6103770a
User & Date: matt on 2022-11-13 04:53:25
Other Links: branch diff | manifest | tags
Context
2022-11-20
19:44
Pulled in latest changes from v1.70 check-in: e966c3ef7e user: matt tags: v1.70-nohomehost
2022-11-13
04:53
wip check-in: 87b6d8cf0e user: matt tags: v1.70-nohomehost
2022-11-12
19:25
Most routines needed for no-homehost updated. check-in: ed25403d77 user: matt tags: v1.70-nohomehost
Changes

Modified archive.scm from [5c03589f24] to [25e6383e3d].

344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
        (archiver       (let ((s (configf:lookup *configdat* "archive" "archiver")))
			   (if s (string->symbol s) 'bup)))
        (rsync-exe (or (configf:lookup *configdat* "archive" "rsync") "rsync"))   
        (print-prefix      "Running: ") 
        (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target-patt run-patt "megatest-db"))
	(archive-dir  (if archive-info (cdr archive-info) #f))
	(archive-id   (if archive-info (car archive-info) -1))
        (home-host (server:get-homehost))
        (archive-time (seconds->std-time-str (current-seconds)))
        (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
        (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
        (dbfile             (conc  archive-staging-db "/megatest.db"))) 
        (create-directory archive-staging-db #t)
        (let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix)))
            (if (eq? exit-code 0)   







|







344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
        (archiver       (let ((s (configf:lookup *configdat* "archive" "archiver")))
			   (if s (string->symbol s) 'bup)))
        (rsync-exe (or (configf:lookup *configdat* "archive" "rsync") "rsync"))   
        (print-prefix      "Running: ") 
        (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target-patt run-patt "megatest-db"))
	(archive-dir  (if archive-info (cdr archive-info) #f))
	(archive-id   (if archive-info (car archive-info) -1))
        (home-host    (server:choose-server *toppath* 'homehost))
        (archive-time (seconds->std-time-str (current-seconds)))
        (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
        (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
        (dbfile             (conc  archive-staging-db "/megatest.db"))) 
        (create-directory archive-staging-db #t)
        (let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix)))
            (if (eq? exit-code 0)   

Modified client.scm from [17a8862d81] to [e699a8b092].

87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138


139
      (begin
	(debug:print-error 0 *default-log-port* "failed to start or connect to server")
	(exit 1))
      ;;
      ;; Alternatively here, we can get the list of candidate servers and work our way
      ;; through them searching for a good one.
      ;;
      (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath))
	     (runremote  (or area-dat *runremote*)))
	(if (not server-dat) ;; no server found
	    (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))
	    (let ((host  (cadr  server-dat))
		  (port  (caddr server-dat))
                  (server-id (caddr (cddr server-dat))))
	      (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	      (if (and (not area-dat)
		       (not *runremote*))
                  (begin       
		    (set! *runremote* (make-remote))
                    (let* ((server-info (remote-server-info *runremote*))) 
                      (if server-info
                        (begin
                          (remote-server-url-set! *runremote* (server:record->url server-info))
                          (remote-server-id-set! *runremote* (server:record->id server-info)))))))
	      (if (and host port server-id)
		  (let* ((start-res (http-transport:client-connect host port server-id))
			 (ping-res  (rmt:login-no-auto-client-setup start-res)))
		    (if (and start-res
			     ping-res)
			(let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago
			  (if runremote
			      (begin
				(remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res)
				(debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
				start-res)
			      (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))
			(begin    ;; login failed but have a server record, clean out the record and try again
			  (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid.  Fixes part of Randy;s ticket 1405717332
			  (case *transport-type* 
			    ((http)(http-transport:close-connections)))
                          (if *runremote* 
			    (remote-conndat-set! runremote #f)  ;; (hash-table-delete! runremote run-id)
                          )
			  (thread-sleep! 1)
			  (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))
			  )))
		  (begin    ;; no server registered
		    ;; (server:kind-run areapath)
		    (server:start-and-wait areapath)
		    (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries)
		    (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
		    (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))))))))










|



<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>
>

87
88
89
90
91
92
93
94
95
96
97

98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
      (begin
	(debug:print-error 0 *default-log-port* "failed to start or connect to server")
	(exit 1))
      ;;
      ;; Alternatively here, we can get the list of candidate servers and work our way
      ;; through them searching for a good one.
      ;;
      (let* ((server-dat (server:choose-server areapath 'best))
	     (runremote  (or area-dat *runremote*)))
	(if (not server-dat) ;; no server found
	    (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))

	    (match server-dat
	      ((host port start-time server-id)
	       (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	       (if (and (not area-dat)
			(not *runremote*))
                   (begin       
		     (set! *runremote* (make-remote))
                     (let* ((server-info (remote-server-info *runremote*))) 
                       (if server-info
                           (begin
                             (remote-server-url-set! *runremote* (server:record->url server-info))
                             (remote-server-id-set! *runremote* (server:record->id server-info)))))))
	       (if (and host port server-id)
		   (let* ((start-res (http-transport:client-connect host port server-id))
			  (ping-res  (rmt:login-no-auto-client-setup start-res)))
		     (if (and start-res
			      ping-res)
			 (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago
			   (if runremote
			       (begin
				 (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res)
				 (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
				 start-res)
			       (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))
			 (begin    ;; login failed but have a server record, clean out the record and try again
			   (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid.  Fixes part of Randy;s ticket 1405717332
			   (case *transport-type* 
			     ((http)(http-transport:close-connections)))
                           (if *runremote* 
			       (remote-conndat-set! runremote #f)  ;; (hash-table-delete! runremote run-id)
                               )
			   (thread-sleep! 1)
			   (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))
			   )))
		   (begin    ;; no server registered
		     ;; (server:kind-run areapath)
		     (server:start-and-wait areapath)
		     (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries)
		     (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
		     (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))))
	      (else
	       (debug:print 0 *default-log-port* "ERROR: malformed server-dat="server-dat)))))))

Modified common.scm from [c4aca77534] to [c81106d159].

314
315
316
317
318
319
320
321

322
323
324
325
326
327
328
    ((skip) "SKIP")
    (else "FAIL")))

(define (common:logpro-exit-code->test-status exit-code)
  (status-sym->string (common:logpro-exit-code->status-sym exit-code)))

(defstruct remote
  (hh-dat            (server:get-homehost)) ;; homehost record ( addr . hhflag )

  (server-url        #f) ;; (server:check-if-running *toppath*) #f))
  (server-id         #f)
  (server-info       (if *toppath* (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (connect-time      (current-seconds))
  (conndat           #f)
  (transport         *transport-type*)







|
>







314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
    ((skip) "SKIP")
    (else "FAIL")))

(define (common:logpro-exit-code->test-status exit-code)
  (status-sym->string (common:logpro-exit-code->status-sym exit-code)))

(defstruct remote
  (hh-dat            (or (server:choose-server *toppath* 'homehost)
			 (cons #f #f)))
  (server-url        #f) ;; (server:check-if-running *toppath*) #f))
  (server-id         #f)
  (server-info       (if *toppath* (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (connect-time      (current-seconds))
  (conndat           #f)
  (transport         *transport-type*)
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
		    (host-last-used-set! rec curr-time)
		    new-best)
		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))

(define (common:wait-for-homehost-load maxnormload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (server:get-homehost)))
         (hh     (if hh-dat (car hh-dat) #f)))
    (common:wait-for-normalized-load maxnormload msg hh)))

(define (common:get-num-cpus remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    ;; hosts had better not be changing the number of cpus too often!
    (or (hash-table-ref/default *numcpus-cache* actual-host #f)







|







1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
		    (host-last-used-set! rec curr-time)
		    new-best)
		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))

(define (common:wait-for-homehost-load maxnormload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (server:choose-server *toppath* 'homehost)))
         (hh     (if hh-dat (car hh-dat) #f)))
    (common:wait-for-normalized-load maxnormload msg hh)))

(define (common:get-num-cpus remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    ;; hosts had better not be changing the number of cpus too often!
    (or (hash-table-ref/default *numcpus-cache* actual-host #f)

Modified dashboard.scm from [d1f71cff63] to [47d443cd41].

3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
    (if (not (launch:setup))
        (begin
          (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") 
          (exit 1)
        )
    )

    (if (not (common:on-homehost?))
    (begin
      (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (server:get-homehost))
      (debug:print 0 *default-log-port* "It will be slower.")
      ))


    (if (and (common:file-exists? mtdb-path)







|







3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
    (if (not (launch:setup))
        (begin
          (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") 
          (exit 1)
        )
    )

    #;(if (not (common:on-homehost?))
    (begin
      (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (server:get-homehost))
      (debug:print 0 *default-log-port* "It will be slower.")
      ))


    (if (and (common:file-exists? mtdb-path)

Modified launch.scm from [56e9ef9407] to [9881087e2c].

1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
      (set! cmdparms (base64:base64-encode 
		      (z3:encode-buffer 
		       (with-output-to-string
			 (lambda () ;; (list 'hosts     hosts)
			   (write (list (list 'testpath  test-path)
					;; (list 'transport (conc *transport-type*))
					;; (list 'serverinf *server-info*)
					(list 'homehost  (let* ((hhdat (server:get-homehost)))
							   (if hhdat
							       (car hhdat)
							       #f)))
					(list 'serverurl (if *runremote*
							     (remote-server-url *runremote*)
							     #f)) ;;
					(list 'areaname  (common:get-testsuite-name))







|







1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
      (set! cmdparms (base64:base64-encode 
		      (z3:encode-buffer 
		       (with-output-to-string
			 (lambda () ;; (list 'hosts     hosts)
			   (write (list (list 'testpath  test-path)
					;; (list 'transport (conc *transport-type*))
					;; (list 'serverinf *server-info*)
					#;(list 'homehost  (let* ((hhdat (server:get-homehost)))
							   (if hhdat
							       (car hhdat)
							       #f)))
					(list 'serverurl (if *runremote*
							     (remote-server-url *runremote*)
							     #f)) ;;
					(list 'areaname  (common:get-testsuite-name))

Modified megatest.scm from [b4770c25e0] to [b216f35a15].

656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
(if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun")
    (debug:print 0 *default-log-port* (string-intersperse (argv) " ")))

;; some switches imply homehost. Exit here if not on homehost
;;
(let ((homehost-required  (list "-cleanup-db")))
  (if (apply args:any? homehost-required)
      (if (not (common:on-homehost?))
	  (for-each
	   (lambda (switch)
	     (if (args:get-arg switch)
		 (begin
		   (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch
				", you can move homehost by removing the .homehost file but this will disrupt any runs in progress.")
		   (exit 1))))







|







656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
(if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun")
    (debug:print 0 *default-log-port* (string-intersperse (argv) " ")))

;; some switches imply homehost. Exit here if not on homehost
;;
(let ((homehost-required  (list "-cleanup-db")))
  (if (apply args:any? homehost-required)
      (if (not (server:choose-server *toppath* 'home?))
	  (for-each
	   (lambda (switch)
	     (if (args:get-arg switch)
		 (begin
		   (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch
				", you can move homehost by removing the .homehost file but this will disrupt any runs in progress.")
		   (exit 1))))
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
    (exit 0)))

(if (or (getenv "MT_RUNSCRIPT")
	(args:get-arg "-repl")
	(args:get-arg "-load"))
    (let* ((toppath (launch:setup))
	   (dbstructs (if (and toppath
                               (common:on-homehost?))
                          (db:setup #t)
                          #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f)))
      (if *toppath*
	  (cond
	   ((getenv "MT_RUNSCRIPT")
	    ;; How to run megatest scripts
	    ;;







|







2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
    (exit 0)))

(if (or (getenv "MT_RUNSCRIPT")
	(args:get-arg "-repl")
	(args:get-arg "-load"))
    (let* ((toppath (launch:setup))
	   (dbstructs (if (and toppath
                               (server:choose-server toppath 'home?))
                          (db:setup #t)
                          #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f)))
      (if *toppath*
	  (cond
	   ((getenv "MT_RUNSCRIPT")
	    ;; How to run megatest scripts
	    ;;

Modified rmt.scm from [a7494b375d] to [d9f771a22a].

114
115
116
117
118
119
120

121
122
123
124
125
126
127
128
    
    ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
    ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
    ;; DOT SET_HOMEHOST -> MUTEXLOCK;
    ;; ensure we have a homehost record
    (if (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
	(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little

	(remote-hh-dat-set! runremote (server:get-homehost)))
    
    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
    (cond
     #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds
      (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.")
      (set! *runremote* #f)
      ;; BUG: close-connections should go here?







>
|







114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
    
    ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
    ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
    ;; DOT SET_HOMEHOST -> MUTEXLOCK;
    ;; ensure we have a homehost record
    (if (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
	(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
	(let ((hh-data (server:choose-server areapath 'homehost)))
	  (remote-hh-dat-set! runremote (or hh-data (cons #f #f)))))
    
    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
    (cond
     #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds
      (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.")
      (set! *runremote* #f)
      ;; BUG: close-connections should go here?

Modified server.scm from [2c6bceea6f] to [237780917f].

119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137

138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174

;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
	 (curr-ip     (server:get-best-guess-address curr-host))
	 (curr-pid    (current-process-id))
	 (homehost    (server:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
	 (target-host (car homehost))
	 (testsuite   (common:get-testsuite-name))
	 (logfile     (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
	 (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
			   ""))
	 (cmdln (conc (common:get-megatest-exe)

		      " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
							   " -daemonize "
							   "")
		      ;; " -log " logfile
		      " -m testsuite:" testsuite
		      " " profile-mode
		      )) ;; (conc " >> " logfile " 2>&1 &")))))
	 (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
         (load-limit  (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
    ;; we want the remote server to start in *toppath* so push there
    (push-directory areapath)
    (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
    (thread-start! log-rotate)
    
    ;; host.domain.tld match host?
    (if (and target-host 
	     ;; look at target host, is it host.domain.tld or ip address and does it 
	     ;; match current ip or hostname
	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds server-id
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let 
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 







|


|
|
|
|





>
|
|
|












|
|
|
|
|
|
|
|
|





|







119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175

;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* (;; (curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
	 ;; (curr-ip     (server:get-best-guess-address curr-host))
	 ;; (curr-pid    (current-process-id))
	 ;; (homehost    (server:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
	 ;; (target-host (car homehost))
	 (testsuite   (common:get-testsuite-name))
	 (logfile     (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
	 (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
			   ""))
	 (cmdln (conc (common:get-megatest-exe)
		      " -server - ";; (or target-host "-")
		      (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
			  " -daemonize "
			  "")
		      ;; " -log " logfile
		      " -m testsuite:" testsuite
		      " " profile-mode
		      )) ;; (conc " >> " logfile " 2>&1 &")))))
	 (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
         (load-limit  (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
    ;; we want the remote server to start in *toppath* so push there
    (push-directory areapath)
    (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
    (thread-start! log-rotate)
    
    ;; host.domain.tld match host?
    ;; (if (and target-host 
    ;; 	     ;; look at target host, is it host.domain.tld or ip address and does it 
    ;; 	     ;; match current ip or hostname
    ;; 	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
    ;; 	     (not (equal? curr-ip target-host)))
    ;; 	(begin
    ;; 	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
    ;; 	  (setenv "TARGETHOST" target-host)))
    ;;   
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds server-id
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let 
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
452
453
454
455
456
457
458
459







460
461
462
463
464


465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
				     (equal? host (list-ref (hash-table-ref serversdat x) 0)))
				   by-time-asc))
	       (best-five  (lambda ()
			     (if (> (length all-valid) 5)
				 (map (lambda (x)
					(hash-table-ref serversdat x))
				      (take all-valid 5))
				 all-valid))))







	  (case mode
	    ((info)
	     (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
	     (print "youngest: "(hash-table-ref serversdat (car all-valid))))
	    ((home) host)


	    ((best-five)(best-five))
	    ((valid) (map (lambda (x)(hash-table-ref serverdat x)) all-valid))
	    ((best)(let* ((best-five (best-five))
			  (len       (length best-five)))
		     (list-ref best-five len)))
			  
	    (else
	     (debug:print 0 *default-log-port* "ERROR: invalid command "mode)
	     #f)))
	#f)))

(define (server:get-homehost #!key (trynum 5))
  ;; called often especially at start up. use mutex to eliminate collisions
  (mutex-lock! *homehost-mutex*)
  (cond
   (*home-host*
    (mutex-unlock! *homehost-mutex*)
    *home-host*)
   ((not *toppath*)
    (mutex-unlock! *homehost-mutex*)
    (launch:setup) ;; safely mutexed now
    (if (> trynum 0)
	(begin
	  (thread-sleep! 2)
	  (server:get-homehost trynum: (- trynum 1)))
	#f))
   (else
    (let* ((currhost (get-host-name))
	   (bestadrs (server:get-best-guess-address currhost))
	   (homehost (server:choose-server *toppath* 'home))
	   (at-home  (or (equal? homehost currhost)
			 (equal? homehost bestadrs))))

      ;; if no homehost start server, wait a bit and check again
      (if homehost
	  (begin
	    (set! *home-host* (cons homehost at-home))
	    (mutex-unlock! *homehost-mutex*)
	    *home-host*)
	  (begin
	    (server:kind-run *toppath*)
	    (thread-sleep! 5)
	    (server:get-homehost trynum: (- trynum 1))))))))

;;======================================================================
;; am I on the homehost?
;;
(define (common:on-homehost?)
  (let ((hh (server:get-homehost)))
    (if hh
	(cdr hh)
	#f)))

	  
;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least <server idletime> seconds old







|
>
>
>
>
>
>
>




|
>
>
|
|
|
|
|






<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485










































486
487
488
489
490
491
492
				     (equal? host (list-ref (hash-table-ref serversdat x) 0)))
				   by-time-asc))
	       (best-five  (lambda ()
			     (if (> (length all-valid) 5)
				 (map (lambda (x)
					(hash-table-ref serversdat x))
				      (take all-valid 5))
				 all-valid)))
	       (names->dats (lambda (names)
			      (map (lambda (x)(hash-table-ref serversdat x)) names)))
	       (am-home?    (lambda ()
			      (let* ((currhost (get-host-name))
				     (bestadrs (server:get-best-guess-address currhost)))
				(or (equal? host currhost)
				    (equal? host bestadrs))))))
	  (case mode
	    ((info)
	     (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
	     (print "youngest: "(hash-table-ref serversdat (car all-valid))))
	    ((home)     host)
	    ((homehost) (cons host (am-home?))) ;; shut up old code
	    ((home?)    (am-home?))
	    ((best-five)(names->dats (best-five)))
	    ((all-valid)(names->dats all-valid))
	    ((best)     (let* ((best-five (best-five))
			       (len       (length best-five)))
			  (list-ref best-five (random len))))
			  
	    (else
	     (debug:print 0 *default-log-port* "ERROR: invalid command "mode)
	     #f)))
	#f)))











































	  
;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least <server idletime> seconds old
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
	       (try-num    0))
      (if (or server-info
	      (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
	  (server:record->url server-info)
	  (let ((num-ok (length (server:choose-server areapath 'all-valid))))
	    (if (and (> try-num 0)  ;; first time through simply wait a little while then try again
		     (< num-ok 1))  ;; if there are no decent candidates for servers then try starting a new one
		(server:kind-run areapath))
	    (thread-sleep! 5)
	    (loop (server:check-if-running areapath)
		  (+ try-num 1)))))))

(define (server:get-num-servers #!key (numservers 2))
  (let ((ns (string->number
	     (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))







|







511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
	       (try-num    0))
      (if (or server-info
	      (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
	  (server:record->url server-info)
	  (let ((num-ok (length (server:choose-server areapath 'all-valid))))
	    (if (and (> try-num 0)  ;; first time through simply wait a little while then try again
		     (< num-ok 1))  ;; if there are no decent candidates for servers then try starting a new one
		(server:run areapath))
	    (thread-sleep! 5)
	    (loop (server:check-if-running areapath)
		  (+ try-num 1)))))))

(define (server:get-num-servers #!key (numservers 2))
  (let ((ns (string->number
	     (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))