Megatest

Check-in [a0384d728b]
Login
Overview
Comment:Steps towards eliminating homehost
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.70-nohomehost
Files: files | file ages | folders
SHA1: a0384d728b3762793b3a1c47035b36839fd810f2
User & Date: matt on 2022-11-12 07:03:12
Other Links: branch diff | manifest | tags
Context
2022-11-12
18:34
server:choose-server now working. check-in: cc546c7dfe user: matt tags: v1.70-nohomehost
07:03
Steps towards eliminating homehost check-in: a0384d728b user: matt tags: v1.70-nohomehost
2022-11-11
09:42
Partial implementation of homehost-free check-in: ac69021df7 user: matt tags: v1.70-nohomehost
Changes

Modified archive.scm from [9231707c41] to [5c03589f24].

344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
        (archiver       (let ((s (configf:lookup *configdat* "archive" "archiver")))
			   (if s (string->symbol s) 'bup)))
        (rsync-exe (or (configf:lookup *configdat* "archive" "rsync") "rsync"))   
        (print-prefix      "Running: ") 
        (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target-patt run-patt "megatest-db"))
	(archive-dir  (if archive-info (cdr archive-info) #f))
	(archive-id   (if archive-info (car archive-info) -1))
        (home-host (common:get-homehost))
        (archive-time (seconds->std-time-str (current-seconds)))
        (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
        (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
        (dbfile             (conc  archive-staging-db "/megatest.db"))) 
        (create-directory archive-staging-db #t)
        (let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix)))
            (if (eq? exit-code 0)   







|







344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
        (archiver       (let ((s (configf:lookup *configdat* "archive" "archiver")))
			   (if s (string->symbol s) 'bup)))
        (rsync-exe (or (configf:lookup *configdat* "archive" "rsync") "rsync"))   
        (print-prefix      "Running: ") 
        (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target-patt run-patt "megatest-db"))
	(archive-dir  (if archive-info (cdr archive-info) #f))
	(archive-id   (if archive-info (car archive-info) -1))
        (home-host (server:get-homehost))
        (archive-time (seconds->std-time-str (current-seconds)))
        (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
        (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
        (dbfile             (conc  archive-staging-db "/megatest.db"))) 
        (create-directory archive-staging-db #t)
        (let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix)))
            (if (eq? exit-code 0)   

Modified common.scm from [1861c8c274] to [c4aca77534].

314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
    ((skip) "SKIP")
    (else "FAIL")))

(define (common:logpro-exit-code->test-status exit-code)
  (status-sym->string (common:logpro-exit-code->status-sym exit-code)))

(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        #f) ;; (server:check-if-running *toppath*) #f))
  (server-id         #f)
  (server-info       (if *toppath* (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (connect-time      (current-seconds))
  (conndat           #f)
  (transport         *transport-type*)







|







314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
    ((skip) "SKIP")
    (else "FAIL")))

(define (common:logpro-exit-code->test-status exit-code)
  (status-sym->string (common:logpro-exit-code->status-sym exit-code)))

(defstruct remote
  (hh-dat            (server:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        #f) ;; (server:check-if-running *toppath*) #f))
  (server-id         #f)
  (server-info       (if *toppath* (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (connect-time      (current-seconds))
  (conndat           #f)
  (transport         *transport-type*)
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
      #f))

;;======================================================================
;; logic for getting homehost. Returns (host . at-home)
;; IF *toppath* is not set, wait up to five seconds trying every two seconds
;; (this is to accomodate the watchdog)
;;
(define (common:get-homehost #!key (trynum 5))
  ;; called often especially at start up. use mutex to eliminate collisions
  (mutex-lock! *homehost-mutex*)
  (cond
   (*home-host*
    (mutex-unlock! *homehost-mutex*)
    *home-host*)
   ((not *toppath*)
    (mutex-unlock! *homehost-mutex*)
    (launch:setup) ;; safely mutexed now
    (if (> trynum 0)
	(begin
	  (thread-sleep! 2)
	  (common:get-homehost trynum: (- trynum 1)))
	#f))
   (else
    (let* ((currhost (get-host-name))
	   (bestadrs (server:get-best-guess-address currhost))
	   ;; first look in config, then look in file .homehost, create it if not found
	   (homehost (or (configf:lookup *configdat* "server" "homehost" )
			 (handle-exceptions
			     exn
			     (if (> trynum 0)
				 (let ((delay-time (* (- 5 trynum) 5)))
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying "
						delay-time " seconds and trying again, message: "  ((condition-property-accessor 'exn 'message) exn)
						", exn=" exn)
				   (thread-sleep! delay-time)
				   (common:get-homehost trynum: (- trynum 1)))
				 (begin
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)
						"] Failed to read .homehost file after trying five times. Giving up and exiting, message: "
						((condition-property-accessor 'exn 'message) exn))
				   (exit 1)))
			   (let ((hhf (conc *toppath* "/.homehost")))
			     (if (common:file-exists? hhf)
				 (with-input-from-file hhf read-line)
				 (if (file-write-access? *toppath*)
				     (begin
				       (with-output-to-file hhf
					 (lambda ()
					   (print bestadrs)))
				       (begin
					 (mutex-unlock! *homehost-mutex*)
					 (car (common:get-homehost))))
				     #f))))))
	   (at-home  (or (equal? homehost currhost)
			 (equal? homehost bestadrs))))
      (set! *home-host* (cons homehost at-home))
      (mutex-unlock! *homehost-mutex*)
      *home-host*))))

;;======================================================================
;; am I on the homehost?
;;
(define (common:on-homehost?)
  (let ((hh (common:get-homehost)))
    (if hh
	(cdr hh)
	#f)))

;;======================================================================
;; do we honor the caches of the config files?
;;
(define (common:use-cache?)
  (let ((res #t)) ;; priority by order of evaluation
    (if *configdat* ;; sillyness here. can't use setup/use-cache to know if we can use the cached files!







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







1304
1305
1306
1307
1308
1309
1310






























































1311
1312
1313
1314
1315
1316
1317
      #f))

;;======================================================================
;; logic for getting homehost. Returns (host . at-home)
;; IF *toppath* is not set, wait up to five seconds trying every two seconds
;; (this is to accomodate the watchdog)
;;































































;;======================================================================
;; do we honor the caches of the config files?
;;
(define (common:use-cache?)
  (let ((res #t)) ;; priority by order of evaluation
    (if *configdat* ;; sillyness here. can't use setup/use-cache to know if we can use the cached files!
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
		    (host-last-used-set! rec curr-time)
		    new-best)
		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))

(define (common:wait-for-homehost-load maxnormload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (common:get-homehost)))
         (hh     (if hh-dat (car hh-dat) #f)))
    (common:wait-for-normalized-load maxnormload msg hh)))

(define (common:get-num-cpus remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    ;; hosts had better not be changing the number of cpus too often!
    (or (hash-table-ref/default *numcpus-cache* actual-host #f)







|







1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
		    (host-last-used-set! rec curr-time)
		    new-best)
		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))

(define (common:wait-for-homehost-load maxnormload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (server:get-homehost)))
         (hh     (if hh-dat (car hh-dat) #f)))
    (common:wait-for-normalized-load maxnormload msg hh)))

(define (common:get-num-cpus remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    ;; hosts had better not be changing the number of cpus too often!
    (or (hash-table-ref/default *numcpus-cache* actual-host #f)

Modified dashboard.scm from [6283f67b19] to [d1f71cff63].

3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
          (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") 
          (exit 1)
        )
    )

    (if (not (common:on-homehost?))
    (begin
      (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (common:get-homehost))
      (debug:print 0 *default-log-port* "It will be slower.")
      ))


    (if (and (common:file-exists? mtdb-path)
	     (file-write-access? mtdb-path))
	(if (not (args:get-arg "-skip-version-check"))







|







3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
          (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") 
          (exit 1)
        )
    )

    (if (not (common:on-homehost?))
    (begin
      (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (server:get-homehost))
      (debug:print 0 *default-log-port* "It will be slower.")
      ))


    (if (and (common:file-exists? mtdb-path)
	     (file-write-access? mtdb-path))
	(if (not (args:get-arg "-skip-version-check"))

Modified launch.scm from [60d380c61b] to [56e9ef9407].

1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
      (set! cmdparms (base64:base64-encode 
		      (z3:encode-buffer 
		       (with-output-to-string
			 (lambda () ;; (list 'hosts     hosts)
			   (write (list (list 'testpath  test-path)
					;; (list 'transport (conc *transport-type*))
					;; (list 'serverinf *server-info*)
					(list 'homehost  (let* ((hhdat (common:get-homehost)))
							   (if hhdat
							       (car hhdat)
							       #f)))
					(list 'serverurl (if *runremote*
							     (remote-server-url *runremote*)
							     #f)) ;;
					(list 'areaname  (common:get-testsuite-name))







|







1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
      (set! cmdparms (base64:base64-encode 
		      (z3:encode-buffer 
		       (with-output-to-string
			 (lambda () ;; (list 'hosts     hosts)
			   (write (list (list 'testpath  test-path)
					;; (list 'transport (conc *transport-type*))
					;; (list 'serverinf *server-info*)
					(list 'homehost  (let* ((hhdat (server:get-homehost)))
							   (if hhdat
							       (car hhdat)
							       #f)))
					(list 'serverurl (if *runremote*
							     (remote-server-url *runremote*)
							     #f)) ;;
					(list 'areaname  (common:get-testsuite-name))

Modified rmt.scm from [4af82949dd] to [a7494b375d].

114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
    
    ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
    ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
    ;; DOT SET_HOMEHOST -> MUTEXLOCK;
    ;; ensure we have a homehost record
    (if (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
	(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
	(remote-hh-dat-set! runremote (common:get-homehost)))
    
    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
    (cond
     #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds
      (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.")
      (set! *runremote* #f)
      ;; BUG: close-connections should go here?







|







114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
    
    ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
    ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
    ;; DOT SET_HOMEHOST -> MUTEXLOCK;
    ;; ensure we have a homehost record
    (if (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
	(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
	(remote-hh-dat-set! runremote (server:get-homehost)))
    
    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
    (cond
     #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds
      (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.")
      (set! *runremote* #f)
      ;; BUG: close-connections should go here?

Modified server.scm from [163356b887] to [08cd1512fb].

124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
	 (curr-ip     (server:get-best-guess-address curr-host))
	 (curr-pid    (current-process-id))
	 (homehost    (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
	 (target-host (car homehost))
	 (testsuite   (common:get-testsuite-name))
	 (logfile     (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
	 (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
			   ""))
	 (cmdln (conc (common:get-megatest-exe)
		      " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")







|







124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
	 (curr-ip     (server:get-best-guess-address curr-host))
	 (curr-pid    (current-process-id))
	 (homehost    (server:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
	 (target-host (car homehost))
	 (testsuite   (common:get-testsuite-name))
	 (logfile     (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
	 (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
			   ""))
	 (cmdln (conc (common:get-megatest-exe)
		      " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
423
424
425
426
427
428
429




430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447




448
449




450


































































451
452

453
454
455
456
457
458
459
	 (hash-table-set! res hostport serverdat)))
     allfiles)
    res))

;; oldest server alive determines host then choose random of youngest
;; five servers on that host
;;




(define (server:choose-server areapath)
  ;; age is current-starttime
  ;; find oldest alive
  ;;   1. sort by age ascending and ping until good
  ;; find alive rand from youngest
  ;;   1. sort by age descending
  ;;   2. take five
  ;;   3. check alive, discard if not and repeat
  (let* ((serversdat  (server:get-servers-info areapath))
	 (by-time-asc (sort (hash-table-keys serversdat)
			    (lambda (a b)
			      (>= (list-ref (hash-table-ref serversdat a) 2)
				  (list-ref (hash-table-ref serversdat b) 2))))))
    (if (not (null? by-time-asc))
	(let* ((oldest     (last by-time-asc))
	       (oldest-dat (hash-table-ref serversdat oldest))
	       (host       (list-ref oldest-dat 1))
	       (all-valid  (filter (lambda (x)(equal? host (list-ref x 1))) by-time-asc)))




	  (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
	  (print "youngest: "(hash-table-ref serversdat (car all-valid)))




	  (car all-valid))


































































	#f)))
        

;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least <server idletime> seconds old
  (server:wait-for-server-start-last-flag areapath)







>
>
>
>
|








|







|
>
>
>
>
|
|
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

|
>







423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
	 (hash-table-set! res hostport serverdat)))
     allfiles)
    res))

;; oldest server alive determines host then choose random of youngest
;; five servers on that host
;;
;; mode:
;;   best - get best server (random of newest five)
;;   home - get home host based on oldest server
;;   info - print info
(define (server:choose-server areapath #!optional (mode 'best))
  ;; age is current-starttime
  ;; find oldest alive
  ;;   1. sort by age ascending and ping until good
  ;; find alive rand from youngest
  ;;   1. sort by age descending
  ;;   2. take five
  ;;   3. check alive, discard if not and repeat
  (let* ((serversdat  (server:get-servers-info areapath))
	 (by-time-asc (sort (hash-table-keys serversdat) ;; list of "host:port"
			    (lambda (a b)
			      (>= (list-ref (hash-table-ref serversdat a) 2)
				  (list-ref (hash-table-ref serversdat b) 2))))))
    (if (not (null? by-time-asc))
	(let* ((oldest     (last by-time-asc))
	       (oldest-dat (hash-table-ref serversdat oldest))
	       (host       (list-ref oldest-dat 1))
	       (all-valid  (filter (lambda (x)
				     (equal? host (list-ref (hash-table-ref serversdat x) 1)))
				   by-time-asc)))
	  (case mode
	    ((info)
	     (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
	     (print "youngest: "(hash-table-ref serversdat (car all-valid))))
	    ((home) host)
	    ((best)(if (> (length all-valid) 5)
		       (map (lambda (x)
			      (hash-table-ref serversdat x))
			    (take all-valid 5))))
	    (else
	     (debug:print 0 *default-log-port* "ERROR: invalid command "mode)
	     #f))
	  #f))))

(define (server:get-homehost #!key (trynum 5))
  ;; called often especially at start up. use mutex to eliminate collisions
  (mutex-lock! *homehost-mutex*)
  (cond
   (*home-host*
    (mutex-unlock! *homehost-mutex*)
    *home-host*)
   ((not *toppath*)
    (mutex-unlock! *homehost-mutex*)
    (launch:setup) ;; safely mutexed now
    (if (> trynum 0)
	(begin
	  (thread-sleep! 2)
	  (server:get-homehost trynum: (- trynum 1)))
	#f))
   (else
    (let* ((currhost (get-host-name))
	   (bestadrs (server:get-best-guess-address currhost))
	   ;; first look in config, then look in file .homehost, create it if not found
	   (homehost (or (configf:lookup *configdat* "server" "homehost" )
			 (handle-exceptions
			     exn
			     (if (> trynum 0)
				 (let ((delay-time (* (- 5 trynum) 5)))
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying "
						delay-time " seconds and trying again, message: "  ((condition-property-accessor 'exn 'message) exn)
						", exn=" exn)
				   (thread-sleep! delay-time)
				   (server:get-homehost trynum: (- trynum 1)))
				 (begin
				   (mutex-unlock! *homehost-mutex*)
				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)
						"] Failed to read .homehost file after trying five times. Giving up and exiting, message: "
						((condition-property-accessor 'exn 'message) exn))
				   (exit 1)))
			   (let ((hhf (conc *toppath* "/.homehost")))
			     (if (common:file-exists? hhf)
				 (with-input-from-file hhf read-line)
				 (if (file-write-access? *toppath*)
				     (begin
				       (with-output-to-file hhf
					 (lambda ()
					   (print bestadrs)))
				       (begin
					 (mutex-unlock! *homehost-mutex*)
					 (car (server:get-homehost))))
				     #f))))))
	   (at-home  (or (equal? homehost currhost)
			 (equal? homehost bestadrs))))
      (set! *home-host* (cons homehost at-home))
      (mutex-unlock! *homehost-mutex*)
      *home-host*))))

;;======================================================================
;; am I on the homehost?
;;
(define (common:on-homehost?)
  (let ((hh (server:get-homehost)))
    (if hh
	(cdr hh)
	#f)))

	  
;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least <server idletime> seconds old
  (server:wait-for-server-start-last-flag areapath)