Megatest

Diff
Login

Differences From Artifact [7dc5a4ab80]:

To Artifact [76e32911e1]:


37
38
39
40
41
42
43
44

45
46
47
48
49
50
51
52
53



















54
55
56
57
58
59
60
	md5
	message-digest
	(prefix mtconfigf configf:)
	;; stml2
	;; (prefix margs args:)
	z3 (prefix base64 base64:)
	(prefix mtargs args:)
	directory-utils)


(include "common_records.scm")
(include "megatest-fossil-hash.scm")
(include "megatest-version.scm")

 ;; no need to export this
(define *verbosity-cache* (make-hash-table))
(define *verbosity* 0)






















;; GLOBALS

;; CONTEXTS
#;(defstruct cxt
  (taskdb #f)







|
>









>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
	md5
	message-digest
	(prefix mtconfigf configf:)
	;; stml2
	;; (prefix margs args:)
	z3 (prefix base64 base64:)
	(prefix mtargs args:)
	directory-utils
	sparse-vectors)

(include "common_records.scm")
(include "megatest-fossil-hash.scm")
(include "megatest-version.scm")

 ;; no need to export this
(define *verbosity-cache* (make-hash-table))
(define *verbosity* 0)


(defstruct remote
  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
  (server-url        #f) ;; (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (conndat           #f)
  (transport         *transport-type*)
  (server-timeout    #f) ;; (server:expiration-timeout))
  (force-server      #f)
  (ro-mode           #f)  
  (ro-mode-checked   #f)) ;; flag that indicates we have checked for ro-mode

;; launching and hosts
(defstruct host
  (reachable    #f)
  (last-update  0)
  (last-used    0)
  (last-cpuload 1))



;; GLOBALS

;; CONTEXTS
#;(defstruct cxt
  (taskdb #f)
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
           (else (car argv))))
         (fullpath (realpath this-script)))
    fullpath))
(define *common:this-exe-fullpath* (common:get-this-exe-fullpath))
(define *common:this-exe-dir*      (pathname-directory *common:this-exe-fullpath*))
(define *common:this-exe-name*     (pathname-strip-directory *common:this-exe-fullpath*))

;; Move me elsewhere ...
;; RADT => Why do we meed the version check here, this is called only if version misma
;;
(define (common:cleanup-db dbstruct #!key (full #f))
  (apply db:multi-db-sync 
   dbstruct
   'schema
   ;; 'new2old
   'killservers
   'adj-target
   ;; 'old2new
   'new2old
   ;; (if full
       '(dejunk)
       ;; '())
       )
  (if (common:api-changed?)
      (common:set-last-run-version)))

(define (common:snapshot-file filepath #!key (subdir  ".") )
  (if (file-exists? filepath)
      (let* ((age-sec  (lambda (file)
                         (if (file-exists? file)
                             (- (current-seconds) (file-modification-time file))
                             1000000000))) ;; return really old value if file doesn't exist.  we want to clobber it if old or not exist.
             (ok-flag  #t)







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







1684
1685
1686
1687
1688
1689
1690



















1691
1692
1693
1694
1695
1696
1697
           (else (car argv))))
         (fullpath (realpath this-script)))
    fullpath))
(define *common:this-exe-fullpath* (common:get-this-exe-fullpath))
(define *common:this-exe-dir*      (pathname-directory *common:this-exe-fullpath*))
(define *common:this-exe-name*     (pathname-strip-directory *common:this-exe-fullpath*))




















(define (common:snapshot-file filepath #!key (subdir  ".") )
  (if (file-exists? filepath)
      (let* ((age-sec  (lambda (file)
                         (if (file-exists? file)
                             (- (current-seconds) (file-modification-time file))
                             1000000000))) ;; return really old value if file doesn't exist.  we want to clobber it if old or not exist.
             (ok-flag  #t)
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
		     (handle-exceptions
		      exn
		      (debug:print-error 0 *default-log-port* "failed to remove " fullname)
		      (delete-file* fullname)))))
	     files)
	    (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files."))))))
  
;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
;; Do NOT check if not on homehost!
;;
(define (common:exit-on-version-changed)
  (if (common:on-homehost?)
      (if (common:api-changed?)
	  (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config"))
                (dbfile  (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.db"))
                (read-only (not (file-write-access? dbfile)))
                (dbstruct (db:setup #t)))
	    (debug:print 0 *default-log-port*
			 "WARNING: Version mismatch!\n"
			 "   expected: " (common:version-signature) "\n"
			 "   got:      " (common:get-last-run-version))
            (cond
             ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t)
             ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only)
                   (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db
              (debug:print 0 *default-log-port* "   I see you are the owner of megatest.config, attempting to cleanup and reset to new version")
              (handle-exceptions
               exn
               (begin
                 (debug:print 0 *default-log-port* "Failed to switch versions.")
                 (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
                 (print-call-chain (current-error-port))
                 (exit 1))
               (common:cleanup-db dbstruct)))
             ((not (common:file-exists? mtconf))
              (debug:print 0 *default-log-port* "   megatest.config does not exist in this area.  Cannot proceed with megatest version migration.")
              (exit 1))
             ((not (common:file-exists? dbfile))
              (debug:print 0 *default-log-port* "   megatest.db does not exist in this area.  Cannot proceed with megatest version migration.")
              (exit 1))
             ((not (eq? (current-user-id)(file-owner mtconf)))
              (debug:print 0 *default-log-port* "   You do not own megatest.db in this area.  Cannot proceed with megatest version migration.")
              (exit 1))
             (read-only
              (debug:print 0 *default-log-port* "   You have read-only access to this area.  Cannot proceed with megatest version migration.")
              (exit 1))
             (else
              (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"")
              (exit 1)))))))
;;      (begin
;;	(debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.")
;;	(exit 1))))

;;======================================================================
;; S P A R S E   A R R A Y S
;;======================================================================

(define (make-sparse-array)
  (let ((a (make-sparse-vector)))
    (sparse-vector-set! a 0 (make-sparse-vector))







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







1847
1848
1849
1850
1851
1852
1853














































1854
1855
1856
1857
1858
1859
1860
		     (handle-exceptions
		      exn
		      (debug:print-error 0 *default-log-port* "failed to remove " fullname)
		      (delete-file* fullname)))))
	     files)
	    (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files."))))))
  














































;;======================================================================
;; S P A R S E   A R R A Y S
;;======================================================================

(define (make-sparse-array)
  (let ((a (make-sparse-vector)))
    (sparse-vector-set! a 0 (make-sparse-vector))
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197

(define (common:simple-file-release-lock fname)
  (handle-exceptions
      exn
      #f ;; I don't really care why this failed (at least for now)
    (delete-file* fname)))

;;======================================================================
;; D E B U G G I N G   S T U F F 
;;======================================================================

;; (define *verbosity*         1)
;; (define *logging*           #f)

(define (common:set-last-run-version)
  (rmt:set-var "MEGATEST_VERSION" (common:version-signature)))

;; postive number if megatest version > db version
;; negative number if megatest version < db version
(define (common:version-db-delta)
         (- megatest-version (common:get-last-run-version-number)))

(define (common:version-changed?)
  (not (equal? (common:get-last-run-version)
               (common:version-signature))))

;; from metadat lookup MEGATEST_VERSION
;;
(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB
  (rmt:get-var "MEGATEST_VERSION"))

(define (common:get-last-run-version-number)
  (string->number 
   (substring (common:get-last-run-version) 0 6)))

(define (common:api-changed?)
  (not (equal? (substring (->string megatest-version) 0 4)
               (substring (conc (common:get-last-run-version)) 0 4))))


;;======================================================================
;; E X I T   H A N D L I N G
;;======================================================================

(define (common:run-sync?)
    (and (common:on-homehost?)
	 (args:get-arg "-server")))

;;   (let ((ohh (common:on-homehost?))
;; 	(srv (args:get-arg "-server")))
;;     (and ohh srv)))
    ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv)

(define *watchdog* (make-thread
		    (lambda ()
		      (handle-exceptions
			  exn
			  (begin
			    (print-call-chain)
			    (print " message: " ((condition-property-accessor 'exn 'message) exn)))
			(common:watchdog)))
		    "Watchdog thread"))

;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp
;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server)
;;
(define (common:readonly-watchdog dbstruct)
  (thread-sleep! 0.05) ;; delay for startup
  (debug:print-info 13 *default-log-port* "common:readonly-watchdog entered.")
  ;; sync megatest.db to /tmp/.../megatst.db
  (let* ((sync-cool-off-duration   3)
        (golden-mtdb     (dbr:dbstruct-mtdb dbstruct))
        (golden-mtpath   (db:dbdat-get-path golden-mtdb))
        (tmp-mtdb        (dbr:dbstruct-tmpdb dbstruct))
        (tmp-mtpath      (db:dbdat-get-path tmp-mtdb)))
    (debug:print-info 0 *default-log-port* "Read-only periodic sync thread started.")
    (let loop ((last-sync-time 0))
      (debug:print-info 13 *default-log-port* "loop top tmp-mtpath="tmp-mtpath" golden-mtpath="golden-mtpath)
      (let* ((duration-since-last-sync (- (current-seconds) last-sync-time)))
        (debug:print-info 13 *default-log-port* "duration-since-last-sync="duration-since-last-sync)
        (if (and (not *time-to-exit*)
                 (< duration-since-last-sync sync-cool-off-duration))
            (thread-sleep! (- sync-cool-off-duration duration-since-last-sync)))
        (if (not *time-to-exit*)
            (let ((golden-mtdb-mtime (file-modification-time golden-mtpath))
                  (tmp-mtdb-mtime    (file-modification-time tmp-mtpath)))
	      (if (> golden-mtdb-mtime tmp-mtdb-mtime)
		  (if (< golden-mtdb-mtime (- (current-seconds) 3)) ;; file has NOT been touched in past three seconds, this way multiple servers won't fight to sync back
		      (let ((res (db:multi-db-sync dbstruct 'old2new)))
			(debug:print-info 13 *default-log-port* "rosync called, " res " records transferred."))))
              (loop (current-seconds)))
            #t)))
    (debug:print-info 0 *default-log-port* "Exiting readonly-watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" mtpath="golden-mtpath)))

;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage
(define (common:watchdog)
  (debug:print-info 13 *default-log-port* "common:watchdog entered.")
  (if (launch:setup)
      (if (common:on-homehost?)
	  (let ((dbstruct (db:setup #t)))
	    (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct)
	    (cond
	     ((dbr:dbstruct-read-only dbstruct)
	      (debug:print-info 13 *default-log-port* "loading read-only watchdog")
	      (common:readonly-watchdog dbstruct))
	     (else
	      (debug:print-info 13 *default-log-port* "loading writable-watchdog.")
              (let* ((syncer (or (configf:lookup *configdat* "server" "sync-method") "brute-force-sync")))
                (cond
                 ((equal? syncer "brute-force-sync")
                  (server:writable-watchdog-bruteforce dbstruct))
                 ((equal? syncer "delta-sync")
                  (server:writable-watchdog-deltasync dbstruct))
                 (else
                  (debug:print-error 0 *default-log-port* "Unknown server/sync-method specified ("syncer") - valid values are brute-force-sync and delta-sync.")
                  (exit 1)))
                ;;(debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] Syncer started (method="syncer")")
                )))
	    (debug:print-info 13 *default-log-port* "watchdog done."))
	  (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))))


(define (std-exit-procedure)
  ;;(common:telemetry-log-close)
  (on-exit (lambda () 0))
  ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
  (let ((no-hurry  (if *time-to-exit* ;; hurry up
		       #f
		       (begin
			 (set! *time-to-exit* #t)
			 #t))))
    (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
    (if (and no-hurry (debug:debug-mode 18))
	(rmt:print-db-stats))
    (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds
                              (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated
			      (if *task-db*    
				  (let ((db (cdr *task-db*)))
				    (if (sqlite3:database? db)
					(begin
					  (sqlite3:interrupt! db)
					  (sqlite3:finalize! db #t)
					  ;; (vector-set! *task-db* 0 #f)
					  (set! *task-db* #f)))))
                              (http-client#close-all-connections!)
                              ;; (if (and *runremote*
                              ;;          (remote-conndat *runremote*))
                              ;;     (begin
                              ;;       (http-client#close-all-connections!))) ;; for http-client
                              (if (not (eq? *default-log-port* (current-error-port)))
                                  (close-output-port *default-log-port*))
			      (set! *default-log-port* (current-error-port))) "Cleanup db exit thread"))
	  (th2 (make-thread (lambda ()
			      (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...")
			      (if no-hurry
                                  (begin
                                    (thread-sleep! 5)) ;; give the clean up few seconds to do it's stuff
                                  (begin
      				  (thread-sleep! 2)))
      			      (debug:print 4 *default-log-port* " ... done")
      			      )
			    "clean exit")))
      (thread-start! th1)
      (thread-start! th2)
      (thread-join! th1)
      )
    )

  0)

(define (std-signal-handler signum)
  ;; (signal-mask! signum)
  (set! *time-to-exit* #t) 
  ;;(debug:print-info 13 *default-log-port* "got signal "signum)
  (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly")
  ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway
  (exit))

(define (special-signal-handler signum)
  ;; (signal-mask! signum)
  (set! *time-to-exit* #t)
  ;;(debug:print-info 13 *default-log-port* "got signal "signum)
  (debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting!!")
  ;;TODO send email to notify admin contact listed in the config that the lisner got killed
  ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway
  (exit))


(set-signal-handler! signal/int  std-signal-handler)  ;; ^C
(set-signal-handler! signal/term std-signal-handler)

;; (set-signal-handler! signal/stop std-signal-handler)  ;; ^Z NO, do NOT handle ^Z!

;; '(print (string-intersperse (map cadr (hash-table-ref/default (read-config "megatest.config" \#f \#t) "disks" '"'"'("none" ""))) "\n"))'
(define (common:get-disks #!key (configf #f))
  (hash-table-ref/default 
   (or configf (read-config "megatest.config" #f #t))
   "disks" '("none" "")))

;;======================================================================
;; T A R G E T S  ,   S T A T E ,   S T A T U S ,   
;;                    R U N N A M E    A N D   T E S T P A T T
;;======================================================================

;; (map print (map car (hash-table->alist (read-config "runconfigs.config" #f #t))))
;;
(define (common:get-runconfig-targets #!key (configf #f))
  (let ((targs       (sort (map car (hash-table->alist
				     (or configf ;; NOTE: There is no value in using runconfig:read here.
					 (read-config (conc *toppath* "/runconfigs.config")
						      #f #t)
					 (make-hash-table))))
			   string<?))
	(target-patt (args:get-arg "-target")))
    (if target-patt
	(filter (lambda (x)
		  (patt-list-match x target-patt))







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<










|







1936
1937
1938
1939
1940
1941
1942
































































































































































































1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960

(define (common:simple-file-release-lock fname)
  (handle-exceptions
      exn
      #f ;; I don't really care why this failed (at least for now)
    (delete-file* fname)))

































































































































































































;;======================================================================
;; T A R G E T S  ,   S T A T E ,   S T A T U S ,   
;;                    R U N N A M E    A N D   T E S T P A T T
;;======================================================================

;; (map print (map car (hash-table->alist (read-config "runconfigs.config" #f #t))))
;;
(define (common:get-runconfig-targets #!key (configf #f))
  (let ((targs       (sort (map car (hash-table->alist
				     (or configf ;; NOTE: There is no value in using runconfig:read here.
					 (configf:read-config (conc *toppath* "/runconfigs.config")
						      #f #t)
					 (make-hash-table))))
			   string<?))
	(target-patt (args:get-arg "-target")))
    (if target-patt
	(filter (lambda (x)
		  (patt-list-match x target-patt))
2206
2207
2208
2209
2210
2211
2212


2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
  ;; called often especially at start up. use mutex to eliminate collisions
  (mutex-lock! *homehost-mutex*)
  (cond
   (*home-host*
    (mutex-unlock! *homehost-mutex*)
    *home-host*)
   ((not *toppath*)


    (mutex-unlock! *homehost-mutex*)
    (launch:setup) ;; safely mutexed now
    (if (> trynum 0)
	(begin
	  (thread-sleep! 2)
	  (common:get-homehost trynum: (- trynum 1)))
	#f))
   (else
    (let* ((currhost (get-host-name))
	   (bestadrs (server:get-best-guess-address currhost))
	   ;; first look in config, then look in file .homehost, create it if not found
	   (homehost (or (configf:lookup *configdat* "server" "homehost" )
			 (handle-exceptions
			     exn







>
>
|
|
|
|
|
|
|







1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
  ;; called often especially at start up. use mutex to eliminate collisions
  (mutex-lock! *homehost-mutex*)
  (cond
   (*home-host*
    (mutex-unlock! *homehost-mutex*)
    *home-host*)
   ((not *toppath*)
    (debug:print-error 0 *default-log-port* "common:get-homehost called but *toppath* not defined!! Exiting.")
    (exit))
;; (mutex-unlock! *homehost-mutex*)
;; (launch:setup) ;; safely mutexed now
;; (if (> trynum 0)
;; 	(begin
;; 	  (thread-sleep! 2)
;; 	  (common:get-homehost trynum: (- trynum 1)))
;; 	#f))
   (else
    (let* ((currhost (get-host-name))
	   (bestadrs (server:get-best-guess-address currhost))
	   ;; first look in config, then look in file .homehost, create it if not found
	   (homehost (or (configf:lookup *configdat* "server" "homehost" )
			 (handle-exceptions
			     exn
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
		      ;; (print "NO MATCH: " hed)
		      (loop (car tal)(cdr tal) loads proc-num phys-num core-num)))))))))))

(define (common:unix-ping hostname)
  (let ((res (system (conc "ping -c 1 " hostname " > /dev/null"))))
    (eq? res 0)))

;; ideally put all this info into the db, no need to preserve it across moving homehost
;;
;; return list of
;;  ( reachable? cpuload update-time )
(define (common:get-host-info hostname)
  (let* ((loadinfo                         (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data
         (load                             (car loadinfo))
         (load-sample-time                 (cdr loadinfo))
         (load-sample-age                  (- (current-seconds) load-sample-time))
         (loadinfo-timeout-seconds         6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds
         (host-last-update-timeout-seconds 4)
         (host-rec (hash-table-ref/default *host-loads* hostname #f))
         )
    (cond
     ((< load-sample-age loadinfo-timeout-seconds)
      (list #t
            load-sample-time
            load))
     ((and host-rec
           (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds)))
      (list #t
            (host-last-update host-rec)
            (host-last-cpuload host-rec )))
     ((common:unix-ping hostname)
      (list #t
            (current-seconds)
            (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds
     (else
      (list #f 0 -1) ;; bad host, don't use!
      ))))

;; see defstruct host at top of file.
;;    host: reachable last-update last-used last-cpuload
;;
(define (common:update-host-loads-table hosts-raw)
  (let* ((hosts (filter (lambda (x)
                          (string-match (regexp "^\\S+$") x))
                        hosts-raw)))
    (for-each
     (lambda (hostname)
       (let* ((rec       (let ((h (hash-table-ref/default *host-loads* hostname #f)))
                          (if h
                              h
                              (let ((h (make-host)))
                                (hash-table-set! *host-loads* hostname h)
                                h))))
              (host-info         (common:get-host-info hostname))
              (is-reachable      (car host-info))
              (last-reached-time (cadr host-info))
              (load              (caddr host-info)))
         (host-reachable-set!    rec is-reachable)
         (host-last-update-set!  rec last-reached-time)
         (host-last-cpuload-set! rec load)))
     hosts)))

;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the
;; [host-rules] section.
;;
(define (common:get-least-loaded-host hosts-raw host-type configdat)
  (let* ((rdat       (configf:lookup configdat "host-rules" host-type))
	 (rules      (common:val->alist (or rdat "") convert: #t))   ;; maxnload, maxnjobs, maxjobrate
	 (maxnload   (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load
	 (maxnjobs   (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs
	 (maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second
	 (hosts      (filter (lambda (x)
			       (string-match (regexp "^\\S+$") x))
			     hosts-raw))
         ;; (best-host #f)
	 (get-rec    (lambda (hostname)
		       ;; (print "get-rec hostname=" hostname)
		       (let ((h (hash-table-ref/default *host-loads* hostname #f)))
			 (if h
			     h
			     (let ((h (make-host)))
			       (hash-table-set! *host-loads* hostname h)
			       h)))))
         (best-load 99999)
         (curr-time (current-seconds))
	 (get-hosts-sorted (lambda (hosts)
			     (sort hosts (lambda (a b)
					   (let ((a-rec (get-rec a))
						 (b-rec (get-rec b)))
					     ;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec))
					     ;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec))
					     (< (host-last-used a-rec)
						(host-last-used b-rec))))))))
    (debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts))
    (if (null? hosts)
	#f ;; no hosts to select from. All done and giving up now.
	(let ((hosts-sorted (get-hosts-sorted hosts)))
	  (common:update-host-loads-table hosts)
	  (let loop ((hostname  (car hosts-sorted))
		     (tal       (cdr hosts-sorted))
		     (best-host #f))
	    (let* ((rec       (get-rec hostname))
		   (reachable (host-reachable     rec))
		   (load      (host-last-cpuload  rec))
		   (last-used (host-last-used     rec))
		   (delta     (- curr-time last-used))
		   (job-rate  (if (> delta 0)
				  (/ 1 delta)
				  999)) ;; jobs per second
		   (new-best  
		    (cond
		     ((not reachable)
		      (debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.")
		      best-host)
		     ((and (< load maxnload)        ;; load is acceptable
			   (< job-rate maxjobrate)) ;; job rate is acceptable
		      (set! best-load load)
		      hostname)
		     (else best-host))))
	      (debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." )
	      (if new-best
		  (begin ;; found a host, return it
		    (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
		    (host-last-used-set! rec curr-time)
		    new-best)
		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))

(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f))
  (let* ((loadavg (common:get-cpu-load remote-host))
	 (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
		      (common:get-num-cpus remote-host)
		      numcpus-in))
	 (maxload (if force-maxload
		      maxload-in
		      (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
	 (loadjmp (- first next))
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) ))  ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
    (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
		      ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)
    (cond
     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)))))

(define (common:wait-for-homehost-load maxload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (common:get-homehost)))
         (hh     (if hh-dat (car hh-dat) #f))
         (numcpus (common:get-num-cpus hh)))
    (common:wait-for-normalized-load maxload msg hh)))

(define (common:get-num-cpus remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often!
	(let* ((proc   (lambda ()
			 (let loop ((numcpu 0)
				    (inl    (read-line)))
			   (if (eof-object? inl)
			       (begin
				 (common:write-cached-info remote-host "num-cpus" numcpu)
				 numcpu)
			       (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
					 (+ numcpu 1)
					 numcpu)
				     (read-line))))))
	       (result (if remote-host
			   (with-input-from-pipe 
			       (conc "ssh " remote-host " cat /proc/cpuinfo")
			     proc)
			   (with-input-from-file "/proc/cpuinfo" proc))))
	  (common:write-cached-info actual-host "num-cpus" result)
	  result))))

;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxload msg remote-host)
  (let ((num-cpus (common:get-num-cpus remote-host)))
    (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host)))

(define (get-uname . params)
  (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
	 (uname #f))
    (if (null? (car uname-res))
	"unknown"
	(caar uname-res))))








<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







2330
2331
2332
2333
2334
2335
2336























































































































































































2337
2338
2339
2340
2341
2342
2343
		      ;; (print "NO MATCH: " hed)
		      (loop (car tal)(cdr tal) loads proc-num phys-num core-num)))))))))))

(define (common:unix-ping hostname)
  (let ((res (system (conc "ping -c 1 " hostname " > /dev/null"))))
    (eq? res 0)))
























































































































































































(define (get-uname . params)
  (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
	 (uname #f))
    (if (null? (car uname-res))
	"unknown"
	(caar uname-res))))

2775
2776
2777
2778
2779
2780
2781













2782
2783
2784
2785
2786
2787
2788
  ;;            (close-input-port inp)
  ;;            (close-output-port oup)
  ;;            ;; (process-wait pid)
  ;;            res)
  ;;          (loop (read-line) inl))))))
  (with-input-from-pipe (conc "readlink -f " inpath) read-line))














;;======================================================================
;; D I S K   S P A C E 
;;======================================================================

(define (common:get-disk-space-used fpath)
  (with-input-from-pipe (conc "/usr/bin/du -s " fpath) read))








>
>
>
>
>
>
>
>
>
>
>
>
>







2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
  ;;            (close-input-port inp)
  ;;            (close-output-port oup)
  ;;            ;; (process-wait pid)
  ;;            res)
  ;;          (loop (read-line) inl))))))
  (with-input-from-pipe (conc "readlink -f " inpath) read-line))

(define (server:get-best-guess-address hostname)
  (let ((res #f))
    (for-each 
     (lambda (adr)
       (if (not (eq? (u8vector-ref adr 0) 127))
	   (set! res adr)))
     ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME
     (vector->list (hostinfo-addresses (hostname->hostinfo hostname))))
    (string-intersperse 
     (map number->string
	  (u8vector->list
	   (if res res (hostname->ip hostname)))) ".")))

;;======================================================================
;; D I S K   S P A C E 
;;======================================================================

(define (common:get-disk-space-used fpath)
  (with-input-from-pipe (conc "/usr/bin/du -s " fpath) read))

3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
;; first read ~/views.config if it exists, then read $MTRAH/views.config if it exists
;;
(define (common:load-views-config)
  (let* ((view-cfgdat    (make-hash-table))
	 (home-cfgfile   (conc (get-environment-variable "HOME") "/.mtviews.config"))
	 (mthome-cfgfile (conc *toppath* "/.mtviews.config")))
    (if (common:file-exists? mthome-cfgfile)
	(read-config mthome-cfgfile view-cfgdat #t))
    ;; we load the home dir file AFTER the MTRAH file so the user can clobber settings when running the dashboard in read-only areas
    (if (common:file-exists? home-cfgfile)
	(read-config home-cfgfile view-cfgdat #t))
    view-cfgdat))

;;======================================================================
;; H I E R A R C H I C A L   H A S H   T A B L E S
;;======================================================================

;; Every element including top element is a vector:







|


|







2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
;; first read ~/views.config if it exists, then read $MTRAH/views.config if it exists
;;
(define (common:load-views-config)
  (let* ((view-cfgdat    (make-hash-table))
	 (home-cfgfile   (conc (get-environment-variable "HOME") "/.mtviews.config"))
	 (mthome-cfgfile (conc *toppath* "/.mtviews.config")))
    (if (common:file-exists? mthome-cfgfile)
	(configf:read-config mthome-cfgfile view-cfgdat #t))
    ;; we load the home dir file AFTER the MTRAH file so the user can clobber settings when running the dashboard in read-only areas
    (if (common:file-exists? home-cfgfile)
	(configf:read-config home-cfgfile view-cfgdat #t))
    view-cfgdat))

;;======================================================================
;; H I E R A R C H I C A L   H A S H   T A B L E S
;;======================================================================

;; Every element including top element is a vector: