Megatest

Diff
Login

Differences From Artifact [33c7316880]:

To Artifact [a6fd7a1472]:


1
2
3
4
5
6
7
8
;;======================================================================
;; Copyright 2006-2012, Matthew Welland.
;; 
;; This file is part of Megatest.
;; 
;;     Megatest is free software: you can redistribute it and/or modify
;;     it under the terms of the GNU General Public License as published by
;;     the Free Software Foundation, either version 3 of the License, or
|







1
2
3
4
5
6
7
8
;get-u;======================================================================
;; Copyright 2006-2012, Matthew Welland.
;; 
;; This file is part of Megatest.
;; 
;;     Megatest is free software: you can redistribute it and/or modify
;;     it under the terms of the GNU General Public License as published by
;;     the Free Software Foundation, either version 3 of the License, or
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
      (if (> num-tries 0) ;; should be "num-tries-left". 
	  (if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host))
	      (debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of "
			   effective-normalized-load "  continuing."))
	  (debug:print 0 *default-log-port* "Load on " effective-host ", "
		       first" could not be retrieved. Giving up and continuing."))))))

;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5))
;;   (let* ((loadavg (common:get-cpu-load remote-host))
;; 	 (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
;; 		      (common:get-num-cpus remote-host)
;; 		      numcpus-in))
;; 	 (maxload (if force-maxload
;; 		      maxload-in
;; 		      (if (number? maxload-in)
;; 			  (max maxload-in 0.5)
;; 			  0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
;; 	 (first   (car loadavg))
;; 	 (next    (cadr loadavg))
;; 	 (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where
;; 					       ;; numcpus (or could be
;; 					       ;; maxload) is zero,
;; 					       ;; crude fallback is to
;; 					       ;; at least use 1
;; 	 (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next?
;; 			       0
;; 			       next))) ;; we will force a conservative calculation any time next is large.
;; 	 (first-next-avg    (/ (+ first next) 2))
;; 	 ;; add some randomness to the time to break any alignment
;; 	 ;; where netbatch dumps many jobs to machines simultaneously
;;          (adjwait           (min (+ 300 (random 10)) (abs (* (+ (random 10)
;; 								(/ (- 1000 count) 10)
;; 								waitdelay)
;; 							     (- first adjmaxload) ))))
;; 	 (load-jump-limit   (configf:lookup-number *configdat* "setup" "load-jump-limit"))
;; 	 ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
;; 	 ;; etc.
;; 	 (effective-load    (common:get-intercept first next))
;; 	 (effective-host    (or remote-host "localhost"))
;; 	 (normalized-effective-load (/ effective-load numcpus))
;; 	 (will-wait                 (> normalized-effective-load maxload)))
;; 	 
;;     ;; let's let the user know once in a long while that load checking
;;     ;; is happening but not constantly report it
;;     #;(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
;; 	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
;;     ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
;; 
;;     (debug:print-info 1 *default-log-port*
;; 		      "On host: " effective-host
;; 		      ", effective load: " effective-load
;; 		      ", numcpus: " numcpus
;; 		      ", normalized effective load: " normalized-effective-load
;; 		      )
;;     
;;     (cond
;;      ;; bad data, try again to get the data
;;      ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
;; 	   (> num-tries 0))
;;       (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.")
;;       (thread-sleep! 10)
;;       (common:wait-for-cpuload maxload-in numcpus-in waitdelay
;; 			       count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1)))
;;      ;; need to wait for load to drop
;;      ((and will-wait ;; (> first adjmaxload)
;; 	   (> count 0))
;;       (debug:print-info 0 *default-log-port*
;; 			"Delaying " 15 ;; adjwait
;; 			" seconds due to normalized effective load " normalized-effective-load ;; first
;; 			" exceeding max of " adjmaxload
;; 			" on server " (or remote-host (get-host-name))
;; 			" (normalized load-limit: " maxload ") " (if msg msg ""))
;;       (thread-sleep! 15) ;; adjwait)
;;       (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
;;      ((and (> loadjmp (cond
;; 		       (load-jump-limit load-jump-limit)
;; 		       ((> numcpus 8)(/ numcpus 2))
;; 		       ((> numcpus 4)(/ numcpus 1.2))
;; 		       (else 0.5)))
;; 	   (> count 0))
;;       (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". "
;; 			(if msg msg ""))
;;       (thread-sleep! adjwait)
;;       (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
;;      (else
;;       (if (> num-tries 0)
;; 	  (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost")))
;; 	      (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing."))
;; 	  (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing."))))))
;; 
(define (get-uname . params)
  (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
	 (uname #f))
    (if (null? (car uname-res))
	"unknown"
	(caar uname-res))))

;; for reasons I don't understand multiple calls to real-path in parallel threads
;; must be protected by mutexes
;;
(define (common:real-path inpath)
  ;; (process:cmd-run-with-stderr->list "readlink" "-f" inpath)) ;; cmd . params)
  ;; (let-values 
  ;;  (((inp oup pid) (process "readlink" (list "-f" inpath))))
  ;;  (with-input-from-port inp
  ;;    (let loop ((inl (read-line))
  ;;       	(res #f))
  ;;      (print "inl=" inl)
  ;;      (if (eof-object? inl)
  ;;          (begin
  ;;            (close-input-port inp)
  ;;            (close-output-port oup)
  ;;            ;; (process-wait pid)
  ;;            res)
  ;;          (loop (read-line) inl))))))
  (with-input-from-pipe (conc "readlink -f " inpath) read-line))

;;======================================================================
;; D I S K   S P A C E 
;;======================================================================

(define (common:get-disk-space-used fpath)







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<











<
<
<
<
<
<
<
<
<
<
<
<
<
<







2230
2231
2232
2233
2234
2235
2236





















































































2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247














2248
2249
2250
2251
2252
2253
2254
      (if (> num-tries 0) ;; should be "num-tries-left". 
	  (if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host))
	      (debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of "
			   effective-normalized-load "  continuing."))
	  (debug:print 0 *default-log-port* "Load on " effective-host ", "
		       first" could not be retrieved. Giving up and continuing."))))))






















































































(define (get-uname . params)
  (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
	 (uname #f))
    (if (null? (car uname-res))
	"unknown"
	(caar uname-res))))

;; for reasons I don't understand multiple calls to real-path in parallel threads
;; must be protected by mutexes
;;
(define (common:real-path inpath)














  (with-input-from-pipe (conc "readlink -f " inpath) read-line))

;;======================================================================
;; D I S K   S P A C E 
;;======================================================================

(define (common:get-disk-space-used fpath)
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
   ((equal? status "WARN")    "orange")
   ((equal? status "KILLED")  "orange")
   ((equal? status "KILLREQ") "purple")
   ((equal? status "RUNNING") "blue")
   ((equal? status "ABORT")   "brown")
   (else "black")))

;; ;;======================================================================
;; ;; N A N O M S G   C L I E N T
;; ;;======================================================================
;; 
;; 
;; 
;; (define (common:send-dboard-main-changed)
;;   (let* ((dashboard-ips (mddb:get-dashboards)))
;;     (for-each
;;      (lambda (ipadr)
;;        (let* ((soc (common:open-nm-req (conc "tcp://" ipadr)))
;; 	      (msg (conc "main " *toppath*))
;; 	      (res (common:nm-send-receive-timeout soc msg)))
;; 	 (if (not res) ;; couldn't reach that dashboard - remove it from db
;; 	     (print "ERROR: couldn't reach dashboard " ipadr))
;; 	 res))
;;      dashboard-ips)))
;;     
;;     
;; ;;======================================================================
;; ;; D A S H B O A R D   D B 
;; ;;======================================================================
;; 
;; (define (mddb:open-db)
;;   (let* ((db (open-database (conc (get-environment-variable "HOME") "/.dashboard.db"))))
;;     (set-busy-handler! db (busy-timeout 10000))
;;     (for-each
;;      (lambda (qry)
;;        (exec (sql db qry)))
;;      (list 
;;       "CREATE TABLE IF NOT EXISTS vars       (id INTEGER PRIMARY KEY,key TEXT, val TEXT, CONSTRAINT varsconstraint UNIQUE (key));"
;;       "CREATE TABLE IF NOT EXISTS dashboards (
;;           id         INTEGER PRIMARY KEY,
;;           pid        INTEGER,
;;           username   TEXT,
;;           hostname   TEXT,
;;           ipaddr     TEXT,
;;           portnum    INTEGER,
;;           start_time TIMESTAMP DEFAULT (strftime('%s','now')),
;;              CONSTRAINT hostport UNIQUE (hostname,portnum)
;;         );"
;;       ))
;;     db))
;; 
;; ;; register a dashboard 
;; ;;
;; (define (mddb:register-dashboard port)
;;   (let* ((pid      (current-process-id))
;; 	 (hostname (get-host-name))
;; 	 (ipaddr   (server:get-best-guess-address hostname))
;; 	 (username (current-user-name)) ;; (car userinfo)))
;; 	 (db      (mddb:open-db)))
;;     (print "Register monitor, pid: " pid ", hostname: " hostname ", port: " port ", username: " username)
;;     (exec (sql db "INSERT OR REPLACE INTO dashboards (pid,username,hostname,ipaddr,portnum) VALUES (?,?,?,?,?);")
;; 	   pid username hostname ipaddr port)
;;     (close-database db)))
;; 
;; ;; unregister a monitor
;; ;;
;; (define (mddb:unregister-dashboard host port)
;;   (let* ((db      (mddb:open-db)))
;;     (print "Register unregister monitor, host:port=" host ":" port)
;;     (exec (sql db "DELETE FROM dashboards WHERE hostname=? AND portnum=?;") host port)
;;     (close-database db)))
;; 
;; ;; get registered dashboards
;; ;;
;; (define (mddb:get-dashboards)
;;   (let ((db (mddb:open-db)))
;;     (query fetch-column
;; 	   (sql db "SELECT ipaddr || ':' || portnum FROM dashboards;"))))
    
;;======================================================================
;;  T E S T   L A U N C H I N G   P E R   I T E M   W I T H   H O S T   T Y P E S
;;======================================================================
;; 
;; [hosts]
;; arm cubie01 cubie02







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







2987
2988
2989
2990
2991
2992
2993







































































2994
2995
2996
2997
2998
2999
3000
   ((equal? status "WARN")    "orange")
   ((equal? status "KILLED")  "orange")
   ((equal? status "KILLREQ") "purple")
   ((equal? status "RUNNING") "blue")
   ((equal? status "ABORT")   "brown")
   (else "black")))








































































    
;;======================================================================
;;  T E S T   L A U N C H I N G   P E R   I T E M   W I T H   H O S T   T Y P E S
;;======================================================================
;; 
;; [hosts]
;; arm cubie01 cubie02