Megatest

Artifact [e690f680a7]
Login

Artifact e690f680a7fdd2994ffbf9751baa1fc0e153fdf6:


;; Copyright 2006-2017, Matthew Welland.
;; 
;; This file is part of Megatest.
;; 
;;     Megatest is free software: you can redistribute it and/or modify
;;     it under the terms of the GNU General Public License as published by
;;     the Free Software Foundation, either version 3 of the License, or
;;     (at your option) any later version.
;; 
;;     Megatest is distributed in the hope that it will be useful,
;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;     GNU General Public License for more details.
;; 
;;     You should have received a copy of the GNU General Public License
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
;;

(declare (unit servermod))

(declare (uses commonmod))
(declare (uses configfmod))
(declare (uses debugprint))
(declare (uses mtargs))

(module servermod
	*
	
(use (srfi 18) extras s11n)
(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest)
(use directory-utils posix-extras matchable utils)
(use spiffy uri-common intarweb http-client spiffy-request-vars)

(import commonmod
	configfmod
	debugprint
	(prefix mtargs args:))

(include "common_records.scm")
(include "db_records.scm")

(define (server:make-server-url hostport)
  (if (not hostport)
      #f
      (conc "http://" (car hostport) ":" (cadr hostport))))

(define  *server-loop-heart-beat* (current-seconds))

;;======================================================================
;; P K T S   S T U F F 
;;======================================================================

;; ???

;;======================================================================
;; P K T S   S T U F F 
;;======================================================================

;; ???

;;======================================================================
;; S E R V E R
;;======================================================================

;; Call this to start the actual server
;;

;;======================================================================
;; S E R V E R   U T I L I T I E S 
;;======================================================================

;; Get the transport
(define (server:get-transport)
  (if *transport-type*
      *transport-type*
      (let ((ttype (string->symbol
		    (or (args:get-arg "-transport")
			(configf:lookup *configdat* "server" "transport")
			"rpc"))))
	(set! *transport-type* ttype)
	ttype)))
	    
;; Generate a unique signature for this server
(define (server:mk-signature)
  (message-digest-string (md5-primitive) 
			 (with-output-to-string
			   (lambda ()
			     (write (list (current-directory)
                                          (current-process-id)
					  (argv)))))))

(define (server:get-client-signature)
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic
        (set! *my-client-signature* sig)
        *my-client-signature*)))

(define (server:get-server-id)
  (if *server-id* *server-id*
      (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic
        (set! *server-id* sig)
        *server-id*)))

;; ;; When using zmq this would send the message back (two step process)
;; ;; with spiffy or rpc this simply returns the return data to be returned
;; ;; 
;; (define (server:reply return-addr query-sig success/fail result)
;;   (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result)
;;   ;; (send-message pubsock target send-more: #t)
;;   ;; (send-message pubsock 
;;   (case (server:get-transport)
;;     ((rpc)  (db:obj->string (vector success/fail query-sig result)))
;;     ((http) (db:obj->string (vector success/fail query-sig result)))
;;     ((fs)   result)
;;     (else 
;;      (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
;;      result)))

;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((testsuite   (common:get-testsuite-name))
	 (logfile     (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
	 (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
			   ""))
	 (cmdln (conc (common:get-megatest-exe)
		      " -server - ";; (or target-host "-")
		      (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
			  " -daemonize "
			  "")
		      ;; " -log " logfile
		      " -m testsuite:" testsuite
		      " " profile-mode
		      )) ;; (conc " >> " logfile " 2>&1 &")))))
	 (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
         (load-limit  (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
    ;; we want the remote server to start in *toppath* so push there
    (push-directory areapath)
    (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
    (thread-start! log-rotate)
    
    ;; host.domain.tld match host?
    ;; (if (and target-host 
    ;; 	     ;; look at target host, is it host.domain.tld or ip address and does it 
    ;; 	     ;; match current ip or hostname
    ;; 	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
    ;; 	     (not (equal? curr-ip target-host)))
    ;; 	(begin
    ;; 	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
    ;; 	  (setenv "TARGETHOST" target-host)))
    ;;   
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds server-id
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let 
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 

(define (server:logf-get-start-info logf)
  (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx    (regexp "^SERVER: dbprep"))
        (dbprep-found 0)
	(bad-dat      (list #f #f #f #f #f)))
    (handle-exceptions
     exn
     (begin
       ;; WARNING: this is potentially dangerous to blanket ignore the errors
       (if (file-exists? logf)
	   (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn))
       bad-dat) ;; no idea what went wrong, call it a bad server
     (with-input-from-file
	 logf
       (lambda ()
	 (let loop ((inl  (read-line))
		    (lnum 0))
	   (if (not (eof-object? inl))
	       (let ((mlst (string-match server-rx inl))
		     (dbprep (string-match dbprep-rx inl)))
		 (if dbprep (set! dbprep-found 1))
		 (if (not mlst)
		     (if (< lnum 500) ;; give up if more than 500 lines of server log read
			 (loop (read-line)(+ lnum 1))
			 (begin 
                           (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf )
                           bad-dat))
		     (match mlst
			    ((_ host port start server-id pid)
			     (list host
				   (string->number port)
				   (string->number start)
				   server-id
				   (string->number pid)))
			    (else
			     (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst)
			     bad-dat))))
	       (begin 
		 (if dbprep-found
		     (begin
		       (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time))
		       (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting?
		     (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))))
		 bad-dat))))))))

;; ;; get a list of servers from the log files, with all relevant data
;; ;; ( mod-time host port start-time pid )
;; ;;
;; (define (server:get-list areapath #!key (limit #f))
;;   (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
;; 	(day-seconds (* 24 60 60)))
;;     ;; if the directory exists continue to get the list
;;     ;; otherwise attempt to create the logs dir and then
;;     ;; continue
;;     (if (if (directory-exists? (conc areapath "/logs"))
;; 	    '()
;; 	    (if (file-write-access? areapath)
;; 		(begin
;; 		  (condition-case
;; 		   (create-directory (conc areapath "/logs") #t)
;; 		   (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
;; 		   (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
;; 		  (directory-exists? (conc areapath "/logs")))
;; 		'()))
;; 
;;         ;; Get the list of server logs.
;; 	(let* (
;;                ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers.
;;                ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'")))
;;                (server-logs   (glob (conc areapath "/logs/server-*-*.log")))
;; 	       (num-serv-logs (length server-logs)))
;; 	  (if (or (null? server-logs) (= num-serv-logs 0))
;;               (let ()
;;                  (debug:print 2  *default-log-port* "There are no servers running at " (common:human-time))
;; 	         '()
;;               )
;; 	      (let loop ((hed  (string-chomp (car server-logs)))
;; 			 (tal  (cdr server-logs))
;; 			 (res '()))
;; 		(let* ((mod-time  (handle-exceptions
;; 				   exn
;; 				   (begin
;; 				     (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn)
;; 				     (current-seconds)) ;; 0
;; 				   (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
;; 		       (down-time (- (current-seconds) mod-time))
;; 		       (serv-dat  (if (or (< num-serv-logs 10)
;; 				  	  (< down-time 900)) ;; day-seconds))
;; 				      (server:logf-get-start-info hed)
;; 				      '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at
;; 		       (serv-rec (cons mod-time serv-dat))
;; 		       (fmatch   (string-match fname-rx hed))
;; 		       (pid      (if fmatch (string->number (list-ref fmatch 2)) #f))
;; 		       (new-res  (if (null? serv-dat)
;; 				     res
;; 				     (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let 
;; 		  (if (null? tal)
;; 		      (if (and limit
;; 			       (> (length new-res) limit))
;; 			  new-res ;; (take new-res limit)  <= need intelligent sorting before this will work
;; 			  new-res)
;; 		      (loop (string-chomp (car tal)) (cdr tal) new-res)))))))))

#;(define (server:get-num-alive srvlst)
  (let ((num-alive 0))
    (for-each
     (lambda (server)
        (handle-exceptions
          exn
         (begin 
          (debug:print-info 0 *default-log-port*  "Unable to get server start-time and/or mod-time from " server ", exn=" exn))
       (match-let (((mod-time host port start-time server-id pid)
		    server))
	 (let* ((uptime  (- (current-seconds) mod-time))
		(runtime (if start-time
			     (- mod-time start-time)
			     0)))
	   (if (< uptime 5)(set! num-alive (+ num-alive 1)))))))
     srvlst)
    num-alive))

;; ;; given a list of servers get a list of valid servers, i.e. at least
;; ;; 10 seconds old, has started and is less than 1 hour old and is
;; ;; active (i.e. mod-time < 10 seconds
;; ;;
;; ;; mod-time host port start-time pid
;; ;;
;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off
;; ;; and servers should stick around for about two hours or so.
;; ;;
;; (define (server:get-best srvlst)
;;   (let* ((nums (server:get-num-servers))
;; 	 (now  (current-seconds))
;; 	 (slst (sort
;; 		(filter (lambda (rec)
;; 			  (if (and (list? rec)
;; 				   (> (length rec) 2))
;; 			      (let ((start-time (list-ref rec 3))
;; 				    (mod-time   (list-ref rec 0)))
;; 				;; (print "start-time: " start-time " mod-time: " mod-time)
;; 				(and start-time mod-time
;; 				     (> (- now start-time) 0)    ;; been running at least 0 seconds
;; 				     (< (- now mod-time)   16)   ;; still alive - file touched in last 16 seconds
;; 				     (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set
;; 					 (< (- now start-time)       
;; 					    (+ (- (string->number (configf:lookup *configdat* "server" "runtime"))
;; 					       180)
;; 					    (random 360)))) ;; under one hour running time +/- 180
;; 				     ))
;; 			      #f))
;; 			srvlst)
;; 		(lambda (a b)
;; 		  (< (list-ref a 3)
;; 		     (list-ref b 3))))))
;;     (if (> (length slst) nums)
;; 	(take slst nums)
;; 	slst)))

;; ;; switch from server:get-list to server:get-servers-info
;; ;;
;; (define (server:get-first-best areapath)
;;   (let ((srvrs (server:get-best (server:get-list areapath))))
;;     (if (and srvrs
;; 	     (not (null? srvrs)))
;; 	(car srvrs)
;; 	#f)))
;; 
;; (define (server:get-rand-best areapath)
;;   (let ((srvrs (server:get-best (server:get-list areapath))))
;;     (if (and (list? srvrs)
;; 	     (not (null? srvrs)))
;; 	(let* ((len (length srvrs))
;; 	       (idx (random len)))
;; 	  (list-ref srvrs idx))
;; 	#f)))

(define (server:record->id servr)
  (handle-exceptions
   exn
   (begin 
     (debug:print-info 0 *default-log-port*  "Unable to get server id from " servr ", exn=" exn)     
   #f)
  (match-let (((host port start-time server-id pid)
	       servr))
    (if server-id
	server-id
	#f))))

(define (server:record->url servr)
  (handle-exceptions
   exn
   (begin 
     (debug:print-info 0 *default-log-port*  "Unable to get server url from " servr ", exn=" exn)     
   #f)
  (match-let (((host port start-time server-id pid)
	       servr))
    (if (and host port)
	(conc host ":" port)
	#f))))


;; if server-start-last exists, and wasn't old enough, wait <idle time> + 1, then call this function recursively until it is old enough.
;; if it is old enough, overwrite it and wait 0.25 seconds.
;; if it then has the wrong server key, wait <idle time> + 1 and call this function recursively.
;;
#;(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
	       (old-enough   (> delta idletime))
               (new-server-key ""))
          ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t.
	  ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process.
           (if (and old-enough
		    (begin
                      (debug:print-info 2 *default-log-port* "Writing " start-flag)
		      (with-output-to-file start-flag (lambda () (print server-key)))
		      (thread-sleep! 0.25)
		      (set! new-server-key (with-input-from-file start-flag (lambda () (read-line))))
		      (equal? server-key new-server-key)))
	       #t
               ;; If either of the above conditions is not true, print a "Gating server start" message, wait <idle-time> + 1, then call this function recursively. 
	       (begin
		 (debug:print-info 0 *default-log-port* "Gating server start, last start: "
				   (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))
		 
		 (thread-sleep! ( + 1 idletime))
		 (server:wait-for-server-start-last-flag areapath)))))))

;; oldest server alive determines host then choose random of youngest
;; five servers on that host
;;
(define (server:get-servers-info areapath)
  ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.")
  (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo")))
    (if (not (file-exists? servinfodir))
	(create-directory servinfodir))
    (let* ((allfiles    (glob (conc servinfodir"/*")))
	   (res         (make-hash-table)))
      (for-each
       (lambda (f)
	 (let* ((hostport  (pathname-strip-directory f))
		(serverdat (server:logf-get-start-info f)))
	   (match serverdat
	     ((host port start server-id pid)
	      (if (and host port start server-id pid)
		  (hash-table-set! res hostport serverdat)
		  (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat)))
	     (else
	      (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat)))))
       allfiles)
      res)))

;; check the .servinfo directory, are there other servers running on this
;; or another host?
;;
;; returns #t => ok to start another server
;;         #f => not ok to start another server
;;
(define (server:minimal-check areapath)
  (server:clean-up-old areapath)
  (let* ((srvdir      (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo"))
	 (servrs      (glob (conc srvdir"/*")))
	 (thishostip  (server:get-best-guess-address (get-host-name)))
	 (thisservrs  (glob (conc srvdir"/"thishostip":*")))
	 (homehostinf (server:choose-server areapath 'homehost))
	 (havehome    (car homehostinf))
	 (wearehome   (cdr homehostinf)))
    (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome
		      ", numservers: "(length thisservrs))
    (cond
     ((not havehome) #t) ;; no homehost yet, go for it
     ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another
     ((and havehome (not wearehome)) #f)     ;; we are not the home host
     ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running
     (else
      (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs)
      #t))))
	 

(define server-last-start 0)


;; oldest server alive determines host then choose random of youngest
;; five servers on that host
;;
;; mode:
;;   best - get best server (random of newest five)
;;   home - get home host based on oldest server
;;   info - print info
(define (server:choose-server areapath #!optional (mode 'best))
  ;; age is current-starttime
  ;; find oldest alive
  ;;   1. sort by age ascending and ping until good
  ;; find alive rand from youngest
  ;;   1. sort by age descending
  ;;   2. take five
  ;;   3. check alive, discard if not and repeat
  ;; first we clean up old server files
  (assert (eq? (rmt:transport-mode) 'http) "FATAL: server:run called with rmt:transport-mode="(rmt:transport-mode))
  (server:clean-up-old areapath)
  (let* ((since-last (- (current-seconds) server-last-start))
        (server-start-delay 10))     
    (if ( < (- (current-seconds) server-last-start) 10 )
      (begin
        (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
        (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds")
        (thread-sleep! server-start-delay)
      )
      (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
    )
  )
  (let* ((serversdat  (server:get-servers-info areapath))
	 (servkeys    (hash-table-keys serversdat))
	 (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last
			  (sort servkeys ;; list of "host:port"
				(lambda (a b)
				  (>= (list-ref (hash-table-ref serversdat a) 2)
				      (list-ref (hash-table-ref serversdat b) 2))))
			  '())))
    (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat)
    (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys)
    (if (not (null? by-time-asc))
	(let* ((oldest     (last by-time-asc))
	       (oldest-dat (hash-table-ref serversdat oldest))
	       (host       (list-ref oldest-dat 0))
	       (all-valid  (filter (lambda (x)
				     (equal? host (list-ref (hash-table-ref serversdat x) 0)))
				   by-time-asc))
	       (best-ten  (lambda ()
			     (if (> (length all-valid) 11)
				 (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out
				 (if (> (length all-valid) 8)
				     (drop-right all-valid 1)
				     all-valid))))
	       (names->dats (lambda (names)
			      (map (lambda (x)
				     (hash-table-ref serversdat x))
				   names)))
	       (am-home?    (lambda ()
			      (let* ((currhost (get-host-name))
				     (bestadrs (server:get-best-guess-address currhost)))
				(or (equal? host currhost)
				    (equal? host bestadrs))))))
	  (case mode
	    ((info)
	     (debug:print 0 *default-log-port* "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
	     (debug:print 0 *default-log-port* "youngest: "(hash-table-ref serversdat (car all-valid))))
	    ((home)     host)
	    ((homehost) (cons host (am-home?))) ;; shut up old code
	    ((home?)    (am-home?))
	    ((best-ten)(names->dats (best-ten)))
	    ((all-valid)(names->dats all-valid))
	    ((best)     (let* ((best-ten (best-ten))
			       (len       (length best-ten)))
			  (hash-table-ref serversdat (list-ref best-ten (random len)))))
	    ((count)(length all-valid))
	    (else
	     (debug:print 0 *default-log-port* "ERROR: invalid command "mode)
	     #f)))
	(begin
	  (server:run areapath)
          (set! server-last-start (current-seconds))
	  ;; (thread-sleep! 3)
	  (case mode
	    ((homehost) (cons #f #f))
	    (else	#f))))))

(define (server:get-servinfo-dir areapath)
  (let* ((spath (conc areapath"/.servinfo")))
    (if (not (file-exists? spath))
	(create-directory spath #t))
    spath))

(define (server:clean-up-old areapath)
  ;; any server file that has not been touched in ten minutes is effectively dead
  (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*"))))
    (for-each
     (lambda (sfile)
       (let* ((modtime (handle-exceptions
			   exn
			 (begin
			   (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile)
			   (current-seconds))
			 (file-modification-time sfile))))
	 (if (and (number? modtime)
		  (> (- (current-seconds) modtime)
		     600))
	     (begin
	       (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.")
	       (handle-exceptions
		   exn
		 (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile)
		 (delete-file sfile))))))
     sfiles)))

;; would like to eventually get rid of this
;;
(define (common:on-homehost?)
  (if (eq? (rmt:transport-mode) 'http)
      (server:choose-server *toppath* 'home?)
      #t)) ;; there is no homehost for tcp and nfs is always on home so #t should work

;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least <server idletime> seconds old
  ;; (server:wait-for-server-start-last-flag areapath)
  (let loop ()
    (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2)
	(begin
	  (if (common:low-noise-print 30 "our-host-load")
	      (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server."))
	  (loop))))
  (if (< (server:choose-server areapath 'count) 20)
      (server:run areapath))
  #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* ((lock-file    (conc areapath "/logs/server-start.lock")))
	(let* ((start-flag (conc areapath "/logs/server-start-last")))
	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
	  (debug:print-info  2 *default-log-port* "server:kind-run: touching " start-flag)
	  (system (conc "touch " start-flag)) ;; lazy but safe
	  (server:run areapath)
	  (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
	  (common:simple-file-release-lock lock-file)))
      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")))

;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-info (server:check-if-running areapath))
	       (try-num    0))
      (if (or server-info
	      (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
	  (server:record->url server-info)
	  (let* ( (servers (server:choose-server areapath 'all-valid))
                (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0)))
	    (if (and (> try-num 0)  ;; first time through simply wait a little while then try again
		     (< num-ok 1))  ;; if there are no decent candidates for servers then try starting a new one
		(server:run areapath))
	    (thread-sleep! 5)
	    (loop (server:check-if-running areapath)
		  (+ try-num 1)))))))

(define (server:get-num-servers #!key (numservers 2))
  (let ((ns (string->number
	     (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
    (or ns numservers)))

;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath) ;;  #!key (numservers "2"))
  (let* ((ns            (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed
	 (servers       (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath))))
    (if (or (and servers
		 (null? servers))
	    (not servers))
	    ;; (and (list? servers)
	    ;;	 (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers
        #f
        (let loop ((hed (car servers))
                   (tal (cdr servers)))
          (let ((res (server:check-server hed)))
            (if res
                hed
                (if (null? tal)
                    #f
                    (loop (car tal)(cdr tal)))))))))

;; ping the given server
;;
(define (server:check-server server-record)
  (let* ((server-url (server:record->url server-record))
         (server-id  (server:record->id server-record)) 
         (res        (server:ping server-url server-id)))
    (if res
        server-url
	#f)))

(define (server:kill servr)
  (handle-exceptions
    exn
    (begin 
      (debug:print-info 0 *default-log-port*  "Unable to get host and/or port from " servr ", exn=" exn)     
    #f)
  (match-let (((hostname port start-time server-id pid)
	       servr))
    (tasks:kill-server hostname pid))))

;; ;; called in megatest.scm, host-port is string hostname:port
;; ;;
;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running
;; ;;       in the same process as the server.
;; ;;
;; (define (server:ping host:port server-id #!key (do-exit #f))
;;   (let* ((host-port (cond
;; 		     ((string? host:port)
;; 		      (let ((slst (string-split   host:port ":")))
;; 			(if (eq? (length slst) 2)
;; 			    (list (car slst)(string->number (cadr slst)))
;; 			    #f)))
;; 		     (else
;; 		      #f))))
;;     (cond
;;      ((and (list? host-port)
;; 	   (eq? (length host-port) 2))
;;       (let* ((myrunremote (make-and-init-remote *toppath*))
;; 	     (iface       (car host-port))
;; 	     (port        (cadr host-port))
;; 	     (server-dat  (client:connect iface port server-id myrunremote))
;; 	     (login-res   (rmt:login-no-auto-client-setup myrunremote)))
;; 	(http-transport:close-connections myrunremote)
;; 	(if (and (list? login-res)
;; 		 (car login-res))
;; 	    (begin
;; 	      ;; (print "LOGIN_OK")
;; 	      (if do-exit (exit 0))
;; 	      #t)
;; 	    (begin
;; 	      ;; (print "LOGIN_FAILED")
;; 	      (if do-exit (exit 1))
;; 	      #f))))
;;      (else 
;;       (if host:port
;; 	  (debug:print 0 *default-log-port*  "ERROR: bad host:port "host:port))
;;       (if do-exit
;; 	  (exit 1)
;; 	  #f)))))
;; 
;; ;; run ping in separate process, safest way in some cases
;; ;;
;; (define (server:ping-server ifaceport)
;;   (with-input-from-pipe 
;;    (conc (common:get-megatest-exe) " -ping " ifaceport)
;;    (lambda ()
;;      (let loop ((inl (read-line))
;; 		(res "NOREPLY"))
;;        (if (eof-object? inl)
;; 	   (case (string->symbol res)
;; 	     ((NOREPLY)  #f)
;; 	     ((LOGIN_OK) #t)
;; 	     (else       #f))
;; 	   (loop (read-line) inl))))))
;; 
;; ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used).
;; ;;
;; (define (server:login toppath)
;;   (lambda (toppath)
;;     (set! *db-last-access* (current-seconds)) ;; might not be needed.
;;     (if (equal? *toppath* toppath)
;; 	#t
;; 	#f)))

;; timeout is hms string: 1h 5m 3s, default is 1 minute
;; This is currently broken. Just use the number of hours with no unit.
;; Default is 600 seconds.
;;
(define (server:expiration-timeout)
  (let* ((tmo (configf:lookup *configdat* "server" "timeout")))
    (if (string? tmo)
	(let* ((num (string->number tmo)))
	  (if num
	      (* 3600 num)
	      (common:hms-string->seconds tmo)))
	600 ;; this is the default
	)))

(define (server:get-best-guess-address hostname)
  (let ((res #f))
    (for-each 
     (lambda (adr)
       (if (not (eq? (u8vector-ref adr 0) 127))
	   (set! res adr)))
     ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME
     (vector->list (hostinfo-addresses (hostname->hostinfo hostname))))
    (string-intersperse 
     (map number->string
	  (u8vector->list
	   (if res res (hostname->ip hostname)))) ".")))

;; moving this here as it needs access to db and cannot be in common.
;;

(define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f))
  (debug:print "WARNING: bruteforce-syncer is called but has been disabled!")
  (lambda ()
    (debug:print "WARNING: bruteforce-syncer is called but has been disabled!")))

;; 
(defstruct remote

  ;; transport to be used
  ;; http              - use http-transport
  ;; http-read-cached  - use http-transport for writes but in-mem cached for reads
  (rmode            'http)
  (hh-dat            (let ((res (or (server:choose-server *toppath* 'homehost)
				    (cons #f #f))))
		       (assert (pair? res)(conc "FATAL: hh-dat should be a pair, got "res))
		       res))
  (server-url        #f) ;; (server:check-if-running *toppath*) #f))
  (server-id         #f)
  (server-info       #f) ;; (if *toppath* (server:check-if-running *toppath*) #f))
  (last-server-check 0)  ;; last time we checked to see if the server was alive
  (connect-time      (current-seconds)) ;; when we first connected
  (last-access       (current-seconds)) ;; last time we talked to server
  ;; (conndat           #f) ;; iface port api-uri api-url api-req seconds server-id
  (server-timeout    (server:expiration-timeout))
  (force-server      #f)
  (ro-mode           #f)  
  (ro-mode-checked   #f) ;; flag that indicates we have checked for ro-mode

  ;; conndat stuff
  (iface             #f) ;; TODO: Consolidate this data with server-url and server-info above
  (port              #f)
  (api-url           #f)
  (api-uri           #f)
  (api-req           #f))

;;======================================================================
;; Rotate logs, logic: 
;;                 if > 500k and older than 1 week:
;;                     remove previous compressed log and compress this log
;; WARNING: This proc operates assuming that it is in the directory above the
;;          logs directory you wish to log-rotate.
;;
(define (common:rotate-logs)
  (let* ((all-files (make-hash-table))
	 (stats     (make-hash-table))
	 (inc-stat  (lambda (key)
		      (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1))))
	(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "600")))) ;; name -> age
    (if (not (directory-exists? "logs"))(create-directory "logs"))
    (directory-fold 
     (lambda (file rem)
       (handle-exceptions
	exn
	(begin
	  (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore. exn=" exn)
	  (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
	  ;; (print-call-chain (current-error-port)) ;; 
	  )
	(let* ((fullname  (conc "logs/" file))
	       (mod-time  (file-modification-time fullname))
	       (file-age  (- (current-seconds) mod-time))
	       (file-old  (> file-age (* 48 60 60)))
	       (file-big  (> (file-size fullname) 200000)))
	  (hash-table-set! all-files file mod-time)
	  (if (or (and (string-match "^.*.log" file)
		       file-old
		       file-big)
		  (and (string-match "^server-.*.log" file)
		       file-old))
	      (let ((gzfile (conc fullname ".gz")))
		(if (common:file-exists? gzfile)
		    (begin
		      (debug:print-info 0 *default-log-port* "removing " gzfile)
		      (delete-file* gzfile)
		      (hash-table-delete!  all-files gzfile) ;; needed?
		      ))
		(debug:print-info 0 *default-log-port* "compressing " file)
		(system (conc "gzip " fullname))
		(inc-stat "gzipped")
		(hash-table-set! all-files (conc file ".gz") file-age)  ;; add the .gz file and remove the base file
		(hash-table-delete! all-files file)
		)
	      (if (and (> file-age (* (string->number (or (configf:lookup *configdat* "setup" "log-expire-days") "30")) 24 3600))
		       (file-exists? fullname)) ;; just in case it was gzipped - will get it next time
		  (handle-exceptions
		   exn
		   #f
		   (if (directory? fullname)
		       (begin
			 (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
			 (inc-stat "directories"))
		       (begin
			 (delete-file* fullname)
			 (inc-stat "deleted")))
		   (hash-table-delete! all-files file)))))))
     '()
     "logs")
    (for-each
     (lambda (category)
       (let ((quant (hash-table-ref/default stats category 0)))
	 (if (> quant 0)
	     (debug:print-info 0 *default-log-port* category " log files: " quant))))
     `("deleted" "gzipped" "directories"))
    (let ((num-logs (hash-table-size all-files)))
      (if (> num-logs max-allowed) ;; because NFS => don't let number of logs exceed 300
	  (let ((files (take (sort (hash-table-keys all-files)
				   (lambda (a b)
				     (< (hash-table-ref all-files a)(hash-table-ref all-files b))))
			     (- num-logs max-allowed))))
	    (for-each
	     (lambda (file)
	       (let* ((fullname (conc "logs/" file)))
		 (if (directory? fullname)
		     (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
		     (handle-exceptions
		      exn
		      (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn)
		      (delete-file* fullname)))))
	     files)
	    (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files."))))))

(define (common:wait-for-homehost-load maxnormload msg)
  (let loop ((start-time (current-seconds))) ;; we saw some instances of this being called before *toppath* was set. This might be an early setup race. This delay should help but it is impossible to test...
    (if (not *toppath*)
	(begin
	  (debug:print 0 *default-log-port* "ERROR: common:wait-for-homehost-load called before *toppath* set.")
	  (thread-sleep! 30)
	  (if (< (- (current-seconds) start-time) 300)
	      (loop start-time)))))
  (case (rmt:transport-mode)
    ((http)
     (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
			#f
			(server:choose-server *toppath* 'homehost)))
            (hh     (if hh-dat (car hh-dat) #f)))
       (common:wait-for-normalized-load maxnormload msg hh)))
    (else
     (common:wait-for-normalized-load maxnormload msg (get-host-name)))))
    
;;======================================================================
;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
;; Do NOT check if not on homehost!
;;
(define (common:exit-on-version-changed)
  (if (and *toppath*              ;; do nothing if *toppath* not yet provided
	   (common:on-homehost?))
      (if (common:api-changed?)
	  (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config"))
                 (dbfile  (conc (get-environment-variable "MT_RUN_AREA_HOME") ".mtdb/main.db"))
                 (read-only (not (file-write-access? dbfile)))
                 (dbstruct (db:setup))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;;  #t)))
	    (debug:print 0 *default-log-port*
			 "WARNING: Version mismatch!\n"
			 "   expected: " (common:version-signature) "\n"
			 "   got:      " (common:get-last-run-version))
            (cond
             ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t)
             ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only)
                   (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db
              (debug:print 0 *default-log-port* "   I see you are the owner of megatest.config, attempting to cleanup and reset to new version")
              (handle-exceptions
               exn
               (begin
                 (debug:print 0 *default-log-port* "Failed to switch versions. exn=" exn)
                 (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
                 (print-call-chain (current-error-port))
                 (exit 1))
               (common:cleanup-db dbstruct)))
             ((not (common:file-exists? mtconf))
              (debug:print 0 *default-log-port* "   megatest.config does not exist in this area.  Cannot proceed with megatest version migration.")
              (exit 1))
             ((not (common:file-exists? dbfile))
              (debug:print 0 *default-log-port* "   .mtdb/main.db does not exist in this area.  Cannot proceed with megatest version migration.")
              (exit 1))
             ((not (eq? (current-user-id)(file-owner mtconf)))
              (debug:print 0 *default-log-port* "   You do not own .mtdb/main.db in this area.  Cannot proceed with megatest version migration.")
              (exit 1))
             (read-only
              (debug:print 0 *default-log-port* "   You have read-only access to this area.  Cannot proceed with megatest version migration.")
              (exit 1))
             (else
              (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"")
              (exit 1)))))))
;;======================================================================
;;      (begin
;;	(debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.")
;;	(exit 1))))

;;======================================================================
;; E X I T   H A N D L I N G
;;======================================================================

;; (define (common:run-sync?)
;;   (and *toppath*               ;; gate if called before *toppath* is set
;;        (common:on-homehost?)
;;        (args:get-arg "-server")))


(define (std-signal-handler signum)
  ;; (signal-mask! signum)
  (set! *time-to-exit* #t) 
  ;;(debug:print-info 13 *default-log-port* "got signal "signum)
  (debug:print-error 0 *default-log-port* "Received signal " signum " aaa exiting promptly")
  ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway
  (exit))

(define (special-signal-handler signum)
  ;; (signal-mask! signum)
  (set! *time-to-exit* #t)
  ;;(debug:print-info 13 *default-log-port* "got signal "signum)
  (debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting!!")
  ;;TODO send email to notify admin contact listed in the config that the lisner got killed
  ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway
  (exit))


(set-signal-handler! signal/int  std-signal-handler)  ;; ^C
(set-signal-handler! signal/term std-signal-handler)
;;======================================================================
;; ideally put all this info into the db, no need to preserve it across moving homehost
;;
;; return list of
;;  ( reachable? cpuload update-time )
(define (common:get-host-info hostname)
  (let* ((loadinfo                         (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data
         (load                             (car loadinfo))
         (load-sample-time                 (cdr loadinfo))
         (load-sample-age                  (- (current-seconds) load-sample-time))
         (loadinfo-timeout-seconds         6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds
         (host-last-update-timeout-seconds 4)
         (host-rec (hash-table-ref/default *host-loads* hostname #f))
         )
    (cond
     ((< load-sample-age loadinfo-timeout-seconds)
      (list #t
            load-sample-time
            load))
     ((and host-rec
           (< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds)))
      (list #t
            (host-last-update host-rec)
            (host-last-cpuload host-rec )))
     ((common:unix-ping hostname)
      (list #t
            (current-seconds)
            (alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds
     (else
      (list #f 0 -1) ;; bad host, don't use!
      ))))

;;======================================================================
;; see defstruct host at top of file.
;;    host: reachable last-update last-used last-cpuload
;;
(define (common:update-host-loads-table hosts-raw)
  (let* ((hosts (filter (lambda (x)
                          (string-match (regexp "^\\S+$") x))
                        hosts-raw)))
    (for-each
     (lambda (hostname)
       (let* ((rec       (let ((h (hash-table-ref/default *host-loads* hostname #f)))
                          (if h
                              h
                              (let ((h (make-host)))
                                (hash-table-set! *host-loads* hostname h)
                                h))))
              (host-info         (common:get-host-info hostname))
              (is-reachable      (car host-info))
              (last-reached-time (cadr host-info))
              (load              (caddr host-info)))
         (host-reachable-set!    rec is-reachable)
         (host-last-update-set!  rec last-reached-time)
         (host-last-cpuload-set! rec load)))
     hosts)))

;;======================================================================
;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;   count     - count down to zero, at some point we'd give up if the load never drops
;;   num-tries - count down to zero number tries to get numcpus
;;
(define (common:wait-for-cpuload maxnormload numcpus-in
				 #!key (count 1000)
				 (msg #f)(remote-host #f)(num-tries 5))
  (let* ((loadavg     (common:get-cpu-load remote-host))
    ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
    (numcpus     (if (<= 1 numcpus-in)
    (common:get-num-cpus remote-host) numcpus-in))
    (first       (car loadavg))
    (next        (cadr loadavg))
    (adjmaxload  (* maxnormload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude
						  ;; fallback is to at least use 1
    ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
    ;; etc.
    (effective-load    (common:get-intercept first next))
    (recommended-delay (common:get-delay effective-load numcpus))
    (effective-host    (or remote-host "localhost"))
    (normalized-effective-load (/ effective-load numcpus))
    (will-wait                 (> normalized-effective-load maxnormload)))
    (if (and will-wait (> recommended-delay 1)) 
      (let* ((actual-delay (min recommended-delay 30)))
        (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
          (debug:print-info 0 *default-log-port* "Load control, delaying "
				actual-delay " seconds to maintain safe load. current normalized effective load is "
				normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load))
	(thread-sleep! actual-delay)))
    
    (cond
     ;; bad data, try again to get the data
     ((not will-wait)
      (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))
      	  (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing.")))

     ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
	   (> num-tries 0))
      (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load "
		   first ", we'll sleep 10s and try " num-tries " more times.")
      (thread-sleep! 10)
      (common:wait-for-cpuload maxnormload numcpus-in
			       count: count remote-host: remote-host num-tries: (- num-tries 1)))

     ;; need to wait for load to drop
     ((and will-wait ;; (> first adjmaxload)
	   (> count 0))
      (debug:print-info 0 *default-log-port*
			"Delaying 15" ;; adjwait
			" seconds due to normalized effective load " normalized-effective-load ;; first
			" exceeding max of " adjmaxload
			" on server " (or remote-host (get-host-name))
			" (normalized load-limit: " maxnormload ") " (if msg msg ""))
      (thread-sleep! 15) ;; adjwait)
      (common:wait-for-cpuload maxnormload numcpus count: (- count 1) msg: msg remote-host: remote-host)
      ;; put the message here to indicate came out of waiting
      (debug:print-info 1 *default-log-port*
			"On host: " effective-host
			", effective load: " effective-load
			", numcpus: " numcpus
			", normalized effective load: " normalized-effective-load
			))
     ;; overloaded and count expired (i.e. went to zero)
     (else
      (if (> num-tries 0) ;; should be "num-tries-left". 
	  (if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host))
	      (debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of "
			   effective-normalized-load "  continuing."))
	  (debug:print 0 *default-log-port* "Load on " effective-host ", "
		       first" could not be retrieved. Giving up and continuing."))))))

;;======================================================================
;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5))
;;   (let* ((loadavg (common:get-cpu-load remote-host))
;; 	 (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
;; 		      (common:get-num-cpus remote-host)
;; 		      numcpus-in))
;; 	 (maxload (if force-maxload
;; 		      maxload-in
;; 		      (if (number? maxload-in)
;; 			  (max maxload-in 0.5)
;; 			  0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
;; 	 (first   (car loadavg))
;; 	 (next    (cadr loadavg))
;; 	 (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where
;; 					       ;; numcpus (or could be
;; 					       ;; maxload) is zero,
;; 					       ;; crude fallback is to
;; 					       ;; at least use 1
;; 	 (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next?
;; 			       0
;; 			       next))) ;; we will force a conservative calculation any time next is large.
;; 	 (first-next-avg    (/ (+ first next) 2))
;; 	 ;; add some randomness to the time to break any alignment
;; 	 ;; where netbatch dumps many jobs to machines simultaneously
;;          (adjwait           (min (+ 300 (random 10)) (abs (* (+ (random 10)
;; 								(/ (- 1000 count) 10)
;; 								waitdelay)
;; 							     (- first adjmaxload) ))))
;; 	 (load-jump-limit   (configf:lookup-number *configdat* "setup" "load-jump-limit"))
;; 	 ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
;; 	 ;; etc.
;; 	 (effective-load    (common:get-intercept first next))
;; 	 (effective-host    (or remote-host "localhost"))
;; 	 (normalized-effective-load (/ effective-load numcpus))
;; 	 (will-wait                 (> normalized-effective-load maxload)))
;; 	 
;;     ;; let's let the user know once in a long while that load checking
;;     ;; is happening but not constantly report it
;;     #;(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
;; 	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
;;     ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
;; 
;;     (debug:print-info 1 *default-log-port*
;; 		      "On host: " effective-host
;; 		      ", effective load: " effective-load
;; 		      ", numcpus: " numcpus
;; 		      ", normalized effective load: " normalized-effective-load
;; 		      )
;;     
;;     (cond
;;      ;; bad data, try again to get the data
;;      ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
;; 	   (> num-tries 0))
;;       (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.")
;;       (thread-sleep! 10)
;;       (common:wait-for-cpuload maxload-in numcpus-in waitdelay
;; 			       count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1)))
;;      ;; need to wait for load to drop
;;      ((and will-wait ;; (> first adjmaxload)
;; 	   (> count 0))
;;       (debug:print-info 0 *default-log-port*
;; 			"Delaying " 15 ;; adjwait
;; 			" seconds due to normalized effective load " normalized-effective-load ;; first
;; 			" exceeding max of " adjmaxload
;; 			" on server " (or remote-host (get-host-name))
;; 			" (normalized load-limit: " maxload ") " (if msg msg ""))
;;       (thread-sleep! 15) ;; adjwait)
;;       (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
;;      ((and (> loadjmp (cond
;; 		       (load-jump-limit load-jump-limit)
;; 		       ((> numcpus 8)(/ numcpus 2))
;; 		       ((> numcpus 4)(/ numcpus 1.2))
;; 		       (else 0.5)))
;; 	   (> count 0))
;;       (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". "
;; 			(if msg msg ""))
;;       (thread-sleep! adjwait)
;;       (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
;;      (else
;;       (if (> num-tries 0)
;; 	  (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost")))
;; 	      (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing."))
;; 	  (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing."))))))
;; 

;;======================================================================
;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5))
  (let ((num-cpus (common:get-num-cpus remote-host)))
    (if num-cpus
	(common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host)
	(begin
	  (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again
	  (if (> rem-tries 0)
	      (common:wait-for-normalized-load maxnormload msg remote-host (- rem-tries 1))
	      #f)))))


)