Megatest

Check-in [f22d7f8cb6]
Login
Overview
Comment:Added waits to steps, get rid of defunct server stuff (let them die natural deaths) ==7.34/1.5/WARN/1202==
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65-cleanup
Files: files | file ages | folders
SHA1: f22d7f8cb6542bd9c3314590ae84720b3eb04b6d
User & Date: mrwellan on 2020-08-29 09:22:21
Original Comment: Added waits to steps, get rid of defunct server stuff (let them die natural deaths)
Other Links: branch diff | manifest | tags
Context
2020-08-30
00:14
Gate when first detect that all job slots are in use. No point in looping until slots are availble. ==7.1/1.6/WARN/1201/mars== check-in: cd1100885b user: matt tags: v1.65-cleanup
2020-08-29
09:22
Added waits to steps, get rid of defunct server stuff (let them die natural deaths) ==7.34/1.5/WARN/1202== check-in: f22d7f8cb6 user: mrwellan tags: v1.65-cleanup
06:30
force server exit only if [setup] runtime is set ==3.7/1.0/PASS/mars== check-in: 7951d54072 user: matt tags: v1.65-cleanup
Changes

Modified common.scm from [15370e151d] to [3722d67826].

1811
1812
1813
1814
1815
1816
1817
1818




1819
1820
1821
1822
1823
1824
1825
1826
1827







1828
1829
1830



1831
1832
1833
1834
1835
1836
1837

1838
1839
1840
1841
1842
1843
1844
;;     cpu-load))

;; get values from cached info from dropping file in logs dir
;;  e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 10))
  (if *toppath*
      (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))




	(if (and (file-exists? fullpath)
		 (file-read-access? fullpath))
	    (handle-exceptions
		exn
	      (begin
		(debug:print 0 *default-log-port* "failed to get cached info from " fullpath ", exn=" exn)
		#f)
	     (debug:print 2 *default-log-port* "reading file " fullpath)
	     (let ((real-age (- (current-seconds)(file-change-time fullpath)))) 







	       (if (< real-age age)
		   (handle-exceptions
		       exn



		     (begin
		       (debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn)
		       (delete-file* fullpath)
		       #f)
		     (with-input-from-file fullpath read))
		   (begin
		     (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")

		     #f))))
	    (begin
	      (debug:print 2 *default-log-port* "not reading file " fullpath)
	      #f)))
      #f))
 
(define (common:write-cached-info key dtype dat)







|
>
>
>
>








|
>
>
>
>
>
>
>



>
>
>
|
<
|
|
|

|
>







1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845

1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
;;     cpu-load))

;; get values from cached info from dropping file in logs dir
;;  e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 10))
  (if *toppath*
      (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))
	     (delfile  (lambda ()
			 (debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn)
			 (delete-file* fullpath)
			 #f)))
	(if (and (file-exists? fullpath)
		 (file-read-access? fullpath))
	    (handle-exceptions
		exn
	      (begin
		(debug:print 0 *default-log-port* "failed to get cached info from " fullpath ", exn=" exn)
		#f)
	     (debug:print 2 *default-log-port* "reading file " fullpath)
	     (let ((real-age (- (current-seconds)
				(handle-exceptions
				    exn
				  (begin
				    (debug:print 1 *default-log-port* "Failed to read mod time on file "
						 fullpath ", using 0, exn=" exn)
				    0)
				  (file-change-time fullpath)))))
	       (if (< real-age age)
		   (handle-exceptions
		       exn
		     (delfile)
		     (let* ((res (with-input-from-file fullpath read)))
		       (if (eof-object? res)
			   (begin

			     (delfile)
			     #f)
			   res)))
		   (begin
		     (debug:print-info 2 *default-log-port* "file " fullpath
				       " is too old (" real-age" seconds) to trust, skipping reading it")
		     #f))))
	    (begin
	      (debug:print 2 *default-log-port* "not reading file " fullpath)
	      #f)))
      #f))
 
(define (common:write-cached-info key dtype dat)
2100
2101
2102
2103
2104
2105
2106

2107
2108


2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129


2130
2131
2132
2133
2134
2135
2136
(define (common:wait-for-homehost-load maxnormload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (common:get-homehost)))
         (hh     (if hh-dat (car hh-dat) #f)))
    (common:wait-for-normalized-load maxnormload msg hh)))


(define (common:get-num-cpus remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))


    (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often!
	(let* ((proc   (lambda ()
			 (let loop ((numcpu 0)
				    (inl    (read-line)))
			   (if (eof-object? inl)
			       (if (> numcpu 0)
				   numcpu
				   #f) ;; if zero return #f so caller knows that things are not working
			       (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
					 (+ numcpu 1)
					 numcpu)
				     (read-line))))))
	       (result (if remote-host
			   (with-input-from-pipe 
			       (conc "ssh " remote-host " cat /proc/cpuinfo")
			     proc)
			   (with-input-from-file "/proc/cpuinfo" proc))))
	  (if (and (number? result)
		   (> result 0))
	      (common:write-cached-info actual-host "num-cpus" result))
	  result))))



;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5))
  (let ((num-cpus (common:get-num-cpus remote-host)))
    (if num-cpus
	(common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host)







>


>
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>
>







2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
(define (common:wait-for-homehost-load maxnormload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (common:get-homehost)))
         (hh     (if hh-dat (car hh-dat) #f)))
    (common:wait-for-normalized-load maxnormload msg hh)))

(define *numcpus-cache* (make-hash-table))
(define (common:get-num-cpus remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    ;; hosts had better not be changing the number of cpus too often!
    (or (hash-table-ref/default *numcpus-cache* actual-host #f)
	(let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600)))
			    (let* ((proc   (lambda ()
					     (let loop ((numcpu 0)
							(inl    (read-line)))
					       (if (eof-object? inl)
						   (if (> numcpu 0)
						       numcpu
						       #f) ;; if zero return #f so caller knows that things are not working
						   (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
							     (+ numcpu 1)
							     numcpu)
							 (read-line))))))
				   (result (if remote-host
					       (with-input-from-pipe 
						(conc "ssh " remote-host " cat /proc/cpuinfo")
						proc)
					       (with-input-from-file "/proc/cpuinfo" proc))))
			      (if (and (number? result)
				       (> result 0))
				  (common:write-cached-info actual-host "num-cpus" result))
			      result))))
	  (hash-table-set! *numcpus-cache* actual-host numcpus)
	  numcpus))))

;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5))
  (let ((num-cpus (common:get-num-cpus remote-host)))
    (if num-cpus
	(common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host)

Modified http-transport.scm from [6328aa908d] to [67489ed9ab].

564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
                                       (full-serv-fname (conc *toppath* "/logs/" serv-fname))
                                       (new-serv-fname  (conc *toppath* "/logs/" "defunct-" serv-fname)))
                                  (debug:print 0 *default-log-port* msg)
                                  (if (common:file-exists? full-serv-fname)
                                      (system (conc "sleep 1;mv -f " full-serv-fname " " new-serv-fname))
                                      (debug:print 0 *default-log-port* "INFO: cannot move " full-serv-fname " to " new-serv-fname))
                                  (exit)))))
    (if (and (not start-time-old) ;; last server start try was less than five seconds ago
	     (not server-starting))
	(begin
	  (cleanup-proc "NOT starting server, there is either a recently started server or a server in process of starting")
	  (exit)))
    ;; lets not even bother to start if there are already three or more server files ready to go
    (let* ((num-alive   (server:get-num-alive (server:get-list *toppath*))))
      (if (> num-alive 3)
          (begin
            (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up"))
            (exit))))
  (common:save-pkt `((action . start)
		     (T      . server)
		     (pid    . ,(current-process-id)))







|





|







564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
                                       (full-serv-fname (conc *toppath* "/logs/" serv-fname))
                                       (new-serv-fname  (conc *toppath* "/logs/" "defunct-" serv-fname)))
                                  (debug:print 0 *default-log-port* msg)
                                  (if (common:file-exists? full-serv-fname)
                                      (system (conc "sleep 1;mv -f " full-serv-fname " " new-serv-fname))
                                      (debug:print 0 *default-log-port* "INFO: cannot move " full-serv-fname " to " new-serv-fname))
                                  (exit)))))
    #;(if (and (not start-time-old) ;; last server start try was less than five seconds ago
	     (not server-starting))
	(begin
	  (cleanup-proc "NOT starting server, there is either a recently started server or a server in process of starting")
	  (exit)))
    ;; lets not even bother to start if there are already three or more server files ready to go
    #;(let* ((num-alive   (server:get-num-alive (server:get-list *toppath*))))
      (if (> num-alive 3)
          (begin
            (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up"))
            (exit))))
  (common:save-pkt `((action . start)
		     (T      . server)
		     (pid    . ,(current-process-id)))

Modified megatest.scm from [06490b6ac8] to [0e58f17e0f].

2097
2098
2099
2100
2101
2102
2103

2104
2105
2106
2107
2108
2109
2110
2111

2112
2113
2114
2115
2116
2117
2118
	      (rmt:teststep-set-status! run-id test-id step state status (or comment msg) logfile))
	    (begin
	      (debug:print-error 0 *default-log-port* "You must specify :state and :status with every call to -step")
	      (exit 6))))))

(if (args:get-arg "-step")
    (begin

      (megatest:step 
       (args:get-arg "-step")
       (or (args:get-arg "-state")(args:get-arg ":state"))
       (or (args:get-arg "-status")(args:get-arg ":status"))
       (args:get-arg "-setlog")
       (args:get-arg "-m"))
      ;; (if db (sqlite3:finalize! db))
      (set! *didsomething* #t)))

    
(if (or (args:get-arg "-setlog")       ;; since setting up is so costly lets piggyback on -test-status
	;;     (not (args:get-arg "-step")))  ;; -setlog may have been processed already in the "-step" previous
	;;     NEW POLICY - -setlog sets test overall log on every call.
	(args:get-arg "-set-toplog")
	(args:get-arg "-test-status")
	(args:get-arg "-set-values")







>







|
>







2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
	      (rmt:teststep-set-status! run-id test-id step state status (or comment msg) logfile))
	    (begin
	      (debug:print-error 0 *default-log-port* "You must specify :state and :status with every call to -step")
	      (exit 6))))))

(if (args:get-arg "-step")
    (begin
      (thread-sleep! 1.5)
      (megatest:step 
       (args:get-arg "-step")
       (or (args:get-arg "-state")(args:get-arg ":state"))
       (or (args:get-arg "-status")(args:get-arg ":status"))
       (args:get-arg "-setlog")
       (args:get-arg "-m"))
      ;; (if db (sqlite3:finalize! db))
      (set! *didsomething* #t)
      (thread-sleep! 1.5)))
    
(if (or (args:get-arg "-setlog")       ;; since setting up is so costly lets piggyback on -test-status
	;;     (not (args:get-arg "-step")))  ;; -setlog may have been processed already in the "-step" previous
	;;     NEW POLICY - -setlog sets test overall log on every call.
	(args:get-arg "-set-toplog")
	(args:get-arg "-test-status")
	(args:get-arg "-set-values")