Megatest

Check-in [1f3e2a5c9a]
Login
Overview
Comment:Many small fixes, still broken though
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65-cleanup
Files: files | file ages | folders
SHA1: 1f3e2a5c9a80016b0b2605a8c6fd6233707519cc
User & Date: matt on 2020-08-22 22:43:48
Other Links: branch diff | manifest | tags
Context
2020-08-23
00:03
Remove junk code causing false stack trace. check-in: dfe44999f0 user: matt tags: v1.65-cleanup
2020-08-22
22:43
Many small fixes, still broken though check-in: 1f3e2a5c9a user: matt tags: v1.65-cleanup
14:14
Merged in partial content from v1.66-captain-ulex work. ==7.2/0.9/1201/mars== check-in: 070ebd2d0d user: matt tags: v1.65-cleanup
Changes

Modified common.scm from [d232b0060b] to [5f4ffd9564].

2080
2081
2082
2083
2084
2085
2086
2087



2088
2089
2090
2091
2092
2093
2094
2095
2096

2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109

2110
2111
2112
2113
2114
2115
2116




2117
2118


2119
2120
2121
2122

2123

2124
2125
2126
2127
2128
2129
2130
2080
2081
2082
2083
2084
2085
2086

2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097

2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110

2111
2112
2113
2114
2115
2116
2117

2118
2119
2120
2121
2122

2123
2124
2125
2126
2127
2128
2129

2130
2131
2132
2133
2134
2135
2136
2137







-
+
+
+








-
+












-
+






-
+
+
+
+

-
+
+




+
-
+







	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where
					       ;; numcpus (or could be
					       ;; maxload) is zero,
					       ;; crude fallback is to
					       ;; at least use 1
	 (loadjmp (- first next))
	 (loadjmp (- first (if (> next (* numcpus 0.7))
			       0
			       next))) ;; we will force a conservative calculation any time next is large.
	 ;; add some randomness to the time to break any alignment
	 ;; where netbatch dumps many jobs to machines simultaneously
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)
						      (/ (- 1000 count) 10)
						      waitdelay)
						   (- first adjmaxload) ))  )))
    ;; let's let the user know once in a long while that load checking
    ;; is happening but not constantly report it
    (if (> (random 100) 75) ;; about 25% of the time
    (if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
			  ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
    (cond
     ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
	   (> num-tries 0))
      (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.")
      (thread-sleep! 10)
      (common:wait-for-cpuload maxload-in numcpus-in waitdelay
			       count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1)))
     ((and (> first adjmaxload)
	   (> count 0))
      (debug:print-info 0 *default-log-port*
			"server start delayed " adjwait
			"Delaying " adjwait
			" seconds due to load " first
			" exceeding max of " adjmaxload
			" on server " (or remote-host (get-host-name))
			" (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)
     ((and (> loadjmp (cond
		       ((> numcpus 8)(/ numcpus 4))
		       ((> numcpus 4)(/ numcpus 2))
		       (else 0.5)))
	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus
			(if msg (conc " " msg) ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     (else
      (if (> num-tries 0)
	  (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost")))
	  (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing.")
	      (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing."))
	  (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing."))))))

(define (get-uname . params)
  (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
	 (uname #f))
    (if (null? (car uname-res))
	"unknown"

Modified db.scm from [f30ded65d6] to [b17ec00910].

169
170
171
172
173
174
175










176
177
178
179
180
181
182
183
184
185
186
187








188
189
190
191
192


193
194
195
196
197
198
199
200
201
202
203
204
205
206

























207
208
209
210
211
212
213
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190







191
192
193
194
195
196
197
198
199
200
201
202
203
204
205














206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237







+
+
+
+
+
+
+
+
+
+





-
-
-
-
-
-
-
+
+
+
+
+
+
+
+





+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







;; 	(mutex-lock! *rundb-mutex*)
;; 	(if (eq? mod-read 'mod)
;; 	    (dbr:dbstruct-mtime-set! dbstruct (current-milliseconds))
;; 	    (dbr:dbstruct-rtime-set! dbstruct (current-milliseconds)))
;; 	(dbr:dbstruct-inuse-set! dbstruct #f)
;; 	(mutex-unlock! *rundb-mutex*))))

(define-inline (db:generic-error-printout exn . message)
  (print-call-chain (current-error-port))
  (apply debug:print-error 0 *default-log-port* message)
  (debug:print-error 0 *default-log-port* "   params: " params
		     ", error: "     ((condition-property-accessor 'exn 'message)   exn)
		     ", arguments: " ((condition-property-accessor 'exn 'arguments) exn)
		     ", location: "  ((condition-property-accessor 'exn 'location)  exn)
		     ))

(print-call-chain (current-error-port))
;; (db:with-db dbstruct run-id sqlite3:exec "select blah fgrom blaz;")
;; r/w is a flag to indicate if the db is modified by this query #t = yes, #f = no
;;
(define (db:with-db dbstruct run-id r/w proc . params)
  (let* ((have-struct (dbr:dbstruct? dbstruct))
         (dbdat (if have-struct 
                    (db:get-db dbstruct)
                    #f))
	 (db    (if have-struct
		    (db:dbdat-get-db dbdat)
		    dbstruct))
	 (use-mutex (> *api-process-request-count* 25)))
         (dbdat     (if have-struct 
			(db:get-db dbstruct)
			#f))
	 (db        (if have-struct
			(db:dbdat-get-db dbdat)
			dbstruct))
	 (fname     (db:dbdat-get-path dbdat))
	 (use-mutex (> *api-process-request-count* 25))) ;; was 25
    (if (and use-mutex
	     (common:low-noise-print 120 "over-50-parallel-api-requests"))
	(debug:print-info 0 *default-log-port* *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access"))
    (if (common:low-noise-print 600 (conc "parallel-api-requests" *max-api-process-requests*))
	(debug:print-info 2 *default-log-port* "Parallel api request count: " *api-process-request-count* " max parallel requests: " *max-api-process-requests*))
    (condition-case
     (begin
    (handle-exceptions
     exn
     (begin
       (print-call-chain (current-error-port))
       (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn))
       ;; there is no recovering at this time. exit
       (exit 50))
     (if use-mutex (mutex-lock! *db-with-db-mutex*))
     (let ((res (apply proc db params)))
       (if use-mutex (mutex-unlock! *db-with-db-mutex*))
       ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
       (if dbdat (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat))
       res))))

    ;;;;;;;;; (handle-exceptions
    ;;;;;;;;;  exn
    ;;;;;;;;;  (begin
    ;;;;;;;;;    (print-call-chain (current-error-port))
    ;;;;;;;;;    (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn))
    ;;;;;;;;;    ;; there is no recovering at this time. exit
    ;;;;;;;;;    (exit 50))
       (if use-mutex (mutex-lock! *db-with-db-mutex*))
       (let ((res (apply proc db params)))
	 (if use-mutex (mutex-unlock! *db-with-db-mutex*))
	 ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
	 (if dbdat (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat))
	 res))
     (exn (io-error)
	  (db:generic-error-printout exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again."))
     (exn (corrupt)
	  (db:generic-error-printout exn "ERROR: database " fname " is corrupt. Repair it to proceed."))
     (exn (busy)
	  (db:generic-error-printout exn "ERROR: database " fname
				     " is locked. Try copying to another location, remove original and copy back."))
     (exn (permission)(db:generic-error-printout exn "ERROR: database " fname " has some permissions problem."))
     (exn ()
	  (db:generic-error-printout exn "ERROR: Unknown error with database " fname " message: "
		       ((condition-property-accessor 'exn 'message) exn))))))
      
;;======================================================================
;; K E E P   F I L E D B   I N   dbstruct
;;======================================================================

;; (define (db:get-filedb dbstruct run-id)
;;   (let ((db (vector-ref dbstruct 2)))
;;     (if db
1797
1798
1799
1800
1801
1802
1803















1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826























1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838













1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853















1854
1855
1856
1857


1858
1859



1860
1861

1862
1863

1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877








1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890





1891
1892


1893
1894
1895
1896
1897
1898







1899
1900
1901
1902



1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842























1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865












1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878















1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894



1895
1896
1897

1898
1899
1900
1901

1902
1903

1904














1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922



1923
1924
1925
1926
1927
1928

1929
1930
1931





1932
1933
1934
1935
1936
1937
1938
1939



1940
1941
1942




















1943
1944
1945
1946
1947
1948
1949







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

-
-
-
+
+

-
+
+
+

-
+

-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+










-
-
-
+
+
+
+
+

-
+
+

-
-
-
-
-
+
+
+
+
+
+
+

-
-
-
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







         )
    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)

    (db:with-db 
     dbstruct #f #f
     (lambda (db)
       (let* ((stmth1 (db:get-cache-stmth
		       dbstruct db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('RUNNING');"))
	      (stmth2 (db:get-cache-stmth
		       dbstruct db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('REMOTEHOSTSTART');"))
	      (stmth3 (db:get-cache-stmth
		       dbstruct db
		       "SELECT id,rundir,uname,testname,item_path FROM tests
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
                                          AND state IN ('LAUNCHED');")))
       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
       ;;
       ;; HOWEVER: this code in run:test seems to work fine
       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
       ;;                     (db:test-get-run_duration testdat)))
       ;;                    600) 
       ;; (db:delay-if-busy dbdat)
       (sqlite3:for-each-row 
        (lambda (test-id run-dir uname testname item-path event-time run-duration)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (begin
                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
              (begin
                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
                (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration))))
        db
        
        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');"
        run-id running-deadtime) ;; default time 720 seconds

	 ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
	 ;;
	 ;; HOWEVER: this code in run:test seems to work fine
	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
	 ;;                     (db:test-get-run_duration testdat)))
	 ;;                    600) 
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
				    " event-time="event-time" run-duration="run-duration))))
	  stmth1
	  run-id running-deadtime) ;; default time 720 seconds
       
       
       (sqlite3:for-each-row 
        (lambda (test-id run-dir uname testname item-path event-time run-duration)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (begin
                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
              (begin
                (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)
                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
				    " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
				    " run-duration="run-duration)
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
        db
        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');"
        run-id remotehoststart-deadtime) ;; default time 230 seconds

       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
       ;;
       ;; (db:delay-if-busy dbdat)
       (sqlite3:for-each-row
        (lambda (test-id run-dir uname testname item-path)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
              (begin
                (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id" 1 day since event_time marked")
	  stmth2
	  run-id remotehoststart-deadtime) ;; default time 230 seconds
	 
	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
	 ;;
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row
	  (lambda (test-id run-dir uname testname item-path)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id
				    " 1 day since event_time marked")
                (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))
        db
        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
        run-id)
	  stmth3
	  run-id)
       
       (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
	 (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, "
			   (length toplevels) " old LAUNCHED toplevel tests and "
			   (length incompleted) " tests marked RUNNING but apparently dead."))

       ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
	 ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
       ;;
       ;; (db:delay-if-busy dbdat)
	 ;; (db:delay-if-busy dbdat)
       (let* (;; (min-incompleted (filter (lambda (x)
              ;;      		      (let* ((testpath (cadr x))
              ;;      			     (tdatpath (conc testpath "/testdat.db"))
              ;;      			     (dbexists (common:file-exists? tdatpath)))
              ;;      			(or (not dbexists) ;; if no file then something wrong - mark as incomplete
              ;;      			    (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim
              ;;      		    incompleted))
              (min-incompleted-ids (map car incompleted)) ;; do 'em all
              (all-ids             (append min-incompleted-ids (map car oldlaunched))))
         (if (> (length all-ids) 0)
             (begin
	       ;; (launch:is-test-alive "localhost" 435)
               (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD")
               (for-each
	 (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all
		(all-ids             (append min-incompleted-ids (map car oldlaunched))))
	   (if (> (length all-ids) 0)
	       (begin
		 ;; (launch:is-test-alive "localhost" 435)
		 (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ")
			      " as DEAD")
		 (for-each
                  (lambda (test-id)
                    (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id))
			   (tinfo   (db:get-test-info-by-id dbstruct run-id test-id))
			   (run-dir (db:test-get-rundir     tinfo))
			   (host    (db:test-get-host       tinfo))
			   (pid     (db:test-get-process_id tinfo))
			   (result (db:get-status-from-final-status-file run-dir)))
		      (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) 
			  (begin
			    (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
			    (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "PASS"
								   "Test stopped responding but it has PASSED; marking it PASS in the DB."))
			  (let ((is-alive (launch:is-test-alive host pid)))
			    (db:set-state-status-and-roll-up-items
			     dbstruct run-id test-id 'foo "COMPLETED" "PASS"
			     "Test stopped responding but it has PASSED; marking it PASS in the DB."))
			  (let ((is-alive (and (not (eq? pid 0))  ;; 0 is default in re-used field "attemptnum" where pid stored.
					       (launch:is-test-alive host pid))))
			    (if is-alive
				(debug:print 0 *default-log-port* "INFO: test " test-id " on host " host " has a process on pid " pid ", NOT setting to DEAD.")
				(debug:print 0 *default-log-port* "INFO: test " test-id " on host " host
					     " has a process on pid " pid ", NOT setting to DEAD.")
				(begin
				  (debug:print 0 *default-log-port* "INFO: test " test-id " final state/status is not COMPLETED/PASS. It is " result)
				  (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD"
									 "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
                     ;; call end of eud of run detection for posthook - from merge, is it needed?
                     ;; (launch:end-of-run-check run-id)
				  (debug:print 0 *default-log-port* "INFO: test " test-id
					       " final state/status is not COMPLETED/PASS. It is " result)
				  (db:set-state-status-and-roll-up-items
				   dbstruct run-id test-id 'foo "COMPLETED" "DEAD"
				   "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
		  ;; call end of eud of run detection for posthook - from merge, is it needed?
		  ;; (launch:end-of-run-check run-id)
		  all-ids)
	       ;;call end of eud of run detection for posthook
	       (launch:end-of-run-check run-id)
	       )))))))
		 ;;call end of eud of run detection for posthook
		 (launch:end-of-run-check run-id)
		 )))))))


;; ALL REPLACED BY THE BLOCK ABOVE
;;
;; 	    (sqlite3:execute 
;; 	     db
;; 	     (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN (" 
;; 		   (string-intersperse (map conc all-ids) ",")
;; 		   ");")
;;              run-id))))
;; 
;;     ;; Now do rollups for the toplevel tests
;;     ;;
;;     ;; (db:delay-if-busy dbdat)
;;     (for-each
;;      (lambda (toptest)
;;        (let ((test-name (list-ref toptest 3)))
;; ;;	     (run-id    (list-ref toptest 5)))
;; 	 (db:top-test-set-per-pf-counts dbstruct run-id test-name)))
;;      toplevels)))

;; BUG: Probably broken - does not explicitly use run-id in the query
;;
(define (db:top-test-set-per-pf-counts dbstruct run-id test-name)
  (db:general-call dbstruct 'top-test-set-per-pf-counts (list test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name)))

;; Clean out old junk and vacuum the database
3253
3254
3255
3256
3257
3258
3259
3260

3261
3262
3263
3264
3265
3266
3267
3273
3274
3275
3276
3277
3278
3279

3280
3281
3282
3283
3284
3285
3286
3287







-
+







     #f
     (lambda (db)
       (let* ((stmth (db:get-cache-stmth dbstruct db qry)))
	 (sqlite3:first-result stmth run-id))))))

;; For a given testname how many items are running? Used to determine
;; probability for regenerating html
;; 
;;
(define (db:get-count-tests-running-for-testname dbstruct run-id testname)
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (db)
     (let* ((stmt "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;")

Modified launch.scm from [baaec62bb2] to [774f43269a].

1367
1368
1369
1370
1371
1372
1373
1374

1375
1376
1377
1378
1379
1380
1381
1367
1368
1369
1370
1371
1372
1373

1374
1375
1376
1377
1378
1379
1380
1381







-
+







	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; 
	;; 			    (begin
	;; 			      (thread-sleep! 1)
	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time))))))
	 (item-path       (item-list->path itemdat))
	 (contour         #f)) ;; NOT READY FOR THIS (args:get-arg "-contour")))
    (let loop ((delta        (- (current-seconds) *last-launch*))
	       (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1)))
	       (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 0)))
      (if (> launch-delay delta)
	  (begin
	    (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay.
		(debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds"))
	    (thread-sleep! (- launch-delay delta))
	    (loop (- (current-seconds) *last-launch*) launch-delay))))
    (change-directory *toppath*)

Modified runs.scm from [77a44f327a] to [bbb893475a].

313
314
315
316
317
318
319
320



321
322
323
324
325
326
327
313
314
315
316
317
318
319

320
321
322
323
324
325
326
327
328
329







-
+
+
+







	   (args:get-arg "-one-pass"))
      (exit 0))

  (thread-sleep! (cond ;; BB: check with Matt.  Should this sleep move
		       ;; to cond clauses below where we determine we
		       ;; have too many jobs running rather than each
		       ;; time the and condition above is true (which
		       ;; seems like always)?
		  ;; seems like always)?
		  ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way
		   0)
        	  ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
		   (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
		   10)  ;; obviously haven't had any work to do for a while
		  (else 0)))
;;		   ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
;;		   (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)
;;		   )))
1190
1191
1192
1193
1194
1195
1196
1197

1198
1199
1200
1201
1202
1203
1204
1192
1193
1194
1195
1196
1197
1198

1199
1200
1201
1202
1203
1204
1205
1206







-
+







     ;; If no resources are available just kill time and loop again
     ;;
     ((not have-resources) ;; simply try again after waiting a second
      (if (runs:lownoise "no resources" 60)
	  (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ..."))
      ;; Have gone back and forth on this but db starvation is an issue.
      ;; wait one second before looking again to run jobs.
      (thread-sleep! 1)
      (thread-sleep! 0.25)
      ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
      (list (car newtal)(cdr newtal) reg reruns))
     
     ;; This is the final stage, everything is in place so launch the test
     ;;
     ((and have-resources
	   (or (null? prereqs-not-met)