Megatest

Check-in [1f3e2a5c9a]
Login
Overview
Comment:Many small fixes, still broken though
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65-cleanup
Files: files | file ages | folders
SHA1: 1f3e2a5c9a80016b0b2605a8c6fd6233707519cc
User & Date: matt on 2020-08-22 22:43:48
Other Links: branch diff | manifest | tags
Context
2020-08-23
00:03
Remove junk code causing false stack trace. check-in: dfe44999f0 user: matt tags: v1.65-cleanup
2020-08-22
22:43
Many small fixes, still broken though check-in: 1f3e2a5c9a user: matt tags: v1.65-cleanup
14:14
Merged in partial content from v1.66-captain-ulex work. ==7.2/0.9/1201/mars== check-in: 070ebd2d0d user: matt tags: v1.65-cleanup
Changes

Modified common.scm from [d232b0060b] to [5f4ffd9564].

2080
2081
2082
2083
2084
2085
2086
2087


2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116



2117
2118

2119
2120
2121
2122

2123
2124
2125
2126
2127
2128
2129
2130
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where
					       ;; numcpus (or could be
					       ;; maxload) is zero,
					       ;; crude fallback is to
					       ;; at least use 1
	 (loadjmp (- first next))


	 ;; add some randomness to the time to break any alignment
	 ;; where netbatch dumps many jobs to machines simultaneously
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)
						      (/ (- 1000 count) 10)
						      waitdelay)
						   (- first adjmaxload) ))  )))
    ;; let's let the user know once in a long while that load checking
    ;; is happening but not constantly report it
    (if (> (random 100) 75) ;; about 25% of the time
	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
			  ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
    (cond
     ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
	   (> num-tries 0))
      (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.")
      (thread-sleep! 10)
      (common:wait-for-cpuload maxload-in numcpus-in waitdelay
			       count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1)))
     ((and (> first adjmaxload)
	   (> count 0))
      (debug:print-info 0 *default-log-port*
			"server start delayed " adjwait
			" seconds due to load " first
			" exceeding max of " adjmaxload
			" on server " (or remote-host (get-host-name))
			" (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)



	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))

      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     (else
      (if (> num-tries 0)

	  (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing.")
	  (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing."))))))

(define (get-uname . params)
  (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
	 (uname #f))
    (if (null? (car uname-res))
	"unknown"







|
>
>








|












|






|
>
>
>

|
>




>
|







2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where
					       ;; numcpus (or could be
					       ;; maxload) is zero,
					       ;; crude fallback is to
					       ;; at least use 1
	 (loadjmp (- first (if (> next (* numcpus 0.7))
			       0
			       next))) ;; we will force a conservative calculation any time next is large.
	 ;; add some randomness to the time to break any alignment
	 ;; where netbatch dumps many jobs to machines simultaneously
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)
						      (/ (- 1000 count) 10)
						      waitdelay)
						   (- first adjmaxload) ))  )))
    ;; let's let the user know once in a long while that load checking
    ;; is happening but not constantly report it
    (if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
			  ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
    (cond
     ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
	   (> num-tries 0))
      (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.")
      (thread-sleep! 10)
      (common:wait-for-cpuload maxload-in numcpus-in waitdelay
			       count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1)))
     ((and (> first adjmaxload)
	   (> count 0))
      (debug:print-info 0 *default-log-port*
			"Delaying " adjwait
			" seconds due to load " first
			" exceeding max of " adjmaxload
			" on server " (or remote-host (get-host-name))
			" (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp (cond
		       ((> numcpus 8)(/ numcpus 4))
		       ((> numcpus 4)(/ numcpus 2))
		       (else 0.5)))
	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus
			(if msg (conc " " msg) ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     (else
      (if (> num-tries 0)
	  (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost")))
	      (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing."))
	  (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing."))))))

(define (get-uname . params)
  (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
	 (uname #f))
    (if (null? (car uname-res))
	"unknown"

Modified db.scm from [f30ded65d6] to [b17ec00910].

169
170
171
172
173
174
175










176
177
178
179
180
181
182
183
184
185
186

187
188
189
190
191
192


193
194
195
196
197
198
199
200
201
202
203
204
205











206
207
208
209
210
211
212
213
;; 	(mutex-lock! *rundb-mutex*)
;; 	(if (eq? mod-read 'mod)
;; 	    (dbr:dbstruct-mtime-set! dbstruct (current-milliseconds))
;; 	    (dbr:dbstruct-rtime-set! dbstruct (current-milliseconds)))
;; 	(dbr:dbstruct-inuse-set! dbstruct #f)
;; 	(mutex-unlock! *rundb-mutex*))))











;; (db:with-db dbstruct run-id sqlite3:exec "select blah fgrom blaz;")
;; r/w is a flag to indicate if the db is modified by this query #t = yes, #f = no
;;
(define (db:with-db dbstruct run-id r/w proc . params)
  (let* ((have-struct (dbr:dbstruct? dbstruct))
         (dbdat (if have-struct 
                    (db:get-db dbstruct)
                    #f))
	 (db    (if have-struct
		    (db:dbdat-get-db dbdat)
		    dbstruct))

	 (use-mutex (> *api-process-request-count* 25)))
    (if (and use-mutex
	     (common:low-noise-print 120 "over-50-parallel-api-requests"))
	(debug:print-info 0 *default-log-port* *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access"))
    (if (common:low-noise-print 600 (conc "parallel-api-requests" *max-api-process-requests*))
	(debug:print-info 2 *default-log-port* "Parallel api request count: " *api-process-request-count* " max parallel requests: " *max-api-process-requests*))


    (handle-exceptions
     exn
     (begin
       (print-call-chain (current-error-port))
       (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn))
       ;; there is no recovering at this time. exit
       (exit 50))
     (if use-mutex (mutex-lock! *db-with-db-mutex*))
     (let ((res (apply proc db params)))
       (if use-mutex (mutex-unlock! *db-with-db-mutex*))
       ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
       (if dbdat (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat))
       res))))












;;======================================================================
;; K E E P   F I L E D B   I N   dbstruct
;;======================================================================

;; (define (db:get-filedb dbstruct run-id)
;;   (let ((db (vector-ref dbstruct 2)))
;;     (if db







>
>
>
>
>
>
>
>
>
>





|
|
|
|
|
|
>
|





>
>
|
|
|
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
|







169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
;; 	(mutex-lock! *rundb-mutex*)
;; 	(if (eq? mod-read 'mod)
;; 	    (dbr:dbstruct-mtime-set! dbstruct (current-milliseconds))
;; 	    (dbr:dbstruct-rtime-set! dbstruct (current-milliseconds)))
;; 	(dbr:dbstruct-inuse-set! dbstruct #f)
;; 	(mutex-unlock! *rundb-mutex*))))

(define-inline (db:generic-error-printout exn . message)
  (print-call-chain (current-error-port))
  (apply debug:print-error 0 *default-log-port* message)
  (debug:print-error 0 *default-log-port* "   params: " params
		     ", error: "     ((condition-property-accessor 'exn 'message)   exn)
		     ", arguments: " ((condition-property-accessor 'exn 'arguments) exn)
		     ", location: "  ((condition-property-accessor 'exn 'location)  exn)
		     ))

(print-call-chain (current-error-port))
;; (db:with-db dbstruct run-id sqlite3:exec "select blah fgrom blaz;")
;; r/w is a flag to indicate if the db is modified by this query #t = yes, #f = no
;;
(define (db:with-db dbstruct run-id r/w proc . params)
  (let* ((have-struct (dbr:dbstruct? dbstruct))
         (dbdat     (if have-struct 
			(db:get-db dbstruct)
			#f))
	 (db        (if have-struct
			(db:dbdat-get-db dbdat)
			dbstruct))
	 (fname     (db:dbdat-get-path dbdat))
	 (use-mutex (> *api-process-request-count* 25))) ;; was 25
    (if (and use-mutex
	     (common:low-noise-print 120 "over-50-parallel-api-requests"))
	(debug:print-info 0 *default-log-port* *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access"))
    (if (common:low-noise-print 600 (conc "parallel-api-requests" *max-api-process-requests*))
	(debug:print-info 2 *default-log-port* "Parallel api request count: " *api-process-request-count* " max parallel requests: " *max-api-process-requests*))
    (condition-case
     (begin
    ;;;;;;;;; (handle-exceptions
    ;;;;;;;;;  exn
    ;;;;;;;;;  (begin
    ;;;;;;;;;    (print-call-chain (current-error-port))
    ;;;;;;;;;    (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn))
    ;;;;;;;;;    ;; there is no recovering at this time. exit
    ;;;;;;;;;    (exit 50))
       (if use-mutex (mutex-lock! *db-with-db-mutex*))
       (let ((res (apply proc db params)))
	 (if use-mutex (mutex-unlock! *db-with-db-mutex*))
	 ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
	 (if dbdat (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat))
	 res))
     (exn (io-error)
	  (db:generic-error-printout exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again."))
     (exn (corrupt)
	  (db:generic-error-printout exn "ERROR: database " fname " is corrupt. Repair it to proceed."))
     (exn (busy)
	  (db:generic-error-printout exn "ERROR: database " fname
				     " is locked. Try copying to another location, remove original and copy back."))
     (exn (permission)(db:generic-error-printout exn "ERROR: database " fname " has some permissions problem."))
     (exn ()
	  (db:generic-error-printout exn "ERROR: Unknown error with database " fname " message: "
		       ((condition-property-accessor 'exn 'message) exn))))))
      
;;======================================================================
;; K E E P   F I L E D B   I N   dbstruct
;;======================================================================

;; (define (db:get-filedb dbstruct run-id)
;;   (let ((db (vector-ref dbstruct 2)))
;;     (if db
1797
1798
1799
1800
1801
1802
1803















1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837


1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853

1854
1855
1856

1857
1858
1859


1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876

1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888

1889

1890
1891
1892

1893
1894

1895

1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
         )
    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)

    (db:with-db 
     dbstruct #f #f
     (lambda (db)















       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
       ;;
       ;; HOWEVER: this code in run:test seems to work fine
       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
       ;;                     (db:test-get-run_duration testdat)))
       ;;                    600) 
       ;; (db:delay-if-busy dbdat)
       (sqlite3:for-each-row 
        (lambda (test-id run-dir uname testname item-path event-time run-duration)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (begin
                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
              (begin
                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
                (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration))))
        db
        
        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');"
        run-id running-deadtime) ;; default time 720 seconds

       
       (sqlite3:for-each-row 
        (lambda (test-id run-dir uname testname item-path event-time run-duration)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (begin
                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
              (begin
                (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)


                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
        db
        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');"
        run-id remotehoststart-deadtime) ;; default time 230 seconds

       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
       ;;
       ;; (db:delay-if-busy dbdat)
       (sqlite3:for-each-row
        (lambda (test-id run-dir uname testname item-path)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
              (begin
                (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id" 1 day since event_time marked")

                (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))
        db
        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"

        run-id)
       
       (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")



       ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
       ;;
       ;; (db:delay-if-busy dbdat)
       (let* (;; (min-incompleted (filter (lambda (x)
              ;;      		      (let* ((testpath (cadr x))
              ;;      			     (tdatpath (conc testpath "/testdat.db"))
              ;;      			     (dbexists (common:file-exists? tdatpath)))
              ;;      			(or (not dbexists) ;; if no file then something wrong - mark as incomplete
              ;;      			    (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim
              ;;      		    incompleted))
              (min-incompleted-ids (map car incompleted)) ;; do 'em all
              (all-ids             (append min-incompleted-ids (map car oldlaunched))))
         (if (> (length all-ids) 0)
             (begin
	       ;; (launch:is-test-alive "localhost" 435)
               (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD")

               (for-each
                  (lambda (test-id)
                    (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id))
			   (tinfo   (db:get-test-info-by-id dbstruct run-id test-id))
			   (run-dir (db:test-get-rundir     tinfo))
			   (host    (db:test-get-host       tinfo))
			   (pid     (db:test-get-process_id tinfo))
			   (result (db:get-status-from-final-status-file run-dir)))
		      (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) 
			  (begin
			    (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
			    (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "PASS"

								   "Test stopped responding but it has PASSED; marking it PASS in the DB."))

			  (let ((is-alive (launch:is-test-alive host pid)))
			    (if is-alive
				(debug:print 0 *default-log-port* "INFO: test " test-id " on host " host " has a process on pid " pid ", NOT setting to DEAD.")

				(begin
				  (debug:print 0 *default-log-port* "INFO: test " test-id " final state/status is not COMPLETED/PASS. It is " result)

				  (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD"

									 "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
                     ;; call end of eud of run detection for posthook - from merge, is it needed?
                     ;; (launch:end-of-run-check run-id)
		  all-ids)
	       ;;call end of eud of run detection for posthook
	       (launch:end-of-run-check run-id)
	       )))))))


;; ALL REPLACED BY THE BLOCK ABOVE
;;
;; 	    (sqlite3:execute 
;; 	     db
;; 	     (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN (" 
;; 		   (string-intersperse (map conc all-ids) ",")
;; 		   ");")
;;              run-id))))
;; 
;;     ;; Now do rollups for the toplevel tests
;;     ;;
;;     ;; (db:delay-if-busy dbdat)
;;     (for-each
;;      (lambda (toptest)
;;        (let ((test-name (list-ref toptest 3)))
;; ;;	     (run-id    (list-ref toptest 5)))
;; 	 (db:top-test-set-per-pf-counts dbstruct run-id test-name)))
;;      toplevels)))

;; BUG: Probably broken - does not explicitly use run-id in the query
;;
(define (db:top-test-set-per-pf-counts dbstruct run-id test-name)
  (db:general-call dbstruct 'top-test-set-per-pf-counts (list test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name)))

;; Clean out old junk and vacuum the database







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<
|
|
|
|
|
|
|
|
|
|
>
>
|
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>

<
<
>
|

|
>
>

|

|
<
<
<
<
<
<
<
|
|
|
|
|
|
>
|










|
>
|
>
|

|
>

|
>
|
>
|
|
|

|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865

1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878

1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894


1895
1896
1897
1898
1899
1900
1901
1902
1903
1904







1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942




















1943
1944
1945
1946
1947
1948
1949
         )
    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)

    (db:with-db 
     dbstruct #f #f
     (lambda (db)
       (let* ((stmth1 (db:get-cache-stmth
		       dbstruct db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('RUNNING');"))
	      (stmth2 (db:get-cache-stmth
		       dbstruct db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('REMOTEHOSTSTART');"))
	      (stmth3 (db:get-cache-stmth
		       dbstruct db
		       "SELECT id,rundir,uname,testname,item_path FROM tests
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
                                          AND state IN ('LAUNCHED');")))
	 ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
	 ;;
	 ;; HOWEVER: this code in run:test seems to work fine
	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
	 ;;                     (db:test-get-run_duration testdat)))
	 ;;                    600) 
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
				    " event-time="event-time" run-duration="run-duration))))
	  stmth1
	  run-id running-deadtime) ;; default time 720 seconds
       

	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
				    " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
				    " run-duration="run-duration)
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))

	  stmth2
	  run-id remotehoststart-deadtime) ;; default time 230 seconds
	 
	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
	 ;;
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row
	  (lambda (test-id run-dir uname testname item-path)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id
				    " 1 day since event_time marked")
                (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))


	  stmth3
	  run-id)
       
	 (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, "
			   (length toplevels) " old LAUNCHED toplevel tests and "
			   (length incompleted) " tests marked RUNNING but apparently dead."))

	 ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
       ;;
	 ;; (db:delay-if-busy dbdat)







	 (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all
		(all-ids             (append min-incompleted-ids (map car oldlaunched))))
	   (if (> (length all-ids) 0)
	       (begin
		 ;; (launch:is-test-alive "localhost" 435)
		 (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ")
			      " as DEAD")
		 (for-each
                  (lambda (test-id)
                    (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id))
			   (tinfo   (db:get-test-info-by-id dbstruct run-id test-id))
			   (run-dir (db:test-get-rundir     tinfo))
			   (host    (db:test-get-host       tinfo))
			   (pid     (db:test-get-process_id tinfo))
			   (result (db:get-status-from-final-status-file run-dir)))
		      (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) 
			  (begin
			    (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
			    (db:set-state-status-and-roll-up-items
			     dbstruct run-id test-id 'foo "COMPLETED" "PASS"
			     "Test stopped responding but it has PASSED; marking it PASS in the DB."))
			  (let ((is-alive (and (not (eq? pid 0))  ;; 0 is default in re-used field "attemptnum" where pid stored.
					       (launch:is-test-alive host pid))))
			    (if is-alive
				(debug:print 0 *default-log-port* "INFO: test " test-id " on host " host
					     " has a process on pid " pid ", NOT setting to DEAD.")
				(begin
				  (debug:print 0 *default-log-port* "INFO: test " test-id
					       " final state/status is not COMPLETED/PASS. It is " result)
				  (db:set-state-status-and-roll-up-items
				   dbstruct run-id test-id 'foo "COMPLETED" "DEAD"
				   "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
		  ;; call end of eud of run detection for posthook - from merge, is it needed?
		  ;; (launch:end-of-run-check run-id)
		  all-ids)
		 ;;call end of eud of run detection for posthook
		 (launch:end-of-run-check run-id)
		 )))))))





















;; BUG: Probably broken - does not explicitly use run-id in the query
;;
(define (db:top-test-set-per-pf-counts dbstruct run-id test-name)
  (db:general-call dbstruct 'top-test-set-per-pf-counts (list test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name)))

;; Clean out old junk and vacuum the database
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
     #f
     (lambda (db)
       (let* ((stmth (db:get-cache-stmth dbstruct db qry)))
	 (sqlite3:first-result stmth run-id))))))

;; For a given testname how many items are running? Used to determine
;; probability for regenerating html
;; 
(define (db:get-count-tests-running-for-testname dbstruct run-id testname)
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (db)
     (let* ((stmt "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;")







|







3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
     #f
     (lambda (db)
       (let* ((stmth (db:get-cache-stmth dbstruct db qry)))
	 (sqlite3:first-result stmth run-id))))))

;; For a given testname how many items are running? Used to determine
;; probability for regenerating html
;;
(define (db:get-count-tests-running-for-testname dbstruct run-id testname)
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (db)
     (let* ((stmt "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;")

Modified launch.scm from [baaec62bb2] to [774f43269a].

1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; 
	;; 			    (begin
	;; 			      (thread-sleep! 1)
	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time))))))
	 (item-path       (item-list->path itemdat))
	 (contour         #f)) ;; NOT READY FOR THIS (args:get-arg "-contour")))
    (let loop ((delta        (- (current-seconds) *last-launch*))
	       (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1)))
      (if (> launch-delay delta)
	  (begin
	    (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay.
		(debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds"))
	    (thread-sleep! (- launch-delay delta))
	    (loop (- (current-seconds) *last-launch*) launch-delay))))
    (change-directory *toppath*)







|







1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; 
	;; 			    (begin
	;; 			      (thread-sleep! 1)
	;; 			      (loop (rmt:no-sync-get-lock lock-key) expire-time))))))
	 (item-path       (item-list->path itemdat))
	 (contour         #f)) ;; NOT READY FOR THIS (args:get-arg "-contour")))
    (let loop ((delta        (- (current-seconds) *last-launch*))
	       (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 0)))
      (if (> launch-delay delta)
	  (begin
	    (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay.
		(debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds"))
	    (thread-sleep! (- launch-delay delta))
	    (loop (- (current-seconds) *last-launch*) launch-delay))))
    (change-directory *toppath*)

Modified runs.scm from [77a44f327a] to [bbb893475a].

313
314
315
316
317
318
319
320


321
322
323
324
325
326
327
	   (args:get-arg "-one-pass"))
      (exit 0))

  (thread-sleep! (cond ;; BB: check with Matt.  Should this sleep move
		       ;; to cond clauses below where we determine we
		       ;; have too many jobs running rather than each
		       ;; time the and condition above is true (which
		       ;; seems like always)?


        	  ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
		   (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
		   10)  ;; obviously haven't had any work to do for a while
		  (else 0)))
;;		   ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
;;		   (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)
;;		   )))







|
>
>







313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
	   (args:get-arg "-one-pass"))
      (exit 0))

  (thread-sleep! (cond ;; BB: check with Matt.  Should this sleep move
		       ;; to cond clauses below where we determine we
		       ;; have too many jobs running rather than each
		       ;; time the and condition above is true (which
		  ;; seems like always)?
		  ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way
		   0)
        	  ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
		   (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
		   10)  ;; obviously haven't had any work to do for a while
		  (else 0)))
;;		   ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
;;		   (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)
;;		   )))
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
     ;; If no resources are available just kill time and loop again
     ;;
     ((not have-resources) ;; simply try again after waiting a second
      (if (runs:lownoise "no resources" 60)
	  (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ..."))
      ;; Have gone back and forth on this but db starvation is an issue.
      ;; wait one second before looking again to run jobs.
      (thread-sleep! 1)
      ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
      (list (car newtal)(cdr newtal) reg reruns))
     
     ;; This is the final stage, everything is in place so launch the test
     ;;
     ((and have-resources
	   (or (null? prereqs-not-met)







|







1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
     ;; If no resources are available just kill time and loop again
     ;;
     ((not have-resources) ;; simply try again after waiting a second
      (if (runs:lownoise "no resources" 60)
	  (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ..."))
      ;; Have gone back and forth on this but db starvation is an issue.
      ;; wait one second before looking again to run jobs.
      (thread-sleep! 0.25)
      ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
      (list (car newtal)(cdr newtal) reg reruns))
     
     ;; This is the final stage, everything is in place so launch the test
     ;;
     ((and have-resources
	   (or (null? prereqs-not-met)