Megatest

Diff
Login

Differences From Artifact [8a077942c3]:

To Artifact [62e275181d]:


1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     db
     "SELECT id FROM archive_blocks WHERE archive_disk_id=? AND disk_path=?;"
     bdisk-id archive-path)
    (if res ;; record exists, update du if applicable and return res
	(begin
	  (if du (sqlite3:execute db "UPDATE archive_blocks SET last_du=?,last_du_time=(strftime('%s','now'))
                                          WHERE archive_disk_id=? AND disk_path=?;"
				   bdisk-id archive-path du))
	  res)
	(begin
	  (sqlite3:execute db "INSERT OR REPLACE INTO archive_blocks (archive_disk_id,disk_path,last_du)
                                                        VALUES (?,?,?);"
			   bdisk-id archive-path (or du 0))
	  (db:archive-register-block-name dbstruct bdisk-id archive-path du: du)))
    (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat)
    res))


;; The "archived" field in tests is overloaded; 0 = not archived, > 0 archived in block with given id
;;
(define (db:test-set-archive-block-id dbstruct run-id test-id archive-block-id)







<
|

|
<




|







1462
1463
1464
1465
1466
1467
1468

1469
1470
1471

1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
    (sqlite3:for-each-row
     (lambda (id)
       (set! res id))
     db
     "SELECT id FROM archive_blocks WHERE archive_disk_id=? AND disk_path=?;"
     bdisk-id archive-path)
    (if res ;; record exists, update du if applicable and return res

	(if du (sqlite3:execute db "UPDATE archive_blocks SET last_du=?,last_du_time=(strftime('%s','now'))
                                          WHERE archive_disk_id=? AND disk_path=?;"
				bdisk-id archive-path du))

	(begin
	  (sqlite3:execute db "INSERT OR REPLACE INTO archive_blocks (archive_disk_id,disk_path,last_du)
                                                        VALUES (?,?,?);"
			   bdisk-id archive-path (or du 0))
	  (set! res (db:archive-register-block-name dbstruct bdisk-id archive-path du: du))))
    (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat)
    res))


;; The "archived" field in tests is overloaded; 0 = not archived, > 0 archived in block with given id
;;
(define (db:test-set-archive-block-id dbstruct run-id test-id archive-block-id)
1612
1613
1614
1615
1616
1617
1618




















1619
1620
1621
1622
1623
1624
1625
1626
1627
1628


1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639



1640
1641
1642
1643
1644
1645
1646
           #f
           #t)))))

;; given a launch delay (minimum time from last launch) return amount of time to wait
;;
;; (define (db:launch-delay-left dbstruct run-id launch-delay)
  





















;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as
;;                          end_time,strftime('%s','now') as now from tests where state in
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));

(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
  (let* ((incompleted '())
	 (oldlaunched '())
	 (toplevels   '())


         (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime")))
         (server-start-allowance 200)
         (server-overloaded-budget 200)
         (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30))
         (launch-monitor-on-time-budget 30)
         (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget))
         (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30))
         (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default))
         (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))
         (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period)
         )



    (db:with-db 
     dbstruct #f #f
     (lambda (db)
       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
       ;;
       ;; HOWEVER: this code in run:test seems to work fine
       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>










>
>











>
>
>







1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
           #f
           #t)))))

;; given a launch delay (minimum time from last launch) return amount of time to wait
;;
;; (define (db:launch-delay-left dbstruct run-id launch-delay)
  


;; Read the ".final-status" file located in run-dir.
;;
;; Returns the file contents as a list of lines (strings), or #f when the
;; file is not readable or reading it fails. The caller in this file treats
;; the first line as the state and the second as the status
;; (e.g. "COMPLETED" followed by "PASS").
;;
(define (db:get-status-from-final-status-file run-dir)
  (let ((infile (conc run-dir "/.final-status")))
    ;; first verify we are able to read the status file
    (if (not (file-read-access? infile))
        (begin
          (debug:print 0 *default-log-port* "ERROR: cannot read " infile)
          (debug:print 0 *default-log-port* "ERROR: run-dir is " run-dir)
          #f)
        ;; The file can vanish or become unreadable between the access check
        ;; above and the read below; treat that as "no status available"
        ;; rather than letting the exception abort the caller's loop.
        (handle-exceptions
         exn
         (begin
           (debug:print 0 *default-log-port* "ERROR: failed to read " infile ", exn=" exn)
           #f)
         (with-input-from-file infile read-lines)))))




;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as
;;                          end_time,strftime('%s','now') as now from tests where state in
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));

(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
  (let* ((incompleted '())
	 (oldlaunched '())
	 (toplevels   '())
          ;; The default running-deadtime is 720 seconds = 12 minutes.
          ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30))
         (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime")))
         (server-start-allowance 200)
         (server-overloaded-budget 200)
         (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30))
         (launch-monitor-on-time-budget 30)
         (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget))
         (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30))
         (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default))
         (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))
         (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period)
         )
    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)

    (db:with-db 
     dbstruct #f #f
     (lambda (db)
       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
       ;;
       ;; HOWEVER: this code in run:test seems to work fine
       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
1655
1656
1657
1658
1659
1660
1661

1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
              (begin
                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
              (begin
                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
                (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration))))
        db

        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');"
        run-id running-deadtime)

       
       (sqlite3:for-each-row 
        (lambda (test-id run-dir uname testname item-path event-time run-duration)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (begin
                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
              (begin
                (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)
                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
        db
        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');"
        run-id remotehoststart-deadtime)

       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
       ;;
       ;; (db:delay-if-busy dbdat)
       (sqlite3:for-each-row
        (lambda (test-id run-dir uname testname item-path)
          (if (and (equal? uname "n/a")







>

|















|







1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
              (begin
                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
              (begin
                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
                (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration))))
        db
        
        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');"
        run-id running-deadtime) ;; default time 720 seconds

       
       (sqlite3:for-each-row 
        (lambda (test-id run-dir uname testname item-path event-time run-duration)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (begin
                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
              (begin
                (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)
                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
        db
        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');"
        run-id remotehoststart-deadtime) ;; default time 230 seconds

       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
       ;;
       ;; (db:delay-if-busy dbdat)
       (sqlite3:for-each-row
        (lambda (test-id run-dir uname testname item-path)
          (if (and (equal? uname "n/a")
1706
1707
1708
1709
1710
1711
1712

1713
1714
1715
















1716
1717
1718




1719
1720
1721
1722
1723
1724
1725
              ;;      			(or (not dbexists) ;; if no file then something wrong - mark as incomplete
              ;;      			    (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim
              ;;      		    incompleted))
              (min-incompleted-ids (map car incompleted)) ;; do 'em all
              (all-ids             (append min-incompleted-ids (map car oldlaunched))))
         (if (> (length all-ids) 0)
             (begin

               (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD")
               (for-each
                (lambda (test-id)
















                  (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))
                  ;;(db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS.  ref ticket 220546828
                all-ids))))))))





;; ALL REPLACED BY THE BLOCK ABOVE
;;
;; 	    (sqlite3:execute 
;; 	     db
;; 	     (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN (" 
;; 		   (string-intersperse (map conc all-ids) ",")







>


|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|
|
>
>
>
>







1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
              ;;      			(or (not dbexists) ;; if no file then something wrong - mark as incomplete
              ;;      			    (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim
              ;;      		    incompleted))
              (min-incompleted-ids (map car incompleted)) ;; do 'em all
              (all-ids             (append min-incompleted-ids (map car oldlaunched))))
         (if (> (length all-ids) 0)
             (begin
	       ;; (launch:is-test-alive "localhost" 435)
               (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD")
               (for-each
                  (lambda (test-id)
                    (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id))
			   (tinfo   (db:get-test-info-by-id dbstruct run-id test-id))
			   (run-dir (db:test-get-rundir     tinfo))
			   (host    (db:test-get-host       tinfo))
			   (pid     (db:test-get-process_id tinfo))
			   (result (db:get-status-from-final-status-file run-dir)))
		      (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) 
			  (begin
			    (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
			    (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "PASS"
								   "Test stopped responding but it has PASSED; marking it PASS in the DB."))
			  (let ((is-alive (launch:is-test-alive host pid)))
			    (if is-alive
				(debug:print 0 *default-log-port* "INFO: test " test-id " on host " host " has a process on pid " pid ", NOT setting to DEAD.")
				(begin
				  (debug:print 0 *default-log-port* "INFO: test " test-id " final state/status is not COMPLETED/PASS. It is " result)
				  (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD"
									 "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
		  all-ids)
	       ;; call end-of-run detection for the posthook
	       (launch:end-of-run-check run-id)
	       )))))))


;; ALL REPLACED BY THE BLOCK ABOVE
;;
;; 	    (sqlite3:execute 
;; 	     db
;; 	     (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN (" 
;; 		   (string-intersperse (map conc all-ids) ",")