Megatest: Diff

Differences From Artifact [f99424cdf8]:

File runs.scm — part of check-in [9e35b1252c] at 2020-10-12 16:49:17 on branch v1.65-minor-patch — Reduced message from failed to info. Reverted a delay which seems to help pass full stack ext-tests. (user: mrwellan, size: 160457) [annotate] [blame] [check-ins using] [more...]

To Artifact [7ac979c2cd]:

File runs.scm — part of check-in [9ccc81e58b] at 2020-11-02 15:20:17 on branch v1.65 — Merged run removal fix (user: mrwellan, size: 161348) [annotate] [blame] [check-ins using] [more...]

︙			︙
59 60 61 62 63 64 65 66 67 68 69 70 71 72 73	(last-jobs-check-time 0) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup waitons testmode newtal itemmaps prereqs-not-met) ;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files ;; - remove any that are over 3600 seconds old ;; - if there are any that are younger than 10 seconds ;; * sleep 10 seconds ;; * touch my key-host-pid.softlock file ;; * return ;; - if there are no files younger than 10 seconds	\| > > > > > > > > > > > > > > > > > > > > > > > > > > >	59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100	(last-jobs-check-time 0) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup waitons testmode newtal itemmaps prereqs-not-met) (module runsmod ( runs:wait-if-seen-recently ) (import scheme chicken data-structures extras files) (import posix typed-records srfi-18 srfi-69 md5 message-digest regex srfi-1) (define last-seen-ht (make-hash-table)) (define (runs:wait-if-seen-recently wait-until . keys) (let* ((full-key (string-intersperse keys "-")) (last-seen (hash-table-ref/default last-seen-ht full-key 0)) (now (current-seconds)) (delta (- now last-seen)) (needed (if (< delta wait-until) 0 (- wait-until delta)))) (if (> needed 0)(thread-sleep! needed)) (hash-table-set! last-seen-ht full-key (current-seconds)) needed)) ) (import runsmod) ;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files ;; - remove any that are over 3600 seconds old ;; - if there are any that are younger than 10 seconds ;; * sleep 10 seconds ;; * touch my key-host-pid.softlock file ;; * return ;; - if there are no files younger than 10 seconds
︙			︙
319 320 321 322 323 324 325 ~~326~~ 327 328 329 330 331 332 333	;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) ~~(let* ((num-running (rmt:get-count-tests-running run-id ~~#f)) ;; fastmode=no~~~~ (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup configdat "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0) (runs:inc-can-run-more-tests-count runsdat)) ;; (set! runs:can-run-more-tests-count (+ runs:can-run-more-tests-count 1)))	\|	346 347 348 349 350 351 352 353 354 355 356 357 358 359 360	;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup configdat "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0) (runs:inc-can-run-more-tests-count runsdat)) ;; (set! runs:can-run-more-tests-count (+ runs:can-run-more-tests-count 1)))
︙			︙
433 434 435 436 437 438 439 440 441 442 443 444 445 446	(debug:print-info 0 default-log-port "running run-post-hook: \"" run-post-hook "\", log is " actual-logf) (system (conc run-post-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 default-log-port "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run.")))))) ;; return #t when all items in waitors-upon list are represented in test-patt, #f otherwise. (define (runs:testpatts-mention-waitors-upon? test-patt waitors-upon) (null? (tests:filter-test-names-not-matched waitors-upon test-patt))) ;;====================================================================== ;; runs:run-tests is called from megatest.scm and itself ;;====================================================================== ;; ;; test-names: Comma separated patterns same as test-patts but used in selection ;; of tests to run. The item portions are not respected.	> >	460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475	(debug:print-info 0 default-log-port "running run-post-hook: \"" run-post-hook "\", log is " actual-logf) (system (conc run-post-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 default-log-port "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run.")))))) ;; return #t when all items in waitors-upon list are represented in test-patt, #f otherwise. (define (runs:testpatts-mention-waitors-upon? test-patt waitors-upon) (null? (tests:filter-test-names-not-matched waitors-upon test-patt))) (define find-and-mark-incomplete-last-run (make-hash-table)) ;;====================================================================== ;; runs:run-tests is called from megatest.scm and itself ;;====================================================================== ;; ;; test-names: Comma separated patterns same as test-patts but used in selection ;; of tests to run. The item portions are not respected.
︙			︙
604 605 606 607 608 609 610 ~~611 612 613 614 615 616 617~~ 618 619 620 621 622 623 624	;; run the run prehook if there are no tests yet run for this run: ;; (runs:run-pre-hook run-id) ;; mark all test launched flag as false in the meta table (rmt:set-var (conc "lunch-complete-" run-id) "no") (debug:print-info 1 default-log-port "Setting end-of-run to no") (let* ((config-reruns (let ((x (configf:lookup configdat "setup" "reruns"))) ~~(if x (string->number x) #f))) (config-rerun-cnt (if config-reruns config-reruns 1))) (if (eq? config-rerun-cnt run-count) (rmt:set-var (conc "end-of-run-" run-id) "no")))~~ (rmt:set-run-state-status run-id "new" "n/a") ;; now add non-directly referenced dependencies (i.e. waiton) ;;====================================================================== ;; refactoring this block into tests:get-full-data ;; ;; What happened, this code is now duplicated in tests!? ;;	\| \| \| \| \| \| \|	633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653	;; run the run prehook if there are no tests yet run for this run: ;; (runs:run-pre-hook run-id) ;; mark all test launched flag as false in the meta table (rmt:set-var (conc "lunch-complete-" run-id) "no") (debug:print-info 1 default-log-port "Setting end-of-run to no") (let* ((config-reruns (let ((x (configf:lookup configdat "setup" "reruns"))) (if x (string->number x) #f))) (config-rerun-cnt (if config-reruns config-reruns 1))) (if (eq? config-rerun-cnt run-count) (rmt:set-var (conc "end-of-run-" run-id) "no"))) (rmt:set-run-state-status run-id "new" "n/a") ;; now add non-directly referenced dependencies (i.e. waiton) ;;====================================================================== ;; refactoring this block into tests:get-full-data ;; ;; What happened, this code is now duplicated in tests!? ;;
︙			︙
728 729 730 731 732 733 734 ~~735 736 737 738 739 740 741 742 743~~ 744 745 746 747 748 749 ~~750 751~~ ~~752~~ 753 754 ~~755~~ 756 ~~757 758~~ 759 760 761 762 763 764 765 766 767 768 769 770 ~~771 772~~ 773 774 775 776 777 778 779	(debug:print-info 1 default-log-port "Adding \"" (string-intersperse required-tests " ") "\" to the run queue")) ;; NOTE: these are all parent tests, items are not expanded yet. (debug:print-info 4 default-log-port "test-records=" (hash-table->alist test-records)) (let ((reglen (configf:lookup configdat "setup" "runqueue"))) (if (> (length (hash-table-keys test-records)) 0) (let* ((keep-going #t) (run-queue-retries 5) ~~#;(th1 (make-thread (lambda ()~~ ~~(handle-exceptions~~ ~~exn~~ ~~(begin~~ ~~(print-call-chain)~~ ~~(print " message: " ((condition-property-accessor 'exn 'message) exn)))~~ ~~(runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests~~ ~~(any->number reglen) all-tests-registry)))~~ ~~"runs:run-tests-queue"))~~ (th2 (make-thread (lambda () ;; BBQ: why are we visiting ALL runs here? ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions ~~exn (debug:print 0 default-log-port "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)~~ ~~(rmt:find-and-mark-incomplete run-id #f~~)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) ~~;; (thread-start! th1)~~ (thread-start! th2) ~~;; (thread-join! th1)~~ ~~;; just do the main stuff in the main thread~~ (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry) (set! keep-going #f) (thread-join! 
th2) ;; if run-count > 0 call, set -preclean and -rerun STUCK/DEAD (if (> run-count 0) ;; handle reruns (begin (if (not (hash-table-ref/default flags "-preclean" #f)) (hash-table-set! flags "-preclean" #t)) (if (not (hash-table-ref/default flags "-rerun" #f)) (hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS")) ;; recursive call to self ~~(runs:run-tests target runname test-patts user flags run-count: (- run-count 1))) (launch:end-of-run-check run-id)))~~ (debug:print-info 0 default-log-port "No tests to run"))) (debug:print-info 4 default-log-port "All done by here") ;; TODO: try putting post hook call here ; (debug:print-info 2 default-log-port " run-count " run-count) ; (runs:run-post-hook run-id)) ; (debug:print-info 2 default-log-port "Not calling post hook runcount = " run-count ))	< < < < < < < < < \| \| > > > > > \| < < < \| \|	757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801	(debug:print-info 1 default-log-port "Adding \"" (string-intersperse required-tests " ") "\" to the run queue")) ;; NOTE: these are all parent tests, items are not expanded yet. (debug:print-info 4 default-log-port "test-records=" (hash-table->alist test-records)) (let ((reglen (configf:lookup configdat "setup" "runqueue"))) (if (> (length (hash-table-keys test-records)) 0) (let* ((keep-going #t) (run-queue-retries 5) (th2 (make-thread (lambda () ;; BBQ: why are we visiting ALL runs here? ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... 
(let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions exn (debug:print 0 default-log-port "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn) ;; lets run this only if a run has been NOT seen for more than 900 seconds (if (> (- (current-seconds)(hash-table-ref/default find-and-mark-incomplete-last-run run-id 0)) 900) (begin (rmt:find-and-mark-incomplete run-id #f) (hash-table-set! find-and-mark-incomplete-last-run run-id (current-seconds))) )))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) (thread-start! th2) (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry) (set! keep-going #f) (thread-join! th2) ;; if run-count > 0 call, set -preclean and -rerun STUCK/DEAD (if (> run-count 0) ;; handle reruns (begin (if (not (hash-table-ref/default flags "-preclean" #f)) (hash-table-set! flags "-preclean" #t)) (if (not (hash-table-ref/default flags "-rerun" #f)) (hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS")) ;; recursive call to self (runs:run-tests target runname test-patts user flags run-count: (- run-count 1))) (launch:end-of-run-check run-id))) (debug:print-info 0 default-log-port "No tests to run"))) (debug:print-info 4 default-log-port "All done by here") ;; TODO: try putting post hook call here ; (debug:print-info 2 default-log-port " run-count " run-count) ; (runs:run-post-hook run-id)) ; (debug:print-info 2 default-log-port "Not calling post hook runcount = " run-count ))
︙			︙
1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481	(num-retries 0) (max-retries (configf:lookup configdat "setup" "maxretries")) (max-concurrent-jobs (configf:lookup-number configdat "setup" "max_concurrent_jobs" default: 50)) (reglen (if (number? reglen-in) reglen-in 1)) (last-time-incomplete (- (current-seconds) 900)) ;; force at least one clean up cycle (last-time-some-running (current-seconds)) ;; (tdbdat (tasks:open-db)) (runsdat (make-runs:dat ;; hed: hed ;; tal: tal ;; reg: reg ;; reruns: reruns reglen: reglen regfull: #f ;; regfull	>	1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504	(num-retries 0) (max-retries (configf:lookup configdat "setup" "maxretries")) (max-concurrent-jobs (configf:lookup-number configdat "setup" "max_concurrent_jobs" default: 50)) (reglen (if (number? reglen-in) reglen-in 1)) (last-time-incomplete (- (current-seconds) 900)) ;; force at least one clean up cycle (last-time-some-running (current-seconds)) ;; (tdbdat (tasks:open-db)) (misc-data (make-hash-table)) ;; use as needed (runsdat (make-runs:dat ;; hed: hed ;; tal: tal ;; reg: reg ;; reruns: reruns reglen: reglen regfull: #f ;; regfull
︙			︙
1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541	;; (if (> (current-seconds)(+ last-time-incomplete 900)) (begin (set! last-time-incomplete (current-seconds)) ;; (rmt:find-and-mark-incomplete-all-runs) )) ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns) (let* ((test-record (hash-table-ref test-records hed)) (test-name (tests:testqueue-get-testname test-record)) (tconfig (tests:testqueue-get-testconfig test-record)) (jobgroup (configf:lookup tconfig "test_meta" "jobgroup")) (testmode (let ((m (configf:lookup tconfig "requirements" "mode"))) (if m (map string->symbol (string-split m)) '(normal))))	> > > > > >	1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570	;; (if (> (current-seconds)(+ last-time-incomplete 900)) (begin (set! last-time-incomplete (current-seconds)) ;; (rmt:find-and-mark-incomplete-all-runs) )) ;; WAIT FOR TIME ON TIGHT LOOP (if (< (- (current-milliseconds)(hash-table-ref/default misc-data "tight-loop-last-time" 0)) 100) ;; less than 1/10 second since we last came through the loop (thread-sleep! 0.1)) ;; wait 1/10 second (hash-table-set! misc-data "tight-loop-last-time" (current-milliseconds)) ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns) (let* ((test-record (hash-table-ref test-records hed)) (test-name (tests:testqueue-get-testname test-record)) (tconfig (tests:testqueue-get-testconfig test-record)) (jobgroup (configf:lookup tconfig "test_meta" "jobgroup")) (testmode (let ((m (configf:lookup tconfig "requirements" "mode"))) (if m (map string->symbol (string-split m)) '(normal))))
︙			︙
1563 1564 1565 1566 1567 1568 1569 ~~1570~~ 1571 1572 1573 1574 1575 1576 1577	)) extras) extras) '()))) (waitons (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?)) (newtal (append tal (list hed))) (regfull (>= (length reg) reglen)) ~~(num-running (rmt:get-count-tests-running-for-run-id run-id ~~#t)) ;; fastmode=yes~~~~ (testdat (make-runs:testdat hed: hed tal: tal reg: reg reruns: reruns test-record: test-record test-name: test-name	\|	1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606	)) extras) extras) '()))) (waitons (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?)) (newtal (append tal (list hed))) (regfull (>= (length reg) reglen)) (num-running (rmt:get-count-tests-running-for-run-id run-id)) (testdat (make-runs:testdat hed: hed tal: tal reg: reg reruns: reruns test-record: test-record test-name: test-name
︙			︙
1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726	;; wait for load here (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) (- remtries 1))))))) ))))) ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running (if loop-list (apply loop loop-list))))	> > >	1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758	;; wait for load here (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) (- remtries 1))))))) ))))) ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed (let ((waited (runs:wait-if-seen-recently 5 "prereqs-not-met" hed item-path))) ;; if we've been down this path in the past 5 seconds - wait out the difference (if (> waited 0)(debug:print 0 default-log-port "Waited for prereqs-not-met-"hed"-"item-path" for " waited "seconds."))) (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed (runs:dat-can-run-more-tests-set! 
runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running (if loop-list (apply loop loop-list))))
︙			︙
1829 1830 1831 1832 1833 1834 1835 ~~1836~~ 1837 1838 1839 1840 1841 1842 1843 1844 ~~1845 1846~~ 1847 1848 ~~1849~~ 1850 1851 1852 1853 1854 1855 ~~1856~~ 1857 1858 1859 1860 1861 1862 1863	(rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now if -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle ~~(let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id ~~#t)) ;; fastmode=yes~~~~ (prev-num-running 0)) ;; (debug:print-info 13 default-log-port "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup configdat "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 default-log-port "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) ~~(if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fast~~mode=~~no~~ (debug:print-info 0 default-log-port "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) ~~(set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!~~ (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 default-log-port "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 
5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) ~~(wait-loop (rmt:get-count-tests-running-for-run-id run-id ~~#t) ;; fastmode=yes~~~~ num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 default-log-port "Calling Post Hook") ;; (runs:run-post-hook run-id) (debug:print-info 1 default-log-port "All tests launched")))	\| > \| \| \| > \|	1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897	(rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now if -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) (prev-num-running 0)) ;; (debug:print-info 13 default-log-port "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup configdat "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 default-log-port "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (- (current-seconds)(hash-table-ref/default find-and-mark-incomplete-last-run run-id 0)) 900) ;; (begin(if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running num-running)) ;; (rmt:get-count-tests-running-for-run-id run-id))) ;; why call it again? (debug:print-info 0 default-log-port "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". 
Running as pid " (current-process-id) " on " (get-host-name)) ;; (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (hash-table-set! find-and-mark-incomplete-last-run run-id (current-seconds)) (debug:print-info 0 default-log-port "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 default-log-port "Calling Post Hook") ;; (runs:run-post-hook run-id) (debug:print-info 1 default-log-port "All tests launched")))
︙			︙
2253 2254 2255 2256 2257 2258 2259 ~~2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278~~ 2279 2280 2281 2282 2283 2284 2285	(path-out (conc "/" (string-intersperse (take dparts (- (length dparts) 1)) "/"))) ) path-out ) ) ~~;; (define (runs:remove-all-but-last-n-runs-per-target target-patts runpatt num-to-keep)~~ ~~;; (let ((data (runs:get-all-but-most-recent-n-per-target target-patts runpatt num-to-keep)))~~ ~~;; (for-each~~ ~~;; (lambda (target)~~ ~~;; (let ((runs-to-remove (hash-table-ref data target )))~~ ~~;; (for-each~~ ~~;; (lambda (run)~~ ~~;; (print "megatest -remove-runs -target " target " -runname " (simple-run-runname run) " -testpatt %"))~~ ~~;; runs-to-remove)))~~ ~~;; (hash-table-keys data))))~~ ~~;; Remove runs~~ ~~;; fields are passing in through~~ ~~;; action:~~ ~~;; 'remove-runs~~ ~~;; 'set-state-status~~ ;; ~~;; NB// should pass in keys?~~ ;; (define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(mode #f)(options '())) (common:clear-caches) ;; clear all caches (let* ((db #f) ;; (tdbdat (tasks:open-db)) (keys (rmt:get-keys)) (rundat (mt:get-runs-by-patt keys runnamepatt target)) (header (vector-ref rundat 0))	< < < < < < < < < < < < < < < < < < <	2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300	(path-out (conc "/" (string-intersperse (take dparts (- (length dparts) 1)) "/"))) ) path-out ) ) (define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(mode #f)(options '())) (common:clear-caches) ;; clear all caches (let* ((db #f) ;; (tdbdat (tasks:open-db)) (keys (rmt:get-keys)) (rundat (mt:get-runs-by-patt keys runnamepatt target)) (header (vector-ref rundat 0))
︙			︙
2509 2510 2511 2512 2513 2514 2515 ~~2516~~ 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 ~~2529~~ 2530 2531 2532 2533 2534 2535 2536	(loop new-test-dat tal) (loop (car tal)(append tal (list new-test-dat))))) (begin (let ((rundir (db:test-get-rundir new-test-dat))) (if (and (not (string= rundir "/tmp/badname")) (file-exists? rundir) (substring-index run-name rundir) ~~(~~sub~~strin~~g-index~~ target rundir)~~ ) (begin (set! lasttpath (db:test-get-rundir new-test-dat)) ;; remember this path for run removal (set! lastrealpath (remove-last-path-directory (resolve-pathname lasttpath))) (hash-table-set! run-paths-hash lastrealpath 1) (runs:remove-test-directory new-test-dat mode) ;; 'remove-all) ) (begin (debug:print 2 default-log-port "Not removing directory " rundir " because either it doesn't exist or has a bad name") (debug:print 2 default-log-port "Is /tmp/badname: " (string= rundir "/tmp/badname")) (debug:print 2 default-log-port "Exists: " (file-exists? rundir)) (debug:print 2 default-log-port "Has run-name: " (substring-index run-name rundir)) ~~(debug:print 2 default-log-port "Has target: " (~~sub~~strin~~g-index~~ target rundir))~~ ;;PJH remove record from db no need to cleanup directory (case mode ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) (db:test-get-state test)(db:test-get-status test) #f)) ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVED" #f #f)) (else (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test)))) )	\| \| > >	2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553	(loop new-test-dat tal) (loop (car tal)(append tal (list new-test-dat))))) (begin (let ((rundir (db:test-get-rundir new-test-dat))) (if (and (not (string= rundir "/tmp/badname")) (file-exists? 
rundir) (substring-index run-name rundir) (tests:glob-like-match (conc "%/" target "/%") rundir) ) (begin (set! lasttpath (db:test-get-rundir new-test-dat)) ;; remember this path for run removal (set! lastrealpath (remove-last-path-directory (resolve-pathname lasttpath))) (hash-table-set! run-paths-hash lastrealpath 1) (runs:remove-test-directory new-test-dat mode) ;; 'remove-all) ) (begin (debug:print 2 default-log-port "Not removing directory " rundir " because either it doesn't exist or has a bad name") (debug:print 2 default-log-port "Is /tmp/badname: " (string= rundir "/tmp/badname")) (debug:print 2 default-log-port "Exists: " (file-exists? rundir)) (debug:print 2 default-log-port "Has run-name: " (substring-index run-name rundir)) (debug:print 2 default-log-port "Has target: " (tests:glob-like-match (conc "%/" target "/%") rundir)) (debug:print 2 default-log-port "Target: " target) ;;PJH remove record from db no need to cleanup directory (case mode ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) (db:test-get-state test)(db:test-get-status test) #f)) ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVED" #f #f)) (else (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test)))) )
︙			︙