Megatest: Check-in [ddd390fb9a]

Overview

Comment:	Trial 1. Merge all good stuff with minor changes back to last working node.
Downloads:	Tarball \| ZIP archive \| SQL archive
Timelines:	family \| ancestors \| descendants \| both \| v1.65-cleanup-try-1
Files:	files \| file ages \| folders
SHA1:	ddd390fb9a1ee9fb983d9738466e42e752a24cbb
User & Date:	matt on 2020-10-04 18:05:02
Other Links:	branch diff \| manifest \| tags

Context

2020-10-04
19:05		Trial 2. Backout (current-seconds) to 0 as it was for fallback log age. Leaf check-in: da95f1afe5 user: matt tags: v1.65-cleanup-try-2
18:05		Trial 1. Merge all good stuff with minor changes back to last working node. Closed-Leaf check-in: ddd390fb9a user: matt tags: v1.65-cleanup-try-1
2020-08-26
21:14		Added instrumentation for server start throttle. ==/3.18/0.6/PASS/1201/orion/== check-in: 56b3986bbb user: mrwellan tags: v1.65-cleanup

Changes

Hide Diffs Unified Diffs Ignore Whitespace Patch

Modified TODO from [da5eae4898] to [0885dee1e5].

Modified common.scm from [2890b41a2a] to [33c7316880].

Modified dashboard.scm from [935bf4d2df] to [59b903a7ca].

Modified db.scm from [f437d94aa0] to [2c7b396933].

Modified http-transport.scm from [6328aa908d] to [67489ed9ab].

Modified launch.scm from [4a656b9137] to [625d099d51].

Modified rmt.scm from [39d97c528a] to [29d7593e43].

Modified runs.scm from [96a5410a91] to [3376b43bbb].

Modified server.scm from [1606531507] to [5d96649d59].

Modified tests.scm from [0094b671e6] to [9525e7e2a6].

︙			︙
541 542 543 544 545 546 547 ~~548~~ 549 550 551 552 553 554 555	(if (and (> file-age (* (string->number (or (configf:lookup configdat "setup" "log-expire-days") "30")) 24 3600)) (file-exists? fullname)) ;; just in case it was gzipped - will get it next time (handle-exceptions exn #f (if (directory? fullname) (begin ~~(debug:print-~~error~~ 0 default-log-port fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")~~ (inc-stat "directories")) (begin (delete-file* fullname) (inc-stat "deleted"))) (hash-table-delete! all-files file))))))) '() "logs")	\|	541 542 543 544 545 546 547 548 549 550 551 552 553 554 555	(if (and (> file-age (* (string->number (or (configf:lookup configdat "setup" "log-expire-days") "30")) 24 3600)) (file-exists? fullname)) ;; just in case it was gzipped - will get it next time (handle-exceptions exn #f (if (directory? fullname) (begin (debug:print-info 0 default-log-port fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") (inc-stat "directories")) (begin (delete-file* fullname) (inc-stat "deleted"))) (hash-table-delete! all-files file))))))) '() "logs")
︙			︙
565 566 567 568 569 570 571 ~~572~~ 573 574 575 576 577 578 579	(lambda (a b) (< (hash-table-ref all-files a)(hash-table-ref all-files b)))) (- num-logs max-allowed)))) (for-each (lambda (file) (let* ((fullname (conc "logs/" file))) (if (directory? fullname) ~~(debug:print-~~error~~ 0 default-log-port fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")~~ (handle-exceptions exn (debug:print-error 0 default-log-port "failed to remove " fullname ", exn=" exn) (delete-file* fullname))))) files) (debug:print-info 0 default-log-port "Deleted " (length files) " files from logs, keeping " max-allowed " files."))))))	\|	565 566 567 568 569 570 571 572 573 574 575 576 577 578 579	(lambda (a b) (< (hash-table-ref all-files a)(hash-table-ref all-files b)))) (- num-logs max-allowed)))) (for-each (lambda (file) (let* ((fullname (conc "logs/" file))) (if (directory? fullname) (debug:print-info 0 default-log-port fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.") (handle-exceptions exn (debug:print-error 0 default-log-port "failed to remove " fullname ", exn=" exn) (delete-file* fullname))))) files) (debug:print-info 0 default-log-port "Deleted " (length files) " files from logs, keeping " max-allowed " files."))))))
︙			︙
1764 1765 1766 1767 1768 1769 1770 ~~1771~~ 1772 ~~1773 1774 1775 1776 1777 1778 1779 1780 1781 1782~~ ~~1783~~ 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 ~~1805~~ 1806 1807 1808 1809 1810 1811 1812 1813 ~~1814~~ 1815 1816 1817 ~~1818 1819 1820 1821 1822~~ 1823 ~~1824~~ 1825 1826 1827 1828 1829 1830 1831	;; calculate a delay number based on a droop curve ;; inputs are: ;; - load-in, load as from uptime, NOT normalized ;; - numcpus, number of cpus, ideally use the real cpus, not threads ;; (define (common:get-delay load-in numcpus) (let* ((ratio (/ load-in numcpus)) ~~(new-option (configf:lookup configdat "load" "new-load-method")))~~ (if new-option (begin (cond ((and (>= ratio 0) (< ratio .5)) 0 ) ((and (>= ratio 0.5) (<= ratio .9)) (* ratio (/ 5 .9))) ((and (> ratio .9) (<= ratio 1.1)) (+ 5 (* (- ratio .9) (/ 55 .2)))) ((> ratio 1.1) 60))) ~~(max (/ (expt 5 (* 4 ratio)) 10) 0))))~~ (define (get-cpu-load #!key (remote-host #f)) (car (common:get-cpu-load remote-host))) ;; (let* ((load-res (process:cmd-run->list "uptime")) ;; (load-rx (regexp "load average:\\s+(\\d+)")) ;; (cpu-load #f)) ;; (for-each (lambda (l) ;; (let ((match (string-search load-rx l))) ;; (if match ;; (let ((newval (string->number (cadr match)))) ;; (if (number? newval) ;; (set! cpu-load newval)))))) ;; (car load-res)) ;; cpu-load)) ;; get values from cached info from dropping file in logs dir ;; e.g. key is host and dtype is normalized-load ;; (define (common:get-cached-info key dtype #!key (age 10)) (if toppath ~~(let* ((fullpath (conc toppath "/.sysdata/" key "-" dtype ".log")))~~ (if (and (file-exists? fullpath) (file-read-access? fullpath)) (handle-exceptions exn (begin (debug:print 0 default-log-port "failed to get cached info from " fullpath ", exn=" exn) #f) (debug:print 2 default-log-port "reading file " fullpath) ~~(let ((real-age (- (current-seconds)~~(file-change-time fullpath))))~~~~ (if (< real-age age) (handle-exceptions exn (begin ~~(debug:print-info 1 default-log-port " removing bad file " fullpath ", exn=" exn)~~ (del~~ete-file* fullpath~~) #f) ~~(with-input-from-file fullpath read~~)) (begin ~~(debug:print-info 2 default-log-port "file " fullpath ~~" is too old (" real-age" seconds) to trust, skipping reading it")~~~~ #f)))) (begin (debug:print 2 default-log-port "not reading file " fullpath) #f))) #f)) (define (common:write-cached-info key dtype dat)	\| > > > \| \| \| < \| \| \| \| \| \| > > > \| > > > > > > > > \| > > > > \| > > > > > > > > > > \| < \| \| \| \| >	1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858	;; calculate a delay number based on a droop curve ;; inputs are: ;; - load-in, load as from uptime, NOT normalized ;; - numcpus, number of cpus, ideally use the real cpus, not threads ;; (define (common:get-delay load-in numcpus) (let* ((ratio (/ load-in numcpus)) (new-option (configf:lookup configdat "load" "new-load-method")) (paramstr (or (configf:lookup configdat "load" "exp-params") "15 12 1281453987.9543 0.75")) ;; 5 4 10 1")) (paramlst (map string->number (string-split paramstr)))) (if new-option (begin (cond ((and (>= ratio 0) (< ratio .5)) 0) ((and (>= ratio 0.5) (<= ratio .9)) (* ratio (/ 5 .9))) ((and (> ratio .9) (<= ratio 1.1)) (+ 5 (* (- ratio .9) (/ 55 .2)))) ((> ratio 1.1) 60))) (match paramlst ((r1 r2 s1 s2) (debug:print 3 default-log-port "Using params r1=" r1 " r2=" r2 " s1=" s1 " s2=" s2) (min (max (/ (expt r1 (* r2 s2 ratio)) s1) 0) 30)) (else (debug:print 0 default-log-port "BAD exp-params, should be \"r1 r2 s1 s2\" but got " paramstr) 30))))) (define (common:print-delay-table) (let loop ((x 0)) (print x "," (common:get-delay x 1)) (if (< x 2) (loop (+ x 0.1))))) (define (get-cpu-load #!key (remote-host #f)) (car (common:get-cpu-load remote-host))) ;; (let* ((load-res (process:cmd-run->list "uptime")) ;; (load-rx (regexp "load average:\\s+(\\d+)")) ;; (cpu-load #f)) ;; (for-each (lambda (l) ;; (let ((match (string-search load-rx l))) ;; (if match ;; (let ((newval (string->number (cadr match)))) ;; (if (number? newval) ;; (set! cpu-load newval)))))) ;; (car load-res)) ;; cpu-load)) ;; get values from cached info from dropping file in logs dir ;; e.g. key is host and dtype is normalized-load ;; (define (common:get-cached-info key dtype #!key (age 10)) (if toppath (let* ((fullpath (conc toppath "/.sysdata/" key "-" dtype ".log")) (delfile (lambda () (debug:print-info 1 default-log-port " removing bad file " fullpath ", exn=" exn) (delete-file* fullpath) #f))) (if (and (file-exists? fullpath) (file-read-access? fullpath)) (handle-exceptions exn (begin (debug:print 0 default-log-port "failed to get cached info from " fullpath ", exn=" exn) #f) (debug:print 2 default-log-port "reading file " fullpath) (let ((real-age (- (current-seconds) (handle-exceptions exn (begin (debug:print 1 default-log-port "Failed to read mod time on file " fullpath ", using 0, exn=" exn) 0) (file-change-time fullpath))))) (if (< real-age age) (handle-exceptions exn (delfile) (let* ((res (with-input-from-file fullpath read))) (if (eof-object? res) (begin (delfile) #f) res))) (begin (debug:print-info 2 default-log-port "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") #f)))) (begin (debug:print 2 default-log-port "not reading file " fullpath) #f))) #f)) (define (common:write-cached-info key dtype dat)
︙			︙
2087 2088 2089 2090 2091 2092 2093 2094 2095 ~~2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116~~ 2117 2118 2119 2120 2121 2122 2123	(define (common:wait-for-homehost-load maxnormload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (common:get-homehost))) (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxnormload msg hh))) (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ~~;; hosts had better not be changing the number of cpus too often!~~ (let* ((proc (lambda () (let loop ((numcpu 0) (inl (read-line))) (if (eof-object? inl) (if (> numcpu 0) numcpu #f) ;; if zero return #f so caller knows that things are not working (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl) (+ numcpu 1) numcpu) (read-line)))))) (result (if remote-host (with-input-from-pipe (conc "ssh " remote-host " cat /proc/cpuinfo") proc) (with-input-from-file "/proc/cpuinfo" proc)))) (if (and (number? result) (> result 0)) (common:write-cached-info actual-host "num-cpus" result)) result)))) ;; wait for normalized cpu load to drop below maxload ;; (define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5)) (let ((num-cpus (common:get-num-cpus remote-host))) (if num-cpus (common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host)	> > > \| \| \| \| \| \| \| \| \| \| \| \| \| \| \| \| \| \| \| \| \| > >	2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155	(define (common:wait-for-homehost-load maxnormload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (common:get-homehost))) (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxnormload msg hh))) (define numcpus-cache (make-hash-table)) (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) ;; hosts had better not be changing the number of cpus too often! (or (hash-table-ref/default numcpus-cache actual-host #f) (let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) (let* ((proc (lambda () (let loop ((numcpu 0) (inl (read-line))) (if (eof-object? inl) (if (> numcpu 0) numcpu #f) ;; if zero return #f so caller knows that things are not working (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl) (+ numcpu 1) numcpu) (read-line)))))) (result (if remote-host (with-input-from-pipe (conc "ssh " remote-host " cat /proc/cpuinfo") proc) (with-input-from-file "/proc/cpuinfo" proc)))) (if (and (number? result) (> result 0)) (common:write-cached-info actual-host "num-cpus" result)) result)))) (hash-table-set! numcpus-cache actual-host numcpus) numcpus)))) ;; wait for normalized cpu load to drop below maxload ;; (define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5)) (let ((num-cpus (common:get-num-cpus remote-host))) (if num-cpus (common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host)
︙			︙
2153 2154 2155 2156 2157 2158 2159 ~~2160~~ 2161 2162 2163 2164 2165 2166 2167	(effective-load (common:get-intercept first next)) (recommended-delay (common:get-delay effective-load numcpus)) (effective-host (or remote-host "localhost")) (normalized-effective-load (/ effective-load numcpus)) (will-wait (> normalized-effective-load maxnormload))) (if (> recommended-delay 0) (let* ((actual-delay (min recommended-delay 30))) ~~(debug:print-info 0 default-log-port "Load control, delaying " ~~actual-delay " seconds to maintain safe load.")~~~~ (thread-sleep! actual-delay))) (cond ;; bad data, try again to get the data ((not will-wait) (if (common:low-noise-print 30 (conc (round normalized-effective-load) "-load-acceptable-" effective-host)) (debug:print 0 default-log-port "Effective load on " effective-host " is acceptable at " effective-load " continuing.")))	> \| > >	2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202	(effective-load (common:get-intercept first next)) (recommended-delay (common:get-delay effective-load numcpus)) (effective-host (or remote-host "localhost")) (normalized-effective-load (/ effective-load numcpus)) (will-wait (> normalized-effective-load maxnormload))) (if (> recommended-delay 0) (let* ((actual-delay (min recommended-delay 30))) (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load")) (debug:print-info 0 default-log-port "Load control, delaying " actual-delay " seconds to maintain safe load. current normalized effective load is " normalized-effective-load".")) (thread-sleep! actual-delay))) (cond ;; bad data, try again to get the data ((not will-wait) (if (common:low-noise-print 30 (conc (round normalized-effective-load) "-load-acceptable-" effective-host)) (debug:print 0 default-log-port "Effective load on " effective-host " is acceptable at " effective-load " continuing.")))
︙			︙

︙			︙
210 211 212 213 214 215 216 217 218 219 220 221 222 223	;; (define (dboard:common-set-tabdat! commondat tabnum tabdat) (hash-table-set! (dboard:commondat-tabdats commondat) tabnum tabdat)) ;; gets and calls updater list based on curr-tab-num ;; (define (dboard:common-run-curr-updaters commondat #!key (tab-num #f)) (if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat (let* ((tnum (or tab-num (dboard:commondat-curr-tab-num commondat))) (updaters (hash-table-ref/default (dboard:commondat-updaters commondat) tnum	> >	210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225	;; (define (dboard:common-set-tabdat! commondat tabnum tabdat) (hash-table-set! (dboard:commondat-tabdats commondat) tabnum tabdat)) (define updater-running #f) ;; move this into one of the stucts ;; gets and calls updater list based on curr-tab-num ;; (define (dboard:common-run-curr-updaters commondat #!key (tab-num #f)) (if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat (let* ((tnum (or tab-num (dboard:commondat-curr-tab-num commondat))) (updaters (hash-table-ref/default (dboard:commondat-updaters commondat) tnum
︙			︙
238 239 240 241 242 243 244 ~~245~~ 246 247 248 249 250 251 252	(curr-updaters (hash-table-ref/default (dboard:commondat-updaters commondat) tnum '()))) (hash-table-set! (dboard:commondat-updaters commondat) tnum (cons updater curr-updaters)))) ;; data for each specific tab goes here ;; ~~(defstruct dboard:tabdat~~ ;; runs ((allruns '()) : list) ;; list of dboard:rundat records ((allruns-by-id (make-hash-table)) : hash-table) ;; hash of run-id -> dboard:rundat records ((done-runs '()) : list) ;; list of runs already drawn ((not-done-runs '()) : list) ;; list of runs not yet drawn (header #f) ;; header for decoding the run records (keys #f) ;; keys for this run (i.e. target components)	\| > >	240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256	(curr-updaters (hash-table-ref/default (dboard:commondat-updaters commondat) tnum '()))) (hash-table-set! (dboard:commondat-updaters commondat) tnum (cons updater curr-updaters)))) ;; data for each specific tab goes here ;; (defstruct dboard:tabdat ;; flags ((already-running #f) : boolean) ;; the updater is already running. skip ;; runs ((allruns '()) : list) ;; list of dboard:rundat records ((allruns-by-id (make-hash-table)) : hash-table) ;; hash of run-id -> dboard:rundat records ((done-runs '()) : list) ;; list of runs already drawn ((not-done-runs '()) : list) ;; list of runs not yet drawn (header #f) ;; header for decoding the run records (keys #f) ;; keys for this run (i.e. target components)
︙			︙
643 644 645 646 647 648 649 ~~650~~ 651 652 653 654 655 656 657	;; ;; NOTE: Yes, this is used ;; (define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals) (let* ((start-time (current-seconds)) (access-mode (dboard:tabdat-access-mode tabdat)) (num-to-get (string->number (or (configf:lookup configdat "setup" "num-tests-to-get") ~~"200")))~~ (states (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat))) (statuses (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat))) (do-not-use-db-file-timestamps #t) ;; (configf:lookup configdat "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab (do-not-use-query-timestamps #t) ;; (configf:lookup configdat "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab (sort-info (get-curr-sort)) (sort-by (vector-ref sort-info 1)) (sort-order (vector-ref sort-info 2))	\|	647 648 649 650 651 652 653 654 655 656 657 658 659 660 661	;; ;; NOTE: Yes, this is used ;; (define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals) (let* ((start-time (current-seconds)) (access-mode (dboard:tabdat-access-mode tabdat)) (num-to-get (string->number (or (configf:lookup configdat "setup" "num-tests-to-get") "50"))) ;; was 200, which is fine in a normal run area. (states (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat))) (statuses (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat))) (do-not-use-db-file-timestamps #t) ;; (configf:lookup configdat "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab (do-not-use-query-timestamps #t) ;; (configf:lookup configdat "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab (sort-info (get-curr-sort)) (sort-by (vector-ref sort-info 1)) (sort-order (vector-ref sort-info 2))
︙			︙
714 715 716 717 718 719 720 721 722 723 724 725 726 727	;; ;; (debug:print 0 default-log-port "got-all: " got-all " multi-get: " multi-get " num-to-get: " num-to-get " (length tmptests): " (length tmptests) " db-modified: " db-modified " db-mod-time: " db-mod-time " db-path: " db-path) (if got-all (begin (dboard:rundat-last-update-set! run-dat (- start-time 2)) (dboard:rundat-run-data-offset-set! run-dat 0)) (begin (dboard:rundat-run-data-offset-set! run-dat (+ num-to-get (dboard:rundat-run-data-offset run-dat))))) (for-each (lambda (tdat) (let ((test-id (db:test-get-id tdat)) (state (db:test-get-state tdat)))	>	718 719 720 721 722 723 724 725 726 727 728 729 730 731 732	;; ;; (debug:print 0 default-log-port "got-all: " got-all " multi-get: " multi-get " num-to-get: " num-to-get " (length tmptests): " (length tmptests) " db-modified: " db-modified " db-mod-time: " db-mod-time " db-path: " db-path) (if got-all (begin (dboard:rundat-last-update-set! run-dat (- start-time 2)) (dboard:rundat-run-data-offset-set! run-dat 0)) (begin ;;; (thread-sleep! 0.25) ;; give the rest of the gui some time to update. <-- this did NOT help (dboard:rundat-run-data-offset-set! run-dat (+ num-to-get (dboard:rundat-run-data-offset run-dat))))) (for-each (lambda (tdat) (let ((test-id (db:test-get-id tdat)) (state (db:test-get-state tdat)))
︙			︙
831 832 833 834 835 836 837 ~~838~~ 839 840 841 842 843 844 845	;; this calls dboard:get-tests-for-run-duplicate for each run ;; ;; create a virtual table of all the tests ;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") ) ;; (define (dboard:update-rundat tabdat runnamepatt numruns testnamepatt keypatts) ~~~~(let* (~~(access-mode (dboard:tabdat-access-mode tabdat))~~ (keys (dboard:tabdat-keys tabdat)) ;; (db:dispatch-query access-mode rmt:get-keys db:get-keys))) (last-runs-update (- (dboard:tabdat-last-runs-update tabdat) 2)) (allruns (rmt:get-runs runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts)) ;;(allruns-tree (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f)) (allruns-tree (rmt:get-runs-by-patt keys "%" #f #f #f #f 0)) ;; last-runs-update));;'("id" "runname") (header (db:get-header allruns)) (runs (db:get-rows allruns)) ;; RA => Filtered as per runpatt selected	> > \|	836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852	;; this calls dboard:get-tests-for-run-duplicate for each run ;; ;; create a virtual table of all the tests ;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") ) ;; (define (dboard:update-rundat tabdat runnamepatt numruns testnamepatt keypatts) (dboard:tabdat-already-running-set! tabdat #t) (let* (;; (already-running (dboard:tabdat-already-running tabdat)) (access-mode (dboard:tabdat-access-mode tabdat)) (keys (dboard:tabdat-keys tabdat)) ;; (db:dispatch-query access-mode rmt:get-keys db:get-keys))) (last-runs-update (- (dboard:tabdat-last-runs-update tabdat) 2)) (allruns (rmt:get-runs runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts)) ;;(allruns-tree (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f)) (allruns-tree (rmt:get-runs-by-patt keys "%" #f #f #f #f 0)) ;; last-runs-update));;'("id" "runname") (header (db:get-header allruns)) (runs (db:get-rows allruns)) ;; RA => Filtered as per runpatt selected
︙			︙
899 900 901 902 903 904 905 ~~906~~ 907 908 909 910 ~~911 912 913 914 915~~ 916 917 918 919 920 921 ~~922~~ 923 924 925 926 927 928 929	(elapsed-time (- (current-seconds) start-time))) (if (null? all-test-ids) (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id) (hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct)) (if (or (null? tal) (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data should be loaded since get-tests-for-run uses last update (begin ~~(when (> elapsed-time 2)~~ (debug:print 0 default-log-port "NOTE: updates are taking a long time, " elapsed-time "s elapsed.") (let* ((old-val (iup:attribute tim "TIME")) (new-val (number->string (inexact->exact (floor (* 2 (string->number old-val))))))) (if (< (string->number new-val) 5000) ~~((debug:print 0 default-log-port "NOTE: increasing poll interval from "old-val" to "new-val) (iup:attribute-set! tim "TIME" new-val)))) )~~ (dboard:tabdat-allruns-set! tabdat new-res) maxtests) (if (> (dboard:rundat-run-data-offset run-struct) 0) (loop run tal new-res newmaxtests) ;; not done getting data for this run (loop (car tal)(cdr tal) new-res newmaxtests))))))) (dboard:tabdat-filters-changed-set! tabdat #f) ~~(dboard:update-tree tabdat runs-hash header tb)))~~ (define collapsed (make-hash-table)) (define (toggle-hide lnum uidat) ; fulltestname) (let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum)) (fulltestname (iup:attribute btn "TITLE")) (parts (string-split fulltestname "("))	\| > \| \| < < < \| >	906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935	(elapsed-time (- (current-seconds) start-time))) (if (null? all-test-ids) (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id) (hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct)) (if (or (null? tal) (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data should be loaded since get-tests-for-run uses last update (begin #;(when (> elapsed-time 2) (debug:print 0 default-log-port "NOTE: updates are taking a long time, " elapsed-time "s elapsed.") (let* ((old-val (iup:attribute tim "TIME")) (new-val (number->string (inexact->exact (floor (* 2 (string->number old-val))))))) (if (< (string->number new-val) 5000) (begin (debug:print 0 default-log-port "NOTE: increasing poll interval from "old-val" to "new-val) (iup:attribute-set! tim "TIME" new-val))))) (dboard:tabdat-allruns-set! tabdat new-res) maxtests) (if (> (dboard:rundat-run-data-offset run-struct) 0) (loop run tal new-res newmaxtests) ;; not done getting data for this run (loop (car tal)(cdr tal) new-res newmaxtests))))))) (dboard:tabdat-filters-changed-set! tabdat #f) (dboard:update-tree tabdat runs-hash header tb) (dboard:tabdat-already-running-set! tabdat #f))) (define collapsed (make-hash-table)) (define (toggle-hide lnum uidat) ; fulltestname) (let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum)) (fulltestname (iup:attribute btn "TITLE")) (parts (string-split fulltestname "("))
︙			︙
2503 2504 2505 2506 2507 2508 2509 ~~2510 2511 2512 2513~~ 2514 2515 2516 2517 2518 2519 2520	sort-lb))) ) ;; insert extra widget here (if extra-widget extra-widget (iup:hbox)) ;; empty widget ))) (let* ((status-toggles (map (lambda (status) (iup:toggle (conc status) #:fontsize 8 ;; btn-fontsz ;; "10" ;; #:expand "HORIZONTAL" #:action (lambda (obj val)	< < < <	2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522	sort-lb))) ) ;; insert extra widget here (if extra-widget extra-widget (iup:hbox)) ;; empty widget ))) (let* ((status-toggles (map (lambda (status) (iup:toggle (conc status) #:fontsize 8 ;; btn-fontsz ;; "10" ;; #:expand "HORIZONTAL" #:action (lambda (obj val)
︙			︙
3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 ~~3738 3739 3740 3741 3742 3743 3744 3745 3746 3747~~ 3748 3749 3750 3751 3752 3753 3754	;; removing the tabdat-values proc ;; ;; (define (tabdat-values tabdat) ;; runs update-rundat using the various filters from the gui ;; (define (dashboard:do-update-rundat tabdat) (dboard:update-rundat tabdat (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%") (dboard:tabdat-numruns tabdat) (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%") ;; generate key patterns from the target stored in tabdat (let* ((dbkeys (dboard:tabdat-dbkeys tabdat))) (let ((fres (if (dboard:tabdat-target tabdat) (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%")))) (map (lambda (k v)(list k v)) dbkeys ptparts)) (let ((res '())) (for-each (lambda (key) (if (not (equal? key "runname")) (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f))) (if val (set! res (cons (list key val) res)))))) dbkeys) res)))) fres)))) (define (dashboard:runs-tab-updater commondat tab-num) (debug:catch-and-dump (lambda () (let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num)) (dbkeys (dboard:tabdat-dbkeys tabdat))) (dashboard:do-update-rundat tabdat)	> > > > > > > \| \| \| \| \| \| \| \| \| \| >	3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764	;; removing the tabdat-values proc ;; ;; (define (tabdat-values tabdat) ;; runs update-rundat using the various filters from the gui ;; (define (dashboard:do-update-rundat tabdat) ;; this seems like a good place to check for already running and skip if so ;; ;; (set! updater-running #t) ;;(if (dboard:tabdat-already-running tabdat) ;; (begin ;; (debug:print-info 0 default-log-port "Dashboard overloaded - updates will be slow, skipping update.") ;; (dboard:tabdat-target tabdat)) (dboard:update-rundat tabdat (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%") (dboard:tabdat-numruns tabdat) (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%") ;; generate key patterns from the target stored in tabdat (let* ((dbkeys (dboard:tabdat-dbkeys tabdat))) (let ((fres (if (dboard:tabdat-target tabdat) (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%")))) (map (lambda (k v)(list k v)) dbkeys ptparts)) (let ((res '())) (for-each (lambda (key) (if (not (equal? key "runname")) (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f))) (if val (set! res (cons (list key val) res)))))) dbkeys) res)))) fres))) #;(set! updater-running #f)) (define (dashboard:runs-tab-updater commondat tab-num) (debug:catch-and-dump (lambda () (let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num)) (dbkeys (dboard:tabdat-dbkeys tabdat))) (dashboard:do-update-rundat tabdat)
︙			︙
3799 3800 3801 3802 3803 3804 3805 ~~3806~~ ~~3807 3808 3809 3810 3811 3812 3813~~ ~~3814~~ ~~3815 3816 3817~~ 3818 3819 3820 3821 3822 3823 3824	commondat (lambda () (dashboard:runs-tab-updater commondat 1)) tab-num: 2) (iup:callback-set! tim "ACTION_CB" (lambda (time-obj) ~~(let ((update~~-is~~-running ~~#f)~~)~~ (mutex-lock! (dboard:commondat-update-mutex commondat)) (set! update-is-running (dboard:commondat-updating commondat)) (if (not update-is-running) (dboard:commondat-updating-set! commondat #t)) (mutex-unlock! (dboard:commondat-update-mutex commondat)) (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update (begin ~~(dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)~~ ~~(mutex-lock! (dboard:commondat-update-mutex commondat)) (dboard:commondat-updating-set! commondat #f) (mutex-unlock! (dboard:commondat-update-mutex commondat)))~~ )) 1)))) (let ((th1 (make-thread (lambda () (thread-sleep! 1) (dboard:common-run-curr-updaters commondat 0) ;; force update of summary tab ) "update buttons once"))	\| > \| \| \| \| \| \| \| > \| > \| \| \|	3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837	commondat (lambda () (dashboard:runs-tab-updater commondat 1)) tab-num: 2) (iup:callback-set! tim "ACTION_CB" (lambda (time-obj) (if (not updater-running) (begin ;; (mutex-lock! (dboard:commondat-update-mutex commondat)) ;; (set! update-is-running (dboard:commondat-updating commondat)) ;;(if (not update-is-running) ;; (dboard:commondat-updating-set! commondat #t)) ;;(mutex-unlock! (dboard:commondat-update-mutex commondat)) ;;(if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update ;; (begin (set! updater-running #t) (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat) (set! updater-running #f) ;; (mutex-lock! (dboard:commondat-update-mutex commondat)) ;; (dboard:commondat-updating-set! commondat #f) ;; (mutex-unlock! (dboard:commondat-update-mutex commondat))) )) 1)))) (let ((th1 (make-thread (lambda () (thread-sleep! 1) (dboard:common-run-curr-updaters commondat 0) ;; force update of summary tab ) "update buttons once"))
︙			︙

︙			︙
457 458 459 460 461 462 463 ~~464~~ 465 466 467 468 469 470 471	exn (begin (print "Attempt to safely close sqlite3 db failed. Trying again. exn=" exn) (thread-sleep! 3) (sqlite3:interrupt! db) (db:safely-close-sqlite3-db db stmtcache try-num: (- try-num 1))) (if (sqlite3:database? db) ~~(let* ((stmts (hash-table-ref/default stmt-cache db #f)))~~ (if stmts (map sqlite3:finalize! (hash-table-values stmts))) (sqlite3:finalize! db) #t) #f)))) ;; close all opened run-id dbs (define (db:close-all dbstruct)	\|	457 458 459 460 461 462 463 464 465 466 467 468 469 470 471	exn (begin (print "Attempt to safely close sqlite3 db failed. Trying again. exn=" exn) (thread-sleep! 3) (sqlite3:interrupt! db) (db:safely-close-sqlite3-db db stmtcache try-num: (- try-num 1))) (if (sqlite3:database? db) (let* ((stmts (and stmt-cache (hash-table-ref/default stmt-cache db #f)))) (if stmts (map sqlite3:finalize! (hash-table-values stmts))) (sqlite3:finalize! db) #t) #f)))) ;; close all opened run-id dbs (define (db:close-all dbstruct)
︙			︙
824 825 826 827 828 829 830 ~~831 832 833 834 835 836~~ 837 838 839 840 841 842 843	(thread-sleep! delay-handicap) (debug:print-info 0 default-log-port "synthetic sync delay of "delay-handicap" seconds completed") ) ;; first pass implementation, just insert all changed rows (for-each (lambda (targdb) (let* ((db (db:dbdat-get-db targdb)) (drp-trigger (if (member "last_update" field-names) (db:drop-trigger db tablename) #f)) (is-trigger-dropped (if (member "last_update" field-names) (db:is-trigger-dropped db tablename) ~~#f))~~ (stmth (sqlite3:prepare db full-ins))) ;; (db:delay-if-busy targdb) ;; NO WAITING (if (member "last_update" field-names) (debug:print-info 3 default-log-port "is-trigger-dropped: " is-trigger-dropped)) (for-each (lambda (fromdat-lst) (sqlite3:with-transaction	\| \| \| \| \| \| >	824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844	(thread-sleep! delay-handicap) (debug:print-info 0 default-log-port "synthetic sync delay of "delay-handicap" seconds completed") ) ;; first pass implementation, just insert all changed rows (for-each (lambda (targdb) (let* ((db (db:dbdat-get-db targdb)) (drp-trigger (if (member "last_update" field-names) (db:drop-trigger db tablename) #f)) (is-trigger-dropped (if (member "last_update" field-names) (db:is-trigger-dropped db tablename) #f)) (stmth (sqlite3:prepare db full-ins))) ;; (db:delay-if-busy targdb) ;; NO WAITING (if (member "last_update" field-names) (debug:print-info 3 default-log-port "is-trigger-dropped: " is-trigger-dropped)) (for-each (lambda (fromdat-lst) (sqlite3:with-transaction
︙			︙
1222 1223 1224 1225 1226 1227 1228 ~~1229~~ 1230 1231 1232 1233 1234 ~~1235~~ 1236 1237 1238 1239 1240 ~~1241~~ 1242 1243 1244 1245 1246 ~~1247~~ 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 ~~1266~~ 1267 1268 ~~1269~~ 1270 1271 ~~1272 1273 1274~~ ~~1275 1276 1277 1278 1279 1280 1281~~ 1282 1283 ~~1284~~ ~~1285 1286~~ 1287 1288 ~~1289 1290 1291 1292~~ ~~1293 1294 1295~~ 1296 1297 1298 1299 1300 1301 1302	(define db:trigger-list (list (list "update_runs_trigger" "CREATE TRIGGER IF NOT EXISTS update_runs_trigger AFTER UPDATE ON runs FOR EACH ROW BEGIN UPDATE runs SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ) ~~(list "update_run_stats_trigger" "CREATE TRIGGER IF NOT EXISTS update_run_stats_trigger AFTER UPDATE ON run_stats~~ FOR EACH ROW BEGIN UPDATE run_stats SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ) ~~(list "update_tests_trigger" "CREATE TRIGGER IF NOT EXISTS update_tests_trigger AFTER UPDATE ON tests~~ FOR EACH ROW BEGIN UPDATE tests SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ) ~~(list "update_teststeps_trigger" "CREATE TRIGGER IF NOT EXISTS update_teststeps_trigger AFTER UPDATE ON test_steps~~ FOR EACH ROW BEGIN UPDATE test_steps SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ) ~~(list "update_test_data_trigger" "CREATE TRIGGER IF NOT EXISTS update_test_data_trigger AFTER UPDATE ON test_data~~ FOR EACH ROW BEGIN UPDATE test_data SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ))) (define (db:create-all-triggers dbstruct) (db:with-db dbstruct #f #f (lambda (db) (db:create-triggers db)))) (define (db:create-triggers db) (for-each (lambda (key) (sqlite3:execute db (cadr key))) db:trigger-list)) (define (db:drop-all-triggers dbstruct) ~~(db:with-db~~ dbstruct #f #f (lambda (db) ~~(db:drop-triggers db))))~~ (define (db:is-trigger-dropped db tbl-name) ~~(let* ((trigger-name (if (equal? tbl-name "test_steps") "update_teststeps_trigger" (conc "update_" tbl-name "_trigger"))))~~ (sqlite3:for-each-row (lambda (name) ~~;(p~~rint name) (set! res ~~(vector name~~))) db "~~select~~ name ~~from~~ sqlite_master ~~where~~ type = 'trigger' ;" ))) (define (db:drop-triggers db) ~~(for-each ~~(lambda (key)~~~~ ~~(sqlite3:execute db (conc "drop trigger " (car key)))) db:trigger-list))~~ (define (db:drop-trigger db tbl-name) ~~(let* ((trigger-name (if (equal? tbl-name "test_steps") "update_teststeps_trigger" (conc "update_" tbl-name "_trigger")))) (for-each ~~(lambda (key)~~~~ ~~(if (equal? (car key) trigger-name) (sqlite3:execute db (conc "drop trigger " trigger-name)))) db:trigger-list)))~~ (define (db:create-trigger db tbl-name) (let* ((trigger-name (if (equal? tbl-name "test_steps") "update_teststeps_trigger" (conc "update_" tbl-name "_trigger")))) (for-each (lambda (key) (if (equal? (car key) trigger-name)	\| \| \| \| \| \| \| \| \| > \| \| \| \| \| \| \| \| > \| \| \| \| \| \| > \| \| \|	1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306	(define db:trigger-list (list (list "update_runs_trigger" "CREATE TRIGGER IF NOT EXISTS update_runs_trigger AFTER UPDATE ON runs FOR EACH ROW BEGIN UPDATE runs SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ) (list "update_run_stats_trigger" "CREATE TRIGGER IF NOT EXISTS update_run_stats_trigger AFTER UPDATE ON run_stats FOR EACH ROW BEGIN UPDATE run_stats SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ) (list "update_tests_trigger" "CREATE TRIGGER IF NOT EXISTS update_tests_trigger AFTER UPDATE ON tests FOR EACH ROW BEGIN UPDATE tests SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ) (list "update_teststeps_trigger" "CREATE TRIGGER IF NOT EXISTS update_teststeps_trigger AFTER UPDATE ON test_steps FOR EACH ROW BEGIN UPDATE test_steps SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ) (list "update_test_data_trigger" "CREATE TRIGGER IF NOT EXISTS update_test_data_trigger AFTER UPDATE ON test_data FOR EACH ROW BEGIN UPDATE test_data SET last_update=(strftime('%s','now')) WHERE id=old.id; END;" ))) (define (db:create-all-triggers dbstruct) (db:with-db dbstruct #f #f (lambda (db) (db:create-triggers db)))) (define (db:create-triggers db) (for-each (lambda (key) (sqlite3:execute db (cadr key))) db:trigger-list)) (define (db:drop-all-triggers dbstruct) (db:with-db dbstruct #f #f (lambda (db) (db:drop-triggers db)))) (define (db:is-trigger-dropped db tbl-name) (let* ((trigger-name (if (equal? tbl-name "test_steps") "update_teststeps_trigger" (conc "update_" tbl-name "_trigger"))) (res #f)) (sqlite3:for-each-row (lambda (name) (if (equal? name trigger-name) (set! res #t))) db "SELECT name FROM sqlite_master WHERE type = 'trigger' ;" ))) (define (db:drop-triggers db) (for-each (lambda (key) (sqlite3:execute db (conc "drop trigger if exists " (car key)))) db:trigger-list)) (define (db:drop-trigger db tbl-name) (let* ((trigger-name (if (equal? tbl-name "test_steps") "update_teststeps_trigger" (conc "update_" tbl-name "_trigger")))) (for-each (lambda (key) (if (equal? (car key) trigger-name) (sqlite3:execute db (conc "drop trigger if exists " trigger-name)))) db:trigger-list))) (define (db:create-trigger db tbl-name) (let* ((trigger-name (if (equal? tbl-name "test_steps") "update_teststeps_trigger" (conc "update_" tbl-name "_trigger")))) (for-each (lambda (key) (if (equal? (car key) trigger-name)
︙			︙
1528 1529 1530 1531 1532 1533 1534 ~~1535~~ 1536 1537 1538 1539 1540 1541 1542	id INTEGER PRIMARY KEY, test_id INTEGER, state TEXT DEFAULT 'new', status TEXT DEFAULT 'n/a', archive_type TEXT DEFAULT 'bup', du INTEGER, archive_path TEXT);"))) ~~(print "creating trigges from init")~~ (db:create-triggers db) db)) ;; ) ;;====================================================================== ;; A R C H I V E S ;;======================================================================	\|	1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546	id INTEGER PRIMARY KEY, test_id INTEGER, state TEXT DEFAULT 'new', status TEXT DEFAULT 'n/a', archive_type TEXT DEFAULT 'bup', du INTEGER, archive_path TEXT);"))) ;; (print "creating trigges from init") (db:create-triggers db) db)) ;; ) ;;====================================================================== ;; A R C H I V E S ;;======================================================================
︙			︙
1770 1771 1772 1773 1774 1775 1776 ~~1777 1778 1779 1780~~ ~~1781~~ 1782 1783 1784 1785 1786 1787 1788	(debug:print 0 default-log-port "ERROR: cannot read " infile) (debug:print 0 default-log-port "ERROR: run-dir is " run-dir) #f ) (with-input-from-file infile read-lines) ))) ;; ~~select end_time-now from~~ ;; (~~select te~~st~~name,item_path,event_time+~~run_duration as ;; end~~_time,strftime('%s','now') as now from~~ test~~s where st~~at~~e in~~ ;; (~~'RUNNING','REMOTEHOSTSTART','LAUNCHED'~~)); (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) ;; The default running-deadtime is 720 seconds = 12 minutes. ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30)) (deadtime-trim (or ovr-deadtime (configf:lookup-number configdat "setup" "deadtime")))	> > \| \| \| \| > > > > > > > > > > > > > > > > \|	1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810	(debug:print 0 default-log-port "ERROR: cannot read " infile) (debug:print 0 default-log-port "ERROR: run-dir is " run-dir) #f ) (with-input-from-file infile read-lines) ))) ;; check duration against test-run.dat file if it exists and update the value in ;; the db if necessary ;; (define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration) (let* ((datf (conc run-dir ".mt_data/test-run.dat")) (modt (if (and (file-exists? datf) (file-read-access? datf)) (file-modification-time datf) #f)) ;; (+ event-time run-duration)))) (alt-run-duration (if modt (- modt event-time) #f))) (if (and alt-run-duration (> alt-run-duration run-duration)) (begin (debug:print 0 default-log-port "Test " test-id " run duration mismatch. Setting to " alt-run-duration) (db:with-db dbstruct #f #f (lambda (db) (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id) #t))) #f))) ;; #f = we did NOT adjust the time (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) ;; The default running-deadtime is 720 seconds = 12 minutes. ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30)) (deadtime-trim (or ovr-deadtime (configf:lookup-number configdat "setup" "deadtime")))
︙			︙
1822 1823 1824 1825 1826 1827 1828 ~~1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839~~ 1840 1841 ~~1842~~ 1843 1844 ~~1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855~~ 1856 1857 1858 1859 1860 1861 1862	;; HOWEVER: this code in run:test seems to work fine ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) ;; (db:test-get-run_duration testdat))) ;; 600) ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (begin (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (debug:print-info 0 default-log-port "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) (debug:print-info 0 default-log-port "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds) " event-time="event-time" run-duration="run-duration)))) stmth1 run-id running-deadtime) ;; default time 720 seconds (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (begin (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (debug:print-info 0 default-log-port "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin (debug:print-info 0 default-log-port "Found old test in REMOTEHOSTSTART state, test-id=" test-id " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time " run-duration="run-duration) (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) stmth2 run-id remotehoststart-deadtime) ;; default time 230 seconds ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config ;; ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row	> \| \| \| \| \| \| \| \| \| \| \| \| > \| \| \| \| \| \| \| \| \| \| \|	1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886	;; HOWEVER: this code in run:test seems to work fine ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) ;; (db:test-get-run_duration testdat))) ;; 600) ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)) (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (begin (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (debug:print-info 0 default-log-port "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) (debug:print-info 0 default-log-port "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds) " event-time="event-time" run-duration="run-duration))))) stmth1 run-id running-deadtime) ;; default time 720 seconds (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)) (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (begin (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (debug:print-info 0 default-log-port "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin (debug:print-info 0 default-log-port "Found old test in REMOTEHOSTSTART state, test-id=" test-id " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time " run-duration="run-duration) (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))) stmth2 run-id remotehoststart-deadtime) ;; default time 230 seconds ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config ;; ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row
︙			︙
2178 2179 2180 2181 2182 2183 2184 ~~2185 2186~~ 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 ~~2201~~ ~~2202 2203~~ 2204 2205 2206 2207 2208 2209 2210	(string->number res) #f))) (if newres newres res)) res))) ~~(define (db:no-sync-close-db db stmtcache) (db:safely-close-sqlite3-db db stmtcache))~~ ;; transaction protected lock aquisition ;; either: ;; fails returns (#f . lock-creation-time) ;; succeeds (returns (#t . lock-creation-time) ;; use (db:no-sync-del! db keyname) to release the lock ;; (define (db:no-sync-get-lock db-in keyname) (let ((db (db:no-sync-db db-in))) (sqlite3:with-transaction db (lambda () (handle-exceptions exn ~~(let ((lock-time (current-seconds)))~~ ~~(sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time) `(#t . ,lock-time))~~ `(#f . ,(sqlite3:first-result db "SELECT val FROM no_sync_metadat WHERE var=?;" keyname))))))) ;; use a global for some primitive caching, it is just silly to ;; re-read the db over and over again for the keys since they never ;; change	\| \| \| > \| \|	2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235	(string->number res) #f))) (if newres newres res)) res))) (define (db:no-sync-close-db db stmt-cache) (db:safely-close-sqlite3-db db stmt-cache)) ;; transaction protected lock aquisition ;; either: ;; fails returns (#f . lock-creation-time) ;; succeeds (returns (#t . lock-creation-time) ;; use (db:no-sync-del! db keyname) to release the lock ;; (define (db:no-sync-get-lock db-in keyname) (let ((db (db:no-sync-db db-in))) (sqlite3:with-transaction db (lambda () (handle-exceptions exn (let ((lock-time (current-seconds))) (debug:print-info 2 default-log-port "db:no-sync-get-lock keyname=" keyname ", lock-time=" lock-time ", exn=" exn) (sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time) `(#t . ,lock-time)) `(#f . ,(sqlite3:first-result db "SELECT val FROM no_sync_metadat WHERE var=?;" keyname))))))) ;; use a global for some primitive caching, it is just silly to ;; re-read the db over and over again for the keys since they never ;; change
︙			︙
2236 2237 2238 2239 2240 2241 2242 ~~2243~~ 2244 2245 2246 2247 2248 2249 2250	(let loop ((hed (car header)) (tal (cdr header)) (n 0)) (if (equal? hed field) (handle-exceptions exn (begin ~~(debug:print 0 default-log-port "WARNING: attempt to read non-existant field, row=" ~~row " header=" header " field=" field ", exn=" exn)~~~~ #f) (vector-ref row n)) (if (null? tal) #f (loop (car tal)(cdr tal)(+ n 1))))))) ;; Accessors for the header/data structure ;; get rows and header from (define (db:get-header vec)(vector-ref vec 0))	\| >	2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276	(let loop ((hed (car header)) (tal (cdr header)) (n 0)) (if (equal? hed field) (handle-exceptions exn (begin (debug:print 0 default-log-port "WARNING: attempt to read non-existant field, row=" row " header=" header " field=" field ", exn=" exn) #f) (vector-ref row n)) (if (null? tal) #f (loop (car tal)(cdr tal)(+ n 1))))))) ;; Accessors for the header/data structure ;; get rows and header from (define (db:get-header vec)(vector-ref vec 0))
︙			︙
3459 3460 3461 3462 3463 3464 3465 ~~3466~~ 3467 3468 3469 3470 3471 3472 3473	(let* ((run-ids (db:get-all-run-ids mtdb))) (for-each (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs))) run-ids))) ~~;; Get test data using test_id, run-id is not used~~ ;; (define (db:get-test-info-by-id dbstruct run-id test-id) (db:with-db dbstruct #f ;; run-id #f (lambda (db)	\|	3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499	(let* ((run-ids (db:get-all-run-ids mtdb))) (for-each (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs))) run-ids))) ;; Get test data using test_id, run-id is not used - but it will be! ;; (define (db:get-test-info-by-id dbstruct run-id test-id) (db:with-db dbstruct #f ;; run-id #f (lambda (db)
︙			︙
4477 4478 4479 4480 4481 4482 4483 ~~4484~~ 4485 4486 4487 4488 4489 4490 4491	(if dbdat (let* ((dbpath (db:dbdat-get-path dbdat)) (db (db:dbdat-get-db dbdat)) ;; we'll return this so (db:delay--if-busy can be called inline (dbfj (conc dbpath "-journal"))) (if (handle-exceptions exn (begin ~~(debug:print-info 0 default-log-port "WARNING: failed to test for existance of " dbfj)~~ (thread-sleep! 1) (db:delay-if-busy count (- count 1))) (common:file-exists? dbfj)) (case count ((6) (thread-sleep! 0.2) (db:delay-if-busy count: 5))	\|	4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517	(if dbdat (let* ((dbpath (db:dbdat-get-path dbdat)) (db (db:dbdat-get-db dbdat)) ;; we'll return this so (db:delay--if-busy can be called inline (dbfj (conc dbpath "-journal"))) (if (handle-exceptions exn (begin (debug:print-info 0 default-log-port "WARNING: failed to test for existance of " dbfj ", exn=" exn) (thread-sleep! 1) (db:delay-if-busy count (- count 1))) (common:file-exists? dbfj)) (case count ((6) (thread-sleep! 0.2) (db:delay-if-busy count: 5))
︙			︙

︙			︙
203 204 205 206 207 208 209 ~~210~~ 211 212 213 214 215 ~~216~~ 217 218 219 220 221 222 223	(round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) ~~(tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)~~ (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) ~~(common:telemetry-log "zombie" (conc "launch:monitor-job - ~~top of loop encountered at "(current-seconds)" with last-sync="last-sync))~~~~ (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds	\| \| > >	203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225	(round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - ;; top of loop encountered at "(current-seconds)" with ;; last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds
︙			︙
231 232 233 234 235 236 237 ~~238~~ 239 240 241 242 243 244 245 246 ~~247~~ 248 249 250 251 252 253 ~~254 255 256 257 258 259 260~~ 261 262 263 264 265 266 267	(do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") ~~(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)~~ (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 default-log-port "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (~~when~~ do-sync ~~;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)~~ ~~;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))~~ ~~(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))~~ (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) ~~(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))~~ (if kill-job? (begin (debug:print-info 0 default-log-port "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries?	\| \| \| < < < \| < <	233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264	(do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 default-log-port "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (if do-sync ;; save meta data about the running of this test (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (debug:print-info 0 default-log-port "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries?
︙			︙
310 311 312 313 314 315 316 ~~317~~ 318 319 320 321 322 323 324	(begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) ~~(tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional~~ (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) ;;(bb-check-path msg: "launch:execute incoming")	\|	307 308 309 310 311 312 313 314 315 316 317 318 319 320 321	(begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) ;;(bb-check-path msg: "launch:execute incoming")
︙			︙
1302 1303 1304 1305 1306 1307 1308 ~~1309 1310~~ 1311 1312 1313 1314 1315 1316 1317	(begin ;; (let ((lnktarget (conc lnkpath "/" item-path))) (debug:print 2 default-log-port "Setting up sub test run area") (debug:print 2 default-log-port " - creating run area in " test-path) (handle-exceptions exn (begin (debug:print-error 0 default-log-port " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ~~", ~~exi~~ting, exn=" exn) ~~(exit 1))~~~~ (create-directory test-path #t)) (debug:print 2 default-log-port " - creating link from: " test-path "\n" " to: " lnktarget) ;; If there is already a symlink delete it and recreate it. (handle-exceptions	\| < >	1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314	(begin ;; (let ((lnktarget (conc lnkpath "/" item-path))) (debug:print 2 default-log-port "Setting up sub test run area") (debug:print 2 default-log-port " - creating run area in " test-path) (handle-exceptions exn (begin (debug:print-error 0 default-log-port " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn) ", continuing (might cause downstream issues?), exn=" exn) #f) (create-directory test-path #t)) (debug:print 2 default-log-port " - creating link from: " test-path "\n" " to: " lnktarget) ;; If there is already a symlink delete it and recreate it. (handle-exceptions
︙			︙

︙			︙
20 21 22 23 24 25 26 27 28 29 30 31 32 33	(use format typed-records) ;; RADT => purpose of json format?? (declare (unit rmt)) (declare (uses api)) (declare (uses http-transport)) (include "common_records.scm") ;; (declare (uses rmtmod)) ;; (import rmtmod) ;; ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! ;;	> >	20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35	(use format typed-records) ;; RADT => purpose of json format?? (declare (unit rmt)) (declare (uses api)) (declare (uses http-transport)) (include "common_records.scm") (include "db_records.scm") ;; (declare (uses rmtmod)) ;; (import rmtmod) ;; ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! ;;
︙			︙
523 524 525 526 527 528 529 ~~530~~ 531 532 533 ~~534~~ 535 536 537 538 539 540 541	;; Just some syntatic sugar (define (rmt:register-test run-id test-name item-path) (rmt:general-call 'register-test run-id run-id test-name item-path)) (define (rmt:get-test-id run-id testname item-path) (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) ~~;; run-id is NOT used~~ ;; (define (rmt:get-test-info-by-id run-id test-id) (if (number? test-id) ~~(rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))~~ (begin (debug:print 0 default-log-port "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) (print-call-chain (current-error-port)) #f))) (define (rmt:test-get-rundir-from-test-id run-id test-id) (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))	\| \| > > > > > > > > >	525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552	;; Just some syntatic sugar (define (rmt:register-test run-id test-name item-path) (rmt:general-call 'register-test run-id run-id test-name item-path)) (define (rmt:get-test-id run-id testname item-path) (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) ;; run-id is NOT used - but it will be! ;; (define (rmt:get-test-info-by-id run-id test-id) (if (number? test-id) (let* ((testdat (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))) (trundatf (conc (db:test-get-rundir testdat) "/.mt_data/test-run.dat"))) ;; now we can update a couple fields from the filesystem (if (and (db:test-get-rundir testdat) (file-exists? trundatf)) (let* ((duration (db:test-get-run_duration testdat)) (event-time (db:test-get-event_time testdat)) (last-touch (file-modification-time trundatf))) (db:test-set-run_duration! testdat (max duration (- last-touch event-time))))) testdat) (begin (debug:print 0 default-log-port "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) (print-call-chain (current-error-port)) #f))) (define (rmt:test-get-rundir-from-test-id run-id test-id) (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))
︙			︙

︙			︙
200 201 202 203 204 205 206 ~~207 208 209~~ 210 211 212 213 214 215 216 217 218 219 ~~220 221 222 223 224~~ 225 226 227 228 229 230 231 232 233 234 235 ~~236 237 238 239 240 241~~ 242 243 244 245 246 247 248	;; otherwise attempt to create the logs dir and then ;; continue (if (if (directory-exists? (conc areapath "/logs")) '() (if (file-write-access? areapath) (begin (condition-case (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 default-log-port "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 default-log-port "ERROR: Unknown error attemtping to get server list."))) (directory-exists? (conc areapath "/logs"))) '())) (let* ((server-logs (glob (conc areapath "/logs/server-.log"))) (num-serv-logs (length server-logs))) (if (null? server-logs) '() (let loop ((hed (car server-logs)) (tal (cdr server-logs)) (res '())) (let ((mod-time (handle-exceptions exn (begin (print "failed to get modification time on " hed ", exn=" exn) (current-seconds)) ;; 0 (file-modification-time hed))) ;; default to very old so log gets ignored if deleted (down-time (- (current-seconds) mod-time)) (serv-dat (if (or (< num-serv-logs 10) (< down-time 900)) ;; day-seconds)) (server:logf-get-start-info hed) '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at (serv-rec (cons mod-time serv-dat)) (fmatch (string-match fname-rx hed)) (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) (new-res (if (null? serv-dat) res (cons (append serv-rec (list pid)) res)))) (if (null? tal) (if (and limit (> (length new-res) limit)) new-res ;; (take new-res limit) <= need intelligent sorting before this will work new-res) (loop (car tal)(cdr tal) new-res))))))))) (define (server:get-num-alive srvlst) (let ((num-alive 0)) (for-each (lambda (server) (match-let (((mod-time host port start-time pid) server))	\| \| \| \| \| \| \| \| \| \| \| \| \| \|	200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248	;; otherwise attempt to create the logs dir and then ;; continue (if (if (directory-exists? (conc areapath "/logs")) '() (if (file-write-access? areapath) (begin (condition-case (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 default-log-port "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 default-log-port "ERROR: Unknown error attemtping to get server list. exn=" exn))) (directory-exists? (conc areapath "/logs"))) '())) (let* ((server-logs (glob (conc areapath "/logs/server-.log"))) (num-serv-logs (length server-logs))) (if (null? server-logs) '() (let loop ((hed (car server-logs)) (tal (cdr server-logs)) (res '())) (let ((mod-time (handle-exceptions exn (begin (print "failed to get modification time on " hed ", exn=" exn) (current-seconds)) ;; 0 (file-modification-time hed))) ;; default to very old so log gets ignored if deleted (down-time (- (current-seconds) mod-time)) (serv-dat (if (or (< num-serv-logs 10) (< down-time 900)) ;; day-seconds)) (server:logf-get-start-info hed) '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at (serv-rec (cons mod-time serv-dat)) (fmatch (string-match fname-rx hed)) (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) (new-res (if (null? serv-dat) res (cons (append serv-rec (list pid)) res)))) (if (null? tal) (if (and limit (> (length new-res) limit)) new-res ;; (take new-res limit) <= need intelligent sorting before this will work new-res) (loop (car tal)(cdr tal) new-res))))))))) (define (server:get-num-alive srvlst) (let ((num-alive 0)) (for-each (lambda (server) (match-let (((mod-time host port start-time pid) server))
︙			︙
272 273 274 275 276 277 278 ~~279 280 281 282~~ 283 284 285 286 287 288 289	(> (length rec) 2)) (let ((start-time (list-ref rec 3)) (mod-time (list-ref rec 0))) ;; (print "start-time: " start-time " mod-time: " mod-time) (and start-time mod-time (> (- now start-time) 0) ;; been running at least 0 seconds (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds (< (- now start-time) (+ (- (string->numbe~~r (o~~r (configf:lookup configdat "server" "runtime~~") "3600~~")) 180) (random 360))) ;; under one hour running time +/- 180 )) #f)) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) (if (> (length slst) nums)	> \| \| \| \|	272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290	(> (length rec) 2)) (let ((start-time (list-ref rec 3)) (mod-time (list-ref rec 0))) ;; (print "start-time: " start-time " mod-time: " mod-time) (and start-time mod-time (> (- now start-time) 0) ;; been running at least 0 seconds (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds (or (not (configf:lookup configdat "server" "runtime")) ;; skip if not set (< (- now start-time) (+ (- (string->number (configf:lookup configdat "server" "runtime")) 180) (random 360)))) ;; under one hour running time +/- 180 )) #f)) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) (if (> (length slst) nums)
︙			︙
319 320 321 322 323 324 325 ~~326~~ 327 328 ~~329 330 331~~ ~~332 333 334~~ ~~335~~ 336 ~~337 338 339 340~~ 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 ~~361~~ 362 363 364 365 366 367 368 369	(let ((sig (server:mk-signature))) (set! my-client-signature sig) my-client-signature))) ;; wait for server=start-last to be three seconds old ;; (define (server:wait-for-server-start-last-flag areapath) ~~(let* ((start-flag (conc areapath "/logs/server-start-last")))~~ (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) ~~(reftime (+ 3 (random 5)))~~ (delta (- (current-seconds) fmodtime))) ~~(if~~ (> delta reftime) ~~;; good enough~~ ~~(begin (debug:print-info 0 default-log-port "Re~~ady to~~ s~~tar~~t server, l~~ast start~~: " ~~fmodtime ", delta: " delta ", reftime: " reftime)~~~~ ~~(system (conc "touch " start-flag))) ;; lazy but safe~~ (begin ~~(thread-sleep! 5) (server:wait-for-server-start-last-flag areapath)))) ~~(system (conc "touch " start-flag)))))~~~~ ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least 3 seconds old (server:wait-for-server-start-last-flag areapath) (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((last-run-dat (hash-table-ref/default server-kind-run areapath '(0 0))) ;; callnum, whenrun (call-num (car last-run-dat)) (when-run (cadr last-run-dat)) (run-delay (+ (case call-num ((0) 0) ((1) 20) ((2) 300) (else 600)) (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously (lock-file (conc areapath "/logs/server-start.lock"))) (if (> (- (current-seconds) when-run) run-delay) ~~(~~begin~~~~ (common:simple-file-lock-and-wait lock-file expire-time: 15) (server:run areapath) (thread-sleep! 2) ;; don't release the lock for at least a few seconds (common:simple-file-release-lock lock-file))) (hash-table-set! server-kind-run areapath (list (+ call-num 1)(current-seconds)))))) ;; this one seems to be the general entry point ;;	\| > > > < \| \| > > \| \| < > > > > > > > > > > > \| > > \| \| < < \| >	320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385	(let ((sig (server:mk-signature))) (set! my-client-signature sig) my-client-signature))) ;; wait for server=start-last to be three seconds old ;; (define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last")) ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) (reftime (configf:lookup-number configdat "server" "idletime" default: 4)) (server-key (conc (get-host-name) "-" (current-process-id)))) (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) (delta (- (current-seconds) fmodtime)) (all-go (> delta reftime))) (if (handle-exceptions exn (begin (debug:print-info 0 default-log-port "Something not right with soft server locking: exn=" exn) #t) (and all-go (begin (with-output-to-file start-flag (lambda () (print server-key))) (thread-sleep! 0.25) (let ((res (with-input-from-file start-flag (lambda () (read-line))))) (equal? server-key res))))) #t ;; (system (conc "touch " start-flag)) ;; lazy but safe (begin (debug:print-info 0 default-log-port "Gating server start, last start: " fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go) (thread-sleep! reftime) (server:wait-for-server-start-last-flag areapath))))))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least 3 seconds old (server:wait-for-server-start-last-flag areapath) (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((last-run-dat (hash-table-ref/default server-kind-run areapath '(0 0))) ;; callnum, whenrun (call-num (car last-run-dat)) (when-run (cadr last-run-dat)) (run-delay (+ (case call-num ((0) 0) ((1) 20) ((2) 300) (else 600)) (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously (lock-file (conc areapath "/logs/server-start.lock"))) (if (> (- (current-seconds) when-run) run-delay) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 15) (system (conc "touch " start-flag)) ;; lazy but safe (server:run areapath) (thread-sleep! 2) ;; don't release the lock for at least a few seconds (common:simple-file-release-lock lock-file))) (hash-table-set! server-kind-run areapath (list (+ call-num 1)(current-seconds)))))) ;; this one seems to be the general entry point ;;
︙			︙
572 573 574 575 576 577 578 579 580 ~~581~~ ~~582~~ 583 584 585 ~~586~~ 587 588 589 590 591 592 593	(debug:print 0 default-log-port "INFO: syncer thread NOT sleeping ; maybe time-to-exit..."))) (if (not (configf:lookup configdat "server" "disable-db-snapshot")) (common:snapshot-file mtdbfile subdir: ".db-snapshot")) (delete-file* staging-file) (let* ((start-time (current-milliseconds)) (res (system sync-cmd)) (res2 (cond ~~((eq? 0 res)~~ ~~(delete-file* ~~(conc mtdbfile ".~~backup"))~~ (if (eq? 0 (file-size sync-log)) (delete-file sync-log)) (system (conc "/bin/mv " staging-file " " mtdbfile)) (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) (set! off-time (calculate-off-time last-sync-seconds (cond ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1)) duty-cycle) (else	> \| > > > > > > \| > \|	588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617	(debug:print 0 default-log-port "INFO: syncer thread NOT sleeping ; maybe time-to-exit..."))) (if (not (configf:lookup configdat "server" "disable-db-snapshot")) (common:snapshot-file mtdbfile subdir: ".db-snapshot")) (delete-file* staging-file) (let* ((start-time (current-milliseconds)) (res (system sync-cmd)) (dbbackupfile (conc mtdbfile ".backup")) (res2 (cond ((eq? 0 res ) (handle-exceptions exn (begin (debug:print-info 0 default-log-port "Problem making backup db file, exn=" exn) #t) (if (file-exists? dbbackupfile) (delete-file* dbbackupfile) ) (if (eq? 0 (file-size sync-log)) (delete-file sync-log)) (system (conc "/bin/mv " staging-file " " mtdbfile)) ) (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) (set! off-time (calculate-off-time last-sync-seconds (cond ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1)) duty-cycle) (else
︙			︙

︙			︙
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28	;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;; strftime('%m/%d/%Y %H:%M:%S','now','localtime') (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) ~~posix-extras directory-utils pathname-expand typed-records format sxml-serializer ~~sxml-modifications)~~~~ (declare (unit runs)) (declare (uses db)) (declare (uses common)) (declare (uses items)) (declare (uses runconfig)) (declare (uses tests))	\| >	14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29	;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;; strftime('%m/%d/%Y %H:%M:%S','now','localtime') (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) posix-extras directory-utils pathname-expand typed-records format sxml-serializer sxml-modifications matchable) (declare (unit runs)) (declare (uses db)) (declare (uses common)) (declare (uses items)) (declare (uses runconfig)) (declare (uses tests))
︙			︙
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60	;; (defstruct runs:dat reglen regfull runname max-concurrent-jobs run-id test-patts required-tests test-registry registry-mutex flags keyvals run-info all-tests-registry ~~can-run-more-tests~~ ((can-run-more-tests-count 0) : fixnum) (last-fuel-check 0) ;; time when we last checked fuel (beginning-of-time (current-seconds)) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup waitons testmode newtal itemmaps prereqs-not-met)	\| > > > > >	44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66	;; (defstruct runs:dat reglen regfull runname max-concurrent-jobs run-id test-patts required-tests test-registry registry-mutex flags keyvals run-info all-tests-registry ;; stores results from last runs:can-run-more-tests (can-run-more-tests #f) ;; (list can-run-more-flag num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit) ((can-run-more-tests-count 0) : fixnum) (last-fuel-check 0) ;; time when we last checked fuel (beginning-of-time (current-seconds)) (load-mgmt-function #f) (wait-for-jobs-function #f) (last-load-check-time 0) (last-jobs-check-time 0) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup waitons testmode newtal itemmaps prereqs-not-met)
︙			︙
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101	((< age 10) #t) (else #f)))) lock-files))) (if fresh-locks (begin (if (runs:lownoise "runners-softlock-wait" 360) (debug:print-info 0 default-log-port "Other runners in flight, giving up some time...")) ~~(thread-sleep! 10))~~ (begin (if (runs:lownoise "runners-softlock-nowait" 360) (debug:print-info 0 default-log-port "No runners in flight, updating softlock")) (let* ((ouf (open-output-file my-lock-file))) (with-output-to-port ouf (lambda ()(print (current-seconds)))) (close-output-port ouf))))	\|	93 94 95 96 97 98 99 100 101 102 103 104 105 106 107	((< age 10) #t) (else #f)))) lock-files))) (if fresh-locks (begin (if (runs:lownoise "runners-softlock-wait" 360) (debug:print-info 0 default-log-port "Other runners in flight, giving up some time...")) (thread-sleep! 2)) (begin (if (runs:lownoise "runners-softlock-nowait" 360) (debug:print-info 0 default-log-port "No runners in flight, updating softlock")) (let* ((ouf (open-output-file my-lock-file))) (with-output-to-port ouf (lambda ()(print (current-seconds)))) (close-output-port ouf))))
︙			︙
311 312 313 314 315 316 317 ~~318 319 320 321 322 323 324 325 326 327 328 329 330 331 332~~ 333 334 335 336 337 338 339	(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) ~~(thread-sleep! (cond ;; BB: check with Matt. Should this sleep move~~ ~~;; to cond clauses below where we determine we~~ ~~;; have too many jobs running rather than each~~ ~~;; time the and condition above is true (which~~ ~~;; seems like always)?~~ ~~((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way~~ 0) ~~((>~~ (runs:dat-~~can-r~~un~~-more-tests-count~~ runsdat) ~~20) ;; o~~r~~igi~~n~~al intent was - save cycles, wait a~~ lo~~ng time~~ ~~(if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 default-log-port "waiting for tasks to complete, sleeping briefly ..."))~~ ~~10) ;; obviously haven't had any work to do for a while~~ ~~(else 0)))~~ ~~;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?~~ ~~;; (configf:lookup-number configdat "setup" "inter-test-delay" default: 0.01)~~ ~~;; )))~~ (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup configdat "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0)	< < < < < < < \| < < < < < < \|	317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332	(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup configdat "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0)
︙			︙
537 538 539 540 541 542 543 ~~544~~ 545 546 547 548 549 550 551	(debug:print-info 0 default-log-port "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests) ));; tests will be ANDed with this list ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") ~~(common:telemetry-log "run-tests"~~ payload: `( (target . ,target) (run-name . ,runname) (test-patts . ,test-patts) ) ) ;; Now generate all the tests lists	\|	530 531 532 533 534 535 536 537 538 539 540 541 542 543 544	(debug:print-info 0 default-log-port "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests) ));; tests will be ANDed with this list ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") #;(common:telemetry-log "run-tests" payload: `( (target . ,target) (run-name . ,runname) (test-patts . ,test-patts) ) ) ;; Now generate all the tests lists
︙			︙
989 990 991 992 993 994 995 ~~996~~ 997 ~~998~~ 999 1000 1001 1002 1003 1004 1005	((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider (null? prereq-fails) (null? non-completed)) (debug:print-info 4 default-log-port "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) ~~(debug:print-info 0 default-log-port "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default seen-cant-run-tests hed 0)) ;;~~ ;; getting here likely means the system is way overloaded, kill a full minute before continuing ~~(thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)~~ ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 default-log-port "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))	\| \| > > > >	982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002	((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider (null? prereq-fails) (null? non-completed)) (debug:print-info 4 default-log-port "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) (debug:print-info 0 default-log-port "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default seen-cant-run-tests hed 0) ", going to wait 60 sec.") ;; ;; getting here likely means the system is way overloaded, kill a full minute before continuing ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!! ;; No runsdat, can't do this yet ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) ;; (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 default-log-port "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))
︙			︙
1121 1122 1123 1124 1125 1126 1127 ~~1128~~ 1129 1130 1131 1132 1133 1134 1135	(if (vector? t) (conc (db:test-get-state t) "/" (db:test-get-status t)) (conc " WARNING: t is not a vector=" t ))) prereqs-not-met) ", ") ") fails: " fails "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) (if (and (not (null? prereqs-not-met)) (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 2 default-log-port "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) ;; Don't know at this time if the test have been launched at some time in the past ;; i.e. is this a re-launch?	> > > > > > > > > > > > \| > > > > > >	1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150	(if (vector? t) (conc (db:test-get-state t) "/" (db:test-get-status t)) (conc " WARNING: t is not a vector=" t ))) prereqs-not-met) ", ") ") fails: " fails "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) ;; well, first lets see if cpu load throttling is enabled. If so wait around until the ;; average cpu load is under the threshold before continuing ;; (if (runs:dat-load-mgmt-function runsdat) ((runs:dat-load-mgmt-function runsdat)) (runs:dat-load-mgmt-function-set! runsdat (lambda () ;; jobtools maxload is useful for where the full Megatest run is done on one machine (if (and (not (common:on-homehost?)) maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues (if maxhomehostload (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))))) (if (and (not (null? prereqs-not-met)) (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 2 default-log-port "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) ;; Don't know at this time if the test have been launched at some time in the past ;; i.e. is this a re-launch?
︙			︙
1210 1211 1212 1213 1214 1215 1216 ~~1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229~~ 1230 1231 1232 1233 1234 ~~1235~~ 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 ~~1256~~ 1257 1258 1259 1260 ~~1261 1262 1263 1264~~ 1265 1266 1267 1268 1269 1270 1271	(null? non-completed) (not (member 'exclusive testmode))))) ;; (hash-table-delete! max-tries-hash (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! max-tries-hash (make-hash-table)) ~~;; well, first lets see if cpu load throttling is enabled. If so wait around until the~~ ~~;; average cpu load is under the threshold before continuing~~ ~~;; jobtools maxload is useful for where the full Megatest run is done on one machine~~ ~~(if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized~~ ~~(common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))~~ ~~;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues~~ ~~(if maxhomehostload~~ ~~(common:wait-for-homehost-load maxhomehostload~~ ~~(conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))~~ (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! global-delta) (if (or (not (null? tal))(not (null? reg))) ~~(runs:loop-values tal reg reglen regfull reruns)~~ #f)) ;; must be we have unmet prerequisites ;; (else (debug:print 4 default-log-port "FAILS: " fails) ;; If one or more of the prereqs-not-met are FAIL then we can issue ;; a message and drop hed from the items to be processed. ;; (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) (if (and (not (null? prereqs-not-met)) (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 1 default-log-port "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) (if (or (null? fails) (member 'toplevel testmode)) (begin ;; couldn't run, take a breather (if (runs:lownoise "Waiting for more work to do..." 60) (debug:print-info 0 default-log-port "Waiting for more work to do...")) ~~(thread-sleep! 1)~~ (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (begin (let ((my-test-id (rmt:get-test-id run-id test-name item-path))) ~~(mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2"))~~ (if (or (not (null? reg))(not (null? tal))) (if (vector? hed) (begin (debug:print 1 default-log-port "WARNING: Dropping test " test-name "/" item-path " from the launch list as it has prerequistes that are FAIL") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))	< < < < < < < < < < < < \| \| > > \| \| < < <	1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273	(null? non-completed) (not (member 'exclusive testmode))))) ;; (hash-table-delete! max-tries-hash (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! max-tries-hash (make-hash-table)) (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! global-delta) (if (or (not (null? tal))(not (null? reg))) (runs:loop-values tal reg reglen regfull reruns) ;; hed should be dropped at this time #f)) ;; must be we have unmet prerequisites ;; (else (debug:print 4 default-log-port "FAILS: " fails) ;; If one or more of the prereqs-not-met are FAIL then we can issue ;; a message and drop hed from the items to be processed. ;; (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) (if (and (not (null? prereqs-not-met)) (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 1 default-log-port "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) (if (or (null? fails) (member 'toplevel testmode)) (begin ;; couldn't run, take a breather (if (runs:lownoise "Waiting for more work to do..." 60) (debug:print-info 0 default-log-port "Waiting for more work to do...")) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (begin (let ((my-test-id (rmt:get-test-id run-id test-name item-path))) (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2")) (if (or (not (null? reg))(not (null? tal))) (if (vector? hed) (begin (debug:print 1 default-log-port "WARNING: Dropping test " test-name "/" item-path " from the launch list as it has prerequistes that are FAIL") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))
︙			︙
1579 1580 1581 1582 1583 1584 1585 ~~1586 1587 1588 1589 1590 1591 1592 1593 1594~~ 1595 1596 1597 1598 1599 1600 1601	testmode: testmode newtal: newtal itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) ~~;; -- removed BB 17ww28 - no longer needed.~~ ~~;; every 15 minutes verify the server is there for this run~~ ~~;; (if (and (common:low-noise-print 240 "try start server" run-id)~~ ;; (not (or (and runremote ~~;; (remote-server-url runremote)~~ ~~;; (server:ping (remote-server-url runremote)))~~ ~~;; (server:check-if-running toppath))))~~ ~~;; (server:kind-run toppath))~~ (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup configdat "setup" "give-up-waiting") 36000))) (hash-table-set! max-tries-hash tfullname (+ (hash-table-ref/default max-tries-hash tfullname 0) 1))) ;; (debug:print 0 default-log-port "max-tries-hash: " (hash-table->alist max-tries-hash))	< < < < < < < < <	1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594	testmode: testmode newtal: newtal itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup configdat "setup" "give-up-waiting") 36000))) (hash-table-set! max-tries-hash tfullname (+ (hash-table-ref/default max-tries-hash tfullname 0) 1))) ;; (debug:print 0 default-log-port "max-tries-hash: " (hash-table->alist max-tries-hash))
︙			︙
1634 1635 1636 1637 1638 1639 1640 ~~1641~~ 1642 1643 1644 1645 1646 1647 1648	"\n num-retries: " num-retries "\n reruns: " reruns "\n regfull: " regfull "\n reglen: " reglen "\n length reg: " (length reg) ) ~~(runs:parallel-runners-mgmt runsdat)~~ ;; check for hed in waitons => this would be circular, remove it and issue an ;; error (if (member test-name waitons) (begin (debug:print-error 0 default-log-port "test " test-name " has listed itself as a waiton, please correct this!") (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))	\|	1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641	"\n num-retries: " num-retries "\n reruns: " reruns "\n regfull: " regfull "\n reglen: " reglen "\n length reg: " (length reg) ) ;; (runs:parallel-runners-mgmt runsdat) ;; check for hed in waitons => this would be circular, remove it and issue an ;; error (if (member test-name waitons) (begin (debug:print-error 0 default-log-port "test " test-name " has listed itself as a waiton, please correct this!") (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))
︙			︙
1670 1671 1672 1673 1674 1675 1676 ~~1677~~ 1678 ~~1679 1680~~ ~~1681~~ 1682 ~~1683~~ ~~1684~~ ~~1685~~ 1686 1687 1688 1689 1690 1691 1692	;; ((not items) (debug:print-info 4 default-log-port "cond branch - " "rtq-2") (debug:print-info 4 default-log-port "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) ~~(runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))~~ ~~;; Th~~is w~~ould ~~be a good place to blo~~ck ~~on homehost load~~~~ ~~(runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ~~;; look at the test jobgroup and tot jobs running~~~~ ~~(let ((loop-list (runs:process-expanded-tests runsdat testdat)))~~ (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated (not itemdat)) ;; and not yet expanded into the list of things to be done (debug:print-info 4 default-log-port "cond branch - " "rtq-3")	\| > > > > > > > > > > > > > > > > > > > > > > > > \| \| > > > > > > > > > > > > > > > \| > > > > > > > \| > \| > \|	1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733	;; ((not items) (debug:print-info 4 default-log-port "cond branch - " "rtq-2") (debug:print-info 4 default-log-port "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) ;;(runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; gonna try a strategy change here. ;; ;; check if can run more tests. if yes, continue, if no, rest until can run more ;; look at the test jobgroup and tot jobs running ;; ;; NOTE: This does NOT actually gate here, only captures the proc to be called later ;; (if (not (runs:dat-wait-for-jobs-function runsdat)) (runs:dat-wait-for-jobs-function-set! runsdat (lambda (testdat-in) (let* ((jobgroup (runs:testdat-jobgroup testdat-in)) (can-run-more-tests (runs:dat-can-run-more-tests runsdat)) (last-jobs-check-time (runs:dat-last-jobs-check-time runsdat)) (should-check-jobs (match can-run-more-tests ((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params) (if (< (- max-concurrent-jobs num-running) 25) (begin (debug:print-info 0 default-log-port "less than 20 jobs headroom, ("max-concurrent-jobs "-"num-running")>20. Forcing prelaunch check.") #t) #f)) (else #f)))) ;; no record yet (if should-check-jobs (let loop-can-run-more ((res (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) (remtries 1440)) ;; we can wait for up to two hours for jobs to get done (match res ((run-more num-running . rem) (if (or run-more (< remtries 1)) (begin (if (runs:lownoise "num-running" 30) (debug:print-info 0 default-log-port "Have "num-running" tests of max " max-concurrent-jobs)) (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through (begin (if (runs:lownoise "num-running" 10) (debug:print-info 0 default-log-port "Can't run more tests, have "num-running" tests of " max-concurrent-jobs " allowed.")) (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable ;; wait for load here (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) (- remtries 1))))))) ))))) ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated (not itemdat)) ;; and not yet expanded into the list of things to be done (debug:print-info 4 default-log-port "cond branch - " "rtq-3")
︙			︙
1741 1742 1743 1744 1745 1746 1747 ~~1748~~ 1749 ~~1750 1751~~ 1752 1753 ~~1754 1755 1756~~ 1757 1758 1759 1760 1761 1762 1763	(loop (car tal)(cdr tal) reg reruns))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS ((or (procedure? items)(eq? items 'have-procedure)) (debug:print-info 4 default-log-port "cond branch - " "rtq-4") ~~(let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))~~ (if (and (list? can-run-more) (car can-run-more)) (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup ~~max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here~~ (if loop-list (apply loop loop-list) ~~(debug:print-info 4 default-log-port " -- Can't expand hed="hed) ) )~~ ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) (debug:print-info 4 default-log-port "cond branch - " "rtq-5") (debug:print-error 0 default-log-port "Should not have a list of items in a test and the itemspath set - please report this")	\| \| \| > > > \| < <	1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805	(loop (car tal)(cdr tal) reg reruns))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS ((or (procedure? items)(eq? items 'have-procedure)) (debug:print-info 4 default-log-port "cond branch - " "rtq-4") (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) ;; itemized test expanded here (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) (if loop-list (apply loop loop-list) (debug:print-info 4 default-log-port " -- Can't expand hed="hed))) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) (debug:print-info 4 default-log-port "cond branch - " "rtq-5") (debug:print-error 0 default-log-port "Should not have a list of items in a test and the itemspath set - please report this")
︙			︙
1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 ~~1812~~ 1813 1814 1815 1816 1817 1818 1819	(debug:print-info 4 default-log-port "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) ))) ;; end loop on sorted test names ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched (rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now if -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes (prev-num-running 0)) ;; (debug:print-info 13 default-log-port "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup configdat "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 default-log-port "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no (debug:print-info 0 default-log-port "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 default-log-port "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ~~(thread-sleep! 5)~~ (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 default-log-port "Calling Post Hook") ;; (runs:run-post-hook run-id)	> > > \|	1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864	(debug:print-info 4 default-log-port "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) ))) ;; end loop on sorted test names ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched (rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now if -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes (prev-num-running 0)) ;; (debug:print-info 13 default-log-port "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup configdat "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 default-log-port "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no (debug:print-info 0 default-log-port "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 default-log-port "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 default-log-port "Calling Post Hook") ;; (runs:run-post-hook run-id)
︙			︙
1864 1865 1866 1867 1868 1869 1870 ~~1871~~ 1872 1873 1874 1875 1876 1877 1878	(if (not (vector? t)) (conc t) (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)) ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step ;; ~~(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry)~~ ;; All these vars might be referenced by the testconfig file reader ;; ;; NEED to reprocess testconfig here, ensuring that item variables are available. ;; This is for Tal's issue with item-specific env vars not being set for use in skip. ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273 ;; (let* ((test-name (tests:testqueue-get-testname test-record))	\|	1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923	(if (not (vector? t)) (conc t) (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)) ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step ;; (define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry runsdat testdat-rec) ;; All these vars might be referenced by the testconfig file reader ;; ;; NEED to reprocess testconfig here, ensuring that item variables are available. ;; This is for Tal's issue with item-specific env vars not being set for use in skip. ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273 ;; (let* ((test-name (tests:testqueue-get-testname test-record))
︙			︙
1942 1943 1944 1945 1946 1947 1948 ~~1949~~ ~~1950~~ 1951 1952 1953 1954 1955 1956 1957	(debug:print 2 default-log-port "WARN: Test not pre-created? test-name=" test-name ", item-path=" item-path ", run-id=" run-id) (rmt:register-test run-id test-name item-path) (set! test-id (rmt:get-test-id run-id test-name item-path)))) (debug:print-info 4 default-log-port "test-id=" test-id ", run-id=" run-id ", test-name=" test-name ", item-path=\"" item-path "\"") (set! testdat (rmt:get-test-info-by-id run-id test-id)) (if (not testdat) (begin ~~(debug:print-info 0 default-log-port "WARNING: server is overloaded, trying again in ~~one~~ second")~~ ~~(thread-sleep! 1)~~ (loop))))) (if (not testdat) ;; should NOT happen (debug:print-error 0 default-log-port "failed to get test record for test-id " test-id)) (set! test-id (db:test-get-id testdat)) (if (common:file-exists? test-path) (change-directory test-path) (begin	\| > \|	1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003	(debug:print 2 default-log-port "WARN: Test not pre-created? test-name=" test-name ", item-path=" item-path ", run-id=" run-id) (rmt:register-test run-id test-name item-path) (set! test-id (rmt:get-test-id run-id test-name item-path)))) (debug:print-info 4 default-log-port "test-id=" test-id ", run-id=" run-id ", test-name=" test-name ", item-path=\"" item-path "\"") (set! testdat (rmt:get-test-info-by-id run-id test-id)) (if (not testdat) (begin (debug:print-info 0 default-log-port "WARNING: server is overloaded, trying again in two seconds") ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 2) (loop))))) (if (not testdat) ;; should NOT happen (debug:print-error 0 default-log-port "failed to get test record for test-id " test-id)) (set! test-id (db:test-get-id testdat)) (if (common:file-exists? test-path) (change-directory test-path) (begin
︙			︙
2054 2055 2056 2057 2058 2059 2060 ~~2061 2062 2063 2064 2065~~ 2066 2067 2068 2069 2070 2071 2072	(if skip-test (begin (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test) (debug:print-info 1 default-log-port "SKIPPING Test " full-test-name " due to " skip-test)) ;; ;; Here the test is handed off to launch.scm for launch-test to complete the launch process ;; (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) (begin (print "ERROR: Failed to launch the test. Exiting as soon as possible") (set! globalexitstatus 1) ;; (process-signal (current-process-id) signal/kill~~))))))~~)) ((KILLED) (debug:print 1 default-log-port "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.") (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED)) ((LAUNCHED REMOTEHOSTSTART RUNNING) (debug:print 2 default-log-port "NOTE: " test-name " is already running")) ;; (if (> (- (current-seconds)(+ (db:test-get-event_time testdat) ;; (db:test-get-run_duration testdat)))	> > > > > \| \| \| \| \| > > >	2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126	(if skip-test (begin (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test) (debug:print-info 1 default-log-port "SKIPPING Test " full-test-name " due to " skip-test)) ;; ;; Here the test is handed off to launch.scm for launch-test to complete the launch process ;; (begin ;; wait for less than max jobs here (if (runs:dat-wait-for-jobs-function runsdat) ((runs:dat-wait-for-jobs-function runsdat) testdat-rec)) (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) (begin (print "ERROR: Failed to launch the test. Exiting as soon as possible") (set! globalexitstatus 1) ;; (process-signal (current-process-id) signal/kill)) ) ;; wait again here? )))))) ((KILLED) (debug:print 1 default-log-port "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.") (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED)) ((LAUNCHED REMOTEHOSTSTART RUNNING) (debug:print 2 default-log-port "NOTE: " test-name " is already running")) ;; (if (> (- (current-seconds)(+ (db:test-get-event_time testdat) ;; (db:test-get-run_duration testdat)))
︙			︙
2533 2534 2535 2536 2537 2538 2539 ~~2540~~ 2541 2542 2543 2544 2545 2546 2547	(debug:print-info 2 default-log-port "new state " new-state ", new status " new-status ) (mt:test-set-state-status-by-id run-id test-id new-state new-status #f)) (if (not (null? tal)) (loop (car tal)(cdr tal)))) ((run-wait) ;; BB TODO - manage has-subrun case (debug:print-info 2 default-log-port "still waiting, " (length tests) " tests still running") ~~(thread-sleep! 10)~~ (let ((new-tests (proc-get-tests run-id))) (if (null? new-tests) (debug:print-info 1 default-log-port "Run completed according to zero tests matching provided criteria.") (loop (car new-tests)(cdr new-tests))))) ((archive) ;; BB TODO - manage has-subrun case (if (and run-dir (not toplevel-with-children))	\|	2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601	(debug:print-info 2 default-log-port "new state " new-state ", new status " new-status ) (mt:test-set-state-status-by-id run-id test-id new-state new-status #f)) (if (not (null? tal)) (loop (car tal)(cdr tal)))) ((run-wait) ;; BB TODO - manage has-subrun case (debug:print-info 2 default-log-port "still waiting, " (length tests) " tests still running") (thread-sleep! 5) (let ((new-tests (proc-get-tests run-id))) (if (null? new-tests) (debug:print-info 1 default-log-port "Run completed according to zero tests matching provided criteria.") (loop (car new-tests)(cdr new-tests))))) ((archive) ;; BB TODO - manage has-subrun case (if (and run-dir (not toplevel-with-children))
︙			︙