Megatest: Changes On Branch 5a05fc04ff63c390

Changes In Branch v1.6569-gt-crash-fix Excluding Merge-Ins

This is equivalent to a diff from 2769e4b7c9 to 5a05fc04ff

2021-03-01
17:42		Manually patched in the new view check-in: f5206150ee user: mrwellan tags: v1.6569-new-view
2021-01-26
20:05		Merged in the removeal of fastmode (which did nothing) check-in: 4d5af74171 user: matt tags: v1.65-lazyqueue-items-rollup
14:00		Fix for the > crash. Maybe... Leaf check-in: 5a05fc04ff user: matt tags: v1.6569-gt-crash-fix
2021-01-25
12:03		rebased lazy-queue rollup check-in: 07ab120544 user: matt tags: v1.65-lazyqueue-items-rollup
2021-01-15
22:46		begin diet check-in: badd71f3b3 user: matt tags: v1.6569-diet
21:34		eval-string-in-environment if was disabled, re-enabled check-in: 9564772564 user: matt tags: v1.6569-reenable-eval-if
2021-01-08
11:42		enable custom value for max delay between archive time and test last update time Leaf check-in: 86a3d1148e user: pjhatwal tags: v1.6569-refactor
2020-11-25
12:00		Fixed issues in server gating code Leaf check-in: 063273e8cb user: mrwellan tags: v1.6569-server-gate-fix
2020-11-24
22:27		Added support for resetting run - allows to reload tests-paths to add tests to a run part way though. Just run megatest -clean-cache -runname $MT_RUNNAME Leaf check-in: 213021e02d user: mrwellan tags: v1.6596-reload-tests-paths
2020-10-13
16:46		Changed version from 69 to 76. No other changes. Will compile with chicken 13 check-in: 87ca35010f user: mmgraham tags: v1.65, v1.6576
2020-10-12
16:49		Reduced message from failed to info. Reverted a delay which seems to help pass full stack ext-tests. Leaf check-in: 9e35b1252c user: mrwellan tags: v1.65-minor-patch
10:18		Safe vector access in rmt. check-in: 58bb6d997a user: mrwellan tags: v1.65-side2
2020-10-11
22:46		Patched forward adjutant code. check-in: f936717bfa user: matt tags: v1.65-adjutant-again
2020-10-05
22:49		Do not exit on failure to create directory - race conditons on NFS cause false fail scenarios - just keep going and cross your fingers... (cherrypicked from v1.6572) check-in: 05b253a452 user: matt tags: v1.65-sidework
22:46		run duration testdat check-in: 4a0b43f3c6 user: matt tags: v1.65-test-rundat2
2020-09-21
15:36		merged in 1.65-test-rundat branch ==/FAIL/orion,mars/== check-in: cfd25d66e9 user: mmgraham tags: v1.6571, v1.65-failed-testdat
07:00		Added get-testsuite-name all over launch:setup and still not set when needed! This did NOT work. Closed-Leaf check-in: 2efe8ad422 user: mrwellan tags: v1.65-get-testsuitename
2020-09-19
04:21		Start moving test_rundat to no-sync db. ==/20/2/WARN/1203/mars/== check-in: abfabdb839 user: matt tags: v1.65-test-rundat
2020-09-18
17:30		added check for file existence before file delete ==/14/1.9/WARN/orion,mars/== NOTE: This is the last v1.65 before the split off. I.e code from before this point IS in the far future v1.65 branch. Code from this point to that branch might NOT be in the branch. check-in: 2769e4b7c9 user: mmgraham tags: v1.65, v1.6569
12:27		cherry picked 2 fixes, changed version to 1.6569 ==/7.2/2.0/PASS/1201/mars/== check-in: d145d0eb02 user: mmgraham tags: v1.65

Modified db.scm from [fb3a18f52f] to [09786a6ce2].

Modified rmt.scm from [39d97c528a] to [169f275811].

Modified runs.scm from [030b929939] to [a0489a126c].

︙			︙
3213 3214 3215 3216 3217 3218 3219 ~~3220 3221 3222 3223~~ 3224 3225 3226 3227 3228 3229 3230	(if newstatus (sqlite3:execute db "UPDATE tests SET status=? WHERE id=?;" newstatus test-id)) (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment) test-id)))))) (mt:process-triggers dbstruct run-id test-id newstate newstatus)) ;; NEW BEHAVIOR: Count tests running in all runs! ;; (define (db:get-count-tests-running dbstruct run-id fastmode) (let* ((qry (if fastmode "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '') LIMIT 1;" "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');"))) (db:with-db dbstruct run-id #f (lambda (db) (let* ((stmth (db:get-cache-stmth dbstruct db qry))) (sqlite3:first-result stmth))))))	\| \| \| \|	3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230	(if newstatus (sqlite3:execute db "UPDATE tests SET status=? WHERE id=?;" newstatus test-id)) (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment) test-id)))))) (mt:process-triggers dbstruct run-id test-id newstate newstatus)) ;; NEW BEHAVIOR: Count tests running in all runs! ;; (define (db:get-count-tests-running dbstruct run-id) ;; fastmode) (let* ((qry ;; (if fastmode ;; "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '') LIMIT 1;" "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');")) ;; ) (db:with-db dbstruct run-id #f (lambda (db) (let* ((stmth (db:get-cache-stmth dbstruct db qry))) (sqlite3:first-result stmth))))))
︙			︙
3243 3244 3245 3246 3247 3248 3249 ~~3250 3251 3252 3253~~ 3254 3255 3256 3257 3258 3259 3260	;; "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id NOT IN (SELECT id FROM runs WHERE state='deleted') AND NOT (uname = 'n/a' AND item_path = '');") "SELECT count(id) FROM tests WHERE state in ('RUNNING','REMOTEHOSTSTART','LAUNCHED') AND run_id=?;" run-id)))) ;; NOT IN (SELECT id FROM runs WHERE state='deleted');") ;; NEW BEHAVIOR: Look only at single run with run-id ;; ;; (define (db:get-running-stats dbstruct run-id) (define (db:get-count-tests-running-for-run-id dbstruct run-id fastmode) (let* ((qry (if fastmode "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? LIMIT 1;" "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;"))) (db:with-db dbstruct run-id #f (lambda (db) (let* ((stmth (db:get-cache-stmth dbstruct db qry))) (sqlite3:first-result stmth run-id))))))	\| \| \| \|	3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260	;; "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id NOT IN (SELECT id FROM runs WHERE state='deleted') AND NOT (uname = 'n/a' AND item_path = '');") "SELECT count(id) FROM tests WHERE state in ('RUNNING','REMOTEHOSTSTART','LAUNCHED') AND run_id=?;" run-id)))) ;; NOT IN (SELECT id FROM runs WHERE state='deleted');") ;; NEW BEHAVIOR: Look only at single run with run-id ;; ;; (define (db:get-running-stats dbstruct run-id) (define (db:get-count-tests-running-for-run-id dbstruct run-id) ;; fastmode) (let* ((qry ;; (if fastmode ;; "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? LIMIT 1;" "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;")) ;; ) (db:with-db dbstruct run-id #f (lambda (db) (let* ((stmth (db:get-cache-stmth dbstruct db qry))) (sqlite3:first-result stmth run-id))))))
︙			︙

︙			︙
319 320 321 322 323 324 325 ~~326~~ 327 328 329 330 331 332 333	;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) ~~(let* ((num-running (rmt:get-count-tests-running run-id ~~#f)) ;; fastmode=no~~~~ (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup configdat "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0) (runs:inc-can-run-more-tests-count runsdat)) ;; (set! runs:can-run-more-tests-count (+ runs:can-run-more-tests-count 1)))	\|	319 320 321 322 323 324 325 326 327 328 329 330 331 332 333	;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup configdat "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0) (runs:inc-can-run-more-tests-count runsdat)) ;; (set! runs:can-run-more-tests-count (+ runs:can-run-more-tests-count 1)))
︙			︙
1563 1564 1565 1566 1567 1568 1569 ~~1570~~ 1571 1572 1573 1574 1575 1576 1577	)) extras) extras) '()))) (waitons (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?)) (newtal (append tal (list hed))) (regfull (>= (length reg) reglen)) ~~(num-running (rmt:get-count-tests-running-for-run-id run-id ~~#t)) ;; fastmode=yes~~~~ (testdat (make-runs:testdat hed: hed tal: tal reg: reg reruns: reruns test-record: test-record test-name: test-name	\|	1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577	)) extras) extras) '()))) (waitons (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?)) (newtal (append tal (list hed))) (regfull (>= (length reg) reglen)) (num-running (rmt:get-count-tests-running-for-run-id run-id)) (testdat (make-runs:testdat hed: hed tal: tal reg: reg reruns: reruns test-record: test-record test-name: test-name
︙			︙
1829 1830 1831 1832 1833 1834 1835 ~~1836~~ 1837 1838 1839 1840 1841 1842 1843 1844 1845 ~~1846~~ 1847 1848 1849 1850 1851 1852 1853 1854 1855 ~~1856~~ 1857 1858 1859 1860 1861 1862 1863	(rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now if -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle ~~(let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id ~~#t)) ;; fastmode=yes~~~~ (prev-num-running 0)) ;; (debug:print-info 13 default-log-port "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup configdat "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 default-log-port "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) ~~(let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id ~~#f))) ;; fastmode=no~~~~ (debug:print-info 0 default-log-port "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 default-log-port "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) ~~(wait-loop (rmt:get-count-tests-running-for-run-id run-id ~~#t) ;; fastmode=yes~~~~ num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 default-log-port "Calling Post Hook") ;; (runs:run-post-hook run-id) (debug:print-info 1 default-log-port "All tests launched")))	\| \| \|	1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863	(rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now if -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) (prev-num-running 0)) ;; (debug:print-info 13 default-log-port "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup configdat "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 default-log-port "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id))) (debug:print-info 0 default-log-port "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 default-log-port "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 default-log-port "Calling Post Hook") ;; (runs:run-post-hook run-id) (debug:print-info 1 default-log-port "All tests launched")))
︙			︙