Megatest

Diff
Login

Differences From Artifact [f99424cdf8]:

To Artifact [7ac979c2cd]:


59
60
61
62
63
64
65
66



























67
68
69
70
71
72
73
  (last-jobs-check-time    0)
  )

;; Per-test working record for the run queue. An instance is built with
;; make-runs:testdat (keyword arguments such as hed:, tal:, reg:, reruns:,
;; test-record:, test-name:) and handed to runs:process-expanded-tests;
;; the prereqs-not-met slot is filled in afterwards through
;; runs:testdat-prereqs-not-met-set!.
;; NOTE(review): hed/tal look like the head and tail of the pending test
;; queue and reg the registry list -- confirm against the queue loop.
(defstruct runs:testdat
  hed tal reg reruns  test-record
  test-name item-path jobgroup
  waitons testmode  newtal itemmaps prereqs-not-met)
  



























;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files
;;  - remove any that are over 3600 seconds old
;;  - if there are any that are younger than 10 seconds
;;      * sleep 10 seconds
;;      * touch my key-host-pid.softlock file
;;      * return
;;  - if there are no files younger than 10 seconds







|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
  (last-jobs-check-time    0)
  )

;; Per-test working record for the run queue. An instance is built with
;; make-runs:testdat (keyword arguments such as hed:, tal:, reg:, reruns:,
;; test-record:, test-name:) and handed to runs:process-expanded-tests;
;; the prereqs-not-met slot is filled in afterwards through
;; runs:testdat-prereqs-not-met-set!.
;; NOTE(review): hed/tal look like the head and tail of the pending test
;; queue and reg the registry list -- confirm against the queue loop.
(defstruct runs:testdat
  hed tal reg reruns  test-record
  test-name item-path jobgroup
  waitons testmode  newtal itemmaps prereqs-not-met)

;; runsmod: small helper module that throttles work which is repeated for
;; the same key in quick succession.
(module runsmod
    (
     runs:wait-if-seen-recently
     )
  
(import scheme chicken data-structures extras files)
(import posix typed-records srfi-18 srfi-69
	  md5 message-digest
	  regex srfi-1)

;; full-key string -> (current-seconds) value recorded at the end of the
;; most recent runs:wait-if-seen-recently call for that key.
(define *last-seen-ht* (make-hash-table))

;; Throttle repeated visits to the same key.
;;
;;   wait-until : minimum number of seconds that must separate two calls
;;                made with the same keys
;;   keys       : one or more strings; joined with "-" to form the lookup key
;;
;; If the previous call for this key was less than wait-until seconds ago,
;; sleep for the remaining time. The key is always re-stamped with the time
;; at which this call finishes.
;;
;; Returns the number of seconds slept (0 when no wait was needed).
(define (runs:wait-if-seen-recently wait-until . keys)
  (let* ((full-key   (string-intersperse keys "-"))
         (last-seen  (hash-table-ref/default *last-seen-ht* full-key 0))
         (now        (current-seconds))
         (delta      (- now last-seen))
         ;; Wait out the remainder only when the key was seen recently.
         ;; A never-seen key has last-seen 0, so delta is huge and needed is 0.
         ;; The (>= delta 0) guard avoids an over-long sleep if the clock
         ;; stepped backwards since the last call.
         (needed     (if (and (>= delta 0) (< delta wait-until))
                         (- wait-until delta)
                         0)))
    (if (> needed 0) (thread-sleep! needed))
    (hash-table-set! *last-seen-ht* full-key (current-seconds))
    needed))
)

(import runsmod)
    
;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files
;;  - remove any that are over 3600 seconds old
;;  - if there are any that are younger than 10 seconds
;;      * sleep 10 seconds
;;      * touch my key-host-pid.softlock file
;;      * return
;;  - if there are no files younger than 10 seconds
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
  ;; Take advantage of a good place to exit if running the one-pass methodology
  (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
	   (args:get-arg "-one-pass"))
      (exit 0))

  (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))

  (let* ((num-running             (rmt:get-count-tests-running run-id #f)) ;; fastmode=no
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
				    (if (string? jobg-count)
					(string->number jobg-count)
					jobg-count))))
    (if (> (+ num-running num-running-in-jobgroup) 0)
	(runs:inc-can-run-more-tests-count runsdat)) ;; (set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1)))







|







346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
  ;; Take advantage of a good place to exit if running the one-pass methodology
  (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
	   (args:get-arg "-one-pass"))
      (exit 0))

  (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))

  (let* ((num-running             (rmt:get-count-tests-running run-id)) 
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
				    (if (string? jobg-count)
					(string->number jobg-count)
					jobg-count))))
    (if (> (+ num-running num-running-in-jobgroup) 0)
	(runs:inc-can-run-more-tests-count runsdat)) ;; (set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1)))
433
434
435
436
437
438
439


440
441
442
443
444
445
446
	      (debug:print-info 0 *default-log-port* "running run-post-hook: \"" run-post-hook "\", log is " actual-logf)
	      (system (conc run-post-hook " >> " actual-logf " 2>&1"))
	      (debug:print-info 0 *default-log-port* "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run."))))))

;; Predicate: #t when no name in waitors-upon is left unmatched by test-patt
;; (i.e. tests:filter-test-names-not-matched returns the empty list), #f otherwise.
(define (runs:testpatts-mention-waitors-upon? test-patt waitors-upon)
  (let ((unmatched (tests:filter-test-names-not-matched waitors-upon test-patt)))
    (null? unmatched)))



;;======================================================================
;; runs:run-tests is called from megatest.scm and itself
;;======================================================================
;;
;;  test-names: Comma separated patterns same as test-patts but used in selection 
;;              of tests to run. The item portions are not respected.







>
>







460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
	      (debug:print-info 0 *default-log-port* "running run-post-hook: \"" run-post-hook "\", log is " actual-logf)
	      (system (conc run-post-hook " >> " actual-logf " 2>&1"))
	      (debug:print-info 0 *default-log-port* "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run."))))))

;; Predicate: #t when no name in waitors-upon is left unmatched by test-patt
;; (i.e. tests:filter-test-names-not-matched returns the empty list), #f otherwise.
(define (runs:testpatts-mention-waitors-upon? test-patt waitors-upon)
  (let ((unmatched (tests:filter-test-names-not-matched waitors-upon test-patt)))
    (null? unmatched)))

;; run-id -> (current-seconds) value recorded right after the most recent
;; rmt:find-and-mark-incomplete call for that run. Callers compare it with
;; the current time and skip the call when fewer than 900 seconds have passed.
(define *find-and-mark-incomplete-last-run* (make-hash-table))

;;======================================================================
;; runs:run-tests is called from megatest.scm and itself
;;======================================================================
;;
;;  test-names: Comma separated patterns same as test-patts but used in selection 
;;              of tests to run. The item portions are not respected.
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
    ;; run the run prehook if there are no tests yet run for this run:
    ;;
    (runs:run-pre-hook run-id)
    ;; mark the all-tests-launched flag as false in the meta table
    (rmt:set-var (conc "lunch-complete-" run-id) "no")
    (debug:print-info 1 *default-log-port* "Setting end-of-run to no")
    (let* ((config-reruns      (let ((x (configf:lookup *configdat* "setup" "reruns")))
			       (if x (string->number x) #f)))
	  (config-rerun-cnt (if config-reruns
			config-reruns
			1)))
    (if (eq? config-rerun-cnt run-count)
      (rmt:set-var (conc "end-of-run-" run-id) "no")))

    (rmt:set-run-state-status run-id "new" "n/a")
    ;; now add non-directly referenced dependencies (i.e. waiton)
    ;;======================================================================
    ;; refactoring this block into tests:get-full-data
    ;;
    ;; What happened? This code is now duplicated in tests!?
    ;;







|
|
|
|
|
|
|







633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
    ;; run the run prehook if there are no tests yet run for this run:
    ;;
    (runs:run-pre-hook run-id)
    ;; mark the all-tests-launched flag as false in the meta table
    (rmt:set-var (conc "lunch-complete-" run-id) "no")
    (debug:print-info 1 *default-log-port* "Setting end-of-run to no")
    (let* ((config-reruns      (let ((x (configf:lookup *configdat* "setup" "reruns")))
				 (if x (string->number x) #f)))
	   (config-rerun-cnt (if config-reruns
				 config-reruns
				 1)))
      (if (eq? config-rerun-cnt run-count)
	  (rmt:set-var (conc "end-of-run-" run-id) "no")))
    
    (rmt:set-run-state-status run-id "new" "n/a")
    ;; now add non-directly referenced dependencies (i.e. waiton)
    ;;======================================================================
    ;; refactoring this block into tests:get-full-data
    ;;
    ;; What happened? This code is now duplicated in tests!?
    ;;
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751





752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
	(debug:print-info 1 *default-log-port* "Adding \"" (string-intersperse required-tests " ") "\" to the run queue"))
    ;; NOTE: these are all parent tests, items are not expanded yet.
    (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records))
    (let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
      (if (> (length (hash-table-keys test-records)) 0)
	  (let* ((keep-going        #t)
		 (run-queue-retries 5)
		 #;(th1        (make-thread (lambda ()
					    (handle-exceptions
						exn
						(begin
						  (print-call-chain)
						  (print " message: " ((condition-property-accessor 'exn 'message) exn)))
					      (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
								    (any->number reglen) all-tests-registry)))
					  "runs:run-tests-queue"))
		 (th2        (make-thread (lambda ()			 ;; BBQ: why are we visiting ALL runs here?	    
					    ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ...
					    (let ((run-ids (rmt:get-all-run-ids)))
					      (for-each (lambda (run-id)
							  (if keep-going
							      (handle-exceptions
							       exn
							       (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)





							       (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
							run-ids)))
					  "runs: mark-incompletes")))
	    ;; (thread-start! th1)
	    (thread-start! th2)
	    ;; (thread-join! th1)
	    ;; just do the main stuff in the main thread
	    (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
								    (any->number reglen) all-tests-registry)
	    (set! keep-going #f)
	    (thread-join! th2)
	    ;; if run-count > 0 call, set -preclean and -rerun STUCK/DEAD
	    (if (> run-count 0) ;; handle reruns
		(begin
		  (if (not (hash-table-ref/default flags "-preclean" #f))
		      (hash-table-set! flags "-preclean" #t))
		  (if (not (hash-table-ref/default flags "-rerun" #f))
		      (hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS"))
		  ;; recursive call to self
      (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))
                  (launch:end-of-run-check run-id)))
	  (debug:print-info 0 *default-log-port* "No tests to run")))
    (debug:print-info 4 *default-log-port* "All done by here")
    ;; TODO: try putting post hook call here
      
    ;  (debug:print-info 2 *default-log-port* " run-count " run-count)
    ;  (runs:run-post-hook run-id))
    ;  (debug:print-info 2 *default-log-port* "Not calling post hook runcount = " run-count ))   







<
<
<
<
<
<
<
<
<






|
|
>
>
>
>
>
|


<

<
<












|
|







757
758
759
760
761
762
763









764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779

780


781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
	(debug:print-info 1 *default-log-port* "Adding \"" (string-intersperse required-tests " ") "\" to the run queue"))
    ;; NOTE: these are all parent tests, items are not expanded yet.
    (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records))
    (let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
      (if (> (length (hash-table-keys test-records)) 0)
	  (let* ((keep-going        #t)
		 (run-queue-retries 5)









		 (th2        (make-thread (lambda ()			 ;; BBQ: why are we visiting ALL runs here?	    
					    ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ...
					    (let ((run-ids (rmt:get-all-run-ids)))
					      (for-each (lambda (run-id)
							  (if keep-going
							      (handle-exceptions
								  exn
								  (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
								;; lets run this only if a run has been NOT seen for more than 900 seconds
								(if (> (- (current-seconds)(hash-table-ref/default *find-and-mark-incomplete-last-run* run-id 0)) 900)
								    (begin
								      (rmt:find-and-mark-incomplete run-id #f)
								      (hash-table-set! *find-and-mark-incomplete-last-run* run-id (current-seconds)))
								    )))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
							run-ids)))
					  "runs: mark-incompletes")))

	    (thread-start! th2)


	    (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
								    (any->number reglen) all-tests-registry)
	    (set! keep-going #f)
	    (thread-join! th2)
	    ;; if run-count > 0 call, set -preclean and -rerun STUCK/DEAD
	    (if (> run-count 0) ;; handle reruns
		(begin
		  (if (not (hash-table-ref/default flags "-preclean" #f))
		      (hash-table-set! flags "-preclean" #t))
		  (if (not (hash-table-ref/default flags "-rerun" #f))
		      (hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS"))
		  ;; recursive call to self
		  (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))
		(launch:end-of-run-check run-id)))
	  (debug:print-info 0 *default-log-port* "No tests to run")))
    (debug:print-info 4 *default-log-port* "All done by here")
    ;; TODO: try putting post hook call here
      
    ;  (debug:print-info 2 *default-log-port* " run-count " run-count)
    ;  (runs:run-post-hook run-id))
    ;  (debug:print-info 2 *default-log-port* "Not calling post hook runcount = " run-count ))   
1468
1469
1470
1471
1472
1473
1474

1475
1476
1477
1478
1479
1480
1481
         (num-retries           0)
         (max-retries           (configf:lookup *configdat* "setup" "maxretries"))
         (max-concurrent-jobs   (configf:lookup-number *configdat* "setup" "max_concurrent_jobs" default: 50))
         (reglen                (if (number? reglen-in) reglen-in 1))
         (last-time-incomplete  (- (current-seconds) 900)) ;; force at least one clean up cycle
         (last-time-some-running (current-seconds))
         ;; (tdbdat                (tasks:open-db))

         (runsdat (make-runs:dat
                   ;; hed: hed
                   ;; tal: tal
                   ;; reg: reg
                   ;; reruns: reruns
                   reglen: reglen
                   regfull: #f ;; regfull







>







1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
         (num-retries           0)
         (max-retries           (configf:lookup *configdat* "setup" "maxretries"))
         (max-concurrent-jobs   (configf:lookup-number *configdat* "setup" "max_concurrent_jobs" default: 50))
         (reglen                (if (number? reglen-in) reglen-in 1))
         (last-time-incomplete  (- (current-seconds) 900)) ;; force at least one clean up cycle
         (last-time-some-running (current-seconds))
         ;; (tdbdat                (tasks:open-db))
	 (misc-data             (make-hash-table)) ;; use as needed
         (runsdat (make-runs:dat
                   ;; hed: hed
                   ;; tal: tal
                   ;; reg: reg
                   ;; reruns: reruns
                   reglen: reglen
                   regfull: #f ;; regfull
1528
1529
1530
1531
1532
1533
1534






1535
1536
1537
1538
1539
1540
1541
      ;;
      (if (> (current-seconds)(+ last-time-incomplete 900))
          (begin
            (set! last-time-incomplete (current-seconds))
            ;; (rmt:find-and-mark-incomplete-all-runs)
	    ))







      ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns)
      (let* ((test-record (hash-table-ref test-records hed))
	     (test-name   (tests:testqueue-get-testname test-record))
	     (tconfig     (tests:testqueue-get-testconfig test-record))
	     (jobgroup    (configf:lookup tconfig "test_meta" "jobgroup"))
	     (testmode    (let ((m (configf:lookup tconfig "requirements" "mode")))
			    (if m (map string->symbol (string-split m)) '(normal))))







>
>
>
>
>
>







1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
      ;;
      (if (> (current-seconds)(+ last-time-incomplete 900))
          (begin
            (set! last-time-incomplete (current-seconds))
            ;; (rmt:find-and-mark-incomplete-all-runs)
	    ))

      ;; WAIT FOR TIME ON TIGHT LOOP
      (if (< (- (current-milliseconds)(hash-table-ref/default misc-data "tight-loop-last-time" 0))
	     100) ;; less than 100 ms (1/10 second) since we last came through the loop
	  (thread-sleep! 0.1)) ;; wait 1/10 second
      (hash-table-set! misc-data "tight-loop-last-time" (current-milliseconds))
      
      ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns)
      (let* ((test-record (hash-table-ref test-records hed))
	     (test-name   (tests:testqueue-get-testname test-record))
	     (tconfig     (tests:testqueue-get-testconfig test-record))
	     (jobgroup    (configf:lookup tconfig "test_meta" "jobgroup"))
	     (testmode    (let ((m (configf:lookup tconfig "requirements" "mode")))
			    (if m (map string->symbol (string-split m)) '(normal))))
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
				       ))
				   extras)
				  extras)
				'())))
	     (waitons     (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?))
	     (newtal      (append tal (list hed)))
	     (regfull     (>= (length reg) reglen))
	     (num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes
	     (testdat     (make-runs:testdat
			   hed: hed
			   tal: tal
			   reg: reg
			   reruns: reruns
			   test-record: test-record
			   test-name:   test-name







|







1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
				       ))
				   extras)
				  extras)
				'())))
	     (waitons     (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?))
	     (newtal      (append tal (list hed)))
	     (regfull     (>= (length reg) reglen))
	     (num-running (rmt:get-count-tests-running-for-run-id run-id))
	     (testdat     (make-runs:testdat
			   hed: hed
			   tal: tal
			   reg: reg
			   reruns: reruns
			   test-record: test-record
			   test-name:   test-name
1713
1714
1715
1716
1717
1718
1719



1720
1721
1722
1723
1724
1725
1726
				  ;; wait for load here
				  (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
				  (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
						     (- remtries 1)))))))
		       )))))

	  ;; I'm not clear on why prereqs are gathered here TODO: verify this is needed



	  (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))

	  ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed
	  (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))

	  (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running
            (if loop-list (apply loop loop-list))))







>
>
>







1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
				  ;; wait for load here
				  (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
				  (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
						     (- remtries 1)))))))
		       )))))

	  ;; I'm not clear on why prereqs are gathered here TODO: verify this is needed
	  (let ((waited (runs:wait-if-seen-recently 5 "prereqs-not-met" hed item-path))) ;; if we've been down this path in the past 5 seconds - wait out the difference
	    (if (> waited 0)(debug:print 0 *default-log-port* "Waited for prereqs-not-met-"hed"-"item-path" for " waited "seconds.")))
	  
	  (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))

	  ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed
	  (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))

	  (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running
            (if loop-list (apply loop loop-list))))
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844

1845
1846
1847
1848
1849
1850

1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
    (rmt:set-var (conc "lunch-complete-" run-id) "yes")  
        
    ;; now *if* -run-wait we wait for all tests to be done
    ;; Now wait for any RUNNING tests to complete (if in run-wait mode)
    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
    (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle
    
    (let wait-loop ((num-running      (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes
		    (prev-num-running 0))
      ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running)
      (if (and (or (args:get-arg "-run-wait")
		   (equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
	       (> num-running 0))
	  (begin
	    ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
	    ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0))

	    (if (> (current-seconds)(+ last-time-incomplete 900))
		(let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no
		  (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id
				    ". Running as pid " (current-process-id) " on " (get-host-name))
		  (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
		  (rmt:find-and-mark-incomplete run-id #f)

		  (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
				    " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
				    (time->string (seconds->local-time (current-seconds))))))
	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
	    (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1))
	    (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes
		       num-running))))
    ;; LET* ((test-record
    ;; we get here on "drop through". All done!
    ;; this is moved to runs:run-tests since this function is getting called twice to ensure everything is completed.
    ;; (debug:print-info 0 *default-log-port* "Calling Post Hook")    
    ;; (runs:run-post-hook run-id)
    (debug:print-info 1 *default-log-port* "All tests launched")))







|








>
|
|


|

>





|







1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
    (rmt:set-var (conc "lunch-complete-" run-id) "yes")  
        
    ;; now *if* -run-wait we wait for all tests to be done
    ;; Now wait for any RUNNING tests to complete (if in run-wait mode)
    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
    (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle
    
    (let wait-loop ((num-running      (rmt:get-count-tests-running-for-run-id run-id))
		    (prev-num-running 0))
      ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running)
      (if (and (or (args:get-arg "-run-wait")
		   (equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
	       (> num-running 0))
	  (begin
	    ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
	    ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0))
	    (if (> (- (current-seconds)(hash-table-ref/default *find-and-mark-incomplete-last-run* run-id 0)) 900)
		;; (begin(if (> (current-seconds)(+ last-time-incomplete 900))
		(let ((actual-num-running num-running)) ;; (rmt:get-count-tests-running-for-run-id run-id))) ;; why call it again?
		  (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id
				    ". Running as pid " (current-process-id) " on " (get-host-name))
		  ;; (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
		  (rmt:find-and-mark-incomplete run-id #f)
		  (hash-table-set! *find-and-mark-incomplete-last-run* run-id (current-seconds))
		  (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
				    " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
				    (time->string (seconds->local-time (current-seconds))))))
	    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
	    (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1))
	    (wait-loop (rmt:get-count-tests-running-for-run-id run-id)
		       num-running))))
    ;; LET* ((test-record
    ;; we get here on "drop through". All done!
    ;; this is moved to runs:run-tests since this function is getting called twice to ensure everything is completed.
    ;; (debug:print-info 0 *default-log-port* "Calling Post Hook")    
    ;; (runs:run-post-hook run-id)
    (debug:print-info 1 *default-log-port* "All tests launched")))
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
    (path-out (conc "/" (string-intersperse (take dparts (- (length dparts) 1)) "/")))
    )
    path-out
  )
)


;; (define (runs:remove-all-but-last-n-runs-per-target target-patts runpatt num-to-keep)
;;   (let ((data (runs:get-all-but-most-recent-n-per-target target-patts runpatt num-to-keep)))
;;     (for-each
;;      (lambda (target)
;;        (let ((runs-to-remove (hash-table-ref data target )))
;;          (for-each
;;           (lambda (run)
;;             (print "megatest -remove-runs -target " target " -runname " (simple-run-runname run) " -testpatt %"))
;;           runs-to-remove)))
;;      (hash-table-keys data))))

;; Remove runs
;; fields are passing in through 
;; action:
;;    'remove-runs
;;    'set-state-status
;;
;; NB// should pass in keys?
;;
(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(mode #f)(options '()))
  (common:clear-caches) ;; clear all caches
  (let* ((db           #f)
	 ;; (tdbdat       (tasks:open-db))
	 (keys         (rmt:get-keys))
	 (rundat       (mt:get-runs-by-patt keys runnamepatt target))
	 (header       (vector-ref rundat 0))







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







2287
2288
2289
2290
2291
2292
2293



















2294
2295
2296
2297
2298
2299
2300
    (path-out (conc "/" (string-intersperse (take dparts (- (length dparts) 1)) "/")))
    )
    path-out
  )
)





















(define (runs:operate-on action target runnamepatt testpatt #!key (state #f)(status #f)(new-state-status #f)(mode #f)(options '()))
  (common:clear-caches) ;; clear all caches
  (let* ((db           #f)
	 ;; (tdbdat       (tasks:open-db))
	 (keys         (rmt:get-keys))
	 (rundat       (mt:get-runs-by-patt keys runnamepatt target))
	 (header       (vector-ref rundat 0))
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529


2530
2531
2532
2533
2534
2535
2536
                                            (loop new-test-dat tal)
                                            (loop (car tal)(append tal (list new-test-dat)))))
                                      (begin
                                       (let ((rundir (db:test-get-rundir new-test-dat)))
                                        (if (and (not (string=  rundir "/tmp/badname")) 
                                             (file-exists? rundir)
                                             (substring-index run-name rundir)
                                             (substring-index target rundir)
                                             )
                                          (begin
                                            (set! lasttpath (db:test-get-rundir new-test-dat)) ;; remember this path for run removal
                                            (set! lastrealpath (remove-last-path-directory (resolve-pathname lasttpath)))
                                            (hash-table-set! run-paths-hash lastrealpath 1)
                                            (runs:remove-test-directory new-test-dat mode) ;; 'remove-all)
                                          )
                                          (begin
                                            (debug:print 2 *default-log-port* "Not removing directory " rundir " because either it doesn't exist or has a bad name")
                                            (debug:print 2 *default-log-port* "Is /tmp/badname: " (string=  rundir "/tmp/badname"))
                                            (debug:print 2 *default-log-port* "Exists: " (file-exists? rundir))
                                            (debug:print 2 *default-log-port* "Has run-name: " (substring-index run-name rundir))
                                            (debug:print 2 *default-log-port* "Has target: " (substring-index target rundir))


                                            ;;PJH remove record from db no need to cleanup directory
                                            (case mode
                                               ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) (db:test-get-state test)(db:test-get-status test) #f))
                                               ((archive-remove)  (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVED" #f #f))
                                               (else (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test))))

                                          )







|












|
>
>







2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
                                            (loop new-test-dat tal)
                                            (loop (car tal)(append tal (list new-test-dat)))))
                                      (begin
                                       (let ((rundir (db:test-get-rundir new-test-dat)))
                                        (if (and (not (string=  rundir "/tmp/badname")) 
                                             (file-exists? rundir)
                                             (substring-index run-name rundir)
                                             (tests:glob-like-match (conc "%/" target "/%") rundir)
                                             )
                                          (begin
                                            (set! lasttpath (db:test-get-rundir new-test-dat)) ;; remember this path for run removal
                                            (set! lastrealpath (remove-last-path-directory (resolve-pathname lasttpath)))
                                            (hash-table-set! run-paths-hash lastrealpath 1)
                                            (runs:remove-test-directory new-test-dat mode) ;; 'remove-all)
                                          )
                                          (begin
                                            (debug:print 2 *default-log-port* "Not removing directory " rundir " because either it doesn't exist or has a bad name")
                                            (debug:print 2 *default-log-port* "Is /tmp/badname: " (string=  rundir "/tmp/badname"))
                                            (debug:print 2 *default-log-port* "Exists: " (file-exists? rundir))
                                            (debug:print 2 *default-log-port* "Has run-name: " (substring-index run-name rundir))
                                            (debug:print 2 *default-log-port* "Has target: " (tests:glob-like-match (conc "%/" target "/%") rundir))
                                            (debug:print 2 *default-log-port* "Target: " target)

                                            ;;PJH remove record from db no need to cleanup directory
                                            (case mode
                                               ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) (db:test-get-state test)(db:test-get-status test) #f))
                                               ((archive-remove)  (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVED" #f #f))
                                               (else (rmt:delete-test-records (db:test-get-run_id test) (db:test-get-id test))))

                                          )