Megatest

Check-in [de21785cce]
Login
Overview
Comment:When checking for running tests if on same host do not use ssh
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v2.0001
Files: files | file ages | folders
SHA1: de21785cce250a405cedf39c13b8fbeb90d953d1
User & Date: mrwellan on 2022-02-15 11:30:24
Other Links: branch diff | manifest | tags
Context
2022-02-17
12:27
Merged fork check-in: 162628b5d6 user: mrwellan tags: v2.0001
2022-02-16
10:31
Rebase forward. Closed-Leaf check-in: 09cc793198 user: mrwellan tags: v2.0001-disallow-eof
09:59
rebase of v2.0001-dashboard Closed-Leaf check-in: 235f4e077d user: mrwellan tags: v2.0001-dashboard
2022-02-15
11:30
When checking for running tests if on same host do not use ssh check-in: de21785cce user: mrwellan tags: v2.0001
2022-02-14
21:18
Speculative fix for db:get-status-from-final-status-file (untested) check-in: 0bdb58420b user: mrwellan tags: v2.0001
Changes

Modified commonmod.scm from [875119b082] to [2a227221ee].

2700
2701
2702
2703
2704
2705
2706




2707
2708
2709
2710
2711
2712
2713
2714

2715
2716
2717
2718
2719
2720
2721
		      (loop (car tal)(cdr tal) loads proc-num phys-num core-num)))))))))))

;; Ping HOSTNAME once via the shell (`ping -c 1`, stdout sent to /dev/null).
;; Returns #t when the shell command's exit status is 0, #f otherwise.
;;
(define (common:unix-ping hostname)
  (let ((res (system (conc "ping -c 1 " hostname " > /dev/null"))))
    (eq? res 0)))

;; Pre-change version: check whether process PID still exists on HOST by
;; running `ssh <host> pstree -A <pid>` and looking at its output.
;;
;; Returns #f when the command printed no lines, #t when it printed at
;; least one line. When HOST or PID is #f, or HOST is "n/a", no command is
;; run and #t is returned.
;;
;; Note that ssh is used unconditionally here, even when HOST is the
;; machine this code is running on.
;;
(define (launch:is-test-alive host pid)




  (if (and host pid (not (equal? host "n/a")))
      (let* ((cmd (conc "ssh " host " pstree -A " pid))
	     (output (with-input-from-pipe cmd read-lines)))
	(debug:print 2 *default-log-port* "Running " cmd " received " output)
	;; no output lines => treated as "process not found"
	(if (eq? (length output) 0)
	   #f
	   #t))
      ;; query cannot be made; the test is reported as alive
      #t))


(define (common:get-num-cpus remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    ;; hosts had better not be changing the number of cpus too often!
    (or (hash-table-ref/default *numcpus-cache* actual-host #f)
	(let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (pseudo-random-integer 3600)))
			    (let* ((proc   (lambda ()







>
>
>
>
|
|
|
|
|
|
|
<
>







2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717

2718
2719
2720
2721
2722
2723
2724
2725
		      (loop (car tal)(cdr tal) loads proc-num phys-num core-num)))))))))))

;; Send a single ICMP echo request to HOSTNAME using the system `ping`
;; command (stdout discarded). The result is #t if the shell reported exit
;; status 0 and #f for any other status.
;;
(define (common:unix-ping hostname)
  (let* ((ping-cmd    (conc "ping -c 1 " hostname " > /dev/null"))
	 (exit-status (system ping-cmd)))
    (zero? exit-status)))

;; Report whether the process tree rooted at PID still exists on HOST.
;;
;; Runs `pstree -A <pid>` -- directly when HOST equals (get-host-name),
;; otherwise prefixed with `ssh <host>` -- and returns #t when the command
;; printed at least one line, #f when it printed nothing.
;;
;; When HOST or PID is #f, or HOST is "n/a", no command is run and #t is
;; returned. NOTE(review): treating a query that cannot be made as "alive"
;; is likely not the right thing to do; kept because callers rely on it.
;;
(define (launch:is-test-alive host pid)
  (let* ((local?     (equal? host (get-host-name)))
	 (pstree-cmd (conc "pstree -A " pid))
	 (cmd        (if local?
			 pstree-cmd
			 (conc "ssh " host " " pstree-cmd))))
    (cond
     ((not (and host pid)) #t)
     ((equal? host "n/a")  #t)
     (else
      (let ((output (with-input-from-pipe cmd read-lines)))
	(debug:print 2 *default-log-port* "Running " cmd " received " output)
	;; any line of output means pstree found the process
	(not (null? output)))))))

(define (common:get-num-cpus remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    ;; hosts had better not be changing the number of cpus too often!
    (or (hash-table-ref/default *numcpus-cache* actual-host #f)
	(let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (pseudo-random-integer 3600)))
			    (let* ((proc   (lambda ()

Modified launchmod.scm from [9e6a47cc52] to [03e715c08b].

1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
	    (debug:print 4 *default-log-port* "look for  post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var run-id (conc "end-of-run-" run-id)))
	    (debug:print 0 *default-log-port* "End of Run Detected.")
	    (rmt:set-var run-id (conc "end-of-run-" run-id) "yes")
					;(thread-sleep! 10)
	    (runs:run-post-hook run-id)
	    (debug:print 4 *default-log-port* "currseconds: " (current-seconds)" eor: " (rmt:get-var run-id (conc "end-of-run-" run-id)))
	    (common:simple-unlock (conc "endOfRun" run-id)))
	  (debug:print 0 *default-log-port* "End of Run Detected but not running post hook. This should happen when eor is set to yes. This will happen only when 2 tests exit at smae time. eor= " (rmt:get-var run-id (conc "end-of-run-" run-id)))))
     ((> running-cnt 3) 
      (debug:print 0 *default-log-port* "There are " running-cnt " tests running." ))
     ((> running-cnt 0)
      (debug:print 0 *default-log-port* "running cnt > 0 but <= 3 kill-running-tests-if-dead" )
      (let ((kill-cnt (launch:kill-tests-if-dead run-id)))
	(if (and all-test-launched  (equal? all-test-launched "yes") (eq? kill-cnt running-cnt))
	    (runs:end-of-run-check run-id)))) ;;todo







|







1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
	    (debug:print 4 *default-log-port* "look for  post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var run-id (conc "end-of-run-" run-id)))
	    (debug:print 0 *default-log-port* "End of Run Detected.")
	    (rmt:set-var run-id (conc "end-of-run-" run-id) "yes")
					;(thread-sleep! 10)
	    (runs:run-post-hook run-id)
	    (debug:print 4 *default-log-port* "currseconds: " (current-seconds)" eor: " (rmt:get-var run-id (conc "end-of-run-" run-id)))
	    (common:simple-unlock (conc "endOfRun" run-id)))
	  (debug:print 0 *default-log-port* "End of Run Detected but not running post hook. This should happen when eor is set to yes. This will happen only when 2 tests exit at same time. eor= " (rmt:get-var run-id (conc "end-of-run-" run-id)))))
     ((> running-cnt 3) 
      (debug:print 0 *default-log-port* "There are " running-cnt " tests running." ))
     ((> running-cnt 0)
      (debug:print 0 *default-log-port* "running cnt > 0 but <= 3 kill-running-tests-if-dead" )
      (let ((kill-cnt (launch:kill-tests-if-dead run-id)))
	(if (and all-test-launched  (equal? all-test-launched "yes") (eq? kill-cnt running-cnt))
	    (runs:end-of-run-check run-id)))) ;;todo
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952

1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
		      (if (not (null? tal))
			  (loop (car tal) (cdr tal)))))))))))

;; Call rmt:find-and-mark-incomplete for RUN-ID (passing OVR-DEADTIME
;; through), then run the end-of-run check for the same run. The value of
;; runs:end-of-run-check is returned.
;;
(define (runs:find-and-mark-incomplete-and-check-end-of-run run-id ovr-deadtime)
  (rmt:find-and-mark-incomplete run-id ovr-deadtime)
  (runs:end-of-run-check run-id))



;; Pre-change version: walk the tests of RUN-ID that are in state RUNNING,
;; LAUNCHED or REMOTEHOSTSTART and set each one that looks dead to state
;; KILLREQ (status "n/a"). Returns the number of tests so marked.
;;
;; NOTE(review): (car running-tests) / (cdr running-tests) below fail if the
;; query returns an empty list.
;;
(define (launch:kill-tests-if-dead run-id)
  (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f)))
       (let loop ((running-test (car running-tests))
			     (tal    (cdr running-tests))
			     (kill-cnt 0))
		       (let* ((test-name (vector-ref running-test 2))
                 (item-path (vector-ref running-test 11))
								 (test-id (vector-ref running-test 0))
                 (host (vector-ref running-test 6))
                 (pid  (rmt:test-get-top-process-pid run-id test-id))   
                 (event-time (vector-ref running-test 5))
                 (duration (vector-ref running-test 12))
                 (flag 0)   ;; becomes 1 when this test is marked KILLREQ
                 (curr-time (current-seconds)))

       (if (and (< (+ event-time duration 600) curr-time) (not (launch:is-test-alive host pid))) ;;test has not updated duration in last 10 min then likely its not running but confirm before marking it as killed
           (begin    
			       	(debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed")
              (set! flag 1) 
              (rmt:set-state-status-and-roll-up-items run-id test-name item-path "KILLREQ" "n/a" #f)))
               ;; continue with the next test, or return the final count
               (if (not (null? tal))
				  (loop (car tal) (cdr tal) (+ kill-cnt flag))
                 (+ kill-cnt flag))))))


(define (runs:run-post-hook run-id)
    (let* ((run-post-hook   (configf:lookup *configdat* "runs" "post-hook"))
           (existing-tests (if run-post-hook
                               (rmt:get-tests-for-run run-id "%" '() '() ;; run-id testpatt states statuses
                                                      #f #f ;; offset limit







|
<


|
|
|
|
|
|
|
|
|
|
|
|
>
|
|
|


|
|
|







1930
1931
1932
1933
1934
1935
1936
1937

1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
		      (if (not (null? tal))
			  (loop (car tal) (cdr tal)))))))))))

;; Two-step helper for RUN-ID: first have rmt:find-and-mark-incomplete
;; process the run (OVR-DEADTIME is handed through unchanged), then perform
;; the end-of-run check. Returns whatever runs:end-of-run-check returns.
;;
(define (runs:find-and-mark-incomplete-and-check-end-of-run run-id ovr-deadtime)
  (begin
    (rmt:find-and-mark-incomplete run-id ovr-deadtime)
    (runs:end-of-run-check run-id)))

;; Scan the tests of RUN-ID that are in state RUNNING, LAUNCHED or
;; REMOTEHOSTSTART and request a kill for each one that looks dead.
;;
;; A test is considered dead when BOTH hold:
;;   - event-time + duration + 600 is earlier than the current time, i.e.
;;     its duration has not been updated in the last ten minutes, and
;;   - launch:is-test-alive reports that its top process is gone.
;; Such a test is set to state KILLREQ with status "n/a".
;;
;; Returns the number of tests for which a kill was requested.
;;
;; The caller only invokes this when its own running-test count is above
;; zero, but this function queries the test list again itself, so the list
;; may be empty by the time it is read. In that case 0 is returned instead
;; of failing on the car of an empty list.
;;
(define (launch:kill-tests-if-dead run-id)
  (let loop ((remaining (rmt:get-tests-for-run run-id "%" '("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") '() #f #f #f #f #f #f #f #f))
	     (kill-cnt  0))
    (if (null? remaining)
	kill-cnt
	(let* ((running-test (car remaining))
	       (test-name    (vector-ref running-test 2))
	       (item-path    (vector-ref running-test 11))
	       (test-id      (vector-ref running-test 0))
	       (host         (vector-ref running-test 6))
	       (pid          (rmt:test-get-top-process-pid run-id test-id))
	       (event-time   (vector-ref running-test 5))
	       (duration     (vector-ref running-test 12))
	       (curr-time    (current-seconds))
	       ;; cheap staleness test first; the process is only probed
	       ;; (possibly over ssh) when the test already looks stale
	       (dead?        (and (< (+ event-time duration 600) curr-time)
				  (not (launch:is-test-alive host pid)))))
	  (if dead?
	      (begin
		(debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed")
		(rmt:set-state-status-and-roll-up-items run-id test-name item-path "KILLREQ" "n/a" #f)))
	  (loop (cdr remaining) (if dead? (+ kill-cnt 1) kill-cnt))))))


(define (runs:run-post-hook run-id)
    (let* ((run-post-hook   (configf:lookup *configdat* "runs" "post-hook"))
           (existing-tests (if run-post-hook
                               (rmt:get-tests-for-run run-id "%" '() '() ;; run-id testpatt states statuses
                                                      #f #f ;; offset limit