Megatest

Check-in [e010ede9bd]
Login
Overview
Comment:subrun kill works but suboptimal (serial kill)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | 1.65-subrun-ancilliary-usecases
Files: files | file ages | folders
SHA1: e010ede9bd89034197618e007c786c534e9a0ace
User & Date: bjbarcla on 2017-12-27 16:24:12
Other Links: branch diff | manifest | tags
Context
2017-12-27
18:12
parallelized removal of subruns check-in: a035fad97b user: bjbarcla tags: 1.65-subrun-ancilliary-usecases
16:24
subrun kill works but suboptimal (serial kill) check-in: e010ede9bd user: bjbarcla tags: 1.65-subrun-ancilliary-usecases
2017-12-26
18:07
wip; added hooks for subrun remove-run handling check-in: 05b23944bc user: bjbarcla tags: 1.65-subrun-ancilliary-usecases
Changes

Modified Makefile from [79ddb592c4] to [c37f1ed514].

89
90
91
92
93
94
95

96
97
98
99
100
101
102
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103







+







	rpc-transport.o \
	runconfig.o \
	runs.o \
	server.o \
	tasks.o \
	tdb.o \
	tests.o \
	subrun.o \


tcmt : $(TCMTOBJS) tcmt.scm
	csc $(CSCOPTS) $(TCMTOBJS) tcmt.scm -o tcmt

# install documentation to $(PREFIX)/docs
# DOES NOT REBUILD DOCS

Modified runs.scm from [6aafacc0b0] to [089f325036].

2062
2063
2064
2065
2066
2067
2068
2069


2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088


2089
2090
2091
2092













2093
2094
2095
2096
2097
2098
2099
2062
2063
2064
2065
2066
2067
2068

2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091




2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111







-
+
+



















+
+
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+







			     (if (not (null? tal))
				 (loop (car tal)(cdr tal))))
			   (let* ((item-path     (db:test-get-item-path new-test-dat))
				  (test-name     (db:test-get-testname new-test-dat))
				  (run-dir       ;;(filedb:get-path *fdb*
				   ;; (rmt:sdb-qry 'getid 
				   (db:test-get-rundir new-test-dat)) ;; )    ;; run dir is from the link tree
                                  (has-subrun   (subrun:subrun-test-initialized? run-dir))
                                  (has-subrun    (and (subrun:subrun-test-initialized? run-dir)
                                                      (not (subrun:subrun-removed? run-dir))))
				  (test-state    (db:test-get-state new-test-dat))
				  (test-fulln    (db:test-get-fullname new-test-dat))
				  (uname         (db:test-get-uname    new-test-dat))
				  (toplevel-with-children (and (db:test-get-is-toplevel test)
							       (> (rmt:test-toplevel-num-items run-id test-name) 0))))
			     (case action
			       ((remove-runs)
				;; if the test is a toplevel-with-children issue an error and do not remove
				(cond
                                 (toplevel-with-children
                                  (debug:print 0 *default-log-port* "WARNING: skipping removal of " test-fulln " with run-id " run-id " as it has sub tests")
                                  (hash-table-set! toplevel-retries test-fulln (+ (hash-table-ref/default toplevel-retries test-fulln 0) 1))
                                  (if (> (hash-table-ref toplevel-retries test-fulln) 3)
                                      (if (not (null? tal))
                                          (loop (car tal)(cdr tal))) ;; no else clause - drop it if no more in queue and > 3 tries
                                      (let ((newtal (append tal (list test))))
                                        (loop (car newtal)(cdr newtal))))) ;; loop with test still in queue
                                 (has-subrun
                                  ;; BB TODO - manage toplevasel-retries hash and retries in general
                                  (debug:print 0 *default-log-port* "WARNING: postponing removal of " test-fulln " with run-id " run-id " as it has a subrun")
                                  (let* ((subrun-remove-succeeded
                                  (subrun:remove-subrun test-run-dir new-test-dat test-name item-path test-state test-fulln toplevel-with-children test)
                                  

                                  )
                                          (subrun:remove-subrun run-dir new-test-dat test-name item-path test-state test-fulln toplevel-with-children test)))
                                    (cond
                                     (subrun-remove-succeeded
                                      
                                      (debug:print 0 *default-log-port* "Now removing of " test-fulln " with run-id " run-id " as it has a subrun")
                                      (runs:remove-test-directory new-test-dat mode))
                                     (else
                                      (let* ((logfile (subrun:get-log-path run-dir "remove")))
                                        (debug:print 0 *default-log-port* "WARNING: removal of subrun failed.  Please check "logfile" for details."))))
                                    
                                  (if (not (null? tal))
                                            (loop (car tal)(cdr tal)))))

                                 (else
                                  (debug:print-info 0 *default-log-port* "test: " test-name " itest-state: " test-state)
                                  (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
                                      (begin
                                        (if (not (hash-table-ref/default test-retry-time test-fulln #f))
                                            (begin
                                              ;; want to set to REMOVING BUT CANNOT do it here?

Modified subrun.scm from [4e6779a4e5] to [9da03e90bf].

1
2
3
4
5
6
7
8
9
10
11
12
13
14


15
16
17

18
19
20
21
22
23

24
25
26
27
28
29

30
31
32
33
34
35
36
37
38














39
40
41
42
43
44
45
1
2
3
4
5
6
7
8
9
10
11
12
13

14
15
16
17

18
19
20
21
22
23

24
25
26
27
28
29

30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60













-
+
+


-
+





-
+





-
+









+
+
+
+
+
+
+
+
+
+
+
+
+
+








;; Copyright 2006-2016, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.

;;  strftime('%m/%d/%Y %H:%M:%S','now','localtime')

(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) 
     posix-extras directory-utils pathname-expand typed-records format)
     posix-extras directory-utils pathname-expand typed-records format
     call-with-environment-variables)
(declare (unit subrun))
;;(declare (uses runs))
;;(declare (uses db))
(declare (uses db))
(declare (uses common))
;;(declare (uses items))
;;(declare (uses runconfig))
;;(declare (uses tests))
;;(declare (uses server))
;;(declare (uses mt))
(declare (uses mt))
;;(declare (uses archive))
;; (declare (uses filedb))

;(include "common_records.scm")
;;(include "key_records.scm")
;;(include "db_records.scm")
(include "db_records.scm") ;; provides db:test-get-id
;;(include "run_records.scm")
;;(include "test_records.scm")

(define (subrun:subrun-test-initialized? test-run-dir)
  (if (and (common:file-exists? (conc test-run-dir "/subrun-area") )
           (common:file-exists? (conc test-run-dir "/testconfig.subrun") ))
      #t
      #f))

(define (subrun:subrun-removed? test-run-dir)
  (if (subrun:subrun-test-initialized? test-run-dir)
      (let ((flagfile (conc test-run-dir "/subrun.removed")))
        (if (common:file-exists? flagfile)
            #t
            #f))
      #t))

(define (subrun:set-subrun-removed test-run-dir)
  (let ((flagfile (conc test-run-dir "/subrun.removed")))
    (if (and (subrun:subrun-test-initialized? test-run-dir) (not (common:file-exists? flagfile)))
        (with-output-to-file flagfile
          (lambda () (print (current-seconds)))))))

(define (subrun:testconfig-defines-subrun? testconfig)
  (configf:lookup testconfig "subrun" "runwait")) ;; we use runwait as the flag that a subrun is requested

(define (subrun:initialize-toprun-test  testconfig test-run-dir)

  (let ((ra (configf:lookup testconfig "subrun" "run-area"))
        (logpro (configf:lookup testconfig "subrun" "logpro"))
60
61
62
63
64
65
66
67
68


69
70
71
72
73
74
75
76
77


78
79

80
81
82

83
84
85
86




87
88
89
90
91
92
93
94
75
76
77
78
79
80
81


82
83









84
85


86



87




88
89
90
91

92
93
94
95
96
97
98







-
-
+
+
-
-
-
-
-
-
-
-
-
+
+
-
-
+
-
-
-
+
-
-
-
-
+
+
+
+
-









(define (subrun:remove-subrun test-run-dir new-test-dat test-name item-path test-state test-fulln toplevel-with-children test)
;; set state/status of test item
;; fork off megatest
;; set state/status of test item
;;

  (let* ((subrun-alist (subrun:selector+log-alist test-run-dir log-prefix))
  (BB> "Entered subrun:remove-subrun with "test-fulln)
  (if (and (not (subrun:subrun-removed? test-run-dir)) (subrun:subrun-test-initialized? test-run-dir))
         (runlog       (alist-ref "-log" subrun-alist equal? #f)))
    (if (not (common:file-exists? runlog))
        (BB> "no runlog @ "runlog)
        (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
            ;; This test is not in a correct state for cleaning up. Let's try some graceful shutdown steps first
            ;; Set the test to "KILLREQ" and wait five seconds then try again. Repeat up to five times then give
            ;; up and blow it away.
            
            ;; call in submegatest:
      (let* ((remove-result
              (subrun:exec-sub-megatest test-run-dir "-remove-runs" "remove")))
            ;;  (tasks:kill-runner target run-name testpatt)
            
        (if remove-result
            (mt:test-set-state-status-by-id run-id (db:test-get-id test) "SUBRUN-KILLREQ" "n/a" #f)
            )

            (begin
        ;; on success:
        ;;   set state of test, or delete it or whatever
        )
    )
              (subrun:set-subrun-removed test-run-dir)
              #t)
            #f))
      #t))
  )

(define (subrun:launch-cmd test-run-dir)
  (let* ((log-prefix "run")
         (switches (subrun:selector+log-switches test-run-dir log-prefix))
         (run-wait #t)
         (cmd      (conc "megatest -run "switches" "
                         (if run-wait "-run-wait " ""))))
167
168
169
170
171
172
173
174

175
176
177
178
179
180







181
182
183
184
185
186
187















188
189

171
172
173
174
175
176
177

178






179
180
181
182
183
184
185
186
187
188




189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206







-
+
-
-
-
-
-
-
+
+
+
+
+
+
+



-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+


+
            (map
             (lambda (x)
               (list (car x) (cdr x)))
             switch-alist))
           " ")))
    res))

(define (subrun:exec-sub-megatest test-run-dir switches #!key (logfile #f))
(define (subrun:exec-sub-megatest test-run-dir action-switches-str log-prefix)
  (let* ((real-logfile (or logfile (conc (test-run-dir) "/subrun-"
                                         (string-substitute "[/*]" "_" (string-intersperse switches "^"))"-"
                                         (number->string (current-seconds)) ".log")))
         (selector-switches  (common:sub-megatest-selector-switches test-run-dir))
         (cmd-list `("megatest" ,@selector-switches ,@switches "-log" ,real-logfile))
         )
  (let* ((selector-switches  (subrun:selector+log-switches test-run-dir log-prefix))
         (cmd (conc "megatest " selector-switches " " action-switches-str ))
         (pid #f)
         (proc (lambda ()
                 (debug:print-info 0 *default-log-port* "Running sub megatest command: "cmd)
                 ;;(set! pid (process-run "/usr/bin/xterm" (list ))))))
                 (set! pid (process-run "/bin/bash" (list "-c" cmd))))))
    (call-with-environment-variables 
     (list (cons "PATH" (conc (get-environment-variable "PATH") ":.")))
     (lambda  ()
       (common:without-vars proc "^MT_.*")
       
       ))))
                             
       (common:without-vars proc "^MT_.*")))
    (let processloop ((i 0))
      (let-values (((pid-val exit-status exit-code)(process-wait pid #t)))
        (if (eq? pid-val 0)
            (begin
              (thread-sleep! 2)
              (processloop (+ i 1)))
            (begin
              (debug:print-info 0 *default-log-port* "sub megatest " action-switches-str " completed with exit code " exit-code)
              (if (eq? 0 exit-code)
                  (begin
                    #t)
                  (begin
                    #f))))))))



;; (subrun:exec-sub-megatest "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/165/megatest/ext-tests/tests/subrun-usecases/toparea/links/SYSTEM_val/RELEASE_val/go/toptest" "-foo" "foo")