Megatest

Check-in [e010ede9bd]
Login
Overview
Comment:subrun kill works but suboptimal (serial kill)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | 1.65-subrun-ancilliary-usecases
Files: files | file ages | folders
SHA1: e010ede9bd89034197618e007c786c534e9a0ace
User & Date: bjbarcla on 2017-12-27 16:24:12
Other Links: branch diff | manifest | tags
Context
2017-12-27
18:12
parallelized removal of subruns check-in: a035fad97b user: bjbarcla tags: 1.65-subrun-ancilliary-usecases
16:24
subrun kill works but suboptimal (serial kill) check-in: e010ede9bd user: bjbarcla tags: 1.65-subrun-ancilliary-usecases
2017-12-26
18:07
wip; added hooks for subrun remove-run handling check-in: 05b23944bc user: bjbarcla tags: 1.65-subrun-ancilliary-usecases
Changes

Modified Makefile from [79ddb592c4] to [c37f1ed514].

89
90
91
92
93
94
95

96
97
98
99
100
101
102
	rpc-transport.o \
	runconfig.o \
	runs.o \
	server.o \
	tasks.o \
	tdb.o \
	tests.o \



tcmt : $(TCMTOBJS) tcmt.scm
	csc $(CSCOPTS) $(TCMTOBJS) tcmt.scm -o tcmt

# install documentation to $(PREFIX)/docs
# DOES NOT REBUILD DOCS







>







89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
	rpc-transport.o \
	runconfig.o \
	runs.o \
	server.o \
	tasks.o \
	tdb.o \
	tests.o \
	subrun.o \


tcmt : $(TCMTOBJS) tcmt.scm
	csc $(CSCOPTS) $(TCMTOBJS) tcmt.scm -o tcmt

# install documentation to $(PREFIX)/docs
# DOES NOT REBUILD DOCS

Modified runs.scm from [6aafacc0b0] to [089f325036].

2062
2063
2064
2065
2066
2067
2068
2069

2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088


2089


2090





2091


2092
2093
2094
2095
2096
2097
2098
2099
			     (if (not (null? tal))
				 (loop (car tal)(cdr tal))))
			   (let* ((item-path     (db:test-get-item-path new-test-dat))
				  (test-name     (db:test-get-testname new-test-dat))
				  (run-dir       ;;(filedb:get-path *fdb*
				   ;; (rmt:sdb-qry 'getid 
				   (db:test-get-rundir new-test-dat)) ;; )    ;; run dir is from the link tree
                                  (has-subrun   (subrun:subrun-test-initialized? run-dir))

				  (test-state    (db:test-get-state new-test-dat))
				  (test-fulln    (db:test-get-fullname new-test-dat))
				  (uname         (db:test-get-uname    new-test-dat))
				  (toplevel-with-children (and (db:test-get-is-toplevel test)
							       (> (rmt:test-toplevel-num-items run-id test-name) 0))))
			     (case action
			       ((remove-runs)
				;; if the test is a toplevel-with-children issue an error and do not remove
				(cond
                                 (toplevel-with-children
                                  (debug:print 0 *default-log-port* "WARNING: skipping removal of " test-fulln " with run-id " run-id " as it has sub tests")
                                  (hash-table-set! toplevel-retries test-fulln (+ (hash-table-ref/default toplevel-retries test-fulln 0) 1))
                                  (if (> (hash-table-ref toplevel-retries test-fulln) 3)
                                      (if (not (null? tal))
                                          (loop (car tal)(cdr tal))) ;; no else clause - drop it if no more in queue and > 3 tries
                                      (let ((newtal (append tal (list test))))
                                        (loop (car newtal)(cdr newtal))))) ;; loop with test still in queue
                                 (has-subrun
                                  ;; BB TODO - manage toplevasel-retries hash and retries in general


                                  (subrun:remove-subrun test-run-dir new-test-dat test-name item-path test-state test-fulln toplevel-with-children test)


                                  








                                  )
                                 (else
                                  (debug:print-info 0 *default-log-port* "test: " test-name " itest-state: " test-state)
                                  (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
                                      (begin
                                        (if (not (hash-table-ref/default test-retry-time test-fulln #f))
                                            (begin
                                              ;; want to set to REMOVING BUT CANNOT do it here?







|
>



















>
>
|
>
>
|
>
>
>
>
>
|
>
>
|







2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
			     (if (not (null? tal))
				 (loop (car tal)(cdr tal))))
			   (let* ((item-path     (db:test-get-item-path new-test-dat))
				  (test-name     (db:test-get-testname new-test-dat))
				  (run-dir       ;;(filedb:get-path *fdb*
				   ;; (rmt:sdb-qry 'getid 
				   (db:test-get-rundir new-test-dat)) ;; )    ;; run dir is from the link tree
                                  (has-subrun    (and (subrun:subrun-test-initialized? run-dir)
                                                      (not (subrun:subrun-removed? run-dir))))
				  (test-state    (db:test-get-state new-test-dat))
				  (test-fulln    (db:test-get-fullname new-test-dat))
				  (uname         (db:test-get-uname    new-test-dat))
				  (toplevel-with-children (and (db:test-get-is-toplevel test)
							       (> (rmt:test-toplevel-num-items run-id test-name) 0))))
			     (case action
			       ((remove-runs)
				;; if the test is a toplevel-with-children issue an error and do not remove
				(cond
                                 (toplevel-with-children
                                  (debug:print 0 *default-log-port* "WARNING: skipping removal of " test-fulln " with run-id " run-id " as it has sub tests")
                                  (hash-table-set! toplevel-retries test-fulln (+ (hash-table-ref/default toplevel-retries test-fulln 0) 1))
                                  (if (> (hash-table-ref toplevel-retries test-fulln) 3)
                                      (if (not (null? tal))
                                          (loop (car tal)(cdr tal))) ;; no else clause - drop it if no more in queue and > 3 tries
                                      (let ((newtal (append tal (list test))))
                                        (loop (car newtal)(cdr newtal))))) ;; loop with test still in queue
                                 (has-subrun
                                  ;; BB TODO - manage toplevasel-retries hash and retries in general
                                  (debug:print 0 *default-log-port* "WARNING: postponing removal of " test-fulln " with run-id " run-id " as it has a subrun")
                                  (let* ((subrun-remove-succeeded
                                          (subrun:remove-subrun run-dir new-test-dat test-name item-path test-state test-fulln toplevel-with-children test)))
                                    (cond
                                     (subrun-remove-succeeded
                                      
                                      (debug:print 0 *default-log-port* "Now removing of " test-fulln " with run-id " run-id " as it has a subrun")
                                      (runs:remove-test-directory new-test-dat mode))
                                     (else
                                      (let* ((logfile (subrun:get-log-path run-dir "remove")))
                                        (debug:print 0 *default-log-port* "WARNING: removal of subrun failed.  Please check "logfile" for details."))))
                                    
                                  (if (not (null? tal))
                                            (loop (car tal)(cdr tal)))))

                                 (else
                                  (debug:print-info 0 *default-log-port* "test: " test-name " itest-state: " test-state)
                                  (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
                                      (begin
                                        (if (not (hash-table-ref/default test-retry-time test-fulln #f))
                                            (begin
                                              ;; want to set to REMOVING BUT CANNOT do it here?

Modified subrun.scm from [4e6779a4e5] to [9da03e90bf].

1
2
3
4
5
6
7
8
9
10
11
12
13
14

15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38














39
40
41
42
43
44
45

;; Copyright 2006-2016, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.

;;  strftime('%m/%d/%Y %H:%M:%S','now','localtime')

(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) 
     posix-extras directory-utils pathname-expand typed-records format)

(declare (unit subrun))
;;(declare (uses runs))
;;(declare (uses db))
(declare (uses common))
;;(declare (uses items))
;;(declare (uses runconfig))
;;(declare (uses tests))
;;(declare (uses server))
;;(declare (uses mt))
;;(declare (uses archive))
;; (declare (uses filedb))

;(include "common_records.scm")
;;(include "key_records.scm")
;;(include "db_records.scm")
;;(include "run_records.scm")
;;(include "test_records.scm")

(define (subrun:subrun-test-initialized? test-run-dir)
  (if (and (common:file-exists? (conc test-run-dir "/subrun-area") )
           (common:file-exists? (conc test-run-dir "/testconfig.subrun") ))
      #t
      #f))















(define (subrun:testconfig-defines-subrun? testconfig)
  (configf:lookup testconfig "subrun" "runwait")) ;; we use runwait as the flag that a subrun is requested

(define (subrun:initialize-toprun-test  testconfig test-run-dir)

  (let ((ra (configf:lookup testconfig "subrun" "run-area"))
        (logpro (configf:lookup testconfig "subrun" "logpro"))













|
>


|





|





|









>
>
>
>
>
>
>
>
>
>
>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60

;; Copyright 2006-2016, Matthew Welland.
;; 
;;  This program is made available under the GNU GPL version 2.0 or
;;  greater. See the accompanying file COPYING for details.
;; 
;;  This program is distributed WITHOUT ANY WARRANTY; without even the
;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
;;  PURPOSE.

;;  strftime('%m/%d/%Y %H:%M:%S','now','localtime')

(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) 
     posix-extras directory-utils pathname-expand typed-records format
     call-with-environment-variables)
(declare (unit subrun))
;;(declare (uses runs))
(declare (uses db))
(declare (uses common))
;;(declare (uses items))
;;(declare (uses runconfig))
;;(declare (uses tests))
;;(declare (uses server))
(declare (uses mt))
;;(declare (uses archive))
;; (declare (uses filedb))

;(include "common_records.scm")
;;(include "key_records.scm")
(include "db_records.scm") ;; provides db:test-get-id
;;(include "run_records.scm")
;;(include "test_records.scm")

(define (subrun:subrun-test-initialized? test-run-dir)
  (if (and (common:file-exists? (conc test-run-dir "/subrun-area") )
           (common:file-exists? (conc test-run-dir "/testconfig.subrun") ))
      #t
      #f))

(define (subrun:subrun-removed? test-run-dir)
  (if (subrun:subrun-test-initialized? test-run-dir)
      (let ((flagfile (conc test-run-dir "/subrun.removed")))
        (if (common:file-exists? flagfile)
            #t
            #f))
      #t))

(define (subrun:set-subrun-removed test-run-dir)
  (let ((flagfile (conc test-run-dir "/subrun.removed")))
    (if (and (subrun:subrun-test-initialized? test-run-dir) (not (common:file-exists? flagfile)))
        (with-output-to-file flagfile
          (lambda () (print (current-seconds)))))))

(define (subrun:testconfig-defines-subrun? testconfig)
  (configf:lookup testconfig "subrun" "runwait")) ;; we use runwait as the flag that a subrun is requested

(define (subrun:initialize-toprun-test  testconfig test-run-dir)

  (let ((ra (configf:lookup testconfig "subrun" "run-area"))
        (logpro (configf:lookup testconfig "subrun" "logpro"))
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84

85
86
87
88
89
90
91
92
93
94


(define (subrun:remove-subrun test-run-dir new-test-dat test-name item-path test-state test-fulln toplevel-with-children test)
;; set state/status of test item
;; fork off megatest
;; set state/status of test item
;;

  (let* ((subrun-alist (subrun:selector+log-alist test-run-dir log-prefix))
         (runlog       (alist-ref "-log" subrun-alist equal? #f)))
    (if (not (common:file-exists? runlog))
        (BB> "no runlog @ "runlog)
        (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
            ;; This test is not in a correct state for cleaning up. Let's try some graceful shutdown steps first
            ;; Set the test to "KILLREQ" and wait five seconds then try again. Repeat up to five times then give
            ;; up and blow it away.
            
            ;; call in submegatest:
            ;;  (tasks:kill-runner target run-name testpatt)
            
            (mt:test-set-state-status-by-id run-id (db:test-get-id test) "SUBRUN-KILLREQ" "n/a" #f)
            )

        ;; on success:
        ;;   set state of test, or delete it or whatever

        )
    )
  )

(define (subrun:launch-cmd test-run-dir)
  (let* ((log-prefix "run")
         (switches (subrun:selector+log-switches test-run-dir log-prefix))
         (run-wait #t)
         (cmd      (conc "megatest -run "switches" "
                         (if run-wait "-run-wait " ""))))







|
|
<
<
<
<
<
<
<
|
|
<
|
<
<
|
<
|
>
|
|
<







75
76
77
78
79
80
81
82
83







84
85

86


87

88
89
90
91

92
93
94
95
96
97
98


(define (subrun:remove-subrun test-run-dir new-test-dat test-name item-path test-state test-fulln toplevel-with-children test)
;; set state/status of test item
;; fork off megatest
;; set state/status of test item
;;
  (BB> "Entered subrun:remove-subrun with "test-fulln)
  (if (and (not (subrun:subrun-removed? test-run-dir)) (subrun:subrun-test-initialized? test-run-dir))







      (let* ((remove-result
              (subrun:exec-sub-megatest test-run-dir "-remove-runs" "remove")))

        (if remove-result


            (begin

              (subrun:set-subrun-removed test-run-dir)
              #t)
            #f))
      #t))


(define (subrun:launch-cmd test-run-dir)
  (let* ((log-prefix "run")
         (switches (subrun:selector+log-switches test-run-dir log-prefix))
         (run-wait #t)
         (cmd      (conc "megatest -run "switches" "
                         (if run-wait "-run-wait " ""))))
167
168
169
170
171
172
173
174
175
176
177
178
179



180

181
182
183
184
185











186
187
188
189

            (map
             (lambda (x)
               (list (car x) (cdr x)))
             switch-alist))
           " ")))
    res))

(define (subrun:exec-sub-megatest test-run-dir switches #!key (logfile #f))
  (let* ((real-logfile (or logfile (conc (test-run-dir) "/subrun-"
                                         (string-substitute "[/*]" "_" (string-intersperse switches "^"))"-"
                                         (number->string (current-seconds)) ".log")))
         (selector-switches  (common:sub-megatest-selector-switches test-run-dir))
         (cmd-list `("megatest" ,@selector-switches ,@switches "-log" ,real-logfile))



         )

    (call-with-environment-variables 
     (list (cons "PATH" (conc (get-environment-variable "PATH") ":.")))
     (lambda  ()
       (common:without-vars proc "^MT_.*")
       











       ))))
                             










|
<
<
<
|
|
>
>
>
|
>



|
|
>
>
>
>
>
>
>
>
>
>
>
|
|


>
171
172
173
174
175
176
177
178



179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
            (map
             (lambda (x)
               (list (car x) (cdr x)))
             switch-alist))
           " ")))
    res))

(define (subrun:exec-sub-megatest test-run-dir action-switches-str log-prefix)



  (let* ((selector-switches  (subrun:selector+log-switches test-run-dir log-prefix))
         (cmd (conc "megatest " selector-switches " " action-switches-str ))
         (pid #f)
         (proc (lambda ()
                 (debug:print-info 0 *default-log-port* "Running sub megatest command: "cmd)
                 ;;(set! pid (process-run "/usr/bin/xterm" (list ))))))
                 (set! pid (process-run "/bin/bash" (list "-c" cmd))))))
    (call-with-environment-variables 
     (list (cons "PATH" (conc (get-environment-variable "PATH") ":.")))
     (lambda  ()
       (common:without-vars proc "^MT_.*")))
    (let processloop ((i 0))
      (let-values (((pid-val exit-status exit-code)(process-wait pid #t)))
        (if (eq? pid-val 0)
            (begin
              (thread-sleep! 2)
              (processloop (+ i 1)))
            (begin
              (debug:print-info 0 *default-log-port* "sub megatest " action-switches-str " completed with exit code " exit-code)
              (if (eq? 0 exit-code)
                  (begin
                    #t)
                  (begin
                    #f))))))))



;; (subrun:exec-sub-megatest "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/165/megatest/ext-tests/tests/subrun-usecases/toparea/links/SYSTEM_val/RELEASE_val/go/toptest" "-foo" "foo")