Megatest

Check-in [0b313f7218]
Login
Overview
Comment:Somewhat speculative and partial fix for the non updating state/status on test run with transport=fs
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.55
Files: files | file ages | folders
SHA1: 0b313f721805c58236ae52057afc6e0b6a82d393
User & Date: mrwellan on 2013-09-13 18:17:39
Other Links: branch diff | manifest | tags
Context
2013-09-13
22:56
Join monitor thread in launch check-in: c84bb1b895 user: matt tags: v1.55
18:17
Somewhat speculative and partial fix for the non updating state/status on test run with transport=fs check-in: 0b313f7218 user: mrwellan tags: v1.55
2013-09-12
23:42
Fixed typo check-in: cbc9328c04 user: matt tags: v1.55
Changes

Modified launch.scm from [9aa0c0384c] to [680b277def].

134
135
136
137
138
139
140

141



142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160









161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
	  ;; environment overrides are done *before* the remaining critical envars.
	  (alist->env-vars env-ovrd)
	  (set-megatest-env-vars run-id)
	  (set-item-env-vars itemdat)
	  (save-environment-as-files "megatest")
	  ;; open-run-close not needed for test-set-meta-info
	  (tests:set-full-meta-info #f test-id run-id 0 work-area)

	  (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f)



	  (if (args:get-arg "-xterm")
	      (set! fullrunscript "xterm")
	      (if (and fullrunscript (not (file-execute-access? fullrunscript)))
		  (system (conc "chmod ug+x " fullrunscript))))
	  ;; We are about to actually kick off the test
	  ;; so this is a good place to remove the records for 
	  ;; any previous runs
	  ;; (db:test-remove-steps db run-id testname itemdat)
	  
	  (let* ((m            (make-mutex))
		 (kill-job?    #f)
		 (exit-info    (vector #t #t #t))
		 (job-thread   #f)
		 (keep-going   #t)
		 (runit        (lambda ()
				 ;; (let-values
				 ;;  (((pid exit-status exit-code)
				 ;;    (run-n-wait fullrunscript)))
				 (tests:test-set-status! test-id "RUNNING" "n/a" #f #f)









				 ;; if there is a runscript do it first
				 (if fullrunscript
				     (let ((pid (process-run fullrunscript)))
				       (let loop ((i 0))
					 (let-values
					  (((pid-val exit-status exit-code) (process-wait pid #t)))
					  (mutex-lock! m)
					  (vector-set! exit-info 0 pid)
					  (vector-set! exit-info 1 exit-status)
					  (vector-set! exit-info 2 exit-code)
					  (set! rollup-status exit-code) 
					  (mutex-unlock! m)
					  (if (eq? pid-val 0)
					      (begin
						(thread-sleep! 1)
						(loop (+ i 1)))
					      )))))
				 ;; then, if runscript ran ok (or did not get called)
				 ;; do all the ezsteps (if any)
				 (if ezsteps
				     (let* ((testconfig (read-config (conc work-area "/testconfig") #f #t environ-patt: "pre-launch-env-vars")) ;; FIXME??? is allow-system ok here?
					    (ezstepslst (hash-table-ref/default testconfig "ezsteps" '())))







>
|
>
>
>


















|
>
>
>
>
>
>
>
>
>














|







134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
	  ;; environment overrides are done *before* the remaining critical envars.
	  (alist->env-vars env-ovrd)
	  (set-megatest-env-vars run-id)
	  (set-item-env-vars itemdat)
	  (save-environment-as-files "megatest")
	  ;; open-run-close not needed for test-set-meta-info
	  (tests:set-full-meta-info #f test-id run-id 0 work-area)

	  ;; (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f)
	  (tests:test-force-state-status! test-id "REMOTEHOSTSTART" "n/a")
	  (thread-sleep! 0.3) ;; NFS slowness has caused grief here

	  (if (args:get-arg "-xterm")
	      (set! fullrunscript "xterm")
	      (if (and fullrunscript (not (file-execute-access? fullrunscript)))
		  (system (conc "chmod ug+x " fullrunscript))))
	  ;; We are about to actually kick off the test
	  ;; so this is a good place to remove the records for 
	  ;; any previous runs
	  ;; (db:test-remove-steps db run-id testname itemdat)
	  
	  (let* ((m            (make-mutex))
		 (kill-job?    #f)
		 (exit-info    (vector #t #t #t))
		 (job-thread   #f)
		 (keep-going   #t)
		 (runit        (lambda ()
				 ;; (let-values
				 ;;  (((pid exit-status exit-code)
				 ;;    (run-n-wait fullrunscript)))
				 ;; (tests:test-set-status! test-id "RUNNING" "n/a" #f #f)
				 ;; Since we should have a clean slate at this time there is no need to do 
				 ;; any of the other stuff that tests:test-set-status! does. Let's just 
				 ;; force RUNNING/n/a
				 

				 (thread-sleep! 0.3)
				 (tests:test-force-state-status! test-id "RUNNING" "n/a")
				 (thread-sleep! 0.3) ;; NFS slowness has caused grief here

				 ;; if there is a runscript do it first
				 (if fullrunscript
				     (let ((pid (process-run fullrunscript)))
				       (let loop ((i 0))
					 (let-values
					  (((pid-val exit-status exit-code) (process-wait pid #t)))
					  (mutex-lock! m)
					  (vector-set! exit-info 0 pid)
					  (vector-set! exit-info 1 exit-status)
					  (vector-set! exit-info 2 exit-code)
					  (set! rollup-status exit-code) 
					  (mutex-unlock! m)
					  (if (eq? pid-val 0)
					      (begin
						(thread-sleep! 2)
						(loop (+ i 1)))
					      )))))
				 ;; then, if runscript ran ok (or did not get called)
				 ;; do all the ezsteps (if any)
				 (if ezsteps
				     (let* ((testconfig (read-config (conc work-area "/testconfig") #f #t environ-patt: "pre-launch-env-vars")) ;; FIXME??? is allow-system ok here?
					    (ezstepslst (hash-table-ref/default testconfig "ezsteps" '())))
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
								   (mutex-lock! m)
								   (vector-set! exit-info 0 pid)
								   (vector-set! exit-info 1 exit-status)
								   (vector-set! exit-info 2 exit-code)
								   (mutex-unlock! m)
								   (if (eq? pid-val 0)
								       (begin
									 (thread-sleep! 1)
									 (processloop (+ i 1))))
								   ))
                                                     (let ((exinfo (vector-ref exit-info 2))
                                                           (logfna (if logpro-used (conc stepname ".html") "")))
						       ;; testing if procedures called in a remote call cause problems (ans: no or so I suspect)
						       (db:teststep-set-status! #f test-id stepname "end" exinfo #f logfna work-area: work-area))
						     (if logpro-used







|







235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
								   (mutex-lock! m)
								   (vector-set! exit-info 0 pid)
								   (vector-set! exit-info 1 exit-status)
								   (vector-set! exit-info 2 exit-code)
								   (mutex-unlock! m)
								   (if (eq? pid-val 0)
								       (begin
									 (thread-sleep! 2)
									 (processloop (+ i 1))))
								   ))
                                                     (let ((exinfo (vector-ref exit-info 2))
                                                           (logfna (if logpro-used (conc stepname ".html") "")))
						       ;; testing if procedures called in a remote call cause problems (ans: no or so I suspect)
						       (db:teststep-set-status! #f test-id stepname "end" exinfo #f logfna work-area: work-area))
						     (if logpro-used
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
						     (sqlite3:finalize! tdb)
						     (exit 1))))
					     (set! kill-tries (+ 1 kill-tries))
					     (mutex-unlock! m)))
				       ;; (sqlite3:finalize! db)
				       (if keep-going
					   (begin
					     (thread-sleep! (+ 10 (random 10))) ;; add some jitter to the call home time to spread out the db accesses
					     (if keep-going
						 (loop (calc-minutes)))))))))) ;; NOTE: Checking twice for keep-going is intentional
		 (th1          (make-thread monitorjob "monitor job"))
		 (th2          (make-thread runit "run job")))
	    (set! job-thread th2)
	    (thread-start! th1)
	    (thread-start! th2)
	    (thread-join! th2)
	    (set! keep-going #f)
	    (thread-sleep! 1)
	    (thread-terminate! th1) ;; Not sure if this is a good idea
	    (thread-sleep! 0.1) ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec.
	    (mutex-lock! m)
	    (let* ((item-path (item-list->path itemdat))
		   (testinfo  (cdb:get-test-info-by-id *runremote* test-id))) ;; )) ;; run-id test-name item-path)))
	      ;; Am I completed?
	      (if (equal? (db:test-get-state testinfo) "RUNNING") ;; (not (equal? (db:test-get-state testinfo) "COMPLETED"))
		  (let ((new-state  (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status
				                                        ;; "COMPLETED"
							                ;; (db:test-get-state testinfo)))   ;; else preseve the state as set within the test
				    )
			(new-status (cond
				     ((not (vector-ref exit-info 1)) "FAIL") ;; job failed to run
				     ((eq? rollup-status 0)







|










|
|




|







341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
						     (sqlite3:finalize! tdb)
						     (exit 1))))
					     (set! kill-tries (+ 1 kill-tries))
					     (mutex-unlock! m)))
				       ;; (sqlite3:finalize! db)
				       (if keep-going
					   (begin
					     (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
					     (if keep-going
						 (loop (calc-minutes)))))))))) ;; NOTE: Checking twice for keep-going is intentional
		 (th1          (make-thread monitorjob "monitor job"))
		 (th2          (make-thread runit "run job")))
	    (set! job-thread th2)
	    (thread-start! th1)
	    (thread-start! th2)
	    (thread-join! th2)
	    (set! keep-going #f)
	    (thread-sleep! 1)
	    ;; (thread-terminate! th1) ;; Not sure if this is a good idea
	    (thread-sleep! 1)       ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec.
	    (mutex-lock! m)
	    (let* ((item-path (item-list->path itemdat))
		   (testinfo  (cdb:get-test-info-by-id *runremote* test-id))) ;; )) ;; run-id test-name item-path)))
	      ;; Am I completed?
	      (if (member (db:test-get-state testinfo) '("REMOTEHOSTSTART" "RUNNING")) ;; NOTE: It should *not* be REMOTEHOSTSTART but for reasons I don't yet understand it sometimes gets stuck in that state ;; (not (equal? (db:test-get-state testinfo) "COMPLETED"))
		  (let ((new-state  (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status
				                                        ;; "COMPLETED"
							                ;; (db:test-get-state testinfo)))   ;; else preseve the state as set within the test
				    )
			(new-status (cond
				     ((not (vector-ref exit-info 1)) "FAIL") ;; job failed to run
				     ((eq? rollup-status 0)

Modified megatest.scm from [ab8143b2ea] to [2bdb2f5097].

32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63

(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "run_records.scm")
(include "megatest-fossil-hash.scm")

;; Overall exit handling setup immediately
;;
(let ((original-exit (exit-handler)))
  (exit-handler (lambda (#!optional (exit-code 0))
		  (printf "Preparing to exit with exit code ~A ...\n" exit-code)
		  (children
		   (lambda (pid)
		     (handle-exceptions
		      exn
		      #t
		      (let-values (((pid-val exit-status exit-code) (process-wait pid #t)))
				  (if (or (eq? pid-val pid)
					  (eq? pid-val 0))
				      (begin
					(printf "Sending signal/term to ~A\n" pid)
					(process-signal pid signal/term)))))))
		  (original-exit exit-code))))

(let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc")))
  (if (file-exists? debugcontrolf)
      (load debugcontrolf)))


(define help (conc "
Megatest, documentation at http://www.kiatoa.com/fossils/megatest







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







32
33
34
35
36
37
38


















39
40
41
42
43
44
45

(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "run_records.scm")
(include "megatest-fossil-hash.scm")



















(let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc")))
  (if (file-exists? debugcontrolf)
      (load debugcontrolf)))


(define help (conc "
Megatest, documentation at http://www.kiatoa.com/fossils/megatest
268
269
270
271
272
273
274
























275
276
277
278
279
280
281

(if (args:get-arg "-version")
    (begin
      (print megatest-version)
      (exit)))

(define *didsomething* #f)

























;; Force default transport to fs
;; (if ;; (and (or (args:get-arg "-list-targets")
;;     ;;          (args:get-arg "-list-db-targets"))
;;  (not (args:get-arg "-transport"))
;;  (hash-table-set! args:arg-hash "-transport" "fs"))








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287

(if (args:get-arg "-version")
    (begin
      (print megatest-version)
      (exit)))

(define *didsomething* #f)

;; Overall exit handling setup immediately
;;
(if (or (args:get-arg "-process-reap"))
        ;; (args:get-arg "-runtests")
	;; (args:get-arg "-execute")
	;; (args:get-arg "-remove-runs")
	;; (args:get-arg "-runstep"))
    (let ((original-exit (exit-handler)))
      (exit-handler (lambda (#!optional (exit-code 0))
		      (printf "Preparing to exit with exit code ~A ...\n" exit-code)
		      (for-each 
		       (lambda (pid)
			 (handle-exceptions
			  exn
			  #t
			  (let-values (((pid-val exit-status exit-code) (process-wait pid #t)))
				      (if (or (eq? pid-val pid)
					      (eq? pid-val 0))
					  (begin
					    (printf "Sending signal/term to ~A\n" pid)
					    (process-signal pid signal/term))))))
		       (process:children #f))
		      (original-exit exit-code)))))

;; Force default transport to fs
;; (if ;; (and (or (args:get-arg "-list-targets")
;;     ;;          (args:get-arg "-list-db-targets"))
;;  (not (args:get-arg "-transport"))
;;  (hash-table-set! args:arg-hash "-transport" "fs"))

Modified process.scm from [8a2775d2d2] to [88799f98f8].

111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
	       (loop (+ i 1)))
	     (values pid-val exit-status exit-code))))))
  
;;======================================================================
;; MISC PROCESS RELATED STUFF
;;======================================================================

(define (children proc)
  (with-input-from-pipe
   (conc "ps h --ppid " (current-process-id) " -o pid")
   (lambda ()
     (let loop ((inl (read-line))
		(res '()))
       (if (eof-object? inl)
	   (reverse res)







|







111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
	       (loop (+ i 1)))
	     (values pid-val exit-status exit-code))))))
  
;;======================================================================
;; MISC PROCESS RELATED STUFF
;;======================================================================

(define (process:children proc)
  (with-input-from-pipe
   (conc "ps h --ppid " (current-process-id) " -o pid")
   (lambda ()
     (let loop ((inl (read-line))
		(res '()))
       (if (eof-object? inl)
	   (reverse res)

Modified tests.scm from [5de2830eb7] to [dfe6ddf3ca].

276
277
278
279
280
281
282



283
284
285
286
287
288
289
				    (if (null? tal)
					#t
					(loop (car tal)(cdr tal)))
				    #f))))))
	    (pop-directory)
	    result)))))





;; Do not rpc this one, do the underlying calls!!!
(define (tests:test-set-status! test-id state status comment dat #!key (work-area #f))
  (debug:print-info 4 "tests:test-set-status! test-id=" test-id ", state=" state ", status=" status ", dat=" dat)
  (let* ((db          #f)
	 (real-status status)
	 (otherdat    (if dat dat (make-hash-table)))







>
>
>







276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
				    (if (null? tal)
					#t
					(loop (car tal)(cdr tal)))
				    #f))))))
	    (pop-directory)
	    result)))))

(define (tests:test-force-state-status! test-id state status)
  (cdb:test-set-status-state *runremote* test-id status state #f)
  (mt:process-triggers test-id state status))

;; Do not rpc this one, do the underlying calls!!!
(define (tests:test-set-status! test-id state status comment dat #!key (work-area #f))
  (debug:print-info 4 "tests:test-set-status! test-id=" test-id ", state=" state ", status=" status ", dat=" dat)
  (let* ((db          #f)
	 (real-status status)
	 (otherdat    (if dat dat (make-hash-table)))

Modified tests/fullrun/config/mt_include_1.config from [32c08be131] to [92ab22bf8a].

1
2
3
4
5
6
7
8
9
10
11
12

13
14
15
16
17
18
[setup]
# exectutable /path/to/megatest
max_concurrent_jobs 25

linktree #{getenv MT_RUN_AREA_HOME}/tmp/mt_links

[jobtools]
useshell yes
# ## launcher launches jobs, the job is managed on the target host
## by megatest, comment out launcher to run local
# workhosts localhost hermes
launcher exec nbfake

# launcher nodanggood

## use "xterm -e csi -- " as a launcher to examine the launch environment.
## exit with (exit)
## get a shell with (system "bash")
# launcher xterm -e csi --


|








|
>






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
[setup]
# exectutable /path/to/megatest
max_concurrent_jobs 50

linktree #{getenv MT_RUN_AREA_HOME}/tmp/mt_links

[jobtools]
useshell yes
# ## launcher launches jobs, the job is managed on the target host
## by megatest, comment out launcher to run local
# workhosts localhost hermes
# launcher exec nbfake
launcher nbfind
# launcher nodanggood

## use "xterm -e csi -- " as a launcher to examine the launch environment.
## exit with (exit)
## get a shell with (system "bash")
# launcher xterm -e csi --

Modified tests/fullrun/megatest.config from [438c5f6345] to [233f3b9aad].

20
21
22
23
24
25
26

27
28
29
30
31
32
33
[setup]
# Set launchwait to no to use the more agressive code that does not wait for the launch to complete before proceeding
# this may save a few milliseconds on launching tests
# launchwait no

# Use http instead of direct filesystem access
# transport http


# If set to "default" the old code is used. Otherwise defaults to 200 or uses
# numeric value given.
#
runqueue 20









>







20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
[setup]
# Set launchwait to no to use the more agressive code that does not wait for the launch to complete before proceeding
# this may save a few milliseconds on launching tests
# launchwait no

# Use http instead of direct filesystem access
# transport http
transport fs

# If set to "default" the old code is used. Otherwise defaults to 200 or uses
# numeric value given.
#
runqueue 20


Modified utils/nbfake from [8a5014b010] to [455975d5ec].

1
2
3
4
5
6
7
8
9
10
11
12
#!/bin/bash

# ssh localhost "nohup $* > nbfake.log 2> nbfake.err < /dev/null"

# Can't always trust $PWD
CURRWD=`pwd`

if [[ $TARGETHOST == ""  ]]; then
  exec sh -c "cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* > NBFAKE-`date +%GWW%V.%u_%T` 2>&1 &"
else
  exec ssh -n -f $TARGETHOST "sh -c \"cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* > NBFAKE-`date +%GWW%V.%u_%T` 2>&1 &\""
fi








|

|

1
2
3
4
5
6
7
8
9
10
11
12
#!/bin/bash

# ssh localhost "nohup $* > nbfake.log 2> nbfake.err < /dev/null"

# Can't always trust $PWD
CURRWD=`pwd`

if [[ $TARGETHOST == ""  ]]; then
  sh -c "cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* > NBFAKE-`date +%GWW%V.%u_%T` 2>&1 &"
else
  ssh -n -f $TARGETHOST "sh -c \"cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* > NBFAKE-`date +%GWW%V.%u_%T` 2>&1 &\""
fi

Modified utils/nbfind from [03c58ee4f1] to [02152a07a2].

11
12
13
14
15
16
17
18


19
20
21
22
lperc=`echo "100 * $load / $numcpu"|bc`
if [[ "x$MAX_ALLOWED_LOAD" == "x" ]]; then
  max_load=50
else
  max_load=$MAX_ALLOWED_LOAD
fi 
if [[ $lperc -lt $max_load ]];then
  nbfake "$@"


else
  $NBLAUNCHER "$@"
fi








|
>
>




11
12
13
14
15
16
17
18
19
20
21
22
23
24
lperc=`echo "100 * $load / $numcpu"|bc`
if [[ "x$MAX_ALLOWED_LOAD" == "x" ]]; then
  max_load=50
else
  max_load=$MAX_ALLOWED_LOAD
fi 
if [[ $lperc -lt $max_load ]];then
  echo "$@" | at now + 0 minutes
elif [[ "x$NBLAUNCHER" == "x" ]];then
  echo "nbfind $@" | at now + 2 minutes
else
  $NBLAUNCHER "$@"
fi