Megatest

Diff
Login

Differences From Artifact [acb41eb596]:

To Artifact [1d44ba3939]:


89
90
91
92
93
94
95


96
97
98
99
100
101
102
	                                  (if (and (file-exists? fulln)
                                                   (file-execute-access? fulln))
                                              fulln
                                              runscript))))) ;; assume it is on the path
	       (rollup-status 0))
	  (change-directory top-path)



	  ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART,
	  ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY*
	  ;;
	  (let ((test-info (rmt:get-testinfo-state-status run-id test-id)))
	    (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")))
		(tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
		(debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed")))







>
>







89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
	                                  (if (and (file-exists? fulln)
                                                   (file-execute-access? fulln))
                                              fulln
                                              runscript))))) ;; assume it is on the path
	       (rollup-status 0))
	  (change-directory top-path)

	  ;; (set-signal-handler! signal/int (lambda ()
					    
	  ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART,
	  ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY*
	  ;;
	  (let ((test-info (rmt:get-testinfo-state-status run-id test-id)))
	    (if (not (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")))
		(tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
		(debug:print 0 "ERROR: test state is " (db:test-get-state test-info) ", cannot proceed")))
205
206
207
208
209
210
211

212
213
214
215
216
217
218
				 (tests:test-force-state-status! run-id test-id "RUNNING" "n/a")
				 (rmt:roll-up-pass-fail-counts run-id test-name item-path "RUNNING")
				 (thread-sleep! 0.3) ;; NFS slowness has caused grief here

				 ;; if there is a runscript do it first
				 (if fullrunscript
				     (let ((pid (process-run fullrunscript)))

				       (let loop ((i 0))
					 (let-values
					  (((pid-val exit-status exit-code) (process-wait pid #t)))
					  (mutex-lock! m)
					  (vector-set! exit-info 0 pid)
					  (vector-set! exit-info 1 exit-status)
					  (vector-set! exit-info 2 exit-code)







>







207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
				 (tests:test-force-state-status! run-id test-id "RUNNING" "n/a")
				 (rmt:roll-up-pass-fail-counts run-id test-name item-path "RUNNING")
				 (thread-sleep! 0.3) ;; NFS slowness has caused grief here

				 ;; if there is a runscript do it first
				 (if fullrunscript
				     (let ((pid (process-run fullrunscript)))
				       (rmt:test-set-top-process-pid run-id test-id pid)
				       (let loop ((i 0))
					 (let-values
					  (((pid-val exit-status exit-code) (process-wait pid #t)))
					  (mutex-lock! m)
					  (vector-set! exit-info 0 pid)
					  (vector-set! exit-info 1 exit-status)
					  (vector-set! exit-info 2 exit-code)
260
261
262
263
264
265
266

267
268
269
270
271
272
273
						   ;; call the command using mt_ezstep
						   (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "-") " " stepcmd))

						   (debug:print 4 "script: " script)
						   (rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f)
						   ;; now launch
						   (let ((pid (process-run script)))

						     (let processloop ((i 0))
						       (let-values (((pid-val exit-status exit-code)(process-wait pid #t)))
								   (mutex-lock! m)
								   (vector-set! exit-info 0 pid)
								   (vector-set! exit-info 1 exit-status)
								   (vector-set! exit-info 2 exit-code)
								   (mutex-unlock! m)







>







263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
						   ;; call the command using mt_ezstep
						   (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "-") " " stepcmd))

						   (debug:print 4 "script: " script)
						   (rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f)
						   ;; now launch
						   (let ((pid (process-run script)))
						     (rmt:test-set-top-process-pid run-id test-id pid)
						     (let processloop ((i 0))
						       (let-values (((pid-val exit-status exit-code)(process-wait pid #t)))
								   (mutex-lock! m)
								   (vector-set! exit-info 0 pid)
								   (vector-set! exit-info 1 exit-status)
								   (vector-set! exit-info 2 exit-code)
								   (mutex-unlock! m)
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357


358



359
360
361
362
363


364
365
366
367
368
369
370
371
372
373
374
375
376

377

378
379
380
381
382
383
384
385
386
387


388
389
390
391
392
393
394
395
396
397
398
399

400
401
402
403
404
405
406
							   (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
									       (time-exceeded (> run-seconds runtlim)))
									  (if time-exceeded
									      (begin
										(debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)
										#t)
									      #f)))))
				       ;; open-run-close not needed for test-set-meta-info
				       ;; (tests:set-partial-meta-info #f test-id run-id minutes work-area)
				       (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f)
				       ;; (tests:set-partial-meta-info #f test-id run-id minutes work-area 10)
				       ;; (tests:set-partial-meta-info test-id run-id minutes work-area)
				       (if kill-job? 
					   (begin
					     (mutex-lock! m)
					     ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
					     ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
					     ;;       between tries?
					     (let* ((pid (vector-ref exit-info 0)))


					       (if (number? pid)



						   (handle-exceptions
						    exn
						    (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.")
						    ;;(process-signal pid signal/kill))
						    (begin


						      (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")")
						      (let ((processes (cmd-run->list (conc "pgrep -l -P " pid))))
							(for-each 
							 (lambda (p)
							   (let* ((parts  (string-split p))
								  (p-id   (if (> (length parts) 0)
									      (string->number (car parts))
									      #f)))
							     (if p-id
								 (begin
								   (debug:print 0 "Killing " (cadr parts) "; kill -9  " p-id)
								   ;; (process-signal pid signal/kill))))) ;; 
								   (system (conc "kill -9 " p-id))))))

							 (car processes)))

						      (system (conc "kill -9 -" pid))
						      (tests:test-set-status! test-id "KILLED"  "KILLED" (args:get-arg "-m") #f)))
						   (begin
						     (debug:print 0 "WARNING: Request received to kill job but problem with process, attempting to kill manager process")
						     (tests:test-set-status! test-id "KILLED"  "KILLED" (args:get-arg "-m") #f)
						     (tests:test-set-status! run-id test-id "KILLED"  "FAIL" (args:get-arg "-m") #f)
						     (exit 1) ;; IS THIS NECESSARY OR WISE???
						     )))
					     (set! kill-tries (+ 1 kill-tries))
					     (mutex-unlock! m)))


				       (if keep-going
					   (begin
					     (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
					     (if keep-going
						 (loop (calc-minutes)))))))
				   (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f)))) ;; NOTE: Checking twice for keep-going is intentional
		 (th1          (make-thread monitorjob "monitor job"))
		 (th2          (make-thread runit "run job")))
	    (set! job-thread th2)
	    (thread-start! th1)
	    (thread-start! th2)
	    (thread-join! th2)

	    (set! keep-going #f)
	    (thread-join! th1)
	    (thread-sleep! 1)       ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec.
	    (mutex-lock! m)
	    (let* ((item-path (item-list->path itemdat))
		   ;; only state and status needed - use lazy routine
		   (testinfo  (rmt:get-testinfo-state-status run-id test-id)))







<
<

<
<






|
>
>
|
>
>
>
|
|
<
<
|
>
>
|
<
<
<
<
<
<
<
|
|
<
|
<
>
|
>
|
|

|
<
|
<

<
|
>
>












>







343
344
345
346
347
348
349


350


351
352
353
354
355
356
357
358
359
360
361
362
363
364
365


366
367
368
369







370
371

372

373
374
375
376
377
378
379

380

381

382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
							   (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
									       (time-exceeded (> run-seconds runtlim)))
									  (if time-exceeded
									      (begin
										(debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)
										#t)
									      #f)))))


				       (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f)


				       (if kill-job? 
					   (begin
					     (mutex-lock! m)
					     ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
					     ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
					     ;;       between tries?
					     (let* ((pid1 (vector-ref exit-info 0))
						    (pid2 (rmt:test-get-top-process-pid run-id test-id))
						    (pids (delete-duplicates (filter number? (list pid1 pid2)))))
					       (if (not (null? pids))
						   (begin
						     (for-each
						      (lambda (pid)
							(handle-exceptions
							 exn


							 (begin
							   (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.")
							   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)))
							 (debug:print 0 "WARNING: Request received to kill job " pid) ;;  " (attempt # " kill-tries ")")







							 (if (process:alive? pid)
							     (begin

							       (process-signal pid signal/int)

							       (thread-sleep! 5)
							       (if (process:process-alive? pid)
								   (process-signal pid signal/kill))))))
						      pids)
						     (tests:test-set-status! run-id test-id "KILLED"  "KILLED" (args:get-arg "-m") #f))
						   (begin
						     (debug:print 0 "ERROR: Nothing to kill, pid1=" pid1 ", pid2=" pid2)

						     (tests:test-set-status! run-id test-id "KILLED"  "FAILED TO KILL" (args:get-arg "-m") #f)

						     )))

					     (mutex-unlock! m)
					     ;; no point in sticking around. Exit now.
					     (exit)))
				       (if keep-going
					   (begin
					     (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
					     (if keep-going
						 (loop (calc-minutes)))))))
				   (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f)))) ;; NOTE: Checking twice for keep-going is intentional
		 (th1          (make-thread monitorjob "monitor job"))
		 (th2          (make-thread runit "run job")))
	    (set! job-thread th2)
	    (thread-start! th1)
	    (thread-start! th2)
	    (thread-join! th2)
	    (debug:print-info 0 "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...")
	    (set! keep-going #f)
	    (thread-join! th1)
	    (thread-sleep! 1)       ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec.
	    (mutex-lock! m)
	    (let* ((item-path (item-list->path itemdat))
		   ;; only state and status needed - use lazy routine
		   (testinfo  (rmt:get-testinfo-state-status run-id test-id)))
457
458
459
460
461
462
463

464
465
466
467
468
469
470
471
472

473

474
475
476
477
478
479
480
	  (if linktree
	      (if (not (file-exists? linktree))
		  (begin
		    (handle-exceptions
		     exn
		     (begin
		       (debug:print 0 "ERROR: Something went wrong when trying to create linktree dir at " linktree)

		       (exit 1))
		     (create-directory linktree #t))))
	      (begin
		(debug:print 0 "ERROR: linktree not defined in [setup] section of megatest.config")
		(exit 1)))
	  (if linktree
	      (let ((dbdir (conc linktree "/.db")))
		(handle-exceptions
		 exn

		 (debug:print 0 "ERROR: failed to create the " dbdir " area for your database files")

		 (if (not (directory-exists? dbdir))(create-directory dbdir)))
		(setenv "MT_LINKTREE" linktree))
	      (begin
		(debug:print 0 "ERROR: linktree is required in your megatest.config [setup] section")
		(exit 1)))
	  (if (and *toppath*
		   (directory-exists? *toppath*))







>









>
|
>







455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
	  (if linktree
	      (if (not (file-exists? linktree))
		  (begin
		    (handle-exceptions
		     exn
		     (begin
		       (debug:print 0 "ERROR: Something went wrong when trying to create linktree dir at " linktree)
		       (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn))
		       (exit 1))
		     (create-directory linktree #t))))
	      (begin
		(debug:print 0 "ERROR: linktree not defined in [setup] section of megatest.config")
		(exit 1)))
	  (if linktree
	      (let ((dbdir (conc linktree "/.db")))
		(handle-exceptions
		 exn
		 (begin
		   (debug:print 0 "ERROR: failed to create the " dbdir " area for your database files")
		   (debug:print 0 " message: " ((condition-property-accessor 'exn 'message) exn)))
		 (if (not (directory-exists? dbdir))(create-directory dbdir)))
		(setenv "MT_LINKTREE" linktree))
	      (begin
		(debug:print 0 "ERROR: linktree is required in your megatest.config [setup] section")
		(exit 1)))
	  (if (and *toppath*
		   (directory-exists? *toppath*))