Megatest

Diff
Login

Differences From Artifact [59d0d6772c]:

To Artifact [108aa8951e]:


142
143
144
145
146
147
148
149

150
151
152
153
154
155
156
142
143
144
145
146
147
148

149
150
151
152
153
154
155
156







-
+







	  (set-run-config-vars run-id keyvals target) ;; (db:get-target db run-id))
	  ;; environment overrides are done *before* the remaining critical envars.
	  (alist->env-vars env-ovrd)
	  (set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals)
	  (set-item-env-vars itemdat)
	  (save-environment-as-files "megatest")
	  ;; open-run-close not needed for test-set-meta-info
	  (tests:set-full-meta-info test-id run-id 0 work-area)
	  (tests:set-full-meta-info #f test-id run-id 0 work-area 10)

	  ;; (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f)
	  (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
	  (thread-sleep! 0.3) ;; NFS slowness has caused grief here

	  (if (args:get-arg "-xterm")
	      (set! fullrunscript "xterm")
300
301
302
303
304
305
306
307

308
309
310
311
312
313
314
315
316
317
318


319


320
321
322
323
324
325
326
327
328



329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344


















345
346
347
348


349
350
351
352
353
354
355
300
301
302
303
304
305
306

307
308
309
310
311
312
313
314
315
316
317
318
319
320

321
322
323
324
325
326
327
328
329
330
331
332
333
334
















335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354


355
356
357
358
359
360
361
362
363







-
+











+
+
-
+
+









+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+


-
-
+
+







					(calc-minutes  (lambda ()
							 (inexact->exact 
							  (round 
							   (- 
							    (current-seconds) 
							    start-seconds)))))
					(kill-tries 0))
				   (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
				   (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
				   (let loop ((minutes   (calc-minutes)))
				     (begin
				       (set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat))
							   (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
									       (time-exceeded (> run-seconds runtlim)))
									  (if time-exceeded
									      (begin
										(debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)
										#t)
									      #f)))))
				       ;; open-run-close not needed for test-set-meta-info
				       (print "ERROR: EDIT ME")
				       (exit 1)
				       (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f)
				       ;;(tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f)
				       ;;(tests:set-partial-meta-info #f test-id run-id minutes work-area 10)
				       ;; (tests:set-partial-meta-info test-id run-id minutes work-area)
				       (if kill-job? 
					   (begin
					     (mutex-lock! m)
					     ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
					     ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
					     ;;       between tries?
					     (let* ((pid (vector-ref exit-info 0)))
					       (if (number? pid)
						   (handle-exceptions
						    exn
						    (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.")
						   (process-signal pid signal/kill)
						   ;; (begin
						   ;;   (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")")
						   ;;   (let ((processes (cmd-run->list (conc "pgrep -l -P " pid))))
						   ;;     (for-each 
						   ;;      (lambda (p)
						   ;;        (let* ((parts  (string-split p))
						   ;;      	 (p-id   (if (> (length parts) 0)
						   ;;      		     (string->number (car parts))
						   ;;      		     #f)))
						   ;;          (if p-id
						   ;;      	(begin
						   ;;      	  (debug:print 0 "Killing " (cadr parts) "; kill -9  " p-id)
						   ;;      	  (system (conc "kill -9 " p-id))))))
						   ;;      (car processes))
						   ;;     (system (conc "kill -9 -" pid))))
						    ;;(process-signal pid signal/kill))
						    (begin
						      (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")")
						      (let ((processes (cmd-run->list (conc "pgrep -l -P " pid))))
							(for-each 
							 (lambda (p)
							   (let* ((parts  (string-split p))
								  (p-id   (if (> (length parts) 0)
									      (string->number (car parts))
									      #f)))
							     (if p-id
								 (begin
								   (debug:print 0 "Killing " (cadr parts) "; kill -9  " p-id)
								   ;; (process-signal pid signal/kill))))) ;; 
								   (system (conc "kill -9 " p-id))))))
							 (car processes)))
						      (system (conc "kill -9 -" pid))
						      (tests:test-set-status! test-id "KILLED"  "FAIL" (args:get-arg "-m") #f)))
						   (begin
						     (debug:print 0 "WARNING: Request received to kill job but problem with process, attempting to kill manager process")
						     (tests:test-set-status! run-id test-id "KILLED"  "FAIL"
								     (args:get-arg "-m") #f)
						     ;;(tests:test-set-status! run-id test-id "KILLED"  "FAIL"
						     (tests:test-set-status! trun-id est-id "KILLED"  "FAIL" (args:get-arg "-m") #f)
						     (exit 1) ;; IS THIS NECESSARY OR WISE???
						     )))
					     (set! kill-tries (+ 1 kill-tries))
					     (mutex-unlock! m)))
				       (if keep-going
					   (begin
					     (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses