Megatest

Diff
Login

Differences From Artifact [714eb0a317]:

To Artifact [53a5275b02]:


445
446
447
448
449
450
451
452





453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
							   (inexact->exact 
							    (round 
							     (- 
							      (current-seconds) 
							      start-seconds)))))
					  (kill-tries 0))
				     (let loop ((minutes   (calc-minutes)))
				       (let ((db    (open-db)))





					 (set! kill-job? (test-get-kill-request db run-id test-name itemdat))
					 (test-update-meta-info db run-id test-name itemdat minutes)
					 (if kill-job? 
					     (begin
					       (mutex-lock! m)
					       (let* ((pid (vector-ref exit-info 0)))
						 (if (number? pid)
						     (begin
						       (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")")
						       ;;(cond
						       ;;((>   kill-tries 0) ; 2)
						       (let ((processes (cmd-run->list (conc "pgrep -l -P " pid))))
							 (for-each 
							  (lambda (p)
							    (let* ((parts  (string-split p))
								   (p-id   (if (> (length parts) 0)
									       (string->number (car parts))
									       #f)))
							      (if p-id
								  (begin
								    (debug:print 0 "Killing " (cadr parts) "; kill -9  " p-id)
								    (system (conc "kill -9 " p-id))))))
							  (car processes))
							 (system (conc "kill -9 " pid))))
						     ;;(let* ((ppid (process-group-id pid))
						     ;;       (kcmd (conc "pkill -9 -g " ppid)))
						     ;;  ;; (process-signal pid signal/term)
						     ;;  ;; (process-signal pid signal/kill)
						     ;;  (debug:print 0 "Attempting to kill pid " pid " and children in process group " ppid " with command:\n    " kcmd)
						     ;;  (debug:print 0 "Children:")
						     ;;  (system (conc "pgrep -g -l " ppid))
						     ;;  (system kcmd)
						     ;;  (sleep 1) ;; give it a rest
						     ;;  (test-set-status! db run-id test-name "KILLED"  "FAIL"
						     ;;       	     itemdat (args:get-arg "-m"))
						     ;;  (sqlite3:finalize! db)
						     ;;  (exit 1)))))
						     (begin
						       (debug:print 0 "WARNING: Request received to kill job but problem with process, attempting to kill manager process")
						       (test-set-status! db run-id test-name "KILLED"  "FAIL"
									 itemdat (args:get-arg "-m") #f)
						       (sqlite3:finalize! db)
						       (exit 1))))
					       ;;     (thread-terminate! job-thread)))
					       (set! kill-tries (+ 1 kill-tries))
					       (mutex-unlock! m)))
					 ;; (handle-exceptions
					       ;;  exn
					       ;;  (begin
					       ;;    (debug:print 0 "ERROR: Problem killing process " (vector-ref exit-info 0))
					       ;;    (abort exn))
					       ;;  (let* ((pid   (vector-ref exit-info 0))
					       ;;         ;; (pgid  (process-group-id pid))
					       ;;         ;; (cmd  (conc "pkill -9 -P " pgid))
					       ;;         )
					       ;;    ;; (debug:print 0 "Running \"" cmd "\"")
					       ;;    ;; (system cmd)
					       ;;    (debug:print 0 "Running \"kill -9 " pid "\"")
					       ;;    (system (conc "kill -9 " pid))
					       ;;    ;; (process-signal (vector-ref exit-info 0) signal/kill)
					       ;;    ))))
					 (sqlite3:finalize! db)
					 (thread-sleep! (+ 8 (random 4))) ;; add some jitter to the call home time to spread out the db accesses
					 (loop (calc-minutes)))))))
		   (th1          (make-thread monitorjob))
		   (th2          (make-thread runit)))
	      (set! job-thread th2)
	      (thread-start! th1)







|
>
>
>
>
>

|







<
<













<
<
<
<
<
<
<
<
<
<
<
<
<






<


<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466


467
468
469
470
471
472
473
474
475
476
477
478
479













480
481
482
483
484
485

486
487















488
489
490
491
492
493
494
							   (inexact->exact 
							    (round 
							     (- 
							      (current-seconds) 
							      start-seconds)))))
					  (kill-tries 0))
				     (let loop ((minutes   (calc-minutes)))
				       (let* ((db       (open-db))
					      (cpuload  (get-cpu-load))
					      (diskfree (get-df (current-directory)))
					      (tmpfree  (get-df "/tmp")))
					 (if (not cpuload)  (begin (debug:print 0 "WARNING: CPULOAD not found.")  (set! cpuload "n/a")))
					 (if (not diskfree) (begin (debug:print 0 "WARNING: DISKFREE not found.") (set! diskfree "n/a")))
					 (set! kill-job? (test-get-kill-request db run-id test-name itemdat))
					 (test-update-meta-info db run-id test-name itemdat minutes cpuload diskfree tmpfree)
					 (if kill-job? 
					     (begin
					       (mutex-lock! m)
					       (let* ((pid (vector-ref exit-info 0)))
						 (if (number? pid)
						     (begin
						       (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")")


						       (let ((processes (cmd-run->list (conc "pgrep -l -P " pid))))
							 (for-each 
							  (lambda (p)
							    (let* ((parts  (string-split p))
								   (p-id   (if (> (length parts) 0)
									       (string->number (car parts))
									       #f)))
							      (if p-id
								  (begin
								    (debug:print 0 "Killing " (cadr parts) "; kill -9  " p-id)
								    (system (conc "kill -9 " p-id))))))
							  (car processes))
							 (system (conc "kill -9 " pid))))













						     (begin
						       (debug:print 0 "WARNING: Request received to kill job but problem with process, attempting to kill manager process")
						       (test-set-status! db run-id test-name "KILLED"  "FAIL"
									 itemdat (args:get-arg "-m") #f)
						       (sqlite3:finalize! db)
						       (exit 1))))

					       (set! kill-tries (+ 1 kill-tries))
					       (mutex-unlock! m)))















					 (sqlite3:finalize! db)
					 (thread-sleep! (+ 8 (random 4))) ;; add some jitter to the call home time to spread out the db accesses
					 (loop (calc-minutes)))))))
		   (th1          (make-thread monitorjob))
		   (th2          (make-thread runit)))
	      (set! job-thread th2)
	      (thread-start! th1)