Megatest

Check-in [f1b027a2e2]
Login
Overview
Comment:Partial fix for simultaneously preventing races on launch of new jobs and respecting use of max-concurrent-jobs
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: f1b027a2e2782c5851000e14e5d20083ef9ffcac
User & Date: matt on 2012-10-09 15:58:19
Other Links: manifest | tags
Context
2012-10-09
20:22
Fixed bug with refusing to operate on DELETED tests. Correct fix for treatment of NOT_STARTED vs. LAUNCHED tests in counting tests that are running. check-in: 3ceb1e262a user: matt tags: trunk, v1.4614
15:58
Partial fix for simultaneously preventing races on launch of new jobs and respecting use of max-concurrent-jobs check-in: f1b027a2e2 user: matt tags: trunk
09:45
Added better signature to makefile target but not enabled yet check-in: 521a37d7f3 user: matt tags: trunk
Changes

Modified runs.scm from [260a499583] to [761365996e].

113
114
115
116
117
118
119
120



121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152

(define *last-num-running-tests* 0)
(define (runs:can-run-more-tests db test-record)
  (let* ((tconfig                 (tests:testqueue-get-testconfig test-record))
	 (jobgroup                (config-lookup tconfig "requirements" "jobgroup"))
	 (num-running             (db:get-count-tests-running db))
	 (num-running-in-jobgroup (db:get-count-tests-running-in-jobgroup db jobgroup))
	 (max-concurrent-jobs     (config-lookup *configdat* "setup"     "max_concurrent_jobs"))



	 (job-group-limit         (config-lookup *configdat* "jobgroups" jobgroup)))
    (if (not (eq? *last-num-running-tests* num-running))
	(begin
	  (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
	  (set! *last-num-running-tests* num-running)))
    (if (not (eq? 0 *globalexitstatus*))
	#f
	(let ((can-not-run-more (cond
				 ;; if max-concurrent-jobs is set and the number running is greater 
				 ;; than it than cannot run more jobs
				 ((and max-concurrent-jobs
				       (string->number max-concurrent-jobs)
				       (>= num-running (string->number max-concurrent-jobs)))
				  (debug:print 0 "WARNING: Max running jobs exceeded, current number running: " num-running 
					       ", max_concurrent_jobs: " max-concurrent-jobs)
				  #t)
				 ;; if job-group-limit is set and number of jobs in the group is greater
				 ;; than the limit then cannot run more jobs of this kind
				 ((and job-group-limit
				       (>= num-running-in-jobgroup job-group-limit))
				  (debug:print 1 "WARNING: number of jobs " num-running-in-jobgroup 
					       " in " jobgroup " exceeded, will not run " (tests:testqueue-get-testname test-record))
				  #t)
				 (else #f))))
	  (not can-not-run-more)))))

;;======================================================================
;; New methodology. These routines will replace the above in time. For
;; now the code is duplicated. This stuff is initially used in the monitor
;; based code.
;;======================================================================








|
>
>
>






|



|
<
<











|







113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134


135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

(define *last-num-running-tests* 0)
(define (runs:can-run-more-tests db test-record)
  (let* ((tconfig                 (tests:testqueue-get-testconfig test-record))
	 (jobgroup                (config-lookup tconfig "requirements" "jobgroup"))
	 (num-running             (db:get-count-tests-running db))
	 (num-running-in-jobgroup (db:get-count-tests-running-in-jobgroup db jobgroup))
	 (max-concurrent-jobs     (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				    (if (and mcj (string->number mcj))
					(string->number mcj)
					#f)))
	 (job-group-limit         (config-lookup *configdat* "jobgroups" jobgroup)))
    (if (not (eq? *last-num-running-tests* num-running))
	(begin
	  (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
	  (set! *last-num-running-tests* num-running)))
    (if (not (eq? 0 *globalexitstatus*))
	(list #f num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)
	(let ((can-not-run-more (cond
				 ;; if max-concurrent-jobs is set and the number running is greater 
				 ;; than it than cannot run more jobs
				 ((and max-concurrent-jobs (>= num-running max-concurrent-jobs))


				  (debug:print 0 "WARNING: Max running jobs exceeded, current number running: " num-running 
					       ", max_concurrent_jobs: " max-concurrent-jobs)
				  #t)
				 ;; if job-group-limit is set and number of jobs in the group is greater
				 ;; than the limit then cannot run more jobs of this kind
				 ((and job-group-limit
				       (>= num-running-in-jobgroup job-group-limit))
				  (debug:print 1 "WARNING: number of jobs " num-running-in-jobgroup 
					       " in " jobgroup " exceeded, will not run " (tests:testqueue-get-testname test-record))
				  #t)
				 (else #f))))
	  (list (not can-not-run-more) num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)))))

;;======================================================================
;; New methodology. These routines will replace the above in time. For
;; now the code is duplicated. This stuff is initially used in the monitor
;; based code.
;;======================================================================

373
374
375
376
377
378
379
380





381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406

407
408
409
410
411
412
413
	    (if (member test-name waitons)
		(begin
		  (debug:print 0 "ERROR: test " test-name " has listed itself as a waiton, please correct this!")
		  (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))

	    (cond ;; OUTER COND
	     ((not items) ;; when false the test is ok to be handed off to launch (but not before)
	      (let* ((have-resources  (open-run-close runs:can-run-more-tests #f test-record)) ;; look at the test jobgroup and tot jobs running





		     (prereqs-not-met (open-run-close db:get-prereqs-not-met #f run-id waitons item-path mode: testmode))
		     (fails           (runs:calc-fails prereqs-not-met))
		     (non-completed   (runs:calc-not-completed prereqs-not-met)))
		(debug:print 8 "INFO: have-resources: " have-resources " prereqs-not-met: " 
			     (string-intersperse 
			      (map (lambda (t)
				     (if (vector? t)
					 (conc (db:test-get-state t) "/" (db:test-get-status t))
					 (conc " WARNING: t is not a vector=" t )))
				   prereqs-not-met) ", ") " fails: " fails)
		(debug:print 4 "INFO: hed=" hed)

		;; Don't know at this time if the test have been launched at some time in the past
		;; i.e. is this a re-launch?

		(cond ;; INNER COND #1 for a launchable test
		 ;; Check item path against item-patts
		 ((and (not (patt-list-match item-path item-patts))
		       (not (equal? item-path "")))
		  ;; else the run is stuck, temporarily or permanently
		  ;; but should check if it is due to lack of resources vs. prerequisites
		  (debug:print 1 "INFO: Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " item-patts)
		  (thread-sleep! *global-delta*)
		  (if (not (null? tal))
		      (loop (car tal)(cdr tal) reruns)))
		 ((not (hash-table-ref/default test-registery (runs:make-full-test-name test-name item-path) #f))

		  (open-run-close db:tests-register-test #f run-id test-name item-path)
		  (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) #t)
		  (thread-sleep! *global-delta*)
		  (loop (car newtal)(cdr newtal) reruns))
		 ((not have-resources) ;; simply try again after waiting a second
		  (thread-sleep! (+ 1 *global-delta*))
		  (debug:print 1 "INFO: no resources to run new tests, waiting ...")







|
>
>
>
>
>
|
|
|











|










|
>







374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
	    (if (member test-name waitons)
		(begin
		  (debug:print 0 "ERROR: test " test-name " has listed itself as a waiton, please correct this!")
		  (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))

	    (cond ;; OUTER COND
	     ((not items) ;; when false the test is ok to be handed off to launch (but not before)
	      (let* ((run-limits-info         (open-run-close runs:can-run-more-tests #f test-record)) ;; look at the test jobgroup and tot jobs running
		     (have-resources          (car run-limits-info))
		     (num-running             (list-ref run-limits-info 1))
		     (num-running-in-jobgroup (list-ref run-limits-info 2))
		     (max-concurrent-jobs     (list-ref run-limits-info 3))
		     (job-group-limit         (list-ref run-limits-info 4))
		     (prereqs-not-met         (open-run-close db:get-prereqs-not-met #f run-id waitons item-path mode: testmode))
		     (fails                   (runs:calc-fails prereqs-not-met))
		     (non-completed           (runs:calc-not-completed prereqs-not-met)))
		(debug:print 8 "INFO: have-resources: " have-resources " prereqs-not-met: " 
			     (string-intersperse 
			      (map (lambda (t)
				     (if (vector? t)
					 (conc (db:test-get-state t) "/" (db:test-get-status t))
					 (conc " WARNING: t is not a vector=" t )))
				   prereqs-not-met) ", ") " fails: " fails)
		(debug:print 4 "INFO: hed=" hed)

		;; Don't know at this time if the test have been launched at some time in the past
		;; i.e. is this a re-launch?
		(debug:print 4 "INFO: run-limits-info = " run-limits-info)
		(cond ;; INNER COND #1 for a launchable test
		 ;; Check item path against item-patts
		 ((and (not (patt-list-match item-path item-patts))
		       (not (equal? item-path "")))
		  ;; else the run is stuck, temporarily or permanently
		  ;; but should check if it is due to lack of resources vs. prerequisites
		  (debug:print 1 "INFO: Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " item-patts)
		  (thread-sleep! *global-delta*)
		  (if (not (null? tal))
		      (loop (car tal)(cdr tal) reruns)))
		 ((and (not (hash-table-ref/default test-registery (runs:make-full-test-name test-name item-path) #f))
		       (and max-concurrent-jobs (> (- max-concurrent-jobs num-running) 5)))
		  (open-run-close db:tests-register-test #f run-id test-name item-path)
		  (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) #t)
		  (thread-sleep! *global-delta*)
		  (loop (car newtal)(cdr newtal) reruns))
		 ((not have-resources) ;; simply try again after waiting a second
		  (thread-sleep! (+ 1 *global-delta*))
		  (debug:print 1 "INFO: no resources to run new tests, waiting ...")

Modified tests/fullrun/config/mt_include_1.config from [5e42faa87c] to [21400a737e].

1
2
3
4
5
6
7
8
9
10
[setup]
# exectutable /path/to/megatest
max_concurrent_jobs 200
linktree /tmp/mt_links

[jobtools]
useshell yes
# ## launcher launches jobs, the job is managed on the target host
## by megatest, comment out launcher to run local
# workhosts localhost hermes


|







1
2
3
4
5
6
7
8
9
10
[setup]
# exectutable /path/to/megatest
max_concurrent_jobs 30
linktree /tmp/mt_links

[jobtools]
useshell yes
# ## launcher launches jobs, the job is managed on the target host
## by megatest, comment out launcher to run local
# workhosts localhost hermes