Megatest

Check-in [7d49a78e24]
Login
Overview
Comment:Merged trunk into logging
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | sqlite3-logging
Files: files | file ages | folders
SHA1: 7d49a78e24b7542ae3ef8602e58f25ddebbc6fbd
User & Date: matt on 2012-10-11 00:11:30
Other Links: branch diff | manifest | tags
Context
2012-10-11
01:06
Propagated -logging to tests (note: this doesn't help with sub-invocations) check-in: fc03199f62 user: matt tags: sqlite3-logging
00:11
Merged trunk into logging check-in: 7d49a78e24 user: matt tags: sqlite3-logging
2012-10-10
18:19
Added mapping of mtest to megatest in launch, switched nbfake to run jobs in background, changed fsl to fossil in makefile check-in: 1b1db65b15 user: mrwellan tags: trunk
2012-10-09
12:00
Merged in needed change to Makefile for logging check-in: 7ee796c14e user: matt tags: sqlite3-logging
Changes

Modified Makefile from [5beb554be6] to [ae425a5adb].

9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

GUISRCF  = dashboard.scm dashboard-tests.scm dashboard-guimonitor.scm dashboard-main.scm

OFILES   = $(SRCFILES:%.scm=%.o)
GOFILES  = $(GUISRCF:%.scm=%.o)

HELPERS=$(addprefix $(PREFIX)/bin/,mt_laststep mt_runstep mt_ezstep)
MTESTHASH=$(shell fsl info|grep checkout:| awk '{print $$2}')
all : mtest dboard

mtest: $(OFILES) megatest.o
	csc $(CSCOPTS) $(OFILES) megatest.o -o mtest

dboard : $(OFILES) $(GOFILES)
	csc $(OFILES) $(GOFILES) -o dboard







|







9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

GUISRCF  = dashboard.scm dashboard-tests.scm dashboard-guimonitor.scm dashboard-main.scm

OFILES   = $(SRCFILES:%.scm=%.o)
GOFILES  = $(GUISRCF:%.scm=%.o)

HELPERS=$(addprefix $(PREFIX)/bin/,mt_laststep mt_runstep mt_ezstep)
MTESTHASH=$(shell fossil info|grep checkout:| awk '{print $$2}')
all : mtest dboard

mtest: $(OFILES) megatest.o
	csc $(CSCOPTS) $(OFILES) megatest.o -o mtest

dboard : $(OFILES) $(GOFILES)
	csc $(OFILES) $(GOFILES) -o dboard

Modified db.scm from [378f0551a1] to [1bca4e3583].

753
754
755
756
757
758
759
760
761
762
763
764
765
766
767

(define (db:get-count-tests-running db)
  (let ((res 0))
    (sqlite3:for-each-row
     (lambda (count)
       (set! res count))
     db
     "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART','NOT_STARTED');")
    res))

(define (db:get-count-tests-running-in-jobgroup db jobgroup)
  (if (not jobgroup)
      0 ;; 
      (let ((res 0))
	(sqlite3:for-each-row







|







753
754
755
756
757
758
759
760
761
762
763
764
765
766
767

(define (db:get-count-tests-running db)
  (let ((res 0))
    (sqlite3:for-each-row
     (lambda (count)
       (set! res count))
     db
     "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART');")
    res))

(define (db:get-count-tests-running-in-jobgroup db jobgroup)
  (if (not jobgroup)
      0 ;; 
      (let ((res 0))
	(sqlite3:for-each-row
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
;;======================================================================
;; QUEUE UP META, TEST STATUS AND STEPS
;;======================================================================

(define (db:updater)
  (debug:print 4 "INFO: Starting cache processing")
  (let loop ((start-time (current-time)))
    (thread-sleep! 5) ;; move save time around to minimize regular collisions?
    (db:write-cached-data)
    (loop start-time)))

(define (cdb:test-set-status-state test-id status state msg)
  (debug:print 4 "INFO: cdb:test-set-status-state test-id=" test-id ", status=" status ", state=" state ", msg=" msg)
  (mutex-lock! *incoming-mutex*)
  (set! *last-db-access* (current-seconds))







|







1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
;;======================================================================
;; QUEUE UP META, TEST STATUS AND STEPS
;;======================================================================

(define (db:updater)
  (debug:print 4 "INFO: Starting cache processing")
  (let loop ((start-time (current-time)))
    (thread-sleep! 10) ;; move save time around to minimize regular collisions?
    (db:write-cached-data)
    (loop start-time)))

(define (cdb:test-set-status-state test-id status state msg)
  (debug:print 4 "INFO: cdb:test-set-status-state test-id=" test-id ", status=" status ", state=" state ", msg=" msg)
  (mutex-lock! *incoming-mutex*)
  (set! *last-db-access* (current-seconds))

Modified launch.scm from [dc96227333] to [ae6bdfdb7c].

523
524
525
526
527
528
529

530
531
532
533
534
535
536
537
	 ;;                from the called megatest and convert dashboard
	 ;;             	  or dboard to megatest
	 (local-megatest  (let* ((lm  (car (argv)))
				 (dir (pathname-directory lm))
				 (exe (pathname-strip-directory lm)))
			    (conc (if dir (conc dir "/") "")
				  (case (string->symbol exe)

				    ((dboard) "megatest")
				    ((dashboard) "megatest")
				    (else exe)))))
	 (test-sig   (conc test-name ":" (item-list->path itemdat))) ;; test-path is the full path including the item-path
	 (work-area  #f)
	 (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all
	 (diskpath   #f)
	 (cmdparms   #f)







>
|







523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
	 ;;                from the called megatest and convert dashboard
	 ;;             	  or dboard to megatest
	 (local-megatest  (let* ((lm  (car (argv)))
				 (dir (pathname-directory lm))
				 (exe (pathname-strip-directory lm)))
			    (conc (if dir (conc dir "/") "")
				  (case (string->symbol exe)
				    ((dboard)    "megatest")
				    ((mtest)     "megatest")
				    ((dashboard) "megatest")
				    (else exe)))))
	 (test-sig   (conc test-name ":" (item-list->path itemdat))) ;; test-path is the full path including the item-path
	 (work-area  #f)
	 (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all
	 (diskpath   #f)
	 (cmdparms   #f)

Modified megatest.scm from [a442c44f02] to [9492ee6fba].

353
354
355
356
357
358
359
360

361
362
363
364
365
366
367
      (debug:print 0 "INFO: Starting the standalone server")
      (if db 
	  (let* ((host:port (db:get-var db "SERVER")) ;; this doen't support multiple servers BUG!!!!
		 (th2 (server:start db (args:get-arg "-server")))
		 (th3 (make-thread (lambda ()
				     (server:keep-running db host:port)))))
	    (thread-start! th3)
	    (thread-join! th3))

	  (debug:print 0 "ERROR: Failed to setup for megatest"))))

;;======================================================================
;; full run
;;======================================================================

;; get lock in db for full run for this directory







|
>







353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
      (debug:print 0 "INFO: Starting the standalone server")
      (if db 
	  (let* ((host:port (db:get-var db "SERVER")) ;; this doen't support multiple servers BUG!!!!
		 (th2 (server:start db (args:get-arg "-server")))
		 (th3 (make-thread (lambda ()
				     (server:keep-running db host:port)))))
	    (thread-start! th3)
	    (thread-join! th3)
	    (set! *didsomething* #t))
	  (debug:print 0 "ERROR: Failed to setup for megatest"))))

;;======================================================================
;; full run
;;======================================================================

;; get lock in db for full run for this directory

Modified runs.scm from [260a499583] to [83c48ed884].

113
114
115
116
117
118
119
120



121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152

(define *last-num-running-tests* 0)
(define (runs:can-run-more-tests db test-record)
  (let* ((tconfig                 (tests:testqueue-get-testconfig test-record))
	 (jobgroup                (config-lookup tconfig "requirements" "jobgroup"))
	 (num-running             (db:get-count-tests-running db))
	 (num-running-in-jobgroup (db:get-count-tests-running-in-jobgroup db jobgroup))
	 (max-concurrent-jobs     (config-lookup *configdat* "setup"     "max_concurrent_jobs"))



	 (job-group-limit         (config-lookup *configdat* "jobgroups" jobgroup)))
    (if (not (eq? *last-num-running-tests* num-running))
	(begin
	  (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
	  (set! *last-num-running-tests* num-running)))
    (if (not (eq? 0 *globalexitstatus*))
	#f
	(let ((can-not-run-more (cond
				 ;; if max-concurrent-jobs is set and the number running is greater 
				 ;; than it than cannot run more jobs
				 ((and max-concurrent-jobs
				       (string->number max-concurrent-jobs)
				       (>= num-running (string->number max-concurrent-jobs)))
				  (debug:print 0 "WARNING: Max running jobs exceeded, current number running: " num-running 
					       ", max_concurrent_jobs: " max-concurrent-jobs)
				  #t)
				 ;; if job-group-limit is set and number of jobs in the group is greater
				 ;; than the limit then cannot run more jobs of this kind
				 ((and job-group-limit
				       (>= num-running-in-jobgroup job-group-limit))
				  (debug:print 1 "WARNING: number of jobs " num-running-in-jobgroup 
					       " in " jobgroup " exceeded, will not run " (tests:testqueue-get-testname test-record))
				  #t)
				 (else #f))))
	  (not can-not-run-more)))))

;;======================================================================
;; New methodology. These routines will replace the above in time. For
;; now the code is duplicated. This stuff is initially used in the monitor
;; based code.
;;======================================================================








|
>
>
>






|



|
<
<











|







113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134


135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

(define *last-num-running-tests* 0)
(define (runs:can-run-more-tests db test-record)
  (let* ((tconfig                 (tests:testqueue-get-testconfig test-record))
	 (jobgroup                (config-lookup tconfig "requirements" "jobgroup"))
	 (num-running             (db:get-count-tests-running db))
	 (num-running-in-jobgroup (db:get-count-tests-running-in-jobgroup db jobgroup))
	 (max-concurrent-jobs     (let ((mcj (config-lookup *configdat* "setup"     "max_concurrent_jobs")))
				    (if (and mcj (string->number mcj))
					(string->number mcj)
					#f)))
	 (job-group-limit         (config-lookup *configdat* "jobgroups" jobgroup)))
    (if (not (eq? *last-num-running-tests* num-running))
	(begin
	  (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
	  (set! *last-num-running-tests* num-running)))
    (if (not (eq? 0 *globalexitstatus*))
	(list #f num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)
	(let ((can-not-run-more (cond
				 ;; if max-concurrent-jobs is set and the number running is greater 
				 ;; than it than cannot run more jobs
				 ((and max-concurrent-jobs (>= num-running max-concurrent-jobs))


				  (debug:print 0 "WARNING: Max running jobs exceeded, current number running: " num-running 
					       ", max_concurrent_jobs: " max-concurrent-jobs)
				  #t)
				 ;; if job-group-limit is set and number of jobs in the group is greater
				 ;; than the limit then cannot run more jobs of this kind
				 ((and job-group-limit
				       (>= num-running-in-jobgroup job-group-limit))
				  (debug:print 1 "WARNING: number of jobs " num-running-in-jobgroup 
					       " in " jobgroup " exceeded, will not run " (tests:testqueue-get-testname test-record))
				  #t)
				 (else #f))))
	  (list (not can-not-run-more) num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)))))

;;======================================================================
;; New methodology. These routines will replace the above in time. For
;; now the code is duplicated. This stuff is initially used in the monitor
;; based code.
;;======================================================================

373
374
375
376
377
378
379
380





381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406

407
408
409
410
411
412
413
	    (if (member test-name waitons)
		(begin
		  (debug:print 0 "ERROR: test " test-name " has listed itself as a waiton, please correct this!")
		  (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))

	    (cond ;; OUTER COND
	     ((not items) ;; when false the test is ok to be handed off to launch (but not before)
	      (let* ((have-resources  (open-run-close runs:can-run-more-tests #f test-record)) ;; look at the test jobgroup and tot jobs running





		     (prereqs-not-met (open-run-close db:get-prereqs-not-met #f run-id waitons item-path mode: testmode))
		     (fails           (runs:calc-fails prereqs-not-met))
		     (non-completed   (runs:calc-not-completed prereqs-not-met)))
		(debug:print 8 "INFO: have-resources: " have-resources " prereqs-not-met: " 
			     (string-intersperse 
			      (map (lambda (t)
				     (if (vector? t)
					 (conc (db:test-get-state t) "/" (db:test-get-status t))
					 (conc " WARNING: t is not a vector=" t )))
				   prereqs-not-met) ", ") " fails: " fails)
		(debug:print 4 "INFO: hed=" hed)

		;; Don't know at this time if the test have been launched at some time in the past
		;; i.e. is this a re-launch?

		(cond ;; INNER COND #1 for a launchable test
		 ;; Check item path against item-patts
		 ((and (not (patt-list-match item-path item-patts))
		       (not (equal? item-path "")))
		  ;; else the run is stuck, temporarily or permanently
		  ;; but should check if it is due to lack of resources vs. prerequisites
		  (debug:print 1 "INFO: Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " item-patts)
		  (thread-sleep! *global-delta*)
		  (if (not (null? tal))
		      (loop (car tal)(cdr tal) reruns)))
		 ((not (hash-table-ref/default test-registery (runs:make-full-test-name test-name item-path) #f))

		  (open-run-close db:tests-register-test #f run-id test-name item-path)
		  (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) #t)
		  (thread-sleep! *global-delta*)
		  (loop (car newtal)(cdr newtal) reruns))
		 ((not have-resources) ;; simply try again after waiting a second
		  (thread-sleep! (+ 1 *global-delta*))
		  (debug:print 1 "INFO: no resources to run new tests, waiting ...")







|
>
>
>
>
>
|
|
|











|










|
>







374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
	    (if (member test-name waitons)
		(begin
		  (debug:print 0 "ERROR: test " test-name " has listed itself as a waiton, please correct this!")
		  (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))

	    (cond ;; OUTER COND
	     ((not items) ;; when false the test is ok to be handed off to launch (but not before)
	      (let* ((run-limits-info         (open-run-close runs:can-run-more-tests #f test-record)) ;; look at the test jobgroup and tot jobs running
		     (have-resources          (car run-limits-info))
		     (num-running             (list-ref run-limits-info 1))
		     (num-running-in-jobgroup (list-ref run-limits-info 2))
		     (max-concurrent-jobs     (list-ref run-limits-info 3))
		     (job-group-limit         (list-ref run-limits-info 4))
		     (prereqs-not-met         (open-run-close db:get-prereqs-not-met #f run-id waitons item-path mode: testmode))
		     (fails                   (runs:calc-fails prereqs-not-met))
		     (non-completed           (runs:calc-not-completed prereqs-not-met)))
		(debug:print 8 "INFO: have-resources: " have-resources " prereqs-not-met: " 
			     (string-intersperse 
			      (map (lambda (t)
				     (if (vector? t)
					 (conc (db:test-get-state t) "/" (db:test-get-status t))
					 (conc " WARNING: t is not a vector=" t )))
				   prereqs-not-met) ", ") " fails: " fails)
		(debug:print 4 "INFO: hed=" hed)

		;; Don't know at this time if the test have been launched at some time in the past
		;; i.e. is this a re-launch?
		(debug:print 4 "INFO: run-limits-info = " run-limits-info)
		(cond ;; INNER COND #1 for a launchable test
		 ;; Check item path against item-patts
		 ((and (not (patt-list-match item-path item-patts))
		       (not (equal? item-path "")))
		  ;; else the run is stuck, temporarily or permanently
		  ;; but should check if it is due to lack of resources vs. prerequisites
		  (debug:print 1 "INFO: Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " item-patts)
		  (thread-sleep! *global-delta*)
		  (if (not (null? tal))
		      (loop (car tal)(cdr tal) reruns)))
		 ((and (not (hash-table-ref/default test-registery (runs:make-full-test-name test-name item-path) #f))
		       (and max-concurrent-jobs (> (- max-concurrent-jobs num-running) 5)))
		  (open-run-close db:tests-register-test #f run-id test-name item-path)
		  (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) #t)
		  (thread-sleep! *global-delta*)
		  (loop (car newtal)(cdr newtal) reruns))
		 ((not have-resources) ;; simply try again after waiting a second
		  (thread-sleep! (+ 1 *global-delta*))
		  (debug:print 1 "INFO: no resources to run new tests, waiting ...")
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
      (case (if force ;; (args:get-arg "-force")
		'NOT_STARTED
		(if testdat
		    (string->symbol (test:get-state testdat))
		    'failed-to-insert))
	((failed-to-insert)
	 (debug:print 0 "ERROR: Failed to insert the record into the db"))
	((NOT_STARTED COMPLETED)
	 (let ((runflag #f))
	   (cond
	    ;; -force, run no matter what
	    (force (set! runflag #t))
	    ;; NOT_STARTED, run no matter what
	    ((equal? (test:get-state testdat) "NOT_STARTED")(set! runflag #t))
	    ;; not -rerun and PASS, WARN or CHECK, do no run
	    ((and (or (not rerun)
		      keepgoing)
		  ;; Require to force re-run for COMPLETED or *anything* + PASS,WARN or CHECK
		  (or (member (test:get-status testdat) '("PASS" "WARN" "CHECK"))
		      (member (test:get-state  testdat) '("COMPLETED")))) 
	     (debug:print 2 "INFO: running test " test-name "/" item-path " suppressed as it is " (test:get-state testdat) " and " (test:get-status testdat))







|





|







632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
      (case (if force ;; (args:get-arg "-force")
		'NOT_STARTED
		(if testdat
		    (string->symbol (test:get-state testdat))
		    'failed-to-insert))
	((failed-to-insert)
	 (debug:print 0 "ERROR: Failed to insert the record into the db"))
	((NOT_STARTED COMPLETED DELETED)
	 (let ((runflag #f))
	   (cond
	    ;; -force, run no matter what
	    (force (set! runflag #t))
	    ;; NOT_STARTED, run no matter what
	    ((member (test:get-state testdat) '("DELETED" "NOT_STARTED"))(set! runflag #t))
	    ;; not -rerun and PASS, WARN or CHECK, do no run
	    ((and (or (not rerun)
		      keepgoing)
		  ;; Require to force re-run for COMPLETED or *anything* + PASS,WARN or CHECK
		  (or (member (test:get-status testdat) '("PASS" "WARN" "CHECK"))
		      (member (test:get-state  testdat) '("COMPLETED")))) 
	     (debug:print 2 "INFO: running test " test-name "/" item-path " suppressed as it is " (test:get-state testdat) " and " (test:get-status testdat))

Modified tests/fullrun/config/mt_include_1.config from [5e42faa87c] to [a426d87ac1].

1
2
3
4
5
6
7
8
9
10
[setup]
# exectutable /path/to/megatest
max_concurrent_jobs 200
linktree /tmp/mt_links

[jobtools]
useshell yes
# ## launcher launches jobs, the job is managed on the target host
## by megatest, comment out launcher to run local
# workhosts localhost hermes


|







1
2
3
4
5
6
7
8
9
10
[setup]
# exectutable /path/to/megatest
max_concurrent_jobs 50
linktree /tmp/mt_links

[jobtools]
useshell yes
# ## launcher launches jobs, the job is managed on the target host
## by megatest, comment out launcher to run local
# workhosts localhost hermes

Modified utils/nbfake from [b1e649d799] to [455975d5ec].

1
2
3
4
5
6
7
8
9
10
11



12

#!/bin/bash

# ssh localhost "nohup $* > nbfake.log 2> nbfake.err < /dev/null"

if [[ $TARGETHOST == ""  ]]; then
  TARGETHOST=localhost
fi

# Can't always trust $PWD
CURRWD=`pwd`




ssh -n -f $TARGETHOST "sh -c \"cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* > NBFAKE-`date +%GWW%V.%u_%T` 2>&1 &\""





<
<
<
<



>
>
>
|
>
1
2
3
4




5
6
7
8
9
10
11
12
#!/bin/bash

# ssh localhost "nohup $* > nbfake.log 2> nbfake.err < /dev/null"





# Can't always trust $PWD
CURRWD=`pwd`

if [[ $TARGETHOST == ""  ]]; then
  sh -c "cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* > NBFAKE-`date +%GWW%V.%u_%T` 2>&1 &"
else
  ssh -n -f $TARGETHOST "sh -c \"cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=$PATH; nohup $* > NBFAKE-`date +%GWW%V.%u_%T` 2>&1 &\""
fi