Check-in [c075ebd51b]
Not logged in
Overview
SHA1 Hash:c075ebd51bcdb34062a39b7c46642be877740a6b
Date: 2011-06-16 00:11:59
User: mrwellan
Comment:Added -keepgoing, removed the calls to run-queue, fixes to job limits, -runstep, and killreq (now will do signal/kill after first trying signal/term)
Timelines: family | ancestors | descendants | both | trunk
Downloads: Tarball | ZIP archive
Other Links: files | file ages | folders | manifest
Tags And Properties
Changes

Modified dashboard.scm from [78e2859a5dd5f60b] to [d72b4fee0f77d68f].

   179    179   					 (iup:button "Cancel and close"
   180    180   						     #:expand "YES"
   181    181   						     #:action (lambda (x)
   182    182   								(hash-table-delete! *examine-test-dat* testkey)
   183    183   								(iup:destroy! self))))
   184    184   			       )))
   185    185   		  (iup:hbox ;; the test steps are tracked here
   186         -		   (let ((stepsdat (iup:label "Test steps ......................................" #:expand "YES")))
          186  +		   (let ((stepsdat (iup:label "Test steps ........................................." #:expand "YES")))
   187    187   		     (hash-table-set! widgets "Test Steps" stepsdat)
   188    188   		     stepsdat)
   189    189   		   ))))
   190    190   	  (iup:show self)
   191    191   	  ))))
   192    192   
   193    193   (define (colors-similar? color1 color2)
................................................................................
   207    207   		       (tests    (db-get-tests-for-run *db* run-id testnamepatt itemnamepatt))
   208    208   		       (key-vals (get-key-vals *db* run-id)))
   209    209   		  (if (> (length tests) maxtests)
   210    210   		      (set! maxtests (length tests)))
   211    211   		  (set! result (cons (vector run tests key-vals) result))))
   212    212   	      runs)
   213    213       (set! *header*  header)
   214         -    (set! *allruns* (reverse result))
          214  +    (set! *allruns* result)
   215    215       maxtests))
   216    216   
   217    217   (define (update-labels uidat)
   218    218     (let* ((rown    0)
   219    219   	 (lftcol (vector-ref uidat 0))
   220    220   	 (maxn   (- (vector-length lftcol) 1)))
   221    221       (let loop ((i 0))
................................................................................
   240    240       (update-labels uidat)
   241    241       (for-each 
   242    242        (lambda (popup)
   243    243          (let* ((test-id  (car popup))
   244    244   	      (widgets  (hash-table-ref *examine-test-dat* popup))
   245    245   	      (stepslbl (hash-table-ref/default widgets "Test Steps" #f)))
   246    246   	 (if stepslbl
   247         -	     (let* ((fmtstr  "~15a~8a~8a~17a")
          247  +	     (let* ((fmtstr  "~15a~8a~8a~20a")
   248    248   		    (newtxt  (string-intersperse 
   249    249   			      (append
   250    250   			       (list 
   251    251   				(format #f fmtstr "Stepname" "State" "Status" "Event Time")
   252    252   				(format #f fmtstr "========" "=====" "======" "=========="))
   253    253   			       (map (lambda (x)
   254    254   				      ;; take advantage of the \n on time->string

Modified db.scm from [e509962d692c4e15] to [617661873021dd04].

   219    219     (let ((res 0))
   220    220       (sqlite3:for-each-row
   221    221        (lambda (count)
   222    222          (set! res count))
   223    223        db
   224    224        "SELECT count(id) FROM tests WHERE state = 'RUNNING' OR state = 'LAUNCHED' OR state = 'REMOTEHOSTSTART';")
   225    225       res))
          226  +
          227  +;; done with run when:
          228  +;;   0 tests in LAUNCHED, NOT_STARTED, REMOTEHOSTSTART, RUNNING
          229  +(define (db:estimated-tests-remaining db run-id)
          230  +  (let ((res 0))
          231  +    (sqlite3:for-each-row
          232  +     (lambda (count)
          233  +       (set! res count))
          234  +     db
          235  +     "SELECT count(id) FROM tests WHERE state in ('LAUNCHED','NOT_STARTED','REMOTEHOSTSTART','RUNNING') AND run_id=?;" run-id)
          236  +    res))
   226    237   
   227    238   ;; NB// Sync this with runs:get-test-info
   228    239   (define (db:get-test-info db run-id testname item-path)
   229    240     (let ((res '()))
   230    241       (sqlite3:for-each-row
   231    242        (lambda (id run-id testname state status event-time host cpuload diskfree uname rundir item-path run_duration final_logf comment)
   232    243          (set! res (vector id run-id testname state status event-time host cpuload diskfree uname rundir item-path run_duration final_logf comment)))
................................................................................
   270    281   
   271    282   (define (db-get-test-steps-for-run db test-id)
   272    283     (let ((res '()))
   273    284       (sqlite3:for-each-row 
   274    285        (lambda (id test-id stepname state status event-time)
   275    286          (set! res (cons (vector id test-id stepname state status event-time) res)))
   276    287        db
   277         -     "SELECT id,test_id,stepname,state,status,event_time FROM test_steps WHERE test_id=? ORDER BY event_time DESC;"
          288  +     "SELECT id,test_id,stepname,state,status,event_time FROM test_steps WHERE test_id=? ORDER BY id ASC;" ;; event_time DESC,id ASC;
   278    289        test-id)
   279         -    res))
          290  +    (reverse res)))
   280    291   
   281    292   ;; check that *all* the prereqs are "COMPLETED"
   282    293   (define (db-get-prereqs-met db run-id waiton)
   283    294     (let ((res          #f)
   284    295   	(not-complete 0)
   285    296   	(tests        (db-get-tests-for-run db run-id)))
   286    297       (for-each

Modified megatest.scm from [7e0b50a664905c41] to [69c49cdbf1ae7a69].

     4      4   ;;  greater. See the accompanying file COPYING for details.
     5      5   ;; 
     6      6   ;;  This program is distributed WITHOUT ANY WARRANTY; without even the
     7      7   ;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
     8      8   ;;  PURPOSE.
     9      9   
    10     10   (include "common.scm")
    11         -(define megatest-version 1.11)
           11  +(define megatest-version 1.12)
    12     12   
    13     13   (define help (conc "
    14     14   Megatest, documentation at http://www.kiatoa.com/fossils/megatest
    15     15     version " megatest-version "
    16     16     license GPL, Copyright Matt Welland 2006-2011
    17     17   
    18     18   Usage: megatest [options]
................................................................................
    46     46   
    47     47   Misc 
    48     48     -force                  : override some checks
    49     49     -xterm                  : start an xterm instead of launching the test
    50     50     -remove-runs            : remove the data for a run, requires fields, :runname 
    51     51                               and -testpatt
    52     52     -testpatt patt          : remove tests matching patt (requires -remove-runs)
           53  +  -keepgoing              : continue running until no jobs are \"LAUNCHED\" or
           54  +                            \"NOT_STARTED\"
    53     55   
    54     56   Helpers
    55     57     -runstep stepname  ...  : take remaining params as comand and execute as stepname
    56     58                               log will be in stepname.log. Best to put command in quotes
    57     59     -logpro file            : with -exec apply logpro file to stepname.log, creates
    58     60                               stepname.html and sets log to same
    59     61                               If using make use stepname_logpro.log as your target
................................................................................
    88     90   		        "-force"
    89     91   		        "-xterm"
    90     92   		        "-showkeys"
    91     93   		        "-test-status"
    92     94   		        "-gui"
    93     95   			"-runall"    ;; run all tests
    94     96   			"-remove-runs"
           97  +			"-keepgoing"
    95     98   		       )
    96     99   		 args:arg-hash
    97    100   		 0))
    98    101   
    99    102   (if (args:get-arg "-h")
   100    103       (begin
   101    104         (print help)
................................................................................
   252    255   		(print "ERROR: Attempted to run a test but run area config file not found")
   253    256   		(exit 1))
   254    257   	      ;; put test parameters into convenient variables
   255    258   	      (let* ((test-names (get-all-legal-tests))) ;; "PROD" is ignored for now
   256    259   		(print "INFO: Attempting to start the following tests...")
   257    260   		(print "     " (string-intersperse test-names ","))
   258    261   		(run-tests db test-names)))
   259         -	  (run-waiting-tests db)
          262  +	  ;; (run-waiting-tests db)
   260    263   	  (sqlite3:finalize! db)
   261    264   	  (set! *didsomething* #t))))
   262    265   
   263    266   ;;======================================================================
   264    267   ;; run one test
   265    268   ;;======================================================================
   266    269   
................................................................................
   293    296   	      (print "ERROR: Attempted to run a test but run area config file not found")
   294    297   	      (exit 1))
   295    298   	    ;; put test parameters into convenient variables
   296    299   	    (let* ((test-names   (string-split (args:get-arg "-runtests") ",")))
   297    300   	      (run-tests db test-names)))
   298    301   	;; run-waiting-tests db)
   299    302   	(sqlite3:finalize! db)
   300         -	(run-waiting-tests #f)
          303  +	;; (run-waiting-tests #f)
   301    304   	(set! *didsomething* #t))))
   302    305   	  
   303    306   (if (args:get-arg "-runtests")
   304    307       (runtests))
   305    308   
   306    309   ;;======================================================================
   307    310   ;; execute the test
................................................................................
   389    392   							     (- 
   390    393   							      (current-seconds) 
   391    394   							      start-seconds))))))
   392    395   				     (let loop ((minutes   (calc-minutes)))
   393    396   				       (let ((db    (open-db)))
   394    397   					 (set! kill-job? (test-get-kill-request db run-id test-name itemdat))
   395    398   					 (test-update-meta-info db run-id test-name itemdat minutes)
   396         -					 (if kill-job? (process-signal (vector-ref exit-info 0) signal/term))
          399  +					 (if kill-job? 
          400  +					     (begin 
          401  +					       (process-signal (vector-ref exit-info 0) signal/term)
          402  +					       (sleep 2)
          403  +					       (handle-exceptions
          404  +						exn
          405  +						(print "ERROR: Problem killing process " (vector-ref exit-info 0))
          406  +						(process-signal (vector-ref exit-info 0) signal/kill))))
   397    407   					 (sqlite3:finalize! db)
   398    408   					 (thread-sleep! (+ 8 (random 4))) ;; add some jitter to the call home time to spread out the db accesses
   399    409   					 (loop (calc-minutes)))))))
   400    410   		   (th1          (make-thread monitorjob))
   401    411   		   (th2          (make-thread runit)))
   402    412   	      (thread-start! th1)
   403    413   	      (thread-start! th2)
................................................................................
   500    510   			 (redir      (case (string->symbol shell)
   501    511   				       ((tcsh csh ksh)    ">&")
   502    512   				       ((zsh bash sh ash) "2>&1 >")))
   503    513   			 (fullcmd    (conc "(" (string-intersperse 
   504    514   						(cons cmd params) " ")
   505    515   					   ") " redir " " logfile)))
   506    516   		    ;; mark the start of the test
   507         -		    (test-set-status! db run-id test-name "start" "n/a" itemdat (args:get-arg "-m"))
          517  +		    (teststep-set-status! db run-id test-name stepname "start" "n/a" itemdat (args:get-arg "-m"))
   508    518   		    ;; close the db
   509    519   		    (sqlite3:finalize! db)
   510    520   		    ;; run the test step
   511    521   		    (print "INFO: Running \"" fullcmd "\"")
   512    522   		    (change-directory startingdir)
   513    523   		    (set! exitstat (system fullcmd)) ;; cmd params))
   514    524   		    (set! *globalexitstatus* exitstat)

Modified runs.scm from [a13e1910a3378aca] to [4be3851e9e3a8974].

   248    248   	      tests)
   249    249       res))
   250    250   
   251    251   (define (run-tests db test-names)
   252    252     (let* ((keys        (db-get-keys db))
   253    253   	 (keyvallst   (keys->vallist keys #t))
   254    254   	 (run-id      (register-run db keys))) ;;  test-name)))
   255         -    (for-each 
   256         -     (lambda (test-name)
   257         -       (let ((num-running (db:get-count-tests-running db))
   258         -	     (max-concurrent-jobs (config-lookup *configdat* "setup" "max_concurrent_jobs")))
   259         -	 (print "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
   260         -	 (if (or (not max-concurrent-jobs)
   261         -		 (and max-concurrent-jobs
   262         -		      (string->number max-concurrent-jobs)
   263         -		      (not (> num-running (string->number max-concurrent-jobs)))))
   264         -	     (run-one-test db run-id test-name keyvallst)
   265         -	     (print "WARNING: Max running jobs exceeded, current number running: " num-running 
   266         -		    ", max_concurrent_jobs: " max-concurrent-jobs))))
   267         -     test-names)))
   268         -
          255  +    (let loop ((numtimes 0))
          256  +      (for-each 
          257  +       (lambda (test-name)
          258  +	 (let ((num-running (db:get-count-tests-running db))
          259  +	       (max-concurrent-jobs (config-lookup *configdat* "setup" "max_concurrent_jobs")))
          260  +	   (print "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
          261  +	   (if (or (not max-concurrent-jobs)
          262  +		   (and max-concurrent-jobs
          263  +			(string->number max-concurrent-jobs)
          264  +			(not (>= num-running (string->number max-concurrent-jobs)))))
          265  +	       (run-one-test db run-id test-name keyvallst)
          266  +	       (print "WARNING: Max running jobs exceeded, current number running: " num-running 
          267  +		      ", max_concurrent_jobs: " max-concurrent-jobs))))
          268  +       test-names)
          269  +      (if (args:get-arg "-keepgoing")
          270  +	  (let ((estrem (db:estimated-tests-remaining db run-id)))
          271  +	    (if (> estrem 0)
          272  +		(begin
          273  +		  (print "Keep going, estimated " estrem " tests remaining to run, will continue in 10 seconds ...")
          274  +		  (sleep 10)
          275  +		  (loop (+ numtimes 1)))))))))
          276  +	   
   269    277   ;; VERY INEFFICIENT! Move stuff that should be done once up to calling proc
   270    278   (define (run-one-test db run-id test-name keyvallst)
   271    279     (print "Launching test " test-name)
   272    280     ;; All these vars might be referenced by the testconfig file reader
   273    281     (setenv "MT_TEST_NAME" test-name) ;; 
   274    282     (setenv "MT_RUNNAME"   (args:get-arg ":runname"))
   275    283     (set-megatest-env-vars db run-id) ;; these may be needed by the launching process
................................................................................
   305    313   		   (test-status   #f)
   306    314   		   (num-running (db:get-count-tests-running db))
   307    315   		   (max-concurrent-jobs (config-lookup *configdat* "setup" "max_concurrent_jobs")))
   308    316   	      (print "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running)
   309    317   	      (if (not (or (not max-concurrent-jobs)
   310    318   			   (and max-concurrent-jobs
   311    319   				(string->number max-concurrent-jobs)
   312         -				(not (> num-running (string->number max-concurrent-jobs))))))
          320  +				(not (>= num-running (string->number max-concurrent-jobs))))))
   313    321   		  (print "WARNING: Max running jobs exceeded, current number running: " num-running 
   314    322   			 ", max_concurrent_jobs: " max-concurrent-jobs)
   315    323   		  (begin
   316    324   		    (let loop2 ((ts #f)
   317    325   				(ct 0))
   318    326   		      (if (and (not ts)
   319    327   			       (< ct 10))

Modified tests/Makefile from [4c07feade78035be] to [0b961aa417fcf512].

     1      1   # run some tests
     2      2   
     3      3   MEGATEST=$(shell realpath ../megatest)
     4      4   
     5      5   runall :
     6      6   	cd ../;make 
     7         -	$(MEGATEST) -runall :sysname ubuntu :fsname nfs :datapath none :runname `date +%GWW%V.%u` -m "This is a comment specific to a run"
            7  +	$(MEGATEST) -keepgoing -runall :sysname ubuntu :fsname nfs :datapath none :runname `date +%GWW%V.%u` -m "This is a comment specific to a run"
     8      8   
     9      9   test :
    10     10   	cd ../;make test
    11     11   	make runall
    12     12   
    13     13   dashboard :
    14     14   	cd ../;make dashboard

Modified tests/megatest.config from [b996a05181a3c85b] to [7cce1c983316acca].

     1      1   [fields]
     2      2   sysname TEXT
     3      3   fsname TEXT
     4      4   datapath TEXT
     5      5   
     6      6   [setup]
     7      7   # exectutable /path/to/megatest
     8         -max_concurrent_jobs 405
            8  +max_concurrent_jobs 8
     9      9   runsdir /tmp/runs
    10     10   
    11     11   [jobtools]
    12     12   # ## launcher launches jobs, the job is managed on the target host
    13     13   ## by megatest, comment out launcher to run local
    14     14   # workhosts localhost hermes
    15     15   launcher nbfake