Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -278,10 +278,11 @@
     (if (not (null? required-tests))
 	(debug:print 1 "INFO: Adding " required-tests " to the run queue"))
     ;; NOTE: these are all parent tests, items are not expanded yet.
     (debug:print 4 "INFO: test-records=" (hash-table->alist test-records))
     (runs:run-tests-queue run-id runname test-records keyvallst flags)
+    (debug:print 1 "INFO: running queue one more time to catch any changed test states")
     (runs:run-tests-queue run-id runname test-records keyvallst flags)
     (debug:print 4 "INFO: All done by here")))
 
 ;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... >
 (define (runs:run-tests-queue run-id runname test-records keyvallst flags)
@@ -288,14 +289,17 @@
     ;; At this point the list of parent tests is expanded 
     ;; NB// Should expand items here and then insert into the run queue.
   (debug:print 5 "test-records: " test-records ", keyvallst: " keyvallst " flags: " (hash-table->alist flags))
   (let ((sorted-test-names (tests:sort-by-priority-and-waiton test-records))
 	(item-patts        (hash-table-ref/default flags "-itempatt" #f))
-	(test-registery    (make-hash-table)))
+	(test-registery    (make-hash-table))
+	(num-retries        0))
     (if (not (null? sorted-test-names))
 	(let loop ((hed         (car sorted-test-names))
-		   (tal         (cdr sorted-test-names)))
+		   (tal         (cdr sorted-test-names))
+		   (reruns      '()))
+	  (if (not (null? reruns))(debug:print 4 "INFO: reruns=" reruns))
 	  (let* ((test-record (hash-table-ref test-records hed))
 		 (test-name   (tests:testqueue-get-testname test-record))
 		 (tconfig     (tests:testqueue-get-testconfig test-record))
 		 (testmode    (let ((m (config-lookup tconfig "requirements" "mode")))
 				(if m (string->symbol m) 'normal)))
@@ -305,11 +309,10 @@
 		 (items       (tests:testqueue-get-items      test-record))
 		 (item-path   (item-list->path itemdat))
 		 (newtal      (append tal (list hed)))
 		 (calc-fails  (lambda (prereqs-not-met)
 				(filter (lambda (test)
-					  (debug:print 9 "test: " test)
 					  (and (vector? test) ;; not (string? test))
 					       (equal? (db:test-get-state test) "COMPLETED")
 					       (not (member (db:test-get-status test)
 							    '("PASS" "WARN" "CHECK" "WAIVED")))))
 					prereqs-not-met)))
@@ -326,15 +329,16 @@
 					     (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t))))
 				       lst))))
 	    
 	    (debug:print 6
 			 "test-name: " test-name
-			 "\n  hed: " hed
+			 "\n  hed:         " hed
 			 "\n  itemdat:     " itemdat
-			 "\n  items:     " items
-			 "\n  item-path: " item-path
-			 "\n  waitons:   " waitons)
+			 "\n  items:       " items
+			 "\n  item-path:   " item-path
+			 "\n  waitons:     " waitons
+			 "\n  num-retries: " num-retries)
 
 	    ;; check for hed in waitons => this would be circular, remove it and issue an
 	    ;; error
 	    (if (member test-name waitons)
 		(begin
@@ -361,20 +365,20 @@
 		 ((not (patt-list-match item-path item-patts))
 		  ;; else the run is stuck, temporarily or permanently
 		  ;; but should check if it is due to lack of resources vs. prerequisites
 		  (debug:print 1 "INFO: Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " item-patts)
 		  (if (not (null? tal))
-		      (loop (car tal)(cdr tal))))
+		      (loop (car tal)(cdr tal) reruns)))
 		 ((not (hash-table-ref/default test-registery (conc test-name "/" item-path) #f))
 		  (open-run-close db:tests-register-test #f run-id test-name item-path)
 		  (hash-table-set! test-registery (conc test-name "/" item-path) #t)
-		  (loop (car newtal)(cdr newtal)))
+		  (loop (car newtal)(cdr newtal) reruns))
 		 ((not have-resources) ;; simply try again after waiting a second
 		  (thread-sleep! (+ 1 *global-delta*))
 		  (debug:print 1 "INFO: no resources to run new tests, waiting ...")
 		  ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
-		  (loop (car newtal)(cdr newtal)))
+		  (loop (car newtal)(cdr newtal) reruns))
 		 ((and have-resources
 		       (or (null? prereqs-not-met)
 			   (and (eq? testmode 'toplevel)
 				(null? non-completed))))
 		  ;; no loop here, just drop though and use the loop at the bottom 
@@ -387,20 +391,20 @@
 			(begin
 			  ;; couldn't run, take a breather
 			  (debug:print 4 "INFO: Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...")
 			  (thread-sleep! (+ 1 *global-delta*)) ;; long sleep here - no resources, may as well be patient
 			  ;; we made new tal by sticking hed at the back of the list
-			  (loop (car newtal)(cdr newtal)))
+			  (loop (car newtal)(cdr newtal) reruns))
 			;; the waiton is FAIL so no point in trying to run hed ever again
 			(if (not (null? tal))
 			    (if (vector? hed)
 				(begin (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed)
 						    " from the launch list as it has prerequistes that are FAIL")
-				       (loop (car tal)(cdr tal)))
+				       (loop (car tal)(cdr tal) (cons hed reruns)))
 				(begin
 				  (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;;  " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)")
-				  (loop hed tal)))))))))
+				  (loop hed tal reruns)))))))))
 	     
 	     ;; case where an items came in as a list been processed
 	     ((and (list? items)     ;; thus we know our items are already calculated
 		   (not   itemdat)) ;; and not yet expanded into the list of things to be done
 	      (if (and (>= *verbosity* 1)
@@ -424,11 +428,11 @@
 			 (tests:testqueue-set-item_path! new-test-record my-item-path)
 			 (hash-table-set! test-records newtestname new-test-record)
 			 (set! tal (cons newtestname tal)))))) ;; since these are itemized create new test names testname/itempath
 	       items)
 	      (if (not (null? tal))
-		  (loop (car tal)(cdr tal))))
+		  (loop (car tal)(cdr tal) reruns)))
 
 	     ;; if items is a proc then need to run items:get-items-from-config, get the list and loop 
 	     ;;    - but only do that if resources exist to kick off the job
 	     ((or (procedure? items)(eq? items 'have-procedure))
 	      (let ((can-run-more    (open-run-close runs:can-run-more-tests #f test-record)))
@@ -435,16 +439,19 @@
 		(if can-run-more
 		    (let* ((prereqs-not-met (open-run-close db:get-prereqs-not-met #f run-id waitons item-path mode: testmode))
 			   (fails           (calc-fails prereqs-not-met))
 			   (non-completed   (calc-not-completed prereqs-not-met)))
 		      (debug:print 8 "INFO: can-run-more: " can-run-more
+				   "\n testname:        " hed
 				   "\n prereqs-not-met: " (pretty-string prereqs-not-met)
 				   "\n non-completed:   " (pretty-string non-completed) 
 				   "\n fails:           " (pretty-string fails)
 				   "\n testmode:        " testmode
-				   "\n (eq? testmode 'toplevel) " (eq? testmode 'toplevel)
-				   "\n (null? non-completed)    " (null? non-completed))
+				   "\n num-retries:     " num-retries
+				   "\n (eq? testmode 'toplevel): " (eq? testmode 'toplevel)
+				   "\n (null? non-completed):    " (null? non-completed)
+				   "\n reruns: " reruns)
 		      (cond 
 		       ((or (null? prereqs-not-met) ;; all prereqs met, fire off the test
 			    ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch
 			    (and (eq? testmode 'toplevel)
 				 (null? non-completed)))
@@ -454,34 +461,36 @@
 			  (open-run-close-measure set-megatest-env-vars #f run-id) ;; these may be needed by the launching process
 			  (let ((items-list (items:get-items-from-config tconfig)))
 			    (if (list? items-list)
 				(begin
 				  (tests:testqueue-set-items! test-record items-list)
-				  (loop hed tal))
+				  (loop hed tal reruns))
 				(begin
 				  (debug:print 0 "ERROR: The proc from reading the setup did not yield a list - please report this")
 				  (exit 1))))))
 		       ((null? fails)
-			(loop (car newtal)(cdr newtal))) ;; an issue with prereqs not yet met?
+			(debug:print 4 "INFO: fails is null, moving on in the queue but keeping " hed " for now")
+			(loop (car newtal)(cdr newtal) reruns)) ;; an issue with prereqs not yet met?
 		       ((and (not (null? fails))(eq? testmode 'normal))
 			(debug:print 1 "INFO: test "  hed " (mode=" testmode ") has failed prerequisite(s); "
 				     (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ")
 				     ", removing it from to-do list")
 			(if (not (null? tal))
-			    (loop (car tal)(cdr tal))))
+			    (loop (car tal)(cdr tal)(cons hed reruns))))
 		       (else
 			(debug:print 8 "ERROR: No handler for this condition.")
 			;; 	     "\n  hed:            " hed 
 			;; 	     "\n fails:           " (string-intersperse (map db:test-get-testname fails) ",")
 			;; 	     "\n testmode:        " testmode
 			;; 	     "\n prereqs-not-met: " (pretty-string prereqs-not-met)
 			;; 	     "\n items:           " items)
-			(loop (car newtal)(cdr newtal)))))
+			(loop (car newtal)(cdr newtal) reruns))))
 		    ;; if can't run more just loop with next possible test
 		    (begin
+		      (debug:print 4 "INFO: processing the case with a lambda for items or 'have-procedure. Moving through the queue without dropping " hed)
 		      (thread-sleep! (+ 1 *global-delta*))
-		      (loop (car newtal)(cdr newtal))))))
+		      (loop (car newtal)(cdr newtal) reruns)))))
 	     
 	     ;; this case should not happen, added to help catch any bugs
 	     ((and (list? items) itemdat)
 	      (debug:print 0 "ERROR: Should not have a list of items in a test and the itemspath set - please report this")
 	      (exit 1))))
@@ -495,14 +504,20 @@
 		;; FIXME! This harsh exit should not be necessary....
 		(if (not *runremote*)(exit)) ;; 
 		#f) ;; return a #f as a hint that we are done
 	      ;; Here we need to check that all the tests remaining to be run are eligible to run
 	      ;; and are not blocked by failed
-	      (let ((newlst (open-run-close tests:filter-non-runnable #f run-id tal test-records))) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED,
+	      (let* ((newlst (open-run-close tests:filter-non-runnable #f run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED,
+		     (junked (lset-difference equal? tal newlst)))
+		(debug:print 4 "INFO: full drop through, num-retries=" num-retries ", while num-retries is less than 100 we will force retry the reruns: " reruns)
+		(if (< num-retries 100) ;; cap on how many drop-through passes will re-queue the dropped tests
+		    (set! newlst (append reruns newlst))) ;; reruns go to the front of the list so they are tried first
+		(set! num-retries (+ num-retries 1)) ;; incremented once per pass through this branch, whether or not reruns was empty
 		(thread-sleep! *global-delta*)
 		(if (not (null? newlst))
-		    (loop (car newlst)(cdr newlst)))))))))
+		    ;; since reruns have been tacked on to newlst create new reruns from junked
+		    (loop (car newlst)(cdr newlst)(delete-duplicates junked)))))))))
 
 ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step
 (define (run:test run-id runname keyvallst test-record flags parent-test)
   ;; All these vars might be referenced by the testconfig file reader
   (let* ((test-name    (tests:testqueue-get-testname   test-record))

Index: tests/Makefile
==================================================================
--- tests/Makefile
+++ tests/Makefile
@@ -27,10 +27,12 @@
 test5 : fullprep
 	cd fullrun;$(MEGATEST) -runall -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_aa -debug $(DEBUG) > aa.log 2> aa.log &
 	cd fullrun;$(MEGATEST) -runall -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_ab -debug $(DEBUG) > ab.log 2> ab.log &
 	cd fullrun;$(MEGATEST) -runall -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_ac -debug $(DEBUG) > ac.log 2> ac.log &
 	cd fullrun;$(MEGATEST) -runall -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_ad -debug $(DEBUG) > ad.log 2> ad.log &	
+	cd fullrun;$(MEGATEST) -runall -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_ae -debug $(DEBUG) > ae.log 2>&1 &
+	cd fullrun;$(MEGATEST) -runall -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_af -debug $(DEBUG) > af.log 2>&1 &
 
 test6: fullprep
 	cd fullrun;$(MEGATEST) -runtests runfirst -itempatt %/1 -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_itempatt -v
 	cd fullrun;$(MEGATEST) -runtests runfirst -itempatt %blahha% -reqtarg ubuntu/nfs/none :runname $(RUNNAME)_itempatt -debug 10