@@ -1,7 +1,7 @@
 
-;; Copyright 2006-2012, Matthew Welland.
+;; Copyright 2006-2013, Matthew Welland.
 ;; 
 ;;  This program is made available under the GNU GPL version 2.0 or
 ;;  greater. See the accompanying file COPYING for details.
 ;; 
 ;;  This program is distributed WITHOUT ANY WARRANTY; without even the
@@ -135,11 +135,11 @@
 	  (alist->env-vars env-ovrd)
 	  (set-megatest-env-vars run-id)
 	  (set-item-env-vars itemdat)
 	  (save-environment-as-files "megatest")
 	  ;; open-run-close not needed for test-set-meta-info
-	  (tests:set-meta-info #f test-id run-id test-name itemdat 0 work-area)
+	  (tests:set-full-meta-info #f test-id run-id 0 work-area)
 	  (tests:test-set-status! test-id "REMOTEHOSTSTART" "n/a" (args:get-arg "-m") #f)
 	  ;; (cdb:set-test-start-time!  *runremote* test-id)
 	  (if (args:get-arg "-xterm")
 	      (set! fullrunscript "xterm")
 	      (if (and fullrunscript (not (file-execute-access? fullrunscript)))
@@ -151,10 +151,11 @@
 	  
 	  (let* ((m            (make-mutex))
 		 (kill-job?    #f)
 		 (exit-info    (vector #t #t #t))
 		 (job-thread   #f)
+		 (keep-going   #t)
 		 (runit        (lambda ()
 				 ;; (let-values
 				 ;;  (((pid exit-status exit-code)
 				 ;;    (run-n-wait fullrunscript)))
 				 (tests:test-set-status! test-id "RUNNING" "n/a" #f #f)
@@ -170,11 +171,11 @@
 					  (vector-set! exit-info 2 exit-code)
 					  (set! rollup-status exit-code) 
 					  (mutex-unlock! m)
 					  (if (eq? pid-val 0)
 					      (begin
-						(thread-sleep! 2)
+						(thread-sleep! 1)
 						(loop (+ i 1)))
 					      )))))
 				 ;; then, if runscript ran ok (or did not get called)
 				 ;; do all the ezsteps (if any)
 				 (if ezsteps
@@ -223,11 +224,11 @@
 								   (vector-set! exit-info 1 exit-status)
 								   (vector-set! exit-info 2 exit-code)
 								   (mutex-unlock! m)
 								   (if (eq? pid-val 0)
 								       (begin
-									 (thread-sleep! 2)
+									 (thread-sleep! 1)
 									 (processloop (+ i 1))))
 								   ))
                                                      (let ((exinfo (vector-ref exit-info 2))
                                                            (logfna (if logpro-used (conc stepname ".html") "")))
 						       ;; testing if procedures called in a remote call cause problems (ans: no or so I suspect)
@@ -275,10 +276,11 @@
 							  (round 
 							   (- 
 							    (current-seconds) 
 							    start-seconds)))))
 					(kill-tries 0))
+				   (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
 				   (let loop ((minutes   (calc-minutes)))
 				     (begin
 				       (set! kill-job? (or (test-get-kill-request test-id) ;; run-id test-name itemdat))
 							   (and runtlim (let* ((run-seconds   (- (current-seconds) start-seconds))
 									       (time-exceeded (> run-seconds runtlim)))
@@ -286,14 +288,17 @@
 									      (begin
 										(debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)
 										#t)
 									      #f)))))
 				       ;; open-run-close not needed for test-set-meta-info
-				       (tests:set-meta-info #f test-id run-id test-name itemdat minutes work-area)
+				       (tests:set-partial-meta-info #f test-id run-id minutes work-area)
 				       (if kill-job? 
 					   (begin
 					     (mutex-lock! m)
+					     ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
+					     ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
+					     ;;       between tries?
 					     (let* ((pid (vector-ref exit-info 0)))
 					       (if (number? pid)
 						   (process-signal pid signal/kill)
 						   ;; (begin
 						   ;;   (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")")
@@ -317,19 +322,25 @@
 						     (sqlite3:finalize! tdb)
 						     (exit 1))))
 					     (set! kill-tries (+ 1 kill-tries))
 					     (mutex-unlock! m)))
 				       ;; (sqlite3:finalize! db)
-				       (thread-sleep! (+ 10 (random 10))) ;; add some jitter to the call home time to spread out the db accesses
-				       (loop (calc-minutes)))))))
-		 (th1          (make-thread monitorjob))
-		 (th2          (make-thread runit)))
+				       (if keep-going
+					   (begin
+					     (thread-sleep! (+ 10 (random 10))) ;; add some jitter to the call home time to spread out the db accesses
+					     (if keep-going
+						 (loop (calc-minutes)))))))))) ;; NOTE: Checking twice for keep-going is intentional
+		 (th1          (make-thread monitorjob "monitor job"))
+		 (th2          (make-thread runit "run job")))
 	    (set! job-thread th2)
 	    (thread-start! th1)
 	    (thread-start! th2)
 	    (thread-join! th2)
-	    (thread-sleep! 0.1) ;; give thread th1 a chance to be done TODO: Verify this is needed.
+	    (set! keep-going #f)
+	    (thread-sleep! 1)
+	    (thread-terminate! th1) ;; Not sure if this is a good idea
+	    (thread-sleep! 0.1) ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec.
 	    (mutex-lock! m)
 	    (let* ((item-path (item-list->path itemdat))
 		   (testinfo  (cdb:get-test-info-by-id *runremote* test-id))) ;; )) ;; run-id test-name item-path)))
 	      ;; Am I completed?
 	      (if (equal? (db:test-get-state testinfo) "RUNNING") ;; (not (equal? (db:test-get-state testinfo) "COMPLETED"))
@@ -338,38 +349,35 @@
 							                ;; (db:test-get-state testinfo)))   ;; else preseve the state as set within the test
 				    )
 			(new-status (cond
 				     ((not (vector-ref exit-info 1)) "FAIL") ;; job failed to run
 				     ((eq? rollup-status 0)
-				      ;; if the current status is AUTO the defer to the calculated value (i.e. leave this AUTO)
+				      ;; if the current status is AUTO then defer to the calculated value (i.e. leave this AUTO)
 				      (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO" "PASS"))
 				     ((eq? rollup-status 1) "FAIL")
 				     ((eq? rollup-status 2)
 				      ;; if the current status is AUTO the defer to the calculated value but qualify (i.e. make this AUTO-WARN)
 				      (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN"))
 				     (else "FAIL")))) ;; (db:test-get-status testinfo)))
-		    (debug:print-info 2 "Test NOT logged as COMPLETED, (state=" (db:test-get-state testinfo) "), updating result, rollup-status is " rollup-status)
+		    (debug:print-info 1 "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (vector-ref exit-info 1) " and rollup-status of " rollup-status)
 		    (tests:test-set-status! test-id 
 					    new-state
 					    new-status
 					    (args:get-arg "-m") #f)
 		    ;; need to update the top test record if PASS or FAIL and this is a subtest
-		    (if (not (equal? item-path ""))
-			(cdb:roll-up-pass-fail-counts *runremote* run-id test-name item-path new-status))
-		    
+		    ;; NO NEED TO CALL roll-up-pass-fail-counts HERE, THIS IS DONE IN roll-up-pass-fail-counts called by tests:test-set-status!
+		    ;; (if (not (equal? item-path ""))
+		    ;;     (begin
+		    ;;       (thread-sleep! 0.1) ;; give other processes an opportunity to access the db as rollup is lower priority
+		    ;;       (cdb:roll-up-pass-fail-counts *runremote* run-id test-name item-path new-status)))
 		    ))
 	      ;; for automated creation of the rollup html file this is a good place...
 	      (if (not (equal? item-path ""))
-		  (tests:summarize-items #f run-id test-name #f)) ;; don't force - just update if no
-	      )
+		  (tests:summarize-items #f run-id test-id test-name #f))) ;; don't force - just update if no
 	    (mutex-unlock! m)
-	    ;; (exec-results (cmd-run->list fullrunscript)) ;;  (list ">" (conc test-name "-run.log"))))
-	    ;; (success      exec-results)) ;; (eq? (cadr exec-results) 0)))
 	    (debug:print 2 "Output from running " fullrunscript ", pid " (vector-ref exit-info 0) " in work area " 
 			 work-area ":\n====\n exit code " (vector-ref exit-info 2) "\n" "====\n")
-	    ;; (sqlite3:finalize! db)
-	    ;; (sqlite3:finalize! tdb)
 	    (if (not (vector-ref exit-info 1))
 		(exit 4)))))))
 
 ;; set up the very basics needed for doing anything here.
 (define (setup-for-run)