Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -1969,30 +1969,36 @@
     (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often!
 	(let* ((proc   (lambda ()
 			 (let loop ((numcpu 0)
 				    (inl    (read-line)))
 			   (if (eof-object? inl)
-			       (begin
-				 (common:write-cached-info actual-host "num-cpus" numcpu)
-				 numcpu)
+			       (if (> numcpu 0)
+				   numcpu
+				   #f) ;; if zero return #f so caller knows that things are not working
 			       (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
 					 (+ numcpu 1)
 					 numcpu)
 				     (read-line))))))
 	       (result (if remote-host
 			   (with-input-from-pipe 
 			       (conc "ssh " remote-host " cat /proc/cpuinfo")
 			     proc)
 			   (with-input-from-file "/proc/cpuinfo" proc))))
-	  (common:write-cached-info actual-host "num-cpus" result)
+	  (if (and result (> result 0)) (common:write-cached-info actual-host "num-cpus" result)) ;; result is #f when no cpus were counted; only cache a real count
 	  result))))
 
 ;; wait for normalized cpu load to drop below maxload
 ;;
-(define (common:wait-for-normalized-load maxload msg remote-host)
+(define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5)) ;; rem-tries: retries left when num-cpus is unavailable
   (let ((num-cpus (common:get-num-cpus remote-host)))
-    (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host)))
+    (if num-cpus
+	(common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host)
+	(if (> rem-tries 0)
+	    (begin
+	      (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again
+	      (common:wait-for-normalized-load maxload msg remote-host (- rem-tries 1)))
+	    #f)))) ;; out of retries: return #f at once instead of sleeping first
 
 ;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
 ;;
 (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f))
   (let* ((loadavg (common:get-cpu-load remote-host))

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -1614,10 +1614,30 @@
 
 ;; given a launch delay (minimum time from last launch) return amount of time to wait
 ;;
 ;; (define (db:launch-delay-left dbstruct run-id launch-delay)
   
+
+
+;; Return the lines of <run-dir>/.final-status as a list of strings (one per
+;; line, in file order), or #f when that file cannot be read. The unreadable
+;; case is logged at debug level 0 together with the run-dir that was tried.
+;;
+(define (db:get-status-from-final-status-file run-dir)
+  (let ((status-file (conc run-dir "/.final-status")))
+    ;; first verify we are able to read the status file
+    (cond
+     ((file-read-access? status-file)
+      (with-input-from-file status-file read-lines))
+     (else
+      ;; not readable: report both paths and let the caller decide what to do
+      (debug:print 0 *default-log-port* "ERROR: cannot read " status-file)
+      (debug:print 0 *default-log-port* "ERROR: run-dir is " run-dir)
+      #f))))
+
+
+
 
 ;;  select end_time-now from
 ;;      (select testname,item_path,event_time+run_duration as
 ;;                          end_time,strftime('%s','now') as now from tests where state in
 ;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));
@@ -1624,10 +1644,12 @@
 
 (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
   (let* ((incompleted '())
 	 (oldlaunched '())
 	 (toplevels   '())
+          ;; The default running-deadtime is 720 seconds = 12 minutes.
+          ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30))
          (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime")))
          (server-start-allowance 200)
          (server-overloaded-budget 200)
          (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30))
          (launch-monitor-on-time-budget 30)
@@ -1635,10 +1657,13 @@
          (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30))
          (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default))
          (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))
          (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period)
          )
+    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
+    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)
+
     (db:with-db 
      dbstruct #f #f
      (lambda (db)
        ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
        ;;
@@ -1657,12 +1682,13 @@
                 (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
               (begin
                 (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
                 (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration))))
         db
+        
         "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');"
-        run-id running-deadtime)
+        run-id running-deadtime) ;; default time 720 seconds
 
        
        (sqlite3:for-each-row 
         (lambda (test-id run-dir uname testname item-path event-time run-duration)
           (if (and (equal? uname "n/a")
@@ -1674,11 +1700,11 @@
               (begin
                 (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)
                 (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
         db
         "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');"
-        run-id remotehoststart-deadtime)
+        run-id remotehoststart-deadtime) ;; default time 230 seconds.
 
        ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
        ;;
        ;; (db:delay-if-busy dbdat)
        (sqlite3:for-each-row
@@ -1710,14 +1736,34 @@
               (all-ids             (append min-incompleted-ids (map car oldlaunched))))
          (if (> (length all-ids) 0)
              (begin
                (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD")
                (for-each
-                (lambda (test-id)
-                  (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))
-                  ;;(db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS.  ref ticket 220546828
-                all-ids))))))))
+                  (lambda (test-id) ;; before declaring a test DEAD, consult the .final-status file in its run dir
+                    (let* (
+                      (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id))
+                      (result (db:get-status-from-final-status-file run-dir))) ;; list of lines, or #f if the file is unreadable
+                    (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) ;; line 1 = state, line 2 = status
+                      (begin
+                        (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
+                        (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "PASS" "Test stopped responding but it has PASSED; marking it PASS in the DB.")
+                      )
+                      (begin ;; no file, too few lines, or anything other than COMPLETED/PASS: mark the test DEAD
+                      (debug:print 0 *default-log-port* "INFO: test " test-id " final state/status is not COMPLETED/PASS. It is " result)
+                      (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")
+                      )
+                     )
+                    )
+                  )
+                  all-ids)
+             )
+         )
+       )
+     )
+   )
+ )
+)
 
 ;; ALL REPLACED BY THE BLOCK ABOVE
 ;;
 ;; 	    (sqlite3:execute 
 ;; 	     db

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -93,16 +93,22 @@
 	;; (let ((info (cadr ezstep)))
 	;; 		   (if (proc? info) "" info)))
 	;; (stepproc       (let ((info (cadr ezstep)))
 	;; 		   (if (proc? info) info #f)))
 	 (stepparts      (string-match (regexp "^(\\{([^\\}\\{]*)\\}\\s*|)(.*)$") stepinfo))
-	 (stepparams     (list-ref stepparts 2)) ;; for future use, {VAR=1,2,3}, run step for each
+	 (stepparams     (if (and (list? stepparts) ;; string-match gives #f when nothing matched
+				  (> (length stepparts) 2)) ;; list-ref 2 needs at least 3 elements
+			     (list-ref stepparts 2)
+			     #f)) ;; for future use, {VAR=1,2,3}, run step for each
 	 (paramparts     (if (string? stepparams)
 			     (map (lambda (x)(string-split x "=")) (string-split-fields "[^;]*=[^;]*" stepparams))
 			     '()))
 	 (subrun         (alist-ref "subrun" paramparts equal?))
-	 (stepcmd        (list-ref stepparts 3))
+	 (stepcmd        (if (and (list? stepparts) ;; string-match gives #f when nothing matched
+				  (> (length stepparts) 3)) ;; list-ref 3 needs at least 4 elements
+			     (list-ref stepparts 3)
+			     (conc "# error, no command for step "stepname))) ;; fallback text used as the command when none could be extracted
 	 (script         "") ; "#!/bin/bash\n") ;; yep, we depend on bin/bash FIXME!!!\
 	 (logpro-file    (conc stepname ".logpro"))
 	 (html-file      (conc stepname ".html"))
 	 (dat-file       (conc stepname ".dat"))
 	 (tconfig-logpro (configf:lookup testconfig "logpro" stepname))
@@ -782,13 +788,29 @@
 	  ;; any previous runs
 	  ;; (db:test-remove-steps db run-id testname itemdat)
 	  ;; now is also a good time to write the .testconfig file
 	  (let* ((tconfig-fname   (conc work-area "/.testconfig"))
 		 (tconfig-tmpfile (conc tconfig-fname ".tmp"))
-		 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t))) ;; 'return-procs)))
+		 (tconfig         (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs)))
+		 (scripts (configf:get-section tconfig "scripts")))
+	    ;; create .testconfig file
 	    (configf:write-alist tconfig tconfig-tmpfile)
-	    (file-move tconfig-tmpfile tconfig-fname #t))
+	    (file-move tconfig-tmpfile tconfig-fname #t)
+	    (delete-file* ".final-status")
+
+	    ;; extract scripts from testconfig and write them to files in test run dir; each [scripts] entry must be a (name content) pair
+	    (for-each
+	     (lambda (scriptdat)
+	       (match scriptdat
+		      ((name content)
+		       (with-output-to-file name
+			 (lambda ()
+			   (print content)
+			   (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))))) ;; rwx for user and group
+		      (else ;; anything that is not a two-element list is reported and skipped
+		       (debug:print-info 0 *default-log-port* "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\""))))
+	     scripts))
 	  ;; 
 	  (let* ((m            (make-mutex))
 		 (kill-job?    #f)
 		 (exit-info    (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status
 		 (job-thread   #f)
@@ -832,10 +854,14 @@
 				     ((eq? (launch:einf-rollup-status exit-info) 4) "WAIVED")
 				     ((eq? (launch:einf-rollup-status exit-info) 5) "ABORT")
 				     ((eq? (launch:einf-rollup-status exit-info) 6) "SKIP")
 				     (else "FAIL")))) ;; (db:test-get-status testinfo)))
 		    (debug:print-info 1 *default-log-port* "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (launch:einf-exit-status exit-info) " and rollup-status of " (launch:einf-rollup-status exit-info))
+   
+        ;; Leave a .final-status file for each sub-test
+        (tests:save-final-status run-id test-id)
+
 		    (tests:test-set-status! run-id 
 					    test-id 
 					    new-state
 					    new-status
 					    (args:get-arg "-m") #f)
@@ -842,12 +868,14 @@
 		    ;; need to update the top test record if PASS or FAIL and this is a subtest
 		    ;; NO NEED TO CALL set-state-status-and-roll-up-items HERE, THIS IS DONE IN set-state-status-and-roll-up-items called by tests:test-set-status!
 		    ))
 	      ;; for automated creation of the rollup html file this is a good place...
 	      (if (not (equal? item-path ""))
-		  (tests:summarize-items run-id test-id test-name #f))
+		      (tests:summarize-items run-id test-id test-name #f))
 	      (tests:summarize-test run-id test-id)  ;; don't force - just update if no
+        ;; Leave a .final-status file for the top level test
+        (tests:save-final-status run-id test-id)
 	      (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id)))
 	    (mutex-unlock! m)
             (launch:end-of-run-check run-id )
 	    (debug:print 2 *default-log-port* "Output from running " fullrunscript ", pid " (launch:einf-pid exit-info) " in work area " 
 			 work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n")

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -1996,11 +1996,11 @@
    (lambda (f x)
      (let ((fullname (conc real-dir "/" f)))
        (if (not (string-search (regexp "testdat.db") f))
 	   (runs:recursive-delete-with-error-msg fullname)))
      (+ 1 x))
-   0 real-dir)
+   0 real-dir #t)
   ;; then the entire directory
   (runs:recursive-delete-with-error-msg real-dir))
 
 ;; cleanup often needs to remove all but the last N runs per target
 ;;

Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -1401,10 +1401,29 @@
 			       ;; (string<? (conc (vector-ref a 2))
 			       ;;	    (conc (vector-ref b 2)))
 			       #f))
 		       (string<? (conc time-a)(conc time-b)))))))))
 
+
+;; Save test state (line 1) and status (line 2) to .final-status in the test directory.
+;; Written with ~A (display): db:get-status-from-final-status-file reads the lines
+;; back as strings and they are compared with equal?, which ~S (write) quoting defeats.
+;;
+(define (tests:save-final-status run-id test-id)
+  (let* ((test-dat    (rmt:get-test-info-by-id run-id test-id))
+	 (out-dir     (db:test-get-rundir test-dat))
+	 (status-file (conc out-dir "/.final-status")))
+    ;; first verify we are able to write the output file
+    (if (not (file-write-access? out-dir))
+	(debug:print 0 *default-log-port* "ERROR: cannot write .final-status to " out-dir)
+	(let ((outp   (open-output-file status-file))
+	      (status (db:test-get-status test-dat))
+	      (state  (db:test-get-state  test-dat)))
+	  (fprintf outp "~A\n" state)  ;; state first ...
+	  (fprintf outp "~A\n" status) ;; ... then status
+	  (close-output-port outp)))))
+
 
 ;; summarize test in to a file test-summary.html in the test directory
 ;;
 (define (tests:summarize-test run-id test-id)
   (let* ((test-dat  (rmt:get-test-info-by-id run-id test-id))
@@ -1422,11 +1441,11 @@
 	       (color     (common:get-color-from-status status))
 	       (logf      (db:test-get-final_logf test-dat))
 	       (steps-dat (tests:get-compressed-steps run-id test-id)))
 	  ;; (dcommon:get-compressed-steps #f 1 30045)
 	  ;; (#("wasting_time" "23:36:13" "23:36:21" "0" "8.0s" "wasting_time.log"))
-	  
+	
 	  (s:output-new
 	   oup
 	   (s:html
 	    (s:title "Summary for " full-name)
 	    (s:body