Megatest

Check-in [7a36940463]
Login
Overview
Comment:Added scaffolding for test recovery feature
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.60
Files: files | file ages | folders
SHA1: 7a3694046379ec3cb60833898d89b1637ec629ad
User & Date: matt on 2015-10-16 03:35:50
Other Links: branch diff | manifest | tags
Context
2015-10-16
18:05
Fixed direct access check-in: 990765c362 user: mrwellan tags: v1.60
03:35
Added scaffolding for test recovery feature check-in: 7a36940463 user: matt tags: v1.60
2015-10-14
22:31
Added lazy-dot check-in: 3369e3bfa0 user: matt tags: v1.60
Changes

Modified launch.scm from [86c32a2e0c] to [d1b54f5011].

1023
1024
1025
1026
1027
1028
1029

























            ))
      (alist->env-vars miscprevvals)
      (alist->env-vars testprevvals)
      (alist->env-vars commonprevvals)
      launch-results))
  (change-directory *toppath*))

































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
            ))
      (alist->env-vars miscprevvals)
      (alist->env-vars testprevvals)
      (alist->env-vars commonprevvals)
      launch-results))
  (change-directory *toppath*))

;; recover a test where the top controlling mtest may have died
;;
(define (launch:recover-test run-id test-id)
  ;; this function is called on the test run host via ssh
  ;;
  ;; 1. look at the process from pid
  ;;    - is it owned by calling user
  ;;    - it it's run directory correct for the test
  ;;    - is there a controlling mtest (maybe stuck)
  ;; 2. if recovery is needed watch pid
  ;;    - when it exits take the exit code and do the needful
  ;;
  (let* ((pid (rmt:test-get-top-process-id run-id test-id))
	 (psres (with-input-from-pipe
		 (conc "ps -F -u " (current-user-name) " | grep -E '" pid " ' | grep -v 'grep -E " pid "'")
		 (lambda ()
		   (read-line))))
	 (rundir (if (string? psres) ;; real process owned by user
		     (read-symbolic-link (conc "/proc/" pid "/cwd"))
		     #f)))
    ;; now wait on that process if all is correct
    ;; periodically update the db with runtime
    ;; when the process exits look at the db, if still RUNNING after 10 seconds set
    ;; state/status appropriately
    (process-wait pid)))

Modified megatest.scm from [fc50829803] to [47e41f04c2].

249
250
251
252
253
254
255

256
257
258
259
260
261
262
			"-ping"
			"-refdb2dat"
			"-o"
			"-log"
			"-archive"
			"-since"
			"-fields"

			) 
		 (list  "-h" "-help" "--help"
			"-version"
		        "-force"
		        "-xterm"
		        "-showkeys"
		        "-show-keys"







>







249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
			"-ping"
			"-refdb2dat"
			"-o"
			"-log"
			"-archive"
			"-since"
			"-fields"
			"-recover-test" ;; run-id,test-id - used internally to recover a test stuck in RUNNING state
			) 
		 (list  "-h" "-help" "--help"
			"-version"
		        "-force"
		        "-xterm"
		        "-showkeys"
		        "-show-keys"
1342
1343
1344
1345
1346
1347
1348


















1349
1350
1351
1352
1353
1354
1355
;;    - gathers host info and 
;;======================================================================

(if (args:get-arg "-execute")
    (begin
      (launch:execute (args:get-arg "-execute"))
      (set! *didsomething* #t)))



















;;======================================================================
;; Test commands (i.e. for use inside tests)
;;======================================================================

(define (megatest:step step state status logfile msg)
  (if (not (getenv "MT_CMDINFO"))







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
;;    - gathers host info and 
;;======================================================================

(if (args:get-arg "-execute")
    (begin
      (launch:execute (args:get-arg "-execute"))
      (set! *didsomething* #t)))

;;======================================================================
;; recover from a test where the managing mtest was killed but the underlying
;; process might still be salvageable
;;======================================================================

(if (args:get-arg "-recover-test")
    (let* ((params (string-split (args:get-arg "-recover-test") ",")))
      (if (> (length params) 1) ;; run-id and test-id
	  (let ((run-id (string->number (car params)))
		(test-id (string->number (cadr params))))
	    (if (and run-id test-id)
		(begin
		  (launch:recover-test run-id test-id)
		  (set! *didsomething* #t))
		(begin
		  (debug:print 0 "ERROR: bad run-id or test-id, must be integers")
		  (exit 1)))))))

;;======================================================================
;; Test commands (i.e. for use inside tests)
;;======================================================================

(define (megatest:step step state status logfile msg)
  (if (not (getenv "MT_CMDINFO"))