Megatest

Check-in [3685d1ed2e]
Login
Overview
Comment:Cherrypick of 04379 to fix the crashing test control panel bug
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.55
Files: files | file ages | folders
SHA1: 3685d1ed2e2b175d495d327ca9ec583b92c82802
User & Date: matt on 2013-07-29 21:02:39
Other Links: branch diff | manifest | tags
Context
2013-07-29
21:23
Meld in of parts of 0b1a that fix test panel apparent lock up due to overly agressive updating. check-in: d4ee26c5cb user: matt tags: v1.55
21:02
Cherrypick of 04379 to fix the crashing test control panel bug check-in: 3685d1ed2e user: matt tags: v1.55
2013-07-26
12:16
Oops. Somehow missed the actual merge check-in: 26a3036b68 user: mrwellan tags: v1.55, v1.5509
Changes

Modified runs.scm from [1b46e2afce] to [b71546766a].

906
907
908
909
910
911
912
























913
914
915
916
917
918
919

(define (get-dir-up-n dir . params) 
  (let ((dparts  (string-split dir "/"))
	(count   (if (null? params) 1 (car params))))
    (conc "/" (string-intersperse 
	       (take dparts (- (length dparts) count))
	       "/"))))
























;; Remove runs
;; fields are passing in through 
;; action:
;;    'remove-runs
;;    'set-state-status
;;
;; NB// should pass in keys?







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943

(define (get-dir-up-n dir . params) 
  (let ((dparts  (string-split dir "/"))
	(count   (if (null? params) 1 (car params))))
    (conc "/" (string-intersperse 
	       (take dparts (- (length dparts) count))
	       "/"))))

(define (runs:recursive-delete-with-error-msg real-dir)
  (if (> (system (conc "rm -rf " real-dir)) 0)
      (debug:print 0 "ERROR: There was a problem removing " real-dir " with rm -f")))

(define (runs:safe-delete-test-dir real-dir)
  ;; first delete all sub-directories
  (directory-fold 
   (lambda (f x)
     (let ((fullname (conc real-dir "/" f)))
       (if (directory? fullname)(runs:recursive-delete-with-error-msg fullname)))
     (+ 1 x))
   0 real-dir)
  ;; then files other than *testdat.db*
  (directory-fold 
   (lambda (f x)
     (let ((fullname (conc real-dir "/" f)))
       (if (not (string-search (regexp "testdat.db") f))
	   (runs:recursive-delete-with-error-msg fullname)))
     (+ 1 x))
   0 real-dir)
  ;; then the entire directory
  (runs:recursive-delete-with-error-msg real-dir))

;; Remove runs
;; fields are passing in through 
;; action:
;;    'remove-runs
;;    'set-state-status
;;
;; NB// should pass in keys?
987
988
989
990
991
992
993


994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
			    (test-fulln    (db:test-get-fullname new-test-dat)))
			   (case action
			     ((remove-runs)
			      (debug:print-info 0 "test: " test-name " itest-state: " test-state)
			      (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
				  (begin
				    (if (not (hash-table-ref/default test-retry-time test-fulln #f))


					(hash-table-set! test-retry-time test-fulln (current-seconds)))
				    (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time)
				      ;; This test is not in a correct state for cleaning up. Let's try some graceful shutdown steps first
				      ;; Set the test to "KILLREQ" and wait five seconds then try again. Repeat up to five times then give
				      ;; up and blow it away.
				      (begin
					(debug:print 0 "WARNING: could not gracefully remove test " test-fulln ", tried to kill it to no avail. Forcing state to FAILEDKILL and continuing")
					(cdb:remote-run db:test-set-state-status-by-id db (db:test-get-id test) "FAILEDKILL" "n/a" #f)
					(thread-sleep! 1))
				      (begin
					(cdb:remote-run db:test-set-state-status-by-id db (db:test-get-id test) "KILLREQ" "n/a" #f)
					(thread-sleep! 1)))
				    ;; NOTE: This is suboptimal as the testdata will be used later and the state/status may have changed ...
				    (if (null? tal)
					(loop new-test-dat tal)
					(loop (car tal)(append tal (list new-test-dat)))))
				  (begin
				    (cdb:remote-run db:delete-test-records db #f (db:test-get-id test))
				    (debug:print-info 1 "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir)
				    (if (and real-dir 
					     (> (string-length real-dir) 5)
					     (file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc.
					(begin ;; let* ((realpath (resolve-pathname run-dir)))
					  (debug:print-info 1 "Recursively removing " real-dir)
					  (if (file-exists? real-dir)
					      (if (> (system (conc "rm -rf " real-dir)) 0)
						  (debug:print 0 "ERROR: There was a problem removing " real-dir " with rm -f"))
					      (debug:print 0 "WARNING: test dir " real-dir " appears to not exist or is not readable")))
					(if real-dir 
					    (debug:print 0 "WARNING: directory " real-dir " does not exist")
					    (debug:print 0 "WARNING: no real directory corrosponding to link " run-dir ", nothing done")))
				    (if (symbolic-link? run-dir)
					(begin
					  (debug:print-info 1 "Removing symlink " run-dir)







>
>
|
















|







|
<







1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045

1046
1047
1048
1049
1050
1051
1052
			    (test-fulln    (db:test-get-fullname new-test-dat)))
			   (case action
			     ((remove-runs)
			      (debug:print-info 0 "test: " test-name " itest-state: " test-state)
			      (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
				  (begin
				    (if (not (hash-table-ref/default test-retry-time test-fulln #f))
					(begin
					  ;; want to set to REMOVING BUT CANNOT do it here?
					  (hash-table-set! test-retry-time test-fulln (current-seconds))))
				    (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time)
				      ;; This test is not in a correct state for cleaning up. Let's try some graceful shutdown steps first
				      ;; Set the test to "KILLREQ" and wait five seconds then try again. Repeat up to five times then give
				      ;; up and blow it away.
				      (begin
					(debug:print 0 "WARNING: could not gracefully remove test " test-fulln ", tried to kill it to no avail. Forcing state to FAILEDKILL and continuing")
					(cdb:remote-run db:test-set-state-status-by-id db (db:test-get-id test) "FAILEDKILL" "n/a" #f)
					(thread-sleep! 1))
				      (begin
					(cdb:remote-run db:test-set-state-status-by-id db (db:test-get-id test) "KILLREQ" "n/a" #f)
					(thread-sleep! 1)))
				    ;; NOTE: This is suboptimal as the testdata will be used later and the state/status may have changed ...
				    (if (null? tal)
					(loop new-test-dat tal)
					(loop (car tal)(append tal (list new-test-dat)))))
				  (begin
				    (cdb:remote-run db:test-set-state-status-by-id db (db:test-get-id test) "REMOVING" "LOCKED" #f)
				    (debug:print-info 1 "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir)
				    (if (and real-dir 
					     (> (string-length real-dir) 5)
					     (file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc.
					(begin ;; let* ((realpath (resolve-pathname run-dir)))
					  (debug:print-info 1 "Recursively removing " real-dir)
					  (if (file-exists? real-dir)
					      (runs:safe-delete-test-dir real-dir)

					      (debug:print 0 "WARNING: test dir " real-dir " appears to not exist or is not readable")))
					(if real-dir 
					    (debug:print 0 "WARNING: directory " real-dir " does not exist")
					    (debug:print 0 "WARNING: no real directory corrosponding to link " run-dir ", nothing done")))
				    (if (symbolic-link? run-dir)
					(begin
					  (debug:print-info 1 "Removing symlink " run-dir)
1036
1037
1038
1039
1040
1041
1042


1043
1044
1045
1046
1047
1048
1049
						 exn
						 (debug:print 0 "ERROR:  Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue")
						 (delete-directory run-dir)))
					    (if run-dir
						(debug:print 0 "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink")
						(debug:print 0 "NOTE: the run dir for this test is undefined. Test may have already been deleted."))
					    ))


				    (if (not (null? tal))
					(loop (car tal)(cdr tal))))))
			     ((set-state-status)
			      (debug:print-info 2 "new state " (car state-status) ", new status " (cadr state-status))
			      (cdb:remote-run db:test-set-state-status-by-id db (db:test-get-id test) (car state-status)(cadr state-status) #f)
			      (if (not (null? tal))
				  (loop (car tal)(cdr tal))))







>
>







1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
						 exn
						 (debug:print 0 "ERROR:  Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue")
						 (delete-directory run-dir)))
					    (if run-dir
						(debug:print 0 "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink")
						(debug:print 0 "NOTE: the run dir for this test is undefined. Test may have already been deleted."))
					    ))
				    ;; Only delete the records *after* removing the directory. If things fail we have a record 
				    (cdb:remote-run db:delete-test-records db #f (db:test-get-id test))
				    (if (not (null? tal))
					(loop (car tal)(cdr tal))))))
			     ((set-state-status)
			      (debug:print-info 2 "new state " (car state-status) ", new status " (cadr state-status))
			      (cdb:remote-run db:test-set-state-status-by-id db (db:test-get-id test) (car state-status)(cadr state-status) #f)
			      (if (not (null? tal))
				  (loop (car tal)(cdr tal))))