Megatest

Check-in [6ea70b977c]
Login
Overview
Comment:Added an alternative to faux-lock that is simpler and uses a transaction. Added locking to test launch process. This may address the copy collisions we have seen.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.64
Files: files | file ages | folders
SHA1: 6ea70b977cdc9de384f2e0d125403bb7cf943fff
User & Date: matt on 2017-07-09 21:33:40
Other Links: branch diff | manifest | tags
Context
2017-07-10
10:47
Bump version to v1.6424 check-in: 7249f37836 user: mrwellan tags: v1.64
2017-07-09
21:33
Added an alternative to faux-lock that is simpler and uses a transaction. Added locking to test launch process. This may address the copy collisions we have seen. check-in: 6ea70b977c user: matt tags: v1.64
2017-07-07
16:57
Added delte-run option check-in: 1c79e4a834 user: ritikaag tags: v1.64
Changes

Modified api.scm from [1f6842e15f] to [c4438e36a1].

212
213
214
215
216
217
218

219
220
221
222
223
224
225
                   ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params))
                   ((tasks-get-last)            (apply tasks:get-last dbstruct params))

		   ;; NO SYNC DB
		   ((no-sync-set)               (apply db:no-sync-set         *no-sync-db* params))
		   ((no-sync-get/default)       (apply db:no-sync-get/default *no-sync-db* params))
		   ((no-sync-del!)              (apply db:no-sync-del!        *no-sync-db* params))

		 
                   ;; ARCHIVES
                   ;; ((archive-get-allocations)   
                   ((archive-register-disk)     (apply db:archive-register-disk dbstruct params))
                   ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params))
                   ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey))








>







212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
                   ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params))
                   ((tasks-get-last)            (apply tasks:get-last dbstruct params))

		   ;; NO SYNC DB
		   ((no-sync-set)               (apply db:no-sync-set         *no-sync-db* params))
		   ((no-sync-get/default)       (apply db:no-sync-get/default *no-sync-db* params))
		   ((no-sync-del!)              (apply db:no-sync-del!        *no-sync-db* params))
		   ((no-sync-get-lock)          (apply db:no-sync-get-lock    *no-sync-db* params))
		 
                   ;; ARCHIVES
                   ;; ((archive-get-allocations)   
                   ((archive-register-disk)     (apply db:archive-register-disk dbstruct params))
                   ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params))
                   ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey))

Modified common.scm from [af8ef99e5c] to [4881cf65d4].

2069
2070
2071
2072
2073
2074
2075






2076
2077
2078
2079
2080
2081
2082
2083
  (string-intersperse 
   (map (lambda (x)
          (number->string x 16))
        (map string->number
             (string-split instr)))
   "/"))







(define (common:faux-lock keyname #!key (wait-time 8))
  (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count
      (if (> wait-time 0)
	  (begin
	    (thread-sleep! 1)
	    (if (eq? wait-time 1) ;; only one second left, steal the lock
		(begin
		  (debug:print-info 0 *default-log-port* "stealing lock for " keyname)







>
>
>
>
>
>
|







2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
  (string-intersperse 
   (map (lambda (x)
          (number->string x 16))
        (map string->number
             (string-split instr)))
   "/"))

;;======================================================================
;; L O C K I N G   M E C H A N I S M S 
;;======================================================================

;; faux-lock is deprecated. Please use simple-lock below
;;
(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t))
  (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count
      (if (> wait-time 0)
	  (begin
	    (thread-sleep! 1)
	    (if (eq? wait-time 1) ;; only one second left, steal the lock
		(begin
		  (debug:print-info 0 *default-log-port* "stealing lock for " keyname)
2091
2092
2093
2094
2095
2096
2097




2098




2099
2100
2101
2102
2103
2104
2105
(define (common:faux-unlock keyname #!key (force #f))
  (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))
      (begin
        (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname))
        #t)
      #f))





  




(define (common:in-running-test?)
  (and (args:get-arg "-execute") (get-environment-variable "MT_CMDINFO")))

(define (common:get-color-from-status status)
  (cond
   ((equal? status "PASS")    "green")
   ((equal? status "FAIL")    "red")







>
>
>
>
|
>
>
>
>







2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
(define (common:faux-unlock keyname #!key (force #f))
  (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))
      (begin
        (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname))
        #t)
      #f))

;; simple lock. improve and converge on this one.
;;
(define (common:simple-lock keyname)
  (rmt:no-sync-get-lock keyname))

;;======================================================================
;;
;;======================================================================

(define (common:in-running-test?)
  (and (args:get-arg "-execute") (get-environment-variable "MT_CMDINFO")))

(define (common:get-color-from-status status)
  (cond
   ((equal? status "PASS")    "green")
   ((equal? status "FAIL")    "red")

Modified db.scm from [43a8d2d12e] to [05a6ad4f54].

1889
1890
1891
1892
1893
1894
1895



















1896
1897
1898
1899
1900
1901
1902
          (if newres
              newres
              res))
        res)))

(define (db:no-sync-close-db db)
  (db:safely-close-sqlite3-db db))




















;; use a global for some primitive caching, it is just silly to
;; re-read the db over and over again for the keys since they never
;; change

;; why get the keys from the db? why not get from the *configdat*
;; using keys:config-get-fields?







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
          (if newres
              newres
              res))
        res)))

(define (db:no-sync-close-db db)
  (db:safely-close-sqlite3-db db))

;; transaction protected lock aquisition
;; either:
;;    fails    returns  (#f . lock-creation-time)
;;    succeeds (returns (#t . lock-creation-time)
;; use (db:no-sync-del! db keyname) to release the lock
;;
(define (db:no-sync-get-lock db keyname)
  (sqlite3:with-transaction
   (db:no-sync-db db)
   (lambda ()
     (handle-exceptions
	 exn
	 (let ((lock-time (current-seconds)))
	   (sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time)
	   `(#t . ,lock-time))
       `(#f . ,(sqlite3:first-result db "SELECT val FROM no_sync_metadat WHERE var=?;" keyname))))))



;; use a global for some primitive caching, it is just silly to
;; re-read the db over and over again for the keys since they never
;; change

;; why get the keys from the db? why not get from the *configdat*
;; using keys:config-get-fields?

Modified launch.scm from [1d514c355f] to [bc68bfb44c].

1236
1237
1238
1239
1240
1241
1242













1243
1244
1245
1246
1247
1248
1249
1250
;; 3. create link from run dir to megatest runs area 
;; 4. remotely run the test on allocated host
;;    - could be ssh to host from hosts table (update regularly with load)
;;    - could be netbatch
;;      (launch-test db (cadr status) test-conf))
(define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params)
  (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex













  (let* ((item-path       (item-list->path itemdat))
	 (contour         #f)) ;; NOT READY FOR THIS (args:get-arg "-contour")))
    (let loop ((delta        (- (current-seconds) *last-launch*))
	       (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1)))
      (if (> launch-delay delta)
	  (begin
	    (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay.
		(debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds"))







>
>
>
>
>
>
>
>
>
>
>
>
>
|







1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
;; 3. create link from run dir to megatest runs area 
;; 4. remotely run the test on allocated host
;;    - could be ssh to host from hosts table (update regularly with load)
;;    - could be netbatch
;;      (launch-test db (cadr status) test-conf))
(define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params)
  (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex
  (let* ((lock-key        (conc "test-" test-id))
	 (got-lock        (let loop ((lock        (rmt:no-sync-get-lock lock-key))
				     (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds
			    (if (car lock)
				#t
				(if (> (current-seconds) expire-time)
				    (begin
				      (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path)
				      (rmt:no-sync-del! lock-key) ;; destroy the lock
				      (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; 
				    (begin
				      (thread-sleep! 1)
				      (loop (rmt:no-sync-get-lock lock-key) expire-time))))))
	 (item-path       (item-list->path itemdat))
	 (contour         #f)) ;; NOT READY FOR THIS (args:get-arg "-contour")))
    (let loop ((delta        (- (current-seconds) *last-launch*))
	       (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1)))
      (if (> launch-delay delta)
	  (begin
	    (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay.
		(debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds"))
1412
1413
1414
1415
1416
1417
1418

1419
1420
1421
1422
1423
1424
1425
					      cmdstr
					      (conc cmdstr " >> mt_launch.log 2>&1 &")))
					(car fullcmd))
				    (if useshell
					'()
					(cdr fullcmd)))))
        (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork.

	(if (not launchwait) ;; give the OS a little time to allow the process to start
	    (thread-sleep! 0.01))
	(with-output-to-file "mt_launch.log"
	  (lambda ()
	    (print "LAUNCHCMD: " (string-intersperse fullcmd " "))
	    (if (list? launch-results)
		(apply print launch-results)







>







1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
					      cmdstr
					      (conc cmdstr " >> mt_launch.log 2>&1 &")))
					(car fullcmd))
				    (if useshell
					'()
					(cdr fullcmd)))))
        (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork.
	(rmt:no-sync-del! lock-key)         ;; release the lock for starting this test
	(if (not launchwait) ;; give the OS a little time to allow the process to start
	    (thread-sleep! 0.01))
	(with-output-to-file "mt_launch.log"
	  (lambda ()
	    (print "LAUNCHCMD: " (string-intersperse fullcmd " "))
	    (if (list? launch-results)
		(apply print launch-results)

Modified rmt.scm from [ec4c6ff50c] to [bc431c6c0b].

865
866
867
868
869
870
871



872
873
874
875
876
877
878

(define (rmt:no-sync-get/default var default)
  (rmt:send-receive 'no-sync-get/default #f `(,var ,default)))

(define (rmt:no-sync-del! var)
  (rmt:send-receive 'no-sync-del! #f `(,var)))




;;======================================================================
;; A R C H I V E S
;;======================================================================

(define (rmt:archive-get-allocations  testname itempath dneeded)
  (rmt:send-receive 'archive-get-allocations #f (list testname itempath dneeded)))








>
>
>







865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881

(define (rmt:no-sync-get/default var default)
  (rmt:send-receive 'no-sync-get/default #f `(,var ,default)))

(define (rmt:no-sync-del! var)
  (rmt:send-receive 'no-sync-del! #f `(,var)))

(define (rmt:no-sync-get-lock keyname)
  (rmt:send-receive 'no-sync-get-lock #f `(,keyname)))

;;======================================================================
;; A R C H I V E S
;;======================================================================

(define (rmt:archive-get-allocations  testname itempath dneeded)
  (rmt:send-receive 'archive-get-allocations #f (list testname itempath dneeded)))