Megatest

Check-in [bc2a1779bc]
Login
Overview
Comment:Improved messages in couple noisy cases. Add flag for turning off touching of server logs when max api calls situation occurs
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.64
Files: files | file ages | folders
SHA1: bc2a1779bc75107f4d6bcee57846da7c31735d69
User & Date: matt on 2017-05-31 23:05:24
Other Links: branch diff | manifest | tags
Context
2017-06-01
00:21
Corrected issue with server overload handling. Switch to using a less burdensome server ping instead of server:check-if-running check-in: fd6bcc8b75 user: matt tags: v1.64
2017-05-31
23:05
Improved messages in couple noisy cases. Add flag for turning off touching of server logs when max api calls situation occurs check-in: bc2a1779bc user: matt tags: v1.64
2017-05-30
10:33
bumped version check-in: 9dbae6fda1 user: bjbarcla tags: v1.64, v1.6416
Changes

Modified api.scm from [5e34e05ad5] to [656c55c8ea].

125
126
127
128
129
130
131

132
133
134
135
136
137
138
     (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
     (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f (vector #f "remote must be called with a vector")))
    ((> *api-process-request-count* 20) ;; 20)
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")

     (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
    (else  
     (let* ((cmd-in            (vector-ref dat 0))
            (cmd               (if (symbol? cmd-in)
				   cmd-in
				   (string->symbol cmd-in)))
            (params            (vector-ref dat 1))







>







125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
     (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
     (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
   (cond
    ((not (vector? dat))                    ;; it is an error to not receive a vector
     (vector #f (vector #f "remote must be called with a vector")))
    ((> *api-process-request-count* 20) ;; 20)
     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
     (set! *server-overloaded* #t)
     (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
    (else  
     (let* ((cmd-in            (vector-ref dat 0))
            (cmd               (if (symbol? cmd-in)
				   cmd-in
				   (string->symbol cmd-in)))
            (params            (vector-ref dat 1))

Modified common.scm from [f90660c3ad] to [ebe13dceae].

43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
	      (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)
	    (setenv key val))
	  (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val))))

(define home (getenv "HOME"))
(define user (getenv "USER"))

;; GLOBAL GLETCHES

;; CONTEXTS
(defstruct cxt
  (taskdb #f)
  (cmutex (make-mutex)))
;; (define *contexts* (make-hash-table))
;; (define *context-mutex* (make-mutex))







|







43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
	      (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)
	    (setenv key val))
	  (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val))))

(define home (getenv "HOME"))
(define user (getenv "USER"))

;; GLOBALS

;; CONTEXTS
(defstruct cxt
  (taskdb #f)
  (cmutex (make-mutex)))
;; (define *contexts* (make-hash-table))
;; (define *context-mutex* (make-mutex))
126
127
128
129
130
131
132

133
134
135
136
137
138
139
(define *run-id*            #f)
(define *server-kind-run*   (make-hash-table))
(define *home-host*         #f)
;; (define *total-non-write-delay* 0)
(define *heartbeat-mutex*   (make-mutex))
(define *api-process-request-count* 0)
(define *max-api-process-requests* 0)


;; client
(define *rmt-mutex*         (make-mutex))     ;; remote access calls mutex 

;; RPC transport
(define *rpc:listener*      #f)








>







126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
(define *run-id*            #f)
(define *server-kind-run*   (make-hash-table))
(define *home-host*         #f)
;; (define *total-non-write-delay* 0)
(define *heartbeat-mutex*   (make-mutex))
(define *api-process-request-count* 0)
(define *max-api-process-requests* 0)
(define *server-overloaded*  #f)

;; client
(define *rmt-mutex*         (make-mutex))     ;; remote access calls mutex 

;; RPC transport
(define *rpc:listener*      #f)

Modified db.scm from [4838550ec8] to [de6a9e2632].

400
401
402
403
404
405
406















407
408
409
410
411
412
413
414
415
416
417
418
419
420
421

422
423
424
425
426
427
428
429
430
431
432
433
      (mutex-unlock! *db-multi-sync-mutex*)
      (db:sync-tables (db:sync-all-tables-list dbstruct) update_info tmpdb refndb mtdb))
    (mutex-lock! *db-multi-sync-mutex*)
    (set! *db-last-sync* start-t)
    (set! *db-last-access* start-t)
    (mutex-unlock! *db-multi-sync-mutex*)
    (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb)))
















;; close all opened run-id dbs
(define (db:close-all dbstruct)
  (if (dbr:dbstruct? dbstruct)
      (handle-exceptions
	  exn
	  (begin
	    (debug:print 0 *default-log-port* "WARNING: Finalizing failed, "  ((condition-property-accessor 'exn 'message) exn))
	    (print-call-chain *default-log-port*))
	;; (db:sync-touched dbstruct 0 force-sync: #t) ;; NO. Do not do this here. Instead we rely on a server to be started when there are writes, even if the server itself is not going to be used as a server.
        (let ((tdbs (map db:dbdat-get-db 
                         (stack->list (dbr:dbstruct-dbstack dbstruct))))
              (mdb (db:dbdat-get-db (dbr:dbstruct-mtdb   dbstruct)))
              (rdb (db:dbdat-get-db (dbr:dbstruct-refndb dbstruct))))
          (map (lambda (db)

		 (if (sqlite3:database? db)
		     (sqlite3:finalize! db)))
	       tdbs)
          (if (sqlite3:database? mdb) (sqlite3:finalize! mdb))
          (if (sqlite3:database? rdb) (sqlite3:finalize! rdb))))))

;;   (let ((locdbs (dbr:dbstruct-locdbs dbstruct)))
;;     (if (hash-table? locdbs)
;; 	(for-each (lambda (run-id)
;; 		    (db:close-run-db dbstruct run-id))
;; 		  (hash-table-keys locdbs)))))








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>















>
|
|

|
|







400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
      (mutex-unlock! *db-multi-sync-mutex*)
      (db:sync-tables (db:sync-all-tables-list dbstruct) update_info tmpdb refndb mtdb))
    (mutex-lock! *db-multi-sync-mutex*)
    (set! *db-last-sync* start-t)
    (set! *db-last-access* start-t)
    (mutex-unlock! *db-multi-sync-mutex*)
    (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb)))

(define (db:safely-close-sqlite3-db db #!key (try-num 3))
  (if (<= try-num 0)
      #f
      (handle-exceptions
	  exn
	  (begin
	    (thread-sleep! 3)
	    (sqlite3:interrupt! db)
	    (db:safely-close-sqlite3-db db try-num: (- try-num 1)))
	(if (sqlite3:database? db)
	    (begin
	      (sqlite3:finalize! db)
	      #t)
	    #f))))

;; close all opened run-id dbs
(define (db:close-all dbstruct)
  (if (dbr:dbstruct? dbstruct)
      (handle-exceptions
	  exn
	  (begin
	    (debug:print 0 *default-log-port* "WARNING: Finalizing failed, "  ((condition-property-accessor 'exn 'message) exn))
	    (print-call-chain *default-log-port*))
	;; (db:sync-touched dbstruct 0 force-sync: #t) ;; NO. Do not do this here. Instead we rely on a server to be started when there are writes, even if the server itself is not going to be used as a server.
        (let ((tdbs (map db:dbdat-get-db 
                         (stack->list (dbr:dbstruct-dbstack dbstruct))))
              (mdb (db:dbdat-get-db (dbr:dbstruct-mtdb   dbstruct)))
              (rdb (db:dbdat-get-db (dbr:dbstruct-refndb dbstruct))))
          (map (lambda (db)
		 (db:safely-close-sqlite3-db db))
;; 		 (if (sqlite3:database? db)
;; 		     (sqlite3:finalize! db)))
	       tdbs)
          (db:safely-close-sqlite3-db mdb)     ;; (if (sqlite3:database? mdb) (sqlite3:finalize! mdb))
          (db:safely-close-sqlite3-db rdb))))) ;; (if (sqlite3:database? rdb) (sqlite3:finalize! rdb))))))

;;   (let ((locdbs (dbr:dbstruct-locdbs dbstruct)))
;;     (if (hash-table? locdbs)
;; 	(for-each (lambda (run-id)
;; 		    (db:close-run-db dbstruct run-id))
;; 		  (hash-table-keys locdbs)))))

Modified http-transport.scm from [bb11c9d077] to [2c955425b7].

432
433
434
435
436
437
438

439
440
441
442
443
444
445
	     (adjusted-timeout (if (> hrs-since-start 1)
				   (- server-timeout (inexact->exact (round (* hrs-since-start 60))))  ;; subtract 60 seconds per hour
				   server-timeout)))
	(if (common:low-noise-print 120 "server timeout")
	    (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout))
	(cond
         ((and *server-run*

	       (> (+ last-access server-timeout)
		  (current-seconds))
	       (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour.
          (if (common:low-noise-print 120 "server continuing")
              (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	      (let ((curr-time (current-seconds)))
		(handle-exceptions







>







432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
	     (adjusted-timeout (if (> hrs-since-start 1)
				   (- server-timeout (inexact->exact (round (* hrs-since-start 60))))  ;; subtract 60 seconds per hour
				   server-timeout)))
	(if (common:low-noise-print 120 "server timeout")
	    (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout))
	(cond
         ((and *server-run*
	       (not *server-overloaded*)
	       (> (+ last-access server-timeout)
		  (current-seconds))
	       (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour.
          (if (common:low-noise-print 120 "server continuing")
              (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
	      (let ((curr-time (current-seconds)))
		(handle-exceptions

Modified launch.scm from [66b5d11122] to [98f81b97d0].

550
551
552
553
554
555
556

557



558
559
560
561
562
563
564
	    (set-signal-handler! signal/term sighand)
	    ) ;; (set-signal-handler! signal/stop sighand)
	  
	  ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART,
	  ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY*
	  ;;
	  (let* ((test-info (rmt:get-test-info-by-id run-id test-id))

		 (test-host (db:test-get-host        test-info))



		 (test-pid  (db:test-get-process_id  test-info)))
	    (cond
	     ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "KILLREQ" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
	      (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
	      (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
	      ) ;; prime it for running







>
|
>
>
>







550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
	    (set-signal-handler! signal/term sighand)
	    ) ;; (set-signal-handler! signal/stop sighand)
	  
	  ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART,
	  ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY*
	  ;;
	  (let* ((test-info (rmt:get-test-info-by-id run-id test-id))
		 (test-host (if test-info
				(db:test-get-host        test-info)
				(begin
				  (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
				  (exit))))
		 (test-pid  (db:test-get-process_id  test-info)))
	    (cond
	     ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "KILLREQ" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
	      (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
	      (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
	      ) ;; prime it for running
1224
1225
1226
1227
1228
1229
1230

1231
1232
1233
1234
1235
1236
1237
1238
  (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex
  (let* ((item-path       (item-list->path itemdat))
	 (contour         #f)) ;; NOT READY FOR THIS (args:get-arg "-contour")))
    (let loop ((delta        (- (current-seconds) *last-launch*))
	       (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1)))
      (if (> launch-delay delta)
	  (begin

	    (debug:print-info 0 *default-log-port* "Delaying launch of " test-name " for " (- launch-delay delta) " seconds")
	    (thread-sleep! (- launch-delay delta))
	    (loop (- (current-seconds) *last-launch*) launch-delay))))
    (change-directory *toppath*)
    (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars)
     (append
      (list
       (list "MT_RUN_AREA_HOME" *toppath*)







>
|







1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
  (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex
  (let* ((item-path       (item-list->path itemdat))
	 (contour         #f)) ;; NOT READY FOR THIS (args:get-arg "-contour")))
    (let loop ((delta        (- (current-seconds) *last-launch*))
	       (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1)))
      (if (> launch-delay delta)
	  (begin
	    (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay.
		(debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds"))
	    (thread-sleep! (- launch-delay delta))
	    (loop (- (current-seconds) *last-launch*) launch-delay))))
    (change-directory *toppath*)
    (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars)
     (append
      (list
       (list "MT_RUN_AREA_HOME" *toppath*)

Modified tests/fdktestqa/testqa/megatest.config from [d32541500d] to [a5c70b7363].

1
2
3
4
5
6
7


8
9
10
11
12
13
14
[setup]
testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log
# launchwait no
launch-delay 0

[server]
runtime 180



# All these are overridden in ../fdk.config
# [jobtools]
# launcher nbfake
# launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log 

[include ../fdk.config]



|


|
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
[setup]
testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log
# launchwait no
launch-delay 1

[server]
# runtime 180
# timeout is in hours
timeout 100

# All these are overridden in ../fdk.config
# [jobtools]
# launcher nbfake
# launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log 

[include ../fdk.config]

tests/fdktestqa/testqa/tests/bigrun/step1.sh became executable with contents [e700391a61].

tests/fdktestqa/testqa/tests/bigrun2/step1.sh became executable with contents [f90152c7af].

Modified tests/fdktestqa/testqa/tests/bigrun2/testconfig from [ccc63b9335] to [37225a43b3].

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Add additional steps here. Format is "stepname script"
[ezsteps]
step1 step1.sh

# Test requirements are specified here
[requirements]
waiton bigrun
priority 0
mode itemwait
itemmap .*/

# Iteration for your tests are controlled by the items section
[items]
NUMBER #{scheme (string-intersperse (map (lambda (x)(conc "blah/" x)) \
                                         (map number->string (sort (let loop ((a 0)(res '())) \
                                                                        (if (<= a (or (any->number (get-environment-variable "NUMTESTS")) 2500)) \
                                                                            (loop (+ a 1)(cons a res)) res)) <))) " ")}


# test_meta is a section for storing additional data on your test
[test_meta]













|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Add additional steps here. Format is "stepname script"
[ezsteps]
step1 step1.sh

# Test requirements are specified here
[requirements]
waiton bigrun
priority 0
mode itemwait
itemmap .*/

# Iteration for your tests are controlled by the items section
[items]
NUMBER #{scheme (string-intersperse (map (lambda (x)(conc (if (getenv "USEBLAH") "blah/" "") x)) \
                                         (map number->string (sort (let loop ((a 0)(res '())) \
                                                                        (if (<= a (or (any->number (get-environment-variable "NUMTESTS")) 2500)) \
                                                                            (loop (+ a 1)(cons a res)) res)) <))) " ")}


# test_meta is a section for storing additional data on your test
[test_meta]

tests/fdktestqa/testqa/tests/bigrun3/step1.sh became executable with contents [f90152c7af].