Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -127,10 +127,11 @@ (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) ((> *api-process-request-count* 20) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") + (set! *server-overloaded* #t) (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else (let* ((cmd-in (vector-ref dat 0)) (cmd (if (symbol? cmd-in) cmd-in Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -45,11 +45,11 @@ (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)))) (define home (getenv "HOME")) (define user (getenv "USER")) -;; GLOBAL GLETCHES +;; GLOBALS ;; CONTEXTS (defstruct cxt (taskdb #f) (cmutex (make-mutex))) @@ -128,10 +128,11 @@ (define *home-host* #f) ;; (define *total-non-write-delay* 0) (define *heartbeat-mutex* (make-mutex)) (define *api-process-request-count* 0) (define *max-api-process-requests* 0) +(define *server-overloaded* #f) ;; client (define *rmt-mutex* (make-mutex)) ;; remote access calls mutex ;; RPC transport Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -402,10 +402,25 @@ (mutex-lock! *db-multi-sync-mutex*) (set! *db-last-sync* start-t) (set! *db-last-access* start-t) (mutex-unlock! *db-multi-sync-mutex*) (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb))) + +(define (db:safely-close-sqlite3-db db #!key (try-num 3)) + (if (<= try-num 0) + #f + (handle-exceptions + exn + (begin + (thread-sleep! 3) + (sqlite3:interrupt! db) + (db:safely-close-sqlite3-db db try-num: (- try-num 1))) + (if (sqlite3:database? db) + (begin + (sqlite3:finalize! db) + #t) + #f)))) ;; close all opened run-id dbs (define (db:close-all dbstruct) (if (dbr:dbstruct? dbstruct) (handle-exceptions @@ -417,15 +432,16 @@ (let ((tdbs (map db:dbdat-get-db (stack->list (dbr:dbstruct-dbstack dbstruct)))) (mdb (db:dbdat-get-db (dbr:dbstruct-mtdb dbstruct))) (rdb (db:dbdat-get-db (dbr:dbstruct-refndb dbstruct)))) (map (lambda (db) - (if (sqlite3:database? db) - (sqlite3:finalize! db))) + (db:safely-close-sqlite3-db db)) +;; (if (sqlite3:database? db) +;; (sqlite3:finalize! db))) tdbs) - (if (sqlite3:database? mdb) (sqlite3:finalize! mdb)) - (if (sqlite3:database? rdb) (sqlite3:finalize! rdb)))))) + (db:safely-close-sqlite3-db mdb) ;; (if (sqlite3:database? mdb) (sqlite3:finalize! mdb)) + (db:safely-close-sqlite3-db rdb))))) ;; (if (sqlite3:database? rdb) (sqlite3:finalize! rdb)))))) ;; (let ((locdbs (dbr:dbstruct-locdbs dbstruct))) ;; (if (hash-table? locdbs) ;; (for-each (lambda (run-id) ;; (db:close-run-db dbstruct run-id)) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -434,10 +434,11 @@ server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond ((and *server-run* + (not *server-overloaded*) (> (+ last-access server-timeout) (current-seconds)) (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour. (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -552,11 +552,15 @@ ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* ;; (let* ((test-info (rmt:get-test-info-by-id run-id test-id)) - (test-host (db:test-get-host test-info)) + (test-host (if test-info + (db:test-get-host test-info) + (begin + (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") + (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "KILLREQ" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") @@ -1226,11 +1230,12 @@ (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1))) (if (> launch-delay delta) (begin - (debug:print-info 0 *default-log-port* "Delaying launch of " test-name " for " (- launch-delay delta) " seconds") + (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. + (debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds")) (thread-sleep! (- launch-delay delta)) (loop (- (current-seconds) *last-launch*) launch-delay)))) (change-directory *toppath*) (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars) (append Index: tests/fdktestqa/testqa/megatest.config ================================================================== --- tests/fdktestqa/testqa/megatest.config +++ tests/fdktestqa/testqa/megatest.config @@ -1,12 +1,14 @@ [setup] testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log # launchwait no -launch-delay 0 +launch-delay 1 [server] -runtime 180 +# runtime 180 +# timeout is in hours +timeout 100 # All these are overridden in ../fdk.config # [jobtools] # launcher nbfake # launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log Index: tests/fdktestqa/testqa/tests/bigrun2/testconfig ================================================================== --- tests/fdktestqa/testqa/tests/bigrun2/testconfig +++ tests/fdktestqa/testqa/tests/bigrun2/testconfig @@ -9,11 +9,11 @@ mode itemwait itemmap .*/ # Iteration for your tests are controlled by the items section [items] -NUMBER #{scheme (string-intersperse (map (lambda (x)(conc "blah/" x)) \ +NUMBER #{scheme (string-intersperse (map (lambda (x)(conc (if (getenv "USEBLAH") "blah/" "") x)) \ (map number->string (sort (let loop ((a 0)(res '())) \ (if (<= a (or (any->number (get-environment-variable "NUMTESTS")) 2500)) \ (loop (+ a 1)(cons a res)) res)) <))) " ")}