Index: commonmod.scm ================================================================== --- commonmod.scm +++ commonmod.scm @@ -383,11 +383,11 @@ *db-sync-in-progress* *db-multi-sync-mutex* *task-db* *db-access-allowed* *db-access-mutex* -*db-transaction-mutex* +;; *db-transaction-mutex* *db-cache-path* *db-with-db-mutex* *db-api-call-time* *didsomething* *no-sync-db* @@ -962,16 +962,17 @@ (define *db-last-access* (current-seconds)) ;; last db access, used in server (define *db-write-access* #t) ;; db sync (define *db-last-sync* 0) ;; last time the sync to megatest.db happened (define *db-sync-in-progress* #f) ;; if there is a sync in progress do not try to start another +;; multi-sync mutex used in both dbmod and launchmod (define *db-multi-sync-mutex* (make-mutex)) ;; protect access to *db-sync-in-progress*, *db-last-sync* ;; task db (define *task-db* #f) ;; (vector db path-to-db) (define *db-access-allowed* #t) ;; flag to allow access (define *db-access-mutex* (make-mutex)) -(define *db-transaction-mutex* (make-mutex)) +;; (define *db-transaction-mutex* (make-mutex)) (define *db-cache-path* #f) (define *db-with-db-mutex* (make-mutex)) (define *db-api-call-time* (make-hash-table)) ;; hash of command => (list of times) ;; no sync db (define *no-sync-db* #f) @@ -4438,11 +4439,13 @@ (define (server:expiration-timeout) (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below (* 3600 (string->number tmo)) - 60))) ;; default is one minute + ;; 60 ;; default is one minute + 5 + ))) (define (runs:get-mt-env-alist run-id runname target testname itempath) ;;(bb-check-path msg: "runs:set-megatest-env-vars entry") `(("MT_TEST_NAME" . ,testname) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -877,19 +877,16 @@ (hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct)) (if (or (null? tal) (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update (begin (when (> elapsed-time 2) - (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.") - (let* ((old-val (iup:attribute *tim* "TIME")) - (new-val (number->string (inexact->exact (floor (* 2 (string->number old-val))))))) - (if (< (string->number new-val) 5000) - ((debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val) - (iup:attribute-set! *tim* "TIME" new-val)))) - - - ) + (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.") + (let* ((old-val (iup:attribute *tim* "TIME")) + (new-val (number->string (inexact->exact (floor (* 2 (string->number old-val))))))) + (if (< (string->number new-val) 5000) + (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val) + (iup:attribute-set! *tim* "TIME" new-val)))) (dboard:tabdat-allruns-set! tabdat new-res) maxtests) (if (> (dboard:rundat-run-data-offset run-struct) 0) (loop run tal new-res newmaxtests) ;; not done getting data for this run (loop (car tal)(cdr tal) new-res newmaxtests))))))) Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -345,11 +345,13 @@ (inmem #f) (last-sync 0) (last-write (current-seconds)) (run-id #f) (fname #f)) - + +(define *db-transaction-mutex* (make-mutex)) + ;; Returns the dbdat for a particular dbfile inside the area ;; (define (dbr:dbstruct-get-dbdat dbstruct dbfile) (hash-table-ref/default (dbr:dbstruct-dbdats dbstruct) dbfile #f)) @@ -526,17 +528,20 @@ res)) ;; called before db is open? ;; (define (db:get-iam-server-lock dbh dbfname host port) - (sqlite3:with-transaction - dbh - (lambda () - (let* ((locker (db:get-locker dbh dbfname))) - (if locker - locker - (db:take-lock dbh dbfname port)))))) + (mutex-lock! *db-transaction-mutex*) + (let ((res (sqlite3:with-transaction + dbh + (lambda () + (let* ((locker (db:get-locker dbh dbfname))) + (if locker + locker + (db:take-lock dbh dbfname port))))))) + (mutex-unlock! *db-transaction-mutex*) + res)) ;; (exn sqlite3) (define (db:get-locker dbh dbfname) (condition-case (sqlite3:first-row dbh "SELECT owner_pid,owner_host,owner_port,event_time FROM locks WHERE lockname=?;" dbfname) @@ -1009,10 +1014,11 @@ ;; (db:delay-if-busy targdb) ;; NO WAITING (if (member "last_update" field-names) (debug:print-info 3 *default-log-port* "is-trigger-dropped: " is-trigger-dropped)) (for-each (lambda (fromdat-lst) + (mutex-lock! *db-transaction-mutex*) (sqlite3:with-transaction db (lambda () (for-each ;; (lambda (fromrow) @@ -1029,11 +1035,12 @@ (if (not same) (begin (debug:print 0 *default-log-port* "applying data "fromrow"to table "tablename", numrecs="numrecs) (apply sqlite3:execute stmth (vector->list fromrow)) (hash-table-set! numrecs tablename (+ 1 (hash-table-ref/default numrecs tablename 0))))))) - fromdat-lst)))) + fromdat-lst))) + (mutex-unlock! *db-transaction-mutex*)) fromdats) (sqlite3:finalize! stmth) (if (member "last_update" field-names) (db:create-trigger db tablename))))) @@ -1515,10 +1522,11 @@ (keys (keys:config-get-fields configdat)) (havekeys (> (length keys) 0)) (keystr (keys->keystr keys)) (fieldstr (keys:make-key/field-string configdat)) #;(db (dbr:dbdat-db dbdat))) + (mutex-lock! *db-transaction-mutex*) (for-each (lambda (key) (let ((keyn key)) (if (member (string-downcase keyn) (list "runname" "state" "status" "owner" "event_time" "comment" "fail_count" "pass_count" "contour")) @@ -1641,10 +1649,11 @@ CONSTRAINT metadat_constraint UNIQUE (var));") (sqlite3:execute db "CREATE TABLE IF NOT EXISTS access_log (id INTEGER PRIMARY KEY, user TEXT, accessed TIMESTAMP, args TEXT);") ;; Must do this *after* running patch db !! No more. ;; cannot use db:set-var since it will deadlock, hardwire the code here (sqlite3:execute db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);" "MEGATEST_VERSION" (common:version-signature)) + (mutex-unlock! *db-transaction-mutex*) (debug:print-info 11 *default-log-port* "db:initialize END") ;; )))) ;;====================================================================== ;; R U N S P E C I F I C D B ;;====================================================================== @@ -2771,11 +2780,11 @@ ;; Update run_stats for given run_id ;; input data is a list (state status count) ;; (define (db:update-run-stats dbstruct run-id stats) - ;; (mutex-lock! *db-transaction-mutex*) + (mutex-lock! *db-transaction-mutex*) (db:with-db dbstruct run-id #f @@ -2793,11 +2802,11 @@ (sqlite3:execute stmt1 run-id (car dat)(cadr dat)) (apply sqlite3:execute stmt2 run-id dat)) stats))))) (sqlite3:finalize! stmt1) (sqlite3:finalize! stmt2) - ;; (mutex-unlock! *db-transaction-mutex*) + (mutex-unlock! *db-transaction-mutex*) res)))) (define (db:get-main-run-stats dbstruct run-id) (db:with-db dbstruct @@ -2955,11 +2964,11 @@ ;; this is inconsistent with get-runs but it makes some sense. ;; (define (db:get-run-info dbstruct run-id) ;;(if (hash-table-ref/default *run-info-cache* run-id #f) ;; (hash-table-ref *run-info-cache* run-id) - (let* ((res (vector #f #f #f #f)) + (let* ((res (make-vector 11 #f)) (keys (db:get-keys dbstruct)) (remfields (list "id" "runname" "state" "status" "owner" "event_time" "comment" "fail_count" "pass_count" "contour" "last_update")) ;; "area_id")) (header (append keys remfields)) (keystr (conc (keys->keystr keys) "," (string-intersperse remfields ",")))) Index: runsmod.scm ================================================================== --- runsmod.scm +++ runsmod.scm @@ -1484,11 +1484,11 @@ ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) (if (> (- (current-seconds) *last-test-launch*) 5) ;; be pretty aggressive for five seconds after - (runs:too-soon-delay (conc "loop delay " hed) 1 1) ;; starting a test then apply more delay + (runs:too-soon-delay (conc "loop delay " hed) 1 0.6) ;; starting a test then apply more delay (runs:too-soon-delay (conc "loop delay " hed) 1 0.1)) (if (> num-running 0) (set! last-time-some-running (current-seconds))) @@ -1734,11 +1734,11 @@ (rmt:set-var run-id (conc "launch-complete-" run-id) "yes") ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) - (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle + (thread-sleep! 0.1) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) (prev-num-running 0)) ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") Index: ulex-dual/dbmgr.scm ================================================================== --- ulex-dual/dbmgr.scm +++ ulex-dual/dbmgr.scm @@ -333,18 +333,24 @@ ;; sometime in the future ;; (define (rmt:send-receive-real sinfo apath dbname cmd params) (let* ((cdat (rmt:get-conn sinfo apath dbname))) (assert cdat "FATAL: rmt:send-receive-real called without the needed channels opened") - (let* ((uconn (servdat-uconn sinfo)) ;; get the interface to ulex - ;; then send-receive using the ulex layer to host-port stored in cdat - (res (send-receive uconn (conndat-hostport cdat) cmd params))) - ;; since we accessed the server we can bump the expires time up - (conndat-expires-set! cdat (+ (current-seconds) - (server:expiration-timeout) - -2)) ;; two second margin for network time misalignments etc. - res))) + (condition-case + (let* ((uconn (servdat-uconn sinfo)) ;; get the interface to ulex + (hostport (conndat-hostport cdat)) + ;; then send-receive using the ulex layer to host-port stored in cdat + (res (send-receive uconn hostport cmd params))) + ;; since we accessed the server we can bump the expires time up + (conndat-expires-set! cdat (+ (current-seconds) + (server:expiration-timeout) + -2)) ;; two second margin for network time misalignments etc. + res) + ((exn i/o net) + (debug:print-info 0 *default-log-port* "IO failure in connection to "hostport + ", resetting connection.") + ; ;; db is at apath/.db/dbname, rid is an intermediary solution and will be removed ;; sometime in the future. ;; Index: ulex-dual/ulex.scm ================================================================== --- ulex-dual/ulex.scm +++ ulex-dual/ulex.scm @@ -260,11 +260,24 @@ `(cmd . ,cmd) `(params . ,params)))) (cond (isme (do-work udata dat)) ;; no transmission needed (else - (handle-exceptions ;; TODO - MAKE THIS EXCEPTION CMD SPECIFIC? + (let-values (((inp oup)(tcp-connect host port))) + (let ((res (if (and inp oup) + (begin + (write (obj->string dat) oup) + (close-output-port oup) + (string->obj (read inp))) + (begin + (print "ERROR: send called but no receiver has been setup. Please call setup first!") + #f)))) + (close-input-port inp))) + + + + #;(handle-exceptions ;; TODO - MAKE THIS EXCEPTION CMD SPECIFIC? exn (begin (print "ULEX send-receive: "cmd", "params", exn="exn) (message exn)) (begin Index: ulex-simple/dbmgr.scm ================================================================== --- ulex-simple/dbmgr.scm +++ ulex-simple/dbmgr.scm @@ -332,19 +332,32 @@ ;; db is at apath/.db/dbname, rid is an intermediary solution and will be removed ;; sometime in the future ;; (define (rmt:send-receive-real sinfo apath dbname cmd params) (let* ((cdat (rmt:get-conn sinfo apath dbname))) - (assert cdat "FATAL: rmt:send-receive-real called without the needed channels opened") - (let* ((uconn (servdat-uconn sinfo)) ;; get the interface to ulex - ;; then send-receive using the ulex layer to host-port stored in cdat - (res (send-receive uconn (conndat-hostport cdat) cmd params))) - ;; since we accessed the server we can bump the expires time up - (conndat-expires-set! cdat (+ (current-seconds) - (server:expiration-timeout) - -2)) ;; two second margin for network time misalignments etc. - res))) + (if (> (current-seconds)(conndat-expires cdat)) + (begin + (debug:print-info 0 *default-log-port* "Connection to "apath"/"dbname" expired, reconnecting.") + (rmt:drop-conn sinfo apath dbname) + (rmt:send-receive-real sinfo apath dbname cmd params)) + (begin + (assert cdat "FATAL: rmt:send-receive-real called without the needed channels opened") + (let* ((uconn (servdat-uconn sinfo)) ;; get the interface to ulex + ;; then send-receive using the ulex layer to host-port stored in cdat + (res (condition-case + (send-receive uconn (conndat-hostport cdat) cmd params) + ((exn i/o net timeout) + ;; here we need to close and reconnect + (rmt:drop-conn sinfo apath dbname) + (rmt:general-open-connection sinfo apath dbname) + (rmt:send-receive-real sinfo apath dbname cmd params) + )))) + ;; since we accessed the server we can bump the expires time up + (conndat-expires-set! cdat (+ (current-seconds) + (server:expiration-timeout) + -2)) ;; two second margin for network time misalignments etc. + res))))) ; ;; db is at apath/.db/dbname, rid is an intermediary solution and will be removed ;; sometime in the future. ;;