Index: TODO ================================================================== --- TODO +++ TODO @@ -16,10 +16,21 @@ # along with Megatest. If not, see . TODO ==== +23WW46 - v1.80 branch +. Use file semaphore to kill tests, eliminate db load of the KILLREQ query +. Merge this change to revolution branch +23WW45 - the revolution branch +. Add "fast" db start option (no handshaking over NFS) +. Add server-ro to server types (just "server" is fine for read/write). +. Create pause-server and resume-server calls +. Create rsync or cp sync to MTRAH function +. Change rmt:send-receive to divert calls to read-only server when possible +. Change start server to call main.db server for 1..N.db servers, block until server is read for use. + 23WW21 . Dashboard needs its own cache db in /tmp 23WW07 . Remove use of *dbstruct-dbs* Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -35,11 +35,12 @@ (use srfi-69 srfi-18 posix matchable - s11n) + s11n + typed-records) ;; allow these queries through without starting a server ;; (define api:read-only-queries '(get-key-val-pairs @@ -152,78 +153,10 @@ tasks-set-state-given-param-key )) (define *db-write-mutexes* (make-hash-table)) (define *server-signature* #f) -;; ;; These are called by the server on recipt of /api calls -;; ;; - keep it simple, only return the actual result of the call, i.e. no meta info here -;; ;; -;; ;; - returns #( flag result ) -;; ;; -;; (define (api:execute-requests dbstruct dat) -;; (if (> *api-process-request-count* 50) -;; (begin -;; (if (common:low-noise-print 30 "too many threads") -;; (debug:print 0 *default-log-port* "WARNING: "*api-process-request-count*" threads, potential overload, adding 0.5 sec delay.")) -;; ;; (thread-sleep! 0.5) ;; take a nap - no, the napping is moved to the clients via tt:backoff-incr -;; )) -;; (cond -;; ((not (vector? dat)) ;; it is an error to not receive a vector -;; (vector #f (vector #f "remote must be called with a vector"))) -;; (else -;; (let* ((cmd-in (vector-ref dat 0)) -;; (cmd (if (symbol? cmd-in) -;; cmd-in -;; (string->symbol cmd-in))) -;; (params (vector-ref dat 1)) -;; (run-id (if (null? params) -;; 0 -;; (car params))) -;; (write-mutex (if (hash-table-exists? *db-write-mutexes* run-id) -;; (hash-table-ref *db-write-mutexes* run-id) -;; (let* ((newmutex (make-mutex))) -;; (hash-table-set! *db-write-mutexes* run-id newmutex) -;; newmutex))) -;; (start-t (current-milliseconds)) -;; (readonly-mode (dbr:dbstruct-read-only dbstruct)) -;; (readonly-command (member cmd api:read-only-queries)) -;; (writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))) -;; (if (not readonly-command) -;; (mutex-lock! write-mutex)) -;; (let* ((tmppath (dbr:dbstruct-tmppath dbstruct)) -;; (clean-run-id (cond -;; ((number? run-id) run-id) -;; ((equal? run-id #f) "main") -;; (else "other"))) -;; (crumbfile (dbfile:wait-for-qif tmppath clean-run-id (cons cmd params))) -;; (res -;; (if writecmd-in-readonly-mode -;; (conc "attempt to run write command "cmd" on a read-only database") -;; (api:dispatch-request dbstruct cmd run-id params)))) -;; (delete-file* crumbfile) -;; (if (not readonly-command) -;; (mutex-unlock! write-mutex)) -;; -;; ;; save all stats -;; (let ((delta-t (- (current-milliseconds) -;; start-t)) -;; (modified-cmd (if (eq? cmd 'general-call) -;; (string->symbol (conc "general-call-" (car params))) -;; cmd))) -;; (hash-table-set! *db-api-call-time* modified-cmd -;; (cons delta-t (hash-table-ref/default *db-api-call-time* modified-cmd '())))) -;; (if writecmd-in-readonly-mode -;; (begin -;; #;(common:telemetry-log (conc "api-out:"(->string cmd)) -;; payload: `((params . ,params) -;; (ok-res . #t))) -;; (vector #f res)) -;; (begin -;; #;(common:telemetry-log (conc "api-out:"(->string cmd)) -;; payload: `((params . ,params) -;; (ok-res . #f))) -;; (vector #t res)))))))) (define *api-threads* '()) (define (api:register-thread th-in) (set! *api-threads* (cons (cons th-in (current-seconds)) *api-threads*))) @@ -237,11 +170,25 @@ (not (member (thread-state (car thdat)) '(terminated dead)))) *api-threads*))) (define (api:get-count-threads-alive) (length *api-threads*)) - + +(define *api:last-stats-print* 0) +(define *api-print-db-stats-mutex* (make-mutex)) +(define (api:print-db-stats) + (debug:print-info 0 *default-log-port* "Started periodic db stats printer") + (let loop () + (mutex-lock! *api-print-db-stats-mutex*) + (if (> (- (current-seconds) *api:last-stats-print*) 15) + (begin + (rmt:print-db-stats) + (set! *api:last-stats-print* (current-seconds)))) + (mutex-unlock! *api-print-db-stats-mutex*) + (thread-sleep! 5) + (loop))) + ;; indat is (cmd run-id params meta) ;; ;; WARNING: Do not print anything in the lambda of this function as it ;; reads/writes to current in/out port @@ -250,85 +197,105 @@ (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.") (if (not *server-signature*) (set! *server-signature* (tt:mk-signature *toppath*))) (lambda (indat) (api:register-thread (current-thread)) - (let* (;; (indat (deserialize)) - (newcount (+ *api-process-request-count* 1)) - (numthreads (api:get-count-threads-alive)) - (delay-wait (if (> newcount 10) - (- newcount 10) - 0)) - (normal-proc (lambda (cmd run-id params) - (case cmd - ((ping) *server-signature*) - (else - (api:dispatch-request dbstruct cmd run-id params)))))) - (set! *api-process-request-count* newcount) - (set! *db-last-access* (current-seconds)) - (if (not (eq? newcount numthreads)) - (begin - (api:remove-dead-or-terminated) - (let ((threads-now (api:get-count-threads-alive))) - (debug:print 0 *default-log-port* "WARNING: newcount="newcount", numthreads="numthreads", remaining="threads-now) - (set! newcount threads-now)))) - (match indat - ((cmd run-id params meta) - (let* ((db-ok (let* ((dbfname (dbmod:run-id->dbfname run-id)) - (ok (equal? dbfname (dbr:dbstruct-dbfname dbstruct)))) - (case cmd - ((ping) #t) ;; we are fine - (else - (if (not ok)(debug:print 0 *default-log-port* "ERROR: "cmd", run-id "run-id", not correct for dbfname "(dbr:dbstruct-dbfname dbstruct))) - (assert ok "FATAL: database file and run-id not aligned."))))) - (ttdat *server-info*) - (server-state (tt-state ttdat)) - (status (cond - ((> newcount 3) 'busy) - ;; ((> newcount 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down. - (else 'ok))) - (errmsg (case status - ((busy) (conc "Server overloaded, "newcount" threads in flight")) - ((loaded) (conc "Server loaded, "newcount" threads in flight")) - (else #f))) - (result (case status - ((busy) - (if (eq? cmd 'ping) - (normal-proc cmd run-id params) - ;; newcount must be greater than 5 for busy - (* 1 (- newcount 3)) ;; was 15 - )) ;; (- newcount 29)) ;; call back in as many seconds - ((loaded) -;; (if (eq? (rmt:transport-mode) 'tcp) -;; (thread-sleep! 0.5)) - (normal-proc cmd run-id params)) - (else - (normal-proc cmd run-id params)))) - (meta (case cmd - ((ping) `((sstate . ,server-state))) - (else `((wait . ,delay-wait))))) - (payload (list status errmsg result meta))) - (set! *api-process-request-count* (- *api-process-request-count* 1)) - ;; (serialize payload) - (api:unregister-thread (current-thread)) - payload)) - (else - (assert #f "FATAL: failed to deserialize indat "indat)))))) - + (let* ((result + (let* ((numthreads (api:get-count-threads-alive)) + (delay-wait (if (> numthreads 10) + (- numthreads 10) + 0)) + (normal-proc (lambda (cmd run-id params) + (case cmd + ((ping) *server-signature*) + (else + (api:dispatch-request dbstruct cmd run-id params)))))) + (set! *api-process-request-count* numthreads) + (set! *db-last-access* (current-seconds)) +;; (if (not (eq? numthreads numthreads)) +;; (begin +;; (api:remove-dead-or-terminated) +;; (let ((threads-now (api:get-count-threads-alive))) +;; (debug:print 0 *default-log-port* "WARNING: numthreads="numthreads", numthreads="numthreads", remaining="threads-now) +;; (set! numthreads threads-now)))) + (match indat + ((cmd run-id params meta) + (let* ((start-t (current-milliseconds)) + (db-ok (let* ((dbfname (dbmod:run-id->dbfname run-id)) + (ok (equal? dbfname (dbr:dbstruct-dbfname dbstruct)))) + (case cmd + ((ping) #t) ;; we are fine + (else + (assert ok "FATAL: database file and run-id not aligned."))))) + (ttdat *server-info*) + (server-state (tt-state ttdat)) + (maxthreads 20) ;; make this a parameter? + (status (cond + ((and (> numthreads maxthreads) + (> (random 100) 70)) ;; allow a 30% probability to go through so we can figure out what is going wrong in main.db server. + 'busy) + ;; ((> numthreads 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down. + (else 'ok))) + (errmsg (case status + ((busy) (conc "Server overloaded, "numthreads" threads in flight")) + ((loaded) (conc "Server loaded, "numthreads" threads in flight")) + (else #f))) + (result (case status + ((busy) + (if (eq? cmd 'ping) + (normal-proc cmd run-id params) + ;; numthreads must be greater than 5 for busy + (* 0.1 (- numthreads maxthreads)) ;; was 15 + )) ;; (- numthreads 29)) ;; call back in as many seconds + ((loaded) + ;; (if (eq? (rmt:transport-mode) 'tcp) + ;; (thread-sleep! 0.5)) + (normal-proc cmd run-id params)) + (else + (normal-proc cmd run-id params)))) + (meta (case cmd + ((ping) `((sstate . ,server-state))) + (else `((wait . ,delay-wait))))) + (payload (list status errmsg result meta))) + ;; (cmd run-id params meta) + (db:add-stats cmd run-id params (- (current-milliseconds) start-t)) + payload)) + (else + (assert #f "FATAL: failed to deserialize indat "indat)))))) + ;; (set! *api-process-request-count* (- *api-process-request-count* 1)) + ;; (serialize payload) + + (api:unregister-thread (current-thread)) + result))) + + + +(define *api-halt-writes* #f) (define (api:dispatch-request dbstruct cmd run-id params) (if (not *no-sync-db*) (db:open-no-sync-db)) + (let* ((start-time (current-milliseconds))) + (if (member cmd api:write-queries) + (let loop () + (if *api-halt-writes* + (begin + (thread-sleep! 0.2) + (if (< (- (current-milliseconds) start-time) + 5000) ;; hope it don't take more than five seconds to sync + (loop-time) + #;(debug:print 0 *default-log-port* "ERROR: writes halted for more than 5 seconds, sync might be taking too long")))))) + (db:add-stats 'api-write-blocking-for-sync run-id params (- (current-milliseconds) start-time))) (case cmd ;;=============================================== ;; READ/WRITE QUERIES ;;=============================================== ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl ;; SERVERS - ((start-server) (apply server:kind-run params)) + ((start-server) (apply tt:server-process-run params)) ((kill-server) (set! *server-run* #f)) ;; TESTS ;;((test-set-state-status-by-id) (apply mt:test-set-state-status-by-id dbstruct params)) @@ -513,43 +480,5 @@ ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) (else (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) (conc "ERROR: BAD api call " cmd)))) -;; http-server send-response -;; api:process-request -;; db:* -;; -;; NB// Runs on the server as part of the server loop -;; -(define (api:process-request dbstruct $) ;; the $ is the request vars proc - (debug:print 4 *default-log-port* "server-id:" *server-id*) - (let* ((cmd ($ 'cmd)) - (paramsj ($ 'params)) - (key ($ 'key)) - (params (db:string->obj paramsj transport: 'http))) ;; incoming data from the POST (or is it a GET?) - (debug:print 4 *default-log-port* "cmd:" cmd " with params " params "key " key) - (if (equal? key *server-id*) - (begin - (set! *api-process-request-count* (+ *api-process-request-count* 1)) - (let* ((resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) - (success (vector-ref resdat 0)) - (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) - (debug:print 4 *default-log-port* "res:" res) - (if (not success) - (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) - (if (> *api-process-request-count* *max-api-process-requests*) - (set! *max-api-process-requests* *api-process-request-count*)) - (set! *api-process-request-count* (- *api-process-request-count* 1)) - ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds - ;; (rmt:dat->json-str - ;; (if (or (string? res) - ;; (list? res) - ;; (number? res) - ;; (boolean? res)) - ;; res - ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) - (db:obj->string res transport: 'http))) - (begin - (debug:print 0 *default-log-port* "Server refused to process request. Server id mismatch. recived " key " expected: " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params) - (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http))))) - Index: archive.scm ================================================================== --- archive.scm +++ archive.scm @@ -359,11 +359,11 @@ (archive-dir (if archive-info (cdr archive-info) #f)) (archive-id (if archive-info (car archive-info) -1)) (home-host (server:choose-server *toppath* 'homehost)) (archive-time (seconds->std-time-str (current-seconds))) (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time)) - (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db")) + (tmp-db-path (conc (dbfile:make-tmpdir-name *toppath* "") "/megatest.db")) (dbfile (conc archive-staging-db "/megatest.db"))) (create-directory archive-staging-db #t) (let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix))) (if (eq? exit-code 0) (case archiver Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -21,10 +21,11 @@ (declare (unit common)) (declare (uses commonmod)) (declare (uses rmtmod)) (declare (uses debugprint)) (declare (uses mtargs)) + (use srfi-1 data-structures posix regex-case (prefix base64 base64:) format dot-locking csv-xml z3 udp ;; sql-de-lite hostinfo md5 message-digest typed-records directory-utils stack matchable regex posix (srfi 18) extras ;; tcp @@ -153,14 +154,10 @@ (define *time-zero* (current-seconds)) ;; for the watchdog (define *on-exit-procs* '()) ;; add procs to this list to be executed on exit (define *default-area-tag* "local") ;; DATABASE -;; (define *dbstruct-dbs* #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this. -;; db stats -(define *db-stats* (make-hash-table)) ;; hash of vectors < count duration-total > -(define *db-stats-mutex* (make-mutex)) ;; db access (define *db-last-access* (current-seconds)) ;; last db access, used in server ;; (define *db-write-access* #t) ;; db sync ;; (define *db-last-sync* 0) ;; last time the sync to megatest.db happened @@ -181,11 +178,10 @@ ;; (define *max-cache-size* 0) (define *logged-in-clients* (make-hash-table)) (define *server-id* #f) (define *server-info* #f) ;; good candidate for easily convert to non-global (define *time-to-exit* #f) -(define *server-run* #t) (define *run-id* #f) (define *server-kind-run* (make-hash-table)) (define *home-host* #f) ;; (define *total-non-write-delay* 0) (define *heartbeat-mutex* (make-mutex)) @@ -247,11 +243,11 @@ (define *common:this-exe-fullpath* (common:get-this-exe-fullpath)) (define *common:this-exe-dir* (pathname-directory *common:this-exe-fullpath*)) (define *common:this-exe-name* (pathname-strip-directory *common:this-exe-fullpath*)) (define (common:get-sync-lock-filepath) - (let* ((tmp-area (common:get-db-tmp-area)) + (let* ((tmp-area (common:make-tmpdir-name *toppath* "")) (lockfile (conc tmp-area "/megatest.db.lock"))) lockfile)) (define *common:logpro-exit-code->status-sym-alist* '( ( 0 . pass ) @@ -1533,11 +1529,11 @@ ;; (define (common:lazy-modification-time fpath) (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "Failed to get modifcation time for " fpath ", treating it as zero. exn=" exn) + (debug:print 2 *default-log-port* "Failed to get modification time for " fpath ", treating it as zero. exn=" exn) 0) (if (file-exists? fpath) (file-modification-time fpath) 0))) @@ -2280,11 +2276,11 @@ (define (common:check-db-dir-space) (let* ((required (string->number ;; default is 1GB (or actually a billion bytes) This is the number of 1 kB blocks. (or (configf:lookup *configdat* "setup" "dbdir-space-required") "1000000"))) - (dbdir (common:get-db-tmp-area)) ;; (db:get-dbdir)) + (dbdir (common:make-tmpdir-name *toppath* "")) ;; (db:get-dbdir)) (tdbspace (common:check-space-in-dir dbdir required)) (mdbspace (common:check-space-in-dir *toppath* required))) (sort (list tdbspace mdbspace) (lambda (a b) (< (cadr a)(cadr b)))))) Index: commonmod.scm ================================================================== --- commonmod.scm +++ commonmod.scm @@ -160,10 +160,17 @@ '()))) ;; should it return empty list or #f to indicate not set? (define (get-section cfgdat section) (hash-table-ref/default cfgdat section '())) + +(define (common:make-tmpdir-name areapath tmpadj) + (let* ((area (pathname-file areapath)) + (dname (conc "/tmp/"(current-user-name)"/megatest_localdb/" area "/" (string-translate areapath "/" ".") tmpadj "/.mtdb"))) + (unless (directory-exists? dname) + (create-directory dname #t)) + dname)) ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -463,11 +463,11 @@ ;;====================================================================== ;; ;;====================================================================== (define (dashboard-tests:examine-test run-id test-id) ;; run-id run-key origtest) - (let* ((db-path (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db")) + (let* ((db-path (common:make-tmpdir-name *toppath* "")) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db")) (dbstruct #f) ;; NOT USED (testdat (rmt:get-test-info-by-id run-id test-id)) ;; (db:get-test-info-by-id dbstruct run-id test-id)) (db-mod-time 0) ;; (file-modification-time db-path)) (last-update 0) ;; (current-seconds)) (request-update #t)) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -402,12 +402,12 @@ (dboard:setup-tabdat dat) (dboard:setup-num-rows dat) dat)) (define (dboard:setup-tabdat tabdat) - (dboard:tabdat-dbdir-set! tabdat (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db")) - (dboard:tabdat-dbfpath-set! tabdat (db:dbfile-path)) + (dboard:tabdat-dbdir-set! tabdat (common:make-tmpdir-name *toppath* "")) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db")) + (dboard:tabdat-dbfpath-set! tabdat (common:make-tmpdir-name *toppath* "")) (dboard:tabdat-monitor-db-path-set! tabdat (conc (dboard:tabdat-dbdir tabdat) "/monitor.db")) ;; HACK ALERT: this is a hack, please fix. (dboard:tabdat-ro-set! tabdat (not (file-read-access? (dboard:tabdat-dbfpath tabdat)))) @@ -928,16 +928,16 @@ (hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct)) (if (or (null? tal) (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update (begin (when (> elapsed-time 2) - (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.") + (debug:print 2 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.") (let* ((old-val (iup:attribute *tim* "TIME")) (new-val (number->string (inexact->exact (floor (* 2 (string->number old-val))))))) (if (< (string->number new-val) 5000) (begin - (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val) + (debug:print 2 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val) (iup:attribute-set! *tim* "TIME" new-val))))) (dboard:tabdat-allruns-set! tabdat new-res) maxtests) (if (> (dboard:rundat-run-data-offset run-struct) 0) (loop run tal new-res newmaxtests) ;; not done getting data for this run @@ -3112,11 +3112,11 @@ (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir ", exn=" exn) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (common:max (map (lambda (filen) (file-modification-time filen)) - (glob (conc dbdir "/*.db*")))))) + (cons (conc dbdir "/main.db") (glob (conc dbdir "/?.db"))))))) (define (dashboard:monitor-changed? commondat tabdat) (let* ((run-update-time (current-seconds)) (monitor-db-path (dboard:tabdat-monitor-db-path tabdat)) (monitor-modtime (if (and monitor-db-path (common:file-exists? monitor-db-path)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -133,11 +133,11 @@ default))) (apply sqlite3:first-result db stmt params))) (define (db:setup do-sync) (assert *toppath* "FATAL: db:setup called before launch:setup has been run.") - (let* ((tmpdir (common:get-db-tmp-area))) + (let* ((tmpdir (common:make-tmpdir-name *toppath* ""))) (if (not *dbstruct-dbs*) (dbfile:setup do-sync *toppath* tmpdir) *dbstruct-dbs*))) ;; moved from dbfile @@ -267,17 +267,10 @@ ", arguments: " ((condition-property-accessor 'exn 'arguments) exn) ", location: " ((condition-property-accessor 'exn 'location) exn) )) -;; NB// #f => return dbdir only -;; (was planned to be; zeroth db with name=main.db) -;; -;; If run-id is #f return to create and retrieve the path where the db will live. -;; -(define db:dbfile-path common:get-db-tmp-area) - (define (db:set-sync db) (let ((syncprag (configf:lookup *configdat* "setup" "sychronous"))) (sqlite3:execute db (conc "PRAGMA synchronous = " (or syncprag 0) ";")))) @@ -467,11 +460,11 @@ (get-mtime shm-file)))) ;; (define (db:all-db-sync dbstruct) ;; (let* ((dbdat (db:open-db dbstruct #f db:initialize-main-db)) ;; (data-synced 0) ;; count of changed records -;; (tmp-area (common:get-db-tmp-area)) +;; (tmp-area (common:make-tmpdir-name *toppath*)) ;; (dbfiles (glob (conc tmp-area"/.mtdb/*.db"))) ;; (sync-durations (make-hash-table)) ;; (no-sync-db (db:open-no-sync-db))) ;; (for-each ;; (lambda (file) ;; tmp db file @@ -556,11 +549,11 @@ ;; run-ids: '(1 2 3 ...) or #f (for all) ;; (define (db:multi-db-sync dbstruct . options) (let* (;; (dbdat (db:open-db dbstruct #f dbfile:db-init-proc)) (data-synced 0) ;; count of changed records - (tmp-area (common:get-db-tmp-area)) + (tmp-area (common:make-tmpdir-name *toppath* "")) (old2new (member 'old2new options)) (dejunk (member 'dejunk options)) (killservers (member 'killservers options)) (src-area (if old2new *toppath* tmp-area)) (dest-area (if old2new tmp-area *toppath*)) @@ -1254,11 +1247,11 @@ ;; no-sync.db - small bits of data to be shared between servers ;;====================================================================== (define (db:get-dbsync-path) (case (rmt:transport-mode) - ((http)(common:get-db-tmp-area)) + ((http)(common:make-tmpdir-name *toppath* "")) ((tcp) (conc *toppath*"/.mtdb")) ((nfs) (conc *toppath*"/.mtdb")) (else "/tmp/dunno-this-gonna-exist"))) ;; This is needed for api.scm @@ -1580,11 +1573,11 @@ ;; TODO: Switch this to use max(update_time) from each run db? Then if using a server there is no disk traffic (using cachedb db) ??? ;; ;; NOTE: This DOESN'T (necessarily) get the real run ids, but the number of the .db!! (define (db:get-changed-run-ids since-time) - (let* ((dbdir (db:dbfile-path)) ;; (configf:lookup *configdat* "setup" "dbdir")) + (let* ((dbdir (common:make-tmpdir-name *toppath* "")) ;; (configf:lookup *configdat* "setup" "dbdir")) (alldbs (glob (conc *toppath* "/.mtdb/[0-9]*.db*"))) (changed (filter (lambda (dbfile) (> (file-modification-time dbfile) since-time)) alldbs))) (delete-duplicates @@ -2234,21 +2227,24 @@ qry run-id (or last-update 0)))))) (define (db:get-testinfo-state-status dbstruct run-id test-id) - (let ((res #f)) - (db:with-db dbstruct run-id #f - (lambda (dbdat db) - (sqlite3:for-each-row - (lambda (run-id testname item-path state status) - ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment - (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-"))) - db - "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;" - test-id run-id))) - res)) + (db:with-db + dbstruct run-id #f + (lambda (dbdat db) + (let* ((res #f) + (stmth (db:get-cache-stmth dbdat db "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;"))) + (sqlite3:for-each-row + (lambda (run-id testname item-path state status) + ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment + (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-"))) + ;; db + ;; "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;" + stmth + test-id run-id) + res)))) ;; get a useful subset of the tests data (used in dashboard ;; use db:mintest-get-{id ,run_id,testname ...} ;; (define (db:get-tests-for-run-mindata dbstruct run-id testpatt states statuses not-in) @@ -2637,18 +2633,20 @@ (db:with-db dbstruct run-id #f (lambda (dbdat db) - (let ((res (cons #f #f))) -;; (stmth (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=?;"))) - (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test - (lambda (state status) - (cons state status)) - db - "SELECT state,status FROM tests WHERE id=? AND run_id=?;" ;; stmth try not compiling this one - yes, this fixed the bind issue - test-id run-id) + (let ((res (cons #f #f)) + (stmth (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=? AND run_id=?;"))) + (db:with-mutex-for-stmth + (lambda() + (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test + (lambda (state status) + (cons state status)) + ;; db + stmth ;;"SELECT state,status FROM tests WHERE id=? AND run_id=?;" ;; stmth try not compiling this one - yes, this fixed the bind issue + test-id run-id))) res)))) ;; Use db:test-get* to access ;; Get test data using test_ids. NB// Only works within a single run!! ;; @@ -3728,25 +3726,35 @@ tags))) db "SELECT testname,tags FROM test_meta") (hash-table->alist res))))) +;; testmeta doesn't change, we can cache it for up too an hour + +(define *db:testmeta-cache* (make-hash-table)) +(define *db:testmeta-last-update* 0) + ;; read the record given a testname (define (db:testmeta-get-record dbstruct testname) - (let ((res #f)) - (db:with-db - dbstruct - #f - #f - (lambda (dbdat db) - (sqlite3:for-each-row - (lambda (id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup) - (set! res (vector id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup))) - db - "SELECT id,testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags,jobgroup FROM test_meta WHERE testname=?;" - testname) - res)))) + (if (and (< (- (current-seconds) *db:testmeta-last-update*) 600) + (hash-table-exists? *db:testmeta-cache* testname)) + (hash-table-ref *db:testmeta-cache* testname) + (let ((res #f)) + (db:with-db + dbstruct + #f + #f + (lambda (dbdat db) + (sqlite3:for-each-row + (lambda (id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup) + (set! res (vector id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup))) + db + "SELECT id,testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags,jobgroup FROM test_meta WHERE testname=?;" + testname))) + (hash-table-set! *db:testmeta-cache* testname res) + (set! *db:testmeta-last-update* (current-seconds)) + res))) ;; create a new record for a given testname (define (db:testmeta-add-record dbstruct testname) (db:with-db dbstruct #f #t (lambda (dbdat db) @@ -4314,11 +4322,11 @@ )))) ;; sync for filesystem local db writes ;; (define (db:run-lock-and-sync no-sync-db) - (let* ((tmp-area (common:get-db-tmp-area)) + (let* ((tmp-area (common:make-tmpdir-name *toppath* "")) (dbfiles (glob (conc tmp-area"/.mtdb/*.db"))) (sync-durations (make-hash-table))) ;; (debug:print-info 0 *default-log-port* "lock-and-sync, dbfiles: "dbfiles) (for-each (lambda (file) @@ -4370,11 +4378,11 @@ (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300)) (debug-mode (debug:debug-mode 1)) (last-time (current-seconds)) ;; last time through the sync loop (no-sync-db (db:open-no-sync-db)) (sync-duration 0) ;; run time of the sync in milliseconds - (tmp-area (common:get-db-tmp-area))) + (tmp-area (common:make-tmpdir-name *toppath* ""))) ;; Sync moved to http-transport keep-running loop (debug:print-info 2 *default-log-port* "Periodic copy-based sync thread started. syncer is copy-sync, tmp-area is " tmp-area) (debug:print-info 3 *default-log-port* "watchdog starting. syncer is copy-sync pid="(current-process-id));; " this-wd-num="this-wd-num) (if (and legacy-sync (not *time-to-exit*)) @@ -4478,11 +4486,11 @@ (for-each (lambda (subdb) (let* (;;(dbstruct (db:setup)) (mtdb (dbr:subdb-mtdb subdb)) (mtpath (db:dbdat-get-path mtdb)) - (tmp-area (common:get-db-tmp-area)) + (tmp-area (common:make-tmpdir-name *toppath* "")) (res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive (set! sync-duration (- (current-milliseconds) sync-start)) (if (> res 0) ;; some records were transferred, keep the db alive (begin (mutex-lock! *heartbeat-mutex*) @@ -4525,11 +4533,10 @@ ;; ;; time to exit, close the no-sync db here ;; (db:no-sync-close-db no-sync-db stmt-cache) (if (common:low-noise-print 30) (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) )))) )) - (define (std-exit-procedure) ;;(common:telemetry-log-close) (on-exit (lambda () 0)) ;; why is this here? ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -242,11 +242,12 @@ #f ) ) (define (dbfile:make-tmpdir-name areapath tmpadj) - (let* ((dname (conc "/tmp/"(current-user-name)"/" (string-translate areapath "/" ".") tmpadj))) + (let* ((area (pathname-file areapath)) + (dname (conc "/tmp/"(current-user-name)"/megatest_localdb/" area "/" (string-translate areapath "/" ".") tmpadj "/.mtdb"))) (unless (directory-exists? dname) (create-directory dname #t)) dname)) (define (dbfile:run-id->path apath run-id) @@ -487,11 +488,11 @@ ;; NOTE: this is already protected by mutex *no-sync-db-mutex* ;; (define (dbfile:raw-open-no-sync-db dbpath) (if (not (file-exists? dbpath)) (create-directory dbpath #t)) - (debug:print-info 0 *default-log-port* "Opening "dbpath"/no-sync.db") + (debug:print-info 2 *default-log-port* "(dbfile:raw-open-no-sync-db: Opening "dbpath"/no-sync.db") (let* ((dbname (conc dbpath "/no-sync.db")) (db-exists (file-exists? dbname)) (init-proc (lambda (db) (sqlite3:with-transaction db @@ -525,17 +526,18 @@ reason TEXT DEFAULT 'none', CONSTRAINT no_sync_processes UNIQUE (host,pid));" )))))) (on-tmp (equal? (car (string-split dbpath "/")) "tmp")) (db (if on-tmp - (dbfile:cautious-open-database dbname init-proc 0 "WAL" force-init: #t) - (dbfile:cautious-open-database dbname init-proc 0 #f force-init: #t) + (dbfile:cautious-open-database dbname init-proc 1 "WAL" force-init: #t) ;; WAL MODE should use syncronous=1 + ;; (dbfile:cautious-open-database dbname init-proc 0 #f force-init: #t) + (dbfile:cautious-open-database dbname init-proc 0 "MEMORY" force-init: #t) ;; (sqlite3:open-database dbname) ))) (if on-tmp ;; done in cautious-open-database (begin - (sqlite3:execute db "PRAGMA synchronous = 0;") + ;; (sqlite3:execute db "PRAGMA synchronous = 0;") ;; why was this here when is is handled by cautious-open-database? (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)))) db)) ;; mtest processes registry calls @@ -675,20 +677,26 @@ ;; fails returns (#f lock-creation-time identifier) ;; succeeds (returns (#t lock-creation-time identifier) ;; use (db:no-sync-del! db keyname) to release the lock ;; (define (db:no-sync-get-lock-with-id db keyname identifier) + (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: db: " db " keyname: " keyname " identifier: " identifier) (sqlite3:with-transaction db (lambda () (condition-case (let* ((curr-val (db:no-sync-get/default db keyname #f))) + (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: curr-val: " curr-val) (if curr-val (match (db:extract-time-identifier curr-val) ;; result->timestamp, identifier ((timestamp . ident) (cons (equal? ident identifier) timestamp)) - (else (cons #f 'malformed-lock))) ;; lock malformed + (else + (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: malformed lock") + (cons #f 'malformed-lock) + ) + ) ;; lock malformed (let ((curr-sec (current-seconds)) (lock-value (if identifier (conc (current-seconds)"+"identifier) (current-seconds)))) (sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-value) @@ -1572,7 +1580,15 @@ ;; (db:hoh-set! stmt-cache db stmt newstmth) (hash-table-set! stmt-cache stmt newstmth) newstmth)))) (mutex-unlock! *get-cache-stmth-mutex*) result)) + +(define *mutex-stmth-call* (make-mutex)) + +(define (db:with-mutex-for-stmth proc) + (mutex-lock! *mutex-stmth-call*) + (let* ((res (proc))) + (mutex-unlock! *mutex-stmth-call*) + res)) ) Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -31,10 +31,11 @@ chicken data-structures extras files + format (prefix sqlite3 sqlite3:) matchable posix typed-records srfi-1 @@ -198,11 +199,11 @@ (let* ((dbstruct (or dbstruct-in (make-dbr:dbstruct areapath: areapath))) (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) (dbpath (dbmod:get-dbdir dbstruct)) ;; directory where all the .db files are kept (dbfullname (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id)) (dbexists (file-exists? dbfullname)) - (tmpdir (dbfile:make-tmpdir-name areapath tmpadj)) + (tmpdir (common:make-tmpdir-name areapath tmpadj)) (tmpdb (let* ((fname (conc tmpdir"/"dbfname))) fname)) (cachedb (dbmod:open-cachedb-db init-proc ;; (if (eq? (dbfile:cache-method) 'cachedb) ;; #f @@ -225,50 +226,21 @@ (dbr:dbstruct-dbfname-set! dbstruct dbfname) (dbr:dbstruct-sync-proc-set! dbstruct (lambda (last-update) (if *sync-in-progress* (debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk") - (let* ((syncer-logfile (conc areapath"/logs/"dbfname"-syncer.log")) - (sync-cmd (if (eq? syncdir 'todisk) - (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "tmpdb" -to "dbfullname" -period 5 -timeout 10 > /dev/null 2&>1)&") - (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "dbfullname" -to "tmpdb" -period 5 -timeout 10 > /dev/null 2&>1)&"))) - (synclock-file (conc dbfullname".lock")) - (syncer-running-file (conc dbfullname"-sync-running")) - (synclock-mod-time (if (file-exists? synclock-file) - (handle-exceptions - exn - #f - (file-modification-time synclock-file)) - #f)) - (thethread (lambda () - (thread-start! - (make-thread - (lambda () - (set! *sync-in-progress* #t) - (debug:print-info "Running "sync-cmd) - (if (file-exists? syncer-running-file) - (debug:print-info 0 *default-log-port* "Syncer still running, skipping syncer start.") - (system sync-cmd)) - (set! *sync-in-progress* #f))))))) - (if ((if (eq? syncdir 'todisk) < >) ;; use less than for todisk, greater than for from disk - (file-modification-time tmpdb) - (file-modification-time dbfullname)) - (debug:print 4 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname) - (if synclock-mod-time - (if (> (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file - (begin - (handle-exceptions - exn - #f - (begin - (debug:print 0 *default-log-port* "Sync lock file " synclock-file "is older than 20 seconds (" synclock-mod-time " seconds). Removing it") - (delete-file synclock-file) - ) - ) - (thethread)) - (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found.")) - (thethread))))))) + (begin + ;; turn off writes - send busy or block? + ;; call db2db internally + ;; turn writes back on + ;; + (set! *api-halt-writes* #t) ;; do we need a mutex? + ;; (dbmod:db-to-db-sync src-db dest-db last-update (dbfile:db-init-proc) keys) + (debug:print-info 2 *default-log-port* "Internal sync running from "tmpdb" to "dbfullname) + (dbmod:db-to-db-sync tmpdb dbfullname last-update (dbfile:db-init-proc) keys) + (set! *api-halt-writes* #f) + )))) ;; (dbmod:sync-tables tables #f db cachedb) ;; (thread-sleep! 1) ;; let things settle before syncing in needed data (dbmod:sync-gasket tables #f cachedb db dbfullname 'fromdest keys) ;; ) ;; load into cachedb (dbr:dbstruct-last-update-set! dbstruct (+ (current-seconds) -10)) ;; should this be offset back in time by one second? @@ -856,6 +828,94 @@ (res (dbmod:sync-gasket tables last-update sdb ddb dest-db 'todisk keys))) (sqlite3:finalize! sdb) (sqlite3:finalize! ddb) res))) #f)) + +;; ====================================================================== +;; dbstats +;;====================================================================== + +;; (define *dbstruct-dbs* #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this. +;; db stats +(define *db-stats* (make-hash-table)) ;; hash of vectors < count duration-total > +(define *db-stats-mutex* (make-mutex)) + +(define (rmt:print-db-stats) + (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f" + (debug:print 0 *default-log-port* "DB Stats\n========") + (debug:print 0 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg")) + (for-each (lambda (cmd) + (let* ((dat (hash-table-ref *db-stats* cmd)) + (count (dbstat-cnt dat)) + (tottime (dbstat-tottime dat))) + (debug:print 0 *default-log-port* + (format #f fmtstr cmd count tottime + (/ tottime count))))) + (sort (hash-table-keys *db-stats*) + (lambda (a b) + (> (dbstat-tottime (hash-table-ref *db-stats* a)) + (dbstat-tottime (hash-table-ref *db-stats* b)))))))) + +(defstruct dbstat + (cnt 0) + (tottime 0)) + +(define (db:add-stats cmd run-id params delta) + (let* ((modified-cmd (if (eq? cmd 'general-call) + (string->symbol (conc "general-call-" (car params))) + cmd)) + (rec (hash-table-ref/default *db-stats* modified-cmd #f))) + (if (not rec) + (let ((new-rec (make-dbstat))) + (hash-table-set! *db-stats* modified-cmd new-rec) + (set! rec new-rec))) + (dbstat-cnt-set! rec (+ (dbstat-cnt rec) 1)) + (dbstat-tottime-set! rec (+ (dbstat-tottime rec) delta)))) + + + ) + + +;; ATTIC + + #;(let* ((syncer-logfile (conc areapath"/logs/"dbfname"-syncer.log")) + (sync-cmd (if (eq? syncdir 'todisk) + (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "tmpdb" -to "dbfullname" -period 5 -timeout 10 > /dev/null 2&>1)&") + (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "dbfullname" -to "tmpdb" -period 5 -timeout 10 > /dev/null 2&>1)&"))) + (synclock-file (conc dbfullname".lock")) + (syncer-running-file (conc dbfullname"-sync-running")) + (synclock-mod-time (if (file-exists? synclock-file) + (handle-exceptions + exn + #f + (file-modification-time synclock-file)) + #f)) + (thethread (lambda () + (thread-start! + (make-thread + (lambda () + (set! *sync-in-progress* #t) + (debug:print-info "Running "sync-cmd) + (if (file-exists? syncer-running-file) + (debug:print-info 0 *default-log-port* "Syncer still running, skipping syncer start.") + (system sync-cmd)) + (set! *sync-in-progress* #f))))))) + (if ((if (eq? syncdir 'todisk) < >) ;; use less than for todisk, greater than for from disk + (file-modification-time tmpdb) + (file-modification-time dbfullname)) + (debug:print 4 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname) + (if synclock-mod-time + (if (> (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file + (begin + (handle-exceptions + exn + #f + (begin + (debug:print 0 *default-log-port* "Sync lock file " synclock-file "is older than 20 seconds (" synclock-mod-time " seconds). Removing it") + (delete-file synclock-file) + ) + ) + (thethread)) + (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found.")) + (thethread)))) Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -236,11 +236,10 @@ (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) - ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load @@ -256,15 +255,16 @@ (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-state-status-by-id run-id test-id)) (state (car test-info));; (db:test-get-state test-info)) (status (cdr test-info));; (db:test-get-status test-info)) + (killreq (equal? state "KILLREQ")) (kill-reason "no kill reason specified") (kill-job? #f)) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond - ((test-get-kill-request run-id test-id) + (killreq (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) @@ -276,16 +276,11 @@ (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty (launch:handle-zombie-tests run-id)) (when do-sync - ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) - ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) - ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) - ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))) - ) + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) @@ -331,18 +326,17 @@ ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. But run end of run before exiting? (launch:end-of-run-check run-id) (exit))) - (if (hash-table-ref/default misc-flags 'keep-going #f) + (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (begin - (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses - (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta - (loop (calc-minutes) - (or new-cpu-load cpu-load) - (or new-disk-free disk-free) - (if do-sync (current-seconds) last-sync))))))) + (thread-sleep! 6) ;; was 3 + (loop (calc-minutes) + (or new-cpu-load cpu-load) + (or new-disk-free disk-free) + (if do-sync (current-seconds) last-sync)))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.8017) +(define megatest-version 1.8018) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -968,12 +968,13 @@ (tl (launch:setup)) (keys (keys:config-get-fields *configdat*))) (case (rmt:transport-mode) ((tcp) (let* ((timeout (server:expiration-timeout))) - (debug:print 0 *default-log-port* "INFO: Running using tcp method with server timeout of "timeout) + (debug:print 0 *default-log-port* "INFO: Starting server for " dbfname " using tcp method with server timeout of "timeout) (tt-server-timeout-param timeout) + (thread-start! (make-thread api:print-db-stats "print-db-stats")) (if dbfname (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys) (begin (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.") (exit 1))))) @@ -1091,10 +1092,14 @@ sfiles ) ) ) dbfiles + ) + ;; remove this db, because otherwise metadata contains records for old servers, and this causes a problem with db:no-sync-get-lock-with-id. + (if (file-exists? (conc *toppath* "/.mtdb/no-sync.db")) + (delete-file (conc *toppath* "/.mtdb/no-sync.db")) ) (set! *didsomething* #t) (exit) ) ) @@ -2132,13 +2137,13 @@ (exit 1))) (if (common:file-exists? (conc *toppath* "/megatest.db")) (begin (debug:print-info 1 *default-log-port* "File " (conc *toppath* "/megatest.db") " already exists. Please remove it before trying to replicate db") (exit 1))) - (if (and (common:get-db-tmp-area) (> (length (directory (common:get-db-tmp-area) #f)) 0)) + (if (and (common:make-tmpdir-name *toppath* "") (> (length (directory (common:make-tmpdir-name *toppath* "") #f)) 0)) (begin - (debug:print-info 1 *default-log-port* (common:get-db-tmp-area) " not empty. Please remove it before trying to replicate db") + (debug:print-info 1 *default-log-port* (common:make-tmpdir-name *toppath* "") " not empty. Please remove it before trying to replicate db") (exit 1))) ;; check if timestamp (let* ((source (args:get-arg "-source")) (src (if (not (equal? (substring source 0 1) "/")) (conc (current-directory) "/" source) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -68,82 +68,56 @@ ;;====================================================================== (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id -;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) -;; -(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected +;; how to make area-dat +(define (rmt:set-ttdat areapath ttdat) + (if ttdat + ttdat + (let* ((newremote (make-and-init-remote areapath))) + (set! *ttdat* newremote) + newremote))) + +;; NB// area-dat replaced by ttdat +;; +(define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f)) (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.") - - (if (not (eq? (rmt:transport-mode) 'nfs)) - (begin - (if (> attemptnum 2) - (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) - - (cond - ((> attemptnum 2) (thread-sleep! 0.05)) - ((> attemptnum 10) (thread-sleep! 0.5)) - ((> attemptnum 20) (thread-sleep! 1))) - - ;; I'm turning this off, it may make sense to move it - ;; into http-transport-handler - (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15))) - (begin - (debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.") - (case (rmt:transport-mode) - ((http) - (server:run *toppath*) - (thread-sleep! 3)) - (else - (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server - )))))) - - ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote - ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. - ;; 3. do the query, if on homehost use local access - ;; (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas - (runremote (or area-dat - *runremote*)) (attemptnum (+ 1 attemptnum)) - (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*)) + (readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*)) (testsuite (common:get-testsuite-name)) - (mtexe (common:find-local-megatest))) - - (case (rmt:transport-mode) - ((http)(http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode)) - ((tcp) (tcp-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe)) - ((nfs) (nfs-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe)) - ))) - -(define (nfs-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe) - (let* ((keys (common:get-fields *configdat*)) - (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard"))) - (api:dispatch-request dbstruct cmd run-id params))) - -(define (tcp-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe) - (if (not runremote) - (let* ((newremote (make-and-init-remote areapath))) - (set! *runremote* newremote) - (set! runremote newremote))) - (let* ((dbfname (conc (dbfile:run-id->dbnum run-id)".db"))) ;;(dbfile:run-id->path areapath run-id))) - (tt:handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))) - -(define (rmt:print-db-stats) - (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f" - (debug:print 18 *default-log-port* "DB Stats\n========") - (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg")) - (for-each (lambda (cmd) - (let ((cmd-dat (hash-table-ref *db-stats* cmd))) - (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0)))))) - (sort (hash-table-keys *db-stats*) - (lambda (a b) - (> (vector-ref (hash-table-ref *db-stats* a) 0) - (vector-ref (hash-table-ref *db-stats* b) 0))))))) - + (mtexe (common:find-local-megatest)) + (dbfname (conc (dbfile:run-id->dbnum run-id)".db")) + (ttdat (rmt:set-ttdat areapath ttdat)) + (conn (tt:get-conn ttdat dbfname)) + (is-main (equal? dbfname "main.db")) ;; why not (not run-id) ? + (server-start-proc (if is-main + #f + (lambda () + ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname) + (rmt:start-server ;; tt:server-process-run + areapath + testsuite ;; (dbfile:testsuite-name) + mtexe + run-id))))) + ;; here we look at ttdat, if dbfname is NOT main.db we check that a conn exists for it + ;; and if there is no conn we first send a request to the main.db server to start a + ;; server for the dbfname. + #;(if (and (not is-main)(not conn)) ;; no existing connection to non-main server, call in a start up request + (begin + (server-start-proc) + (thread-sleep! 1))) + (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc))) + +;; KEEP THIS HERE FOR A WHILE, WE MAY WANT TO RESURECT IT +;; (define (nfs-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe) +;; (let* ((keys (common:get-fields *configdat*)) +;; (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard"))) +;; (api:dispatch-request dbstruct cmd run-id params))) + (define (rmt:get-max-query-average run-id) (mutex-lock! *db-stats-mutex*) (let* ((runkey (conc "run-id=" run-id " ")) (cmds (filter (lambda (x) (substring-index runkey x)) @@ -167,11 +141,11 @@ (mutex-unlock! *db-stats-mutex*) res)) (define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5)) (let* ((qry-is-write (not (member cmd api:read-only-queries))) - (db-file-path (db:dbfile-path)) ;; 0)) + (db-file-path (common:make-tmpdir-name *toppath* "")) ;; 0)) (dbstructs-local (db:setup #t)) (read-only (not (file-write-access? db-file-path))) (start (current-milliseconds)) (resdat (if (not (and read-only qry-is-write)) (let ((v (api:execute-requests dbstructs-local (vector (symbol->string cmd) params)))) @@ -204,11 +178,11 @@ ;; (rmt:update-db-stats run-id cmd params duration) ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it (if qry-is-write (let ((start-time (current-seconds))) (mutex-lock! *db-multi-sync-mutex*) -/ (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) + (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) (mutex-unlock! *db-multi-sync-mutex*))))) res)) ;;====================================================================== ;; @@ -221,12 +195,12 @@ ;;====================================================================== (define (rmt:kill-server run-id) (rmt:send-receive 'kill-server run-id (list run-id))) -(define (rmt:start-server run-id) - (rmt:send-receive 'start-server 0 (list run-id))) +(define (rmt:start-server areapath testsuite mtexe run-id) ;; run on main.db server + (rmt:send-receive 'start-server #f (list areapath testsuite mtexe run-id))) ;;====================================================================== ;; M I S C ;;====================================================================== @@ -235,16 +209,16 @@ ;; This login does no retries under the hood - it acts a bit like a ping. ;; Deprecated for nmsg-transport. ;; ;; (define (rmt:login-no-auto-client-setup runremote) -;; (rmt:send-receive-no-auto-client-setup runremote 'login 0 (list *toppath* megatest-version (client:get-signature)))) +;; (rmt:send-receive-no-auto-client-setup runremote 'login #f (list *toppath* megatest-version (client:get-signature)))) ;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host (define (rmt:get-latest-host-load hostname) - (rmt:send-receive 'get-latest-host-load 0 (list hostname))) + (rmt:send-receive 'get-latest-host-load #f (list hostname))) (define (rmt:sdb-qry qry val run-id) ;; add caching if qry is 'getid or 'getstr (rmt:send-receive 'sdb-qry run-id (list qry val))) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -2064,11 +2064,11 @@ ;; ;; There is now a single call to runs:update-all-test_meta and this ;; per-test call is not needed. Given the delicacy of the move to ;; v1.55 this code is being left in place for the time being. ;; - (if (not (hash-table-ref/default *test-meta-updated* test-name #f)) + (if (not (hash-table-exists? *test-meta-updated* test-name)) (begin (hash-table-set! *test-meta-updated* test-name #t) (runs:update-test_meta test-name test-conf))) ;; itemdat => ((ripeness "overripe") (temperature "cool") (season "summer")) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -84,11 +84,11 @@ (tasks:open-db numretries (- numretries 1))) (begin (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 5 *default-log-port* " exn=" (condition->list exn)))) - (let* ((dbpath (db:dbfile-path )) ;; (tasks:get-task-db-path)) + (let* ((dbpath (common:make-tmpdir-name *toppath* "")) ;; (tasks:get-task-db-path)) (dbfile (conc dbpath "/monitor.db")) (avail (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away (exists (common:file-exists? dbpath)) (write-access (file-write-access? dbpath)) (mdb (cond ;; what the hek is *toppath* doing here? Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -111,10 +111,11 @@ ;; (define tt-server-timeout-param (make-parameter 600)) ;; make ttdat visible (define *server-info* #f) +(define *server-run* #t) (define (tt:make-remote areapath) (make-tt areapath: areapath)) ;; 1 ... or #f @@ -125,33 +126,40 @@ (and (or (number? run-id) (not run-id)) (equal? (dbfile:run-id->dbfname run-id) dbfname))) (tcp-buffer-size 2048) -;; (max-connections 4096) +;; (max-connections 4096) + +(define (tt:get-conn ttdat dbfname) + (hash-table-ref/default (tt-conns ttdat) dbfname #f)) ;; do all the busy work of finding and setting up conn for ;; connecting to a server ;; -(define (tt:client-connect-to-server ttdat dbfname run-id testsuite) +(define (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc) (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id) - (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f)) - (server-start-proc (lambda () - (tt:server-process-run - (tt-areapath ttdat) - testsuite ;; (dbfile:testsuite-name) - (common:find-local-megatest) - run-id)))) + (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id) + (let* ((conn (tt:get-conn ttdat dbfname)) + (server-start-proc (or server-start-proc + (lambda () + (assert (equal? dbfname "main.db") ;; only main.db is started here + "FATAL: called server-start-proc for db other than main.db") + (tt:server-process-run + (tt-areapath ttdat) + testsuite ;; (dbfile:testsuite-name) + (common:find-local-megatest) + run-id))))) (if conn (begin - ; (debug:print-info 0 *default-log-port* "already connected to the server") + (debug:print-info 2 *default-log-port* "already connected to a server") conn) ;; we are already connected to the server (let* ((sdat (tt:get-current-server-info ttdat dbfname))) (match sdat ((host port start-time server-id pid dbfname2 servinffile) (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.") - ;(debug:print-info 0 *default-log-port* "in match servinffile:" servinffile) + (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile) (let* ((host-port (conc host":"port)) (conn (make-tt-conn host: host port: port host-port: host-port @@ -162,35 +170,41 @@ pid: pid))) ;; verify we can talk to this server (let* ((result (tt:timed-ping host port server-id)) (ping-res (car result)) (ping (cdr result))) - (debug:print-info 0 *default-log-port* "ping time: " ping) + (debug:print-info 2 *default-log-port* "host " host " port " port " ping time: " ping " result " ping-res) (case ping-res ((running) + (debug:print-info 2 *default-log-port* "Setting conn = " conn " in hash table") (hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good? conn) ((starting) (thread-sleep! 0.5) - (tt:client-connect-to-server ttdat dbfname run-id testsuite)) + (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect") + (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)) (else (let* ((curr-secs (current-seconds))) ;; rm the (last server) would go here (if (> (- curr-secs (tt-last-serv-start ttdat)) 10) (begin (tt-last-serv-start-set! ttdat curr-secs) - (server-start-proc))) ;; start server if 30 sec since last attempt + (server-start-proc))) ;; start server if 10 sec since last attempt (thread-sleep! 1) - (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) + (debug:print-info 2 *default-log-port* "server ping result was neither running nor starting. Retrying connect") + (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) (else ;; no good server found, if haven't started server in > 5 secs, start another - (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; BUG - grow this number really do not want to swamp the machine with servers + (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers (begin - (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname) + (debug:print-info 0 *default-log-port* "Starting server for "dbfname) (server-start-proc) - (tt-last-serv-start-set! ttdat (current-seconds)))) + (tt-last-serv-start-set! ttdat (current-seconds)) + (thread-sleep! 3) + )) (thread-sleep! 1) - (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) + (debug:print-info 0 *default-log-port* "Connect to server for " dbfname) + (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) (define (tt:timed-ping host port server-id) (let* ((start-time (current-milliseconds)) (result (tt:ping host port server-id))) (cons result (- (current-milliseconds) start-time)))) @@ -222,18 +236,18 @@ ;; client side handler ;; ;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest") ;; -(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe) - ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now. - (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f))) +(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc) + ;; connect-to-server will start a server if needed. + (let* ((areapath (tt-areapath ttdat)) + (conn (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))) ;; looks up conn keyed by dbfname (if conn ;; have connection, call the server (let* ((res (tt:send-receive ttdat conn cmd run-id params))) ;; res is (status errmsg result meta) - ; (debug:print 0 *default-log-port* "conn:" conn " res: " res) (match res ((status errmsg result meta) (if (list? meta) (let* ((delay-wait (alist-ref 'delay-wait meta))) (if (and (number? delay-wait) @@ -241,35 +255,36 @@ (begin (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds") (thread-sleep! delay-wait))))) (case status ((busy) ;; result will be how long the server wants you to delay - (let* ((dly (if (number? result) result 0.1))) - (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, will try again in "dly" seconds.") + (let* ((raw-dly (if (number? result) result 0.1)) + (dly (+ raw-dly (/ attemptnum 10)))) ;; (* raw-dly (/ attemptnum 2)))) + (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, cmd is "cmd", will try again in "dly" seconds. This is attempt "(- attemptnum 1)) (thread-sleep! dly) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))) ((loaded) (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.") (tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn)) - result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) + result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)) (else result))) (else ;; did not receive properly formated result - (if (not res) ;; tt:handler is telling us that communication failed + (if (not res) ;; tt:send-receive telling us that communication failed (let* ((host (tt-conn-host conn)) (port (tt-conn-port conn)) ;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db (pid (tt-conn-pid conn)) ;;(servinf (tt-conn-servinf-file conn))) (servinf (tt-servinf-file ttdat))) ;; (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) ;; TODO, use (server:get-servinfo-dir areapath) - (hash-table-set! (tt-conns ttdat) dbfname #f) + (hash-table-set! (tt-conns ttdat) dbfname #f) ;; clear out the conn for this dbfname to force finding new server (if (and servinf (file-exists? servinf)) (begin (if (< attemptnum 10) (begin (thread-sleep! 0.5) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)) (begin (debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname) (if (and (file-exists? servinf) (> (- (current-seconds)(file-modification-time servinf)) 60)) (begin @@ -276,30 +291,30 @@ (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.") (handle-exceptions exn #f (delete-file* servinf)) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)) (begin ;; start server - addressed in client-connect-to-server ;; delay - addressed in client-connect-to-server ;; try again (thread-sleep! 0.25) ;; dunno, I think this needs to be here - (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)) )))) (begin ;; no server file, delay and try again - (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf) + (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ") (thread-sleep! 0.5) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)))) (begin ;; this case is where res is malformed. Probably should abort (assert #f "FATAL: tt:handler received bad data "res) ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.") - ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe) + ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe) ))))) (begin (thread-sleep! 1) ;; no conn yet set up, give it a rest and try again - (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))))) + (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc))))) (define (tt:bid-for-servership run-id) #f) ;; gets server info and appends path to server file @@ -464,14 +479,16 @@ ;; Server viability is checked in keep-running. Blindly start and run here. ;; (define (tt:start-server areapath run-id dbfname-in handler keys) (assert areapath "FATAL: areapath not provided for tt:start-server") ;; is there already a server for this dbfile? Then exit. + (debug:print 2 *default-log-port* "tt:start-server: " dbfname-in) (let* ((ttdat (make-tt areapath: areapath)) (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead - (if (> (length servers) 4) + (debug:print 0 *default-log-port* "Found " (length servers) " already running for " dbfname) + (if (> (length servers) 0) (begin (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.") (exit)) (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys))) (tt-handler-set! ttdat (handler dbstruct)) @@ -522,18 +539,17 @@ (let* ((db-locked-in #f) (areapath (tt-areapath ttdat)) (nosyncdbpath (conc areapath"/.mtdb")) (cleanup (lambda () (if (tt-cleanup-proc ttdat) - ((tt-cleanup-proc ttdat))) + ((tt-cleanup-proc ttdat))) ;; removes .servinfo file (dbfile:with-no-sync-db nosyncdbpath (lambda (db) (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct))) - (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname) + ;; (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname) (db:no-sync-del! db dbfname) - #;(if dbtmpname - (delete-file dbtmpname)))))))) + )))))) (set! *server-info* ttdat) (let loop ((count 0)) (if (> count 240) (begin (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.") @@ -548,10 +564,13 @@ ;; in over ten seconds we exit (thread-sleep! 0.05) ;; any real need for delay here? (let loop () (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) (ok (cond + ((not *server-run*) + (debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.") + #f) ((null? servers) #f) ;; not ok ((equal? (list-ref (car servers) 6) ;; compare the servinfofile (tt-servinf-file ttdat)) (let* ((res (if db-locked-in #t @@ -576,10 +595,11 @@ (if (and res (common:low-noise-print 120 "top server message")) (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for " dbfname" on "(tt-host ttdat)":"(tt-port ttdat))) res)) (else + ;; wrong servinfo file (debug:print-info 0 *default-log-port* "I'm not the lead server: "servers) (let* ((leadsrv (car servers))) (match leadsrv ((host port startseconds server-id pid dbfname servinfofile) (let* ((result (tt:timed-ping host port server-id)) @@ -616,12 +636,13 @@ (let* ((last-update (dbr:dbstruct-last-update dbstruct)) (curr-secs (current-seconds))) (if (and (eq? (tt-state ttdat) 'running) (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db? - (begin - (set! (file-modification-time (tt-servinf-file ttdat)) (current-seconds)) + (let* ((sinfo-file (tt-servinf-file ttdat))) + ;; (debug:print 0 *default-log-port* "INFO: touching "sinfo-file) + (set! (file-modification-time sinfo-file) (current-seconds)) ((dbr:dbstruct-sync-proc dbstruct) last-update) (dbr:dbstruct-last-update-set! dbstruct curr-secs)))) (if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param)) (begin @@ -704,12 +725,28 @@ ;; if more than one, wait one second and look again ;; future: ping oldest, if alive remove other : files ;; (define (tt:find-server areapath dbfname) (let* ((servdir (tt:get-servinfo-dir areapath)) - (sfiles (glob (conc servdir"/*:"dbfname)))) - sfiles)) + (sfiles (glob (conc servdir"/*:"dbfname))) + (goodfiles '())) + + ;; filter the files here by looking in processes table (if we are not main.db) + ;; and or look at the time stamp on the servinfo file, a running server will + ;; touch the file every minute (again, this will only apply for main.db) + (for-each (lambda (fname) + (let* ((age (- (current-seconds)(file-modification-time fname)))) + (if (> age 200) ;; can't trust it if over twenty seconds old + (begin + (debug:print 0 *default-log-port* "WARNING: removing stale servinfo file "fname) + (handle-exceptions + exn + (debug:print 0 *default-log-port* "WARNING: error attempting to remove stale servinfo file "fname) + (delete-file fname))) ;; + (set! goodfiles (cons fname goodfiles))))) + sfiles) + goodfiles)) ;; given a path to a server info file return: host port startseconds server-id pid dbfname logf ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 ;; @@ -719,12 +756,13 @@ (dbprep-found 0) (bad-dat (list #f #f #f #f #f #f logf))) (let ((fdat (handle-exceptions exn (begin - ;; WARNING: this is potentially dangerous to blanket ignore the errors - (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn)) + ;; BUG, TODO: add err checking, for now blanket ignore the errors? + (debug:print-info 0 *default-log-port* "Unable to get server info from "logf + ", exn="(condition->list exn)) '()) ;; no idea what went wrong, call it a bad server, return empty list (with-input-from-file logf read-lines)))) (if (null? fdat) ;; bad data, return bad-dat bad-dat (let loop ((inl (car fdat)) @@ -750,10 +788,17 @@ logf)) (else (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst) bad-dat))))))))) +(define *last-server-start* (make-hash-table)) + +(define (tt:too-recent-server-start dbfname) + (let* ((last-run-time (hash-table-ref/default *last-server-start* dbfname #f))) + (and last-run-time + (< (- (current-seconds) last-run-time) 5)))) + ;; Given an area path, start a server process ### NOTE ### > file 2>&1 ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; @@ -760,51 +805,58 @@ (define (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area (assert areapath "FATAL: tt:server-process-run called without areapath defined.") (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.") (assert mtexe "FATAL: tt:server-process-run called without mtexe defined.") ;; mtest -server - -m testsuite:ext-tests -db 6.db - (let* ((dbfname (dbmod:run-id->dbfname run-id)) - (load (get-normalized-cpu-load)) - (trying (length (tt:find-server areapath dbfname))) - (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname)))) - (cond - ((> load 2.0) - (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server.") - (thread-sleep! 1)) - ((> nrun 100) - (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.") - (thread-sleep! 1)) - ((> trying 4) - (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.") - (thread-sleep! 1)) - (else - (if (not (file-exists? (conc areapath"/logs"))) - (create-directory (conc areapath"/logs") #t)) - (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) - (cmdln (conc - mtexe - " -startdir "areapath - " -server - ";; (or target-host "-") - " -m testsuite:"testsuite - " -db "dbfname ;; (dbmod:run-id->dbfname run-id) - " " profile-mode - (conc " >> " logfile " 2>&1 &")))) - ;; we want the remote server to start in *toppath* so push there - ;; (push-directory areapath) ;; use cd in the command line instead - (debug:print 0 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath) - ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) - - (system cmdln) - ;; ;; use below to go back to nbfake - nbfake does cause trouble ... - ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... - ;; (setenv "NBFAKE_LOG" logfile) - ;; (system (conc "cd "areapath" ; nbfake " cmdln)) - ;; (unsetenv "NBFAKE_QUIET") - ;; (unsetenv "NBFAKE_LOG") - - ;;(pop-directory) - ))))) + (let* ((dbfname (dbmod:run-id->dbfname run-id))) + (if (tt:too-recent-server-start dbfname) + #f + (let* ((load (get-normalized-cpu-load)) + (srvrs (tt:find-server areapath dbfname)) + (trying (length srvrs)) + (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname)))) + (cond + ((> load 2.0) + (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes") + (thread-sleep! 1) + #f) + ((> nrun 100) + (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.") + (thread-sleep! 1) + #f) + ((> trying 2) + (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.") + (thread-sleep! 1) + #f) + (else + (if (not (file-exists? (conc areapath"/logs"))) + (create-directory (conc areapath"/logs") #t)) + (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) + (cmdln (conc + mtexe + " -startdir "areapath + " -server - ";; (or target-host "-") + " -m testsuite:"testsuite + " -db "dbfname ;; (dbmod:run-id->dbfname run-id) + " " profile-mode + (conc " >> " logfile " 2>&1 &")))) + ;; we want the remote server to start in *toppath* so push there + ;; (push-directory areapath) ;; use cd in the command line instead + (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath) + ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) + + (system cmdln) + (hash-table-set! *last-server-start* dbfname (current-seconds)) + ;; ;; use below to go back to nbfake - nbfake does cause trouble ... + ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... + ;; (setenv "NBFAKE_LOG" logfile) + ;; (system (conc "cd "areapath" ; nbfake " cmdln)) + ;; (unsetenv "NBFAKE_QUIET") + ;; (unsetenv "NBFAKE_LOG") + + ;;(pop-directory) + #t))))))) ;;====================================================================== ;; tcp connection stuff ;;====================================================================== Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1964,11 +1964,12 @@ ;; test steps ;;====================================================================== ;; teststep-set-status! used to be here -(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat) +;; NOT NEEDED +#;(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat) (let* ((testdat (rmt:get-test-state-status-by-id run-id test-id))) (and testdat (equal? (car testdat) "KILLREQ")))) (define (test:tdb-get-rundat-count tdb) Index: utils/mt_xterm ================================================================== --- utils/mt_xterm +++ utils/mt_xterm @@ -20,18 +20,16 @@ MT_TMPDISPLAY=$DISPLAY MT_TMPUSER=$USER MT_HOME=$HOME tmpfile=`mktemp` - -grep -v "export USER=" megatest.sh | grep -v "export HOME=" > $tmpfile -source $tmpfile -rm $tmpfile - -# if [ -e megatest.sh ];then -#source megatest.sh -#fi +if [[ -e megatest.sh ]]; then + grep -v "export USER=" megatest.sh | grep -v "export HOME=" > $tmpfile + source $tmpfile + rm $tmpfile +fi + export DISPLAY=$MT_TMPDISPLAY export USER=$USER export HOME=$MT_HOME if [ x"$MT_XTERM_CMD" == "x" ];then