Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -36,14 +36,15 @@ all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut tcmt transport-mode.scm : transport-mode.scm.template cp transport-mode.scm.template transport-mode.scm -dashboard-mode.scm : transport-mode.scm.template +dashboard-transport-mode.scm : transport-mode.scm.template cp transport-mode.scm.template dashboard-transport-mode.scm megatest.scm : transport-mode.scm +dashboard.scm : dashboard-transport-mode.scm # dbmod.import.o is just a hack here mofiles/dbfile.o : mofiles/debugprint.o mofiles/commonmod.o dbmod.import.o db.o : dbmod.import.o mofiles/debugprint.o : mofiles/mtargs.o Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -155,101 +155,105 @@ ;; ;; - returns #( flag result ) ;; (define (api:execute-requests dbstruct dat) (db:open-no-sync-db) ;; sets *no-sync-db* -;; (handle-exceptions -;; exn -;; (let ((call-chain (get-call-chain))) -;; (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn) -;; (print-call-chain (current-error-port)) -;; (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + ;; (handle-exceptions + ;; exn + ;; (let ((call-chain (get-call-chain))) + ;; (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn) + ;; (print-call-chain (current-error-port)) + ;; (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) ;; (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (if (> *api-process-request-count* 200) (begin (if (common:low-noise-print 30 "too many threads") (debug:print 0 *default-log-port* "WARNING: "*api-process-request-count*" threads, potential overload, adding 0.5 sec delay.")) (thread-sleep! 0.5) ;; take a nap )) - (cond - ((not (vector? dat)) ;; it is an error to not receive a vector - (vector #f (vector #f "remote must be called with a vector"))) - #;((> *api-process-request-count* 200) ;; 20) - (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") - (set! *server-overloaded* #t) - (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! - (else - (let* ((cmd-in (vector-ref dat 0)) - (cmd (if (symbol? cmd-in) - cmd-in - (string->symbol cmd-in))) - (params (vector-ref dat 1)) - (run-id (if (null? params) - 0 - (car params))) - (write-mutex (if (hash-table-exists? *db-write-mutexes* run-id) - (hash-table-ref *db-write-mutexes* run-id) - (let* ((newmutex (make-mutex))) - (hash-table-set! *db-write-mutexes* run-id newmutex) - newmutex))) - (start-t (current-milliseconds)) - (readonly-mode (dbr:dbstruct-read-only dbstruct)) - (readonly-command (member cmd api:read-only-queries)) - (writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))) - (if (not readonly-command) - (mutex-lock! write-mutex)) - (let* ((tmppath (dbr:dbstruct-tmppath dbstruct)) - (clean-run-id (cond - ((number? run-id) run-id) - ((equal? run-id #f) "main") - (else "other"))) - (crumbfile (dbfile:wait-for-qif tmppath clean-run-id (cons cmd params))) - (res - (if writecmd-in-readonly-mode - (conc "attempt to run write command "cmd" on a read-only database") - (api:dispatch-request dbstruct cmd run-id params)))) - (delete-file* crumbfile) - (if (not readonly-command) - (mutex-unlock! write-mutex)) - - ;; save all stats - (let ((delta-t (- (current-milliseconds) - start-t)) - (modified-cmd (if (eq? cmd 'general-call) - (string->symbol (conc "general-call-" (car params))) - cmd))) - (hash-table-set! *db-api-call-time* modified-cmd - (cons delta-t (hash-table-ref/default *db-api-call-time* modified-cmd '())))) - (if writecmd-in-readonly-mode - (begin - #;(common:telemetry-log (conc "api-out:"(->string cmd)) - payload: `((params . ,params) - (ok-res . #t))) - (vector #f res)) - (begin - #;(common:telemetry-log (conc "api-out:"(->string cmd)) - payload: `((params . ,params) - (ok-res . #f))) - (vector #t res)))))))) + (cond + ((not (vector? dat)) ;; it is an error to not receive a vector + (vector #f (vector #f "remote must be called with a vector"))) + #;((> *api-process-request-count* 200) ;; 20) + (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") + (set! *server-overloaded* #t) + (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! + (else + (let* ((cmd-in (vector-ref dat 0)) + (cmd (if (symbol? cmd-in) + cmd-in + (string->symbol cmd-in))) + (params (vector-ref dat 1)) + (run-id (if (null? params) + 0 + (car params))) + (write-mutex (if (hash-table-exists? *db-write-mutexes* run-id) + (hash-table-ref *db-write-mutexes* run-id) + (let* ((newmutex (make-mutex))) + (hash-table-set! *db-write-mutexes* run-id newmutex) + newmutex))) + (start-t (current-milliseconds)) + (readonly-mode (dbr:dbstruct-read-only dbstruct)) + (readonly-command (member cmd api:read-only-queries)) + (writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))) + (if (not readonly-command) + (mutex-lock! write-mutex)) + (let* ((tmppath (dbr:dbstruct-tmppath dbstruct)) + (clean-run-id (cond + ((number? run-id) run-id) + ((equal? run-id #f) "main") + (else "other"))) + (crumbfile (dbfile:wait-for-qif tmppath clean-run-id (cons cmd params))) + (res + (if writecmd-in-readonly-mode + (conc "attempt to run write command "cmd" on a read-only database") + (api:dispatch-request dbstruct cmd run-id params)))) + (delete-file* crumbfile) + (if (not readonly-command) + (mutex-unlock! write-mutex)) + + ;; save all stats + (let ((delta-t (- (current-milliseconds) + start-t)) + (modified-cmd (if (eq? cmd 'general-call) + (string->symbol (conc "general-call-" (car params))) + cmd))) + (hash-table-set! *db-api-call-time* modified-cmd + (cons delta-t (hash-table-ref/default *db-api-call-time* modified-cmd '())))) + (if writecmd-in-readonly-mode + (begin + #;(common:telemetry-log (conc "api-out:"(->string cmd)) + payload: `((params . ,params) + (ok-res . #t))) + (vector #f res)) + (begin + #;(common:telemetry-log (conc "api-out:"(->string cmd)) + payload: `((params . ,params) + (ok-res . #f))) + (vector #t res)))))))) ;; indat is (cmd run-id params meta) +;; +;; WARNING: Do not print anything in this function as it reads/writes to current in/out port +;; (define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params) (lambda () (let* ((indat (deserialize))) (set! *api-process-request-count* (+ *api-process-request-count* 1)) (match indat ((cmd run-id params meta) (let* ((status (cond - ((> *api-process-request-count* 50) 'busy) - ((> *api-process-request-count* 25) 'loaded) + ;; turn off busy throttling while trying to get things stable + ;; ((> *api-process-request-count* 50) 'busy) + ;; ((> *api-process-request-count* 25) 'loaded) (else 'ok))) (errmsg (case status ((busy) (conc "Server overloaded, "*api-process-request-count*" threads in flight")) ((loaded) (conc "Server loaded, "*api-process-request-count*" threads in flight")) (else #f))) (result (case status - ((busy) #f) + ((busy loaded) #f) (else (case cmd ((ping) (tt:mk-signature *toppath*)) (else (api:dispatch-request dbstruct cmd run-id params)))))) @@ -259,10 +263,11 @@ (else (assert #f "FATAL: failed to deserialize indat "indat)))))) (define (api:dispatch-request dbstruct cmd run-id params) + (db:open-no-sync-db) (case cmd ;;=============================================== ;; READ/WRITE QUERIES ;;=============================================== Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1472,29 +1472,37 @@ ;;====================================================================== ;; no-sync.db - small bits of data to be shared between servers ;;====================================================================== -(define (db:no-sync-db db-in) - (if db-in - db-in - (if *no-sync-db* - *no-sync-db* - (begin - (mutex-lock! *db-access-mutex*) - (let ((dbpath (common:get-db-tmp-area)) - (db (dbfile:open-no-sync-db dbpath))) - (set! *no-sync-db* db) - (mutex-unlock! *db-access-mutex*) - db))))) +(define (db:get-dbsync-path) + (case (rmt:transport-mode) + ((http)(common:get-db-tmp-area)) + ((tcp) (conc *toppath*"/.megatest")) + ((nfs) (conc *toppath*"/.megatest")) + (else "/tmp/dunno-this-gonna-exist"))) + + ;; (define (db:no-sync-db db-in) + ;; (if db-in + ;; db-in + ;; (if *no-sync-db* + ;; *no-sync-db* + ;; (begin + ;; (mutex-lock! *db-access-mutex*) + ;; (let ((dbpath (db:get-dbsync-path)) + ;; (db (dbfile:open-no-sync-db dbpath))) + ;; (assert (sqlite3:database? db) "FATAL: db:no-sync-db failed to open a database") + ;; (set! *no-sync-db* db) + ;; (mutex-unlock! *db-access-mutex*) + ;; db))))) (define (with-no-sync-db proc) - (let* ((db (db:no-sync-db *no-sync-db*))) + (let* ((db (db:open-no-sync-db))) (proc db))) (define (db:open-no-sync-db) - (dbfile:open-no-sync-db (db:dbfile-path))) + (dbfile:open-no-sync-db (db:get-dbsync-path))) (define (db:no-sync-close-db db stmt-cache) (db:safely-close-sqlite3-db db stmt-cache)) Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -394,11 +394,10 @@ "cp "backupfname" "fname))) (dbfile:print-err "WARNING: attempting recovery of file "fname" by running commands:\n" " "cmd) (system cmd))) - (define (dbfile:open-no-sync-db dbpath) (if *no-sync-db* *no-sync-db* (begin (if (not (file-exists? dbpath)) @@ -408,13 +407,18 @@ (init-proc (lambda (db) (if (not db-exists) (begin (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));")) ))) - (db (dbfile:cautious-open-database dbname init-proc 0 "WAL"))) ;; (sqlite3:open-database dbname))) - ;; (sqlite3:execute db "PRAGMA synchronous = 0;") - ;; (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)) ;; done in cautious-open-database + (on-tmp (equal? (car (string-split dbpath "/")) "tmp")) + (db (if on-tmp + (dbfile:cautious-open-database dbname init-proc 0 "WAL") + (sqlite3:open-database dbname)))) + (if on-tmp ;; done in cautious-open-database + (begin + (sqlite3:execute db "PRAGMA synchronous = 0;") + (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)))) (set! *no-sync-db* db) db)))) (define (db:no-sync-set db var val) (sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES (?,?);" var val)) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -104,12 +104,17 @@ ;; I'm turning this off, it may make sense to move it ;; into http-transport-handler (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15))) (begin (debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.") - (server:run *toppath*) - (thread-sleep! 3))) + (case (rmt:transport-mode) + ((http) + (server:run *toppath*) + (thread-sleep! 3)) + (else + (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server + )))) ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -469,10 +469,11 @@ ;; find alive rand from youngest ;; 1. sort by age descending ;; 2. take five ;; 3. check alive, discard if not and repeat ;; first we clean up old server files + (assert (eq? (rmt:transport-mode) 'http) "FATAL: server:run called with rmt:transport-mode="(rmt:transport-mode)) (server:clean-up-old areapath) (let* ((since-last (- (current-seconds) server-last-start)) (server-start-delay 10)) (if ( < (- (current-seconds) server-last-start) 10 ) (begin @@ -568,11 +569,13 @@ sfiles))) ;; would like to eventually get rid of this ;; (define (common:on-homehost?) - (server:choose-server *toppath* 'home?)) + (if (eq? (rmt:transport-mode) 'http) + (server:choose-server *toppath* 'home?) + #t)) ;; there is no homehost for tcp and nfs is always on home so #t should work ;; kind start up of server, wait before allowing another server for a given ;; area to be launched ;; (define (server:kind-run areapath) Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -98,10 +98,11 @@ (cmd-thread #f) (ro-mode #f) (ro-mode-checked #f) (last-access (current-seconds)) (servinf-file #f) + (last-serv-start 0) ) (define (tt:make-remote areapath) (make-tt areapath: areapath)) @@ -134,17 +135,21 @@ pid: pid))) (hash-table-set! (tt-conns ttdat) dbfname conn) ;; verify we can talk to this server (if (tt:ping host port server-id) conn - (begin + (let* ((curr-secs (current-seconds))) ;; rm the (last server) would go here - (server-start-proc) + (if (> (- curr-secs (tt-last-serv-start ttdat)) 30) + (begin + (tt-last-serv-start-set! ttdat curr-secs) + (server-start-proc))) ;; don't try and start server unless 30 sec has gone by since last attempt (thread-sleep! 1) (tt:client-connect-to-server ttdat dbfname run-id testsuite))))) (else - (debug:print-info 0 *default-log-port* "Number of records did not match expected. Bad server info?") + (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname) + (tt-last-serv-start-set! ttdat (current-seconds)) (server-start-proc) (thread-sleep! 1) (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) (define (tt:ping host port server-id) @@ -187,11 +192,11 @@ (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)) (else result))) (else (if (not res) - (begin ;; server likely died + (begin ;; let* ((srvfile (tt-conn-servinf-file ))) ;; server likely died (hash-table-set! (tt-conns ttdat) dbfname #f) (debug:print 0 *default-log-port* "INFO: connection to server broken, reconnecting.") (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)) (assert #f "FATAL: tt:handler received bad data "res))))) (begin