Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -371,11 +371,13 @@ ((add-var) (apply db:add-var dbstruct params)) ((insert-run) (apply db:insert-run dbstruct params)) ;; STEPS - ((teststep-set-status!) (apply db:teststep-set-status! dbstruct params)) + ((teststep-set-status!) + ;; (apply db:teststep-set-status! dbstruct params)) + (db:add-cached-write dbstruct db:teststep-set-status! run-id params)) ((delete-steps-for-test!) (apply db:delete-steps-for-test! dbstruct params)) ;; TEST DATA ((test-data-rollup) (apply db:test-data-rollup dbstruct params)) ((csv->test-data) (apply db:csv->test-data dbstruct params)) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -3112,11 +3112,11 @@ (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir ", exn=" exn) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (common:max (map (lambda (filen) (file-modification-time filen)) - (glob (conc dbdir "/*.db*")))))) + (cons (conc dbdir "/main.db") (glob (conc dbdir "/?.db"))))))) (define (dashboard:monitor-changed? commondat tabdat) (let* ((run-update-time (current-seconds)) (monitor-db-path (dboard:tabdat-monitor-db-path tabdat)) (monitor-modtime (if (and monitor-db-path (common:file-exists? monitor-db-path)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -2739,12 +2739,10 @@ "INSERT OR REPLACE into test_steps (test_id,stepname,state,status,event_time,comment,logfile) VALUES(?,?,?,?,?,?,?);" test-id teststep-name state-in status-in (current-seconds) (if comment comment "") (if logfile logfile ""))))) - - (define (db:delete-steps-for-test! dbstruct run-id test-id) ;; TODO: figure out why status is the key field rather than state (note: CONSTRAINT test_steps_constraint UNIQUE (test_id,stepname,state) ) (db:with-db dbstruct run-id @@ -4247,10 +4245,56 @@ (conc (current-directory) "/" outputfile))) results) ;; brutal clean up (dbfile:add-dbdat dbstruct #f dbdat) (system "rm -rf tempdir"))) + +;;====================================================================== +;; cached writes stuff +;;====================================================================== + +(define (db:add-cached-write dbstruct proc run-id params) + (debug:print 0 *default-log-port* "Adding cached write for run-id "run-id" params " params) + (mutex-lock! *cached-writes-mutex*) + (let* ((hkey (cons dbstruct run-id)) + (cached-writes-queue (hash-table-ref/default *cached-writes-queues* hkey '()))) + (hash-table-set! *cached-writes-queues* hkey (cons (list proc params) cached-writes-queue))) + (if (not *cached-writes-flag*) + (begin + (set! *cached-writes-flag* #t) + (thread-start! (make-thread + (lambda () + (debug:print 0 *default-log-port* "process cached writes thread started.") + (thread-sleep! 1) + (db:process-cached-writes-queue)))))) + (mutex-unlock! *cached-writes-mutex*)) + +(define (db:process-cached-writes-queue) + (mutex-lock! *cached-writes-mutex*) + (hash-table-for-each + *cached-writes-queues* + (lambda (hkey writes-list) + (let* ((dbstruct (car hkey)) + (run-id (cdr hkey))) + (debug:print 0 *default-log-port* "Processing "(length writes-list)" cached writes for run "run-id) + (db:with-db + dbstruct + run-id + #t + (lambda (dbdat db) + (sqlite3:with-transaction + db + (lambda () + (for-each + (lambda (queued-write) + (match queued-write + ((proc params)(apply proc dbstruct params)) + (else (assert #f "BAD queued-write")))) + writes-list))) + (hash-table-delete! *cached-writes-queues* hkey)))))) + (set! *cached-writes-flag* #f) + (mutex-unlock! *cached-writes-mutex*)) ;; (db:extract-ods-file db "outputfile.ods" '(("sysname" "%")("fsname" "%")("datapath" "%")) "%") ;;====================================================================== ;; moving watch dogs here due to dependencies Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -1579,7 +1579,17 @@ ;; (db:hoh-set! stmt-cache db stmt newstmth) (hash-table-set! stmt-cache stmt newstmth) newstmth)))) (mutex-unlock! *get-cache-stmth-mutex*) result)) + +;;====================================================================== +;; cached writes - run list of procs inside transaction +;; NOTE: this only works because we have once database per process +;;====================================================================== + +(define *cached-writes-mutex* (make-mutex)) +(define *cached-writes-flag* #f) +(define *cached-writes-queues* (make-hash-table)) ;; dbstruct->list of writes + ) Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -119,12 +119,14 @@ (loop (- count 1))) (begin (debug:print-info 0 *default-log-port* "dbmod:with-db, database is busy, giving up.") (exit 1)))) (exn () - (dbfile:print-err exn "ERROR: Unknown error with database for run-id "run-id", message: " - ((condition-property-accessor 'exn 'message) exn)) + (dbfile:print-err exn "ERROR: Unknown error with db for run-id " + run-id", message: " + ((condition-property-accessor 'exn 'message) exn) + ", details: "(condition->list exn)) (exit 2)))))) (if use-mutex (mutex-unlock! *db-with-db-mutex*)) res))) (define (db:with-db dbstruct run-id w/r proc . params) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. ;; (declare (unit megatest-version)) -(define megatest-version 1.8017) +(define megatest-version 1.8019) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -999,14 +999,11 @@ (format #t fmtstr "DB" "host:port" "PID" "age" "last mod" "state") (for-each (lambda (dbfile) (let* ( (dbfname (conc (pathname-file dbfile) ".db")) - (sfiles (tt:find-server *toppath* dbfname)) ) - (for-each - (lambda (sfile) (let ( (sinfos (tt:get-server-info-sorted ttdat dbfname)) ) (for-each (lambda (sinfo) @@ -1028,13 +1025,10 @@ ) ) sinfos ) ) - ) - sfiles - ) ) ) dbfiles ) (set! *didsomething* #t) @@ -1079,11 +1073,11 @@ (killed (system (conc "ssh " host " kill " pid " > /dev/null"))) (dummy2 (sleep 1)) (state (if (> (system (conc "ssh " host " ps " pid " > /dev/null")) 0) "dead" "alive")) ) (format #t fmtstr db (conc host ":" port) pid age last-mod state) - (system (conc "rm " sfile)) + (delete-file* sfile) ) ) sinfos ) ) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -72,34 +72,11 @@ ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.") - - (if (not (eq? (rmt:transport-mode) 'nfs)) - (begin - (if (> attemptnum 2) - (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) - - (cond - ((> attemptnum 2) (thread-sleep! 0.05)) - ((> attemptnum 10) (thread-sleep! 0.5)) - ((> attemptnum 20) (thread-sleep! 1))) - - ;; I'm turning this off, it may make sense to move it - ;; into http-transport-handler - (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15))) - (begin - (debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.") - (case (rmt:transport-mode) - ((http) - (server:run *toppath*) - (thread-sleep! 3)) - (else - (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server - )))))) - + ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -232,10 +232,11 @@ ;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest") ;; (define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe) (debug:print 2 *default-log-port* "tt:handler cmd: " cmd " run-id: " run-id " attemptnum: " attemptnum) ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now. + ;; connect-to-server will start a server if needed. (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f))) (if conn ;; have connection, call the server (let* ((res (tt:send-receive ttdat conn cmd run-id params))) ;; res is (status errmsg result meta) @@ -260,11 +261,11 @@ (tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn)) result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) (else result))) (else ;; did not receive properly formated result - (if (not res) ;; tt:handler is telling us that communication failed + (if (not res) ;; tt:send-receive telling us that communication failed (let* ((host (tt-conn-host conn)) (port (tt-conn-port conn)) ;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db (pid (tt-conn-pid conn)) ;;(servinf (tt-conn-servinf-file conn))) @@ -293,11 +294,11 @@ ;; try again (thread-sleep! 0.25) ;; dunno, I think this needs to be here (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) )))) (begin ;; no server file, delay and try again - (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf) + (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ") (thread-sleep! 0.5) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))) (begin ;; this case is where res is malformed. Probably should abort (assert #f "FATAL: tt:handler received bad data "res) ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.") @@ -476,11 +477,12 @@ ;; is there already a server for this dbfile? Then exit. (debug:print 2 *default-log-port* "tt:start-server: " dbfname-in) (let* ((ttdat (make-tt areapath: areapath)) (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead - (if (> (length servers) 4) + (debug:print 0 *default-log-port* "Found " (length servers) " already running for " dbfname) + (if (> (length servers) 0) (begin (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.") (exit)) (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys))) (tt-handler-set! ttdat (handler dbstruct)) @@ -537,12 +539,11 @@ (dbfile:with-no-sync-db nosyncdbpath (lambda (db) (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct))) (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname) (db:no-sync-del! db dbfname) - #;(if dbtmpname - (delete-file dbtmpname)))))))) + )))))) (set! *server-info* ttdat) (let loop ((count 0)) (if (> count 240) (begin (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.") @@ -585,10 +586,11 @@ (if (and res (common:low-noise-print 120 "top server message")) (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for " dbfname" on "(tt-host ttdat)":"(tt-port ttdat))) res)) (else + ;; wrong servinfo file (debug:print-info 0 *default-log-port* "I'm not the lead server: "servers) (let* ((leadsrv (car servers))) (match leadsrv ((host port startseconds server-id pid dbfname servinfofile) (let* ((result (tt:timed-ping host port server-id)) @@ -713,12 +715,31 @@ ;; if more than one, wait one second and look again ;; future: ping oldest, if alive remove other : files ;; (define (tt:find-server areapath dbfname) (let* ((servdir (tt:get-servinfo-dir areapath)) - (sfiles (glob (conc servdir"/*:"dbfname)))) - sfiles)) + (sfiles (glob (conc servdir"/*:"dbfname))) + (good-files '())) + (for-each + (lambda (sfile) + (let* ((sinfo (tt:server-get-info sfile)) + (host (list-ref sinfo 0)) + (port (list-ref sinfo 1)) + (server-id (list-ref sinfo 3)) + (pid (list-ref sinfo 4)) + (status (system (conc "ssh " host " ps " pid " > /dev/null"))) + ) + (if (= status 0) + (set! good-files (cons sfile good-files)) + (delete-file* sfile) + ) + ) + ) + sfiles + ) + (debug:print-info 2 *default-log-port* "tt:find-server: good-files: " good-files " sfiles: " sfiles) + good-files)) ;; given a path to a server info file return: host port startseconds server-id pid dbfname logf ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 ;;