Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -180,18 +180,18 @@ ))) (if (and start-res ping-res) (begin - (hash-table-set! *runremote* run-id start-res) + (set! *runremote* start-res) ;; (hash-table-set! *runremote* run-id start-res) (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) start-res) (begin ;; login failed but have a server record, clean out the record and try again (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) (case *transport-type* ((http)(http-transport:close-connections run-id))) - (hash-table-delete! *runremote* run-id) + (set! *runremote* #f) ;; (hash-table-delete! *runremote* run-id) (tasks:kill-server-run-id run-id) (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -7,11 +7,11 @@ ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. ;;====================================================================== -(use srfi-1 posix regex-case base64 format dot-locking csv-xml z3 sql-de-lite hostinfo md5 message-digest typed-records) +(use srfi-1 posix regex-case base64 format dot-locking csv-xml z3 sql-de-lite hostinfo md5 message-digest typed-records directory-utils) (require-extension regex posix) (require-extension (srfi 18) extras tcp rpc) (import (prefix sqlite3 sqlite3:)) @@ -90,11 +90,12 @@ (define *db-stats* (make-hash-table)) ;; hash of vectors < count duration-total > (define *db-stats-mutex* (make-mutex)) (define *db-sync-mutex* (make-mutex)) (define *db-multi-sync-mutex* (make-mutex)) (define *db-local-sync* (make-hash-table)) ;; used to record last touch of db -(define *last-db-access* (current-seconds)) ;; update when db is accessed via server +(define *db-last-sync* 0) ;; last time the sync to megatest.db happened +(define *last-db-access* (current-seconds)) ;; update when db is accessed via server (define *db-write-access* #t) (define *task-db* #f) ;; (vector db path-to-db) (define *db-access-allowed* #t) ;; flag to allow access (define *db-access-mutex* (make-mutex)) (define *db-cache-path* #f) @@ -101,11 +102,11 @@ ;; SERVER (define *my-client-signature* #f) (define *transport-type* 'http) (define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg -(define *runremote* (make-hash-table)) ;; if set up for server communication this will hold +(define *runremote* #f) ;; if set up for server communication this will hold (define *max-cache-size* 0) (define *logged-in-clients* (make-hash-table)) (define *server-id* #f) (define *server-info* #f) (define *time-to-exit* #f) @@ -200,10 +201,34 @@ ;; 'old2new 'new2old 'schema) (if (common:version-changed?) (common:set-last-run-version))) + +;; Rotate logs, logic: +;; if > 500k and older than 1 week: +;; remove previous compressed log and compress this log +;; WARNING: This proc operates assuming that it is in the directory above the +;; logs directory you wish to log-rotate. +;; +(define (common:rotate-logs) + (if (not (directory-exists? "logs"))(create-directory "logs")) + (directory-fold + (lambda (file rem) + (if (and (string-match "^.*.log" file) + (> (file-size (conc "logs/" file)) 200000)) + (let ((gzfile (conc "logs/" file ".gz"))) + (if (file-exists? gzfile) + (begin + (debug:print-info 0 *default-log-port* "removing " gzfile) + (delete-file gzfile))) + (debug:print-info 0 *default-log-port* "compressing " file) + (system (conc "gzip logs/" file))))) + '() + "logs")) + + ;; Force a megatest cleanup-db if version is changed and skip-version-check not specified ;; (define (common:exit-on-version-changed) (if (common:version-changed?) @@ -482,17 +507,17 @@ ;;====================================================================== (define (common:legacy-sync-recommended) (or (and (common:get-homehost) (cdr (common:get-homehost))) - (args:get-arg "-runtests") - (args:get-arg "-run") + ;;(args:get-arg "-runtests") + ;;(args:get-arg "-run") (args:get-arg "-server") ;; (args:get-arg "-set-run-status") - (args:get-arg "-remove-runs") + ;;(args:get-arg "-remove-runs") ;; (args:get-arg "-get-run-status") - (args:get-arg "-use-db-cache") ;; feels like a bad idea ... + ;;(args:get-arg "-use-db-cache") ;; feels like a bad idea ... )) (define (common:legacy-sync-required) (configf:lookup *configdat* "setup" "megatest-db")) @@ -536,11 +561,11 @@ legacy-sync) (let loop () ;; sync for filesystem local db writes ;; (let ((start-time (current-seconds))) - (common:sync-to-megatest.db 'local-sync-flags) + ;; (common:sync-to-megatest.db 'local-sync-flags) (if (and debug-mode (> (- start-time last-time) 60)) (begin (set! last-time start-time) (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*)))))) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -196,11 +196,12 @@ dir-writable ))) (if file-write ;; dir-writable (let (;; (lock (obtain-dot-lock fname 1 5 10)) (db (sqlite3:open-database fname))) (sqlite3:set-busy-handler! db (make-busy-timeout 136000)) - (db:set-sync db) ;; (sqlite3:execute db "PRAGMA synchronous = 0;") + ;; (db:set-sync db) + (sqlite3:execute db "PRAGMA synchronous = NORMAL;") (if (not file-exists) (begin (if (string-match "^/tmp/.*" fname) ;; this is a file in /tmp (sqlite3:execute db "PRAGMA journal_mode=WAL;") (print "Creating " fname " in NON-WAL mode.")) @@ -310,15 +311,18 @@ ;; (maindb (dbr:dbstruct-main dbstruct)) ;; (refdb (dbr:dbstruct-refdb dbstruct)) (tmpdb (dbr:dbstruct-tmpdb dbstruct)) (mtdb (dbr:dbstruct-mtdb dbstruct)) (refndb (dbr:dbstruct-refndb dbstruct)) + (start-t (current-seconds)) ;; (runid (dbr:dbstruct-run-id dbstruct)) ) (debug:print-info 4 *default-log-port* "Syncing for run-id: " run-id) - ;; (mutex-lock! *http-mutex*) - (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb))) + (mutex-lock! *db-sync-mutex*) + (db:sync-tables (db:sync-all-tables-list dbstruct) (cons *db-last-sync* "last_update") tmpdb refndb mtdb) + (set! *db-last-sync* start-t) + (mutex-unlock! *db-sync-mutex*))) ;; (if (eq? run-id 0) ;; ;; runid equal to 0 is main.db ;; (if maindb ;; (if (or (not (number? mtime)) ;; (not (number? stime)) @@ -371,11 +375,11 @@ ;; close all opened run-id dbs (define (db:close-all dbstruct) (if (dbr:dbstruct? dbstruct) (begin - (db:sync-touched dbstruct 0 force-sync: #t) + ;; (db:sync-touched dbstruct 0 force-sync: #t) ;; NO. Do not do this here. Instead we rely on a server to be started when there are writes, even if the server itself is not going to be used as a server. (let ((tdb (db:dbdat-get-db (dbr:dbstruct-tmpdb dbstruct))) (mdb (db:dbdat-get-db (dbr:dbstruct-mtdb dbstruct))) (rdb (db:dbdat-get-db (dbr:dbstruct-refndb dbstruct)))) (if tdb (sqlite3:finalize! tdb)) (if mdb (sqlite3:finalize! mdb)) @@ -3155,19 +3159,25 @@ (begin (debug:print-error 0 *default-log-port* "reception failed. Received " msg " but cannot translate it.") msg))) ;; crude reply for when things go awry ((zmq nmsg)(with-input-from-string msg (lambda ()(deserialize)))) (else msg))) ;; rpc + +;; This is to be the big daddy call (define (db:test-set-status-state dbstruct run-id test-id status state msg) (let ((dbdat (db:get-db dbstruct run-id))) (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) (db:general-call dbdat 'set-test-start-time (list test-id))) - (if msg - (db:general-call dbdat 'state-status-msg (list state status msg test-id)) - (db:general-call dbdat 'state-status (list state status test-id))) - (mt:process-triggers run-id test-id state status))) + ;; (if msg + ;; (db:general-call dbdat 'state-status-msg (list state status msg test-id)) + ;; (db:general-call dbdat 'state-status (list state status test-id))) + (db:set-state-status-and-roll-up-items dbstruct run-id test-id #f state status msg) + ;; process the test_data table + (if (and test-id state status (equal? status "AUTO")) + (db:test-data-rollup dbstruct run-id test-id status)) + (mt:process-triggers run-id test-id state status))) ;; state is the priority rollup of all states ;; status is the priority rollup of all completed states ;; ;; if test-name is an integer work off that instead of test-name test-path @@ -3174,11 +3184,11 @@ ;; (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) ;; establish info on incoming test followed by info on top level test (let* ((db (db:dbdat-get-db (dbr:dbstruct-tmpdb dbstruct))) (testdat (if (number? test-name) - (db:get-test-info-by-id dbstruct run-id test-name) + (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id (db:get-test-info dbstruct run-id test-name item-path))) (test-id (db:test-get-id testdat)) (test-name (if (number? test-name) (db:test-get-testname testdat) test-name)) @@ -3207,11 +3217,11 @@ *common:std-statuses* >)) (newstate (if (null? all-curr-states) "NOT_STARTED" (car all-curr-states))) (newstatus (if (null? all-curr-statuses) "n/a" (car all-curr-statuses)))) ;; (print "Setting toplevel to: " newstate "/" newstatus) (db:test-set-state-status-by-id dbstruct run-id tl-test-id newstate newstatus #f))))))) - + (define db:roll-up-pass-fail-counts db:set-state-status-and-roll-up-items) ;; call with state = #f to roll up with out accounting for state/status of this item ;; ;; (define (db:roll-up-pass-fail-counts dbstruct run-id test-name item-path state status) @@ -3315,11 +3325,11 @@ ELSE status END WHERE id=?;") ;; DONE '(test-set-log "UPDATE tests SET final_logf=? WHERE id=?;") ;; DONE ;; '(test-set-rundir-by-test-id "UPDATE tests SET rundir=? WHERE id=?") ;; DONE ;; '(test-set-rundir "UPDATE tests SET rundir=? AND testname=? AND item_path=?;") ;; DONE - '(test-set-rundir-shortdir "UPDATE tests SET rundir=?,shortdir=? WHERE testname=? AND item_path=?;") ;; BROKEN!!! NEEDS run-id + '(test-set-rundir-shortdir "UPDATE tests SET rundir=?,shortdir=? WHERE testname=? AND item_path=? AND run_id=?;") ;; BROKEN!!! NEEDS run-id '(delete-tests-in-state ;; "DELETE FROM tests WHERE state=?;") ;; DONE "UPDATE tests SET state='DELETED' WHERE state=?") '(tests:test-set-toplog "UPDATE tests SET final_logf=? WHERE run_id=? AND testname=? AND item_path='';") '(update-cpuload-diskfree "UPDATE tests SET cpuload=?,diskfree=? WHERE id=?;") ;; DONE '(update-uname-host "UPDATE tests SET uname=?,host=? WHERE id=?;") ;; DONE @@ -3328,12 +3338,11 @@ ;; stuff for roll-up-pass-fail-counts '(update-pass-fail-counts "UPDATE tests SET fail_count=(SELECT count(id) FROM tests WHERE testname=? AND item_path != '' AND status IN ('FAIL','CHECK','INCOMPLETE','ABORT')), pass_count=(SELECT count(id) FROM tests WHERE testname=? AND item_path != '' AND status IN ('PASS','WARN','WAIVED')) WHERE testname=? AND item_path='' AND run_id=?;") ;; DONE ;; BROKEN!!! NEEDS run-id - '(top-test-set "UPDATE tests SET state=? WHERE testname=? AND item_path='';") ;; DONE ;; BROKEN!!! NEEDS run-id - '(top-test-set-running "UPDATE tests SET state='RUNNING' WHERE testname=? AND item_path='';") ;; DONE ;; BROKEN!!! NEEDS run-id + '(top-test-set-running "UPDATE tests SET state='RUNNING' WHERE testname=? AND item_path='' AND run_id=?;") ;; DONE ;; BROKEN!!! NEEDS run-id ;; Might be the following top-test-set-per-pf-counts query could be better based off of something like this: ;; ;; select state,status,count(state) from tests where run_id=59 AND testname='runfirst' group by state,status; @@ -3441,16 +3450,16 @@ sync set-verbosity killserver )) -(define (db:login dbstruct calling-path calling-version run-id client-signature) +(define (db:login dbstruct calling-path calling-version client-signature) (cond ((not (equal? calling-path *toppath*)) (list #f "Login failed due to mismatch paths: " calling-path ", " *toppath*)) - ((not (equal? *run-id* run-id)) - (list #f "Login failed due to mismatch run-id: " run-id ", " *run-id*)) + ;; ((not (equal? *run-id* run-id)) + ;; (list #f "Login failed due to mismatch run-id: " run-id ", " *run-id*)) ((not (equal? megatest-version calling-version)) (list #f "Login failed due to mismatch megatest version: " calling-version ", " megatest-version)) (else (hash-table-set! *logged-in-clients* client-signature (current-seconds)) '(#t "successful login")))) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -111,11 +111,11 @@ ;; This is recursively run by http-transport:run until sucessful ;; (define (http-transport:try-start-server run-id ipaddrstr portnum server-id) (let ((config-hostname (configf:lookup *configdat* "server" "hostname")) (tdbdat (tasks:open-db))) - (debug:print-info 0 *default-log-port* "http-transport:try-start-server run-id=" run-id " ipaddrsstr=" ipaddrstr " portnum=" portnum " server-id=" server-id " config-hostname=" config-hostname) + (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " run-id=" run-id " ipaddrsstr=" ipaddrstr " portnum=" portnum " server-id=" server-id " config-hostname=" config-hostname) (handle-exceptions exn (begin (print-error-message exn) (if (< portnum 64000) @@ -260,11 +260,11 @@ exn (begin (set! success #f) (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (hash-table-delete! *runremote* run-id) + (set! *runremote* #f) ;; (hash-table-delete! *runremote* run-id) ;; Killing associated server to allow clean retry.") ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? (mutex-unlock! *http-mutex*) ;;; (signal (make-composite-condition ;;; (make-property-condition 'commfail 'message "failed to connect to server"))) @@ -307,11 +307,11 @@ 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server"))))))) ;; careful closing of connections stored in *runremote* ;; (define (http-transport:close-connections run-id) - (let* ((server-dat (hash-table-ref/default *runremote* run-id #f))) + (let* ((server-dat *runremote*)) ;; (hash-table-ref/default *runremote* run-id #f))) (if (vector? server-dat) (let ((api-dat (http-transport:server-dat-get-api-uri server-dat))) (close-connection! api-dat) #t) #f))) @@ -398,11 +398,13 @@ (if *dbstruct-db* (let ((start-time (current-milliseconds)) (sync-time #f) (rem-time #f)) (condition-case - (db:sync-touched *dbstruct-db* *run-id* force-sync: #t) + ;; (if (and (member (mutex-state *db-sync-mutex*) '(abandoned not-abandoned)) + ;; (> (- (current-seconds) *db-last-sync*) 5)) ;; if not currently being synced nor recently synced + (db:sync-touched *dbstruct-db* *run-id* force-sync: #t) ;; usually done in the watchdog, not here. ((sync-failed)(cond ((> bad-sync-count 10) ;; time to give up (http-transport:server-shutdown server-id port)) (else ;; (> bad-sync-count 0) ;; we've had a fail or two, delay and loop (thread-sleep! 5) @@ -428,11 +430,12 @@ (if (equal? new-server-id server-id) (begin (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *dbstruct-db* (db:setup)) ;; run-id)) - (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")) + (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") + (server:write-dotserver *toppath* (conc iface ":" port))) (begin ;; gotta exit nicely (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") (http-transport:server-shutdown server-id port)))))) (if (< count 1) ;; 3x3 = 9 secs aprox @@ -485,11 +488,11 @@ (define (http-transport:server-shutdown server-id port) (let ((tdbdat (tasks:open-db))) (debug:print-info 0 *default-log-port* "Starting to shutdown the server.") ;; need to delete only *my* server entry (future use) - (if *dbstruct-db* (db:sync-touched *dbstruct-db* *run-id* force-sync: #t)) + ;; (if *dbstruct-db* (db:sync-touched *dbstruct-db* *run-id* force-sync: #t)) ;; handled in the watchdog only (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up ;; ;; start_shutdown ;; (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") @@ -510,10 +513,12 @@ (/ *total-non-write-delay* *number-non-write-queries*)) " ms") (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete") + ;; if the .server file contained :myport then we can remove it + (server:remove-dotserver-file *toppath* port) (exit))) ;; all routes though here end in exit ... ;; ;; start_server? Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -896,11 +896,11 @@ (lnkpathf (conc lnkpath (if not-iterated "" "/") item-path)) (lnktarget (conc lnkpath "/" item-path))) ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical ;; rundir shortdir - (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path) + (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path run-id) (debug:print 2 *default-log-port* "INFO:\n lnkbase=" lnkbase "\n lnkpath=" lnkpath "\n toptest-path=" toptest-path "\n test-path=" test-path) (if (not (file-exists? linktree)) (begin (debug:print 0 *default-log-port* "WARNING: linktree did not exist! Creating it now at " linktree) @@ -969,11 +969,11 @@ (rmt:general-call 'test-set-rundir-shortdir run-id lnkpath (if (file-exists? lnkpath) ;; (resolve-pathname lnkpath) (common:nice-path lnkpath) lnkpath) - testname "") + testname "" run-id) ;; (rmt:general-call 'test-set-rundir run-id lnkpath testname "") ;; toptest-path) (if (or (not curr-test-path) (not (directory-exists? toptest-path))) (begin (debug:print-info 2 *default-log-port* "Creating " toptest-path " and link " lnkpath) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -343,42 +343,11 @@ (let ((targ (or (args:get-arg "-reqtarg")(args:get-arg "-target")))) (if targ (setenv "MT_TARGET" targ))) ;; The watchdog is to keep an eye on things like db sync etc. ;; -(define *watchdog* - (make-thread - (lambda () - (thread-sleep! 0.05) ;; delay for startup - (let ((legacy-sync (common:legacy-sync-required)) - (debug-mode (debug:debug-mode 1)) - (last-time (current-seconds))) - (if (common:legacy-sync-recommended) - (let loop () - ;; sync for filesystem local db writes - ;; - (let ((start-time (current-seconds))) - ;; disabling for now (if legacy-sync (common:sync-to-megatest.db #f)) - (if (and debug-mode - (> (- start-time last-time) 60)) - (begin - (set! last-time start-time) - (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*)))))) - - ;; keep going unless time to exit - ;; - (if (not *time-to-exit*) - (let delay-loop ((count 0)) - (if (and (not *time-to-exit*) - (< count 11)) ;; aprox 5-6 seconds - (begin - (thread-sleep! 1) - (delay-loop (+ count 1)))) - (loop))) - (if (common:low-noise-print 30) - (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*))))) - "Watchdog thread"))) +(define *watchdog* (make-thread common:watchdog "Watchdog thread")) (thread-start! *watchdog*) (if (args:get-arg "-log") (let ((oup (open-output-file (args:get-arg "-log")))) @@ -673,13 +642,13 @@ (if out-file (close-output-port out-port)) (exit) ;; yes, bending the rules here - need to exit since this is a utility )) (if (args:get-arg "-ping") - (let* ((run-id (string->number (args:get-arg "-run-id"))) + (let* (;; (run-id (string->number (args:get-arg "-run-id"))) (host:port (args:get-arg "-ping"))) - (server:ping run-id host:port))) + (server:ping host:port))) ;;====================================================================== ;; Capture, save and manipulate environments ;;====================================================================== @@ -2010,11 +1979,11 @@ ;;====================================================================== ;; Exit and clean up ;;====================================================================== -(if *runremote* (close-all-connections!)) +(if *runremote* (close-all-connections!)) ;; for http-client (if (not *didsomething*) (debug:print 0 *default-log-port* help)) (set! *time-to-exit* #t) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -40,11 +40,11 @@ ;; if a server is either running or in the process of starting call client:setup ;; else return #f to let the calling proc know that there is no server available ;; (define (rmt:get-connection-info run-id) - (let ((cinfo (hash-table-ref/default *runremote* run-id #f))) + (let ((cinfo *runremote*)) ;; (hash-table-ref/default *runremote* run-id #f))) (if cinfo cinfo (if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id) (client:setup run-id) #f)))) @@ -54,10 +54,23 @@ ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected ;; clean out old connections ;; (mutex-lock! *db-multi-sync-mutex*) + + ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in *runremote* + ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. + ;; 3. do the query, if on homehost use local access + + (if (and #f ;; FORCE NO GO FOR RIGHT NOW + (not *runremote*) ;; we trust *runremote* to reflect that a server was found previously + (not (member cmd api:read-only-queries))) ;; we don't trust so much the list of write queries + (let ((serverconn (server:check-if-running *toppath*))) + (if serverconn + (set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed + (server:kind-run *toppath*)))) + (rmt:open-qry-close-locally cmd (if rid rid 0) params)) ;; (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin ;; (for-each ;; (lambda (run-id) @@ -217,15 +230,11 @@ res)) (define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5)) (let* ((qry-is-write (not (member cmd api:read-only-queries))) (db-file-path (db:dbfile-path)) ;; 0)) - (dbstruct-local (if *dbstruct-db* - *dbstruct-db* - (let* ((dbstruct (db:setup))) ;; make-dbr:dbstruct path: dbdir local: #t))) - (set! *dbstruct-db* dbstruct) - dbstruct))) + (dbstruct-local (db:setup)) ;; make-dbr:dbstruct path: dbdir local: #t))) (read-only (not (file-write-access? db-file-path))) (start (current-milliseconds)) (resdat (if (not (and read-only qry-is-write)) (api:execute-requests dbstruct-local (vector (symbol->string cmd) params)) (vector #t '()))) @@ -243,11 +252,11 @@ (begin (debug:print-error 0 *default-log-port* "too many retries in rmt:open-qry-close-locally, giving up") #f)) (begin ;; (rmt:update-db-stats run-id cmd params duration) - ;; mark this run as dirty if this was a write + ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it (if qry-is-write (let ((start-time (current-seconds))) (mutex-lock! *db-multi-sync-mutex*) (hash-table-set! *db-local-sync* (or run-id 0) start-time) ;; the oldest "write" (mutex-unlock! *db-multi-sync-mutex*))))) @@ -304,13 +313,13 @@ (rmt:send-receive 'login run-id (list *toppath* megatest-version run-id *my-client-signature*))) ;; This login does no retries under the hood - it acts a bit like a ping. ;; Deprecated for nmsg-transport. ;; -(define (rmt:login-no-auto-client-setup connection-info run-id) - (case *transport-type* - ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login run-id (list *toppath* megatest-version run-id *my-client-signature*))) +(define (rmt:login-no-auto-client-setup connection-info) + (case *transport-type* ;; run-id of 0 is just a placeholder + ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login 0 (list *toppath* megatest-version *my-client-signature*))) ;;((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*))) )) ;; hand off a call to one of the db:queries statements ;; added run-id to make looking up the correct db possible Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -104,43 +104,28 @@ ;; Given a run id start a server process ### NOTE ### > file 2>&1 ;; if the run-id is zero and the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; -(define (server:run run-id) +(define (server:run areapath) ;; areapath is ignored for now. (let* ((curr-host (get-host-name)) (curr-ip (server:get-best-guess-address curr-host)) + (curr-pid (current-process-id)) (target-host (configf:lookup *configdat* "server" "homehost" )) (testsuite (common:get-testsuite-name)) - (logfile (conc *toppath* "/logs/" run-id ".log")) + (logfile (conc *toppath* "/logs/server-" curr-pid ".log")) (cmdln (conc (common:get-megatest-exe) - " -server " (or target-host "-") " -run-id " run-id (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") + " -server " (or target-host "-") " -run-id " 0 (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") (conc " -daemonize -log " logfile) "") - " -m testsuite:" testsuite))) ;; (conc " >> " logfile " 2>&1 &"))))) - (debug:print 0 *default-log-port* "INFO: Starting server (" cmdln ") as none running ...") + " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) + (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) + ;; we want the remote server to start in *toppath* so push there (push-directory *toppath*) - (if (not (directory-exists? "logs"))(create-directory "logs")) - - ;; Rotate logs, logic: - ;; if > 500k and older than 1 week: - ;; remove previous compressed log and compress this log - ;; - (directory-fold - (lambda (file rem) - (if (and (string-match "^.*.log" file) - (> (file-size (conc "logs/" file)) 200000)) - (let ((gzfile (conc "logs/" file ".gz"))) - (if (file-exists? gzfile) - (begin - (debug:print-info 0 *default-log-port* "removing " gzfile) - (delete-file gzfile))) - (debug:print-info 0 *default-log-port* "compressing " file) - (system (conc "gzip logs/" file))))) - '() - "logs") - + (debug:print 0 *default-log-port* "INFO: Starting server (" cmdln ") as none running ...") + (thread-start! log-rotate) + ;; host.domain.tld match host? (if (and target-host ;; look at target host, is it host.domain.tld or ip address and does it ;; match current ip or hostname (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) @@ -152,11 +137,11 @@ (setenv "TARGETHOST_LOGF" logfile) (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) - ;; (system cmdln) + (thread-join! log-rotate) (pop-directory))) (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) @@ -163,91 +148,114 @@ (set! *my-client-signature* sig) *my-client-signature*))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched -(define (server:kind-run run-id) - (let ((last-run-time (hash-table-ref/default *server-kind-run* run-id #f))) +(define (server:kind-run areapath) + (let ((last-run-time (hash-table-ref/default *server-kind-run* areapath #f))) (if (or (not last-run-time) (> (- (current-seconds) last-run-time) 30)) (begin - (server:run run-id) - (hash-table-set! *server-kind-run* run-id (current-seconds)))))) + (server:run areapath) + (hash-table-set! *server-kind-run* areapath (current-seconds)))))) ;; The generic run a server command. Dispatches the call to server 0 if run-id != 0 ;; (define (server:try-running run-id) (if (eq? run-id 0) (server:run run-id) (rmt:start-server run-id))) -(define (server:check-if-running run-id) - (let ((tdbdat (tasks:open-db))) - (let loop ((server (tasks:get-server (db:delay-if-busy tdbdat) run-id)) - (trycount 0)) - (if server - ;; note: client:start will set *runremote*. this needs to be changed - ;; also, client:start will login to the server, also need to change that. - ;; - ;; client:start returns #t if login was successful. - ;; - (let ((res (case *transport-type* - ((http)(server:ping-server run-id - (tasks:hostinfo-get-interface server) - (tasks:hostinfo-get-port server))) - ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) - ;; (tasks:hostinfo-get-port server) - ;; timeout: 2)) - ))) - ;; if the server didn't respond we must remove the record +(define (server:read-dotserver areapath) + (let ((dotfile (conc areapath "/.server"))) + (if (and (file-exists? dotfile) + (file-read-access? dotfile)) + (with-input-from-file + dotfile + (lambda () + (read-line))) + #f))) + +;; write a .server file in *toppath* with hostport +;; return #t on success, #f otherwise +;; +(define (server:write-dotserver areapath hostport) + (let ((lock-file (conc areapath "/.server.lock")) + (server-file (conc areapath "/.server"))) + (if (common:simple-file-lock lock-file) + (let ((res (handle-exceptions + exn + #f ;; failed for some reason, for the moment simply return #f + (with-output-to-file server-file + (lambda () + (print hostport))) + #t))) + (common:simple-file-release-lock lock-file) + res) + #f))) + +(define (server:remove-dotserver-file areapath hostport) + (let ((dotserver (server:read-dotserver areapath)) + (server-file (conc areapath "/.server")) + (lock-file (conc areapath "/.server.lock"))) + (if (string-match (conc ".*:" hostport "$") dotserver) ;; port matches, good enough info to decide to remove the file + (if (common:simple-file-lock lock-file) + (begin + (handle-exceptions + exn + #f + (delete-file* server-file)) + (common:simple-file-release-lock lock-file)))))) + +;; no longer care if multiple servers are started by accident. older servers will drop off in time. +;; +(define (server:check-if-running areapath) + (let* ((dotserver (server:read-dotserver areapath))) ;; tdbdat (tasks:open-db))) + (if dotserver + (let* ((res (case *transport-type* + ((http)(server:ping-server dotserver)) + ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) + ))) (if res - #t - (begin - (debug:print-info 0 *default-log-port* "server at " server " not responding, removing record") - (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id - " server:check-if-running") - res))) - #f)))) + dotserver + #f)) + #f))) ;; called in megatest.scm, host-port is string hostname:port ;; -(define (server:ping run-id host:port) +;; NOTE: This is NOT called directly from clients as not all transports support a client running +;; in the same process as the server. +;; +(define (server:ping host:port) (let ((tdbdat (tasks:open-db))) (let* ((host-port (let ((slst (string-split host:port ":"))) (if (eq? (length slst) 2) (list (car slst)(string->number (cadr slst))) #f))) - (toppath (launch:setup)) - (server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat) run-id) #f))) - (if (not run-id) - (begin - (debug:print-error 0 *default-log-port* "must specify run-id when doing ping, -run-id n") - (print "ERROR: No run-id") - (exit 1)) - (if (and (not host-port) - (not server-db-dat)) - (begin - (print "ERROR: bad host:port") - (exit 1)) - (let* ((iface (if host-port (car host-port) (tasks:hostinfo-get-interface server-db-dat))) - (port (if host-port (cadr host-port)(tasks:hostinfo-get-port server-db-dat))) - (server-dat (http-transport:client-connect iface port)) - (login-res (rmt:login-no-auto-client-setup server-dat run-id))) - (if (and (list? login-res) - (car login-res)) - (begin - (print "LOGIN_OK") - (exit 0)) - (begin - (print "LOGIN_FAILED") - (exit 1))))))))) + (toppath (launch:setup))) + (if (not host-port) + (begin + (print "ERROR: bad host:port") + (exit 1)) + (let* ((iface (if host-port (car host-port) (tasks:hostinfo-get-interface server-db-dat))) + (port (if host-port (cadr host-port)(tasks:hostinfo-get-port server-db-dat))) + (server-dat (http-transport:client-connect iface port)) + (login-res (rmt:login-no-auto-client-setup server-dat))) + (if (and (list? login-res) + (car login-res)) + (begin + (print "LOGIN_OK") + (exit 0)) + (begin + (print "LOGIN_FAILED") + (exit 1)))))))) ;; run ping in separate process, safest way in some cases ;; -(define (server:ping-server run-id iface port) +(define (server:ping-server ifaceport) (with-input-from-pipe - (conc (common:get-megatest-exe) " -run-id " run-id " -ping " (conc iface ":" port)) + (conc (common:get-megatest-exe) " -ping " ifaceport) (lambda () (let loop ((inl (read-line)) (res "NOREPLY")) (if (eof-object? inl) (case (string->symbol res) @@ -258,16 +266,12 @@ (define (server:login toppath) (lambda (toppath) (set! *last-db-access* (current-seconds)) (if (equal? *toppath* toppath) - (begin - ;; (debug:print-info 2 *default-log-port* "login successful") - #t) - (begin - ;; (debug:print-info 2 *default-log-port* "login failed") - #f)))) + #t + #f))) (define (server:get-timeout) (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (string->number tmo)) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -397,17 +397,17 @@ ;; update the primary record IF state AND status are defined (if (and state status) (begin (rmt:test-set-status-state run-id test-id real-status state (if waived waived comment)) - (mt:process-triggers run-id test-id state real-status) + ;; (mt:process-triggers run-id test-id state real-status) ;; triggers are called in test-set-status-state )) ;; if status is "AUTO" then call rollup (note, this one modifies data in test ;; run area, it does remote calls under the hood. - (if (and test-id state status (equal? status "AUTO")) - (rmt:test-data-rollup run-id test-id status)) + ;; (if (and test-id state status (equal? status "AUTO")) + ;; (rmt:test-data-rollup run-id test-id status)) ;; add metadata (need to do this way to avoid SQL injection issues) ;; :first_err ;; (let ((val (hash-table-ref/default otherdat ":first_err" #f))) @@ -481,13 +481,12 @@ (lockf (conc outputfilename ".lock"))) (let loop ((have-lock (common:simple-file-lock lockf))) (if have-lock (let ((script (configf:lookup *configdat* "testrollup" test-name))) (print "Obtained lock for " outputfilename) - ;; (rmt:top-test-set-per-pf-counts run-id test-name) (rmt:roll-up-pass-fail-counts run-id test-name "" #f #f #f) - (rmt:top-test-set-per-pf-counts run-id test-name) + ;; (rmt:test-set-status-state run-id test-name #f #f #f) ;; (rmt:top-test-set-per-pf-counts run-id test-name) (if script (system (conc script " > " outputfilename " & ")) (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename)) (common:simple-file-release-lock lockf) (change-directory orig-dir)