Index: archivemod.scm ================================================================== --- archivemod.scm +++ archivemod.scm @@ -53,11 +53,12 @@ chicken.process-context.posix chicken.sort chicken.string chicken.time chicken.time.posix - + system-information + (prefix base64 base64:) ;; csv-xml directory-utils matchable regex @@ -417,11 +418,11 @@ (rsync-exe (or (configf:lookup *configdat* "archive" "rsync") "rsync")) (print-prefix "Running: ") (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target-patt run-patt "megatest-db")) (archive-dir (if archive-info (cdr archive-info) #f)) (archive-id (if archive-info (car archive-info) -1)) - (home-host (common:get-homehost)) + (home-host (get-host-name)) ;; common:get-homehost)) ;; TODO: Fix this. (archive-time (seconds->std-time-str (current-seconds))) (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time)) (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db")) (dbfile (conc archive-staging-db "/megatest.db"))) (create-directory archive-staging-db #t) Index: launchmod.scm ================================================================== --- launchmod.scm +++ launchmod.scm @@ -371,11 +371,11 @@ ;; (runremote (assoc/default 'runremote cmdinfo)) ;; (transport (assoc/default 'transport cmdinfo)) ;; not used ;; (serverinf (assoc/default 'serverinf cmdinfo)) ;; (port (assoc/default 'port cmdinfo)) (serverurl (assoc/default 'serverurl cmdinfo)) - (homehost (assoc/default 'homehost cmdinfo)) + ;; (homehost (assoc/default 'homehost cmdinfo)) (run-id (assoc/default 'run-id cmdinfo)) (test-id (assoc/default 'test-id cmdinfo)) (target (assoc/default 'target cmdinfo)) (areaname (assoc/default 'areaname cmdinfo)) (itemdat (assoc/default 'itemdat cmdinfo)) @@ -1444,11 +1444,11 @@ (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) ;; (list 'transport (conc *transport-type*)) ;; (list 'serverinf *server-info*) - (list 'homehost (let* ((hhdat (common:get-homehost))) + #;(list 'homehost (let* ((hhdat (common:get-homehost))) (if hhdat (car hhdat) #f))) (list 'serverurl (if *runremote* (remote-server-url *runremote*) @@ -2281,11 +2281,11 @@ ;;====================================================================== ;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage (define (common:watchdog) (debug:print-info 13 *default-log-port* "common:watchdog entered.") - (if (launch:setup) + #;(if (launch:setup) (if (common:on-homehost?) (let ((dbstruct (db:setup #t))) (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct) (cond ((dbr:dbstruct-read-only dbstruct) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -882,11 +882,11 @@ (if (args:any-defined? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun") (debug:print 0 *default-log-port* (string-intersperse (argv) " "))) ;; some switches imply homehost. Exit here if not on homehost ;; - (let ((homehost-required (list "-cleanup-db" "-server"))) + #;(let ((homehost-required (list "-cleanup-db" "-server"))) (if (apply args:any-defined? homehost-required) (if (not (common:on-homehost?)) (for-each (lambda (switch) (if (args:get-arg switch) @@ -2455,11 +2455,11 @@ (if (or (getenv "MT_RUNSCRIPT") (args:get-arg "-repl") (args:get-arg "-load")) (let* ((toppath (launch:setup)) (dbstruct (if (and toppath - (common:on-homehost?)) + #;(common:on-homehost?)) (db:setup #f) #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f))) (if *toppath* (cond ((getenv "MT_RUNSCRIPT") Index: rmtmod.scm ================================================================== --- rmtmod.scm +++ rmtmod.scm @@ -151,210 +151,212 @@ ;;====================================================================== ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; -(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected - - #;(common:telemetry-log (conc "rmt:"(->string cmd)) - payload: `((rid . ,rid) - (params . ,params))) - - (if (> attemptnum 2) - (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) - - (cond - ((> attemptnum 2) (thread-sleep! 0.053)) - ((> attemptnum 10) (thread-sleep! 0.5)) - ((> attemptnum 20) (thread-sleep! 1))) - (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15))) - (begin (server:run *toppath*) (thread-sleep! 3))) - - - ;;DOT digraph megatest_state_status { - ;;DOT ranksep=0; - ;;DOT // rankdir=LR; - ;;DOT node [shape="box"]; - ;;DOT "rmt:send-receive" -> MUTEXLOCK; - ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; } - ;; do all the prep locked under the rmt-mutex - (mutex-lock! *rmt-mutex*) - - ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote - ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. - ;; 3. do the query, if on homehost use local access - ;; - (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value - (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas - (runremote (or area-dat - *runremote*)) - (attemptnum (+ 1 attemptnum)) - (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*))) - - ;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity - ;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"]; - ;; DOT INIT_RUNREMOTE -> MUTEXLOCK; - ;; ensure we have a record for our connection for given area - (if (not runremote) ;; can remove this one. should never get here. - (begin - (set! *runremote* (make-and-init-remote)) - (let* ((server-info (remote-server-info *runremote*))) - (if server-info - (begin - (remote-server-url-set! *runremote* (server:record->url server-info)) - (remote-server-id-set! *runremote* (server:record->id server-info))))) - (set! runremote *runremote*))) ;; new runremote will come from this on next iteration - - ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity - ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; - ;; DOT SET_HOMEHOST -> MUTEXLOCK; - ;; ensure we have a homehost record - (if (not (pair? (remote-hh-dat runremote))) ;; not on homehost - (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little - (remote-hh-dat-set! runremote (common:get-homehost))) - - ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) - (cond - ;;DOT EXIT; - ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" } - ;; give up if more than 150 attempts - ((> attemptnum 150) - (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.") - (exit 1)) - - ;;DOT CASE2 [label="local\nreadonly\nquery"]; - ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2} - ;;DOT CASE2 -> "rmt:open-qry-close-locally"; - ;; readonly mode, read request- handle it - case 2 - ((and readonly-mode - (member cmd api:read-only-queries)) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") - (rmt:open-qry-close-locally cmd 0 params) - ) - - ;;DOT CASE3 [label="write in\nread-only mode"]; - ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3} - ;;DOT CASE3 -> "#f"; - ;; readonly mode, write request. Do nothing, return #f - (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params)) - - ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. - ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. - ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout) - ;; - ;;DOT CASE4 [label="reset\nconnection"]; - ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} - ;;DOT CASE4 -> "rmt:send-receive"; - ;; reset the connection if it has been unused too long - ((and runremote - (remote-conndat runremote) - (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on - (+ (http-transport:server-dat-get-last-access (remote-conndat runremote)) - (remote-server-timeout runremote)))) - (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") - (http-transport:close-connections area-dat: runremote) - (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. - (mutex-unlock! *rmt-mutex*) - (rmt:send-receive cmd rid params attemptnum: attemptnum)) - - ;;DOT CASE5 [label="local\nread"]; - ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; - ;;DOT CASE5 -> "rmt:open-qry-close-locally"; - - ;; on homehost and this is a read - ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required - (cdr (remote-hh-dat runremote)) ;; on homehost - (member cmd api:read-only-queries)) ;; this is a read - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") - (rmt:open-qry-close-locally cmd 0 params)) - - ;;DOT CASE6 [label="init\nremote"]; - ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6}; - ;;DOT CASE6 -> "rmt:send-receive"; - ;; on homehost and this is a write, we already have a server, but server has died - ((and (cdr (remote-hh-dat runremote)) ;; on homehost - (not (member cmd api:read-only-queries)) ;; this is a write - (remote-server-url runremote) ;; have a server - (not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. - (set! *runremote* (make-and-init-remote)) - (let* ((server-info (remote-server-info *runremote*))) - (if server-info - (begin - (remote-server-url-set! *runremote* (server:record->url server-info)) - (remote-server-id-set! *runremote* (server:record->id server-info))))) - (remote-force-server-set! runremote (common:force-server?)) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") - (rmt:send-receive cmd rid params attemptnum: attemptnum)) - - ;;DOT CASE7 [label="homehost\nwrite"]; - ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7}; - ;;DOT CASE7 -> "rmt:open-qry-close-locally"; - ;; on homehost and this is a write, we already have a server - ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required - (cdr (remote-hh-dat runremote)) ;; on homehost - (not (member cmd api:read-only-queries)) ;; this is a write - (remote-server-url runremote)) ;; have a server - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") - (rmt:open-qry-close-locally cmd 0 params)) - - ;;DOT CASE8 [label="force\nserver"]; - ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8}; - ;;DOT CASE8 -> "rmt:open-qry-close-locally"; - ;; on homehost, no server contact made and this is a write, passively start a server - ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required - (cdr (remote-hh-dat runremote)) ;; have homehost - (not (remote-server-url runremote)) ;; no connection yet - (not (member cmd api:read-only-queries))) ;; not a read-only query - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") - (let ((server-info (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call - (if server-info - (begin - (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed - (remote-server-id-set! runremote (server:record->id server-info))) - (if (common:force-server?) - (server:start-and-wait *toppath*) - (server:kind-run *toppath*))) - (remote-force-server-set! runremote (common:force-server?)) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8.1") - (rmt:open-qry-close-locally cmd 0 params))) - - ;;DOT CASE9 [label="force server\nnot on homehost"]; - ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9}; - ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive"; - ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one - (not (remote-conndat runremote))) - (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost - (not (remote-conndat runremote)))) ;; and no connection - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) - (mutex-unlock! *rmt-mutex*) - (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? - (server:start-and-wait *toppath*)) - (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http - (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as - - ;;DOT CASE10 [label="on homehost"]; - ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10}; - ;;DOT CASE10 -> "rmt:open-qry-close-locally"; - ;; all set up if get this far, dispatch the query - ((and (not (remote-force-server runremote)) - (cdr (remote-hh-dat runremote))) ;; we are on homehost - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10") - (rmt:open-qry-close-locally cmd (if rid rid 0) params)) - - ;;DOT CASE11 [label="send_receive"]; - ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11}; - ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"]; - ;;DOT CASE11 -> "RESULT" [label="call succeeded"]; - ;; not on homehost, do server query - (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid))))) - ;;DOT } +(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) + ;; start attemptnum at 1 so the modulo below works as expected + #f) + +;; ;; #;(common:telemetry-log (conc "rmt:"(->string cmd)) +;; ;; payload: `((rid . ,rid) +;; ;; (params . ,params))) +;; ;; +;; ;; (if (> attemptnum 2) +;; ;; (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) +;; ;; +;; ;; (cond +;; ;; ((> attemptnum 2) (thread-sleep! 0.053)) +;; ;; ((> attemptnum 10) (thread-sleep! 0.5)) +;; ;; ((> attemptnum 20) (thread-sleep! 1))) +;; ;; (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15))) +;; ;; (begin (server:run *toppath*) (thread-sleep! 3))) +;; ;; +;; ;; +;; ;; ;;DOT digraph megatest_state_status { +;; ;; ;;DOT ranksep=0; +;; ;; ;;DOT // rankdir=LR; +;; ;; ;;DOT node [shape="box"]; +;; ;; ;;DOT "rmt:send-receive" -> MUTEXLOCK; +;; ;; ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; } +;; ;; ;; do all the prep locked under the rmt-mutex +;; ;; (mutex-lock! *rmt-mutex*) +;; ;; +;; ;; ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote +;; ;; ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. +;; ;; ;; 3. do the query, if on homehost use local access +;; ;; ;; +;; ;; (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value +;; ;; (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas +;; ;; (runremote (or area-dat +;; ;; *runremote*)) +;; ;; (attemptnum (+ 1 attemptnum)) +;; ;; (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*))) +;; ;; +;; ;; ;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity +;; ;; ;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"]; +;; ;; ;; DOT INIT_RUNREMOTE -> MUTEXLOCK; +;; ;; ;; ensure we have a record for our connection for given area +;; ;; (if (not runremote) ;; can remove this one. should never get here. +;; ;; (begin +;; ;; (set! *runremote* (make-and-init-remote)) +;; ;; (let* ((server-info (remote-server-info *runremote*))) +;; ;; (if server-info +;; ;; (begin +;; ;; (remote-server-url-set! *runremote* (server:record->url server-info)) +;; ;; (remote-server-id-set! *runremote* (server:record->id server-info))))) +;; ;; (set! runremote *runremote*))) ;; new runremote will come from this on next iteration +;; ;; +;; ;; ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity +;; ;; ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; +;; ;; ;; DOT SET_HOMEHOST -> MUTEXLOCK; +;; ;; ;; ensure we have a homehost record +;; ;; (if (not (pair? (remote-hh-dat runremote))) ;; not on homehost +;; ;; (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little +;; ;; (remote-hh-dat-set! runremote (common:get-homehost))) +;; ;; +;; ;; ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) +;; ;; (cond +;; ;; ;;DOT EXIT; +;; ;; ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" } +;; ;; ;; give up if more than 150 attempts +;; ;; ((> attemptnum 150) +;; ;; (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.") +;; ;; (exit 1)) +;; ;; +;; ;; ;;DOT CASE2 [label="local\nreadonly\nquery"]; +;; ;; ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2} +;; ;; ;;DOT CASE2 -> "rmt:open-qry-close-locally"; +;; ;; ;; readonly mode, read request- handle it - case 2 +;; ;; ((and readonly-mode +;; ;; (member cmd api:read-only-queries)) +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") +;; ;; (rmt:open-qry-close-locally cmd 0 params) +;; ;; ) +;; ;; +;; ;; ;;DOT CASE3 [label="write in\nread-only mode"]; +;; ;; ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3} +;; ;; ;;DOT CASE3 -> "#f"; +;; ;; ;; readonly mode, write request. Do nothing, return #f +;; ;; (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params)) +;; ;; +;; ;; ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. +;; ;; ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. +;; ;; ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout) +;; ;; ;; +;; ;; ;;DOT CASE4 [label="reset\nconnection"]; +;; ;; ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} +;; ;; ;;DOT CASE4 -> "rmt:send-receive"; +;; ;; ;; reset the connection if it has been unused too long +;; ;; ((and runremote +;; ;; (remote-conndat runremote) +;; ;; (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on +;; ;; (+ (http-transport:server-dat-get-last-access (remote-conndat runremote)) +;; ;; (remote-server-timeout runremote)))) +;; ;; (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") +;; ;; (http-transport:close-connections area-dat: runremote) +;; ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) +;; ;; +;; ;; ;;DOT CASE5 [label="local\nread"]; +;; ;; ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; +;; ;; ;;DOT CASE5 -> "rmt:open-qry-close-locally"; +;; ;; +;; ;; ;; on homehost and this is a read +;; ;; ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required +;; ;; (cdr (remote-hh-dat runremote)) ;; on homehost +;; ;; (member cmd api:read-only-queries)) ;; this is a read +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") +;; ;; (rmt:open-qry-close-locally cmd 0 params)) +;; ;; +;; ;; ;;DOT CASE6 [label="init\nremote"]; +;; ;; ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6}; +;; ;; ;;DOT CASE6 -> "rmt:send-receive"; +;; ;; ;; on homehost and this is a write, we already have a server, but server has died +;; ;; ((and (cdr (remote-hh-dat runremote)) ;; on homehost +;; ;; (not (member cmd api:read-only-queries)) ;; this is a write +;; ;; (remote-server-url runremote) ;; have a server +;; ;; (not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. +;; ;; (set! *runremote* (make-and-init-remote)) +;; ;; (let* ((server-info (remote-server-info *runremote*))) +;; ;; (if server-info +;; ;; (begin +;; ;; (remote-server-url-set! *runremote* (server:record->url server-info)) +;; ;; (remote-server-id-set! *runremote* (server:record->id server-info))))) +;; ;; (remote-force-server-set! runremote (common:force-server?)) +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") +;; ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) +;; ;; +;; ;; ;;DOT CASE7 [label="homehost\nwrite"]; +;; ;; ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7}; +;; ;; ;;DOT CASE7 -> "rmt:open-qry-close-locally"; +;; ;; ;; on homehost and this is a write, we already have a server +;; ;; ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required +;; ;; (cdr (remote-hh-dat runremote)) ;; on homehost +;; ;; (not (member cmd api:read-only-queries)) ;; this is a write +;; ;; (remote-server-url runremote)) ;; have a server +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") +;; ;; (rmt:open-qry-close-locally cmd 0 params)) +;; ;; +;; ;; ;;DOT CASE8 [label="force\nserver"]; +;; ;; ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8}; +;; ;; ;;DOT CASE8 -> "rmt:open-qry-close-locally"; +;; ;; ;; on homehost, no server contact made and this is a write, passively start a server +;; ;; ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required +;; ;; (cdr (remote-hh-dat runremote)) ;; have homehost +;; ;; (not (remote-server-url runremote)) ;; no connection yet +;; ;; (not (member cmd api:read-only-queries))) ;; not a read-only query +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") +;; ;; (let ((server-info (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call +;; ;; (if server-info +;; ;; (begin +;; ;; (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed +;; ;; (remote-server-id-set! runremote (server:record->id server-info))) +;; ;; (if (common:force-server?) +;; ;; (server:start-and-wait *toppath*) +;; ;; (server:kind-run *toppath*))) +;; ;; (remote-force-server-set! runremote (common:force-server?)) +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8.1") +;; ;; (rmt:open-qry-close-locally cmd 0 params))) +;; ;; +;; ;; ;;DOT CASE9 [label="force server\nnot on homehost"]; +;; ;; ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9}; +;; ;; ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive"; +;; ;; ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one +;; ;; (not (remote-conndat runremote))) +;; ;; (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost +;; ;; (not (remote-conndat runremote)))) ;; and no connection +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? +;; ;; (server:start-and-wait *toppath*)) +;; ;; (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http +;; ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as +;; ;; +;; ;; ;;DOT CASE10 [label="on homehost"]; +;; ;; ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10}; +;; ;; ;;DOT CASE10 -> "rmt:open-qry-close-locally"; +;; ;; ;; all set up if get this far, dispatch the query +;; ;; ((and (not (remote-force-server runremote)) +;; ;; (cdr (remote-hh-dat runremote))) ;; we are on homehost +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10") +;; ;; (rmt:open-qry-close-locally cmd (if rid rid 0) params)) +;; ;; +;; ;; ;;DOT CASE11 [label="send_receive"]; +;; ;; ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11}; +;; ;; ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"]; +;; ;; ;;DOT CASE11 -> "RESULT" [label="call succeeded"]; +;; ;; ;; not on homehost, do server query +;; ;; (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid))))) +;; ;; ;;DOT } ;; bunch of small functions factored out of send-receive to make debug easier ;; ;; No Title @@ -1765,12 +1767,12 @@ ;; (begin ;; (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") ;; (exit 1)))) (define (common:run-sync?) - (and (common:on-homehost?) - (args:get-arg "-server"))) + ;; (and (common:on-homehost?) + (args:get-arg "-server")) ;; this one seems to be the general entry point @@ -1789,11 +1791,11 @@ (thread-sleep! 5) (loop (server:check-if-running areapath) (+ try-num 1))))))) (define (make-and-init-remote) - (make-remote hh-dat: (common:get-homehost) + (make-remote ;; hh-dat: (common:get-homehost) server-info: (if *toppath* (server:check-if-running *toppath*) #f) server-timeout: (server:expiration-timeout))) Index: runsmod.scm ================================================================== --- runsmod.scm +++ runsmod.scm @@ -1024,16 +1024,16 @@ ((runs:dat-load-mgmt-function runsdat)) (runs:dat-load-mgmt-function-set! runsdat (lambda () ;; jobtools maxload is useful for where the full Megatest run is done on one machine - (if (and (not (common:on-homehost?)) + (if (and #;(not (common:on-homehost?)) maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues - (if maxhomehostload + #;(if maxhomehostload (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))))) Index: servermod.scm ================================================================== --- servermod.scm +++ servermod.scm @@ -67,11 +67,11 @@ ;;====================================================================== ;; logic for getting homehost. Returns (host . at-home) ;; IF *toppath* is not set, wait up to five seconds trying every two seconds ;; (this is to accomodate the watchdog) ;; -(define (common:get-homehost #!key (trynum 5)) +#;(define (common:get-homehost #!key (trynum 5)) (assert *toppath* "ERROR: common:get-homehost called before launch:setup. This is fatal.") ;; called often especially at start up. use mutex to eliminate collisions (mutex-lock! *homehost-mutex*) (cond (*home-host* @@ -125,17 +125,17 @@ *home-host*)))) ;;====================================================================== ;; am I on the homehost? ;; -(define (common:on-homehost?) +#;(define (common:on-homehost?) (let ((hh (common:get-homehost))) (if hh (cdr hh) #f))) -(define (common:wait-for-homehost-load maxnormload msg) +#;(define (common:wait-for-homehost-load maxnormload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (common:get-homehost))) (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxnormload msg hh))) @@ -150,49 +150,52 @@ (let* ((curr-host (get-host-name)) ;; (attempt-in-progress (server:start-attempted? areapath)) ;; (dot-server-url (server:check-if-running areapath)) (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) - (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) - (target-host (car homehost)) + ;; (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) + ;; (target-host (car homehost)) (testsuite (common:get-area-name)) (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") "")) - (cmdln (conc (common:get-megatest-exe) - " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") - " -daemonize " - "") - ;; " -log " logfile - " -m testsuite:" testsuite - " " profile-mode - )) ;; (conc " >> " logfile " 2>&1 &"))))) - (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? + (cmdln (conc + (common:get-megatest-exe) + " -server " (or (get-host-name) "-") + (if (equal? (configf:lookup *configdat* "server" "daemonize") + "yes") + " -daemonize " + "") + ;; " -log " logfile + " -m testsuite:" testsuite + " " profile-mode + )) + ;; (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") - (thread-start! log-rotate) + ;; (thread-start! log-rotate) ;; host.domain.tld match host? - (if (and target-host + #;(if (and target-host ;; look at target host, is it host.domain.tld or ip address and does it ;; match current ip or hostname (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) (not (equal? curr-ip target-host))) (begin (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) - (setenv "TARGETHOST_LOGF" logfile) - (thread-sleep! (/ (pseudo-random-integer 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time + (setenv "NBFAKE_LOG" logfile) + ;; (thread-sleep! (/ (pseudo-random-integer 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit)) (system (conc "nbfake " cmdln)) - (unsetenv "TARGETHOST_LOGF") - (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) - (thread-join! log-rotate) + (unsetenv "NBFAKE_LOG") + ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) + ;; (thread-join! log-rotate) (pop-directory))) (define (server:record->url servr) (handle-exceptions exn