Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -108,11 +108,11 @@ (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) (thread-sleep! 1) (client:setup-http areapath remaining-tries: (- remaining-tries 1)) ))) (begin ;; no server registered - (server:kind-run areapath) + ;; (server:kind-run areapath) + (server:start-and-wait areapath) (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. - (server:start-and-wait areapath) (client:setup-http areapath remaining-tries: (- remaining-tries 1))))))))) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -150,11 +150,12 @@ (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) - (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds + (server-timeout (or (server:get-timeout) 100)) + (force-server #f)) ;; default to 100 seconds ;; launching and hosts (defstruct host (reachable #f) (last-update 0) @@ -921,11 +922,35 @@ tags-testpatt) ((and (equal? args-testpatt "%") rtestpatt) (debug:print-info 0 *default-log-port* "testpatt defined in "testpatt-key" from runconfigs: " rtestpatt) rtestpatt) (else args-testpatt)))) - + + + +(define (common:false-on-exception thunk #!key (message #f)) + (handle-exceptions exn + (begin + (if message + (debug:print-info 0 *default-log-port* message)) + #f) (thunk) )) + +(define (common:file-exists? path-string) + ;; this avoids stack dumps in the case where + + ;;;; TODO: catch permission denied exceptions and emit appropriate warnings, eg: system error while trying to access file: "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/randy-slow/reproduce/q... + (common:false-on-exception (lambda () (file-exists? path-string)) + message: (conc "Unable to access path: " path-string) + )) + +(define (common:directory-exists? path-string) + ;;;; TODO: catch permission denied exceptions and emit appropriate warnings, eg: system error while trying to access file: "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/randy-slow/reproduce/q... + (common:false-on-exception (lambda () (directory-exists? path-string)) + message: (conc "Unable to access path: " path-string) + )) + + (define (common:get-linktree) (or (getenv "MT_LINKTREE") (if *configdat* (configf:lookup *configdat* "setup" "linktree")))) @@ -1013,10 +1038,22 @@ ;; (define (common:use-cache?) (not (or (args:get-arg "-no-cache") (and *configdat* (equal? (configf:lookup *configdat* "setup" "use-cache") "no"))))) + +;; force use of server? +;; +(define (common:force-server?) + (let ((force-setting (configf:lookup "server" "force")) + (force-type (if force-setting (string->symbol force-setting) #f))) + (case force-type + ((#f) #f) + ((always) #t) + ((test) (if (args:get-arg "-execute") ;; we are in a test + #t + #f))))) ;;====================================================================== ;; M I S C L I S T S ;;====================================================================== Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -770,11 +770,11 @@ (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config (rundir (if (and runname target linktree)(conc linktree "/" target "/" runname) #f)) (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) - (cancreate (and rundir (file-exists? rundir)(file-write-access? rundir))) + (cancreate (and rundir (common:file-exists? rundir)(file-write-access? rundir))) (cxt (hash-table-ref/default *contexts* toppath #f))) ;; create our cxt for this area if it doesn't already exist (if (not cxt)(hash-table-set! *contexts* toppath (make-cxt))) @@ -783,11 +783,11 @@ (cond ;; data was read and cached and available in *configstatus*, toppath has already been set ((eq? *configstatus* 'fulldata) *toppath*) ;; if mtcachef exists just read it, however we need to assume toppath is available in $MT_RUN_AREA_HOME - ((and mtcachef (file-exists? mtcachef) (get-environment-variable "MT_RUN_AREA_HOME") use-cache) + ((and mtcachef (common:file-exists? mtcachef) (get-environment-variable "MT_RUN_AREA_HOME") use-cache) (set! *configdat* (configf:read-alist mtcachef)) (set! *runconfigdat* (configf:read-alist rccachef)) (set! *configinfo* (list *configdat* (get-environment-variable "MT_RUN_AREA_HOME"))) (set! *configstatus* 'fulldata) (set! *toppath* (get-environment-variable "MT_RUN_AREA_HOME")) @@ -866,11 +866,11 @@ ;; additional house keeping (let* ((linktree (or (getenv "MT_LINKTREE") (if *configdat* (configf:lookup *configdat* "setup" "linktree") #f)))) (if linktree (begin - (if (not (file-exists? linktree)) + (if (not (common:file-exists? linktree)) (begin (handle-exceptions exn (begin (debug:print-error 0 *default-log-port* "Something went wrong when trying to create linktree dir at " linktree) @@ -962,17 +962,17 @@ ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical ;; rundir shortdir (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path run-id) (debug:print 2 *default-log-port* "INFO:\n lnkbase=" lnkbase "\n lnkpath=" lnkpath "\n toptest-path=" toptest-path "\n test-path=" test-path) - (if (not (file-exists? linktree)) + (if (not (common:file-exists? linktree)) (begin (debug:print 0 *default-log-port* "WARNING: linktree did not exist! Creating it now at " linktree) (create-directory linktree #t))) ;; (system (conc "mkdir -p " linktree)))) ;; create the directory for the tests dir links, this is needed no matter what... - (if (and (not (directory-exists? lnkbase)) - (not (file-exists? lnkbase))) + (if (and (not (common:directory-exists? lnkbase)) + (not (common:file-exists? lnkbase))) (handle-exceptions exn (begin (debug:print-error 0 *default-log-port* "Problem creating linktree base at " lnkbase) (print-error-message exn (current-error-port))) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -107,11 +107,12 @@ (remote-hh-dat-set! runremote (common:get-homehost)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a read - ((and (cdr (remote-hh-dat runremote)) ;; on homehost + ((and (not (remote-force-server runremote)) ;; honor forced use of server + (cdr (remote-hh-dat runremote)) ;; on homehost (member cmd api:read-only-queries)) ;; this is a read (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") (rmt:open-qry-close-locally cmd 0 params)) @@ -124,19 +125,21 @@ (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a write, we already have a server - ((and (cdr (remote-hh-dat runremote)) ;; on homehost + ((and (not (remote-force-server runremote)) ;; honor forced use of server + (cdr (remote-hh-dat runremote)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write - (remote-server-url runremote)) ;; have a server + (remote-server-url runremote)) ;; have a server (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") (rmt:open-qry-close-locally cmd 0 params)) ;; on homehost, no server contact made and this is a write, passively start a server - ((and (cdr (remote-hh-dat runremote)) ; new + ((and (not (remote-force-server runremote)) ;; honor forced use of server + (cdr (remote-hh-dat runremote)) ;; new (not (remote-server-url runremote)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if server-url @@ -149,14 +152,16 @@ ((and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost (not (remote-conndat runremote))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) (mutex-unlock! *rmt-mutex*) (server:start-and-wait *toppath*) + (if (common:force-server?)(remote-force-server-set! runremote #t)) (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as ;; all set up if get this far, dispatch the query - ((cdr (remote-hh-dat runremote)) ;; we are on homehost + ((and (not (remote-force-server runremote)) + (cdr (remote-hh-dat runremote))) ;; we are on homehost (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 7") (rmt:open-qry-close-locally cmd (if rid rid 0) params)) ;; not on homehost, do server query Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -257,22 +257,28 @@ *my-client-signature*))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched (define (server:kind-run areapath) - (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun - (call-num (car last-run-dat)) - (when-run (cadr last-run-dat)) - (run-delay (+ (case call-num - ((0) 0) - ((1) 20) - ((2) 300) - (else 600)) - (random 5)))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously - (if (> (- (current-seconds) when-run) run-delay) - (server:run areapath)) - (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))) + (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? + (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun + (call-num (car last-run-dat)) + (when-run (cadr last-run-dat)) + (run-delay (+ (case call-num + ((0) 0) + ((1) 20) + ((2) 300) + (else 600)) + (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously + (lock-file (conc areapath "/logs/server-start.lock"))) + (if (> (- (current-seconds) when-run) run-delay) + (begin + (common:simple-file-lock lock-file expire-time: 15) + (server:run areapath) + (thread-sleep! 5) ;; don't release the lock for at least a few seconds + (common:simple-file-release-lock lock-file))) + (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) (let loop ((server-url (server:check-if-running areapath))) (if (or server-url