Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -125,10 +125,12 @@ (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f #f "remote must be called with a vector") ) + ((> *api-process-request-count* 20) + (vector #f 'overloaded)) (else (let* ((cmd-in (vector-ref dat 0)) (cmd (if (symbol? cmd-in) cmd-in (string->symbol cmd-in))) @@ -272,16 +274,17 @@ ;; TESTMETA ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) ;; TASKS ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)))))) + ;; save all stats + (let ((delta-t (- (current-milliseconds) + start-t))) + (hash-table-set! *db-api-call-time* cmd + (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if (not writecmd-in-readonly-mode) - (let ((delta-t (- (current-milliseconds) - start-t))) - (hash-table-set! *db-api-call-time* cmd - (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))) - (vector #t res)) + (vector #t res) (vector #f res))))))) ;; http-server send-response ;; api:process-request ;; db:* Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -78,11 +78,11 @@ (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; - (let* ((server-dat (server:get-first-best areapath)) + (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) (runremote (or area-dat *runremote*))) (if (not server-dat) ;; no server found (client:setup-http areapath remaining-tries: (- remaining-tries 1)) (let ((host (cadr server-dat)) (port (caddr server-dat))) @@ -108,11 +108,11 @@ (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) (thread-sleep! 1) (client:setup-http areapath remaining-tries: (- remaining-tries 1)) ))) (begin ;; no server registered - (server:kind-run areapath) + ;; (server:kind-run areapath) + (server:start-and-wait areapath) (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. - (server:start-and-wait areapath) (client:setup-http areapath remaining-tries: (- remaining-tries 1))))))))) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -155,11 +155,14 @@ (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) - (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds + (server-timeout (or (server:get-timeout) 100)) ;; default to 100 seconds + (force-server #f) + (ro-mode #f) + (ro-mode-checked #f)) ;; flag that indicates we have checked for ro-mode ;; launching and hosts (defstruct host (reachable #f) (last-update 0) @@ -418,29 +421,46 @@ ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) - (if (file-exists? fname) - (if (> (- (current-seconds)(file-modification-time fname)) expire-time) - (begin - (delete-file* fname) - (common:simple-file-lock fname expire-time: expire-time)) - #f) - (let ((key-string (conc (get-host-name) "-" (current-process-id)))) - (with-output-to-file fname - (lambda () - (print key-string))) - (thread-sleep! 0.25) - (if (file-exists? fname) - (with-input-from-file fname - (lambda () - (equal? key-string (read-line)))) - #f)))) - + (handle-exceptions + exn + #f ;; don't really care what went wrong right now. NOTE: I have not seen this one actually fail. + (if (file-exists? fname) + (if (> (- (current-seconds)(file-modification-time fname)) expire-time) + (begin + (delete-file* fname) + (common:simple-file-lock fname expire-time: expire-time)) + #f) + (let ((key-string (conc (get-host-name) "-" (current-process-id)))) + (with-output-to-file fname + (lambda () + (print key-string))) + (thread-sleep! 0.25) + (if (file-exists? fname) + (with-input-from-file fname + (lambda () + (equal? key-string (read-line)))) + #f))))) + +(define (common:simple-file-lock-and-wait fname #!key (expire-time 300)) + (let ((end-time (+ expire-time (current-seconds)))) + (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time))) + (if got-lock + #t + (if (> end-time (current-seconds)) + (begin + (thread-sleep! 3) + (loop (common:simple-file-lock fname expire-time: expire-time))) + #f))))) + (define (common:simple-file-release-lock fname) - (delete-file* fname)) + (handle-exceptions + exn + #f ;; I don't really care why this failed (at least for now) + (delete-file* fname))) ;;====================================================================== ;; S T A T E S A N D S T A T U S E S ;;====================================================================== @@ -561,13 +581,14 @@ (if res (cadr res)(if (null? default) #f (car default))))) (define (common:get-testsuite-name) (or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description. (configf:lookup *configdat* "setup" "testsuite" ) + (getenv "MT_TESTSUITE_NAME") (if (string? *toppath* ) (pathname-file *toppath*) - (pathname-file (current-directory))))) + #f))) ;; (pathname-file (current-directory))))) (define common:get-area-name common:get-testsuite-name) (define (common:get-db-tmp-area) (if *db-cache-path* @@ -955,11 +976,35 @@ ;; tags-testpatt) ((and (equal? args-testpatt "%") rtestpatt) (debug:print-info 0 *default-log-port* "testpatt defined in "testpatt-key" from runconfigs: " rtestpatt) rtestpatt) (else args-testpatt)))) - + + + +(define (common:false-on-exception thunk #!key (message #f)) + (handle-exceptions exn + (begin + (if message + (debug:print-info 0 *default-log-port* message)) + #f) (thunk) )) + +(define (common:file-exists? path-string) + ;; this avoids stack dumps in the case where + + ;;;; TODO: catch permission denied exceptions and emit appropriate warnings, eg: system error while trying to access file: "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/randy-slow/reproduce/q... + (common:false-on-exception (lambda () (file-exists? path-string)) + message: (conc "Unable to access path: " path-string) + )) + +(define (common:directory-exists? path-string) + ;;;; TODO: catch permission denied exceptions and emit appropriate warnings, eg: system error while trying to access file: "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/randy-slow/reproduce/q... + (common:false-on-exception (lambda () (directory-exists? path-string)) + message: (conc "Unable to access path: " path-string) + )) + + (define (common:get-linktree) (or (getenv "MT_LINKTREE") (or (and *configdat* (configf:lookup *configdat* "setup" "linktree")) (if *toppath* @@ -1058,10 +1103,37 @@ (define (common:on-homehost?) (let ((hh (common:get-homehost))) (if hh (cdr hh) #f))) + +;; do we honor the caches of the config files? +;; +(define (common:use-cache?) + (not (or (args:get-arg "-no-cache") + (and *configdat* + (equal? (configf:lookup *configdat* "setup" "use-cache") "no"))))) + +;; force use of server? +;; +(define (common:force-server?) + (let* ((force-setting (configf:lookup *configdat* "server" "force")) + (force-type (if force-setting (string->symbol force-setting) #f)) + (force-result (case force-type + ((#f) #f) + ((always) #t) + ((test) (if (args:get-arg "-execute") ;; we are in a test + #t + #f)) + (else + (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.") + #t)))) ;; default to requiring server + (if force-result + (begin + (debug:print-info 0 *default-log-port* "forcing use of server, force setting is \"" force-setting "\".") + #t) + #f))) ;; do we honor the caches of the config files? ;; (define (common:use-cache?) (not (or (args:get-arg "-no-cache") Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -2718,11 +2718,11 @@ (define (dashboard:get-youngest-run-db-mod-time dbdir) (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir) + (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (common:max (map (lambda (filen) (file-modification-time filen)) (glob (conc dbdir "/*.db*")))))) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -2102,18 +2102,22 @@ ;; generate stats from *db-api-call-time* (let ((ordered-keys (sort (hash-table-keys *db-api-call-time*) (lambda (a b) (let ((sum-a (common:sum (hash-table-ref *db-api-call-time* a))) (sum-b (common:sum (hash-table-ref *db-api-call-time* b)))) - (> sum-a sum-b)))))) + (> sum-a sum-b))))) + (total 0)) (for-each (lambda (cmd-key) (let* ((dat (hash-table-ref *db-api-call-time* cmd-key)) - (avg (if (> (length dat) 0) + (num (length dat)) + (avg (if (> num 0) (/ (common:sum dat)(length dat))))) + (set! total (+ total num)) (debug:print-info 0 *default-log-port* cmd-key "\tavg: " avg " max: " (common:max dat) " min: " (common:min-max < dat) " num: " (length dat)))) - ordered-keys))) + ordered-keys) + (debug:print-info 0 *default-log-port* "TOTAL: " total " api calls since start."))) (define (db:get-all-run-ids dbstruct) (db:with-db dbstruct #f Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -418,11 +418,14 @@ (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) (begin (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) (flush-output *default-log-port*))) - + (if (common:low-noise-print 60 "dbstats") + (begin + (debug:print 0 *default-log-port* "Server stats:") + (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -421,10 +421,11 @@ (serverurl (assoc/default 'serverurl cmdinfo)) (homehost (assoc/default 'homehost cmdinfo)) (run-id (assoc/default 'run-id cmdinfo)) (test-id (assoc/default 'test-id cmdinfo)) (target (assoc/default 'target cmdinfo)) + (areaname (assoc/default 'areaname cmdinfo)) (itemdat (assoc/default 'itemdat cmdinfo)) (env-ovrd (assoc/default 'env-ovrd cmdinfo)) (set-vars (assoc/default 'set-vars cmdinfo)) ;; pre-overrides from -setvar (runname (assoc/default 'runname cmdinfo)) (megatest (assoc/default 'megatest cmdinfo)) @@ -445,10 +446,17 @@ runscript))))) ;; assume it is on the path ) ;; (rollup-status 0) (if contour (setenv "MT_CONTOUR" contour)) + ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ... + ;; + (setenv "MT_TESTSUITENAME" areaname) + (setenv "MT_RUN_AREA_HOME" top-path) + (set! *toppath* top-path) + (setenv "MT_TEST_RUN_DIR" work-area) + ;; On NFS it can be slow and unreliable to get needed startup information. ;; i. Check if we are on the homehost, if so, proceed ;; ii. Check if host and port passed in via CMDINFO are valid and if ;; possible use them. (let ((bestadrs (server:get-best-guess-address (get-host-name))) @@ -467,11 +475,16 @@ (if (and start-res ping-res) (let ((url (http-transport:server-dat-make-url start-res))) (remote-conndat-set! *runremote* start-res) (remote-server-url-set! *runremote* url) - (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data.")) + (if (server:ping url) + (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data.") + (begin + (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " url) + (remote-conndat-set! *runremote* #f) + (remote-server-url-set! *runremote* #f)))) (debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.") )) (begin (debug:print-info 0 *default-log-port* (if host-port (conc "received invalid host-port information " host-port) @@ -781,156 +794,154 @@ (let ((res (launch:setup-body force: force areapath: areapath))) (mutex-unlock! *launch-setup-mutex*) res))) (define (launch:setup-body #!key (force #f) (areapath #f)) - (let* ((use-cache (common:use-cache?)) - (toppath (or *toppath* areapath (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath - (runname (common:args-get-runname)) - (target (common:args-get-target exit-if-bad: #t)) - (linktree (common:get-linktree)) - (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour")) - (sections (if target (list "default" target) #f)) ;; for runconfigs - (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config - (rundir (if (and runname target linktree)(conc linktree (if contour (conc "/" contour) "") "/" target "/" runname) #f)) - (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) - (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) - (cancreate (and rundir (file-exists? rundir)(file-write-access? rundir)))) - ;; (cxt (hash-table-ref/default *contexts* toppath #f))) - - ;; create our cxt for this area if it doesn't already exist - ;; (if (not cxt)(hash-table-set! *contexts* toppath (make-cxt))) - - ;; (print "runname: " runname " target: " target " mtcachef: " mtcachef " rccachef: " rccachef) - (set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource - (cond - ;; data was read and cached and available in *configstatus*, toppath has already been set - ((eq? *configstatus* 'fulldata) - *toppath*) - ;; if mtcachef exists just read it, however we need to assume toppath is available in $MT_RUN_AREA_HOME - ((and mtcachef (file-exists? mtcachef) (get-environment-variable "MT_RUN_AREA_HOME") use-cache) - (set! *configdat* (configf:read-alist mtcachef)) - (set! *runconfigdat* (configf:read-alist rccachef)) - (set! *configinfo* (list *configdat* (get-environment-variable "MT_RUN_AREA_HOME"))) - (set! *configstatus* 'fulldata) - (set! *toppath* (get-environment-variable "MT_RUN_AREA_HOME")) - *toppath*) - ;; we have all the info needed to fully process runconfigs and megatest.config - (mtcachef - (let* ((first-pass (find-and-read-config ;; NB// sets MT_RUN_AREA_HOME as side effect - mtconfig - environ-patt: "env-override" - given-toppath: toppath - pathenvvar: "MT_RUN_AREA_HOME")) - (first-rundat (let ((toppath (if toppath - toppath - (car first-pass)))) - (read-config ;; (conc toppath "/runconfigs.config") ;; this should be converted to runconfig:read but it is non-trivial, leaving it for now. - (conc (if (string? toppath) - toppath - (get-environment-variable "MT_RUN_AREA_HOME")) - "/runconfigs.config") - *runconfigdat* #t - sections: sections)))) - (set! *runconfigdat* first-rundat) - (if first-pass ;; - (begin - (set! *configdat* (car first-pass)) - (set! *configinfo* first-pass) - (set! *toppath* (or toppath (cadr first-pass))) ;; use the gathered data unless already have it - (set! toppath *toppath*) - (if (not *toppath*) - (begin - (debug:print-error 0 *default-log-port* "you are not in a megatest area!") - (exit 1))) - (setenv "MT_RUN_AREA_HOME" *toppath*) - ;; the seed read is done, now read runconfigs, cache it then read megatest.config one more time and cache it - (let* ((keys (rmt:get-keys)) - (key-vals (keys:target->keyval keys target)) - (linktree (common:get-linktree)) - ; (or (getenv "MT_LINKTREE") + (if (and (eq? *configstatus* 'fulldata) *toppath*) ;; no need to reprocess + *toppath* ;; return toppath + (let* ((use-cache (common:use-cache?)) + (toppath (or *toppath* areapath (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath + (runname (common:args-get-runname)) + (target (common:args-get-target)) + (linktree (common:get-linktree)) + (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour")) + (sections (if target (list "default" target) #f)) ;; for runconfigs + (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config + (rundir (if (and runname target linktree)(conc linktree (if contour (conc "/" contour) "") "/" target "/" runname) #f)) + (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) + (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) + (cancreate (and rundir (common:file-exists? rundir)(file-write-access? rundir)))) + ;; (cxt (hash-table-ref/default *contexts* toppath #f))) + + ;; create our cxt for this area if it doesn't already exist + ;; (if (not cxt)(hash-table-set! *contexts* toppath (make-cxt))) + + ;; (print "runname: " runname " target: " target " mtcachef: " mtcachef " rccachef: " rccachef) + (set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource + (cond + ;; if mtcachef exists just read it, however we need to assume toppath is available in $MT_RUN_AREA_HOME + ((and mtcachef (common:file-exists? mtcachef) (get-environment-variable "MT_RUN_AREA_HOME") use-cache) + (set! *configdat* (configf:read-alist mtcachef)) + (set! *runconfigdat* (configf:read-alist rccachef)) + (set! *configinfo* (list *configdat* (get-environment-variable "MT_RUN_AREA_HOME"))) + (set! *configstatus* 'fulldata) + (set! *toppath* (get-environment-variable "MT_RUN_AREA_HOME")) + *toppath*) + ;; we have all the info needed to fully process runconfigs and megatest.config + (mtcachef + (let* ((first-pass (find-and-read-config ;; NB// sets MT_RUN_AREA_HOME as side effect + mtconfig + environ-patt: "env-override" + given-toppath: toppath + pathenvvar: "MT_RUN_AREA_HOME")) + (first-rundat (let ((toppath (if toppath + toppath + (car first-pass)))) + (read-config ;; (conc toppath "/runconfigs.config") ;; this should be converted to runconfig:read but it is non-trivial, leaving it for now. + (conc (if (string? toppath) + toppath + (get-environment-variable "MT_RUN_AREA_HOME")) + "/runconfigs.config") + *runconfigdat* #t + sections: sections)))) + (set! *runconfigdat* first-rundat) + (if first-pass ;; + (begin + (set! *configdat* (car first-pass)) + (set! *configinfo* first-pass) + (set! *toppath* (or toppath (cadr first-pass))) ;; use the gathered data unless already have it + (set! toppath *toppath*) + (if (not *toppath*) + (begin + (debug:print-error 0 *default-log-port* "you are not in a megatest area!") + (exit 1))) + (setenv "MT_RUN_AREA_HOME" *toppath*) + ;; the seed read is done, now read runconfigs, cache it then read megatest.config one more time and cache it + (let* ((keys (rmt:get-keys)) + (key-vals (keys:target->keyval keys target)) + (linktree (common:get-linktree)) ;; (or (getenv "MT_LINKTREE")(if *configdat* (configf:lookup *configdat* "setup" "linktree") #f))) ; (if *configdat* ; (configf:lookup *configdat* "setup" "linktree") ; (conc *toppath* "/lt")))) - (second-pass (find-and-read-config - mtconfig - environ-patt: "env-override" - given-toppath: toppath - pathenvvar: "MT_RUN_AREA_HOME")) - (runconfigdat (begin ;; this read of the runconfigs will see any adjustments made by re-reading megatest.config - (for-each (lambda (kt) - (setenv (car kt) (cadr kt))) - key-vals) - (read-config (conc toppath "/runconfigs.config") *runconfigdat* #t ;; consider using runconfig:read some day ... - sections: sections)))) - (if cancreate (configf:write-alist runconfigdat rccachef)) - (set! *runconfigdat* runconfigdat) - (if cancreate (configf:write-alist *configdat* mtcachef)) - (if cancreate (set! *configstatus* 'fulldata)))) - ;; no configs found? should not happen but let's try to recover gracefully, return an empty hash-table - (set! *configdat* (make-hash-table)) - ))) - ;; else read what you can and set the flag accordingly - (else - (let* ((cfgdat (find-and-read-config - (or (args:get-arg "-config") "megatest.config") - environ-patt: "env-override" - given-toppath: (get-environment-variable "MT_RUN_AREA_HOME") - pathenvvar: "MT_RUN_AREA_HOME"))) - (if cfgdat - (let* ((toppath (or (get-environment-variable "MT_RUN_AREA_HOME")(cadr cfgdat))) - (rdat (read-config (conc toppath ;; convert this to use runconfig:read! - "/runconfigs.config") *runconfigdat* #t sections: sections))) - (set! *configinfo* cfgdat) - (set! *configdat* (car cfgdat)) - (set! *runconfigdat* rdat) - (set! *toppath* toppath) - (set! *configstatus* 'partial)) - (begin - (debug:print-error 0 *default-log-port* "No " mtconfig " file found. Giving up.") - (exit 2)))))) - ;; additional house keeping - (let* ((linktree (common:get-linktree))) - (if linktree - (begin - (if (not (file-exists? linktree)) - (begin - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* "Something went wrong when trying to create linktree dir at " linktree) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (exit 1)) - (create-directory linktree #t)))) - (handle-exceptions - exn - (begin - (debug:print-error 0 *default-log-port* "Something went wrong when trying to create link to linktree at " *toppath*) - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) - (let ((tlink (conc *toppath* "/lt"))) - (if (not (file-exists? tlink)) - (create-symbolic-link linktree tlink))))) - (begin - (debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config") - ))) - (if (and *toppath* - (directory-exists? *toppath*)) - (begin - (setenv "MT_RUN_AREA_HOME" *toppath*) - (setenv "MT_TESTSUITENAME" (common:get-testsuite-name))) - (begin - (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.") - ;;(exit 1) - (set! *toppath* #f) ;; force it to be false so we return #f - #f - )) - ;; if have -append-config then read and append here - (let ((cfname (args:get-arg "-append-config"))) - (if (and cfname - (file-read-access? cfname)) - (read-config cfname *configdat* #t))) ;; values are added to the hash, no need to do anything special. - *toppath*)) + (second-pass (find-and-read-config + mtconfig + environ-patt: "env-override" + given-toppath: toppath + pathenvvar: "MT_RUN_AREA_HOME")) + (runconfigdat (begin ;; this read of the runconfigs will see any adjustments made by re-reading megatest.config + (for-each (lambda (kt) + (setenv (car kt) (cadr kt))) + key-vals) + (read-config (conc toppath "/runconfigs.config") *runconfigdat* #t ;; consider using runconfig:read some day ... + sections: sections)))) + (if cancreate (configf:write-alist runconfigdat rccachef)) + (set! *runconfigdat* runconfigdat) + (if cancreate (configf:write-alist *configdat* mtcachef)) + (if cancreate (set! *configstatus* 'fulldata)))) + ;; no configs found? should not happen but let's try to recover gracefully, return an empty hash-table + (set! *configdat* (make-hash-table)) + ))) + ;; else read what you can and set the flag accordingly + (else + (let* ((cfgdat (find-and-read-config + (or (args:get-arg "-config") "megatest.config") + environ-patt: "env-override" + given-toppath: (get-environment-variable "MT_RUN_AREA_HOME") + pathenvvar: "MT_RUN_AREA_HOME"))) + (if cfgdat + (let* ((toppath (or (get-environment-variable "MT_RUN_AREA_HOME")(cadr cfgdat))) + (rdat (read-config (conc toppath ;; convert this to use runconfig:read! + "/runconfigs.config") *runconfigdat* #t sections: sections))) + (set! *configinfo* cfgdat) + (set! *configdat* (car cfgdat)) + (set! *runconfigdat* rdat) + (set! *toppath* toppath) + (set! *configstatus* 'partial)) + (begin + (debug:print-error 0 *default-log-port* "No " mtconfig " file found. Giving up.") + (exit 2)))))) + ;; additional house keeping + (let* ((linktree (common:get-linktree))) + (if linktree + (begin + (if (not (common:file-exists? linktree)) + (begin + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* "Something went wrong when trying to create linktree dir at " linktree) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (exit 1)) + (create-directory linktree #t)))) + (handle-exceptions + exn + (begin + (debug:print-error 0 *default-log-port* "Something went wrong when trying to create link to linktree at " *toppath*) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) + (let ((tlink (conc *toppath* "/lt"))) + (if (not (file-exists? tlink)) + (create-symbolic-link linktree tlink))))) + (begin + (debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config") + ))) + (if (and *toppath* + (directory-exists? *toppath*)) + (begin + (setenv "MT_RUN_AREA_HOME" *toppath*) + (setenv "MT_TESTSUITENAME" (common:get-testsuite-name))) + (begin + (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.") + ;;(exit 1) + (set! *toppath* #f) ;; force it to be false so we return #f + #f + )) + ;; if have -append-config then read and append here + (let ((cfname (args:get-arg "-append-config"))) + (if (and cfname + (file-read-access? cfname)) + (read-config cfname *configdat* #t))) ;; values are added to the hash, no need to do anything special. + *toppath*))) (define (get-best-disk confdat testconfig) (let* ((disks (or (and testconfig (hash-table-ref/default testconfig "disks" #f)) (hash-table-ref/default confdat "disks" #f))) (minspace (let ((m (configf:lookup confdat "setup" "minspace"))) @@ -994,17 +1005,17 @@ ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical ;; rundir shortdir (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path run-id) (debug:print 2 *default-log-port* "INFO:\n lnkbase=" lnkbase "\n lnkpath=" lnkpath "\n toptest-path=" toptest-path "\n test-path=" test-path) - (if (not (file-exists? linktree)) + (if (not (common:file-exists? linktree)) (begin (debug:print 0 *default-log-port* "WARNING: linktree did not exist! Creating it now at " linktree) (create-directory linktree #t))) ;; (system (conc "mkdir -p " linktree)))) ;; create the directory for the tests dir links, this is needed no matter what... - (if (and (not (directory-exists? lnkbase)) - (not (file-exists? lnkbase))) + (if (and (not (common:directory-exists? lnkbase)) + (not (common:file-exists? lnkbase))) (handle-exceptions exn (begin (debug:print-error 0 *default-log-port* "Problem creating linktree base at " lnkbase) (print-error-message exn (current-error-port))) @@ -1242,11 +1253,12 @@ (if hhdat (car hhdat) #f))) (list 'serverurl (if *runremote* (remote-server-url *runremote*) - #f)) ;; + #f)) ;; + (list 'areaname (common:get-testsuite-name)) (list 'toppath *toppath*) (list 'work-area work-area) (list 'test-name test-name) (list 'runscript runscript) (list 'run-id run-id ) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -55,15 +55,26 @@ ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; - (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value - (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas - (dbfile (conc *toppath* "/megatest.db")) - (readonly-mode (not (file-write-access? dbfile))) ;; TODO: use dbstruct or runremote to figure this out in future - (runremote (or area-dat *runremote*))) + (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value + (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas + (runremote (or area-dat + *runremote*)) + (readonly-mode (if (and runremote + (remote-ro-mode-checked runremote)) + (remote-ro-mode runremote) + (let* ((dbfile (conc *toppath* "/megatest.db")) + (ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future + (if runremote + (begin + (remote-ro-mode-set! runremote ro-mode) + (remote-ro-mode-checked-set! runremote #t) + ro-mode) + ro-mode))))) + ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) (cond ;; give up if more than 15 attempts ((> attemptnum 15) (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.") @@ -93,12 +104,12 @@ (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") (remote-conndat-set! runremote #f) (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a record for our connection for given area - ((not runremote) - (set! *runremote* (make-remote)) + ((not runremote) ;; can remove this one. should never get here. + (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a homehost record ((not (pair? (remote-hh-dat runremote))) ;; not on homehost @@ -106,11 +117,12 @@ (remote-hh-dat-set! runremote (common:get-homehost)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a read - ((and (cdr (remote-hh-dat runremote)) ;; on homehost + ((and (not (remote-force-server runremote)) ;; honor forced use of server + (cdr (remote-hh-dat runremote)) ;; on homehost (member cmd api:read-only-queries)) ;; this is a read (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") (rmt:open-qry-close-locally cmd 0 params)) @@ -123,39 +135,49 @@ (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a write, we already have a server - ((and (cdr (remote-hh-dat runremote)) ;; on homehost + ((and (not (remote-force-server runremote)) ;; honor forced use of server + (cdr (remote-hh-dat runremote)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write - (remote-server-url runremote)) ;; have a server + (remote-server-url runremote)) ;; have a server (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") (rmt:open-qry-close-locally cmd 0 params)) ;; on homehost, no server contact made and this is a write, passively start a server - ((and (cdr (remote-hh-dat runremote)) ; new + ((and (not (remote-force-server runremote)) ;; honor forced use of server + (cdr (remote-hh-dat runremote)) ;; new (not (remote-server-url runremote)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if server-url (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed - (server:kind-run *toppath*))) + (if (common:force-server?) + (server:start-and-wait *toppath*) + (server:kind-run *toppath*)))) + (remote-force-server-set! runremote (common:force-server?)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") (rmt:open-qry-close-locally cmd 0 params)) - ((and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost - (not (remote-conndat runremote))) ;; and no connection + ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one + (not (remote-conndat runremote))) + (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost + (not (remote-conndat runremote)))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) (mutex-unlock! *rmt-mutex*) - (server:start-and-wait *toppath*) + (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? + (server:start-and-wait *toppath*)) + (remote-force-server-set! runremote (common:force-server?)) (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as ;; all set up if get this far, dispatch the query - ((cdr (remote-hh-dat runremote)) ;; we are on homehost + ((and (not (remote-force-server runremote)) + (cdr (remote-hh-dat runremote))) ;; we are on homehost (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 7") (rmt:open-qry-close-locally cmd (if rid rid 0) params)) ;; not on homehost, do server query @@ -184,18 +206,24 @@ res) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " is unknown") (mutex-unlock! *rmt-mutex*) (exit 1))) - (begin - (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) - (remote-conndat-set! runremote #f) - (remote-server-url-set! runremote #f) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") - (mutex-unlock! *rmt-mutex*) - (server:start-and-wait *toppath*) - (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) + (if (eq? res 'overloaded) + (let ((wait-delay (+ attemptnum (* attemptnum 10)))) + (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") + (thread-sleep! wait-delay) + (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) + (begin + (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) + (remote-conndat-set! runremote #f) + (remote-server-url-set! runremote #f) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") + (mutex-unlock! *rmt-mutex*) + (if (not (server:check-if-running *toppath*)) + (server:start-and-wait *toppath*)) + (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))))))))) ;; (define (rmt:update-db-stats run-id rawcmd params duration) ;; (mutex-lock! *db-stats-mutex*) ;; (handle-exceptions ;; exn Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -167,19 +167,19 @@ (day-seconds (* 24 60 60))) ;; if the directory exists continue to get the list ;; otherwise attempt to create the logs dir and then ;; continue (if (if (directory-exists? (conc areapath "/logs")) - #t + '() (if (file-write-access? areapath) (begin (condition-case (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list."))) (directory-exists? (conc areapath "/logs"))) - #f)) + '())) (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) (num-serv-logs (length server-logs))) (if (null? server-logs) '() (let loop ((hed (car server-logs)) @@ -218,21 +218,24 @@ ;; (define (server:get-best srvlst) (let ((now (current-seconds))) (sort (filter (lambda (rec) - (let ((start-time (list-ref rec 3)) - (mod-time (list-ref rec 0))) - ;; (print "start-time: " start-time " mod-time: " mod-time) - (and start-time mod-time - (> (- now start-time) 0) ;; been running at least 0 seconds - (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds - (< (- now start-time) - (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) - 180) - (random 360))) ;; under one hour running time +/- 180 - ))) + (if (and (list? rec) + (> (length rec) 2)) + (let ((start-time (list-ref rec 3)) + (mod-time (list-ref rec 0))) + ;; (print "start-time: " start-time " mod-time: " mod-time) + (and start-time mod-time + (> (- now start-time) 0) ;; been running at least 0 seconds + (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds + (< (- now start-time) + (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) + 180) + (random 360))) ;; under one hour running time +/- 180 + )) + #f)) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) @@ -240,10 +243,20 @@ (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and srvrs (not (null? srvrs))) (car srvrs) #f))) + +(define (server:get-rand-best areapath) + (let ((srvrs (server:get-best (server:get-list areapath)))) + (if (and (list? srvrs) + (not (null? srvrs))) + (let* ((len (length srvrs)) + (idx (random len))) + (list-ref srvrs idx)) + #f))) + (define (server:record->url servr) (match-let (((mod-time host port start-time pid) servr)) (if (and host port) @@ -257,22 +270,28 @@ *my-client-signature*))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched (define (server:kind-run areapath) - (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun - (call-num (car last-run-dat)) - (when-run (cadr last-run-dat)) - (run-delay (+ (case call-num - ((0) 0) - ((1) 20) - ((2) 300) - (else 600)) - (random 5)))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously - (if (> (- (current-seconds) when-run) run-delay) - (server:run areapath)) - (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))) + (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? + (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun + (call-num (car last-run-dat)) + (when-run (cadr last-run-dat)) + (run-delay (+ (case call-num + ((0) 0) + ((1) 20) + ((2) 300) + (else 600)) + (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously + (lock-file (conc areapath "/logs/server-start.lock"))) + (if (> (- (current-seconds) when-run) run-delay) + (begin + (common:simple-file-lock-and-wait lock-file expire-time: 15) + (server:run areapath) + (thread-sleep! 5) ;; don't release the lock for at least a few seconds + (common:simple-file-release-lock lock-file))) + (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) (let loop ((server-url (server:check-if-running areapath))) (if (or server-url @@ -284,24 +303,22 @@ (thread-sleep! 5) (loop (server:check-if-running areapath))))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. -(define (server:dotserver-age-seconds areapath) - (let ((server-file (conc areapath "/.server"))) - (begin - (handle-exceptions - exn - #f - (- (current-seconds) - (file-modification-time server-file)))))) - ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; -(define (server:check-if-running areapath) - (let* ((servers (server:get-best (server:get-list areapath)))) - (if (null? servers) +(define (server:check-if-running areapath #!key (numservers "2")) + (let* ((ns (string->number + (or (configf:lookup *configdat* "server" "numservers") numservers))) + (servers (server:get-best (server:get-list areapath)))) + ;; (print "servers: " servers " ns: " ns) + (if (or (and servers + (null? servers)) + (not servers) + (and (list? servers) + (< (length servers) (random ns)))) ;; somewhere between 0 and numservers #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res