Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -207,42 +207,10 @@ ;; (server:try-running run-id)) (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries) ;; (thread-sleep! (+ 2 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. (client:setup run-id remaining-tries: (- remaining-tries 1)))))))) -;; (let ((host-info (hash-table-ref/default *runremote* run-id #f))) -;; (if host-info ;; this is a bit circular. the host-info *is* the start-res FIXME -;; (let* ((iface (http-transport:server-dat-get-iface host-info)) -;; (port (http-transport:server-dat-get-port host-info)) -;; (start-res (case *transport-type* -;; ((http)(http-transport:client-connect iface port)) -;; ((nmsg)(nmsg-transport:client-connect iface port)) ;; (http-transport:server-dat-get-socket host-info)) -;; (else #f))) -;; (ping-res (case *transport-type* -;; ((http)(rmt:login-no-auto-client-setup start-res run-id)) -;; ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id))) -;; (if logininfo -;; (vector-ref (vector-ref logininfo 1) 1) -;; #f))) -;; (else #f)))) -;; (if ping-res ;; sucessful login? -;; (begin -;; (debug:print-info 2 "client:setup, ping is good using host-info=" host-info ", remaining-tries=" remaining-tries) -;; start-res) ;; return the server info -;; ;; have host info but no ping. shutdown the current connection and try again -;; (begin ;; login failed -;; (debug:print-info 1 "client:setup, ping is bad for start-res=" start-res " and *runremote*=" host-info) -;; (case *transport-type* -;; ((http)(http-transport:close-connections run-id))) -;; (hash-table-delete! *runremote* run-id) -;; (if (< remaining-tries 8) -;; (thread-sleep! 5) -;; (thread-sleep! 1)) -;; (client:setup run-id remaining-tries: (- remaining-tries 1))))) -;; ;; YUK: rename server-dat here -;; - ;; keep this as a function to ease future (define (client:start run-id server-info) (http-transport:client-connect (tasks:hostinfo-get-interface server-info) (tasks:hostinfo-get-port server-info))) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -1473,13 +1473,14 @@ (file-modification-time *monitor-db-path*) -1)) (run-update-time (current-seconds)) (recalc (dashboard:recalc modtime *please-update-buttons* *last-db-update-time*))) (if (and (eq? *current-tab-number* 0) - (> monitor-modtime *last-monitor-update-time*)) + (or (> monitor-modtime *last-monitor-update-time*) + (> (- run-update-time *last-monitor-update-time*) 30))) ;; update every 1/2 minute just in case (begin - (set! *last-monitor-update-time* monitor-modtime) + (set! *last-monitor-update-time* run-update-time) ;; monitor-modtime) (if dashboard:update-servers-table (dashboard:update-servers-table)))) (if recalc (begin (case *current-tab-number* ((0) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -28,24 +28,28 @@ ;; (trace ;; rmt:send-receive ;; api:execute-requests ;; ) +;; generate entries for ~/.megatestrc with the following +;; +;; grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u + ;;====================================================================== ;; S U P P O R T F U N C T I O N S ;;====================================================================== -;; NOT USED +;; NOT USED? ;; -(define (rmt:call-transport run-id connection-info cmd jparams) - (case (server:get-transport) - ((rpc) ( rpc-transport:client-api-send-receive run-id connection-info cmd jparams)) - ((http) (http-transport:client-api-send-receive run-id connection-info cmd jparams)) - ((fs) ( fs-transport:client-api-send-receive run-id connection-info cmd jparams)) - ((zmq) (zmq-transport:client-api-send-receive run-id connection-info cmd jparams)) - (else ( rpc-transport:client-api-send-receive run-id connection-info cmd jparams)))) +;; (define (rmt:call-transport run-id connection-info cmd jparams) +;; (case (server:get-transport) +;; ((rpc) ( rpc-transport:client-api-send-receive run-id connection-info cmd jparams)) +;; ((http) (http-transport:client-api-send-receive run-id connection-info cmd jparams)) +;; ((fs) ( fs-transport:client-api-send-receive run-id connection-info cmd jparams)) +;; ((zmq) (zmq-transport:client-api-send-receive run-id connection-info cmd jparams)) +;; (else ( rpc-transport:client-api-send-receive run-id connection-info cmd jparams)))) ;; (define (rmt:write-frequency-over-limit? cmd run-id) (and (not (member cmd api:read-only-queries)) (let* ((tmprec (hash-table-ref/default *write-frequency* run-id #f)) @@ -201,33 +205,43 @@ (cons 'none 0)) (loop (car tal)(cdr tal) newmax-cmd currmax))))))) (mutex-unlock! *db-stats-mutex*) res)) -(define (rmt:open-qry-close-locally cmd run-id params) +(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5)) (let* ((dbstruct-local (if *dbstruct-db* *dbstruct-db* (let* ((dbdir (conc (configf:lookup *configdat* "setup" "linktree") "/.db")) (db (make-dbr:dbstruct path: dbdir local: #t))) (set! *dbstruct-db* db) db))) - (db-file-path (db:dbfile-path 0))) - ;; (read-only (not (file-read-access? db-file-path))) - (let* ((start (current-milliseconds)) - (resdat (api:execute-requests dbstruct-local (vector (symbol->string cmd) params))) - (res (vector-ref resdat 1)) - (duration (- (current-milliseconds) start))) - (rmt:update-db-stats run-id cmd params duration) - ;; mark this run as dirty if this was a write - (if (not (member cmd api:read-only-queries)) - (let ((start-time (current-seconds))) - (mutex-lock! *db-multi-sync-mutex*) - ;; (if (not (hash-table-ref/default *db-local-sync* run-id #f)) - ;; just set it every time. Is a write more expensive than a read and does it matter? - (hash-table-set! *db-local-sync* (or run-id 0) start-time) ;; the oldest "write" - (mutex-unlock! *db-multi-sync-mutex*))) - res))) + (db-file-path (db:dbfile-path 0)) + ;; (read-only (not (file-read-access? db-file-path))) + (start (current-milliseconds)) + (resdat (api:execute-requests dbstruct-local (vector (symbol->string cmd) params))) + (success (vector-ref resdat 0)) + (res (vector-ref resdat 1)) + (duration (- (current-milliseconds) start))) + (if (not success) + (if (> remretries 0) + (begin + (debug:print 0 "ERROR: local query failed. Trying again.") + (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1))) + (begin + (debug:print 0 "ERROR: too many retries in rmt:open-qry-close-locally, giving up") + #f)) + (begin + (rmt:update-db-stats run-id cmd params duration) + ;; mark this run as dirty if this was a write + (if (not (member cmd api:read-only-queries)) + (let ((start-time (current-seconds))) + (mutex-lock! *db-multi-sync-mutex*) + ;; (if (not (hash-table-ref/default *db-local-sync* run-id #f)) + ;; just set it every time. Is a write more expensive than a read and does it matter? + (hash-table-set! *db-local-sync* (or run-id 0) start-time) ;; the oldest "write" + (mutex-unlock! *db-multi-sync-mutex*))) + res)))) (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) (let* ((run-id (if run-id run-id 0)) ;; (jparams (db:obj->string params)) ;; (rmt:dat->json-str params)) (res (handle-exceptions Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -160,13 +160,13 @@ (hash-table-set! *runs:denoise* key currtime) #t) #f))) (define (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) - ;;(thread-sleep! (cond - ;; ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while - ;; (else 0))) + (thread-sleep! (cond + ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while + (else 0))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (config-lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -361,24 +361,25 @@ mdb ;; NEEDS dbprep ADDED "SELECT id FROM servers WHERE run_id=? AND state = 'running';" run-id) res)) (define (tasks:need-server run-id) - (let ((forced (configf:lookup *configdat* "server" "required")) - (maxqry (cdr (rmt:get-max-query-average run-id))) - (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) - (cond - (forced - (if (common:low-noise-print 60 run-id "server required is set") - (debug:print-info 0 "Server required is set, starting server for run-id " run-id ".")) - #t) - ((> maxqry threshold) - (if (common:low-noise-print 60 run-id "Max query time execeeded") - (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, server needed for run-id " run-id ".")) - #t) - (else - #f)))) + (configf:lookup *configdat* "server" "required")) + +;; (maxqry (cdr (rmt:get-max-query-average run-id))) +;; (threshold (string->number (or (configf:lookup *configdat* "server" "server-query-threshold") "10")))) +;; (cond +;; (forced +;; (if (common:low-noise-print 60 run-id "server required is set") +;; (debug:print-info 0 "Server required is set, starting server for run-id " run-id ".")) +;; #t) +;; ((> maxqry threshold) +;; (if (common:low-noise-print 60 run-id "Max query time execeeded") +;; (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, server needed for run-id " run-id ".")) +;; #t) +;; (else +;; #f)))) ;; try to start a server and wait for it to be available ;; (define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries) ;; ensure a server is running for this run