Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -54,55 +54,53 @@ ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; (define (client:setup run-id #!key (remaining-tries 10)) - (debug:print 0 "INFO: client:setup remaining-tries=" remaining-tries) (if (<= remaining-tries 0) (begin (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id) (exit 1)) (let ((server-dat (and run-id (hash-table-ref/default *runremote* run-id #f)))) + (thread-sleep! 1) ;; try to avoid race conditons (if server-dat - (let ((start-res (http-transport:client-connect run-id ;; NB// confusion over server-dat and connection result! - (tasks:hostinfo-get-interface server-dat) - (tasks:hostinfo-get-port server-dat)))) - (if start-res ;; sucessful login? - (begin - (hash-table-set! *runremote* run-id start-res) - start-res) + (let ((new-dat (http-transport:client-connect run-id ;; NB// confusion over server-dat and connection result! + (car server-dat) + (cadr server-dat)))) + (if new-dat ;; sucessful login? + new-dat (begin ;; login failed + (debug:print 0 "INFO: login failed in client:setup with existing server-dat: " server-dat ", new-dat: " new-dat ", cleaning out records and then trying again") (hash-table-delete! *runremote* run-id) (open-run-close tasks:server-force-clean-run-record tasks:open-db run-id - (tasks:hostinfo-get-interface server-dat) - (tasks:hostinfo-get-port server-dat)) + (car server-dat) + (cadr server-dat)) (thread-sleep! 5) (client:setup run-id remaining-tries: (- remaining-tries 1))))) - (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id))) - (if server-dat - (let ((start-res (http-transport:client-connect run-id - (tasks:hostinfo-get-interface server-dat) - (tasks:hostinfo-get-port server-dat)))) - (if start-res - (begin - (hash-table-set! *runremote* run-id start-res) - start-res) + (let* ((server-info (open-run-close tasks:get-server tasks:open-db run-id))) + (if server-info + (let ((new-dat (http-transport:client-connect run-id + (tasks:hostinfo-get-interface server-info) + (tasks:hostinfo-get-port server-info)))) + (if new-dat + new-dat (begin ;; login failed + (debug:print 0 "INFO: login failed in client:setup with no existing server-dat: " server-dat ", new-dat: " new-dat ", and server-info: " server-info ", cleaning out records and then trying again") (hash-table-delete! *runremote* run-id) (open-run-close tasks:server-force-clean-run-record tasks:open-db run-id (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat)) - (thread-sleep! 2) + ;; (thread-sleep! 2) (server:try-running run-id) (thread-sleep! 5) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1))))) (begin ;; no server registered - (thread-sleep! 2) + ;; (thread-sleep! 2) (server:try-running run-id) (thread-sleep! 5) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1))))))))) ;; keep this as a function to ease future Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -267,19 +267,19 @@ ;; connect ;; (define (http-transport:client-connect run-id iface port) (let* ((uri-dat (make-request method: 'POST uri: (uri-reference (conc "http://" iface ":" port "/ctrl")))) (uri-api-dat (make-request method: 'POST uri: (uri-reference (conc "http://" iface ":" port "/api")))) - (serverdat (list iface port uri-dat uri-api-dat)) - (login-res (rmt:login-no-auto-client-setup serverdat run-id))) - (hash-table-set! *runremote* run-id serverdat) ;; may or may not be good ... + (server-dat (list iface port uri-dat uri-api-dat)) + (login-res (rmt:login-no-auto-client-setup server-dat run-id))) + ;; (hash-table-set! *runremote* run-id serverdat) ;; may or may not be good ... (if (and (list? login-res) (car login-res)) (begin (debug:print-info 2 "Logged in and connected to " iface ":" port) - (hash-table-set! *runremote* run-id serverdat) - serverdat) + (hash-table-set! *runremote* run-id server-dat) + server-dat) (begin (debug:print-info 0 "ERROR: Failed to login or connect to " iface ":" port) #f)))) ;; run http-transport:keep-running in a parallel thread to monitor that the db is being Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -44,11 +44,11 @@ cinfo (let loop ((numtries 100)) (thread-sleep! 1) (let ((res (client:setup run-id))) (if res - (hash-table-ref/default *runremote* run-id #f) ;; client:setup filled this in (hopefully) + (hash-table-ref *runremote* run-id) ;; client:setup filled this in (hopefully) (if (> numtries 0) (loop (- numtries 1)) (begin (debug:print 0 "ERROR: 100 tries and no server, giving up") (exit 1))))))))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -107,29 +107,35 @@ ;; > file 2>&1 (define (server:try-running run-id) (let* ((rand-name (random 100)) (cmdln (conc (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest") - " -server - -run-id " run-id " name=" rand-name " > " *toppath* "/db/" run-id "-" rand-name ".log 2>&1 &"))) + " -server - -run-id " run-id " name=" rand-name " > " *toppath* "/db/" run-id + ".log 2>&1 &"))) + ;; ".log &" ))) (debug:print 0 "INFO: Starting server (" cmdln ") as none running ...") (push-directory *toppath*) (system cmdln) (pop-directory))) (define (server:check-if-running run-id) - (let loop ((server (open-run-close tasks:get-server tasks:open-db run-id)) - (trycount 0)) + (let loop ((server-info (open-run-close tasks:get-server tasks:open-db run-id)) + (trycount 0)) (thread-sleep! 2) - (if server + (if server-info ;; note: client:start will set *runremote*. this needs to be changed ;; also, client:start will login to the server, also need to change that. ;; ;; client:start returns #t if login was successful. ;; - (let ((res (client:start run-id server))) + (let ((res (http-transport:client-connect + run-id + (tasks:hostinfo-get-interface server-info) + (tasks:hostinfo-get-port server-info)))) ;; if the server didn't respond we must remove the record (if res res (begin + (debug:print 0 "WARNING: running server not reachable, removing record: " server-info) (open-run-close tasks:server-force-clean-running-records-for-run-id tasks:open-db run-id) res))) #f))) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -95,11 +95,11 @@ (define (tasks:server-lock-slot mdb run-id) (tasks:server-clean-out-old-records-for-run-id mdb run-id) (if (< (tasks:num-in-available-state mdb run-id) 4) (begin (tasks:server-set-available mdb run-id) - (thread-sleep! 2) ;; Try removing this. It may not be needed. + (thread-sleep! 0.2) ;; Try removing this. It may not be needed. (tasks:server-am-i-the-server? mdb run-id)) #f)) ;; register that this server may come online (first to register goes though with the process) (define (tasks:server-set-available mdb run-id) @@ -128,11 +128,11 @@ "SELECT count(id) FROM servers WHERE run_id=?;" run-id) res)) (define (tasks:server-clean-out-old-records-for-run-id mdb run-id) - (sqlite3:execute mdb "DELETE FROM servers WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 10 AND run_id=?;" run-id)) + (sqlite3:execute mdb "DELETE FROM servers WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 100 AND run_id=?;" run-id)) (define (tasks:server-force-clean-running-records-for-run-id mdb run-id) (sqlite3:execute mdb "DELETE FROM servers WHERE state = 'running' AND run_id=?;" run-id)) (define (tasks:server-force-clean-run-record mdb run-id iface port) @@ -184,16 +184,19 @@ (header (db:get-header all)) (id (db:get-value-by-header first header "id")) (hostname (db:get-value-by-header first header "hostname")) (pid (db:get-value-by-header first header "pid")) (priority (db:get-value-by-header first header "priority"))) - (debug:print 0 "INFO: am-i-the-server got record " first) ;; for now a basic check. add tiebreaking by priority later - (if (and (equal? hostname (get-host-name)) - (equal? pid (current-process-id))) - id - #f))) + (let* ((my-pid (current-process-id)) + (res (if (and (equal? hostname (get-host-name)) + (equal? pid my-pid)) + id + #f))) + (debug:print 0 "INFO: am-i-the-server got record " first ", my-pid: " my-pid ", pid: " pid ", result: " res) + res))) + ;; Use: (db:get-value-by-header (car (db:get-rows dat)) (db:get-header dat) "fieldname") ;; to extract info from the structure returned ;; (define (tasks:server-get-servers-vying-for-run-id mdb run-id) Index: tests/fullrun/megatest.config ================================================================== --- tests/fullrun/megatest.config +++ tests/fullrun/megatest.config @@ -77,10 +77,11 @@ status pass fail n/a 0 1 running - 2 # These are set before all tests, override them # in the testconfig [pre-launch-env-overrides] section [env-override] + # This variable is honored by the loadrunner script. The value is in percent # a value of 200 will stop new jobs from starting. MAX_ALLOWED_LOAD 200 # MT_XTERM_CMD overrides the terminal command @@ -103,14 +104,10 @@ WRAPPEDVAR This var should have the work blah thrice: \ blah \ blah -# Set MAX_ALLOWED_LOAD for nbload. 150 percent is a good value. - -MAX_ALLOWED_LOAD 150 - # XTERM [system xterm] # RUNDEAD [system exit 56] [server]