Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -54,26 +54,24 @@ ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; (define (client:setup run-id #!key (remaining-tries 10) (failed-connects 0)) - (debug:print 0 "INFO: client:setup remaining-tries=" remaining-tries) (if (<= remaining-tries 0) (begin (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id) (exit 1)) (let ((host-info (hash-table-ref/default *runremote* run-id #f))) (debug:print-info 0 "client:setup host-info=" host-info ", remaining-tries=" remaining-tries) + (thread-sleep! 1) ;; try to avoid race conditons (if host-info (let* ((iface (car host-info)) (port (cadr host-info)) (start-res (http-transport:client-connect iface port)) ;; (ping-res (server:ping-server run-id iface port)) (ping-res (rmt:login-no-auto-client-setup start-res run-id))) (if ping-res ;; sucessful login? - (begin - (hash-table-set! *runremote* run-id start-res) start-res) ;; return the server info (if (member remaining-tries '(3 4 6)) (begin ;; login failed (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" host-info) (hash-table-delete! *runremote* run-id) @@ -88,33 +86,29 @@ (begin (debug:print 25 "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info) (thread-sleep! 5) (client:setup run-id remaining-tries: (- remaining-tries 1)))))) ;; YUK: rename server-dat here - (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id))) + (if new-dat + new-dat (debug:print-info 0 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if server-dat (let* ((iface (tasks:hostinfo-get-interface server-dat)) (port (tasks:hostinfo-get-port server-dat)) (start-res (http-transport:client-connect iface port)) ;; (ping-res (server:ping-server run-id iface port)) (ping-res (rmt:login-no-auto-client-setup start-res run-id))) - (if start-res - (begin - (hash-table-set! *runremote* run-id start-res) - start-res) (if (member remaining-tries '(2 5)) (begin ;; login failed (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) + ;;(debug:print 0 "INFO: login failed in client:setup with no existing server-dat: " server-dat ", new-dat: " new-dat ", and server-info: " server-info ", cleaning out records and then trying again") (hash-table-delete! *runremote* run-id) (open-run-close tasks:server-force-clean-run-record tasks:open-db run-id (tasks:hostinfo-get-interface server-dat) (tasks:hostinfo-get-port server-dat) " client:setup (server-dat = #t)") - (thread-sleep! 2) (server:try-running run-id) (thread-sleep! 10) ;; give server a little time to start up (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1))) (begin (debug:print 25 "INFO: client:setup start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) @@ -124,11 +118,10 @@ (if (eq? remaining-tries 2) (begin ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)") (client:setup run-id remaining-tries: 10)) (begin - (thread-sleep! 2) (debug:print 25 "INFO: client:setup start-res (not defined here), run-id=" run-id ", server-dat=" server-dat) (if (< (open-run-close tasks:num-in-available-state tasks:open-db run-id) 3) (begin ;; (open-run-close tasks:server-clean-out-old-records-for-run-id tasks:open-db run-id " client:setup (server-dat=#f)") (server:try-running run-id))) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -282,12 +282,12 @@ ;; (uri-api-dat (make-request method: 'GET uri: (uri-reference (conc "http://" iface ":" port "/api")))) (server-dat (list iface port uri-dat uri-api-dat api-url))) ;; (login-res (server:ping-server run-id server-dat))) ;; login-no-auto-client-setup server-dat run-id))) server-dat)) ;; (if (and (list? login-res) -;; (car login-res)) -;; (begin + (hash-table-set! *runremote* run-id server-dat) + server-dat) ;; (hash-table-set! *runremote* run-id server-dat) ;; (debug:print-info 2 "Logged in and connected to " iface ":" port) ;; (hash-table-set! *runremote* run-id server-dat) ;; server-dat) ;; (begin Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -43,11 +43,11 @@ (if cinfo cinfo (let loop ((numtries 100)) (let ((res (client:setup run-id))) (if res - (hash-table-ref/default *runremote* run-id #f) ;; client:setup filled this in (hopefully) + (hash-table-ref *runremote* run-id) ;; client:setup filled this in (hopefully) (if (> numtries 0) (begin (thread-sleep! 10) (loop (- numtries 1))) (begin Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -109,19 +109,22 @@ (if (eq? run-id 0) (server:run run-id) (rmt:start-server run-id))) (define (server:check-if-running run-id) - (let loop ((server (open-run-close tasks:get-server tasks:open-db run-id)) - (trycount 0)) - (if server + (let loop ((server-info (open-run-close tasks:get-server tasks:open-db run-id)) + (trycount 0)) + (if server-info ;; note: client:start will set *runremote*. this needs to be changed ;; also, client:start will login to the server, also need to change that. ;; ;; client:start returns #t if login was successful. ;; (let ((res (server:ping-server run-id (vector-ref server 1)(vector-ref server 0)))) + run-id + (tasks:hostinfo-get-interface server-info) + (tasks:hostinfo-get-port server-info)))) ;; if the server didn't respond we must remove the record (if res #t (begin (open-run-close tasks:server-force-clean-running-records-for-run-id tasks:open-db run-id Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -95,11 +95,11 @@ (define (tasks:server-lock-slot mdb run-id) (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") (if (< (tasks:num-in-available-state mdb run-id) 4) (begin (tasks:server-set-available mdb run-id) - (thread-sleep! 2) ;; Try removing this. It may not be needed. + (thread-sleep! 0.2) ;; Try removing this. It may not be needed. (tasks:server-am-i-the-server? mdb run-id)) #f)) ;; register that this server may come online (first to register goes though with the process) (define (tasks:server-set-available mdb run-id) @@ -205,16 +205,19 @@ (header (db:get-header all)) (id (db:get-value-by-header first header "id")) (hostname (db:get-value-by-header first header "hostname")) (pid (db:get-value-by-header first header "pid")) (priority (db:get-value-by-header first header "priority"))) - (debug:print 0 "INFO: am-i-the-server got record " first) ;; for now a basic check. add tiebreaking by priority later - (if (and (equal? hostname (get-host-name)) - (equal? pid (current-process-id))) - id - #f))) + (let* ((my-pid (current-process-id)) + (res (if (and (equal? hostname (get-host-name)) + (equal? pid my-pid)) + id + #f))) + (debug:print 0 "INFO: am-i-the-server got record " first ", my-pid: " my-pid ", pid: " pid ", result: " res) + res))) + ;; Use: (db:get-value-by-header (car (db:get-rows dat)) (db:get-header dat) "fieldname") ;; to extract info from the structure returned ;; (define (tasks:server-get-servers-vying-for-run-id mdb run-id)