Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -264,10 +264,12 @@ (thread-sleep! 45) (if (not res) (begin (debug:print 0 "WARNING: communication with the server timed out.") (mutex-unlock! *http-mutex*) + ;; Maybe the server died? Try starting it up. + (server:ensure-running run-id) (http-transport:client-send-receive serverdat msg numretries: (- numretries 1)) (if (< numretries 3) ;; on last try just exit (begin (debug:print 0 "ERROR: communication with the server timed out. Giving up.") (exit 1))))))) @@ -285,10 +287,17 @@ final))))))) ;; Send "cmd" with json payload "params" to serverdat and receive result ;; (define (http-transport:client-api-send-receive run-id serverdat cmd params #!key (numretries 30)) + (if (not serverdat) ;; get #f, something went wrong. try starting the server again and reconnecting + (begin + ;; try to restart the server and then reconnect + (server:ensure-running run-id) + (hash-table-delete! *runremote* run-id) + (client:setup run-id) + (set! serverdat (hash-table-ref/default *runremote* run-id #f)))) (let* ((fullurl (if (list? serverdat) (cadddr serverdat) ;; this is the uri for /api (begin (debug:print 0 "FATAL ERROR: http-transport:client-send-receive called with no server info") (exit 1)))) @@ -298,12 +307,15 @@ (begin ;; TODO: Send this output to a log file so it isn't lost when running as daemon (if (> numretries 0) ;; on the zeroeth retry do not print the error message - this allows the call to be used as a ping (no junk on output). (begin - (print "ERROR IN http-transport:client-send-receive " ((condition-property-accessor 'exn 'message) exn)) - (if (> (random 100) 80)(server:ensure-running run-id)) ;; every so often try starting a server + (print "ERROR IN http-transport:client-api-send-receive " ((condition-property-accessor 'exn 'message) exn)) + ;; try to restart the server and then reconnect + (server:ensure-running run-id) + (hash-table-delete! *runremote* run-id) + (client:setup run-id) (http-transport:client-api-send-receive run-id serverdat cmd params numretries: (- numretries 1))) #f)) (begin (debug:print-info 11 "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n") ;; set up the http-client here @@ -350,11 +362,11 @@ (thread-sleep! 45) (if (not res) (begin (debug:print 0 "WARNING: communication with the server timed out.") (mutex-unlock! *http-mutex*) - (http-transport:client-api-send-receive serverdat cmd params numretries: (- numretries 1)) + (http-transport:client-api-send-receive run-id serverdat cmd params numretries: (- numretries 1)) (if (< numretries 3) ;; on last try just exit (begin (debug:print 0 "ERROR: communication with the server timed out. Giving up.") (exit 1))))))) (th1 (make-thread send-recieve "with-input-from-request")) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -339,11 +339,13 @@ ;; (let ((tl (setup-for-run)) (run-id (and (args:get-arg "-run-id") (string->number (args:get-arg "-run-id"))))) (if run-id - (server:launch run-id) + (begin + (server:launch run-id) + (set! *didsomething* #t)) (debug:print 0 "ERROR: server requires run-id be specified with -run-id"))) ;; Not a server? This section will decide how to communicate ;; ;; Setup client for all expect listed here Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -50,10 +50,15 @@ (define (server:launch run-id) (if (server:check-if-running run-id) ;; a server is already running (exit) (http-transport:launch run-id))) + +(define (server:launch-no-exit run-id) + (if (server:check-if-running run-id) + #t ;; if running + (http-transport:launch run-id))) ;;====================================================================== ;; Q U E U E M A N A G E M E N T ;;====================================================================== Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -91,17 +91,24 @@ (define (tasks:hostinfo-get-transport vec) (vector-ref vec 4)) (define (tasks:hostinfo-get-pid vec) (vector-ref vec 5)) (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) (define (tasks:server-lock-slot mdb run-id) - (let ((res '()) - (best #f)) - (tasks:server-clean-out-old-records-for-run-id mdb run-id) - (if (tasks:less-than-two-available mdb run-id) - (tasks:server-set-available mdb run-id)) - (thread-sleep! 2) ;; Try removing this. It may not be needed. - (tasks:server-am-i-the-server? mdb run-id))) + (let loop ((res #f) + (num-tries 0)) + (if (and (< num-tries 5) + (not res)) + (begin + (tasks:server-clean-out-old-records-for-run-id mdb run-id) + (if (< (tasks:num-in-available-state mdb run-id) 4) + (tasks:server-set-available mdb run-id)) + (thread-sleep! 2) ;; Try removing this. It may not be needed. + (loop (tasks:server-am-i-the-server? mdb run-id) + (+ num-tries 1))) + res))) + + ;; register that this server may come online (first to register goes though with the process) (define (tasks:server-set-available mdb run-id) (sqlite3:execute mdb @@ -117,22 +124,22 @@ -1 ;; interface "http" ;; transport run-id )) -(define (tasks:less-than-two-available mdb run-id) +(define (tasks:num-in-available-state mdb run-id) (let ((res 0)) (sqlite3:for-each-row (lambda (num-in-queue) (set! res num-in-queue)) mdb "SELECT count(id) FROM servers WHERE run_id=?;" run-id) - (< res 3))) + res)) (define (tasks:server-clean-out-old-records-for-run-id mdb run-id) - (sqlite3:execute mdb "DELETE FROM servers WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 30 AND run_id=?;" run-id) + (sqlite3:execute mdb "DELETE FROM servers WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 10 AND run_id=?;" run-id) (if (server:check-if-running run-id) (sqlite3:execute mdb "DELETE FROM servers WHERE run_id=?;" run-id))) (define (tasks:server-force-clean-running-records-for-run-id mdb run-id) (sqlite3:execute mdb "DELETE FROM servers WHERE state = 'running' AND run_id=?;" run-id))