Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -50,10 +50,13 @@ ;; 1. We are a test manager and we received *transport-type* and *runremote* via cmdline ;; 2. We are a run tests, list runs or other interactive process and we must figure out ;; *transport-type* and *runremote* from the monitor.db ;; ;; client:setup +;; +;; lookup_server, +;; (define (client:setup run-id #!key (remaining-tries 3)) (if (not *toppath*) (if (not (setup-for-run)) (begin (debug:print 0 "ERROR: failed to find megatest.config, exiting") Index: docs/manual/server.dot ================================================================== --- docs/manual/server.dot +++ docs/manual/server.dot @@ -46,12 +46,15 @@ remove_server_record -> set_available; set_available -> avail_delay [label="delay 3s"]; avail_delay -> "first_in_queue?"; "first_in_queue?" -> set_running [label=yes]; - set_running -> handle_requests; - "first_in_queue?" -> "server_running?" [label=no]; + set_running -> get_next_port -> handle_requests; + "first_in_queue?" -> "dead_entry_in_queue?" [label=no]; + "dead_entry_in_queue?" -> "server_running?" [label=no]; + "dead_entry_in_queue?" -> "remove_dead_entries" [label=yes]; + remove_dead_entries -> "server_running?"; handle_requests -> start_shutdown [label="no traffic"]; handle_requests -> shutdown_request; start_shutdown -> shutdown_delay; shutdown_request -> shutdown_delay; Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -154,10 +154,13 @@ (print-error-message exn) (if (< portnum 9000) (begin (debug:print 0 "WARNING: failed to start on portnum: " portnum ", trying next port") (thread-sleep! 0.1) + + ;; get_next_port goes here + (http-transport:try-start-server ipaddrstr (+ portnum 1) server-id)) (print "ERROR: Tried and tried but could not start the server"))) ;; any error in following steps will result in a retry (set! *server-info* (list ipaddrstr portnum)) (open-run-close tasks:server-set-interface-port @@ -371,10 +374,13 @@ (thread-join! th1) (thread-terminate! th2) (debug:print-info 11 "got res=" res) res))))) +;; +;; connect +;; (define (http-transport:client-connect run-id iface port) (let* ((login-res #f) (uri-dat (make-request method: 'POST uri: (uri-reference (conc "http://" iface ":" port "/ctrl")))) (uri-api-dat (make-request method: 'POST uri: (uri-reference (conc "http://" iface ":" port "/api")))) (serverdat (list iface port uri-dat uri-api-dat))) @@ -421,10 +427,13 @@ (if (and (string? tmo) (string->number tmo)) (* 60 60 (string->number tmo)) ;; default to three days (* 3 24 60 60))))) + ;; + ;; set_running + ;; (tasks:server-set-state! tdb server-id "running") (let loop ((count 0)) ;; Use this opportunity to sync the inmemdb to db (let ((start-time (current-milliseconds)) (sync-time #f) @@ -455,20 +464,27 @@ (set! iface (car sdat)) (set! port (cadr sdat)))) ;; NOTE: Get rid of this mechanism! It really is not needed... ;; (open-run-close tasks:server-update-heartbeat tasks:open-db spid) - (tasks:server-update-heartbeat tdb server-id) + + ;; + ;; NOT USED ANY MORE + ;; + ;; (tasks:server-update-heartbeat tdb server-id) ;; (if ;; (or (> numrunning 0) ;; stay alive for two days after last access ;; Transfer *last-db-access* to last-access to use in checking that we are still alive (mutex-lock! *heartbeat-mutex*) (set! last-access *last-db-access*) (mutex-unlock! *heartbeat-mutex*) ;; (debug:print 11 "last-access=" last-access ", server-timeout=" server-timeout) + ;; + ;; no_traffic + ;; (if (and *server-run* (> (+ last-access server-timeout) (current-seconds))) (begin (debug:print-info 0 "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) @@ -476,10 +492,13 @@ (begin (debug:print-info 0 "Starting to shutdown the server.") ;; need to delete only *my* server entry (future use) (set! *time-to-exit* #t) (if *inmemdb* (db:sync-touched *inmemdb* force-sync: #t)) + ;; + ;; start_shutdown + ;; ( tasks:server-set-state! tdb server-id "shutting-down") (thread-sleep! 5) (debug:print-info 0 "Max cached queries was " *max-cache-size*) (debug:print-info 0 "Number of cached writes " *number-of-writes*) (debug:print-info 0 "Average cached write time " @@ -498,10 +517,13 @@ (debug:print-info 0 "Server shutdown complete. Exiting") (tasks:server-delete-record! tdb server-id) (exit)))))) ;; all routes though here end in exit ... +;; +;; start_server? +;; (define (http-transport:launch run-id) (set! *run-id* run-id) (if (not *toppath*) (if (not (setup-for-run)) (begin @@ -508,12 +530,18 @@ (debug:print 0 "ERROR: cannot find megatest.config, exiting") (exit)))) (debug:print-info 2 "Starting the standalone server") (if (args:get-arg "-daemonize") (daemon:ize)) + ;; + ;; set_available + ;; (let ((server-id (open-run-close tasks:server-lock-slot tasks:open-db run-id))) (if (not server-id) + ;; + ;; remove_dead_entry? + ;; (begin (debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") (open-run-close tasks:server-delete-records-for-this-pid tasks:open-db)) (if *toppath* (let* ((th2 (make-thread (lambda () Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -42,10 +42,13 @@ ;; Call this to start the actual server ;; ;; all routes though here end in exit ... +;; +;; start_server +;; (define (server:launch transport run-id) (if (not *toppath*) (if (not (setup-for-run)) (begin (debug:print 0 "ERROR: cannot find megatest.config, exiting") Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -103,11 +103,11 @@ ;; register that this server may come online (first to register goes though with the process) (define (tasks:server-set-available mdb run-id) (sqlite3:execute mdb "INSERT INTO servers (pid,hostname,port,pubport,start_time, priority,state,mt_version,heartbeat, interface,transport,run_id) - VALUES(?, ?, ?, ?, strftime('%s','now'), ?, ?, ?, strftime('%s','now'),?, ?, ?);" + VALUES(?, ?, ?, ?, strftime('%s','now'), ?, ?, ?,-1,?, ?, ?);" (current-process-id) ;; pid (get-host-name) ;; hostname -1 ;; port -1 ;; pubport (random 1000) ;; priority (used a tiebreaker on get-available)