Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -34,11 +34,12 @@ treemod.scm all : $(PREFIX)/bin/.$(ARCHSTR) mtest ndboard dboard mtut tcmt # dbmod.import.o is just a hack here -mofiles/dbfile.o : mofiles/debugprint.o dbmod.import.o +mofiles/dbfile.o : mofiles/debugprint.o mofiles/commonmod.o dbmod.import.o +db.o : dbmod.import.o mofiles/debugprint.o : mofiles/mtargs.o # ftail.scm rmtmod.scm commonmod.scm removed # MSRCFILES = ducttape-lib.scm pkts.scm stml2.scm cookie.scm mutils.scm \ # mtargs.scm commonmod.scm dbmod.scm adjutant.scm ulex.scm \ @@ -87,11 +88,11 @@ # ARCHSTR=$(shell bash -c "echo \$$MACHTYPE") PNGFILES = $(shell cd docs/manual;ls *png) -mtest: $(OFILES) readline-fix.scm megatest.o $(MOFILES) $(MOIMPFILES) megatest-version.scm +mtest: $(OFILES) readline-fix.scm $(MOFILES) $(MOIMPFILES) megatest.o megatest-version.scm csc $(CSCOPTS) $(OFILES) $(MOFILES) $(MOIMPFILES) megatest.o -o mtest showmtesthash: @echo $(MTESTHASH) @@ -136,11 +137,11 @@ ezsteps.o # mofiles/rmtmod.o \ # mofiles/commonmod.o \ -tcmt : $(TCMTOBJS) tcmt.scm megatest-version.scm +tcmt : $(TCMTOBJS) tcmt.scm megatest-version.scm $(MOFILES) $(MOIMPFILES) csc $(CSCOPTS) $(TCMTOBJS) $(MOFILES) $(MOIMPFILES) tcmt.scm -o tcmt # install documentation to $(PREFIX)/docs # DOES NOT REBUILD DOCS # @@ -173,17 +174,20 @@ monitor.o dashboard.o archive.o megatest.o : db_records.scm megatest-fossil-hash.scm tests.o runs.o dashboard.o dashboard-tests.o dashboard-context-menu.o dashboard-main.o : run_records.scm db.o ezsteps.o keys.o launch.o megatest.o monitor.o runs-for-ref.o runs.o tests.o : key_records.scm -db.o api.o : mofiles/dbmod.o mofiles/dbfile.o mofiles/debugprint.o mofiles/commonmod.o +db.o api.o : mofiles/dbmod.o mofiles/dbfile.o mofiles/debugprint.o mofiles/commonmod.o dbmod.import.o tests.o tasks.o dashboard-tasks.o : task_records.scm runs.o : test_records.scm -megatest.o : megatest-fossil-hash.scm megatest-version.scm +mofiles-made : $(MOFILES) + make $(MOIMPFILES) + +megatest.o : mofiles-made megatest-fossil-hash.scm megatest-version.scm $(MOFILES) $(MOIMPFILES) rmt.scm client.scm common.scm configf.scm dashboard-guimonitor.scm dashboard-tests.scm dashboard.scm db.scm dcommon.scm ezsteps.scm http-transport.scm index-tree.scm items.scm keys.scm launch.scm megatest.scm monitor.scm mt.scm newdashboard.scm runconfig.scm runs.scm server.scm tdb.scm tests.scm tree.scm : common_records.scm megatest-version.scm common_records.scm : altdb.scm Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -429,8 +429,8 @@ ;; (boolean? res)) ;; res ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) (db:obj->string res transport: 'http))) (begin - (debug:print 0 *default-log-port* "Server refused to process request. Sever id mismatch. recived " key " expected: " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params) + (debug:print 0 *default-log-port* "Server refused to process request. Server id mismatch. recived " key " expected: " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params) (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http))))) Index: archive.scm ================================================================== --- archive.scm +++ archive.scm @@ -346,11 +346,11 @@ (rsync-exe (or (configf:lookup *configdat* "archive" "rsync") "rsync")) (print-prefix "Running: ") (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target-patt run-patt "megatest-db")) (archive-dir (if archive-info (cdr archive-info) #f)) (archive-id (if archive-info (car archive-info) -1)) - (home-host (common:get-homehost)) + (home-host (server:choose-server *toppath* 'homehost)) (archive-time (seconds->std-time-str (current-seconds))) (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time)) (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db")) (dbfile (conc archive-staging-db "/megatest.db"))) (create-directory archive-staging-db #t) Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -44,24 +44,12 @@ #;(define (client:logout serverdat) (let ((ok (and (socket? serverdat) (cdb:logout serverdat *toppath* (client:get-signature))))) ok)) -#;(define (client:connect iface port) - (http-transport:client-connect iface port) - #;(case (server:get-transport) - ((rpc) (rpc:client-connect iface port)) - ((http) (http:client-connect iface port)) - ((zmq) (zmq:client-connect iface port)) - (else (rpc:client-connect iface port)))) - (define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0)) - (client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects) - #;(case (server:get-transport) - ((rpc) (rpc-transport:client-setup remaining-tries: remaining-tries failed-connects: failed-connects)) ;;(client:setup-rpc run-id)) - ((http)(client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects)) - (else (rpc-transport:client-setup remaining-tries: remaining-tries failed-connects: failed-connects)))) ;; (client:setup-rpc run-id)))) + (client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects)) ;; Do all the connection work, look up the transport type and set up the ;; connection if required. ;; ;; There are two scenarios. @@ -89,53 +77,52 @@ (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; - (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) + (let* ((server-dat (server:choose-server areapath 'best)) (runremote (or area-dat *runremote*))) (if (not server-dat) ;; no server found (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)) - (let ((host (cadr server-dat)) - (port (caddr server-dat)) - (server-id (caddr (cddr server-dat)))) - (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if (and (not area-dat) - (not *runremote*)) - (begin - (set! *runremote* (make-remote)) - (let* ((server-info (remote-server-info *runremote*))) - (if server-info - (begin - (remote-server-url-set! *runremote* (server:record->url server-info)) - (remote-server-id-set! *runremote* (server:record->id server-info))))))) - (if (and host port server-id) - (let* ((start-res (case *transport-type* - ((http)(http-transport:client-connect host port server-id)))) - (ping-res (case *transport-type* - ((http)(rmt:login-no-auto-client-setup start-res))))) - (if (and start-res - ping-res) - (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago - (if runremote - (begin - (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res) - (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) - start-res) - (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))) - (begin ;; login failed but have a server record, clean out the record and try again - (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 - (case *transport-type* - ((http)(http-transport:close-connections))) - (if *runremote* - (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) - ) - (thread-sleep! 1) - (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)) - ))) - (begin ;; no server registered - ;; (server:kind-run areapath) - (server:start-and-wait areapath) - (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) - (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. - (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))))))) + (match server-dat + ((host port start-time server-id pid) + (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) + (if (and (not area-dat) + (not *runremote*)) + (begin + (set! *runremote* (make-remote)) + (let* ((server-info (remote-server-info *runremote*))) + (if server-info + (begin + (remote-server-url-set! *runremote* (server:record->url server-info)) + (remote-server-id-set! *runremote* (server:record->id server-info))))))) + (if (and host port server-id) + (let* ((start-res (http-transport:client-connect host port server-id)) + (ping-res (rmt:login-no-auto-client-setup start-res))) + (if (and start-res + ping-res) + (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago + (if runremote + (begin + (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res) + (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) + start-res) + (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))) + (begin ;; login failed but have a server record, clean out the record and try again + (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 + (case *transport-type* + ((http)(http-transport:close-connections))) + (if *runremote* + (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) + ) + (thread-sleep! 1) + (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)) + ))) + (begin ;; no server registered + ;; (server:kind-run areapath) + (server:start-and-wait areapath) + (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) + (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. + (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))) + (else + (debug:print 0 *default-log-port* "ERROR: malformed server-dat="server-dat))))))) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -142,10 +142,11 @@ (define *passnum* 0) ;; when running track calls to run-tests or similar ;; (define *alt-log-file* #f) ;; used by -log ;; (define *common:denoise* (make-hash-table)) ;; for low noise printing (define *default-log-port* (current-error-port)) (define *time-zero* (current-seconds)) ;; for the watchdog +(define *on-exit-procs* '()) ;; add procs to this list to be executed on exit (define *default-area-tag* "local") ;; DATABASE ;; (define *dbstruct-dbs* #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this. ;; db stats @@ -315,11 +316,14 @@ (define (common:logpro-exit-code->test-status exit-code) (status-sym->string (common:logpro-exit-code->status-sym exit-code))) (defstruct remote - (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) + (hh-dat (let ((res (or (server:choose-server *toppath* 'homehost) + (cons #f #f)))) + (assert (pair? res)(conc "FATAL: hh-dat should be a pair, got "res)) + res)) (server-url #f) ;; (server:check-if-running *toppath*) #f)) (server-id #f) (server-info (if *toppath* (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (connect-time (current-seconds)) @@ -1305,72 +1309,10 @@ ;;====================================================================== ;; logic for getting homehost. Returns (host . at-home) ;; IF *toppath* is not set, wait up to five seconds trying every two seconds ;; (this is to accomodate the watchdog) ;; -(define (common:get-homehost #!key (trynum 5)) - ;; called often especially at start up. use mutex to eliminate collisions - (mutex-lock! *homehost-mutex*) - (cond - (*home-host* - (mutex-unlock! *homehost-mutex*) - *home-host*) - ((not *toppath*) - (mutex-unlock! *homehost-mutex*) - (launch:setup) ;; safely mutexed now - (if (> trynum 0) - (begin - (thread-sleep! 2) - (common:get-homehost trynum: (- trynum 1))) - #f)) - (else - (let* ((currhost (get-host-name)) - (bestadrs (server:get-best-guess-address currhost)) - ;; first look in config, then look in file .homehost, create it if not found - (homehost (or (configf:lookup *configdat* "server" "homehost" ) - (handle-exceptions - exn - (if (> trynum 0) - (let ((delay-time (* (- 5 trynum) 5))) - (mutex-unlock! *homehost-mutex*) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying " - delay-time " seconds and trying again, message: " ((condition-property-accessor 'exn 'message) exn) - ", exn=" exn) - (thread-sleep! delay-time) - (common:get-homehost trynum: (- trynum 1))) - (begin - (mutex-unlock! *homehost-mutex*) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time) - "] Failed to read .homehost file after trying five times. Giving up and exiting, message: " - ((condition-property-accessor 'exn 'message) exn)) - (exit 1))) - (let ((hhf (conc *toppath* "/.homehost"))) - (if (common:file-exists? hhf) - (with-input-from-file hhf read-line) - (if (file-write-access? *toppath*) - (begin - (with-output-to-file hhf - (lambda () - (print bestadrs))) - (begin - (mutex-unlock! *homehost-mutex*) - (car (common:get-homehost)))) - #f)))))) - (at-home (or (equal? homehost currhost) - (equal? homehost bestadrs)))) - (set! *home-host* (cons homehost at-home)) - (mutex-unlock! *homehost-mutex*) - *home-host*)))) - -;;====================================================================== -;; am I on the homehost? -;; -(define (common:on-homehost?) - (let ((hh (common:get-homehost))) - (if hh - (cdr hh) - #f))) ;;====================================================================== ;; do we honor the caches of the config files? ;; (define (common:use-cache?) @@ -2049,11 +1991,11 @@ (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) (define (common:wait-for-homehost-load maxnormload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f - (common:get-homehost))) + (server:choose-server *toppath* 'homehost))) (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxnormload msg hh))) (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) @@ -3344,11 +3286,11 @@ pktsdirs)) ;;====================================================================== ;; use-lt is use linktree "lt" link to find pkts dir (define (common:save-pkt pktalist-in mtconf use-lt #!key (add-only #f)) ;; add-only saves the pkt only if there is a parent already - (if (or add-only + (if (or (not add-only) (hash-table-exists? *pkts-info* 'last-parent)) (let* ((parent (hash-table-ref/default *pkts-info* 'last-parent #f)) (pktalist (if parent (cons `(parent . ,parent) pktalist-in) @@ -3359,10 +3301,11 @@ (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f) (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) (pktsdir (car pktsdirs))) ;; assume it is there (hash-table-set! *pkts-info* 'pkts-dir pktsdir) pktsdir)))) + (debug:print 0 *default-log-port* "pktsdir: "pktsdir) (handle-exceptions exn (debug:print-info 0 "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!! (if (not (file-exists? pktsdir)) (create-directory pktsdir #t)) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -3787,10 +3787,11 @@ "dashboard:runs-tab-updater")) ;;====================================================================== ;; The heavy lifting starts here ;;====================================================================== + (stop-the-train) (define (main) ;; (print "Starting dashboard main") @@ -3809,13 +3810,13 @@ (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") (exit 1) ) ) - (if (not (common:on-homehost?)) + #;(if (not (common:on-homehost?)) (begin - (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (common:get-homehost)) + (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (server:get-homehost)) (debug:print 0 *default-log-port* "It will be slower.") )) (if (and (common:file-exists? mtdb-path) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -477,11 +477,11 @@ (data-synced 0) ;; count of changed records (tmp-area (common:get-db-tmp-area)) (old2new (member 'old2new options)) (dejunk (member 'dejunk options)) (killservers (member 'killservers options)) - (servers (server:get-list *toppath*)) + (servers (server:choose-server *toppath* 'all-valid)) ;; (server:get-list *toppath*)) (src-area (if old2new *toppath* tmp-area)) (dest-area (if old2new tmp-area *toppath*)) (dbfiles (if old2new (glob (conc *toppath* "/.megatest/*.db")) (glob (conc tmp-area "/.megatest/*.db")))) (keys (db:get-keys dbstruct)) (sync-durations (make-hash-table))) @@ -4586,31 +4586,41 @@ )) (define (std-exit-procedure) ;;(common:telemetry-log-close) - (on-exit (lambda () 0)) + (on-exit (lambda () 0)) ;; why is this here? ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) (let ((no-hurry (if *time-to-exit* ;; hurry up #f (begin (set! *time-to-exit* #t) #t)))) (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.") - (if (and no-hurry (debug:debug-mode 18)) + (if (and no-hurry + (debug:debug-mode 18)) (rmt:print-db-stats)) (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds (if *dbstruct-dbs* (db:close-all *dbstruct-dbs*)) ;; one second allocated + (if (list? *on-exit-procs*) + (for-each + (lambda (proc) + (proc)) + *on-exit-procs*)) (if *task-db* (let ((db (cdr *task-db*))) (if (sqlite3:database? db) (begin (sqlite3:interrupt! db) (sqlite3:finalize! db #t) ;; (vector-set! *task-db* 0 #f) (set! *task-db* #f))))) - (http-client#close-all-connections!) + (if (and (not (args:get-arg "-server")) + *runremote*) + (begin + (debug:print-info 0 *default-log-port* "Closing all client connections...") + (http-client#close-all-connections!))) ;; (if (and *runremote* ;; (remote-conndat *runremote*)) ;; (begin ;; (http-client#close-all-connections!))) ;; for http-client (if (not (eq? *default-log-port* (current-error-port))) Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -980,14 +980,14 @@ ;;====================================================================== ;; call with dbinit=db:initialize-main-db ;; (define (db:open-db dbstruct run-id dbinit) - (mutex-lock! *db-open-mutex*) + ;; (mutex-lock! *db-open-mutex*) (let* ((dbdat (dbfile:open-db dbstruct run-id dbinit))) (set! *db-write-access* (not (dbr:dbdat-read-only dbdat))) - (mutex-unlock! *db-open-mutex*) + ;; (mutex-unlock! *db-open-mutex*) dbdat)) (define dbfile:db-init-proc (make-parameter #f)) ;; (db:with-db dbstruct run-id sqlite3:exec "select blah fgrom blaz;") @@ -1015,11 +1015,11 @@ (dbfile:print-err "INFO: "jfile" exists, delaying to reduce database load") (thread-sleep! 0.2))) (if (and use-mutex (common:low-noise-print 120 "over-50-parallel-api-requests")) (dbfile:print-err *api-process-request-count* " parallel api requests being processed in process " - (current-process-id) ", throttling access")) + (current-process-id))) ;; ", throttling access")) (condition-case (begin (if use-mutex (mutex-lock! *db-with-db-mutex*)) (let ((res (apply proc dbdat db params))) ;; the actual call is here. (if use-mutex (mutex-unlock! *db-with-db-mutex*)) Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -704,11 +704,11 @@ #:numlin-visible 5 )) (colnames (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId")) (updater (lambda () (if (dashboard:monitor-changed? commondat tabdat) - (let ((servers (server:get-list *toppath* limit: 10))) + (let ((servers (server:choose-server *toppath* 'all-valid))) ;; (server:get-list *toppath* limit: 10))) (iup:attribute-set! servers-matrix "NUMLIN" (length servers)) ;; (set! colnum 0) ;; (for-each (lambda (colname) ;; ;; (print "colnum: " colnum " colname: " colname) ;; (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) @@ -716,26 +716,27 @@ ;; colnames) (set! rownum 1) (for-each (lambda (server) (set! colnum 0) - (match-let (((mod-time host port start-time server-id pid) + ;; (("192.168.0.127" 60215 1669088591.0 "c85484f764df7a8550b0224409bd4bcd") + (match-let (((host port start-time server-id pid) server)) - (let* ((uptime (- (current-seconds) mod-time)) + (let* (;; (uptime (- (current-seconds) mod-time)) (runtime (if start-time - (- mod-time start-time) + (- (current-seconds) start-time) 0)) (vals (list "-" ;; (vector-ref server 0) ;; Id "-" ;; (vector-ref server 9) ;; MT-Ver pid ;; (vector-ref server 1) ;; Pid host ;; (vector-ref server 2) ;; Hostname (conc host ":" port) ;; (conc (vector-ref server 3) ":" (vector-ref server 4)) ;; IP:Port (seconds->hr-min-sec runtime) ;; (- (current-seconds) start-time)) ;; (vector-ref server 6))) - (cond - ((< uptime 5) "alive") - ((< uptime 16) "probably alive");; less than 15 seconds since mod, call it alive (vector-ref server 8) ;; State - (else "dead")) + "-" #;(cond + ((< uptime 5) "alive") + ((< uptime 16) "probably alive");; less than 15 seconds since mod, call it alive (vector-ref server 8) ;; State + (else "dead")) "-" ;; (vector-ref server 12) ;; RunId ))) (for-each (lambda (val) (let* ((row-col (conc rownum ":" colnum)) (curr-val (iup:attribute servers-matrix row-col))) @@ -745,11 +746,11 @@ (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum)))) (set! colnum (+ 1 colnum)))) vals) (set! rownum (+ rownum 1))) (iup:attribute-set! servers-matrix "REDRAW" "ALL"))) - (sort servers (lambda (a b)(> (car a)(car b)))))))))) + (sort servers (lambda (a b)(> (caddr a)(caddr b)))))))))) (set! colnum 0) (for-each (lambda (colname) (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum)) (set! colnum (+ colnum 1))) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -285,10 +285,11 @@ (begin (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") (debug:print 0 *default-log-port* " message: " msg ", exn=" exn) (debug:print 0 *default-log-port* " cmd: " cmd " params: " params " key:" (or server-id "thekey")) (debug:print 0 *default-log-port* " call-chain: " call-chain))) + ;; what if another thread is communicating ok? Can't happen due to mutex (set! *runremote* #f) (set! runremote #f) ;; (if runremote ;; (remote-conndat-set! runremote #f)) ;; Killing associated server to allow clean retry.") @@ -391,11 +392,12 @@ (debug:print-error 0 *default-log-port* "call to http-transport:server-dat-update-last-access with non-vector!!")))) ;; ;; connect ;; -(define (http-transport:client-connect iface port server-id) +(define (http-transport:client-connect iface port server-id) + (debug:print-info 0 *default-log-port* "Connecting to client at "iface":"port", with server-id "server-id) (let* ((api-url (conc "http://" iface ":" port "/api")) (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) (api-req (make-request method: 'POST uri: api-uri)) (server-dat (vector iface port api-uri api-url api-req (current-seconds) server-id))) server-dat)) @@ -409,11 +411,12 @@ (define (http-transport:keep-running) ;; if none running or if > 20 seconds since ;; server last used then start shutdown ;; This thread waits for the server to come alive (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server") - (let* ((sdat #f) + (let* ((servinfofile #f) + (sdat #f) (no-sync-db (db:open-no-sync-db)) (tmp-area (common:get-db-tmp-area)) (started-file (conc tmp-area "/.server-started")) (server-start-time (current-seconds)) (server-info (let loop ((start-time (current-seconds)) @@ -426,13 +429,30 @@ (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) (if (and sdat (not changed) (> (- (current-seconds) start-time) 2)) - (begin + (let* ((servinfodir (conc *toppath*"/.servinfo")) + (ipaddr (car sdat)) + (port (cadr sdat)) + (servinf (conc servinfodir"/"ipaddr":"port))) + (set! servinfofile servinf) + (if (not (file-exists? servinfodir)) + (create-directory servinfodir #t)) + (with-output-to-file servinf + (lambda () + (let* ((serv-id (server:mk-signature))) + (set! *server-id* serv-id) + (print "SERVER STARTED: "ipaddr":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)) + (print "started: "(seconds->year-week/day-time (current-seconds)))))) + (set! *on-exit-procs* (cons + (lambda () + (delete-file* servinf)) + *on-exit-procs*)) + ;; put data about this server into a simple flat file host.port (debug:print-info 0 *default-log-port* "Received server alive signature") - (common:save-pkt `((action . alive) + #;(common:save-pkt `((action . alive) (T . server) (pid . ,(current-process-id)) (ipaddr . ,(car sdat)) (port . ,(cadr sdat))) *configdat* #t) @@ -439,13 +459,16 @@ sdat) (begin (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) (sleep 4) (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes - (begin + (let* ((ipaddr (car sdat)) + (port (cadr sdat)) + (servinf (conc *toppath*"/.servinfo/"ipaddr":"port))) (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") - (common:save-pkt `((action . died) + ;; (delete-file* servinf) ;; handled by on-exit, can be removed + #;(common:save-pkt `((action . died) (T . server) (pid . ,(current-process-id)) (ipaddr . ,(car sdat)) (port . ,(cadr sdat)) (msg . "Transport died?")) @@ -475,29 +498,17 @@ (if (not server-going) ;; *dbstruct-dbs* (begin (debug:print 0 *default-log-port* "SERVER: dbprep") (set! *dbstruct-dbs* (db:setup #t)) ;; run-id)) FIXME!!! (set! server-going #t) - (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. - - ;; (thread-start! *watchdog*) - ) + (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version))) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. (if (and no-sync-db - (common:low-noise-print 5 "sync-all")) ;; cheesy way to reduce frequency of running sync :) + (common:low-noise-print 10 "sync-all")) ;; cheesy way to reduce frequency of running sync :) (begin - (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S")) - - ;; This is tougher than it seems - have to deal with multiple dbs - ;; (db:process-transaction-queue *dbstruct-dbs*) - - (db:all-db-sync *dbstruct-dbs*) - - ;; (db:do-sync no-sync-db) - ;; (db:run-lock-and-sync *no-sync-db*) - ) - ) - ) + (if (common:low-noise-print 120 "sync-all-print") + (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S"))) + (db:all-db-sync *dbstruct-dbs*)))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) @@ -518,11 +529,11 @@ (new-port (cadr sdat))) (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info") (set! iface new-iface) (set! port new-port) (if (not *server-id*) - (set! *server-id* (server:mk-signature))) + (set! *server-id* (server:mk-signature))) (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) (flush-output *default-log-port*))) ;; Transfer *db-last-access* to last-access to use in checking that we are still alive (mutex-lock! *heartbeat-mutex*) @@ -540,37 +551,32 @@ (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) (cond - #;((and *server-run* - (> (- (current-seconds) server-start-time) 420)) ;; let's try server replacement - ;; ((adj-proc-load . 0.056875) (adj-core-load . 0.11375) (1m-load . 0.91) (5m-load . 0.77) (15m-load . 1.0) (proc . 16) (core . 8) (phys . 1)) - (let* ((loaddat (common:get-normalized-cpu-load #f)) - (adj-proc-load (alist-ref 'adj-proc-load loaddat)) - (adj-core-load (alist-ref 'adj-core-load loaddat)) - (adj-load (max adj-proc-load adj-core-load))) - (if (< adj-load 2) ;; reduce chance of runaway - (server:run *toppath*)) - (db:all-db-sync *dbstruct-dbs*) - (thread-sleep! 30) - (http-transport:server-shutdown port))) ((and *server-run* (> (+ last-access server-timeout) (current-seconds))) (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn - (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk? exn=" exn) + (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on info file " servinfofile ". Are you out of space on that disk? exn=" exn) (if (and (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter - (not *server-overloaded*)) - (change-file-times server-log-file curr-time curr-time) - (if (common:low-noise-print 120 "start new server") - (server:kind-run *toppath*) ;; server:kind-run uses [servers] numservers - ))))) + (not *server-overloaded*) + (file-exists? servinfofile)) + (change-file-times servinfofile curr-time curr-time))) + (if (or (common:low-noise-print 120 "start new server") + (> *api-process-request-count* 50)) ;; if this server is kind of busy start up another + (begin + (debug:print-info 0 *default-log-port* "Server is busy, parallel-api-count "*api-process-request-count*", start another if possible...") + (server:kind-run *toppath*) + (if (> *api-process-request-count* 100) + (begin + (debug:print-info 0 *default-log-port* "Server is overloaded at parallel-api-count="*api-process-request-count*", removing "servinfofile) + (delete-file* servinfofile))))))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) @@ -600,14 +606,17 @@ ;; (/ *total-non-write-delay* ;; *number-non-write-queries*)) ;; " ms") (db:print-current-query-stats) - (common:save-pkt `((action . exit) + #;(common:save-pkt `((action . exit) (T . server) (pid . ,(current-process-id))) - *configdat* #t) + *configdat* #t) + + ;; remove .servinfo file(s) here + (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") (exit))) ;; all routes though here end in exit ... ;; @@ -640,14 +649,14 @@ #;(let* ((num-alive (server:get-num-alive (server:get-list *toppath*)))) (if (> num-alive 3) (begin (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up")) (exit)))) - (common:save-pkt `((action . start) - (T . server) - (pid . ,(current-process-id))) - *configdat* #t) + #;(common:save-pkt `((action . start) + (T . server) + (pid . ,(current-process-id))) + *configdat* #t) (let* ((th2 (make-thread (lambda () (debug:print-info 0 *default-log-port* "Server run thread started") (http-transport:run (if (args:get-arg "-server") (args:get-arg "-server") Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -1563,11 +1563,11 @@ (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) ;; (list 'transport (conc *transport-type*)) ;; (list 'serverinf *server-info*) - (list 'homehost (let* ((hhdat (common:get-homehost))) + #;(list 'homehost (let* ((hhdat (server:get-homehost))) (if hhdat (car hhdat) #f))) (list 'serverurl (if *runremote* (remote-server-url *runremote*) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -656,13 +656,13 @@ (if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun") (debug:print 0 *default-log-port* (string-intersperse (argv) " "))) ;; some switches imply homehost. Exit here if not on homehost ;; -(let ((homehost-required (list "-cleanup-db" "-server"))) +(let ((homehost-required (list "-cleanup-db"))) (if (apply args:any? homehost-required) - (if (not (common:on-homehost?)) + (if (not (server:choose-server *toppath* 'home?)) (for-each (lambda (switch) (if (args:get-arg switch) (begin (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch @@ -951,11 +951,11 @@ (if (or (args:get-arg "-list-servers") (args:get-arg "-kill-servers")) (let ((tl (launch:setup))) (if tl ;; all roads from here exit - (let* ((servers (server:get-list *toppath*)) + (let* ((servers (server:choose-server *toppath* 'all-valid)) ;; (server:get-list *toppath*)) (fmtstr "~33a~22a~20a~20a~8a\n")) (format #t fmtstr "ID" "host:port" "age (hms)" "Last mod" "State") (format #t fmtstr "==" "=========" "=========" "========" "=====") (for-each ;; ( mod-time host port start-time pid ) (lambda (server) @@ -2378,11 +2378,11 @@ (if (or (getenv "MT_RUNSCRIPT") (args:get-arg "-repl") (args:get-arg "-load")) (let* ((toppath (launch:setup)) (dbstructs (if (and toppath - (common:on-homehost?)) + (server:choose-server toppath 'home?)) (db:setup #t) #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f))) (if *toppath* (cond ((getenv "MT_RUNSCRIPT") Index: mtargs/mtargs.scm ================================================================== --- mtargs/mtargs.scm +++ mtargs/mtargs.scm @@ -57,11 +57,11 @@ (print help) (print "Usage: " (car (argv)) " ... ")) (exit 0)) ;; one-of args defined -(define (args:any-defined? . param) +(define (any-defined? . param) (let ((res #f)) (for-each (lambda (arg) (if (get-arg arg)(set! res #t))) param) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -52,10 +52,19 @@ (if cinfo cinfo (if (server:check-if-running areapath) (client:setup areapath) #f)))) + +(define (rmt:on-homehost? runremote) + (let* ((hh-dat (remote-hh-dat runremote))) + (if (pair? hh-dat) + (cdr hh-dat) + (begin + (debug:print-info 0 *default-log-port* "hh-dat="hh-dat) + #f)))) + ;;====================================================================== (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id @@ -116,11 +125,12 @@ ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; ;; DOT SET_HOMEHOST -> MUTEXLOCK; ;; ensure we have a homehost record (if (not (pair? (remote-hh-dat runremote))) ;; not on homehost (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little - (remote-hh-dat-set! runremote (common:get-homehost))) + (let ((hh-data (server:choose-server areapath 'homehost))) + (remote-hh-dat-set! runremote (or hh-data (cons #f #f))))) ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) (cond #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.") @@ -165,35 +175,40 @@ (remote-conndat runremote) (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on (+ (http-transport:server-dat-get-last-access (remote-conndat runremote)) (remote-server-timeout runremote)))) (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") - (http-transport:close-connections area-dat: runremote) (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. + (http-transport:close-connections area-dat: runremote) (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;;DOT CASE5 [label="local\nread"]; ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; ;;DOT CASE5 -> "rmt:open-qry-close-locally"; ;; on homehost and this is a read ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required - (cdr (remote-hh-dat runremote)) ;; on homehost + (rmt:on-homehost? runremote) (member cmd api:read-only-queries)) ;; this is a read (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") (rmt:open-qry-close-locally cmd 0 params)) ;;DOT CASE6 [label="init\nremote"]; ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6}; ;;DOT CASE6 -> "rmt:send-receive"; ;; on homehost and this is a write, we already have a server, but server has died - ((and (cdr (remote-hh-dat runremote)) ;; on homehost + + ;; reinstate this keep-alive section but inject a time condition into the (add ... + + #;((and (cdr (remote-hh-dat runremote)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url runremote) ;; have a server (not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. + (debug:print 0 *default-log-port* "WARNING: server appears to have died, trying to reconnect, case 6") + (http-transport:close-connections area-dat: runremote) ;; make sure to clean up (set! *runremote* (make-remote)) (let* ((server-info (remote-server-info *runremote*))) (if server-info (begin (remote-server-url-set! *runremote* (server:record->url server-info)) @@ -266,10 +281,32 @@ ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"]; ;;DOT CASE11 -> "RESULT" [label="call succeeded"]; ;; not on homehost, do server query (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid))))) ;;DOT } + +;; No Title +;; Error: (vector-ref) out of range +;; #(# (#("db.scm:3740: regex#regexp" #f #f) #("db.scm:3739: regex#string-substitute" #f #f) #("db.scm:3738: base64#base64-decode" #f #f) #("db.scm:3737: z3#z3:decode-buffer" #f #f) #("db.scm:3736: with-input-from-string" #f #f) #("db.scm:3741: s11n#deserialize" #f #f) #("api.scm:374: api:execute-requests" #f #f) #("api.scm:139: call-with-current-continuation" #f #f) #("api.scm:139: with-exception-handler" #f #f) #("api.scm:139: ##sys#call-with-values" #f #f) #("api.scm:158: string->symbol" #f #f) #("api.scm:160: current-milliseconds" #f #f) #("api.scm:161: dbr:dbstruct-read-only" #f #f) #("api.scm:139: k15" #f #f) #("api.scm:139: g19" #f #f) #("api.scm:142: get-call-chain" #f #f)) #("get-test-info-by-id" (1102 507299))) +;; 6 +;; +;; Call history: +;; +;; http-transport.scm:306: thread-terminate! +;; http-transport.scm:307: debug:print-info +;; common_records.scm:235: debug:debug-mode +;; rmt.scm:259: k587 +;; rmt.scm:259: g591 +;; rmt.scm:276: http-transport:server-dat-update-last-access +;; http-transport.scm:364: current-seconds +;; rmt.scm:282: debug:print-info +;; common_records.scm:235: debug:debug-mode +;; rmt.scm:283: mutex-unlock! +;; rmt.scm:287: extras-transport-succeded <-- +;; +-----------------------------------------------------------------------------+ +;; | Exit Status : 70 +;; ;; bunch of small functions factored out of send-receive to make debug easier ;; (define (extras-case-11 *default-log-port* runremote cmd params attemptnum rid) @@ -291,33 +328,10 @@ ((commfail)(vector #f "communications fail")) ((exn)(vector #f "other fail" (print-call-chain))))) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") (exit)))) - -;; No Title -;; Error: (vector-ref) out of range -;; #(# (#("db.scm:3740: regex#regexp" #f #f) #("db.scm:3739: regex#string-substitute" #f #f) #("db.scm:3738: base64#base64-decode" #f #f) #("db.scm:3737: z3#z3:decode-buffer" #f #f) #("db.scm:3736: with-input-from-string" #f #f) #("db.scm:3741: s11n#deserialize" #f #f) #("api.scm:374: api:execute-requests" #f #f) #("api.scm:139: call-with-current-continuation" #f #f) #("api.scm:139: with-exception-handler" #f #f) #("api.scm:139: ##sys#call-with-values" #f #f) #("api.scm:158: string->symbol" #f #f) #("api.scm:160: current-milliseconds" #f #f) #("api.scm:161: dbr:dbstruct-read-only" #f #f) #("api.scm:139: k15" #f #f) #("api.scm:139: g19" #f #f) #("api.scm:142: get-call-chain" #f #f)) #("get-test-info-by-id" (1102 507299))) -;; 6 -;; -;; Call history: -;; -;; http-transport.scm:306: thread-terminate! -;; http-transport.scm:307: debug:print-info -;; common_records.scm:235: debug:debug-mode -;; rmt.scm:259: k587 -;; rmt.scm:259: g591 -;; rmt.scm:276: http-transport:server-dat-update-last-access -;; http-transport.scm:364: current-seconds -;; rmt.scm:282: debug:print-info -;; common_records.scm:235: debug:debug-mode -;; rmt.scm:283: mutex-unlock! -;; rmt.scm:287: extras-transport-succeded <-- -;; +-----------------------------------------------------------------------------+ -;; | Exit Status : 70 -;; - (dat (if (and (vector? dat-in) ;; ... check it is a correct size (> (vector-length dat-in) 1)) dat-in (vector #f (conc "communications fail (type 2), dat-in=" dat-in)))) (success (if (vector? dat) (vector-ref dat 0) #f)) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -99,10 +99,22 @@ (with-output-to-string (lambda () (write (list (current-directory) (current-process-id) (argv))))))) + +(define (server:get-client-signature) + (if *my-client-signature* *my-client-signature* + (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic + (set! *my-client-signature* sig) + *my-client-signature*))) + +(define (server:get-server-id) + (if *server-id* *server-id* + (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic + (set! *server-id* sig) + *server-id*))) ;; When using zmq this would send the message back (two step process) ;; with spiffy or rpc this simply returns the return data to be returned ;; (define (server:reply return-addr query-sig success/fail result) @@ -121,25 +133,26 @@ ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area - (let* ((curr-host (get-host-name)) + (let* (;; (curr-host (get-host-name)) ;; (attempt-in-progress (server:start-attempted? areapath)) ;; (dot-server-url (server:check-if-running areapath)) - (curr-ip (server:get-best-guess-address curr-host)) - (curr-pid (current-process-id)) - (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) - (target-host (car homehost)) + ;; (curr-ip (server:get-best-guess-address curr-host)) + ;; (curr-pid (current-process-id)) + ;; (homehost (server:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) + ;; (target-host (car homehost)) (testsuite (common:get-testsuite-name)) (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") "")) (cmdln (conc (common:get-megatest-exe) - " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") - " -daemonize " - "") + " -server - ";; (or target-host "-") + (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") + " -daemonize " + "") ;; " -log " logfile " -m testsuite:" testsuite " " profile-mode )) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? @@ -148,134 +161,135 @@ (push-directory areapath) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) ;; host.domain.tld match host? - (if (and target-host - ;; look at target host, is it host.domain.tld or ip address and does it - ;; match current ip or hostname - (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) - (not (equal? curr-ip target-host))) - (begin - (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) - (setenv "TARGETHOST" target-host))) - + ;; (if (and target-host + ;; ;; look at target host, is it host.domain.tld or ip address and does it + ;; ;; match current ip or hostname + ;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) + ;; (not (equal? curr-ip target-host))) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) + ;; (setenv "TARGETHOST" target-host))) + ;; (setenv "TARGETHOST_LOGF" logfile) (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") - (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) + ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds server-id ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 (define (server:logf-get-start-info logf) - (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id - (dbprep-rx (regexp "^SERVER: dbprep")) - (dbprep-found 0)) + (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id + (dbprep-rx (regexp "^SERVER: dbprep")) + (dbprep-found 0) + (bad-dat (list #f #f #f #f #f))) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to get server info from " logf ", exn=" exn) - (list #f #f #f #f)) ;; no idea what went wrong, call it a bad server + bad-dat) ;; no idea what went wrong, call it a bad server (with-input-from-file logf (lambda () (let loop ((inl (read-line)) (lnum 0)) (if (not (eof-object? inl)) (let ((mlst (string-match server-rx inl)) - (dbprep (string-match dbprep-rx inl)) - ) - (if dbprep - (set! dbprep-found 1) - ) + (dbprep (string-match dbprep-rx inl))) + (if dbprep (set! dbprep-found 1)) (if (not mlst) (if (< lnum 500) ;; give up if more than 500 lines of server log read (loop (read-line)(+ lnum 1)) (begin (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) - (list #f #f #f #f))) - (let ((dat (cdr mlst))) - (list (car dat) ;; host - (string->number (cadr dat)) ;; port - (string->number (caddr dat)) - (cadr (cddr dat)))))) + bad-dat)) + (match mlst + ((_ host port start server-id pid) + (list host + (string->number port) + (string->number start) + server-id + (string->number pid))) + (else + (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) + bad-dat)))) (begin - (if dbprep-found + (if dbprep-found (begin (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) - (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting? - ) - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))) - ) - (list #f #f #f #f))))))))) - -;; get a list of servers from the log files, with all relevant data -;; ( mod-time host port start-time pid ) -;; -(define (server:get-list areapath #!key (limit #f)) - (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) - (day-seconds (* 24 60 60))) - ;; if the directory exists continue to get the list - ;; otherwise attempt to create the logs dir and then - ;; continue - (if (if (directory-exists? (conc areapath "/logs")) - '() - (if (file-write-access? areapath) - (begin - (condition-case - (create-directory (conc areapath "/logs") #t) - (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) - (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) - (directory-exists? (conc areapath "/logs"))) - '())) - - ;; Get the list of server logs. - (let* ( - ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. - ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) - (server-logs (glob (conc areapath "/logs/server-*-*.log"))) - (num-serv-logs (length server-logs))) - (if (or (null? server-logs) (= num-serv-logs 0)) - (let () - (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) - '() - ) - (let loop ((hed (string-chomp (car server-logs))) - (tal (cdr server-logs)) - (res '())) - (let* ((mod-time (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) - (current-seconds)) ;; 0 - (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted - (down-time (- (current-seconds) mod-time)) - (serv-dat (if (or (< num-serv-logs 10) - (< down-time 900)) ;; day-seconds)) - (server:logf-get-start-info hed) - '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at - (serv-rec (cons mod-time serv-dat)) - (fmatch (string-match fname-rx hed)) - (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) - (new-res (if (null? serv-dat) - res - (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let - (if (null? tal) - (if (and limit - (> (length new-res) limit)) - new-res ;; (take new-res limit) <= need intelligent sorting before this will work - new-res) - (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) - -(define (server:get-num-alive srvlst) + (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? + (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) + bad-dat)))))))) + +;; ;; get a list of servers from the log files, with all relevant data +;; ;; ( mod-time host port start-time pid ) +;; ;; +;; (define (server:get-list areapath #!key (limit #f)) +;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) +;; (day-seconds (* 24 60 60))) +;; ;; if the directory exists continue to get the list +;; ;; otherwise attempt to create the logs dir and then +;; ;; continue +;; (if (if (directory-exists? (conc areapath "/logs")) +;; '() +;; (if (file-write-access? areapath) +;; (begin +;; (condition-case +;; (create-directory (conc areapath "/logs") #t) +;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) +;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) +;; (directory-exists? (conc areapath "/logs"))) +;; '())) +;; +;; ;; Get the list of server logs. +;; (let* ( +;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. +;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) +;; (server-logs (glob (conc areapath "/logs/server-*-*.log"))) +;; (num-serv-logs (length server-logs))) +;; (if (or (null? server-logs) (= num-serv-logs 0)) +;; (let () +;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) +;; '() +;; ) +;; (let loop ((hed (string-chomp (car server-logs))) +;; (tal (cdr server-logs)) +;; (res '())) +;; (let* ((mod-time (handle-exceptions +;; exn +;; (begin +;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) +;; (current-seconds)) ;; 0 +;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted +;; (down-time (- (current-seconds) mod-time)) +;; (serv-dat (if (or (< num-serv-logs 10) +;; (< down-time 900)) ;; day-seconds)) +;; (server:logf-get-start-info hed) +;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at +;; (serv-rec (cons mod-time serv-dat)) +;; (fmatch (string-match fname-rx hed)) +;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) +;; (new-res (if (null? serv-dat) +;; res +;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let +;; (if (null? tal) +;; (if (and limit +;; (> (length new-res) limit)) +;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work +;; new-res) +;; (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) + +#;(define (server:get-num-alive srvlst) (let ((num-alive 0)) (for-each (lambda (server) (handle-exceptions exn @@ -289,70 +303,72 @@ 0))) (if (< uptime 5)(set! num-alive (+ num-alive 1))))))) srvlst) num-alive)) -;; given a list of servers get a list of valid servers, i.e. at least -;; 10 seconds old, has started and is less than 1 hour old and is -;; active (i.e. mod-time < 10 seconds -;; -;; mod-time host port start-time pid -;; -;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off -;; and servers should stick around for about two hours or so. -;; -(define (server:get-best srvlst) - (let* ((nums (server:get-num-servers)) - (now (current-seconds)) - (slst (sort - (filter (lambda (rec) - (if (and (list? rec) - (> (length rec) 2)) - (let ((start-time (list-ref rec 3)) - (mod-time (list-ref rec 0))) - ;; (print "start-time: " start-time " mod-time: " mod-time) - (and start-time mod-time - (> (- now start-time) 0) ;; been running at least 0 seconds - (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds - (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set - (< (- now start-time) - (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) - 180) - (random 360)))) ;; under one hour running time +/- 180 - )) - #f)) - srvlst) - (lambda (a b) - (< (list-ref a 3) - (list-ref b 3)))))) - (if (> (length slst) nums) - (take slst nums) - slst))) - -(define (server:get-first-best areapath) - (let ((srvrs (server:get-best (server:get-list areapath)))) - (if (and srvrs - (not (null? srvrs))) - (car srvrs) - #f))) - -(define (server:get-rand-best areapath) - (let ((srvrs (server:get-best (server:get-list areapath)))) - (if (and (list? srvrs) - (not (null? srvrs))) - (let* ((len (length srvrs)) - (idx (random len))) - (list-ref srvrs idx)) - #f))) +;; ;; given a list of servers get a list of valid servers, i.e. at least +;; ;; 10 seconds old, has started and is less than 1 hour old and is +;; ;; active (i.e. mod-time < 10 seconds +;; ;; +;; ;; mod-time host port start-time pid +;; ;; +;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off +;; ;; and servers should stick around for about two hours or so. +;; ;; +;; (define (server:get-best srvlst) +;; (let* ((nums (server:get-num-servers)) +;; (now (current-seconds)) +;; (slst (sort +;; (filter (lambda (rec) +;; (if (and (list? rec) +;; (> (length rec) 2)) +;; (let ((start-time (list-ref rec 3)) +;; (mod-time (list-ref rec 0))) +;; ;; (print "start-time: " start-time " mod-time: " mod-time) +;; (and start-time mod-time +;; (> (- now start-time) 0) ;; been running at least 0 seconds +;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds +;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set +;; (< (- now start-time) +;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) +;; 180) +;; (random 360)))) ;; under one hour running time +/- 180 +;; )) +;; #f)) +;; srvlst) +;; (lambda (a b) +;; (< (list-ref a 3) +;; (list-ref b 3)))))) +;; (if (> (length slst) nums) +;; (take slst nums) +;; slst))) + +;; ;; switch from server:get-list to server:get-servers-info +;; ;; +;; (define (server:get-first-best areapath) +;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; (if (and srvrs +;; (not (null? srvrs))) +;; (car srvrs) +;; #f))) +;; +;; (define (server:get-rand-best areapath) +;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; (if (and (list? srvrs) +;; (not (null? srvrs))) +;; (let* ((len (length srvrs)) +;; (idx (random len))) +;; (list-ref srvrs idx)) +;; #f))) (define (server:record->id servr) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn) #f) - (match-let (((mod-time host port start-time server-id pid) + (match-let (((host port start-time server-id pid) servr)) (if server-id server-id #f)))) @@ -360,28 +376,22 @@ (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn) #f) - (match-let (((mod-time host port start-time server-id pid) + (match-let (((host port start-time server-id pid) servr)) (if (and host port) (conc host ":" port) #f)))) -(define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. - (if *my-client-signature* *my-client-signature* - (let ((sig (server:mk-signature))) - (set! *my-client-signature* sig) - *my-client-signature*))) - ;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough. ;; if it is old enough, overwrite it and wait 0.25 seconds. ;; if it then has the wrong server key, wait + 1 and call this function recursively. ;; -(define (server:wait-for-server-start-last-flag areapath) +#;(define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last")) ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) (server-key (conc (get-host-name) "-" (current-process-id)))) (if (file-exists? start-flag) @@ -405,20 +415,114 @@ (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) (thread-sleep! ( + 1 idletime)) (server:wait-for-server-start-last-flag areapath))))))) +;; oldest server alive determines host then choose random of youngest +;; five servers on that host +;; +(define (server:get-servers-info areapath) + (let* ((servinfodir (conc *toppath*"/.servinfo"))) + (if (not (file-exists? servinfodir)) + (create-directory servinfodir)) + (let* ((allfiles (glob (conc servinfodir"/*"))) + (res (make-hash-table))) + (for-each + (lambda (f) + (let* ((hostport (pathname-strip-directory f)) + (serverdat (server:logf-get-start-info f))) + (match serverdat + ((host port start server-id pid) + (if (and host port start server-id pid) + (hash-table-set! res hostport serverdat) + (debug:print-info 0 *default-log-port* "bad server info for "f": "serverdat))) + (else + (debug:print-info 0 *default-log-port* "bad server info for "f": "serverdat))))) + allfiles) + res))) + +;; oldest server alive determines host then choose random of youngest +;; five servers on that host +;; +;; mode: +;; best - get best server (random of newest five) +;; home - get home host based on oldest server +;; info - print info +(define (server:choose-server areapath #!optional (mode 'best)) + ;; age is current-starttime + ;; find oldest alive + ;; 1. sort by age ascending and ping until good + ;; find alive rand from youngest + ;; 1. sort by age descending + ;; 2. take five + ;; 3. check alive, discard if not and repeat + (let* ((serversdat (server:get-servers-info areapath)) + (servkeys (hash-table-keys serversdat)) + (by-time-asc (if (not (null? servkeys)) + (sort servkeys ;; list of "host:port" + (lambda (a b) + (>= (list-ref (hash-table-ref serversdat a) 2) + (list-ref (hash-table-ref serversdat b) 2)))) + '()))) + (if (not (null? by-time-asc)) + (let* ((oldest (last by-time-asc)) + (oldest-dat (hash-table-ref serversdat oldest)) + (host (list-ref oldest-dat 0)) + (all-valid (filter (lambda (x) + (equal? host (list-ref (hash-table-ref serversdat x) 0))) + by-time-asc)) + (best-five (lambda () + (if (> (length all-valid) 5) + (take all-valid 5) + all-valid))) + (names->dats (lambda (names) + (map (lambda (x) + (hash-table-ref serversdat x)) + names))) + (am-home? (lambda () + (let* ((currhost (get-host-name)) + (bestadrs (server:get-best-guess-address currhost))) + (or (equal? host currhost) + (equal? host bestadrs)))))) + (case mode + ((info) + (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) + (print "youngest: "(hash-table-ref serversdat (car all-valid)))) + ((home) host) + ((homehost) (cons host (am-home?))) ;; shut up old code + ((home?) (am-home?)) + ((best-five)(names->dats (best-five))) + ((all-valid)(names->dats all-valid)) + ((best) (let* ((best-five (best-five)) + (len (length best-five))) + (hash-table-ref serversdat (list-ref best-five (random len))))) + ((count)(length all-valid)) + (else + (debug:print 0 *default-log-port* "ERROR: invalid command "mode) + #f))) + (begin + (server:run areapath) + (thread-sleep! 3) + (case mode + ((homehost) (cons #f #f)) + (else #f)))))) - +;; would like to eventually get rid of this +;; +(define (common:on-homehost?) + (server:choose-server *toppath* 'home?)) + ;; kind start up of server, wait before allowing another server for a given ;; area to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least seconds old - (server:wait-for-server-start-last-flag areapath) - (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? + ;; (server:wait-for-server-start-last-flag areapath) + (if (< (server:choose-server areapath 'count) 10) + (server:run areapath)) + #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((lock-file (conc areapath "/logs/server-start.lock"))) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 25) (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) (system (conc "touch " start-flag)) ;; lazy but safe @@ -434,35 +538,33 @@ (let loop ((server-info (server:check-if-running areapath)) (try-num 0)) (if (or server-info (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. (server:record->url server-info) - (let ((num-ok (length (server:get-best (server:get-list areapath))))) + (let ((num-ok (length (server:choose-server areapath 'all-valid)))) (if (and (> try-num 0) ;; first time through simply wait a little while then try again (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one - (server:kind-run areapath)) + (server:run areapath)) (thread-sleep! 5) (loop (server:check-if-running areapath) (+ try-num 1))))))) -(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. - (define (server:get-num-servers #!key (numservers 2)) (let ((ns (string->number (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) (or ns numservers))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed - (servers (server:get-best (server:get-list areapath)))) + (servers (server:choose-server areapath 'best-five))) ;; (server:get-best (server:get-list areapath)))) (if (or (and servers (null? servers)) - (not servers) - (and (list? servers) - (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers + (not servers)) + ;; (and (list? servers) + ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res @@ -473,15 +575,12 @@ ;; ping the given server ;; (define (server:check-server server-record) (let* ((server-url (server:record->url server-record)) - (server-id (server:record->id server-record)) - (res (case *transport-type* - ((http)(server:ping server-url server-id)) - ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) - ))) + (server-id (server:record->id server-record)) + (res (server:ping server-url server-id))) (if res server-url #f))) (define (server:kill servr) @@ -569,11 +668,11 @@ (define (server:expiration-timeout) (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below (* 3600 (string->number tmo)) - 1200))) + 60))) (define (server:get-best-guess-address hostname) (let ((res #f)) (for-each (lambda (adr)