Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -197,19 +197,19 @@ (tasks:hostinfo-get-port server-dat) " client:setup (server-dat = #t)") (if (> remaining-tries 8) (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time - (server:try-running run-id) + (server:try-running *toppath*) (thread-sleep! 5) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1)) ))) (begin ;; no server registered (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id))) (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) (if (< num-available 2) - (server:try-running run-id)) + (server:try-running *toppath*)) (thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. (client:setup run-id remaining-tries: (- remaining-tries 1))))))))) ;; keep this as a function to ease future (define (client:start run-id server-info) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -136,11 +136,11 @@ (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) - (server-url (if *toppath* (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*) #f)) + (server-url (if *toppath* (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds @@ -579,11 +579,11 @@ (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num) (if (and legacy-sync (not *time-to-exit*)) (let ((dbstruct (db:setup))) (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") (let loop () - (BB> "watchdog loop. pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) + ;;(BB> "watchdog loop. pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) ;; sync for filesystem local db writes ;; (mutex-lock! *db-multi-sync-mutex*) (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write (sync-in-progress *db-sync-in-progress*) @@ -618,11 +618,11 @@ ;; keep going unless time to exit ;; (if (not *time-to-exit*) (let delay-loop ((count 0)) - (BB> "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) + ;;(BB> "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) (if (and (not *time-to-exit*) (< count 4)) ;; was 11, changing to 4. (begin (thread-sleep! 1) @@ -631,10 +631,11 @@ (if (common:low-noise-print 30) (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num))))))) (define (std-exit-procedure) (on-exit (lambda () 0)) + ;;(BB> "std-exit-procedure called; *time-to-exit*="*time-to-exit*) (let ((no-hurry (if *time-to-exit* ;; hurry up #f (begin (set! *time-to-exit* #t) #t)))) @@ -677,10 +678,11 @@ 0) (define (std-signal-handler signum) ;; (signal-mask! signum) (set! *time-to-exit* #t) + ;;(BB> "got signal "signum) (debug:print-error 0 *default-log-port* "Received signal " signum " exiting promptly") ;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway (exit)) (set-signal-handler! signal/int std-signal-handler) ;; ^C Index: common_records.scm ================================================================== --- common_records.scm +++ common_records.scm @@ -121,10 +121,11 @@ (db:log-event (apply conc params)) (apply print params) ))))) ;; Brandon's debug printer shortcut (indulge me :) +(define *BB-process-starttime* (current-milliseconds)) (define (BB> . in-args) (let* ((stack (get-call-chain)) (location #f)) (for-each (lambda (frame) @@ -131,11 +132,11 @@ (let* ((this-loc (vector-ref frame 0)) (this-func (cadr (string-split this-loc " ")))) (if (equal? this-func "BB>") (set! location this-loc)))) stack) - (let ((dp-args (append (list 0 *default-log-port* location" " ) in-args))) + (let ((dp-args (append (list 0 *default-log-port* (conc location "@"(/ (- (current-milliseconds) *BB-process-starttime*) 1000)" ") ) in-args))) (apply debug:print dp-args)))) (define *BBpp_custom_expanders_list* (make-hash-table)) Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -473,11 +473,11 @@ (testconfig (begin ;; (runs:set-megatest-env-vars run-id inrunname: runname testname: test-name itempath: item-path) (runs:set-megatest-env-vars run-id inkeyvals: keydat inrunname: runname intarget: keystring testname: testname itempath: item-path) ;; these may be needed by the launching process (handle-exceptions exn - (tests:get-testconfig (db:test-get-testname testdat) test-registry #f) + (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f) (tests:get-testconfig (db:test-get-testname testdat) test-registry #t)))) (viewlog (lambda (x) (if (file-exists? logfile) ;(system (conc "firefox " logfile "&")) (dashboard-tests:run-html-viewer logfile) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -291,18 +291,27 @@ (let* ((dblist (configf:get-section *configdat* "ext-sync")) (res '()) (cfgdb #f)) (for-each (lambda (dbitem) (let* ((stringsplit (string-split (cadr dbitem))) - (dbtype (car stringsplit)) - (dbpath (cadr stringsplit))) - (set! cfgdb (dbi:open (string->symbol dbtype) (cons (cons 'dbname dbpath) '()) )) - (db:initialize-main-db cfgdb) - (db:initialize-run-id-db cfgdb) - (set! res (cons (cons cfgdb dbpath) res)))) + (dbtype (string->symbol (car stringsplit))) + (dbinfo '()) + (cred '())) + (for-each + (lambda (x) + (if (not (eqv? (string->symbol x) dbtype)) + (let* ((pair (string-split x ":"))) + (if (not (eqv? pair '())) + (set! dbinfo (cons (cons (string->symbol (car pair)) (cadr pair)) dbinfo)))))) + stringsplit) + (set! cfgdb (dbi:open dbtype dbinfo)) + (set! res (cons (cons cfgdb (alist-ref 'host dbinfo)) res)) + )) dblist) - (dbr:dbstruct-slave-dbs-set! dbstruct res)))) + (print res) + (dbr:dbstruct-slave-dbs-set! dbstruct res) + ))) ;; (mutex-unlock! *rundb-mutex*) (if (and (not dbfexists) write-access) ;; *db-write-access*) ;; did not have a prior db and do have write access (begin @@ -546,10 +555,11 @@ ;; if last-update specified ("field-name" . time-in-seconds) ;; then sync only records where field-name >= time-in-seconds ;; IFF field-name exists ;; (define (db:sync-tables tbls last-update fromdb todb . slave-dbs) + (print "Slave-dbs: " slave-dbs) (set! todb (cons (dbi:convert (db:dbdat-get-db todb)) (db:dbdat-get-path todb))) (set! fromdb (cons (dbi:convert (db:dbdat-get-db fromdb)) (db:dbdat-get-path fromdb))) (handle-exceptions exn @@ -603,13 +613,14 @@ " FROM " tablename (if use-last-update ;; apply last-update criteria (conc " " (car last-update) ">=" (cdr last-update)) "") ";")) (full-ins (conc "INSERT OR REPLACE INTO " tablename " ( " (string-intersperse (map car fields) ",") " ) " - " VALUES ( " (string-intersperse (make-list num-fields "?") ",") " );")) + " VALUES ( " (string-intersperse (make-list num-fields "?") ",") " );")) (fromdat '()) (fromdats '()) + (tabletypes '()) (totrecords 0) (batch-len (string->number (or (configf:lookup *configdat* "sync" "batchsize") "10"))) (todat (make-hash-table)) (count 0)) @@ -648,10 +659,25 @@ ;; first pass implementation, just insert all changed rows (for-each (lambda (targdb) + (if (eqv? (dbi:db-dbtype (db:dbdat-get-db targdb)) 'pgd) + (let* ((prep "")) + (for-each + (lambda (row) + (set! tabletypes (cons (cons (string->symbol (vector-ref row 1)) (vector-ref row 2)) tabletypes))) + (dbi:pull-metadata (db:dbdat-get-db fromdb) tablename)) + (set! prep (string-intersperse (map (lambda (x) (alist-ref (string->symbol (car x)) tabletypes)) fields) ",")) + (set! prep (conc "PREPARE full-ins (" prep ") AS INSERT OR REPLACE INTO " tablename " ( " (string-intersperse (map car fields) ",") " ) VALUES ( ")) + (let loop ((i 1)) + (set! prep (conc prep "$" i ",")) + (if (< i (- num-fields 1)) + (loop (+ i 1)) + (set! prep (conc prep "$" (+ i 1) ")")))) + (set! full-ins prep))) + (let* ((db (dbi:convert (db:dbdat-get-db targdb))) (stmth (dbi:prepare db full-ins))) ;; (db:delay-if-busy targdb) ;; NO WAITING (for-each (lambda (fromdat-lst) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -383,30 +383,34 @@ (server-going #f)) (let loop ((count 0) (server-state 'available) (bad-sync-count 0) (start-time (current-milliseconds))) - + ;;(BB> "http-transport: top of loop; count="count" server-state="server-state" bad-sync-count="bad-sync-count" server-going="server-going) ;; Use this opportunity to sync the tmp db to megatest.db (if (not server-going) ;; *dbstruct-db* ;; Removed code is pasted below (keeping it around until we are clear it is not needed). ;; no *dbstruct-db* yet, set running after our first pass through and start the db (if (eq? server-state 'available) (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers (if (equal? new-server-id server-id) (begin (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep") + ;;(BB> "http-transport: ->dbprep") (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running") - (server:write-dotserver *toppath* (conc iface ":" port)) - (delete-file* (conc *toppath* "/.starting-server"))) + ;;(BB> "http-transport: ->running") + (server:write-dotserver *toppath* iface port (current-process-id) 'http) + (thread-start! *watchdog*) + (server:complete-attempt *toppath*)) (begin ;; gotta exit nicely + ;;(BB> "http-transport: ->collision") (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision") (http-transport:server-shutdown server-id port)))))) - + ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) (if (and (<= rem-time 4) @@ -424,11 +428,12 @@ (if (or (not (equal? sdat (list iface port))) (not server-id)) (begin (debug:print-info 0 *default-log-port* "interface changed, refreshing iface and port info") (set! iface (car sdat)) - (set! port (cadr sdat)))) + (set! port (cadr sdat)) + (server:write-dotserver *toppath* iface port (current-process-id) 'http))) ;; Transfer *db-last-access* to last-access to use in checking that we are still alive (mutex-lock! *heartbeat-mutex*) (set! last-access *db-last-access*) (mutex-unlock! *heartbeat-mutex*) @@ -443,25 +448,30 @@ (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) - (if (and *server-run* + (cond + ((not (server:confirm-dotserver *toppath* iface port (current-process-id) 'http)) + (debug:print-info 0 *default-log-port* "Server .server file does not exist or contents do not match. Initiate server shutdown.") + (http-transport:server-shutdown server-id port)) + ((and *server-run* (> (+ last-access server-timeout) (current-seconds))) - (begin - (if (common:low-noise-print 120 "server continuing") - (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))) - ;; - ;; Consider implementing some smarts here to re-insert the record or kill self is - ;; the db indicates so - ;; - ;; (if (tasks:server-am-i-the-server? tdb run-id) - ;; (tasks:server-set-state! tdb server-id "running")) - ;; - (loop 0 server-state bad-sync-count (current-milliseconds))) - (http-transport:server-shutdown server-id port)))))) + (if (common:low-noise-print 120 "server continuing") + (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))) + ;; + ;; Consider implementing some smarts here to re-insert the record or kill self is + ;; the db indicates so + ;; + ;; (if (tasks:server-am-i-the-server? tdb run-id) + ;; (tasks:server-set-state! tdb server-id "running")) + ;; + (loop 0 server-state bad-sync-count (current-milliseconds))) + (else + (debug:print-info 0 *default-log-port* "Server timeed out. seconds since last db access: " (- (current-seconds) last-access)) + (http-transport:server-shutdown server-id port))))))) ;; code cut out from above ;; ;; (condition-case ;; ;; (if (and (member (mutex-state *db-sync-mutex*) '(abandoned not-abandoned)) @@ -486,10 +496,11 @@ ;; (thread-sleep! rem-time) ;; (thread-sleep! 4))) ;; fallback for if the math is changed ... (define (http-transport:server-shutdown server-id port) (let ((tdbdat (tasks:open-db))) + ;;(BB> "http-transport:server-shutdown called") (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id)) ;; ;; start_shutdown ;; (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") @@ -513,21 +524,19 @@ " ms") (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete") ;; if the .server file contained :myport then we can remove it (server:remove-dotserver-file *toppath* port) + ;;(BB> "http-transport:server-shutdown -> exit") (exit))) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch run-id) - (with-output-to-file - (conc *toppath* "/.starting-server") - (lambda () - (print (current-process-id) " on " (get-host-name)))) + (server:attempting-start *toppath*) (let* ((tdbdat (tasks:open-db))) (set! *run-id* run-id) (if (args:get-arg "-daemonize") (begin (daemon:ize) @@ -539,11 +548,11 @@ (server:check-if-running run-id)) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0)) (begin ;; ok, no server detected, clean out any lingering records - (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id "notresponding"))) + (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id "notresponding"))) (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)) (remtries 4)) (if (not server-id) (if (> remtries 0) (begin @@ -552,11 +561,11 @@ (- remtries 1))) (begin ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch") - (delete-file* (conc *toppath* "/.starting-server")) + (server:complete-attempt *toppath*) )) (let* ((th2 (make-thread (lambda () (debug:print-info 0 *default-log-port* "Server run thread started") (http-transport:run (if (args:get-arg "-server") Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -280,11 +280,11 @@ ;; do all the ezsteps (if any) (if ezsteps (let* ((testconfig ;; (read-config (conc work-area "/testconfig") #f #t environ-patt: "pre-launch-env-vars")) ;; FIXME??? is allow-system ok here? ;; NOTE: it is tempting to turn off force-create of testconfig but dynamic ;; ezstep names need a full re-eval here. - (tests:get-testconfig test-name tconfigreg #t force-create: #t)) ;; 'return-procs))) + (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) (ezstepslst (if (hash-table? testconfig) (hash-table-ref/default testconfig "ezsteps" '()) #f))) (if testconfig (hash-table-set! *testconfigs* test-name testconfig) ;; cached for lazy reads later ... @@ -1089,11 +1089,11 @@ (list "MT_ITEMPATH" item-path) ) itemdat)) (let* ((tregistry (tests:get-all)) ;; third param (below) is system-allowed ;; for tconfig, why do we allow fallback to test-conf? - (tconfig (or (tests:get-testconfig test-name tregistry #t force-create: #t) + (tconfig (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t) (begin (debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. This is likely not desired.") test-conf))) ;; force re-read now that all vars are set (useshell (let ((ush (config-lookup *configdat* "jobtools" "useshell"))) (if ush Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -1,7 +1,7 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.6302) +(define megatest-version 1.6303) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -349,12 +349,14 @@ ;; The watchdog is to keep an eye on things like db sync etc. ;; (define *watchdog* (make-thread common:watchdog "Watchdog thread")) -(thread-start! *watchdog*) -;; (BB> "thread-start! watchdog") +(if (not (args:get-arg "-server")) + (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog +;;(BB> "thread-start! watchdog") + (if (args:get-arg "-log") (let ((oup (open-output-file (args:get-arg "-log")))) (debug:print-info 0 *default-log-port* "Sending log output to " (args:get-arg "-log")) (set! *default-log-port* oup))) @@ -1990,12 +1992,19 @@ ;; Exit and clean up ;;====================================================================== (if (not *didsomething*) (debug:print 0 *default-log-port* help)) -(BB> "thread-join! watchdog") -(thread-join! *watchdog*) +;;(BB> "thread-join! watchdog") + +;; join the watchdog thread if it has been thread-start!ed (it may not have been started in the case of a server that never enters running state) +;; (symbols returned by thread-state: created ready running blocked suspended sleeping terminated dead) +(if (thread? *watchdog*) + (case (thread-state *watchdog*) + ((ready running blocked sleeping terminated dead) + (thread-join! *watchdog*)))) + (set! *time-to-exit* #t) (if (not (eq? *globalexitstatus* 0)) (if (or (args:get-arg "-run")(args:get-arg "-runtests")(args:get-arg "-runall")) (begin ADDED minimal/manyservers.sh Index: minimal/manyservers.sh ================================================================== --- /dev/null +++ minimal/manyservers.sh @@ -0,0 +1,119 @@ +#!/bin/bash + +echo manyservers.sh pid $$ + +logdir=$PWD/log-manysrv + + +function reset { + rm -f .homehost .server .server.lock links/.db/monitor.db .starting-server + } + +function launch_many_servers { + # count = $1 + # logdir = $2 + # prefx = $3 + perl -e 'foreach my $i (1 ... '$1'){print "'$2'/'$3'-srv-$i.log\n"}' | \ + xargs -P $1 -n 1 megatest -server - -run-id 0 -daemonize -log +} + + +function get_srv_pids { + ps auwx | grep "mtest -server" | grep $logdir | grep -v grep | awk '{print $2}' +} + + +if [[ -e $logdir ]]; then rm -rf $logdir; fi +if [[ ! -e $logdir ]]; then mkdir $logdir; fi + +reset + +simultaneous_servers=20 +server_collision_resolution_delay=15 +server_timeout_delay=65 + +echo "Launching $simultaneous_servers simultaneous servers" +launch_many_servers $simultaneous_servers $logdir "first" +echo "Sleeping $server_collision_resolution_delay seconds to allow new servers to die because one is already running." +sleep $server_collision_resolution_delay + +pids=`get_srv_pids` +pids_left=`echo $pids | wc -w` +echo "pids_left=$pids_left" +echo "after $server_collision_resolution_delay seconds: servers remaining=$pids_left; expecting 1" +if [[ $pids_left == 1 ]]; then + echo "All servers but 1 terminated. Still good." +else + if [[ $pids_left == 0 ]]; then + echo "All servers died too soon. Not good. Aborting." + echo "TEST FAIL" + exit 1 + else + echo "Too many servers left. Not good. Aborting." + echo "TEST FAIL" + echo $pids | xargs kill + sleep 5 + pids=`get_srv_pids` + pids_left=`echo $pids | wc -w` + if [[ ! ( $pids_left == 0 ) ]]; then + echo $pids | xargs kill -9 + fi + exit 1 + fi +fi + + + +echo "launching another volley of $simultaneous_servers. THey should all perish. right away, leaving the one server running." +launch_many_servers $simultaneous_servers $logdir "second" +sleep $server_collision_resolution_delay + +pids=`get_srv_pids` +pids_left=`echo $pids | wc -w` +echo "pids_left=$pids_left" +echo "after $server_collision_resolution_delay seconds: servers remaining=$pids_left; expecting 1" +if [[ $pids_left == 1 ]]; then + echo "All servers but 1 terminated. So far so good." +else + if [[ $pids_left == 0 ]]; then + echo "All servers died too soon. Not good. Aborting." + echo "TEST FAIL" + exit 1 + else + echo "Too many servers left. Not good. Aborting." + echo "TEST FAIL" + echo $pids | xargs kill + sleep 5 + pids=`get_srv_pids` + pids_left=`echo $pids | wc -w` + if [[ ! ( $pids_left == 0 ) ]]; then + echo $pids | xargs kill -9 + fi + exit 1 + fi +fi + + + +echo "sleeping for awhile ($server_timeout_delay seconds) to let server exit on its own for no-request timeout" +sleep $server_timeout_delay +pids=`get_srv_pids` +pids_left=`echo $pids | wc -w` +echo "after $server_timeout_delay seconds: servers remaining=$pids_left; expecting 0" + +if [[ $pids_left == 0 ]]; then + echo "No servers remain. This is good." + echo "TEST PASS" + exit 0 +else + echo "Too many servers left. Not good. Aborting." + echo "TEST FAIL" + echo $pids | xargs kill + sleep 5 + pids=`get_srv_pids` + pids_left=`echo $pids | wc -w` + if [[ ! ( $pids_left == 0 ) ]]; then + echo $pids | xargs kill -9 + fi + exit 1 +fi Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -31,12 +31,13 @@ ;;====================================================================== ;; if a server is either running or in the process of starting call client:setup ;; else return #f to let the calling proc know that there is no server available ;; -(define (rmt:get-connection-info run-id) - (let ((cinfo (remote-conndat *runremote*))) +(define (rmt:get-connection-info areapath) ;; TODO: push areapath down. + (let ((cinfo (remote-conndat *runremote*)) + (run-id 0)) (if cinfo cinfo (if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id) (client:setup run-id) #f)))) @@ -114,43 +115,57 @@ ;; (mutex-unlock! *rmt-mutex*) ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") ;; (rmt:open-qry-close-locally cmd 0 params)) - ;; no server contact made and this is a write, passively start a server - ((and (not (remote-server-url *runremote*)) + ;; on homehost, no server contact made and this is a write, passively start a server + ((and (cdr (remote-hh-dat *runremote*)) ; new + (not (remote-server-url *runremote*)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") - (let ((serverconn (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call - (if serverconn - (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed + (let ((server-url (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call + (if server-url + (remote-server-url-set! *runremote* server-url) ;; the string can be consumed by the client setup if needed (if (not (server:start-attempted? *toppath*)) (server:kind-run *toppath*)))) - (if (cdr (remote-hh-dat *runremote*)) ;; we are on the homehost, just do the call - (begin - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") - (rmt:open-qry-close-locally cmd 0 params)) - (begin ;; not on homehost, start server and wait - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.2") - (tasks:start-and-wait-for-server (tasks:open-db) 0 15) - (rmt:send-receive cmd rid params attemptnum: attemptnum)))) + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") + (rmt:open-qry-close-locally cmd 0 params)) + + + + ;;; + ;; (begin ;; not on homehost, start server and wait + ;; (mutex-unlock! *rmt-mutex*) + ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.2") + ;; (tasks:start-and-wait-for-server (tasks:open-db) 0 15) + ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;) ;) +;;;; + ;; if not on homehost ensure we have a connection to a live server ;; NOTE: we *have* a homehost record by now - ((and (not (cdr (remote-hh-dat *runremote*))) ;; are we on a homehost? + ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost + (not (remote-conndat *runremote*)) ;; and no connection + (server:read-dotserver *toppath*)) ;; .server file exists + ;; something caused the server entry in tdb to disappear, but the server is still running + (server:remove-dotserver-file *toppath* ".*") + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 20") + (rmt:send-receive cmd rid params attemptnum: (add1 attemptnum))) + ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost (not (remote-conndat *runremote*))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) (tasks:start-and-wait-for-server (tasks:open-db) 0 15) - (remote-conndat-set! *runremote* (rmt:get-connection-info 0)) ;; calls client:setup which calls client:setup-http - (rmt:send-receive cmd rid params attemptnum: attemptnum)) + (remote-conndat-set! *runremote* (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http + (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as ;; all set up if get this far, dispatch the query ((cdr (remote-hh-dat *runremote*)) ;; we are on homehost (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 7") (rmt:open-qry-close-locally cmd (if rid rid 0) params)) + ;; not on homehost, do server query (else (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") (let* ((conninfo (remote-conndat *runremote*)) Index: rpc-transport.scm ================================================================== --- rpc-transport.scm +++ rpc-transport.scm @@ -176,11 +176,11 @@ (if ping-res (let ((server-dat (list iface port #f #f #f))) (hash-table-set! *runremote* run-id server-dat) server-dat) (begin - (server:try-running run-id) + (server:try-running *toppath*) (thread-sleep! 2) (rpc-transport:client-setup run-id (- remtries 1))))) (let* ((server-db-info (open-run-close tasks:get-server tasks:open-db run-id))) (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) (if server-db-info @@ -191,15 +191,15 @@ (if start-res (begin (hash-table-set! *runremote* run-id server-dat) server-dat) (begin - (server:try-running run-id) + (server:try-running *toppath*) (thread-sleep! 2) (rpc-transport:client-setup run-id (- remtries 1))))) (begin - (server:try-running run-id) + (server:try-running *toppath*) (thread-sleep! 2) (rpc-transport:client-setup run-id (- remtries 1))))))))) ;; ;; (port (if (and hostinfo (> (length hostdat) 1))(cadr hostdat) #f))) ;; (if (and port Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -8,11 +8,11 @@ ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. (require-extension (srfi 18) extras tcp s11n) -(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils) +(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils posix-extras) ;; (use zmq) (use spiffy uri-common intarweb http-client spiffy-request-vars) (declare (unit server)) @@ -47,11 +47,23 @@ ;; all routes though here end in exit ... ;; ;; start_server ;; (define (server:launch run-id transport-type) - (BB> "server:launch fired for run-id="run-id" transport-type="transport-type) + ;;(BB> "server:launch fired for run-id="run-id" transport-type="transport-type) + + (let ((attempt-in-progress (server:start-attempted? *toppath*))) ; check for .server-starting + (when attempt-in-progress + (debug:print-info 0 *default-log-port* "Server start attempt in progress in other process (=> "attempt-in-progress"<=). Aborting server launch attempt in this process ("(current-process-id)")") + (exit))) + + (let ((dotserver-url (server:check-if-running *toppath*))) ;; check for .server + (when dotserver-url + (debug:print-info 0 *default-log-port* "Server already running (=> "dotserver-url"<=). Aborting server launch attempt in this process ("(current-process-id)")") + (exit) + )) + (case transport-type ((http)(http-transport:launch run-id)) ;;((nmsg)(nmsg-transport:launch run-id)) ((rpc) (rpc-transport:launch run-id)) (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type)))) @@ -103,52 +115,60 @@ ;; Given a run id start a server process ### NOTE ### > file 2>&1 ;; if the run-id is zero and the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; -(define (server:run areapath) ;; areapath is ignored for now. +(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area (let* ((curr-host (get-host-name)) + (attempt-in-progress (server:start-attempted? areapath)) + (dot-server-url (server:check-if-running areapath)) (curr-ip (server:get-best-guess-address curr-host)) (curr-pid (current-process-id)) (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" )) (target-host (car homehost)) (testsuite (common:get-testsuite-name)) - (logfile (conc *toppath* "/logs/server.log")) + (logfile (conc areapath "/logs/server.log")) (cmdln (conc (common:get-megatest-exe) " -server " (or target-host "-") " -run-id " 0 (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") (conc " -daemonize -log " logfile) "") " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))) ;; we want the remote server to start in *toppath* so push there - (push-directory *toppath*) - (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") - (thread-start! log-rotate) - - ;; host.domain.tld match host? - (if (and target-host - ;; look at target host, is it host.domain.tld or ip address and does it - ;; match current ip or hostname - (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) - (not (equal? curr-ip target-host))) - (begin - (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) - (setenv "TARGETHOST" target-host))) - - (setenv "TARGETHOST_LOGF" logfile) - (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever - (system (conc "nbfake " cmdln)) - (unsetenv "TARGETHOST_LOGF") - (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) - (thread-join! log-rotate) - (pop-directory))) - + (push-directory areapath) + (cond + (attempt-in-progress + (debug:print 0 *default-log-port* "INFO: Not trying to start server because attempt is in progress: "attempt-in-progress)) + (dot-server-url + (debug:print 0 *default-log-port* "INFO: Not trying to start server because one is already running : "dot-server-url)) + (else + (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") + (thread-start! log-rotate) + + ;; host.domain.tld match host? + (if (and target-host + ;; look at target host, is it host.domain.tld or ip address and does it + ;; match current ip or hostname + (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) + (not (equal? curr-ip target-host))) + (begin + (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) + (setenv "TARGETHOST" target-host))) + + (setenv "TARGETHOST_LOGF" logfile) + (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever + (system (conc "nbfake " cmdln)) + (unsetenv "TARGETHOST_LOGF") + (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) + (thread-join! log-rotate) + (pop-directory))))) + (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) - (set! *my-client-signature* sig) - *my-client-signature*))) + (set! *my-client-signature* sig) + *my-client-signature*))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched (define (server:kind-run areapath) (let ((last-run-time (hash-table-ref/default *server-kind-run* areapath #f))) @@ -156,96 +176,168 @@ (> (- (current-seconds) last-run-time) 30)) (begin (server:run areapath) (hash-table-set! *server-kind-run* areapath (current-seconds)))))) -;; The generic run a server command. Dispatches the call to server 0 if run-id != 0 -;; -;; (define (server:try-running run-id) -;; (if (eq? run-id 0) -;; (server:run run-id) -;; (rmt:start-server run-id))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. +(define (server:attempting-start areapath) + (with-output-to-file + (conc areapath "/.starting-server") + (lambda () + (print (current-process-id) " on " (get-host-name))))) + +(define (server:complete-attempt areapath) + (delete-file* (conc areapath "/.starting-server"))) + (define (server:start-attempted? areapath) (let ((flagfile (conc areapath "/.starting-server"))) (handle-exceptions exn #f ;; if things go wrong pretend we can't see the file - (and (file-exists? flagfile) - (< (- (current-seconds) - (file-modification-time flagfile)) - 15))))) ;; exists and less than 15 seconds old - + (cond + ((and (file-exists? flagfile) + (< (- (current-seconds) + (file-modification-time flagfile)) + 15)) ;; exists and less than 15 seconds old + (with-input-from-file flagfile (lambda () (read-line)))) + ((file-exists? flagfile) ;; it is stale. + (server:complete-attempt areapath) + #f) + (else #f))))) + (define (server:read-dotserver areapath) (let ((dotfile (conc areapath "/.server"))) (handle-exceptions exn #f ;; if things go wrong pretend we can't see the file - (if (and (file-exists? dotfile) - (file-read-access? dotfile)) - (with-input-from-file - dotfile - (lambda () - (read-line))) - #f)))) + (cond + ((not (file-exists? dotfile)) + #f) + ((not (file-read-access? dotfile)) + #f) + ((> (server:dotserver-age-seconds areapath) (+ 5 (server:get-timeout))) + (server:remove-dotserver-file areapath ".*") + #f) + (else + (let* ((line + (with-input-from-file + dotfile + (lambda () + (read-line)))) + (tokens (if (string? line) (string-split line ":") #f))) + (cond + ((eq? 4 (length tokens)) + tokens) + (else #f)))))))) + +(define (server:read-dotserver->url areapath) + (let ((dotserver-tokens (server:read-dotserver areapath))) + (if dotserver-tokens + (conc (list-ref dotserver-tokens 0) ":" (list-ref dotserver-tokens 1)) + #f))) ;; write a .server file in *toppath* with hostport ;; return #t on success, #f otherwise ;; -(define (server:write-dotserver areapath hostport) +(define (server:write-dotserver areapath host port pid transport) (let ((lock-file (conc areapath "/.server.lock")) (server-file (conc areapath "/.server"))) (if (common:simple-file-lock lock-file) (let ((res (handle-exceptions exn #f ;; failed for some reason, for the moment simply return #f (with-output-to-file server-file (lambda () - (print hostport))) + (print (conc host ":" port ":" pid ":" transport)))) #t))) - (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " created") + (debug:print-info 0 *default-log-port* "server file " server-file " for " host ":" port " created pid="pid) (common:simple-file-release-lock lock-file) res) #f))) + +;; this will check that the .server file present matches the server calling this procedure. +;; if parameters match (this-pid and transport) the file will be touched and #t returned +;; otherwise #f will be returned. +(define (server:confirm-dotserver areapath this-iface this-port this-pid this-transport) + (let* ((tokens (server:read-dotserver areapath))) + (cond + ((not tokens) + (debug:print-info 0 *default-log-port* "INFO: .server file does not exist.") + #f) + ((not (eq? 4 (length tokens))) + (debug:print-info 0 *default-log-port* "INFO: .server file is corrupt. There are not 4 tokens as expeted; there are "(length tokens)".") + #f) + ((not (equal? this-iface (list-ref tokens 0))) + (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for iface, server has value >"(list-ref tokens 0)"< but this server's value is >"this-iface"<") + #f) + ((not (equal? (->string this-port) (list-ref tokens 1))) + (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for port, .server has value >"(list-ref tokens 1)"< but this server's value is >"(->string this-port)"<") + #f) + ((not (equal? (->string this-pid) (list-ref tokens 2))) + (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for pid, .server has value >"(list-ref tokens 2)"< but this server's value is >"(->string this-pid)"<") + #f) + ((not (equal? (->string this-transport) (->string (list-ref tokens 3)))) + (debug:print-info 0 *default-log-port* "INFO: .server file mismatch. for transport, .server has value >"(list-ref tokens 3)"< but this server's value is >"this-transport"<") + #f) + (else (server:touch-dotserver areapath) + #t)))) + +(define (server:touch-dotserver areapath) + (let ((server-file (conc areapath "/.server"))) + (change-file-times server-file (current-seconds) (current-seconds)))) + +(define (server:dotserver-age-seconds areapath) + (let ((server-file (conc areapath "/.server"))) + (begin + (handle-exceptions + exn + #f + (- (current-seconds) + (file-modification-time server-file)))))) + (define (server:remove-dotserver-file areapath hostport) - (let ((dotserver (server:read-dotserver areapath)) + (let ((dotserver-url (server:read-dotserver->url areapath)) (server-file (conc areapath "/.server")) (lock-file (conc areapath "/.server.lock"))) - (if (and dotserver (string-match (conc ".*:" hostport "$") dotserver)) ;; port matches, good enough info to decide to remove the file + (if (and dotserver-url (string-match (conc ".*:" hostport "$") dotserver-url)) ;; port matches, good enough info to decide to remove the file (if (common:simple-file-lock lock-file) (begin (handle-exceptions exn #f (delete-file* server-file)) (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " removed") - (common:simple-file-release-lock lock-file)))))) + (common:simple-file-release-lock lock-file)) + (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - could not get lock.")) + (debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " NOT removed - dotserver-url("dotserver-url") did not match hostport pattern ("hostport")")))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) - (let* ((dotserver (server:read-dotserver areapath))) ;; tdbdat (tasks:open-db))) - (if dotserver + (let* ((dotserver-url (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db))) + (if dotserver-url (let* ((res (case *transport-type* - ((http)(server:ping-server dotserver)) + ((http)(server:ping-server dotserver-url)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res - dotserver - #f)) + dotserver-url + (begin + (server:remove-dotserver-file areapath ".*") ;; remove stale dotserver + #f))) #f))) ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; (define (server:ping host-port-in #!key (do-exit #f)) (let ((host:port (if (not host-port-in) ;; use read-dotserver to find - (server:read-dotserver *toppath*) + (server:read-dotserver->url *toppath*) (if (number? host-port-in) ;; we were handed a server-id (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) ;; (print "srec: " srec " host-port-in: " host-port-in) (if srec (conc (vector-ref srec 3) ":" (vector-ref srec 4)) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -402,11 +402,11 @@ (< delay-time delay-max-tries)) (begin (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id) (debug:print 0 *default-log-port* "Try starting server for run-id " run-id)) (thread-sleep! (/ (random 2000) 1000)) - (server:kind-run run-id) + (server:kind-run *toppath*) (thread-sleep! (min delay-time 1)) (if (not (or (server:start-attempted? *toppath*) (server:read-dotserver *toppath*))) ;; no point in trying (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1)) #f)) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -141,11 +141,11 @@ ;; returns waitons waitors tconfigdat ;; (define (tests:get-waitons test-name all-tests-registry) - (let* ((config (tests:get-testconfig test-name all-tests-registry 'return-procs))) + (let* ((config (tests:get-testconfig test-name #f all-tests-registry 'return-procs))) (let ((instr (if config (config-lookup config "requirements" "waiton") (begin ;; No config means this is a non-existant test (debug:print-error 0 *default-log-port* "non-existent required test \"" test-name "\"") (exit 1)))) @@ -291,11 +291,11 @@ ;; Check for waiver eligibility ;; (define (tests:check-waiver-eligibility testdat prev-testdat) (let* ((test-registry (make-hash-table)) - (testconfig (tests:get-testconfig (db:test-get-testname testdat) test-registry #f)) + (testconfig (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f)) (test-rundir ;; (sdb:qry 'passstr (db:test-get-rundir testdat)) ;; ) (prev-rundir ;; (sdb:qry 'passstr (db:test-get-rundir prev-testdat)) ;; ) (waivers (if testconfig (configf:section-vars testconfig "waivers") '())) @@ -979,11 +979,11 @@ ;; if .testconfig exists in test directory read and return it ;; else if have cached copy in *testconfigs* return it IFF there is a section "have fulldata" ;; else read the testconfig file ;; if have path to test directory save the config as .testconfig and return it ;; -(define (tests:get-testconfig test-name test-registry system-allowed #!key (force-create #f)) +(define (tests:get-testconfig test-name item-path test-registry system-allowed #!key (force-create #f)) (let* ((cache-path (tests:get-test-path-from-environment)) (cache-file (and cache-path (conc cache-path "/.testconfig"))) (cache-exists (and cache-file (not force-create) ;; if force-create then pretend there is no cache to read (file-exists? cache-file))) @@ -991,14 +991,17 @@ cache-exists) (handle-exceptions exn #f ;; any issues, just give up with the cached version and re-read (configf:read-alist cache-file)) - #f))) + #f)) + (test-full-name (if (and item-path (not (string-null? item-path))) + (conc test-name "/" item-path) + test-name))) (if cached-dat cached-dat - (let ((dat (hash-table-ref/default *testconfigs* test-name #f))) + (let ((dat (hash-table-ref/default *testconfigs* test-full-name #f))) (if (and dat ;; have a locally cached version (hash-table-ref/default dat "have fulldata" #f)) ;; marked as good data? dat ;; no cached data available (let* ((treg (or test-registry @@ -1012,11 +1015,11 @@ environ-patt: (if system-allowed "pre-launch-env-vars" #f)) #f))) (if (and tcfg cache-file) (hash-table-set! tcfg "have fulldata" #t)) ;; mark this as fully read data - (if tcfg (hash-table-set! *testconfigs* test-name tcfg)) + (if tcfg (hash-table-set! *testconfigs* test-full-name tcfg)) (if (and testexists cache-file (file-write-access? cache-path)) (let ((tpath (conc cache-path "/.testconfig"))) (debug:print-info 1 *default-log-port* "Caching testconfig for " test-name " in " tpath) @@ -1240,11 +1243,12 @@ (define (tests:get-full-data test-names test-records required-tests all-tests-registry) (if (not (null? test-names)) (let loop ((hed (car test-names)) (tal (cdr test-names))) ;; 'return-procs tells the config reader to prep running system but return a proc (debug:print-info 4 *default-log-port* "hed=" hed " at top of loop") - (let* ((config (tests:get-testconfig hed all-tests-registry 'return-procs)) + ;; don't know item-path at this time, let the testconfig get the top level testconfig + (let* ((config (tests:get-testconfig hed #f all-tests-registry 'return-procs)) (waitons (let ((instr (if config (config-lookup config "requirements" "waiton") (begin ;; No config means this is a non-existant test (debug:print-error 0 *default-log-port* "non-existent required test \"" hed "\", grep through your testconfigs to find and remove or create the test. Discarding and continuing.") ""))))