Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -270,10 +270,11 @@ (if tmpdb tmpdb ;; (mutex-lock! *rundb-mutex*) (let* ((dbpath (db:dbfile-path)) ;; 0)) (dbexists (file-exists? dbpath)) + (dbfexists (file-exists? (conc dbpath "/megatest.db"))) (tmpdb (db:open-megatest-db path: dbpath)) ;; lock-create-open dbpath db:initialize-main-db)) (mtdb (db:open-megatest-db)) (refndb (db:open-megatest-db path: dbpath name: "megatest_ref.db")) (write-access (file-write-access? dbpath))) (if (and dbexists (not write-access)) @@ -280,13 +281,17 @@ (set! *db-write-access* #f)) (dbr:dbstruct-mtdb-set! dbstruct mtdb) (dbr:dbstruct-tmpdb-set! dbstruct tmpdb) ;; olddb is already a (cons db path) (dbr:dbstruct-refndb-set! dbstruct refndb) ;; (mutex-unlock! *rundb-mutex*) - (if (and (not dbexists) - *db-write-access*) ;; did not have a prior db and do have write access - (db:multi-db-sync dbstruct 'old2new)) ;; migrate data from megatest.db automatically + (if (and (not dbfexists) + write-access) ;; *db-write-access*) ;; did not have a prior db and do have write access + (begin + (debug:print 0 *default-log-port* "filling db " (db:dbdat-get-path tmpdb) " with data from " (db:dbdat-get-path mtdb)) + (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb refndb tmpdb)) + (debug:print 0 *default-log-port* " db, " (db:dbdat-get-path tmpdb) " already exists, not propogating data from " (db:dbdat-get-path mtdb))) + ;; (db:multi-db-sync dbstruct 'old2new)) ;; migrate data from megatest.db automatically tmpdb)))) ;; Make the dbstruct, setup up auxillary db's and call for main db at least once ;; ;; called in http-transport and replicated in rmt.scm for *local* access. @@ -3111,11 +3116,12 @@ (base64:base64-decode (string-substitute (regexp "_") "=" msg #t))) (lambda ()(deserialize))) (begin - (debug:print-error 0 *default-log-port* "reception failed. Received " msg " but cannot translate it.") + (debug:print-error 0 *default-log-port* "reception failed. Received \"" msg "\" but cannot translate it.") + (print-call-chain (current-error-port)) msg))) ;; crude reply for when things go awry ((zmq nmsg)(with-input-from-string msg (lambda ()(deserialize)))) (else msg))) ;; rpc ;; This is to be the big daddy call Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -256,12 +256,12 @@ (with-input-from-request ;; was dat fullurl (list (cons 'key "thekey") (cons 'cmd cmd) (cons 'params sparams)) - read-string) - transport: 'http)) + read-string)) + transport: 'http) 0)) ;; added this speculatively ;; Shouldn't this be a call to the managed call-all-connections stuff above? (close-all-connections!) (mutex-unlock! *http-mutex*) )) @@ -276,17 +276,19 @@ (thread-terminate! th2) (debug:print-info 11 *default-log-port* "got res=" res) (if (vector? res) (if (vector-ref res 0) res - (begin ;; note: this code also called in nmsg-transport - consider consolidating it - (debug:print-error 0 *default-log-port* "error occured at server, info=" (vector-ref res 2)) - (debug:print 0 *default-log-port* " client call chain:") - (print-call-chain (current-error-port)) - (debug:print 0 *default-log-port* " server call chain:") - (pp (vector-ref res 1) (current-error-port)) - (signal (vector-ref res 0)))) + (if (debug:debug-mode 11) + (begin ;; note: this code also called in nmsg-transport - consider consolidating it + (debug:print-error 11 *default-log-port* "error occured at server, info=" (vector-ref res 2)) + (debug:print 11 *default-log-port* " client call chain:") + (print-call-chain (current-error-port)) + (debug:print 11 *default-log-port* " server call chain:") + (pp (vector-ref res 1) (current-error-port)) + (signal (vector-ref res 0))) + res)) (signal (make-composite-condition (make-property-condition 'timeout 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server"))))))) @@ -531,11 +533,12 @@ (daemon:ize) (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it (begin (current-error-port *alt-log-file*) (current-output-port *alt-log-file*))))) - (if (server:check-if-running run-id) + (if (and (server:read-dotserver *toppath*) + (server:check-if-running run-id)) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0)) (begin ;; ok, no server detected, clean out any lingering records (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id "notresponding"))) @@ -570,21 +573,21 @@ (thread-start! th3) (set! *didsomething* #t) (thread-join! th2) (exit)))))) -(define (http:ping run-id host-port) - (let* ((server-dat (http-transport:client-connect (car host-port)(cadr host-port))) - (login-res (rmt:login-no-auto-client-setup server-dat run-id))) - (if (and (list? login-res) - (car login-res)) - (begin - (print "LOGIN_OK") - (exit 0)) - (begin - (print "LOGIN_FAILED") - (exit 1))))) +;; (define (http:ping run-id host-port) +;; (let* ((server-dat (http-transport:client-connect (car host-port)(cadr host-port))) +;; (login-res (rmt:login-no-auto-client-setup server-dat run-id))) +;; (if (and (list? login-res) +;; (car login-res)) +;; (begin +;; (print "LOGIN_OK") +;; (exit 0)) +;; (begin +;; (print "LOGIN_FAILED") +;; (exit 1))))) (define (http-transport:server-signal-handler signum) (signal-mask! signum) (handle-exceptions exn Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -1,7 +1,7 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.6208) +(define megatest-version 1.6301) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -27,11 +27,11 @@ ;; ;; grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) - (server-url (if *toppath* (server:check-if-running *toppath*) #f)) + (server-url (if *toppath* (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (or (server:get-timeout) 100))) ;; default to 100 seconds @@ -67,79 +67,88 @@ (cond ;; give up if more than 15 attempts ((> attemptnum 15) (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.") (exit 1)) + ;; reset the connection if it has been unused too long + ((and *runremote* + (remote-conndat *runremote*) + (let ((expire-time (- start-time (remote-server-timeout *runremote*)))) + (< (http-transport:server-dat-get-last-access (remote-conndat *runremote*)) expire-time))) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") + (remote-conndat-set! *runremote* #f) + (mutex-unlock! *rmt-mutex*) + (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a record for our connection for given area ((not *runremote*) (set! *runremote* (make-remote)) (mutex-unlock! *rmt-mutex*) - (print "case 1") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a homehost record ((not (pair? (remote-hh-dat *runremote*))) ;; have a homehost record? (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little (remote-hh-dat-set! *runremote* (common:get-homehost)) (mutex-unlock! *rmt-mutex*) - (print "case 2") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a read ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (member cmd api:read-only-queries)) ;; this is a read (mutex-unlock! *rmt-mutex*) - (print "case 3") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") (rmt:open-qry-close-locally cmd 0 params)) ;; on homehost and this is a write, we already have a server ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*)) ;; have a server (mutex-unlock! *rmt-mutex*) - (print "case 4") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") + (rmt:open-qry-close-locally cmd 0 params)) + ;; on homehost and this is a write, we have a server (we know because case 4 checked) + ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost + (not (member cmd api:read-only-queries))) + (mutex-unlock! *rmt-mutex*) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") (rmt:open-qry-close-locally cmd 0 params)) ;; no server contact made and this is a write, passively start a server ((and (not (remote-server-url *runremote*)) (not (member cmd api:read-only-queries))) - (print "case 5") - (let ((serverconn (server:check-if-running *toppath*))) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") + (let ((serverconn (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if serverconn (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed (if (not (server:start-attempted? *toppath*)) (server:kind-run *toppath*)))) (if (cdr (remote-hh-dat *runremote*)) ;; we are on the homehost, just do the call (begin (mutex-unlock! *rmt-mutex*) - (print "case 5.1") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") (rmt:open-qry-close-locally cmd 0 params)) - (begin + (begin ;; not on homehost, start server and wait (mutex-unlock! *rmt-mutex*) - (print "case 5.2") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.2") (tasks:start-and-wait-for-server (tasks:open-db) 0 15) (rmt:send-receive cmd rid params attemptnum: attemptnum)))) ;; if not on homehost ensure we have a connection to a live server ;; NOTE: we *have* a homehost record by now ((and (not (cdr (remote-hh-dat *runremote*))) ;; are we on a homehost? (not (remote-conndat *runremote*))) ;; and no connection - (print "case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) (tasks:start-and-wait-for-server (tasks:open-db) 0 15) - (remote-conndat-set! *runremote* (rmt:get-connection-info 0)) + (remote-conndat-set! *runremote* (rmt:get-connection-info 0)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; all set up if get this far, dispatch the query ((cdr (remote-hh-dat *runremote*)) ;; we are on homehost (mutex-unlock! *rmt-mutex*) - (print "case 7") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 7") (rmt:open-qry-close-locally cmd (if rid rid 0) params)) - ;; reset the connection if it has been unused too long - ((and (remote-conndat *runremote*) - (let ((expire-time (- start-time (remote-server-timeout *runremote*)))) - (< (http-transport:server-dat-get-last-access (remote-conndat *runremote*)) expire-time))) - (print "case 8") - (remote-conndat-set! *runremote* #f)) ;; not on homehost, do server query (else (mutex-unlock! *rmt-mutex*) - (print "case 9") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") (let* ((conninfo (remote-conndat *runremote*)) (dat (case (remote-transport *runremote*) ((http) (condition-case ;; handling here has caused a lot of problems. However it is needed to deal with attemtped communication to servers that have gone away (http-transport:client-api-send-receive 0 conninfo cmd params) ((commfail)(vector #f "communications fail")) @@ -152,11 +161,11 @@ (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time - (print "case 9. conninfo=" conninfo " dat=" dat) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat) (if success (case (remote-transport *runremote*) ((http rpc) res) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " is unknown") @@ -163,11 +172,11 @@ (exit 1))) (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! *runremote* #f) (remote-server-url-set! *runremote* #f) - (print "case 9.1") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") (tasks:start-and-wait-for-server (tasks:open-db) 0 15) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) (define (rmt:update-db-stats run-id rawcmd params duration) (mutex-lock! *db-stats-mutex*) @@ -269,11 +278,11 @@ ((rpc) (rpc-transport:client-api-send-receive run-id connection-info cmd params)) (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported (2)") (exit)) - ))) + )))) (if (and res (vector-ref res 0)) (vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!! #f))) ;; ;; Wrap json library for strings (why the ports crap in the first place?) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -240,27 +240,33 @@ ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; (define (server:ping host-port-in #!key (do-exit #f)) - (let ((host:port (if (number? host-port-in) ;; we were handed a server-id - (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) - ;; (print "srec: " srec " host-port-in: " host-port-in) - (if srec - (conc (vector-ref srec 3) ":" (vector-ref srec 4)) - (conc "no such server-id " host-port-in))) - host-port-in))) - (let* ((host-port (let ((slst (string-split host:port ":"))) - (if (eq? (length slst) 2) - (list (car slst)(string->number (cadr slst))) - #f))) + (let ((host:port (if (not host-port-in) ;; use read-dotserver to find + (server:read-dotserver *toppath*) + (if (number? host-port-in) ;; we were handed a server-id + (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) + ;; (print "srec: " srec " host-port-in: " host-port-in) + (if srec + (conc (vector-ref srec 3) ":" (vector-ref srec 4)) + (conc "no such server-id " host-port-in))) + host-port-in)))) + (let* ((host-port (if host:port + (let ((slst (string-split host:port ":"))) + (if (eq? (length slst) 2) + (list (car slst)(string->number (cadr slst))) + #f)) + #f)) (toppath (launch:setup))) ;; (print "host-port=" host-port) (if (not host-port) (begin - (debug:print 0 *default-log-port* "ERROR: bad host:port") - (if do-exit (exit 1))) + (if host-port-in + (debug:print 0 *default-log-port* "ERROR: bad host:port")) + (if do-exit (exit 1)) + #f) (let* ((iface (car host-port)) (port (cadr host-port)) (server-dat (case (remote-transport *runremote*) ((http) (http-transport:client-connect iface port)) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -404,11 +404,15 @@ (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id) (debug:print 0 *default-log-port* "Try starting server for run-id " run-id)) (thread-sleep! (/ (random 2000) 1000)) (server:kind-run run-id) (thread-sleep! (min delay-time 1)) - (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1)))))) + (if (not (or (server:start-attempted? *toppath*) + (server:read-dotserver *toppath*))) ;; no point in trying + (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1)) + #f)) + #f))) (define (tasks:get-all-servers mdb) (let ((res '())) (sqlite3:for-each-row (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) Index: tests/fullrun/megatest.config ================================================================== --- tests/fullrun/megatest.config +++ tests/fullrun/megatest.config @@ -249,11 +249,11 @@ ((sleeprunner) "sleeprunner") \ (else "nbfake"))} # launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log -# launcher #{shell if which bsub > /dev/null;then echo bsub -q priority -o openlava.log;else echo sleeprunner;fi} +# launcher #{ shell if which bsub > /dev/null;then echo bsub -q priority -o openlava.log;else echo sleeprunner;fi} # launcher nbfake [configf:settings trim-trailing-spaces yes] # Override the rollup for specific tests