Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -116,30 +116,29 @@ ;; - returns #( flag result ) ;; (define (api:execute-requests dbstruct dat) (handle-exceptions exn - (let ((call-chain (get-call-chain)) - ) + (let ((call-chain (get-call-chain))) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat) (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector - (vector #f #f "remote must be called with a vector") ) - ((> *api-process-request-count* 20) - (vector #f 'overloaded)) + (vector #f (vector #f "remote must be called with a vector"))) + ((> *api-process-request-count* 20) ;; 20) + 'overloaded) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else - (let* ((cmd-in (vector-ref dat 0)) - (cmd (if (symbol? cmd-in) - cmd-in - (string->symbol cmd-in))) - (params (vector-ref dat 1)) - (start-t (current-milliseconds)) - (readonly-mode (dbr:dbstruct-read-only dbstruct)) - (readonly-command (member cmd api:read-only-queries)) + (let* ((cmd-in (vector-ref dat 0)) + (cmd (if (symbol? cmd-in) + cmd-in + (string->symbol cmd-in))) + (params (vector-ref dat 1)) + (start-t (current-milliseconds)) + (readonly-mode (dbr:dbstruct-read-only dbstruct)) + (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) (res (if writecmd-in-readonly-mode (conc "attempt to run write command "cmd" on a read-only database") (case cmd @@ -273,19 +272,23 @@ ;; TESTMETA ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) ;; TASKS - ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)))))) + ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) + (else + (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) + (conc "ERROR: BAD api call " cmd)))))) + ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) - (if (not writecmd-in-readonly-mode) - (vector #t res) - (vector #f res))))))) + (if writecmd-in-readonly-mode + (vector #f res) + (vector #t res))))))) ;; http-server send-response ;; api:process-request ;; db:* ;; @@ -293,13 +296,16 @@ ;; (define (api:process-request dbstruct $) ;; the $ is the request vars proc (set! *api-process-request-count* (+ *api-process-request-count* 1)) (let* ((cmd ($ 'cmd)) (paramsj ($ 'params)) - (params (db:string->obj paramsj transport: 'http)) ;; (rmt:json-str->dat paramsj)) - (resdat (api:execute-requests dbstruct (vector cmd params))) ;; #( flag result ) - (res (vector-ref resdat 1))) + (params (db:string->obj paramsj transport: 'http)) ;; incoming data from the POST (or is it a GET?) + (resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) + (success (vector-ref resdat 0)) + (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) + (if (not success) + (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) (if (> *api-process-request-count* *max-api-process-requests*) (set! *max-api-process-requests* *api-process-request-count*)) (set! *api-process-request-count* (- *api-process-request-count* 1)) ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds ;; (rmt:dat->json-str Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -266,12 +266,12 @@ (thread-start! th2) (thread-join! th1) (thread-terminate! th2) (debug:print-info 11 *default-log-port* "got res=" res) (if (vector? res) - (if (vector-ref res 0) - res + (if (vector-ref res 0) ;; this is the first flag or the second flag? + res ;; this is the *inner* vector? seriously? why? (if (debug:debug-mode 11) (begin ;; note: this code also called in nmsg-transport - consider consolidating it (debug:print-error 11 *default-log-port* "error occured at server, info=" (vector-ref res 2)) (debug:print 11 *default-log-port* " client call chain:") (print-call-chain (current-error-port)) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -196,34 +196,28 @@ (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time ;; (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = "runremote) - (if success - (case (remote-transport runremote) - ((http) - (mutex-unlock! *rmt-mutex*) - res) - (else - (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " is unknown") - (mutex-unlock! *rmt-mutex*) - (exit 1))) - (if (eq? res 'overloaded) + (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) + (mutex-unlock! *rmt-mutex*) + (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end + (if (and (symbol? res) + (eq? res 'overloaded)) (let ((wait-delay (+ attemptnum (* attemptnum 10)))) (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") (thread-sleep! wait-delay) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) - (begin - (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) - (remote-conndat-set! runremote #f) - (remote-server-url-set! runremote #f) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") - (mutex-unlock! *rmt-mutex*) - (if (not (server:check-if-running *toppath*)) - (server:start-and-wait *toppath*)) - (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))))))))) + res) ;; All good, return res + (begin + (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) + (remote-conndat-set! runremote #f) + (remote-server-url-set! runremote #f) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") + (if (not (server:check-if-running *toppath*)) + (server:start-and-wait *toppath*)) + (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) ;; (define (rmt:update-db-stats run-id rawcmd params duration) ;; (mutex-lock! *db-stats-mutex*) ;; (handle-exceptions ;; exn Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -140,26 +140,29 @@ ;; given a path to a server log return: host port startseconds ;; (define (server:logf-get-start-info logf) (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs - (with-input-from-file - logf - (lambda () - (let loop ((inl (read-line)) - (lnum 0)) - (if (not (eof-object? inl)) - (let ((mlst (string-match rx inl))) - (if (not mlst) - (if (< lnum 500) ;; give up if more than 500 lines of server log read - (loop (read-line)(+ lnum 1)) - (list #f #f #f)) - (let ((dat (cdr mlst))) - (list (car dat) ;; host - (string->number (cadr dat)) ;; port - (string->number (caddr dat)))))) - (list #f #f #f))))))) + (handle-exceptions + exn + (list #f #f #f) ;; no idea what went wrong, call it a bad server + (with-input-from-file + logf + (lambda () + (let loop ((inl (read-line)) + (lnum 0)) + (if (not (eof-object? inl)) + (let ((mlst (string-match rx inl))) + (if (not mlst) + (if (< lnum 500) ;; give up if more than 500 lines of server log read + (loop (read-line)(+ lnum 1)) + (list #f #f #f)) + (let ((dat (cdr mlst))) + (list (car dat) ;; host + (string->number (cadr dat)) ;; port + (string->number (caddr dat)))))) + (list #f #f #f)))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time pid ) ;; (define (server:get-list areapath #!key (limit #f)) @@ -184,13 +187,13 @@ '() (let loop ((hed (car server-logs)) (tal (cdr server-logs)) (res '())) (let* ((mod-time (handle-exceptions - exn - 0 - (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted + exn + (current-seconds) ;; 0 + (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted (down-time (- (current-seconds) mod-time)) (serv-dat (if (or (< num-serv-logs 10) (< down-time day-seconds)) (server:logf-get-start-info hed) '())) ;; don't waste time processing server files not touched in the past day if there are more than ten servers to look at Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -486,11 +486,14 @@ (change-directory orig-dir) ;; NB// tests:test-set-toplog! is remote internal... (tests:test-set-toplog! run-id test-name outputfilename)) ;; didn't get the lock, check to see if current update started later than this ;; update, if so we can exit without doing any work - (if (> my-start-time (file-modification-time lockf)) + (if (> my-start-time (handle-exceptions + exn + 0 + (file-modification-time lockf))) ;; we started since current re-gen in flight, delay a little and try again (begin (debug:print-info 1 *default-log-port* "Waiting to update " outputfilename ", another test currently updating it") (thread-sleep! (+ 5 (random 5))) ;; delay between 5 and 10 seconds (loop (common:simple-file-lock lockf))))))))))