From 24d59c756884f3b4a05dfb9414998443873f418d Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Thu, 31 Jan 2019 23:18:20 +0000 Subject: [PATCH 1/3] better logging for federation connections --- .../federation/matrix_federation_agent.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/synapse/http/federation/matrix_federation_agent.py b/synapse/http/federation/matrix_federation_agent.py index 57dcab472..3a0525e36 100644 --- a/synapse/http/federation/matrix_federation_agent.py +++ b/synapse/http/federation/matrix_federation_agent.py @@ -23,6 +23,7 @@ from zope.interface import implementer from twisted.internet import defer from twisted.internet.endpoints import HostnameEndpoint, wrapClientTLS +from twisted.internet.interfaces import IStreamClientEndpoint from twisted.web.client import URI, Agent, HTTPConnectionPool, RedirectAgent, readBody from twisted.web.http import stringToDatetime from twisted.web.http_headers import Headers @@ -152,12 +153,9 @@ class MatrixFederationAgent(object): class EndpointFactory(object): @staticmethod def endpointForURI(_uri): - logger.info( - "Connecting to %s:%i", - res.target_host.decode("ascii"), - res.target_port, + ep = LoggingHostnameEndpoint( + self._reactor, res.target_host, res.target_port, ) - ep = HostnameEndpoint(self._reactor, res.target_host, res.target_port) if tls_options is not None: ep = wrapClientTLS(tls_options, ep) return ep @@ -342,6 +340,19 @@ class MatrixFederationAgent(object): defer.returnValue(result) +@implementer(IStreamClientEndpoint) +class LoggingHostnameEndpoint(object): + """A wrapper for HostnameEndpint which logs when it connects""" + def __init__(self, reactor, host, port, *args, **kwargs): + self.host = host + self.port = port + self.ep = HostnameEndpoint(reactor, host, port, *args, **kwargs) + + def connect(self, protocol_factory): + logger.info("Connecting to %s:%i", self.host, self.port) + return self.ep.connect(protocol_factory) + + def _cache_period_from_headers(headers, time_now=time.time): cache_controls = _parse_cache_control(headers) From 3c8a41140e883d0be60d4ec469c753d81775d375 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Fri, 1 Feb 2019 00:36:24 +0000 Subject: [PATCH 2/3] Cache failures to parse .well-known Also add a Measure block around the .well-known fetch --- .../federation/matrix_federation_agent.py | 56 ++++++++++++++----- 1 file changed, 43 insertions(+), 13 deletions(-) diff --git a/synapse/http/federation/matrix_federation_agent.py b/synapse/http/federation/matrix_federation_agent.py index 3a0525e36..50baa8bf0 100644 --- a/synapse/http/federation/matrix_federation_agent.py +++ b/synapse/http/federation/matrix_federation_agent.py @@ -30,8 +30,10 @@ from twisted.web.http_headers import Headers from twisted.web.iweb import IAgent from synapse.http.federation.srv_resolver import SrvResolver, pick_server_from_list +from synapse.util import Clock from synapse.util.caches.ttlcache import TTLCache from synapse.util.logcontext import make_deferred_yieldable +from synapse.util.metrics import Measure # period to cache .well-known results for by default WELL_KNOWN_DEFAULT_CACHE_PERIOD = 24 * 3600 @@ -45,6 +47,8 @@ WELL_KNOWN_INVALID_CACHE_PERIOD = 1 * 3600 # cap for .well-known cache period WELL_KNOWN_MAX_CACHE_PERIOD = 48 * 3600 +# magic value to mark an invalid well-known +INVALID_WELL_KNOWN = object() logger = logging.getLogger(__name__) well_known_cache = TTLCache('well-known') @@ -79,6 +83,8 @@ class MatrixFederationAgent(object): _well_known_cache=well_known_cache, ): self._reactor = reactor + self._clock = Clock(reactor) + self._tls_client_options_factory = tls_client_options_factory if _srv_resolver is None: _srv_resolver = SrvResolver() @@ -99,6 +105,11 @@ class MatrixFederationAgent(object): ) self._well_known_agent = _well_known_agent + # our cache of .well-known lookup results, mapping from server name + # to delegated name. The values can be: + # `bytes`: a valid server-name + # `None`: there is no .well-known here + # INVALID_WELL_KNWOWN: the .well-known here is invalid self._well_known_cache = _well_known_cache @defer.inlineCallbacks @@ -281,14 +292,35 @@ class MatrixFederationAgent(object): None if there was no .well-known file. """ try: - cached = self._well_known_cache[server_name] - defer.returnValue(cached) + result = self._well_known_cache[server_name] except KeyError: - pass + # TODO: should we linearise so that we don't end up doing two .well-known + # requests for the same server in parallel? + with Measure(self._clock, "get_well_known"): + result, cache_period = yield self._do_get_well_known(server_name) - # TODO: should we linearise so that we don't end up doing two .well-known requests - # for the same server in parallel? + if cache_period > 0: + self._well_known_cache.set(server_name, result, cache_period) + if result == INVALID_WELL_KNOWN: + raise Exception("invalid .well-known on this server") + + defer.returnValue(result) + + @defer.inlineCallbacks + def _do_get_well_known(self, server_name): + """Actually fetch and parse a .well-known, without checking the cache + + Args: + server_name (bytes): name of the server, from the requested url + + Returns: + Deferred[Tuple[bytes|None|object],int]: + result, cache period, where result is one of: + - the new server name from the .well-known (as a `bytes`) + - None if there was no .well-known file. + - INVALID_WELL_KNOWN if the .well-known was invalid + """ uri = b"https://%s/.well-known/matrix/server" % (server_name, ) uri_str = uri.decode("ascii") logger.info("Fetching %s", uri_str) @@ -306,9 +338,7 @@ class MatrixFederationAgent(object): # after startup cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER) - - self._well_known_cache.set(server_name, None, cache_period) - defer.returnValue(None) + defer.returnValue((None, cache_period)) try: parsed_body = json.loads(body.decode('utf-8')) @@ -318,7 +348,10 @@ class MatrixFederationAgent(object): if "m.server" not in parsed_body: raise Exception("Missing key 'm.server'") except Exception as e: - raise Exception("invalid .well-known response from %s: %s" % (uri_str, e,)) + logger.info("invalid .well-known response from %s: %s", uri_str, e) + cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD + cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER) + defer.returnValue((INVALID_WELL_KNOWN, cache_period)) result = parsed_body["m.server"].encode("ascii") @@ -334,10 +367,7 @@ class MatrixFederationAgent(object): else: cache_period = min(cache_period, WELL_KNOWN_MAX_CACHE_PERIOD) - if cache_period > 0: - self._well_known_cache.set(server_name, result, cache_period) - - defer.returnValue(result) + defer.returnValue((result, cache_period)) @implementer(IStreamClientEndpoint) From 0390c961acae36be719e538f38f1659b51c02e46 Mon Sep 17 00:00:00 2001 From: Richard van der Hoff Date: Fri, 1 Feb 2019 09:40:58 +0000 Subject: [PATCH 3/3] changelog --- changelog.d/4542.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/4542.misc diff --git a/changelog.d/4542.misc b/changelog.d/4542.misc new file mode 100644 index 000000000..74c84e020 --- /dev/null +++ b/changelog.d/4542.misc @@ -0,0 +1 @@ +Improve performance of handling servers with invalid .well-known