Add metrics to track success/otherwise of replication requests (#8406)

One hope is that this might provide some insights into #3365.
This commit is contained in:
Richard van der Hoff 2020-09-29 11:06:11 +01:00 committed by GitHub
parent 1c262431f9
commit 866c84da8d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 29 additions and 12 deletions

1
changelog.d/8406.feature Normal file
View file

@ -0,0 +1 @@
Add prometheus metrics for replication requests.

View file

@ -20,18 +20,28 @@ import urllib
from inspect import signature from inspect import signature
from typing import Dict, List, Tuple from typing import Dict, List, Tuple
from synapse.api.errors import ( from prometheus_client import Counter, Gauge
CodeMessageException,
HttpResponseException, from synapse.api.errors import HttpResponseException, SynapseError
RequestSendFailed, from synapse.http import RequestTimedOutError
SynapseError,
)
from synapse.logging.opentracing import inject_active_span_byte_dict, trace from synapse.logging.opentracing import inject_active_span_byte_dict, trace
from synapse.util.caches.response_cache import ResponseCache from synapse.util.caches.response_cache import ResponseCache
from synapse.util.stringutils import random_string from synapse.util.stringutils import random_string
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Prometheus gauge of outgoing replication requests that are currently in
# flight. It carries a single "name" label, which is filled with the
# replication endpoint's NAME when the gauge is bound via `.labels(cls.NAME)`.
_pending_outgoing_requests = Gauge(
"synapse_pending_outgoing_replication_requests",
"Number of active outgoing replication requests, by replication method name",
["name"],
)
# Prometheus counter of finished outgoing replication requests, labelled by
# the replication endpoint's NAME ("name") and by the outcome ("code").
# As used by send_request, "code" is 200 on success, the response's status
# code when an HttpResponseException is raised, and "ERR" for any other
# exception.
_outgoing_request_counter = Counter(
"synapse_outgoing_replication_requests",
"Number of outgoing replication requests, by replication method name and result",
["name", "code"],
)
class ReplicationEndpoint(metaclass=abc.ABCMeta): class ReplicationEndpoint(metaclass=abc.ABCMeta):
"""Helper base class for defining new replication HTTP endpoints. """Helper base class for defining new replication HTTP endpoints.
@ -138,7 +148,10 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta):
instance_map = hs.config.worker.instance_map instance_map = hs.config.worker.instance_map
outgoing_gauge = _pending_outgoing_requests.labels(cls.NAME)
@trace(opname="outgoing_replication_request") @trace(opname="outgoing_replication_request")
@outgoing_gauge.track_inprogress()
async def send_request(instance_name="master", **kwargs): async def send_request(instance_name="master", **kwargs):
if instance_name == local_instance_name: if instance_name == local_instance_name:
raise Exception("Trying to send HTTP request to self") raise Exception("Trying to send HTTP request to self")
@ -193,23 +206,26 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta):
try: try:
result = await request_func(uri, data, headers=headers) result = await request_func(uri, data, headers=headers)
break break
except CodeMessageException as e: except RequestTimedOutError:
if e.code != 504 or not cls.RETRY_ON_TIMEOUT: if not cls.RETRY_ON_TIMEOUT:
raise raise
logger.warning("%s request timed out", cls.NAME) logger.warning("%s request timed out; retrying", cls.NAME)
# If we timed out we probably don't need to worry about backing # If we timed out we probably don't need to worry about backing
# off too much, but let's just wait a little anyway. # off too much, but let's just wait a little anyway.
await clock.sleep(1) await clock.sleep(1)
except HttpResponseException as e: except HttpResponseException as e:
# We convert to SynapseError as we know that it was a SynapseError # We convert to SynapseError as we know that it was a SynapseError
# on the master process that we should send to the client. (And # on the main process that we should send to the client. (And
# importantly, not stack traces everywhere) # importantly, not stack traces everywhere)
_outgoing_request_counter.labels(cls.NAME, e.code).inc()
raise e.to_synapse_error() raise e.to_synapse_error()
except RequestSendFailed as e: except Exception as e:
raise SynapseError(502, "Failed to talk to master") from e _outgoing_request_counter.labels(cls.NAME, "ERR").inc()
raise SynapseError(502, "Failed to talk to main process") from e
_outgoing_request_counter.labels(cls.NAME, 200).inc()
return result return result
return send_request return send_request