Add metrics to track success/otherwise of replication requests (#8406)
One hope is that this might provide some insights into #3365.
This commit is contained in:
parent 1c262431f9
commit 866c84da8d

2 changed files with 29 additions and 12 deletions
changelog.d/8406.feature (new file)
@@ -0,0 +1 @@
+Add prometheus metrics for replication requests.
synapse/replication/http/_base.py
@@ -20,18 +20,28 @@ import urllib
 from inspect import signature
 from typing import Dict, List, Tuple

-from synapse.api.errors import (
-    CodeMessageException,
-    HttpResponseException,
-    RequestSendFailed,
-    SynapseError,
-)
+from prometheus_client import Counter, Gauge
+
+from synapse.api.errors import HttpResponseException, SynapseError
+from synapse.http import RequestTimedOutError
 from synapse.logging.opentracing import inject_active_span_byte_dict, trace
 from synapse.util.caches.response_cache import ResponseCache
 from synapse.util.stringutils import random_string

 logger = logging.getLogger(__name__)

+_pending_outgoing_requests = Gauge(
+    "synapse_pending_outgoing_replication_requests",
+    "Number of active outgoing replication requests, by replication method name",
+    ["name"],
+)
+
+_outgoing_request_counter = Counter(
+    "synapse_outgoing_replication_requests",
+    "Number of outgoing replication requests, by replication method name and result",
+    ["name", "code"],
+)
+

 class ReplicationEndpoint(metaclass=abc.ABCMeta):
     """Helper base class for defining new replication HTTP endpoints.
@@ -138,7 +148,10 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta):

         instance_map = hs.config.worker.instance_map

+        outgoing_gauge = _pending_outgoing_requests.labels(cls.NAME)
+
         @trace(opname="outgoing_replication_request")
+        @outgoing_gauge.track_inprogress()
         async def send_request(instance_name="master", **kwargs):
             if instance_name == local_instance_name:
                 raise Exception("Trying to send HTTP request to self")
@@ -193,23 +206,26 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta):
                 try:
                     result = await request_func(uri, data, headers=headers)
                     break
-                except CodeMessageException as e:
-                    if e.code != 504 or not cls.RETRY_ON_TIMEOUT:
+                except RequestTimedOutError:
+                    if not cls.RETRY_ON_TIMEOUT:
                         raise

-                    logger.warning("%s request timed out", cls.NAME)
+                    logger.warning("%s request timed out; retrying", cls.NAME)

                     # If we timed out we probably don't need to worry about backing
                     # off too much, but lets just wait a little anyway.
                     await clock.sleep(1)
             except HttpResponseException as e:
                 # We convert to SynapseError as we know that it was a SynapseError
-                # on the master process that we should send to the client. (And
+                # on the main process that we should send to the client. (And
                 # importantly, not stack traces everywhere)
+                _outgoing_request_counter.labels(cls.NAME, e.code).inc()
                 raise e.to_synapse_error()
-            except RequestSendFailed as e:
-                raise SynapseError(502, "Failed to talk to master") from e
+            except Exception as e:
+                _outgoing_request_counter.labels(cls.NAME, "ERR").inc()
+                raise SynapseError(502, "Failed to talk to main process") from e

+            _outgoing_request_counter.labels(cls.NAME, 200).inc()
             return result

         return send_request
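For context, here is a minimal standalone sketch of the prometheus_client pattern this change relies on; the metric names, endpoint name, and call_endpoint function below are illustrative placeholders, not Synapse code. A labelled Gauge tracks in-flight requests via track_inprogress(), and a labelled Counter records one increment per completed request, keyed by result code (or "ERR" when no HTTP status was received), mirroring the handlers in the diff.

from prometheus_client import Counter, Gauge, generate_latest

# Hypothetical metric names; the real ones added by this commit are
# synapse_pending_outgoing_replication_requests and
# synapse_outgoing_replication_requests.
_pending = Gauge(
    "example_pending_requests",
    "Number of requests currently in flight, by endpoint name",
    ["name"],
)
_results = Counter(
    "example_requests",
    "Number of completed requests, by endpoint name and result code",
    ["name", "code"],
)

def call_endpoint(name: str) -> None:
    # track_inprogress() bumps the gauge on entry and drops it again on exit,
    # which is what the @outgoing_gauge.track_inprogress() decorator does above.
    with _pending.labels(name).track_inprogress():
        try:
            pass  # ... perform the request here ...
        except Exception:
            # No HTTP status code available, so count the failure as "ERR",
            # matching the diff's handling of unexpected exceptions.
            _results.labels(name, "ERR").inc()
            raise
        # Success: label the increment with the result code.
        _results.labels(name, 200).inc()

if __name__ == "__main__":
    call_endpoint("example_endpoint")
    # generate_latest() renders the current values in the Prometheus text
    # exposition format, i.e. what a /metrics scrape would return.
    print(generate_latest().decode("utf-8"))

Once scraped, the metrics added by this commit appear as series such as synapse_pending_outgoing_replication_requests{name="..."} and synapse_outgoing_replication_requests{name="...",code="..."}, so success and failure rates can be broken down per replication endpoint.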