mirror of
https://mau.dev/maunium/synapse.git
synced 2024-11-14 22:11:47 +01:00
Retry some http replication failures (#12182)
This allows for the target process to be down for around a minute, which provides time for restarts during synapse upgrades/config updates. Closes: #12178. Signed-off-by: Nick Mills-Barrett <nick@beeper.com>
This commit is contained in:
parent
dc8d825ef2
commit
180d8ff0d4
2 changed files with 37 additions and 11 deletions
1
changelog.d/12182.misc
Normal file
1
changelog.d/12182.misc
Normal file
|
@@ -0,0 +1 @@
|
||||||
|
Retry HTTP replication failures; this should prevent 502s when restarting stateful workers (main, event persisters, stream writers). Contributed by Nick @ Beeper.
|
|
@@ -21,6 +21,7 @@ from typing import TYPE_CHECKING, Any, Awaitable, Callable, Dict, List, Tuple
|
||||||
|
|
||||||
from prometheus_client import Counter, Gauge
|
from prometheus_client import Counter, Gauge
|
||||||
|
|
||||||
|
from twisted.internet.error import ConnectError, DNSLookupError
|
||||||
from twisted.web.server import Request
|
from twisted.web.server import Request
|
||||||
|
|
||||||
from synapse.api.errors import HttpResponseException, SynapseError
|
from synapse.api.errors import HttpResponseException, SynapseError
|
||||||
|
@@ -87,6 +88,10 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta):
|
||||||
`_handle_request` must return a Deferred.
|
`_handle_request` must return a Deferred.
|
||||||
RETRY_ON_TIMEOUT(bool): Whether or not to retry the request when a 504
|
RETRY_ON_TIMEOUT(bool): Whether or not to retry the request when a 504
|
||||||
is received.
|
is received.
|
||||||
|
RETRY_ON_CONNECT_ERROR (bool): Whether or not to retry the request when
|
||||||
|
a connection error is received.
|
||||||
|
RETRY_ON_CONNECT_ERROR_ATTEMPTS (int): Number of attempts to retry when
|
||||||
|
receiving connection errors, each will backoff exponentially longer.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
NAME: str = abc.abstractproperty() # type: ignore
|
NAME: str = abc.abstractproperty() # type: ignore
|
||||||
|
@@ -94,6 +99,8 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta):
|
||||||
METHOD = "POST"
|
METHOD = "POST"
|
||||||
CACHE = True
|
CACHE = True
|
||||||
RETRY_ON_TIMEOUT = True
|
RETRY_ON_TIMEOUT = True
|
||||||
|
RETRY_ON_CONNECT_ERROR = True
|
||||||
|
RETRY_ON_CONNECT_ERROR_ATTEMPTS = 5 # =63s (2^6-1)
|
||||||
|
|
||||||
def __init__(self, hs: "HomeServer"):
|
def __init__(self, hs: "HomeServer"):
|
||||||
if self.CACHE:
|
if self.CACHE:
|
||||||
|
@@ -236,18 +243,20 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta):
|
||||||
"/".join(url_args),
|
"/".join(url_args),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
headers: Dict[bytes, List[bytes]] = {}
|
||||||
|
# Add an authorization header, if configured.
|
||||||
|
if replication_secret:
|
||||||
|
headers[b"Authorization"] = [b"Bearer " + replication_secret]
|
||||||
|
opentracing.inject_header_dict(headers, check_destination=False)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# Keep track of attempts made so we can bail if we don't manage to
|
||||||
|
# connect to the target after N tries.
|
||||||
|
attempts = 0
|
||||||
# We keep retrying the same request for timeouts. This is so that we
|
# We keep retrying the same request for timeouts. This is so that we
|
||||||
# have a good idea that the request has either succeeded or failed
|
# have a good idea that the request has either succeeded or failed
|
||||||
# on the master, and so whether we should clean up or not.
|
# on the master, and so whether we should clean up or not.
|
||||||
while True:
|
while True:
|
||||||
headers: Dict[bytes, List[bytes]] = {}
|
|
||||||
# Add an authorization header, if configured.
|
|
||||||
if replication_secret:
|
|
||||||
headers[b"Authorization"] = [
|
|
||||||
b"Bearer " + replication_secret
|
|
||||||
]
|
|
||||||
opentracing.inject_header_dict(headers, check_destination=False)
|
|
||||||
try:
|
try:
|
||||||
result = await request_func(uri, data, headers=headers)
|
result = await request_func(uri, data, headers=headers)
|
||||||
break
|
break
|
||||||
|
@@ -260,6 +269,22 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta):
|
||||||
# If we timed out we probably don't need to worry about backing
|
# If we timed out we probably don't need to worry about backing
|
||||||
# off too much, but lets just wait a little anyway.
|
# off too much, but lets just wait a little anyway.
|
||||||
await clock.sleep(1)
|
await clock.sleep(1)
|
||||||
|
except (ConnectError, DNSLookupError) as e:
|
||||||
|
if not cls.RETRY_ON_CONNECT_ERROR:
|
||||||
|
raise
|
||||||
|
if attempts > cls.RETRY_ON_CONNECT_ERROR_ATTEMPTS:
|
||||||
|
raise
|
||||||
|
|
||||||
|
delay = 2 ** attempts
|
||||||
|
logger.warning(
|
||||||
|
"%s request connection failed; retrying in %ds: %r",
|
||||||
|
cls.NAME,
|
||||||
|
delay,
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
|
||||||
|
await clock.sleep(delay)
|
||||||
|
attempts += 1
|
||||||
except HttpResponseException as e:
|
except HttpResponseException as e:
|
||||||
# We convert to SynapseError as we know that it was a SynapseError
|
# We convert to SynapseError as we know that it was a SynapseError
|
||||||
# on the main process that we should send to the client. (And
|
# on the main process that we should send to the client. (And
|
||||||
|
|
Loading…
Reference in a new issue