From 0ec12a37538d0df07d96cfc9cf5f5208f7453607 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 20 Jan 2023 21:04:33 +0000 Subject: [PATCH] Reduce max time we wait for stream positions (#14881) Now that we wait for stream positions whenever we do a HTTP replication hit, we need to be less brutal in the case where we do timeout (as we have bugs around this). --- changelog.d/14881.misc | 1 + synapse/replication/http/_base.py | 2 -- synapse/replication/tcp/client.py | 21 +++++++++++---------- 3 files changed, 12 insertions(+), 12 deletions(-) create mode 100644 changelog.d/14881.misc diff --git a/changelog.d/14881.misc b/changelog.d/14881.misc new file mode 100644 index 000000000..be89d092b --- /dev/null +++ b/changelog.d/14881.misc @@ -0,0 +1 @@ +Reduce max time we wait for stream positions. diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py index 709327b97..908f3f1db 100644 --- a/synapse/replication/http/_base.py +++ b/synapse/replication/http/_base.py @@ -352,7 +352,6 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta): instance_name=instance_name, stream_name=stream_name, position=position, - raise_on_timeout=False, ) return result @@ -414,7 +413,6 @@ class ReplicationEndpoint(metaclass=abc.ABCMeta): instance_name=content[_STREAM_POSITION_KEY]["instance_name"], stream_name=stream_name, position=position, - raise_on_timeout=False, ) if self.CACHE: diff --git a/synapse/replication/tcp/client.py b/synapse/replication/tcp/client.py index 6e242c574..493f61667 100644 --- a/synapse/replication/tcp/client.py +++ b/synapse/replication/tcp/client.py @@ -59,7 +59,7 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) # How long we allow callers to wait for replication updates before timing out. -_WAIT_FOR_REPLICATION_TIMEOUT_SECONDS = 30 +_WAIT_FOR_REPLICATION_TIMEOUT_SECONDS = 5 class DirectTcpReplicationClientFactory(ReconnectingClientFactory): @@ -326,7 +326,6 @@ class ReplicationDataHandler: instance_name: str, stream_name: str, position: int, - raise_on_timeout: bool = True, ) -> None: """Wait until this instance has received updates up to and including the given stream position. @@ -335,8 +334,6 @@ class ReplicationDataHandler: instance_name stream_name position - raise_on_timeout: Whether to raise an exception if we time out - waiting for the updates, or if we log an error and return. """ if instance_name == self._instance_name: @@ -365,19 +362,23 @@ class ReplicationDataHandler: # We measure here to get in flight counts and average waiting time. with Measure(self._clock, "repl.wait_for_stream_position"): - logger.info("Waiting for repl stream %r to reach %s", stream_name, position) + logger.info( + "Waiting for repl stream %r to reach %s (%s)", + stream_name, + position, + instance_name, + ) try: await make_deferred_yieldable(deferred) except defer.TimeoutError: logger.error("Timed out waiting for stream %s", stream_name) - - if raise_on_timeout: - raise - return logger.info( - "Finished waiting for repl stream %r to reach %s", stream_name, position + "Finished waiting for repl stream %r to reach %s (%s)", + stream_name, + position, + instance_name, ) def stop_pusher(self, user_id: str, app_id: str, pushkey: str) -> None: