Mirror of https://mau.dev/maunium/synapse.git (synced 2024-11-15 14:32:30 +01:00)
Properly update retry_last_ts when hitting the maximum retry interval (#16156)
* Properly update retry_last_ts when hitting the maximum retry interval

  This was broken in 1.87 when the maximum retry interval got changed from
  almost infinite to a week (and made configurable).

  fixes #16101

  Signed-off-by: Nicolas Werner <nicolas.werner@hotmail.de>

* Add changelog

* Change fix + add test

* Add comment

---------

Signed-off-by: Nicolas Werner <nicolas.werner@hotmail.de>
Co-authored-by: Mathieu Velten <mathieuv@matrix.org>
parent dffe095642
commit 19a1cda084

3 changed files with 55 additions and 1 deletion
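Why the strict comparison mattered: once a destination's retry_interval has grown to the configured maximum, every further failure recomputes that same capped interval, so an upsert that only fires when the new interval is strictly greater than the stored one never runs again. retry_last_ts then stops advancing and the destination always looks overdue for a retry. A minimal sketch of that backoff logic (illustrative only, not the actual Synapse implementation; MIN_RETRY_INTERVAL_MS, MAX_RETRY_INTERVAL_MS, MULTIPLIER and the Timings type are made-up names):

from dataclasses import dataclass

# Illustrative constants, not Synapse's real ones.
MIN_RETRY_INTERVAL_MS = 10 * 60 * 1000            # starting backoff for the sketch
MAX_RETRY_INTERVAL_MS = 7 * 24 * 60 * 60 * 1000   # "a week", per the commit message
MULTIPLIER = 2


@dataclass
class Timings:
    retry_last_ts: int   # when we last failed to reach the destination (ms)
    retry_interval: int  # how long to back off before the next attempt (ms)


def on_failure(stored: Timings, now_ms: int) -> Timings:
    """Compute the next backoff after a failed request to a destination."""
    new_interval = min(
        max(stored.retry_interval * MULTIPLIER, MIN_RETRY_INTERVAL_MS),
        MAX_RETRY_INTERVAL_MS,
    )
    return Timings(retry_last_ts=now_ms, retry_interval=new_interval)


def should_retry(stored: Timings, now_ms: int) -> bool:
    """A retry is allowed once the backoff window has elapsed."""
    return now_ms >= stored.retry_last_ts + stored.retry_interval


# The bug: if the store only persists the result of on_failure() when
# new_interval is strictly greater than the stored interval, then once the
# cap is reached retry_last_ts is never refreshed, should_retry() keeps
# returning True, and a long-offline server is retried far too often.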
changelog.d/16156.bugfix (new file, 1 line)

@@ -0,0 +1 @@
+Fix a bug introduced in 1.87 where synapse would send an excessive amount of federation requests to servers which have been offline for a long time. Contributed by Nico.
@@ -242,6 +242,8 @@ class TransactionWorkerStore(CacheInvalidationWorkerStore):
     ) -> None:
         # Upsert retry time interval if retry_interval is zero (i.e. we're
         # resetting it) or greater than the existing retry interval.
+        # We also upsert when the new retry interval is the same as the existing one,
+        # since it will be the case when `destination_max_retry_interval` is reached.
         #
         # WARNING: This is executed in autocommit, so we shouldn't add any more
         # SQL calls in here (without being very careful).
@@ -257,7 +259,7 @@ class TransactionWorkerStore(CacheInvalidationWorkerStore):
             WHERE
                 EXCLUDED.retry_interval = 0
                 OR destinations.retry_interval IS NULL
-                OR destinations.retry_interval < EXCLUDED.retry_interval
+                OR destinations.retry_interval <= EXCLUDED.retry_interval
         """

         txn.execute(sql, (destination, failure_ts, retry_last_ts, retry_interval))
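The WHERE clause in the hunk above guards a native upsert, but the rest of the statement falls outside the visible context. The following is a rough reconstruction of what the surrounding SQL plausibly looks like (the INSERT column list and the DO UPDATE SET clause are inferred from the txn.execute() arguments, not copied from the commit), with the one-character fix applied:

# Rough reconstruction: only the WHERE clause and the execute() call appear
# in the hunk; the INSERT / ON CONFLICT portion is an assumption.
sql = """
    INSERT INTO destinations (destination, failure_ts, retry_last_ts, retry_interval)
    VALUES (?, ?, ?, ?)
    ON CONFLICT (destination) DO UPDATE SET
        failure_ts = EXCLUDED.failure_ts,
        retry_last_ts = EXCLUDED.retry_last_ts,
        retry_interval = EXCLUDED.retry_interval
    WHERE
        EXCLUDED.retry_interval = 0
        OR destinations.retry_interval IS NULL
        -- Using "<=" (rather than "<") means the row is still rewritten, and
        -- retry_last_ts refreshed, when the new interval equals the stored
        -- one, i.e. once destination_max_retry_interval has been reached.
        OR destinations.retry_interval <= EXCLUDED.retry_interval
"""
txn.execute(sql, (destination, failure_ts, retry_last_ts, retry_interval))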
@@ -108,3 +108,54 @@ class RetryLimiterTestCase(HomeserverTestCase):
 
         new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
         self.assertIsNone(new_timings)
+
+    def test_max_retry_interval(self) -> None:
+        """Test that `destination_max_retry_interval` setting works as expected"""
+        store = self.hs.get_datastores().main
+
+        destination_max_retry_interval_ms = (
+            self.hs.config.federation.destination_max_retry_interval_ms
+        )
+
+        self.get_success(get_retry_limiter("test_dest", self.clock, store))
+        self.pump(1)
+
+        failure_ts = self.clock.time_msec()
+
+        # Simulate reaching destination_max_retry_interval
+        self.get_success(
+            store.set_destination_retry_timings(
+                "test_dest",
+                failure_ts=failure_ts,
+                retry_last_ts=failure_ts,
+                retry_interval=destination_max_retry_interval_ms,
+            )
+        )
+
+        # Check it fails
+        self.get_failure(
+            get_retry_limiter("test_dest", self.clock, store), NotRetryingDestination
+        )
+
+        # Get past retry_interval and we can try again, and still throw an error to continue the backoff
+        self.reactor.advance(destination_max_retry_interval_ms / 1000 + 1)
+        limiter = self.get_success(get_retry_limiter("test_dest", self.clock, store))
+        self.pump(1)
+        try:
+            with limiter:
+                self.pump(1)
+                raise AssertionError("argh")
+        except AssertionError:
+            pass
+
+        self.pump()
+
+        # retry_interval does not increase and stays at destination_max_retry_interval_ms
+        new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
+        assert new_timings is not None
+        self.assertEqual(new_timings.retry_interval, destination_max_retry_interval_ms)
+
+        # Check it fails
+        self.get_failure(
+            get_retry_limiter("test_dest", self.clock, store), NotRetryingDestination
+        )
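The test exercises the limiter the same way callers are expected to: get_retry_limiter() fails with NotRetryingDestination while a destination is still inside its backoff window, and the returned limiter is used as a context manager so that an exception raised inside the block records another failure (growing the interval up to the cap), while a clean exit clears the backoff. A hedged usage sketch along those lines (send_to_destination and request are hypothetical; only get_retry_limiter and NotRetryingDestination come from the commit):

from typing import Awaitable, Callable

from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter


async def send_to_destination(
    destination: str,
    clock,
    store,
    request: Callable[[str], Awaitable[None]],
) -> None:
    # Hypothetical caller; the clock/store parameters are left untyped on purpose.
    try:
        limiter = await get_retry_limiter(destination, clock, store)
    except NotRetryingDestination:
        # Still inside the backoff window for this destination: skip it.
        return

    with limiter:
        # If this raises, the limiter records the failure and bumps
        # retry_interval (capped at destination_max_retry_interval); a clean
        # exit resets the retry timings for the destination.
        await request(destination)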