Don't hammer the database for destination retry timings every ~5mins (#10036)

This commit is contained in:
Erik Johnston 2021-05-21 17:57:08 +01:00 committed by GitHub
parent e8ac9ac8ca
commit 3e831f24ff
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 62 additions and 76 deletions

1
changelog.d/10036.misc Normal file
View file

@ -0,0 +1 @@
Properly invalidate caches for destination retry timings every (instead of expiring entries every 5 minutes).

View file

@ -61,7 +61,6 @@ from synapse.replication.slave.storage.pushers import SlavedPusherStore
from synapse.replication.slave.storage.receipts import SlavedReceiptsStore from synapse.replication.slave.storage.receipts import SlavedReceiptsStore
from synapse.replication.slave.storage.registration import SlavedRegistrationStore from synapse.replication.slave.storage.registration import SlavedRegistrationStore
from synapse.replication.slave.storage.room import RoomStore from synapse.replication.slave.storage.room import RoomStore
from synapse.replication.slave.storage.transactions import SlavedTransactionStore
from synapse.rest.admin import register_servlets_for_media_repo from synapse.rest.admin import register_servlets_for_media_repo
from synapse.rest.client.v1 import events, login, presence, room from synapse.rest.client.v1 import events, login, presence, room
from synapse.rest.client.v1.initial_sync import InitialSyncRestServlet from synapse.rest.client.v1.initial_sync import InitialSyncRestServlet
@ -237,7 +236,6 @@ class GenericWorkerSlavedStore(
DirectoryStore, DirectoryStore,
SlavedApplicationServiceStore, SlavedApplicationServiceStore,
SlavedRegistrationStore, SlavedRegistrationStore,
SlavedTransactionStore,
SlavedProfileStore, SlavedProfileStore,
SlavedClientIpStore, SlavedClientIpStore,
SlavedFilteringStore, SlavedFilteringStore,

View file

@ -160,7 +160,7 @@ class Authenticator:
# If we get a valid signed request from the other side, its probably # If we get a valid signed request from the other side, its probably
# alive # alive
retry_timings = await self.store.get_destination_retry_timings(origin) retry_timings = await self.store.get_destination_retry_timings(origin)
if retry_timings and retry_timings["retry_last_ts"]: if retry_timings and retry_timings.retry_last_ts:
run_in_background(self._reset_retry_timings, origin) run_in_background(self._reset_retry_timings, origin)
return origin return origin

View file

@ -1,21 +0,0 @@
# Copyright 2015, 2016 OpenMarket Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from synapse.storage.databases.main.transactions import TransactionStore
from ._base import BaseSlavedStore
class SlavedTransactionStore(TransactionStore, BaseSlavedStore):
pass

View file

@ -67,7 +67,7 @@ from .state import StateStore
from .stats import StatsStore from .stats import StatsStore
from .stream import StreamStore from .stream import StreamStore
from .tags import TagsStore from .tags import TagsStore
from .transactions import TransactionStore from .transactions import TransactionWorkerStore
from .ui_auth import UIAuthStore from .ui_auth import UIAuthStore
from .user_directory import UserDirectoryStore from .user_directory import UserDirectoryStore
from .user_erasure_store import UserErasureStore from .user_erasure_store import UserErasureStore
@ -83,7 +83,7 @@ class DataStore(
StreamStore, StreamStore,
ProfileStore, ProfileStore,
PresenceStore, PresenceStore,
TransactionStore, TransactionWorkerStore,
DirectoryStore, DirectoryStore,
KeyStore, KeyStore,
StateStore, StateStore,

View file

@ -16,13 +16,15 @@ import logging
from collections import namedtuple from collections import namedtuple
from typing import Iterable, List, Optional, Tuple from typing import Iterable, List, Optional, Tuple
import attr
from canonicaljson import encode_canonical_json from canonicaljson import encode_canonical_json
from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.metrics.background_process_metrics import wrap_as_background_process
from synapse.storage._base import SQLBaseStore, db_to_json from synapse.storage._base import db_to_json
from synapse.storage.database import DatabasePool, LoggingTransaction from synapse.storage.database import DatabasePool, LoggingTransaction
from synapse.storage.databases.main.cache import CacheInvalidationWorkerStore
from synapse.types import JsonDict from synapse.types import JsonDict
from synapse.util.caches.expiringcache import ExpiringCache from synapse.util.caches.descriptors import cached
db_binary_type = memoryview db_binary_type = memoryview
@ -38,10 +40,23 @@ _UpdateTransactionRow = namedtuple(
"_TransactionRow", ("response_code", "response_json") "_TransactionRow", ("response_code", "response_json")
) )
SENTINEL = object()
@attr.s(slots=True, frozen=True, auto_attribs=True)
class DestinationRetryTimings:
"""The current destination retry timing info for a remote server."""
# The first time we tried and failed to reach the remote server, in ms.
failure_ts: int
# The last time we tried and failed to reach the remote server, in ms.
retry_last_ts: int
# How long since the last time we tried to reach the remote server before
# trying again, in ms.
retry_interval: int
class TransactionWorkerStore(SQLBaseStore): class TransactionWorkerStore(CacheInvalidationWorkerStore):
def __init__(self, database: DatabasePool, db_conn, hs): def __init__(self, database: DatabasePool, db_conn, hs):
super().__init__(database, db_conn, hs) super().__init__(database, db_conn, hs)
@ -60,19 +75,6 @@ class TransactionWorkerStore(SQLBaseStore):
"_cleanup_transactions", _cleanup_transactions_txn "_cleanup_transactions", _cleanup_transactions_txn
) )
class TransactionStore(TransactionWorkerStore):
"""A collection of queries for handling PDUs."""
def __init__(self, database: DatabasePool, db_conn, hs):
super().__init__(database, db_conn, hs)
self._destination_retry_cache = ExpiringCache(
cache_name="get_destination_retry_timings",
clock=self._clock,
expiry_ms=5 * 60 * 1000,
)
async def get_received_txn_response( async def get_received_txn_response(
self, transaction_id: str, origin: str self, transaction_id: str, origin: str
) -> Optional[Tuple[int, JsonDict]]: ) -> Optional[Tuple[int, JsonDict]]:
@ -145,7 +147,11 @@ class TransactionStore(TransactionWorkerStore):
desc="set_received_txn_response", desc="set_received_txn_response",
) )
async def get_destination_retry_timings(self, destination): @cached(max_entries=10000)
async def get_destination_retry_timings(
self,
destination: str,
) -> Optional[DestinationRetryTimings]:
"""Gets the current retry timings (if any) for a given destination. """Gets the current retry timings (if any) for a given destination.
Args: Args:
@ -156,34 +162,29 @@ class TransactionStore(TransactionWorkerStore):
Otherwise a dict for the retry scheme Otherwise a dict for the retry scheme
""" """
result = self._destination_retry_cache.get(destination, SENTINEL)
if result is not SENTINEL:
return result
result = await self.db_pool.runInteraction( result = await self.db_pool.runInteraction(
"get_destination_retry_timings", "get_destination_retry_timings",
self._get_destination_retry_timings, self._get_destination_retry_timings,
destination, destination,
) )
# We don't hugely care about race conditions between getting and
# invalidating the cache, since we time out fairly quickly anyway.
self._destination_retry_cache[destination] = result
return result return result
def _get_destination_retry_timings(self, txn, destination): def _get_destination_retry_timings(
self, txn, destination: str
) -> Optional[DestinationRetryTimings]:
result = self.db_pool.simple_select_one_txn( result = self.db_pool.simple_select_one_txn(
txn, txn,
table="destinations", table="destinations",
keyvalues={"destination": destination}, keyvalues={"destination": destination},
retcols=("destination", "failure_ts", "retry_last_ts", "retry_interval"), retcols=("failure_ts", "retry_last_ts", "retry_interval"),
allow_none=True, allow_none=True,
) )
# check we have a row and retry_last_ts is not null or zero # check we have a row and retry_last_ts is not null or zero
# (retry_last_ts can't be negative) # (retry_last_ts can't be negative)
if result and result["retry_last_ts"]: if result and result["retry_last_ts"]:
return result return DestinationRetryTimings(**result)
else: else:
return None return None
@ -204,7 +205,6 @@ class TransactionStore(TransactionWorkerStore):
retry_interval: how long until next retry in ms retry_interval: how long until next retry in ms
""" """
self._destination_retry_cache.pop(destination, None)
if self.database_engine.can_native_upsert: if self.database_engine.can_native_upsert:
return await self.db_pool.runInteraction( return await self.db_pool.runInteraction(
"set_destination_retry_timings", "set_destination_retry_timings",
@ -252,6 +252,10 @@ class TransactionStore(TransactionWorkerStore):
txn.execute(sql, (destination, failure_ts, retry_last_ts, retry_interval)) txn.execute(sql, (destination, failure_ts, retry_last_ts, retry_interval))
self._invalidate_cache_and_stream(
txn, self.get_destination_retry_timings, (destination,)
)
def _set_destination_retry_timings_emulated( def _set_destination_retry_timings_emulated(
self, txn, destination, failure_ts, retry_last_ts, retry_interval self, txn, destination, failure_ts, retry_last_ts, retry_interval
): ):
@ -295,6 +299,10 @@ class TransactionStore(TransactionWorkerStore):
}, },
) )
self._invalidate_cache_and_stream(
txn, self.get_destination_retry_timings, (destination,)
)
async def store_destination_rooms_entries( async def store_destination_rooms_entries(
self, self,
destinations: Iterable[str], destinations: Iterable[str],

View file

@ -82,11 +82,9 @@ async def get_retry_limiter(destination, clock, store, ignore_backoff=False, **k
retry_timings = await store.get_destination_retry_timings(destination) retry_timings = await store.get_destination_retry_timings(destination)
if retry_timings: if retry_timings:
failure_ts = retry_timings["failure_ts"] failure_ts = retry_timings.failure_ts
retry_last_ts, retry_interval = ( retry_last_ts = retry_timings.retry_last_ts
retry_timings["retry_last_ts"], retry_interval = retry_timings.retry_interval
retry_timings["retry_interval"],
)
now = int(clock.time_msec()) now = int(clock.time_msec())

View file

@ -89,14 +89,8 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
self.event_source = hs.get_event_sources().sources["typing"] self.event_source = hs.get_event_sources().sources["typing"]
self.datastore = hs.get_datastore() self.datastore = hs.get_datastore()
retry_timings_res = {
"destination": "",
"retry_last_ts": 0,
"retry_interval": 0,
"failure_ts": None,
}
self.datastore.get_destination_retry_timings = Mock( self.datastore.get_destination_retry_timings = Mock(
return_value=defer.succeed(retry_timings_res) return_value=defer.succeed(None)
) )
self.datastore.get_device_updates_by_remote = Mock( self.datastore.get_device_updates_by_remote = Mock(

View file

@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from synapse.storage.databases.main.transactions import DestinationRetryTimings
from synapse.util.retryutils import MAX_RETRY_INTERVAL from synapse.util.retryutils import MAX_RETRY_INTERVAL
from tests.unittest import HomeserverTestCase from tests.unittest import HomeserverTestCase
@ -36,8 +37,11 @@ class TransactionStoreTestCase(HomeserverTestCase):
d = self.store.get_destination_retry_timings("example.com") d = self.store.get_destination_retry_timings("example.com")
r = self.get_success(d) r = self.get_success(d)
self.assert_dict( self.assertEqual(
{"retry_last_ts": 50, "retry_interval": 100, "failure_ts": 1000}, r DestinationRetryTimings(
retry_last_ts=50, retry_interval=100, failure_ts=1000
),
r,
) )
def test_initial_set_transactions(self): def test_initial_set_transactions(self):

View file

@ -51,10 +51,12 @@ class RetryLimiterTestCase(HomeserverTestCase):
except AssertionError: except AssertionError:
pass pass
self.pump()
new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
self.assertEqual(new_timings["failure_ts"], failure_ts) self.assertEqual(new_timings.failure_ts, failure_ts)
self.assertEqual(new_timings["retry_last_ts"], failure_ts) self.assertEqual(new_timings.retry_last_ts, failure_ts)
self.assertEqual(new_timings["retry_interval"], MIN_RETRY_INTERVAL) self.assertEqual(new_timings.retry_interval, MIN_RETRY_INTERVAL)
# now if we try again we should get a failure # now if we try again we should get a failure
self.get_failure( self.get_failure(
@ -77,14 +79,16 @@ class RetryLimiterTestCase(HomeserverTestCase):
except AssertionError: except AssertionError:
pass pass
self.pump()
new_timings = self.get_success(store.get_destination_retry_timings("test_dest")) new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
self.assertEqual(new_timings["failure_ts"], failure_ts) self.assertEqual(new_timings.failure_ts, failure_ts)
self.assertEqual(new_timings["retry_last_ts"], retry_ts) self.assertEqual(new_timings.retry_last_ts, retry_ts)
self.assertGreaterEqual( self.assertGreaterEqual(
new_timings["retry_interval"], MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5 new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5
) )
self.assertLessEqual( self.assertLessEqual(
new_timings["retry_interval"], MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0 new_timings.retry_interval, MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0
) )
# #