mirror of
https://mau.dev/maunium/synapse.git
synced 2024-12-14 20:53:52 +01:00
add a cache to have_seen_event (#9953)
Empirically, this helped my server considerably when handling gaps in Matrix HQ. The problem was that we would repeatedly call have_seen_events for the same set of (50K or so) auth_events, each of which would take many minutes to complete, even though it's only an index scan.
This commit is contained in:
parent
10e6d2abce
commit
b4b2fd2ece
10 changed files with 206 additions and 21 deletions
1
changelog.d/9953.feature
Normal file
1
changelog.d/9953.feature
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Improve performance of incoming federation transactions in large rooms.
|
1
changelog.d/9973.feature
Normal file
1
changelog.d/9973.feature
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Improve performance of incoming federation transactions in large rooms.
|
|
@ -1 +0,0 @@
|
||||||
Make `LruCache.invalidate` support tree invalidation, and remove `invalidate_many`.
|
|
|
@ -577,7 +577,9 @@ class FederationHandler(BaseHandler):
|
||||||
|
|
||||||
# Fetch the state events from the DB, and check we have the auth events.
|
# Fetch the state events from the DB, and check we have the auth events.
|
||||||
event_map = await self.store.get_events(state_event_ids, allow_rejected=True)
|
event_map = await self.store.get_events(state_event_ids, allow_rejected=True)
|
||||||
auth_events_in_store = await self.store.have_seen_events(auth_event_ids)
|
auth_events_in_store = await self.store.have_seen_events(
|
||||||
|
room_id, auth_event_ids
|
||||||
|
)
|
||||||
|
|
||||||
# Check for missing events. We handle state and auth event seperately,
|
# Check for missing events. We handle state and auth event seperately,
|
||||||
# as we want to pull the state from the DB, but we don't for the auth
|
# as we want to pull the state from the DB, but we don't for the auth
|
||||||
|
@ -610,7 +612,7 @@ class FederationHandler(BaseHandler):
|
||||||
|
|
||||||
if missing_auth_events:
|
if missing_auth_events:
|
||||||
auth_events_in_store = await self.store.have_seen_events(
|
auth_events_in_store = await self.store.have_seen_events(
|
||||||
missing_auth_events
|
room_id, missing_auth_events
|
||||||
)
|
)
|
||||||
missing_auth_events.difference_update(auth_events_in_store)
|
missing_auth_events.difference_update(auth_events_in_store)
|
||||||
|
|
||||||
|
@ -710,7 +712,7 @@ class FederationHandler(BaseHandler):
|
||||||
|
|
||||||
missing_auth_events = set(auth_event_ids) - fetched_events.keys()
|
missing_auth_events = set(auth_event_ids) - fetched_events.keys()
|
||||||
missing_auth_events.difference_update(
|
missing_auth_events.difference_update(
|
||||||
await self.store.have_seen_events(missing_auth_events)
|
await self.store.have_seen_events(room_id, missing_auth_events)
|
||||||
)
|
)
|
||||||
logger.debug("We are also missing %i auth events", len(missing_auth_events))
|
logger.debug("We are also missing %i auth events", len(missing_auth_events))
|
||||||
|
|
||||||
|
@ -2475,7 +2477,7 @@ class FederationHandler(BaseHandler):
|
||||||
#
|
#
|
||||||
# we start by checking if they are in the store, and then try calling /event_auth/.
|
# we start by checking if they are in the store, and then try calling /event_auth/.
|
||||||
if missing_auth:
|
if missing_auth:
|
||||||
have_events = await self.store.have_seen_events(missing_auth)
|
have_events = await self.store.have_seen_events(event.room_id, missing_auth)
|
||||||
logger.debug("Events %s are in the store", have_events)
|
logger.debug("Events %s are in the store", have_events)
|
||||||
missing_auth.difference_update(have_events)
|
missing_auth.difference_update(have_events)
|
||||||
|
|
||||||
|
@ -2494,7 +2496,7 @@ class FederationHandler(BaseHandler):
|
||||||
return context
|
return context
|
||||||
|
|
||||||
seen_remotes = await self.store.have_seen_events(
|
seen_remotes = await self.store.have_seen_events(
|
||||||
[e.event_id for e in remote_auth_chain]
|
event.room_id, [e.event_id for e in remote_auth_chain]
|
||||||
)
|
)
|
||||||
|
|
||||||
for e in remote_auth_chain:
|
for e in remote_auth_chain:
|
||||||
|
|
|
@ -168,6 +168,7 @@ class CacheInvalidationWorkerStore(SQLBaseStore):
|
||||||
backfilled,
|
backfilled,
|
||||||
):
|
):
|
||||||
self._invalidate_get_event_cache(event_id)
|
self._invalidate_get_event_cache(event_id)
|
||||||
|
self.have_seen_event.invalidate((room_id, event_id))
|
||||||
|
|
||||||
self.get_latest_event_ids_in_room.invalidate((room_id,))
|
self.get_latest_event_ids_in_room.invalidate((room_id,))
|
||||||
|
|
||||||
|
|
|
@ -22,6 +22,7 @@ from typing import (
|
||||||
Iterable,
|
Iterable,
|
||||||
List,
|
List,
|
||||||
Optional,
|
Optional,
|
||||||
|
Set,
|
||||||
Tuple,
|
Tuple,
|
||||||
overload,
|
overload,
|
||||||
)
|
)
|
||||||
|
@ -55,7 +56,7 @@ from synapse.storage.engines import PostgresEngine
|
||||||
from synapse.storage.util.id_generators import MultiWriterIdGenerator, StreamIdGenerator
|
from synapse.storage.util.id_generators import MultiWriterIdGenerator, StreamIdGenerator
|
||||||
from synapse.storage.util.sequence import build_sequence_generator
|
from synapse.storage.util.sequence import build_sequence_generator
|
||||||
from synapse.types import JsonDict, get_domain_from_id
|
from synapse.types import JsonDict, get_domain_from_id
|
||||||
from synapse.util.caches.descriptors import cached
|
from synapse.util.caches.descriptors import cached, cachedList
|
||||||
from synapse.util.caches.lrucache import LruCache
|
from synapse.util.caches.lrucache import LruCache
|
||||||
from synapse.util.iterutils import batch_iter
|
from synapse.util.iterutils import batch_iter
|
||||||
from synapse.util.metrics import Measure
|
from synapse.util.metrics import Measure
|
||||||
|
@ -1045,32 +1046,74 @@ class EventsWorkerStore(SQLBaseStore):
|
||||||
|
|
||||||
return {r["event_id"] for r in rows}
|
return {r["event_id"] for r in rows}
|
||||||
|
|
||||||
async def have_seen_events(self, event_ids):
|
async def have_seen_events(
|
||||||
|
self, room_id: str, event_ids: Iterable[str]
|
||||||
|
) -> Set[str]:
|
||||||
"""Given a list of event ids, check if we have already processed them.
|
"""Given a list of event ids, check if we have already processed them.
|
||||||
|
|
||||||
|
The room_id is only used to structure the cache (so that it can later be
|
||||||
|
invalidated by room_id) - there is no guarantee that the events are actually
|
||||||
|
in the room in question.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
event_ids (iterable[str]):
|
room_id: Room we are polling
|
||||||
|
event_ids: events we are looking for
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
set[str]: The events we have already seen.
|
set[str]: The events we have already seen.
|
||||||
"""
|
"""
|
||||||
# if the event cache contains the event, obviously we've seen it.
|
res = await self._have_seen_events_dict(
|
||||||
results = {x for x in event_ids if self._get_event_cache.contains(x)}
|
(room_id, event_id) for event_id in event_ids
|
||||||
|
)
|
||||||
|
return {eid for ((_rid, eid), have_event) in res.items() if have_event}
|
||||||
|
|
||||||
def have_seen_events_txn(txn, chunk):
|
@cachedList("have_seen_event", "keys")
|
||||||
sql = "SELECT event_id FROM events as e WHERE "
|
async def _have_seen_events_dict(
|
||||||
|
self, keys: Iterable[Tuple[str, str]]
|
||||||
|
) -> Dict[Tuple[str, str], bool]:
|
||||||
|
"""Helper for have_seen_events
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
a dict {(room_id, event_id)-> bool}
|
||||||
|
"""
|
||||||
|
# if the event cache contains the event, obviously we've seen it.
|
||||||
|
|
||||||
|
cache_results = {
|
||||||
|
(rid, eid) for (rid, eid) in keys if self._get_event_cache.contains((eid,))
|
||||||
|
}
|
||||||
|
results = {x: True for x in cache_results}
|
||||||
|
|
||||||
|
def have_seen_events_txn(txn, chunk: Tuple[Tuple[str, str], ...]):
|
||||||
|
# we deliberately do *not* query the database for room_id, to make the
|
||||||
|
# query an index-only lookup on `events_event_id_key`.
|
||||||
|
#
|
||||||
|
# We therefore pull the events from the database into a set...
|
||||||
|
|
||||||
|
sql = "SELECT event_id FROM events AS e WHERE "
|
||||||
clause, args = make_in_list_sql_clause(
|
clause, args = make_in_list_sql_clause(
|
||||||
txn.database_engine, "e.event_id", chunk
|
txn.database_engine, "e.event_id", [eid for (_rid, eid) in chunk]
|
||||||
)
|
)
|
||||||
txn.execute(sql + clause, args)
|
txn.execute(sql + clause, args)
|
||||||
results.update(row[0] for row in txn)
|
found_events = {eid for eid, in txn}
|
||||||
|
|
||||||
for chunk in batch_iter((x for x in event_ids if x not in results), 100):
|
# ... and then we can update the results for each row in the batch
|
||||||
|
results.update({(rid, eid): (eid in found_events) for (rid, eid) in chunk})
|
||||||
|
|
||||||
|
# each batch requires its own index scan, so we make the batches as big as
|
||||||
|
# possible.
|
||||||
|
for chunk in batch_iter((k for k in keys if k not in cache_results), 500):
|
||||||
await self.db_pool.runInteraction(
|
await self.db_pool.runInteraction(
|
||||||
"have_seen_events", have_seen_events_txn, chunk
|
"have_seen_events", have_seen_events_txn, chunk
|
||||||
)
|
)
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
@cached(max_entries=100000, tree=True)
|
||||||
|
async def have_seen_event(self, room_id: str, event_id: str):
|
||||||
|
# this only exists for the benefit of the @cachedList descriptor on
|
||||||
|
# _have_seen_events_dict
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
def _get_current_state_event_counts_txn(self, txn, room_id):
|
def _get_current_state_event_counts_txn(self, txn, room_id):
|
||||||
"""
|
"""
|
||||||
See get_current_state_event_counts.
|
See get_current_state_event_counts.
|
||||||
|
|
|
@ -16,14 +16,14 @@ import logging
|
||||||
from typing import Any, List, Set, Tuple
|
from typing import Any, List, Set, Tuple
|
||||||
|
|
||||||
from synapse.api.errors import SynapseError
|
from synapse.api.errors import SynapseError
|
||||||
from synapse.storage._base import SQLBaseStore
|
from synapse.storage.databases.main import CacheInvalidationWorkerStore
|
||||||
from synapse.storage.databases.main.state import StateGroupWorkerStore
|
from synapse.storage.databases.main.state import StateGroupWorkerStore
|
||||||
from synapse.types import RoomStreamToken
|
from synapse.types import RoomStreamToken
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class PurgeEventsStore(StateGroupWorkerStore, SQLBaseStore):
|
class PurgeEventsStore(StateGroupWorkerStore, CacheInvalidationWorkerStore):
|
||||||
async def purge_history(
|
async def purge_history(
|
||||||
self, room_id: str, token: str, delete_local_events: bool
|
self, room_id: str, token: str, delete_local_events: bool
|
||||||
) -> Set[int]:
|
) -> Set[int]:
|
||||||
|
@ -203,8 +203,6 @@ class PurgeEventsStore(StateGroupWorkerStore, SQLBaseStore):
|
||||||
"DELETE FROM event_to_state_groups "
|
"DELETE FROM event_to_state_groups "
|
||||||
"WHERE event_id IN (SELECT event_id from events_to_purge)"
|
"WHERE event_id IN (SELECT event_id from events_to_purge)"
|
||||||
)
|
)
|
||||||
for event_id, _ in event_rows:
|
|
||||||
txn.call_after(self._get_state_group_for_event.invalidate, (event_id,))
|
|
||||||
|
|
||||||
# Delete all remote non-state events
|
# Delete all remote non-state events
|
||||||
for table in (
|
for table in (
|
||||||
|
@ -283,6 +281,20 @@ class PurgeEventsStore(StateGroupWorkerStore, SQLBaseStore):
|
||||||
# so make sure to keep this actually last.
|
# so make sure to keep this actually last.
|
||||||
txn.execute("DROP TABLE events_to_purge")
|
txn.execute("DROP TABLE events_to_purge")
|
||||||
|
|
||||||
|
for event_id, should_delete in event_rows:
|
||||||
|
self._invalidate_cache_and_stream(
|
||||||
|
txn, self._get_state_group_for_event, (event_id,)
|
||||||
|
)
|
||||||
|
|
||||||
|
# XXX: This is racy, since have_seen_events could be called between the
|
||||||
|
# transaction completing and the invalidation running. On the other hand,
|
||||||
|
# that's no different to calling `have_seen_events` just before the
|
||||||
|
# event is deleted from the database.
|
||||||
|
if should_delete:
|
||||||
|
self._invalidate_cache_and_stream(
|
||||||
|
txn, self.have_seen_event, (room_id, event_id)
|
||||||
|
)
|
||||||
|
|
||||||
logger.info("[purge] done")
|
logger.info("[purge] done")
|
||||||
|
|
||||||
return referenced_state_groups
|
return referenced_state_groups
|
||||||
|
@ -422,7 +434,11 @@ class PurgeEventsStore(StateGroupWorkerStore, SQLBaseStore):
|
||||||
# index on them. In any case we should be clearing out 'stream' tables
|
# index on them. In any case we should be clearing out 'stream' tables
|
||||||
# periodically anyway (#5888)
|
# periodically anyway (#5888)
|
||||||
|
|
||||||
# TODO: we could probably usefully do a bunch of cache invalidation here
|
# TODO: we could probably usefully do a bunch more cache invalidation here
|
||||||
|
|
||||||
|
# XXX: as with purge_history, this is racy, but no worse than other races
|
||||||
|
# that already exist.
|
||||||
|
self._invalidate_cache_and_stream(txn, self.have_seen_event, (room_id,))
|
||||||
|
|
||||||
logger.info("[purge] done")
|
logger.info("[purge] done")
|
||||||
|
|
||||||
|
|
13
tests/storage/databases/__init__.py
Normal file
13
tests/storage/databases/__init__.py
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
# Copyright 2021 The Matrix.org Foundation C.I.C.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
13
tests/storage/databases/main/__init__.py
Normal file
13
tests/storage/databases/main/__init__.py
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
# Copyright 2021 The Matrix.org Foundation C.I.C.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
96
tests/storage/databases/main/test_events_worker.py
Normal file
96
tests/storage/databases/main/test_events_worker.py
Normal file
|
@ -0,0 +1,96 @@
|
||||||
|
# Copyright 2021 The Matrix.org Foundation C.I.C.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import json
|
||||||
|
|
||||||
|
from synapse.logging.context import LoggingContext
|
||||||
|
from synapse.storage.databases.main.events_worker import EventsWorkerStore
|
||||||
|
|
||||||
|
from tests import unittest
|
||||||
|
|
||||||
|
|
||||||
|
class HaveSeenEventsTestCase(unittest.HomeserverTestCase):
|
||||||
|
def prepare(self, reactor, clock, hs):
|
||||||
|
self.store: EventsWorkerStore = hs.get_datastore()
|
||||||
|
|
||||||
|
# insert some test data
|
||||||
|
for rid in ("room1", "room2"):
|
||||||
|
self.get_success(
|
||||||
|
self.store.db_pool.simple_insert(
|
||||||
|
"rooms",
|
||||||
|
{"room_id": rid, "room_version": 4},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
for idx, (rid, eid) in enumerate(
|
||||||
|
(
|
||||||
|
("room1", "event10"),
|
||||||
|
("room1", "event11"),
|
||||||
|
("room1", "event12"),
|
||||||
|
("room2", "event20"),
|
||||||
|
)
|
||||||
|
):
|
||||||
|
self.get_success(
|
||||||
|
self.store.db_pool.simple_insert(
|
||||||
|
"events",
|
||||||
|
{
|
||||||
|
"event_id": eid,
|
||||||
|
"room_id": rid,
|
||||||
|
"topological_ordering": idx,
|
||||||
|
"stream_ordering": idx,
|
||||||
|
"type": "test",
|
||||||
|
"processed": True,
|
||||||
|
"outlier": False,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
self.get_success(
|
||||||
|
self.store.db_pool.simple_insert(
|
||||||
|
"event_json",
|
||||||
|
{
|
||||||
|
"event_id": eid,
|
||||||
|
"room_id": rid,
|
||||||
|
"json": json.dumps({"type": "test", "room_id": rid}),
|
||||||
|
"internal_metadata": "{}",
|
||||||
|
"format_version": 3,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_simple(self):
|
||||||
|
with LoggingContext(name="test") as ctx:
|
||||||
|
res = self.get_success(
|
||||||
|
self.store.have_seen_events("room1", ["event10", "event19"])
|
||||||
|
)
|
||||||
|
self.assertEquals(res, {"event10"})
|
||||||
|
|
||||||
|
# that should result in a single db query
|
||||||
|
self.assertEquals(ctx.get_resource_usage().db_txn_count, 1)
|
||||||
|
|
||||||
|
# a second lookup of the same events should cause no queries
|
||||||
|
with LoggingContext(name="test") as ctx:
|
||||||
|
res = self.get_success(
|
||||||
|
self.store.have_seen_events("room1", ["event10", "event19"])
|
||||||
|
)
|
||||||
|
self.assertEquals(res, {"event10"})
|
||||||
|
self.assertEquals(ctx.get_resource_usage().db_txn_count, 0)
|
||||||
|
|
||||||
|
def test_query_via_event_cache(self):
|
||||||
|
# fetch an event into the event cache
|
||||||
|
self.get_success(self.store.get_event("event10"))
|
||||||
|
|
||||||
|
# looking it up should now cause no db hits
|
||||||
|
with LoggingContext(name="test") as ctx:
|
||||||
|
res = self.get_success(self.store.have_seen_events("room1", ["event10"]))
|
||||||
|
self.assertEquals(res, {"event10"})
|
||||||
|
self.assertEquals(ctx.get_resource_usage().db_txn_count, 0)
|
Loading…
Reference in a new issue