Catch-up after Federation Outage (split, 1) (#8230)

Signed-off-by: Olivier Wilkinson (reivilibre) <olivier@librepush.net>
reivilibre 2020-09-04 12:22:23 +01:00 committed by GitHub
parent e351298444
commit 58f61f10f7
7 changed files with 201 additions and 7 deletions

changelog.d/8230.misc (new file)

@@ -0,0 +1 @@
Track the latest event for every destination and room for catch-up after federation outage.

synapse/federation/sender/__init__.py

@@ -209,7 +209,7 @@ class FederationSender:
                 logger.debug("Sending %s to %r", event, destinations)
 
                 if destinations:
-                    self._send_pdu(event, destinations)
+                    await self._send_pdu(event, destinations)
 
                     now = self.clock.time_msec()
                     ts = await self.store.get_received_ts(event.event_id)
@@ -265,7 +265,7 @@ class FederationSender:
         finally:
             self._is_processing = False
 
-    def _send_pdu(self, pdu: EventBase, destinations: Iterable[str]) -> None:
+    async def _send_pdu(self, pdu: EventBase, destinations: Iterable[str]) -> None:
         # We loop through all destinations to see whether we already have
         # a transaction in progress. If we do, stick it in the pending_pdus
         # table and we'll get back to it later.
@@ -280,6 +280,13 @@
         sent_pdus_destination_dist_total.inc(len(destinations))
         sent_pdus_destination_dist_count.inc()
 
+        # track the fact that we have a PDU for these destinations,
+        # to allow us to perform catch-up later on if the remote is unreachable
+        # for a while.
+        await self.store.store_destination_rooms_entries(
+            destinations, pdu.room_id, pdu.internal_metadata.stream_ordering,
+        )
+
         for destination in destinations:
             self._get_per_destination_queue(destination).send_pdu(pdu)
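The upshot of the new call is that every outgoing PDU leaves behind the latest stream ordering per (destination, room) pair. As a minimal in-memory sketch of that bookkeeping (an illustration only; the real storage code is the TransactionStore change below):

    from typing import Dict, Iterable, Tuple

    # (destination, room_id) -> stream_ordering of the latest PDU sent there
    destination_rooms: Dict[Tuple[str, str], int] = {}

    def store_destination_rooms_entries(
        destinations: Iterable[str], room_id: str, stream_ordering: int
    ) -> None:
        # upsert semantics: the newest stream_ordering always wins
        for destination in destinations:
            destination_rooms[(destination, room_id)] = stream_ordering

    store_destination_rooms_entries(["host2", "host3"], "!room:example.org", 41)
    store_destination_rooms_entries(["host2", "host3"], "!room:example.org", 42)
    print(destination_rooms[("host2", "!room:example.org")])  # 42: only the latest survives

Keeping a single row per pair, rather than a queue of every missed event, is what bounds the storage cost when a remote is unreachable for a long time.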

synapse/storage/database.py

@@ -952,7 +952,7 @@ class DatabasePool:
         key_names: Collection[str],
         key_values: Collection[Iterable[Any]],
         value_names: Collection[str],
-        value_values: Iterable[Iterable[str]],
+        value_values: Iterable[Iterable[Any]],
     ) -> None:
         """
         Upsert, many times.
@@ -981,7 +981,7 @@
         key_names: Iterable[str],
         key_values: Collection[Iterable[Any]],
         value_names: Collection[str],
-        value_values: Iterable[Iterable[str]],
+        value_values: Iterable[Iterable[Any]],
     ) -> None:
         """
         Upsert, many times, but without native UPSERT support or batching.
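The only change in these two hunks is widening value_values from Iterable[Iterable[str]] to Iterable[Iterable[Any]]: the new destination_rooms upsert stores integer stream orderings, which the old annotation would have rejected under mypy even though the runtime never cared about the element type. A tiny standalone illustration (function name hypothetical):

    from typing import Any, Iterable

    def upsert_many(value_values: Iterable[Iterable[Any]]) -> None:
        # under Iterable[Iterable[str]], mypy would flag the integer rows below
        for row in value_values:
            print(tuple(row))

    upsert_many([(42,), (43,)])  # integer stream orderings now type-check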

synapse/storage/databases/main/purge_events.py

@@ -69,6 +69,7 @@ class PurgeEventsStore(StateGroupWorkerStore, SQLBaseStore):
         #     room_depth
         #     state_groups
         #     state_groups_state
+        #     destination_rooms
 
         # we will build a temporary table listing the events so that we don't
         # have to keep shovelling the list back and forth across the
@@ -336,6 +337,7 @@ class PurgeEventsStore(StateGroupWorkerStore, SQLBaseStore):
         # and finally, the tables with an index on room_id (or no useful index)
         for table in (
             "current_state_events",
+            "destination_rooms",
             "event_backward_extremities",
             "event_forward_extremities",
             "event_json",

synapse/storage/databases/main/schema/delta/58/15_catchup_destination_rooms.sql (new file)

@@ -0,0 +1,42 @@
/* Copyright 2020 The Matrix.org Foundation C.I.C
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-- This schema delta alters the schema to enable 'catching up' remote homeservers
-- after there has been a connectivity problem for any reason.

-- This stores, for each (destination, room) pair, the stream_ordering of the
-- latest event for that destination.
CREATE TABLE IF NOT EXISTS destination_rooms (
  -- the destination in question.
  destination TEXT NOT NULL REFERENCES destinations (destination),
  -- the ID of the room in question
  room_id TEXT NOT NULL REFERENCES rooms (room_id),
  -- the stream_ordering of the event
  stream_ordering INTEGER NOT NULL,
  PRIMARY KEY (destination, room_id)
  -- We don't declare a foreign key on stream_ordering here because that'd mean
  -- we'd need to either maintain an index (expensive) or do a table scan of
  -- destination_rooms whenever we delete an event (also potentially expensive).
  -- In addition to that, a foreign key on stream_ordering would be redundant
  -- as this row doesn't need to refer to a specific event; if the event gets
  -- deleted then it doesn't affect the validity of the stream_ordering here.
);

-- This index is needed to make it so that a deletion of a room (in the rooms
-- table) can be efficient, as otherwise a table scan would need to be performed
-- to check that no destination_rooms rows point to the room to be deleted.
-- Also: it makes it efficient to delete all the entries for a given room ID,
-- such as when purging a room.
CREATE INDEX IF NOT EXISTS destination_rooms_room_id
    ON destination_rooms (room_id);
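Nothing in this commit reads the table back yet; that arrives in later parts of the split. As a hedged sketch of the intended use (the helper name and the "last successfully sent" value are hypothetical here), catch-up reduces to one indexed lookup per destination:

    # Hypothetical sketch of the eventual consumer (not part of this commit):
    # given the stream_ordering a destination is known to have received up to,
    # the rooms needing catch-up fall out of a single query.
    ROOMS_NEEDING_CATCHUP_SQL = """
        SELECT room_id, stream_ordering
          FROM destination_rooms
         WHERE destination = ? AND stream_ordering > ?
    """

    async def rooms_needing_catchup(db_pool, destination: str, last_successful: int):
        # db_pool.execute(desc, decoder, query, *args) is the same helper the
        # new test file below uses to inspect destination_rooms.
        return await db_pool.execute(
            "rooms_needing_catchup", None, ROOMS_NEEDING_CATCHUP_SQL,
            destination, last_successful,
        )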

synapse/storage/databases/main/transactions.py

@@ -15,13 +15,14 @@
 import logging
 from collections import namedtuple
-from typing import Optional, Tuple
+from typing import Iterable, Optional, Tuple
 
 from canonicaljson import encode_canonical_json
 
 from synapse.metrics.background_process_metrics import run_as_background_process
 from synapse.storage._base import SQLBaseStore, db_to_json
-from synapse.storage.database import DatabasePool
+from synapse.storage.database import DatabasePool, LoggingTransaction
+from synapse.storage.engines import PostgresEngine, Sqlite3Engine
 from synapse.types import JsonDict
 from synapse.util.caches.expiringcache import ExpiringCache
@@ -164,7 +165,9 @@ class TransactionStore(SQLBaseStore):
             allow_none=True,
         )
 
-        if result and result["retry_last_ts"] > 0:
+        # check we have a row and retry_last_ts is not null or zero
+        # (retry_last_ts can't be negative)
+        if result and result["retry_last_ts"]:
             return result
         else:
             return None
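The rewritten check is more than style: retry_last_ts can be NULL, and in Python 3 comparing None with an integer raises, so truthiness is the safe way to treat NULL and zero alike:

    # In Python 3, None is not orderable, so a NULL retry_last_ts would make
    # the old "> 0" comparison raise instead of returning False:
    row = {"retry_last_ts": None}
    # row["retry_last_ts"] > 0  ->  TypeError: '>' not supported between
    #                               instances of 'NoneType' and 'int'
    print(bool(row and row["retry_last_ts"]))  # False: treated as "no retry scheduled"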
@@ -273,3 +276,60 @@
         await self.db_pool.runInteraction(
             "_cleanup_transactions", _cleanup_transactions_txn
         )
+
+    async def store_destination_rooms_entries(
+        self, destinations: Iterable[str], room_id: str, stream_ordering: int,
+    ) -> None:
+        """
+        Updates or creates `destination_rooms` entries in batch for a single event.
+
+        Args:
+            destinations: list of destinations
+            room_id: the room_id of the event
+            stream_ordering: the stream_ordering of the event
+        """
+        return await self.db_pool.runInteraction(
+            "store_destination_rooms_entries",
+            self._store_destination_rooms_entries_txn,
+            destinations,
+            room_id,
+            stream_ordering,
+        )
+
+    def _store_destination_rooms_entries_txn(
+        self,
+        txn: LoggingTransaction,
+        destinations: Iterable[str],
+        room_id: str,
+        stream_ordering: int,
+    ) -> None:
+        # ensure we have a `destinations` row for each destination, as there is
+        # a foreign key constraint.
+        if isinstance(self.database_engine, PostgresEngine):
+            q = """
+                INSERT INTO destinations (destination)
+                    VALUES (?)
+                    ON CONFLICT DO NOTHING;
+            """
+        elif isinstance(self.database_engine, Sqlite3Engine):
+            q = """
+                INSERT OR IGNORE INTO destinations (destination)
+                    VALUES (?);
+            """
+        else:
+            raise RuntimeError("Unknown database engine")
+
+        txn.execute_batch(q, ((destination,) for destination in destinations))
+
+        rows = [(destination, room_id) for destination in destinations]
+        self.db_pool.simple_upsert_many_txn(
+            txn,
+            "destination_rooms",
+            ["destination", "room_id"],
+            rows,
+            ["stream_ordering"],
+            [(stream_ordering,)] * len(rows),
+        )
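The engine branch exists because Postgres and SQLite spell "insert unless the key already exists" differently: ON CONFLICT DO NOTHING versus INSERT OR IGNORE. A runnable demonstration of the SQLite variant, using only the standard library:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE destinations (destination TEXT PRIMARY KEY)")

    # INSERT OR IGNORE silently skips rows whose key already exists, which is
    # exactly what pre-populating the foreign-key target table needs.
    conn.executemany(
        "INSERT OR IGNORE INTO destinations (destination) VALUES (?)",
        [("host2",), ("host2",), ("host3",)],
    )
    print(conn.execute("SELECT destination FROM destinations ORDER BY 1").fetchall())
    # [('host2',), ('host3',)]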

tests/federation/test_federation_catch_up.py (new file)

@@ -0,0 +1,82 @@
from mock import Mock

from synapse.rest import admin
from synapse.rest.client.v1 import login, room
from tests.test_utils import event_injection, make_awaitable
from tests.unittest import FederatingHomeserverTestCase, override_config


class FederationCatchUpTestCases(FederatingHomeserverTestCase):
    servlets = [
        admin.register_servlets,
        room.register_servlets,
        login.register_servlets,
    ]

    def make_homeserver(self, reactor, clock):
        return self.setup_test_homeserver(
            federation_transport_client=Mock(spec=["send_transaction"]),
        )

    def prepare(self, reactor, clock, hs):
        # stub out get_current_hosts_in_room
        state_handler = hs.get_state_handler()

        # This mock is crucial for destination_rooms to be populated.
        state_handler.get_current_hosts_in_room = Mock(
            return_value=make_awaitable(["test", "host2"])
        )

    def get_destination_room(self, room: str, destination: str = "host2") -> dict:
        """
        Gets the destination_rooms entry for a (destination, room_id) pair.

        Args:
            room: room ID
            destination: what destination, default is "host2"

        Returns:
            Dictionary of { event_id: str, stream_ordering: int }
        """
        event_id, stream_ordering = self.get_success(
            self.hs.get_datastore().db_pool.execute(
                "test:get_destination_rooms",
                None,
                """
                SELECT event_id, stream_ordering
                    FROM destination_rooms dr
                    JOIN events USING (stream_ordering)
                    WHERE dr.destination = ? AND dr.room_id = ?
                """,
                destination,
                room,
            )
        )[0]
        return {"event_id": event_id, "stream_ordering": stream_ordering}

    @override_config({"send_federation": True})
    def test_catch_up_destination_rooms_tracking(self):
        """
        Tests that we populate the `destination_rooms` table as needed.
        """
        self.register_user("u1", "you the one")
        u1_token = self.login("u1", "you the one")
        room = self.helper.create_room_as("u1", tok=u1_token)

        self.get_success(
            event_injection.inject_member_event(self.hs, room, "@user:host2", "join")
        )

        event_id_1 = self.helper.send(room, "wombats!", tok=u1_token)["event_id"]
        row_1 = self.get_destination_room(room)

        event_id_2 = self.helper.send(room, "rabbits!", tok=u1_token)["event_id"]
        row_2 = self.get_destination_room(room)

        # check: events correctly registered in order
        self.assertEqual(row_1["event_id"], event_id_1)
        self.assertEqual(row_2["event_id"], event_id_2)
        self.assertEqual(row_1["stream_ordering"], row_2["stream_ordering"] - 1)