mirror of
https://mau.dev/maunium/synapse.git
synced 2025-01-21 12:52:02 +01:00
Faster room joins: Try other destinations when resyncing the state of a partial-state room (#12812)
Signed-off-by: Sean Quah <seanq@matrix.org>
This commit is contained in:
parent
3594f6c1f3
commit
2fba1076c5
4 changed files with 94 additions and 9 deletions
1
changelog.d/12812.misc
Normal file
1
changelog.d/12812.misc
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Try other homeservers when re-syncing state for rooms with partial state.
|
|
@ -405,6 +405,9 @@ class FederationClient(FederationBase):
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
a tuple of (state event_ids, auth event_ids)
|
a tuple of (state event_ids, auth event_ids)
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
InvalidResponseError: if fields in the response have the wrong type.
|
||||||
"""
|
"""
|
||||||
result = await self.transport_layer.get_room_state_ids(
|
result = await self.transport_layer.get_room_state_ids(
|
||||||
destination, room_id, event_id=event_id
|
destination, room_id, event_id=event_id
|
||||||
|
@ -416,7 +419,7 @@ class FederationClient(FederationBase):
|
||||||
if not isinstance(state_event_ids, list) or not isinstance(
|
if not isinstance(state_event_ids, list) or not isinstance(
|
||||||
auth_event_ids, list
|
auth_event_ids, list
|
||||||
):
|
):
|
||||||
raise Exception("invalid response from /state_ids")
|
raise InvalidResponseError("invalid response from /state_ids")
|
||||||
|
|
||||||
return state_event_ids, auth_event_ids
|
return state_event_ids, auth_event_ids
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,16 @@ import itertools
|
||||||
import logging
|
import logging
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from http import HTTPStatus
|
from http import HTTPStatus
|
||||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
|
from typing import (
|
||||||
|
TYPE_CHECKING,
|
||||||
|
Collection,
|
||||||
|
Dict,
|
||||||
|
Iterable,
|
||||||
|
List,
|
||||||
|
Optional,
|
||||||
|
Tuple,
|
||||||
|
Union,
|
||||||
|
)
|
||||||
|
|
||||||
import attr
|
import attr
|
||||||
from signedjson.key import decode_verify_key_bytes
|
from signedjson.key import decode_verify_key_bytes
|
||||||
|
@ -34,6 +43,7 @@ from synapse.api.errors import (
|
||||||
CodeMessageException,
|
CodeMessageException,
|
||||||
Codes,
|
Codes,
|
||||||
FederationDeniedError,
|
FederationDeniedError,
|
||||||
|
FederationError,
|
||||||
HttpResponseException,
|
HttpResponseException,
|
||||||
NotFoundError,
|
NotFoundError,
|
||||||
RequestSendFailed,
|
RequestSendFailed,
|
||||||
|
@ -545,7 +555,8 @@ class FederationHandler:
|
||||||
run_as_background_process(
|
run_as_background_process(
|
||||||
desc="sync_partial_state_room",
|
desc="sync_partial_state_room",
|
||||||
func=self._sync_partial_state_room,
|
func=self._sync_partial_state_room,
|
||||||
destination=origin,
|
initial_destination=origin,
|
||||||
|
other_destinations=ret.servers_in_room,
|
||||||
room_id=room_id,
|
room_id=room_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -1454,13 +1465,16 @@ class FederationHandler:
|
||||||
|
|
||||||
async def _sync_partial_state_room(
|
async def _sync_partial_state_room(
|
||||||
self,
|
self,
|
||||||
destination: str,
|
initial_destination: Optional[str],
|
||||||
|
other_destinations: Collection[str],
|
||||||
room_id: str,
|
room_id: str,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Background process to resync the state of a partial-state room
|
"""Background process to resync the state of a partial-state room
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
destination: homeserver to pull the state from
|
initial_destination: the initial homeserver to pull the state from
|
||||||
|
other_destinations: other homeservers to try to pull the state from, if
|
||||||
|
`initial_destination` is unavailable
|
||||||
room_id: room to be resynced
|
room_id: room to be resynced
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@ -1472,8 +1486,29 @@ class FederationHandler:
|
||||||
# really leave, that might mean we have difficulty getting the room state over
|
# really leave, that might mean we have difficulty getting the room state over
|
||||||
# federation.
|
# federation.
|
||||||
#
|
#
|
||||||
# TODO(faster_joins): try other destinations if the one we have fails
|
# TODO(faster_joins): we need some way of prioritising which homeservers in
|
||||||
|
# `other_destinations` to try first, otherwise we'll spend ages trying dead
|
||||||
|
# homeservers for large rooms.
|
||||||
|
|
||||||
|
if initial_destination is None and len(other_destinations) == 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"Cannot resync state of {room_id}: no destinations provided"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Make an infinite iterator of destinations to try. Once we find a working
|
||||||
|
# destination, we'll stick with it until it flakes.
|
||||||
|
if initial_destination is not None:
|
||||||
|
# Move `initial_destination` to the front of the list.
|
||||||
|
destinations = list(other_destinations)
|
||||||
|
if initial_destination in destinations:
|
||||||
|
destinations.remove(initial_destination)
|
||||||
|
destinations = [initial_destination] + destinations
|
||||||
|
destination_iter = itertools.cycle(destinations)
|
||||||
|
else:
|
||||||
|
destination_iter = itertools.cycle(other_destinations)
|
||||||
|
|
||||||
|
# `destination` is the current remote homeserver we're pulling from.
|
||||||
|
destination = next(destination_iter)
|
||||||
logger.info("Syncing state for room %s via %s", room_id, destination)
|
logger.info("Syncing state for room %s via %s", room_id, destination)
|
||||||
|
|
||||||
# we work through the queue in order of increasing stream ordering.
|
# we work through the queue in order of increasing stream ordering.
|
||||||
|
@ -1511,6 +1546,41 @@ class FederationHandler:
|
||||||
allow_rejected=True,
|
allow_rejected=True,
|
||||||
)
|
)
|
||||||
for event in events:
|
for event in events:
|
||||||
await self._federation_event_handler.update_state_for_partial_state_event(
|
for attempt in itertools.count():
|
||||||
destination, event
|
try:
|
||||||
)
|
await self._federation_event_handler.update_state_for_partial_state_event(
|
||||||
|
destination, event
|
||||||
|
)
|
||||||
|
break
|
||||||
|
except FederationError as e:
|
||||||
|
if attempt == len(destinations) - 1:
|
||||||
|
# We have tried every remote server for this event. Give up.
|
||||||
|
# TODO(faster_joins) giving up isn't the right thing to do
|
||||||
|
# if there's a temporary network outage. retrying
|
||||||
|
# indefinitely is also not the right thing to do if we can
|
||||||
|
# reach all homeservers and they all claim they don't have
|
||||||
|
# the state we want.
|
||||||
|
logger.error(
|
||||||
|
"Failed to get state for %s at %s from %s because %s, "
|
||||||
|
"giving up!",
|
||||||
|
room_id,
|
||||||
|
event,
|
||||||
|
destination,
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
# Try the next remote server.
|
||||||
|
logger.info(
|
||||||
|
"Failed to get state for %s at %s from %s because %s",
|
||||||
|
room_id,
|
||||||
|
event,
|
||||||
|
destination,
|
||||||
|
e,
|
||||||
|
)
|
||||||
|
destination = next(destination_iter)
|
||||||
|
logger.info(
|
||||||
|
"Syncing state for room %s via %s instead",
|
||||||
|
room_id,
|
||||||
|
destination,
|
||||||
|
)
|
||||||
|
|
|
@ -505,6 +505,9 @@ class FederationEventHandler:
|
||||||
Args:
|
Args:
|
||||||
destination: server to request full state from
|
destination: server to request full state from
|
||||||
event: partial-state event to be de-partial-stated
|
event: partial-state event to be de-partial-stated
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
FederationError if we fail to request state from the remote server.
|
||||||
"""
|
"""
|
||||||
logger.info("Updating state for %s", event.event_id)
|
logger.info("Updating state for %s", event.event_id)
|
||||||
with nested_logging_context(suffix=event.event_id):
|
with nested_logging_context(suffix=event.event_id):
|
||||||
|
@ -815,6 +818,10 @@ class FederationEventHandler:
|
||||||
Returns:
|
Returns:
|
||||||
if we already had all the prev events, `None`. Otherwise, returns
|
if we already had all the prev events, `None`. Otherwise, returns
|
||||||
the event ids of the state at `event`.
|
the event ids of the state at `event`.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
FederationError if we fail to get the state from the remote server after any
|
||||||
|
missing `prev_event`s.
|
||||||
"""
|
"""
|
||||||
room_id = event.room_id
|
room_id = event.room_id
|
||||||
event_id = event.event_id
|
event_id = event.event_id
|
||||||
|
@ -901,6 +908,10 @@ class FederationEventHandler:
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The event ids of the state *after* the given event.
|
The event ids of the state *after* the given event.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
InvalidResponseError: if the remote homeserver's response contains fields
|
||||||
|
of the wrong type.
|
||||||
"""
|
"""
|
||||||
(
|
(
|
||||||
state_event_ids,
|
state_event_ids,
|
||||||
|
|
Loading…
Add table
Reference in a new issue