mirror of
https://mau.dev/maunium/synapse.git
synced 2025-01-19 03:11:56 +01:00
Fix races in room stats (and other) updates. (#6187)
Hopefully this will fix the occasional failures we were seeing in the room directory. The problem was that events are not necessarily persisted (and `current_state_delta_stream` updated) in the same order as their stream_id. So for instance current_state_delta 9 might be persisted *before* current_state_delta 8. Then, when the room stats saw stream_id 9, it assumed it had done everything up to 9, and never came back to do stream_id 8. We can solve this easily by only processing up to the stream_id where we know all events have been persisted.
This commit is contained in:
parent
562b4e51dd
commit
a139420a3c
7 changed files with 63 additions and 25 deletions
1
changelog.d/6187.bugfix
Normal file
1
changelog.d/6187.bugfix
Normal file
|
@ -0,0 +1 @@
|
|||
Fix occasional missed updates in the room and user directories.
|
|
@ -803,17 +803,25 @@ class PresenceHandler(object):
|
|||
# Loop round handling deltas until we're up to date
|
||||
while True:
|
||||
with Measure(self.clock, "presence_delta"):
|
||||
deltas = yield self.store.get_current_state_deltas(self._event_pos)
|
||||
if not deltas:
|
||||
room_max_stream_ordering = self.store.get_room_max_stream_ordering()
|
||||
if self._event_pos == room_max_stream_ordering:
|
||||
return
|
||||
|
||||
logger.debug(
|
||||
"Processing presence stats %s->%s",
|
||||
self._event_pos,
|
||||
room_max_stream_ordering,
|
||||
)
|
||||
max_pos, deltas = yield self.store.get_current_state_deltas(
|
||||
self._event_pos, room_max_stream_ordering
|
||||
)
|
||||
yield self._handle_state_delta(deltas)
|
||||
|
||||
self._event_pos = deltas[-1]["stream_id"]
|
||||
self._event_pos = max_pos
|
||||
|
||||
# Expose current event processing position to prometheus
|
||||
synapse.metrics.event_processing_positions.labels("presence").set(
|
||||
self._event_pos
|
||||
max_pos
|
||||
)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
|
|
|
@ -87,21 +87,23 @@ class StatsHandler(StateDeltasHandler):
|
|||
# Be sure to read the max stream_ordering *before* checking if there are any outstanding
|
||||
# deltas, since there is otherwise a chance that we could miss updates which arrive
|
||||
# after we check the deltas.
|
||||
room_max_stream_ordering = yield self.store.get_room_max_stream_ordering()
|
||||
room_max_stream_ordering = self.store.get_room_max_stream_ordering()
|
||||
if self.pos == room_max_stream_ordering:
|
||||
break
|
||||
|
||||
deltas = yield self.store.get_current_state_deltas(self.pos)
|
||||
logger.debug(
|
||||
"Processing room stats %s->%s", self.pos, room_max_stream_ordering
|
||||
)
|
||||
max_pos, deltas = yield self.store.get_current_state_deltas(
|
||||
self.pos, room_max_stream_ordering
|
||||
)
|
||||
|
||||
if deltas:
|
||||
logger.debug("Handling %d state deltas", len(deltas))
|
||||
room_deltas, user_deltas = yield self._handle_deltas(deltas)
|
||||
|
||||
max_pos = deltas[-1]["stream_id"]
|
||||
else:
|
||||
room_deltas = {}
|
||||
user_deltas = {}
|
||||
max_pos = room_max_stream_ordering
|
||||
|
||||
# Then count deltas for total_events and total_event_bytes.
|
||||
room_count, user_count = yield self.store.get_changes_room_total_events_and_bytes(
|
||||
|
|
|
@ -138,21 +138,28 @@ class UserDirectoryHandler(StateDeltasHandler):
|
|||
# Loop round handling deltas until we're up to date
|
||||
while True:
|
||||
with Measure(self.clock, "user_dir_delta"):
|
||||
deltas = yield self.store.get_current_state_deltas(self.pos)
|
||||
if not deltas:
|
||||
room_max_stream_ordering = self.store.get_room_max_stream_ordering()
|
||||
if self.pos == room_max_stream_ordering:
|
||||
return
|
||||
|
||||
logger.debug(
|
||||
"Processing user stats %s->%s", self.pos, room_max_stream_ordering
|
||||
)
|
||||
max_pos, deltas = yield self.store.get_current_state_deltas(
|
||||
self.pos, room_max_stream_ordering
|
||||
)
|
||||
|
||||
logger.info("Handling %d state deltas", len(deltas))
|
||||
yield self._handle_deltas(deltas)
|
||||
|
||||
self.pos = deltas[-1]["stream_id"]
|
||||
self.pos = max_pos
|
||||
|
||||
# Expose current event processing position to prometheus
|
||||
synapse.metrics.event_processing_positions.labels("user_dir").set(
|
||||
self.pos
|
||||
max_pos
|
||||
)
|
||||
|
||||
yield self.store.update_user_directory_stream_pos(self.pos)
|
||||
yield self.store.update_user_directory_stream_pos(max_pos)
|
||||
|
||||
@defer.inlineCallbacks
|
||||
def _handle_deltas(self, deltas):
|
||||
|
|
|
@ -21,7 +21,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
|
||||
class StateDeltasStore(SQLBaseStore):
|
||||
def get_current_state_deltas(self, prev_stream_id):
|
||||
def get_current_state_deltas(self, prev_stream_id: int, max_stream_id: int):
|
||||
"""Fetch a list of room state changes since the given stream id
|
||||
|
||||
Each entry in the result contains the following fields:
|
||||
|
@ -36,15 +36,27 @@ class StateDeltasStore(SQLBaseStore):
|
|||
|
||||
Args:
|
||||
prev_stream_id (int): point to get changes since (exclusive)
|
||||
max_stream_id (int): the point that we know has been correctly persisted
|
||||
- ie, an upper limit to return changes from.
|
||||
|
||||
Returns:
|
||||
Deferred[list[dict]]: results
|
||||
Deferred[tuple[int, list[dict]]: A tuple consisting of:
|
||||
- the stream id which these results go up to
|
||||
- list of current_state_delta_stream rows. If it is empty, we are
|
||||
up to date.
|
||||
"""
|
||||
prev_stream_id = int(prev_stream_id)
|
||||
|
||||
# check we're not going backwards
|
||||
assert prev_stream_id <= max_stream_id
|
||||
|
||||
if not self._curr_state_delta_stream_cache.has_any_entity_changed(
|
||||
prev_stream_id
|
||||
):
|
||||
return []
|
||||
# if the CSDs haven't changed between prev_stream_id and now, we
|
||||
# know for certain that they haven't changed between prev_stream_id and
|
||||
# max_stream_id.
|
||||
return max_stream_id, []
|
||||
|
||||
def get_current_state_deltas_txn(txn):
|
||||
# First we calculate the max stream id that will give us less than
|
||||
|
@ -54,21 +66,29 @@ class StateDeltasStore(SQLBaseStore):
|
|||
sql = """
|
||||
SELECT stream_id, count(*)
|
||||
FROM current_state_delta_stream
|
||||
WHERE stream_id > ?
|
||||
WHERE stream_id > ? AND stream_id <= ?
|
||||
GROUP BY stream_id
|
||||
ORDER BY stream_id ASC
|
||||
LIMIT 100
|
||||
"""
|
||||
txn.execute(sql, (prev_stream_id,))
|
||||
txn.execute(sql, (prev_stream_id, max_stream_id))
|
||||
|
||||
total = 0
|
||||
max_stream_id = prev_stream_id
|
||||
for max_stream_id, count in txn:
|
||||
|
||||
for stream_id, count in txn:
|
||||
total += count
|
||||
if total > 100:
|
||||
# We arbitarily limit to 100 entries to ensure we don't
|
||||
# select toooo many.
|
||||
logger.debug(
|
||||
"Clipping current_state_delta_stream rows to stream_id %i",
|
||||
stream_id,
|
||||
)
|
||||
clipped_stream_id = stream_id
|
||||
break
|
||||
else:
|
||||
# if there's no problem, we may as well go right up to the max_stream_id
|
||||
clipped_stream_id = max_stream_id
|
||||
|
||||
# Now actually get the deltas
|
||||
sql = """
|
||||
|
@ -77,8 +97,8 @@ class StateDeltasStore(SQLBaseStore):
|
|||
WHERE ? < stream_id AND stream_id <= ?
|
||||
ORDER BY stream_id ASC
|
||||
"""
|
||||
txn.execute(sql, (prev_stream_id, max_stream_id))
|
||||
return self.cursor_to_dict(txn)
|
||||
txn.execute(sql, (prev_stream_id, clipped_stream_id))
|
||||
return clipped_stream_id, self.cursor_to_dict(txn)
|
||||
|
||||
return self.runInteraction(
|
||||
"get_current_state_deltas", get_current_state_deltas_txn
|
||||
|
|
|
@ -139,7 +139,7 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
|
|||
defer.succeed(1)
|
||||
)
|
||||
|
||||
self.datastore.get_current_state_deltas.return_value = None
|
||||
self.datastore.get_current_state_deltas.return_value = (0, None)
|
||||
|
||||
self.datastore.get_to_device_stream_token = lambda: 0
|
||||
self.datastore.get_new_device_msgs_for_remote = lambda *args, **kargs: ([], 0)
|
||||
|
|
|
@ -62,7 +62,7 @@ class UserRegisterTestCase(unittest.HomeserverTestCase):
|
|||
self.device_handler.check_device_registered = Mock(return_value="FAKE")
|
||||
|
||||
self.datastore = Mock(return_value=Mock())
|
||||
self.datastore.get_current_state_deltas = Mock(return_value=[])
|
||||
self.datastore.get_current_state_deltas = Mock(return_value=(0, []))
|
||||
|
||||
self.secrets = Mock()
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue