diff --git a/changelog.d/13749.bugfix b/changelog.d/13749.bugfix
new file mode 100644
index 000000000..8ffafec07
--- /dev/null
+++ b/changelog.d/13749.bugfix
@@ -0,0 +1 @@
+Fix a long standing bug where device lists would remain cached when remote users left and rejoined the last room shared with the local homeserver.
diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py
index c5ac16964..901e2310b 100644
--- a/synapse/handlers/device.py
+++ b/synapse/handlers/device.py
@@ -45,7 +45,6 @@ from synapse.types import (
     JsonDict,
     StreamKeyType,
     StreamToken,
-    UserID,
     get_domain_from_id,
     get_verify_key_from_cross_signing_key,
 )
@@ -324,8 +323,6 @@ class DeviceHandler(DeviceWorkerHandler):
             self.device_list_updater.incoming_device_list_update,
         )

-        hs.get_distributor().observe("user_left_room", self.user_left_room)
-
         # Whether `_handle_new_device_update_async` is currently processing.
         self._handle_new_device_update_is_processing = False

@@ -569,14 +566,6 @@ class DeviceHandler(DeviceWorkerHandler):
             StreamKeyType.DEVICE_LIST, position, users=[from_user_id]
         )

-    async def user_left_room(self, user: UserID, room_id: str) -> None:
-        user_id = user.to_string()
-        room_ids = await self.store.get_rooms_for_user(user_id)
-        if not room_ids:
-            # We no longer share rooms with this user, so we'll no longer
-            # receive device updates. Mark this in DB.
-            await self.store.mark_remote_user_device_list_as_unsubscribed(user_id)
-
     async def store_dehydrated_device(
         self,
         user_id: str,
diff --git a/synapse/handlers/e2e_keys.py b/synapse/handlers/e2e_keys.py
index ec81639c7..8eed63ccf 100644
--- a/synapse/handlers/e2e_keys.py
+++ b/synapse/handlers/e2e_keys.py
@@ -175,6 +175,32 @@ class E2eKeysHandler:
                 user_ids_not_in_cache,
                 remote_results,
             ) = await self.store.get_user_devices_from_cache(query_list)
+
+            # Check that the homeserver still shares a room with all cached users.
+            # Note that this check may be slightly racy when a remote user leaves a
+            # room after we have fetched their cached device list. In the worst case
+            # we will do extra federation queries for devices that we had cached.
+            cached_users = set(remote_results.keys())
+            valid_cached_users = (
+                await self.store.get_users_server_still_shares_room_with(
+                    remote_results.keys()
+                )
+            )
+            invalid_cached_users = cached_users - valid_cached_users
+            if invalid_cached_users:
+                # Fix up results. If we get here, there is either a bug in device
+                # list tracking, or we hit the race mentioned above.
+                user_ids_not_in_cache.update(invalid_cached_users)
+                for invalid_user_id in invalid_cached_users:
+                    remote_results.pop(invalid_user_id)
+                # This log message may be removed if it turns out it's almost
+                # entirely triggered by races.
+                logger.error(
+                    "Devices for %s were cached, but the server no longer shares "
+                    "any rooms with them. The cached device lists are stale.",
+                    invalid_cached_users,
+                )
+
             for user_id, devices in remote_results.items():
                 user_devices = results.setdefault(user_id, {})
                 for device_id, device in devices.items():
diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py
index dad3731b9..501dbbc99 100644
--- a/synapse/storage/controllers/persist_events.py
+++ b/synapse/storage/controllers/persist_events.py
@@ -598,9 +598,9 @@ class EventsPersistenceStorageController:
             # room
             state_delta_for_room: Dict[str, DeltaState] = {}

-            # Set of remote users which were in rooms the server has left. We
-            # should check if we still share any rooms and if not we mark their
-            # device lists as stale.
+            # Set of remote users which were in rooms the server has left or who may
+            # have left rooms the server is in. We should check if we still share any
+            # rooms and if not we mark their device lists as stale.
             potentially_left_users: Set[str] = set()

             if not backfilled:
@@ -725,6 +725,20 @@
                             current_state = {}
                             delta.no_longer_in_room = True

+                        # Add all remote users that might have left rooms.
+                        potentially_left_users.update(
+                            user_id
+                            for event_type, user_id in delta.to_delete
+                            if event_type == EventTypes.Member
+                            and not self.is_mine_id(user_id)
+                        )
+                        potentially_left_users.update(
+                            user_id
+                            for event_type, user_id in delta.to_insert.keys()
+                            if event_type == EventTypes.Member
+                            and not self.is_mine_id(user_id)
+                        )
+
                         state_delta_for_room[room_id] = delta

             await self.persist_events_store._persist_events_and_state_updates(
diff --git a/tests/handlers/test_e2e_keys.py b/tests/handlers/test_e2e_keys.py
index 1e6ad4b66..95698bc27 100644
--- a/tests/handlers/test_e2e_keys.py
+++ b/tests/handlers/test_e2e_keys.py
@@ -891,6 +891,12 @@ class E2eKeysHandlerTestCase(unittest.HomeserverTestCase):
             new_callable=mock.MagicMock,
             return_value=make_awaitable(["some_room_id"]),
         )
+        mock_get_users = mock.patch.object(
+            self.store,
+            "get_users_server_still_shares_room_with",
+            new_callable=mock.MagicMock,
+            return_value=make_awaitable({remote_user_id}),
+        )
         mock_request = mock.patch.object(
             self.hs.get_federation_client(),
             "query_user_devices",
@@ -898,7 +904,7 @@ class E2eKeysHandlerTestCase(unittest.HomeserverTestCase):
             return_value=make_awaitable(response_body),
         )

-        with mock_get_rooms, mock_request as mocked_federation_request:
+        with mock_get_rooms, mock_get_users, mock_request as mocked_federation_request:
            # Make the first query and sanity check it succeeds.
            response_1 = self.get_success(
                e2e_handler.query_devices(
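For reference, below is a minimal standalone sketch of the idea behind the new check in `E2eKeysHandler.query_devices`: cached device lists are only trusted for remote users the server still shares a room with, and everyone else falls back to a federation query. The function name, types, and data in this sketch are illustrative only, not Synapse's actual API.

```python
from typing import Dict, Set, Tuple

# user_id -> device_id -> device key info (simplified stand-in for Synapse's types)
DeviceMap = Dict[str, Dict[str, dict]]


def split_cached_results(
    remote_results: DeviceMap,
    users_still_shared: Set[str],
) -> Tuple[DeviceMap, Set[str]]:
    """Split cached device lists into entries we can trust and users to re-query."""
    usable = {
        user_id: devices
        for user_id, devices in remote_results.items()
        if user_id in users_still_shared
    }
    must_refetch = set(remote_results) - users_still_shared
    return usable, must_refetch


# Example: @bob's cache entry is dropped because no room is shared with him any
# more, so his devices would be fetched over federation instead of from the cache.
cached = {"@alice:remote": {"DEV1": {}}, "@bob:remote": {"DEV2": {}}}
usable, must_refetch = split_cached_results(cached, {"@alice:remote"})
assert usable == {"@alice:remote": {"DEV1": {}}}
assert must_refetch == {"@bob:remote"}
```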