to_device updates could be dropped when consuming the replication stream (#15349)

Co-authored-by: reivilibre <oliverw@matrix.org>
2024-06-17 01:59:26 +02:00 · 2023-03-30 19:41:14 +02:00 · 2023-03-30 19:41:14 +02:00 · 6f68e32bfb
parent 91c3f32673
commit 6f68e32bfb
5 changed files with 98 additions and 15 deletions
--- a/changelog.d/15349.bugfix
+++ b/changelog.d/15349.bugfix
@ -0,0 +1 @@
+Fix a long-standing bug where some to_device messages could be dropped when using workers.
--- a/synapse/storage/databases/main/deviceinbox.py
+++ b/synapse/storage/databases/main/deviceinbox.py
@ -617,14 +617,14 @@ class DeviceInboxWorkerStore(SQLBaseStore):
            # We limit like this as we might have multiple rows per stream_id, and
            # we want to make sure we always get all entries for any stream_id
            # we return.
-            upper_pos = min(current_id, last_id + limit)
+            upto_token = min(current_id, last_id + limit)
            sql = (
                "SELECT max(stream_id), user_id"
                " FROM device_inbox"
                " WHERE ? < stream_id AND stream_id <= ?"
                " GROUP BY user_id"
            )
-            txn.execute(sql, (last_id, upper_pos))
+            txn.execute(sql, (last_id, upto_token))
            updates = [(row[0], row[1:]) for row in txn]

            sql = (
@ -633,19 +633,13 @@ class DeviceInboxWorkerStore(SQLBaseStore):
                " WHERE ? < stream_id AND stream_id <= ?"
                " GROUP BY destination"
            )
-            txn.execute(sql, (last_id, upper_pos))
+            txn.execute(sql, (last_id, upto_token))
            updates.extend((row[0], row[1:]) for row in txn)

            # Order by ascending stream ordering
            updates.sort()

-            limited = False
-            upto_token = current_id
-            if len(updates) >= limit:
-                upto_token = updates[-1][0]
-                limited = True
-
-            return updates, upto_token, limited
+            return updates, upto_token, upto_token < current_id

        return await self.db_pool.runInteraction(
            "get_all_new_device_messages", get_all_new_device_messages_txn
--- a/tests/replication/_base.py
+++ b/tests/replication/_base.py
@ -54,6 +54,10 @@ class BaseStreamTestCase(unittest.HomeserverTestCase):
    if not hiredis:
        skip = "Requires hiredis"

+    if not USE_POSTGRES_FOR_TESTS:
+        # Redis replication only takes place on Postgres
+        skip = "Requires Postgres"
+
    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
        # build a replication server
        server_factory = ReplicationStreamProtocolFactory(hs)
--- a/tests/replication/tcp/streams/test_account_data.py
+++ b/tests/replication/tcp/streams/test_account_data.py
@ -37,11 +37,6 @@ class AccountDataStreamTestCase(BaseStreamTestCase):
        # also one global update
        self.get_success(store.add_account_data_for_user("test_user", "m.global", {}))

-        # tell the notifier to catch up to avoid duplicate rows.
-        # workaround for https://github.com/matrix-org/synapse/issues/7360
-        # FIXME remove this when the above is fixed
-        self.replicate()
-
        # check we're testing what we think we are: no rows should yet have been
        # received
        self.assertEqual([], self.test_handler.received_rdata_rows)
--- a/tests/replication/tcp/streams/test_to_device.py
+++ b/tests/replication/tcp/streams/test_to_device.py
@ -0,0 +1,89 @@
+# Copyright 2023 The Matrix.org Foundation C.I.C.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+
+import synapse
+from synapse.replication.tcp.streams._base import _STREAM_UPDATE_TARGET_ROW_COUNT
+from synapse.types import JsonDict
+
+from tests.replication._base import BaseStreamTestCase
+
+logger = logging.getLogger(__name__)
+
+
+class ToDeviceStreamTestCase(BaseStreamTestCase):
+    servlets = [
+        synapse.rest.admin.register_servlets,
+        synapse.rest.client.login.register_servlets,
+    ]
+
+    def test_to_device_stream(self) -> None:
+        store = self.hs.get_datastores().main
+
+        user1 = self.register_user("user1", "pass")
+        self.login("user1", "pass", "device")
+        user2 = self.register_user("user2", "pass")
+        self.login("user2", "pass", "device")
+
+        # connect to pull the updates related to users creation/login
+        self.reconnect()
+        self.replicate()
+        self.test_handler.received_rdata_rows.clear()
+        # disconnect so we can accumulate the updates without pulling them
+        self.disconnect()
+
+        msg: JsonDict = {}
+        msg["sender"] = "@sender:example.org"
+        msg["type"] = "m.new_device"
+
+        # add messages to the device inbox for user1 up until the
+        # limit defined for a stream update batch
+        for i in range(0, _STREAM_UPDATE_TARGET_ROW_COUNT):
+            msg["content"] = {"device": {}}
+            messages = {user1: {"device": msg}}
+
+            self.get_success(
+                store.add_messages_from_remote_to_device_inbox(
+                    "example.org",
+                    f"{i}",
+                    messages,
+                )
+            )
+
+        # add one more message, for user2 this time
+        # this message would be dropped before fixing #15335
+        msg["content"] = {"device": {}}
+        messages = {user2: {"device": msg}}
+
+        self.get_success(
+            store.add_messages_from_remote_to_device_inbox(
+                "example.org",
+                f"{_STREAM_UPDATE_TARGET_ROW_COUNT}",
+                messages,
+            )
+        )
+
+        # replication is disconnected so we shouldn't get any updates yet
+        self.assertEqual([], self.test_handler.received_rdata_rows)
+
+        # now reconnect to pull the updates
+        self.reconnect()
+        self.replicate()
+
+        # we should receive the fact that we have to_device updates
+        # for user1 and user2
+        received_rows = self.test_handler.received_rdata_rows
+        self.assertEqual(len(received_rows), 2)
+        self.assertEqual(received_rows[0][2].entity, user1)
+        self.assertEqual(received_rows[1][2].entity, user2)
				`@ -0,0 +1 @@`
				`Fix a long-standing bug where some to_device messages could be dropped when using workers.`