From 9900f7c231f8af536fce229117b0a406dc629293 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Wed, 26 Apr 2023 17:00:11 +0100 Subject: [PATCH] Add admin endpoint to query room sizes (#15482) --- changelog.d/15482.feature | 1 + docs/admin_api/statistics.md | 49 ++++++++++ synapse/rest/admin/__init__.py | 6 +- synapse/rest/admin/statistics.py | 25 ++++++ synapse/storage/controllers/__init__.py | 2 + synapse/storage/controllers/stats.py | 113 ++++++++++++++++++++++++ 6 files changed, 195 insertions(+), 1 deletion(-) create mode 100644 changelog.d/15482.feature create mode 100644 synapse/storage/controllers/stats.py diff --git a/changelog.d/15482.feature b/changelog.d/15482.feature new file mode 100644 index 000000000..f3e9f2a5b --- /dev/null +++ b/changelog.d/15482.feature @@ -0,0 +1 @@ +Add admin endpoint to query the largest rooms by disk space used in the database. diff --git a/docs/admin_api/statistics.md b/docs/admin_api/statistics.md index 03b3621e5..2bd417e90 100644 --- a/docs/admin_api/statistics.md +++ b/docs/admin_api/statistics.md @@ -81,3 +81,52 @@ The following fields are returned in the JSON response body: - `user_id` - string - Fully-qualified user ID (ex. `@user:server.com`). * `next_token` - integer - Opaque value used for pagination. See above. * `total` - integer - Total number of users after filtering. + + +# Get largest rooms by size in database + +Returns the 10 largest rooms and an estimate of how much space in the database +they are taking. + +This does not include the size of any associated media associated with the room. + +Returns an error on SQLite. + +*Note:* This uses the planner statistics from PostgreSQL to do the estimates, +which means that the returned information can vary widely from reality. However, +it should be enough to get a rough idea of where database disk space is going. + + +The API is: + +``` +GET /_synapse/admin/v1/statistics/statistics/database/rooms +``` + +A response body like the following is returned: + +```json +{ + "rooms": [ + { + "room_id": "!OGEhHVWSdvArJzumhm:matrix.org", + "estimated_size": 47325417353 + } + ], +} +``` + + + +**Response** + +The following fields are returned in the JSON response body: + +* `rooms` - An array of objects, sorted by largest room first. Objects contain + the following fields: + - `room_id` - string - The room ID. + - `estimated_size` - integer - Estimated disk space used in bytes by the room + in the database. + + +*Added in Synapse 1.83.0* diff --git a/synapse/rest/admin/__init__.py b/synapse/rest/admin/__init__.py index 79f22a59f..770df261c 100644 --- a/synapse/rest/admin/__init__.py +++ b/synapse/rest/admin/__init__.py @@ -68,7 +68,10 @@ from synapse.rest.admin.rooms import ( RoomTimestampToEventRestServlet, ) from synapse.rest.admin.server_notice_servlet import SendServerNoticeServlet -from synapse.rest.admin.statistics import UserMediaStatisticsRestServlet +from synapse.rest.admin.statistics import ( + LargestRoomsStatistics, + UserMediaStatisticsRestServlet, +) from synapse.rest.admin.username_available import UsernameAvailableRestServlet from synapse.rest.admin.users import ( AccountDataRestServlet, @@ -259,6 +262,7 @@ def register_servlets(hs: "HomeServer", http_server: HttpServer) -> None: UserRestServletV2(hs).register(http_server) UsersRestServletV2(hs).register(http_server) UserMediaStatisticsRestServlet(hs).register(http_server) + LargestRoomsStatistics(hs).register(http_server) EventReportDetailRestServlet(hs).register(http_server) EventReportsRestServlet(hs).register(http_server) AccountDataRestServlet(hs).register(http_server) diff --git a/synapse/rest/admin/statistics.py b/synapse/rest/admin/statistics.py index 9c45f4650..19780e4b4 100644 --- a/synapse/rest/admin/statistics.py +++ b/synapse/rest/admin/statistics.py @@ -113,3 +113,28 @@ class UserMediaStatisticsRestServlet(RestServlet): ret["next_token"] = start + len(users_media) return HTTPStatus.OK, ret + + +class LargestRoomsStatistics(RestServlet): + """Get the largest rooms by database size. + + Only works when using PostgreSQL. + """ + + PATTERNS = admin_patterns("/statistics/database/rooms$") + + def __init__(self, hs: "HomeServer"): + self.auth = hs.get_auth() + self.stats_controller = hs.get_storage_controllers().stats + + async def on_GET(self, request: SynapseRequest) -> Tuple[int, JsonDict]: + await assert_requester_is_admin(self.auth, request) + + room_sizes = await self.stats_controller.get_room_db_size_estimate() + + return HTTPStatus.OK, { + "rooms": [ + {"room_id": room_id, "estimated_size": size} + for room_id, size in room_sizes + ] + } diff --git a/synapse/storage/controllers/__init__.py b/synapse/storage/controllers/__init__.py index 45101cda7..0ef860263 100644 --- a/synapse/storage/controllers/__init__.py +++ b/synapse/storage/controllers/__init__.py @@ -19,6 +19,7 @@ from synapse.storage.controllers.persist_events import ( ) from synapse.storage.controllers.purge_events import PurgeEventsStorageController from synapse.storage.controllers.state import StateStorageController +from synapse.storage.controllers.stats import StatsController from synapse.storage.databases import Databases from synapse.storage.databases.main import DataStore @@ -40,6 +41,7 @@ class StorageControllers: self.purge_events = PurgeEventsStorageController(hs, stores) self.state = StateStorageController(hs, stores) + self.stats = StatsController(hs, stores) self.persistence = None if stores.persist_events: diff --git a/synapse/storage/controllers/stats.py b/synapse/storage/controllers/stats.py new file mode 100644 index 000000000..988e44c6a --- /dev/null +++ b/synapse/storage/controllers/stats.py @@ -0,0 +1,113 @@ +# Copyright 2023 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from collections import Counter +from typing import TYPE_CHECKING, Collection, List, Tuple + +from synapse.api.errors import SynapseError +from synapse.storage.database import LoggingTransaction +from synapse.storage.databases import Databases +from synapse.storage.engines import PostgresEngine + +if TYPE_CHECKING: + from synapse.server import HomeServer + +logger = logging.getLogger(__name__) + + +class StatsController: + """High level interface for getting statistics.""" + + def __init__(self, hs: "HomeServer", stores: Databases): + self.stores = stores + + async def get_room_db_size_estimate(self) -> List[Tuple[str, int]]: + """Get an estimate of the largest rooms and how much database space they + use, in bytes. + + Only works against PostgreSQL. + + Note: this uses the postgres statistics so is a very rough estimate. + """ + + # Note: We look at both tables on the main and state databases. + if not isinstance(self.stores.main.database_engine, PostgresEngine): + raise SynapseError(400, "Endpoint requires using PostgreSQL") + + if not isinstance(self.stores.state.database_engine, PostgresEngine): + raise SynapseError(400, "Endpoint requires using PostgreSQL") + + # For each "large" table, we go through and get the largest rooms + # and an estimate of how much space they take. We can then sum the + # results and return the top 10. + # + # This isn't the most accurate, but given all of these are estimates + # anyway its good enough. + room_estimates: Counter[str] = Counter() + + # Return size of the table on disk, including indexes and TOAST. + table_sql = """ + SELECT pg_total_relation_size(?) + """ + + # Get an estimate for the largest rooms and their frequency. + # + # Note: the cast here is a hack to cast from `anyarray` to an actual + # type. This ensures that psycopg2 passes us a back a a Python list. + column_sql = """ + SELECT + most_common_vals::TEXT::TEXT[], most_common_freqs::TEXT::NUMERIC[] + FROM pg_stats + WHERE tablename = ? and attname = 'room_id' + """ + + def get_room_db_size_estimate_txn( + txn: LoggingTransaction, + tables: Collection[str], + ) -> None: + for table in tables: + txn.execute(table_sql, (table,)) + row = txn.fetchone() + assert row is not None + (table_size,) = row + + txn.execute(column_sql, (table,)) + row = txn.fetchone() + assert row is not None + vals, freqs = row + + for room_id, freq in zip(vals, freqs): + room_estimates[room_id] += int(freq * table_size) + + await self.stores.main.db_pool.runInteraction( + "get_room_db_size_estimate_main", + get_room_db_size_estimate_txn, + ( + "event_json", + "events", + "event_search", + "event_edges", + "event_push_actions", + "stream_ordering_to_exterm", + ), + ) + + await self.stores.state.db_pool.runInteraction( + "get_room_db_size_estimate_state", + get_room_db_size_estimate_txn, + ("state_groups_state",), + ) + + return room_estimates.most_common(10)