0
0
Fork 1
mirror of https://mau.dev/maunium/synapse.git synced 2024-09-24 18:49:01 +02:00

Speed up rebuilding of the user directory for local users (#15529)

The idea here is to batch up the work.
This commit is contained in:
Erik Johnston 2023-05-03 14:41:37 +01:00 committed by GitHub
parent 9890f23469
commit fc3a878220
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 171 additions and 76 deletions

1
changelog.d/15529.misc Normal file
View file

@ -0,0 +1 @@
Speed up rebuilding of the user directory for local users.

View file

@ -386,13 +386,20 @@ class LoggingTransaction:
self.executemany(sql, args) self.executemany(sql, args)
def execute_values( def execute_values(
self, sql: str, values: Iterable[Iterable[Any]], fetch: bool = True self,
sql: str,
values: Iterable[Iterable[Any]],
template: Optional[str] = None,
fetch: bool = True,
) -> List[Tuple]: ) -> List[Tuple]:
"""Corresponds to psycopg2.extras.execute_values. Only available when """Corresponds to psycopg2.extras.execute_values. Only available when
using postgres. using postgres.
The `fetch` parameter must be set to False if the query does not return The `fetch` parameter must be set to False if the query does not return
rows (e.g. INSERTs). rows (e.g. INSERTs).
The `template` is the snippet to merge with every item in `values` (called
`argslist` in psycopg2) to compose the query.
""" """
assert isinstance(self.database_engine, PostgresEngine) assert isinstance(self.database_engine, PostgresEngine)
from psycopg2.extras import execute_values from psycopg2.extras import execute_values
@ -400,7 +407,9 @@ class LoggingTransaction:
return self._do_execute( return self._do_execute(
# TODO: is it safe for values to be Iterable[Iterable[Any]] here? # TODO: is it safe for values to be Iterable[Iterable[Any]] here?
# https://www.psycopg.org/docs/extras.html?highlight=execute_batch#psycopg2.extras.execute_values says values should be Sequence[Sequence] # https://www.psycopg.org/docs/extras.html?highlight=execute_batch#psycopg2.extras.execute_values says values should be Sequence[Sequence]
lambda the_sql: execute_values(self.txn, the_sql, values, fetch=fetch), lambda the_sql: execute_values(
self.txn, the_sql, values, template=template, fetch=fetch
),
sql, sql,
) )

View file

@ -27,6 +27,8 @@ from typing import (
cast, cast,
) )
import attr
try: try:
# Figure out if ICU support is available for searching users. # Figure out if ICU support is available for searching users.
import icu import icu
@ -66,6 +68,19 @@ logger = logging.getLogger(__name__)
TEMP_TABLE = "_temp_populate_user_directory" TEMP_TABLE = "_temp_populate_user_directory"
@attr.s(frozen=True, auto_attribs=True)
class _UserDirProfile:
    """One entry that the user directory code is about to insert into the
    directory: a user ID plus the optional profile fields stored with it.

    Instances are frozen, so fields cannot be reassigned after construction.
    """

    user_id: str

    # Both optional fields are passed through `non_null_str_or_none` when the
    # instance is built, so a display name or avatar URL of an unexpected type
    # is replaced with None. Either field may be omitted and defaults to None.
    display_name: Optional[str] = attr.ib(converter=non_null_str_or_none, default=None)
    avatar_url: Optional[str] = attr.ib(converter=non_null_str_or_none, default=None)
class UserDirectoryBackgroundUpdateStore(StateDeltasStore): class UserDirectoryBackgroundUpdateStore(StateDeltasStore):
# How many records do we calculate before sending it to # How many records do we calculate before sending it to
# add_users_who_share_private_rooms? # add_users_who_share_private_rooms?
@ -381,19 +396,59 @@ class UserDirectoryBackgroundUpdateStore(StateDeltasStore):
% (len(users_to_work_on), progress["remaining"]) % (len(users_to_work_on), progress["remaining"])
) )
for user_id in users_to_work_on: # First filter down to users we want to insert into the user directory.
if await self.should_include_local_user_in_dir(user_id): users_to_insert = [
profile = await self.get_profileinfo(get_localpart_from_id(user_id)) # type: ignore[attr-defined] user_id
await self.update_profile_in_user_dir( for user_id in users_to_work_on
user_id, profile.display_name, profile.avatar_url if await self.should_include_local_user_in_dir(user_id)
]
# Next fetch their profiles. Note that the `user_id` here is the
# *localpart*, and that not all users have profiles.
profile_rows = await self.db_pool.simple_select_many_batch(
table="profiles",
column="user_id",
iterable=[get_localpart_from_id(u) for u in users_to_insert],
retcols=(
"user_id",
"displayname",
"avatar_url",
),
keyvalues={},
desc="populate_user_directory_process_users_get_profiles",
)
profiles = {
f"@{row['user_id']}:{self.server_name}": _UserDirProfile(
f"@{row['user_id']}:{self.server_name}",
row["displayname"],
row["avatar_url"],
)
for row in profile_rows
}
profiles_to_insert = [
profiles.get(user_id) or _UserDirProfile(user_id)
for user_id in users_to_insert
]
# Actually insert the users with their profiles into the directory.
await self.db_pool.runInteraction(
"populate_user_directory_process_users_insertion",
self._update_profiles_in_user_dir_txn,
profiles_to_insert,
) )
# We've finished processing a user. Delete it from the table. # We've finished processing the users. Delete them from the table.
await self.db_pool.simple_delete_one( await self.db_pool.simple_delete_many(
TEMP_TABLE + "_users", {"user_id": user_id} table=TEMP_TABLE + "_users",
column="user_id",
iterable=users_to_work_on,
keyvalues={},
desc="populate_user_directory_process_users_delete",
) )
# Update the remaining counter. # Update the remaining counter.
progress["remaining"] -= 1 progress["remaining"] -= len(users_to_work_on)
await self.db_pool.runInteraction( await self.db_pool.runInteraction(
"populate_user_directory", "populate_user_directory",
self.db_pool.updates._background_update_progress_txn, self.db_pool.updates._background_update_progress_txn,
@ -584,72 +639,102 @@ class UserDirectoryBackgroundUpdateStore(StateDeltasStore):
Update or add a user's profile in the user directory. Update or add a user's profile in the user directory.
If the user is remote, the profile will be marked as not stale. If the user is remote, the profile will be marked as not stale.
""" """
# If the display name or avatar URL are unexpected types, replace with None. await self.db_pool.runInteraction(
display_name = non_null_str_or_none(display_name) "update_profiles_in_user_dir",
avatar_url = non_null_str_or_none(avatar_url) self._update_profiles_in_user_dir_txn,
[_UserDirProfile(user_id, display_name, avatar_url)],
)
def _update_profile_in_user_dir_txn(txn: LoggingTransaction) -> None: def _update_profiles_in_user_dir_txn(
self.db_pool.simple_upsert_txn( self,
txn: LoggingTransaction,
profiles: Sequence[_UserDirProfile],
) -> None:
self.db_pool.simple_upsert_many_txn(
txn, txn,
table="user_directory", table="user_directory",
keyvalues={"user_id": user_id}, key_names=("user_id",),
values={"display_name": display_name, "avatar_url": avatar_url}, key_values=[(p.user_id,) for p in profiles],
value_names=("display_name", "avatar_url"),
value_values=[
(
p.display_name,
p.avatar_url,
)
for p in profiles
],
) )
if not self.hs.is_mine_id(user_id):
# Remote users: Make sure the profile is not marked as stale anymore. # Remote users: Make sure the profile is not marked as stale anymore.
self.db_pool.simple_delete_txn( remote_users = [
p.user_id for p in profiles if not self.hs.is_mine_id(p.user_id)
]
if remote_users:
self.db_pool.simple_delete_many_txn(
txn, txn,
table="user_directory_stale_remote_users", table="user_directory_stale_remote_users",
keyvalues={"user_id": user_id}, column="user_id",
values=remote_users,
keyvalues={},
) )
# The display name that goes into the database index.
index_display_name = display_name
if index_display_name is not None:
index_display_name = _filter_text_for_index(index_display_name)
if isinstance(self.database_engine, PostgresEngine): if isinstance(self.database_engine, PostgresEngine):
# We weight the localpart most highly, then display name and finally # We weight the localpart most highly, then display name and finally
# server name # server name
template = """
(
%s,
setweight(to_tsvector('simple', %s), 'A')
|| setweight(to_tsvector('simple', %s), 'D')
|| setweight(to_tsvector('simple', COALESCE(%s, '')), 'B')
)
"""
sql = """ sql = """
INSERT INTO user_directory_search(user_id, vector) INSERT INTO user_directory_search(user_id, vector)
VALUES (?, VALUES ? ON CONFLICT (user_id) DO UPDATE SET vector=EXCLUDED.vector
setweight(to_tsvector('simple', ?), 'A')
|| setweight(to_tsvector('simple', ?), 'D')
|| setweight(to_tsvector('simple', COALESCE(?, '')), 'B')
) ON CONFLICT (user_id) DO UPDATE SET vector=EXCLUDED.vector
""" """
txn.execute( txn.execute_values(
sql, sql,
[
( (
user_id, p.user_id,
get_localpart_from_id(user_id), get_localpart_from_id(p.user_id),
get_domain_from_id(user_id), get_domain_from_id(p.user_id),
index_display_name, _filter_text_for_index(p.display_name)
), if p.display_name
else None,
)
for p in profiles
],
template=template,
fetch=False,
) )
elif isinstance(self.database_engine, Sqlite3Engine): elif isinstance(self.database_engine, Sqlite3Engine):
value = ( values = []
"%s %s" % (user_id, index_display_name) for p in profiles:
if index_display_name if p.display_name is not None:
else user_id index_display_name = _filter_text_for_index(p.display_name)
) value = f"{p.user_id} {index_display_name}"
self.db_pool.simple_upsert_txn( else:
value = p.user_id
values.append((value,))
self.db_pool.simple_upsert_many_txn(
txn, txn,
table="user_directory_search", table="user_directory_search",
keyvalues={"user_id": user_id}, key_names=("user_id",),
values={"value": value}, key_values=[(p.user_id,) for p in profiles],
value_names=("value",),
value_values=values,
) )
else: else:
# This should be unreachable. # This should be unreachable.
raise Exception("Unrecognized database engine") raise Exception("Unrecognized database engine")
txn.call_after(self.get_user_in_directory.invalidate, (user_id,)) for p in profiles:
txn.call_after(self.get_user_in_directory.invalidate, (p.user_id,))
await self.db_pool.runInteraction(
"update_profile_in_user_dir", _update_profile_in_user_dir_txn
)
async def add_users_who_share_private_room( async def add_users_who_share_private_room(
self, room_id: str, user_id_tuples: Iterable[Tuple[str, str]] self, room_id: str, user_id_tuples: Iterable[Tuple[str, str]]