Add optional ICU support for user search (#14464)

Fixes #13655

This change uses ICU (International Components for Unicode) to improve boundary detection in user search.

This change also adds a new dependency on libicu-dev and pkg-config for the Debian packages, which are available in all supported distros.
This commit is contained in:
Brendan Abolivier 2022-12-12 13:21:17 +01:00 committed by GitHub
parent a5d8fee097
commit 2a3cd59dd0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 166 additions and 6 deletions

View file

@ -0,0 +1 @@
Improve user search for international display names.

7
debian/changelog vendored
View file

@ -1,3 +1,10 @@
matrix-synapse-py3 (1.74.0~rc1) UNRELEASED; urgency=medium
* New dependency on libicu-dev to provide improved results for user
search.
-- Synapse Packaging team <packages@matrix.org> Tue, 06 Dec 2022 15:28:10 +0000
matrix-synapse-py3 (1.73.0) stable; urgency=medium matrix-synapse-py3 (1.73.0) stable; urgency=medium
* New Synapse release 1.73.0. * New Synapse release 1.73.0.

2
debian/control vendored
View file

@ -8,6 +8,8 @@ Build-Depends:
dh-virtualenv (>= 1.1), dh-virtualenv (>= 1.1),
libsystemd-dev, libsystemd-dev,
libpq-dev, libpq-dev,
libicu-dev,
pkg-config,
lsb-release, lsb-release,
python3-dev, python3-dev,
python3, python3,

View file

@ -97,6 +97,8 @@ RUN \
zlib1g-dev \ zlib1g-dev \
git \ git \
curl \ curl \
libicu-dev \
pkg-config \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*

View file

@ -84,6 +84,8 @@ RUN apt-get update -qq -o Acquire::Languages=none \
python3-venv \ python3-venv \
sqlite3 \ sqlite3 \
libpq-dev \ libpq-dev \
libicu-dev \
pkg-config \
xmlsec1 xmlsec1
# Install rust and ensure it's in the PATH # Install rust and ensure it's in the PATH

16
poetry.lock generated
View file

@ -837,6 +837,14 @@ category = "dev"
optional = false optional = false
python-versions = ">=3.5" python-versions = ">=3.5"
[[package]]
name = "pyicu"
version = "2.10.2"
description = "Python extension wrapping the ICU C++ API"
category = "main"
optional = true
python-versions = "*"
[[package]] [[package]]
name = "pyjwt" name = "pyjwt"
version = "2.4.0" version = "2.4.0"
@ -1622,7 +1630,7 @@ docs = ["Sphinx", "repoze.sphinx.autointerface"]
test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"] test = ["zope.i18nmessageid", "zope.testing", "zope.testrunner"]
[extras] [extras]
all = ["matrix-synapse-ldap3", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "authlib", "lxml", "sentry-sdk", "jaeger-client", "opentracing", "txredisapi", "hiredis", "Pympler"] all = ["matrix-synapse-ldap3", "psycopg2", "psycopg2cffi", "psycopg2cffi-compat", "pysaml2", "authlib", "lxml", "sentry-sdk", "jaeger-client", "opentracing", "txredisapi", "hiredis", "Pympler", "pyicu"]
cache-memory = ["Pympler"] cache-memory = ["Pympler"]
jwt = ["authlib"] jwt = ["authlib"]
matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] matrix-synapse-ldap3 = ["matrix-synapse-ldap3"]
@ -1635,11 +1643,12 @@ sentry = ["sentry-sdk"]
systemd = ["systemd-python"] systemd = ["systemd-python"]
test = ["parameterized", "idna"] test = ["parameterized", "idna"]
url-preview = ["lxml"] url-preview = ["lxml"]
user-search = ["pyicu"]
[metadata] [metadata]
lock-version = "1.1" lock-version = "1.1"
python-versions = "^3.7.1" python-versions = "^3.7.1"
content-hash = "8c44ceeb9df5c3ab43040400e0a6b895de49417e61293a1ba027640b34f03263" content-hash = "f20007013f33bc35a01e412c48adc62a936030f3074e06286674c5ad7f44d300"
[metadata.files] [metadata.files]
attrs = [ attrs = [
@ -2427,6 +2436,9 @@ pygments = [
{file = "Pygments-2.11.2-py3-none-any.whl", hash = "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65"}, {file = "Pygments-2.11.2-py3-none-any.whl", hash = "sha256:44238f1b60a76d78fc8ca0528ee429702aae011c265fe6a8dd8b63049ae41c65"},
{file = "Pygments-2.11.2.tar.gz", hash = "sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"}, {file = "Pygments-2.11.2.tar.gz", hash = "sha256:4e426f72023d88d03b2fa258de560726ce890ff3b630f88c21cbb8b2503b8c6a"},
] ]
pyicu = [
{file = "PyICU-2.10.2.tar.gz", hash = "sha256:0c3309eea7fab6857507ace62403515b60fe096cbfb4f90d14f55ff75c5441c1"},
]
pyjwt = [ pyjwt = [
{file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"}, {file = "PyJWT-2.4.0-py3-none-any.whl", hash = "sha256:72d1d253f32dbd4f5c88eaf1fdc62f3a19f676ccbadb9dbc5d07e951b2b26daf"},
{file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"}, {file = "PyJWT-2.4.0.tar.gz", hash = "sha256:d42908208c699b3b973cbeb01a969ba6a96c821eefb1c5bfe4c390c01d67abba"},

View file

@ -208,6 +208,7 @@ hiredis = { version = "*", optional = true }
Pympler = { version = "*", optional = true } Pympler = { version = "*", optional = true }
parameterized = { version = ">=0.7.4", optional = true } parameterized = { version = ">=0.7.4", optional = true }
idna = { version = ">=2.5", optional = true } idna = { version = ">=2.5", optional = true }
pyicu = { version = ">=2.10.2", optional = true }
[tool.poetry.extras] [tool.poetry.extras]
# NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified # NB: Packages that should be part of `pip install matrix-synapse[all]` need to be specified
@ -230,6 +231,10 @@ redis = ["txredisapi", "hiredis"]
# Required to use experimental `caches.track_memory_usage` config option. # Required to use experimental `caches.track_memory_usage` config option.
cache-memory = ["pympler"] cache-memory = ["pympler"]
test = ["parameterized", "idna"] test = ["parameterized", "idna"]
# Allows for better search for international characters in the user directory. This
# requires libicu's development headers installed on the system (e.g. libicu-dev on
# Debian-based distributions).
user-search = ["pyicu"]
# The duplication here is awful. I hate hate hate hate hate it. However, for now I want # The duplication here is awful. I hate hate hate hate hate it. However, for now I want
# to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations: # to ensure you can still `pip install matrix-synapse[all]` like today. Two motivations:
@ -261,6 +266,8 @@ all = [
"txredisapi", "hiredis", "txredisapi", "hiredis",
# cache-memory # cache-memory
"pympler", "pympler",
# improved user search
"pyicu",
# omitted: # omitted:
# - test: it's useful to have this separate from dev deps in the olddeps job # - test: it's useful to have this separate from dev deps in the olddeps job
# - systemd: this is a system-based requirement # - systemd: this is a system-based requirement

25
stubs/icu.pyi Normal file
View file

@ -0,0 +1,25 @@
# Copyright 2022 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Stub for PyICU.
# Minimal stub of PyICU's `Locale`; only the member used for user search is
# declared here.
class Locale:
    # Declared as returning a `Locale`, which is the argument type expected by
    # `BreakIterator.createWordInstance`.
    @staticmethod
    def getDefault() -> Locale: ...
# Minimal stub of PyICU's `BreakIterator`; only the members needed to walk the
# word boundaries of a string are declared here.
class BreakIterator:
    # Builds a word-boundary iterator for the given locale.
    @staticmethod
    def createWordInstance(locale: Locale) -> BreakIterator: ...
    # Sets the text whose boundaries will be iterated over.
    def setText(self, text: str) -> None: ...
    # Returns the offset of the next boundary as an int; the user search code
    # treats a negative value as the end of iteration.
    def nextBoundary(self) -> int: ...

View file

@ -26,6 +26,14 @@ from typing import (
cast, cast,
) )
try:
    # Figure out if ICU support is available for searching users.
    import icu

    USE_ICU = True
except ImportError:
    # Catch ImportError rather than only its subclass ModuleNotFoundError: this
    # also covers PyICU being installed while the ICU shared library it links
    # against cannot be loaded. ICU support is optional, so in both cases we fall
    # back to the regex-based word splitting instead of failing at import time.
    USE_ICU = False
from typing_extensions import TypedDict from typing_extensions import TypedDict
from synapse.api.errors import StoreError from synapse.api.errors import StoreError
@ -900,7 +908,7 @@ def _parse_query_sqlite(search_term: str) -> str:
""" """
# Pull out the individual words, discarding any non-word characters. # Pull out the individual words, discarding any non-word characters.
results = re.findall(r"([\w\-]+)", search_term, re.UNICODE) results = _parse_words(search_term)
return " & ".join("(%s* OR %s)" % (result, result) for result in results) return " & ".join("(%s* OR %s)" % (result, result) for result in results)
@ -910,12 +918,63 @@ def _parse_query_postgres(search_term: str) -> Tuple[str, str, str]:
We use this so that we can add prefix matching, which isn't something We use this so that we can add prefix matching, which isn't something
that is supported by default. that is supported by default.
""" """
results = _parse_words(search_term)
# Pull out the individual words, discarding any non-word characters.
results = re.findall(r"([\w\-]+)", search_term, re.UNICODE)
both = " & ".join("(%s:* | %s)" % (result, result) for result in results) both = " & ".join("(%s:* | %s)" % (result, result) for result in results)
exact = " & ".join("%s" % (result,) for result in results) exact = " & ".join("%s" % (result,) for result in results)
prefix = " & ".join("%s:*" % (result,) for result in results) prefix = " & ".join("%s:*" % (result,) for result in results)
return both, exact, prefix return both, exact, prefix
def _parse_words(search_term: str) -> List[str]:
    """Tokenise a user directory search string into words.

    When PyICU could be imported (`USE_ICU` is true), word boundaries are detected
    with ICU (International Components for Unicode). Otherwise a regular expression
    is used, which is good enough for most latin-based languages but less accurate
    for others.

    Args:
        search_term: The search string.

    Returns:
        A list of the words in the search string.
    """
    if not USE_ICU:
        # No ICU available: treat runs of word characters and hyphens as words.
        return re.findall(r"([\w\-]+)", search_term, re.UNICODE)

    return _parse_words_with_icu(search_term)
def _parse_words_with_icu(search_term: str) -> List[str]:
    """Break down the provided search string into its individual words using ICU
    (International Components for Unicode).

    Args:
        search_term: The search string.

    Returns:
        A list of the words in the search string.
    """
    results: List[str] = []
    breaker = icu.BreakIterator.createWordInstance(icu.Locale.getDefault())
    breaker.setText(search_term)

    # ICU operates on UTF-16, so the boundaries it reports are offsets in UTF-16
    # code units, not in Python code points. The two only agree while the text
    # stays within the Basic Multilingual Plane; a single emoji (two code units,
    # one code point) would shift every subsequent slice. Slice a UTF-16 encoding
    # of the string instead so the offsets always line up.
    utf16 = search_term.encode("utf-16-le", "surrogatepass")

    i = 0
    while True:
        j = breaker.nextBoundary()
        if j < 0:
            break

        # Each UTF-16 code unit is two bytes wide.
        result = utf16[2 * i : 2 * j].decode("utf-16-le", "surrogatepass")

        # libicu considers spaces and punctuation between words as words, but we don't
        # want to include those in results as they would result in syntax errors in SQL
        # queries (e.g. "foo bar" would result in the search query including "foo & &
        # bar").
        if re.search(r"([\w\-]+)", result, re.UNICODE):
            results.append(result)

        i = j

    return results

View file

@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import re
from typing import Any, Dict, Set, Tuple from typing import Any, Dict, Set, Tuple
from unittest import mock from unittest import mock
from unittest.mock import Mock, patch from unittest.mock import Mock, patch
@ -30,6 +31,12 @@ from synapse.util import Clock
from tests.test_utils.event_injection import inject_member_event from tests.test_utils.event_injection import inject_member_event
from tests.unittest import HomeserverTestCase, override_config from tests.unittest import HomeserverTestCase, override_config
try:
import icu
except ImportError:
icu = None # type: ignore
ALICE = "@alice:a" ALICE = "@alice:a"
BOB = "@bob:b" BOB = "@bob:b"
BOBBY = "@bobby:a" BOBBY = "@bobby:a"
@ -467,3 +474,39 @@ class UserDirectoryStoreTestCase(HomeserverTestCase):
r["results"][0], r["results"][0],
{"user_id": BELA, "display_name": "Bela", "avatar_url": None}, {"user_id": BELA, "display_name": "Bela", "avatar_url": None},
) )
class UserDirectoryICUTestCase(HomeserverTestCase):
    """Tests for user directory search that only make sense when PyICU is
    installed; the whole case is skipped otherwise.
    """

    if not icu:
        skip = "Requires PyICU"

    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
        self.store = hs.get_datastores().main
        self.user_dir_helper = GetUserDirectoryTables(self.store)

    def test_icu_word_boundary(self) -> None:
        """Tests that we correctly detect word boundaries when ICU (International
        Components for Unicode) support is available.
        """

        # "Gáo" with the accent spelled out as a lowercase a followed by a U+0301
        # combining acute accent. The escape is written explicitly so that the
        # test cannot be silently changed by an editor or tool normalising the
        # literal to the precomposed character.
        display_name = "Ga\u0301o"

        # This word is not broken down correctly by Python's regular expressions,
        # because the combining accent is not matched by `\w` and so splits the
        # name in two. This is specifically something that ICU support fixes.
        matches = re.findall(r"([\w\-]+)", display_name, re.UNICODE)
        self.assertEqual(len(matches), 2)

        self.get_success(
            self.store.update_profile_in_user_dir(ALICE, display_name, None)
        )
        self.get_success(self.store.add_users_in_public_rooms("!room:id", (ALICE,)))

        # Check that searching for this user yields the correct result.
        r = self.get_success(self.store.search_user_dir(BOB, display_name, 10))
        self.assertFalse(r["limited"])
        self.assertEqual(len(r["results"]), 1)
        self.assertDictEqual(
            r["results"][0],
            {"user_id": ALICE, "display_name": display_name, "avatar_url": None},
        )