2021-09-22 08:25:26 -07:00
|
|
|
# Copyright 2021 The Matrix.org Foundation C.I.C.
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
2022-11-09 09:55:34 -05:00
|
|
|
from typing import List, Tuple
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 19:05:22 +01:00
|
|
|
from unittest.case import SkipTest
|
|
|
|
|
|
|
|
from twisted.test.proto_helpers import MemoryReactor
|
|
|
|
|
2021-09-22 08:25:26 -07:00
|
|
|
import synapse.rest.admin
|
2022-02-24 11:52:28 +00:00
|
|
|
from synapse.api.constants import EventTypes
|
|
|
|
from synapse.api.errors import StoreError
|
2021-09-22 08:25:26 -07:00
|
|
|
from synapse.rest.client import login, room
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 19:05:22 +01:00
|
|
|
from synapse.server import HomeServer
|
|
|
|
from synapse.storage.databases.main import DataStore
|
|
|
|
from synapse.storage.databases.main.search import Phrase, SearchToken, _tokenize_query
|
2021-09-22 08:25:26 -07:00
|
|
|
from synapse.storage.engines import PostgresEngine
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 19:05:22 +01:00
|
|
|
from synapse.storage.engines.sqlite import Sqlite3Engine
|
|
|
|
from synapse.util import Clock
|
2021-09-22 08:25:26 -07:00
|
|
|
|
2022-02-24 11:52:28 +00:00
|
|
|
from tests.unittest import HomeserverTestCase, skip_unless
|
|
|
|
from tests.utils import USE_POSTGRES_FOR_TESTS
|
2021-09-22 08:25:26 -07:00
|
|
|
|
|
|
|
|
2022-02-24 11:52:28 +00:00
|
|
|
class EventSearchInsertionTest(HomeserverTestCase):
    """Tests for what gets written to (and cleaned out of) `event_search`."""

    servlets = [
        synapse.rest.admin.register_servlets_for_client_rest_resource,
        login.register_servlets,
        room.register_servlets,
    ]

    def test_null_byte(self) -> None:
        """
        Messages containing a null byte must still be sendable and searchable.

        Postgres/SQLite don't like null bytes going into the search tables, so
        internally those are replaced with a space. Make sure nothing breaks as a
        result.
        """
        # Set up a user who owns a room to post into.
        self.register_user("alice", "password")
        token = self.login("alice", "password")
        room_id = self.helper.create_room_as("alice", tok=token)

        # Sending any of these, including the one with a null byte, must not
        # cause an internal server error.
        for message in ("hi\u0000bob", "another message", "hi alice"):
            sent = self.helper.send(room_id, message, tok=token)
            self.assertIn("event_id", sent)

        store = self.hs.get_datastores().main
        # Highlights are only checked when running against Postgres.
        on_postgres = isinstance(store.database_engine, PostgresEngine)

        # The message whose null byte was replaced can be found.
        null_byte_hits = self.get_success(
            store.search_msgs([room_id], "hi bob", ["content.body"])
        )
        self.assertEqual(null_byte_hits.get("count"), 1)
        if on_postgres:
            for word in ("hi", "bob"):
                self.assertIn(word, null_byte_hits.get("highlights"))

        # An unrelated message can be found too.
        unrelated_hits = self.get_success(
            store.search_msgs([room_id], "another", ["content.body"])
        )
        self.assertEqual(unrelated_hits.get("count"), 1)
        if on_postgres:
            self.assertIn("another", unrelated_hits.get("highlights"))

        # A term shared by the null-byte message and another message matches both.
        shared_hits = self.get_success(
            store.search_msgs([room_id], "hi", ["content.body"])
        )
        self.assertEqual(shared_hits.get("count"), 2)

        alice_hits = self.get_success(
            store.search_msgs([room_id], "hi alice", ["content.body"])
        )
        if on_postgres:
            self.assertIn("alice", alice_hits.get("highlights"))

    def test_non_string(self) -> None:
        """Check that a non-string `value` never reaches `event_search`.

        This matters most on sqlite, where a single column can hold both strings
        and integers. On Postgres, integers are automatically converted to
        strings.

        Regression test for #11918.
        """
        hs = self.hs
        store = hs.get_datastores().main

        # Set up a user who owns a room.
        user_id = self.register_user("alice", "password")
        token = self.login("alice", "password")
        room_id = self.helper.create_room_as("alice", tok=token)
        room_version = self.get_success(store.get_room_version(room_id))

        # Build a message whose body is a number and deliver it as if it came in
        # over federation. The client API cannot be used for this because
        # Synapse's event validation would reject the message.
        prev_event_ids = self.get_success(store.get_prev_events_for_room(room_id))
        latest_event_id = prev_event_ids[0]
        prev_event = self.get_success(store.get_event(latest_event_id))
        prev_state_map = self.get_success(
            hs.get_storage_controllers().state.get_state_ids_for_event(
                latest_event_id
            )
        )

        builder = hs.get_event_builder_factory().for_room_version(
            room_version,
            {
                "type": EventTypes.Message,
                "content": {"msgtype": "m.text", "body": 2},
                "room_id": room_id,
                "sender": user_id,
                "prev_events": prev_event_ids,
                "origin_server_ts": self.clock.time_msec(),
            },
        )
        auth_event_ids = hs.get_event_auth_handler().compute_auth_events(
            builder,
            prev_state_map,
            for_verification=False,
        )
        event = self.get_success(
            builder.build(
                prev_event_ids=prev_event_ids,
                auth_event_ids=auth_event_ids,
                depth=prev_event.depth + 1,
            )
        )

        # Hand the event to the federation handler.
        federation_handler = hs.get_federation_event_handler()
        self.get_success(federation_handler.on_receive_pdu(hs.hostname, event))

        # Looking the event up in `event_search` must fail with a 404.
        lookup = store.db_pool.simple_select_one_onecol(
            "event_search",
            {"room_id": room_id, "event_id": event.event_id},
            "event_id",
        )
        failure = self.get_failure(lookup, StoreError)
        self.assertEqual(failure.value.code, 404)

    @skip_unless(not USE_POSTGRES_FOR_TESTS, "requires sqlite")
    def test_sqlite_non_string_deletion_background_update(self) -> None:
        """Check that the background update removes bad rows from `event_search`."""
        db_pool = self.hs.get_datastores().main.db_pool

        # Seed `event_search` with dummy rows; only the last one has a
        # non-string `value`.
        seed_rows = [
            ("event1", "room_id", "content.body", "hi"),
            ("event2", "room_id", "content.body", "2"),
            ("event3", "room_id", "content.body", 3),
        ]
        self.get_success(
            db_pool.simple_insert_many(
                "event_search",
                keys=["event_id", "room_id", "key", "value"],
                values=seed_rows,
                desc="populate_event_search",
            )
        )

        # Queue the background update and wait for it to run.
        # NOTE(review): resetting `_all_done` presumably makes the updater look
        # for pending work again -- confirm against the updater implementation.
        db_pool.updates._all_done = False
        queued_update = {
            "update_name": "event_search_sqlite_delete_non_strings",
            "progress_json": "{}",
        }
        self.get_success(db_pool.simple_insert("background_updates", queued_update))
        self.wait_for_background_updates()

        # Only the string `value`s should be left.
        remaining = self.get_success(
            db_pool.simple_select_onecol(
                "event_search",
                {"room_id": "room_id"},
                "value",
            )
        )
        self.assertCountEqual(remaining, ["hi", "2"])
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 19:05:22 +01:00
|
|
|
|
|
|
|
|
|
|
|
class MessageSearchTest(HomeserverTestCase):
|
|
|
|
"""
|
|
|
|
Check message search.
|
|
|
|
|
|
|
|
A powerful way to check the behaviour is to run the following in Postgres >= 11:
|
|
|
|
|
|
|
|
# SELECT websearch_to_tsquery('english', <your string>);
|
|
|
|
|
|
|
|
The result can be compared to the tokenized version for SQLite and Postgres < 11.
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
servlets = [
|
|
|
|
synapse.rest.admin.register_servlets_for_client_rest_resource,
|
|
|
|
login.register_servlets,
|
|
|
|
room.register_servlets,
|
|
|
|
]
|
|
|
|
|
|
|
|
PHRASE = "the quick brown fox jumps over the lazy dog"
|
|
|
|
|
2022-11-09 09:55:34 -05:00
|
|
|
# Each entry is a search query, followed by a boolean of whether it is in the phrase.
|
|
|
|
COMMON_CASES = [
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 19:05:22 +01:00
|
|
|
("nope", False),
|
|
|
|
("brown", True),
|
|
|
|
("quick brown", True),
|
|
|
|
("brown quick", True),
|
|
|
|
("quick \t brown", True),
|
|
|
|
("jump", True),
|
|
|
|
("brown nope", False),
|
2022-11-09 09:55:34 -05:00
|
|
|
('"brown quick"', False),
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 19:05:22 +01:00
|
|
|
('"jumps over"', True),
|
2022-11-09 09:55:34 -05:00
|
|
|
('"quick fox"', False),
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 19:05:22 +01:00
|
|
|
("nope OR doublenope", False),
|
2022-11-09 09:55:34 -05:00
|
|
|
("furphy OR fox", True),
|
|
|
|
("fox -nope", True),
|
|
|
|
("fox -brown", False),
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 19:05:22 +01:00
|
|
|
('"fox" quick', True),
|
|
|
|
('"quick brown', True),
|
|
|
|
('" quick "', True),
|
|
|
|
('" nope"', False),
|
|
|
|
]
|
|
|
|
# TODO Test non-ASCII cases.
|
|
|
|
|
|
|
|
# Cases that fail on SQLite.
|
2022-11-09 09:55:34 -05:00
|
|
|
POSTGRES_CASES = [
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 19:05:22 +01:00
|
|
|
# SQLite treats NOT as a binary operator.
|
2022-11-09 09:55:34 -05:00
|
|
|
("- fox", False),
|
|
|
|
("- nope", True),
|
|
|
|
('"-fox quick', False),
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 19:05:22 +01:00
|
|
|
# PostgreSQL skips stop words.
|
|
|
|
('"the quick brown"', True),
|
|
|
|
('"over lazy"', True),
|
|
|
|
]
|
|
|
|
|
|
|
|
def prepare(
    self, reactor: MemoryReactor, clock: Clock, homeserver: HomeServer
) -> None:
    """Create a room containing `PHRASE` and finalise this test's common cases.

    Registers and logs in "alice", creates a room as her and sends `PHRASE`
    to it. The engine-dependent case for a query with a missing trailing
    double quote is then added to this instance's `COMMON_CASES`.

    Args:
        reactor: Not used by this method.
        clock: Not used by this method.
        homeserver: The homeserver whose main datastore's database engine
            decides the expected result of the extra case.
    """
    # Register a user and create a room, create some messages
    self.register_user("alice", "password")
    self.access_token = self.login("alice", "password")
    self.room_id = self.helper.create_room_as("alice", tok=self.access_token)

    # Send the phrase as a message and check it was created
    response = self.helper.send(self.room_id, self.PHRASE, tok=self.access_token)
    self.assertIn("event_id", response)

    # The behaviour of a missing trailing double quote changed in PostgreSQL 14
    # from ignoring the initial double quote to treating it as a phrase.
    main_store = homeserver.get_datastores().main
    found = False
    if isinstance(main_store.database_engine, PostgresEngine):
        assert main_store.database_engine._version is not None
        found = main_store.database_engine._version < 140000

    # Rebind a new list on the instance instead of appending in place:
    # `COMMON_CASES` is a class attribute, so an in-place append would add one
    # more duplicate entry to the shared list every time `prepare` runs.
    self.COMMON_CASES = [*self.COMMON_CASES, ('"fox quick', found)]
|
2022-10-27 09:58:12 -04:00
|
|
|
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 19:05:22 +01:00
|
|
|
def test_tokenize_query(self) -> None:
    """Check that `_tokenize_query` splits user queries into the expected tokens."""
    # Maps each raw query to the token list it should produce.
    expected_tokens = {
        "brown": ["brown"],
        "quick brown": ["quick", SearchToken.And, "brown"],
        "quick \t brown": ["quick", SearchToken.And, "brown"],
        '"brown quick"': [Phrase(["brown", "quick"])],
        "furphy OR fox": ["furphy", SearchToken.Or, "fox"],
        "fox -brown": ["fox", SearchToken.Not, "brown"],
        "- fox": [SearchToken.Not, "fox"],
        '"fox" quick': [Phrase(["fox"]), SearchToken.And, "quick"],
        # No trailing double quote.
        '"fox quick': [Phrase(["fox", "quick"])],
        '"-fox quick': [Phrase(["-fox", "quick"])],
        '" quick "': [Phrase(["quick"])],
        'q"uick brow"n': [
            "q",
            SearchToken.And,
            Phrase(["uick", "brow"]),
            SearchToken.And,
            "n",
        ],
        '-"quick brown"': [SearchToken.Not, Phrase(["quick", "brown"])],
    }

    for query, expected in expected_tokens.items():
        tokenized = _tokenize_query(query)
        failure_message = f"{tokenized} != {expected} for {query}"
        self.assertEqual(tokenized, expected, failure_message)
|
|
|
|
|
|
|
|
def _check_test_cases(
|
2022-11-09 09:55:34 -05:00
|
|
|
self, store: DataStore, cases: List[Tuple[str, bool]]
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved by
- If on postgresql 11+, pass the user input to `websearch_to_tsquery`
- If on sqlite, manually parse the query and transform it into the sqlite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 19:05:22 +01:00
|
|
|
) -> None:
|
|
|
|
# Run all the test cases versus search_msgs
|
|
|
|
for query, expect_to_contain in cases:
|
|
|
|
result = self.get_success(
|
|
|
|
store.search_msgs([self.room_id], query, ["content.body"])
|
|
|
|
)
|
|
|
|
self.assertEquals(
|
|
|
|
result["count"],
|
|
|
|
1 if expect_to_contain else 0,
|
|
|
|
f"expected '{query}' to match '{self.PHRASE}'"
|
|
|
|
if expect_to_contain
|
|
|
|
else f"'{query}' unexpectedly matched '{self.PHRASE}'",
|
|
|
|
)
|
|
|
|
self.assertEquals(
|
|
|
|
len(result["results"]),
|
|
|
|
1 if expect_to_contain else 0,
|
|
|
|
"results array length should match count",
|
|
|
|
)
|
|
|
|
|
|
|
|
# Run them again versus search_rooms
|
|
|
|
for query, expect_to_contain in cases:
|
|
|
|
result = self.get_success(
|
|
|
|
store.search_rooms([self.room_id], query, ["content.body"], 10)
|
|
|
|
)
|
|
|
|
self.assertEquals(
|
|
|
|
result["count"],
|
|
|
|
1 if expect_to_contain else 0,
|
|
|
|
f"expected '{query}' to match '{self.PHRASE}'"
|
|
|
|
if expect_to_contain
|
|
|
|
else f"'{query}' unexpectedly matched '{self.PHRASE}'",
|
|
|
|
)
|
|
|
|
self.assertEquals(
|
|
|
|
len(result["results"]),
|
|
|
|
1 if expect_to_contain else 0,
|
|
|
|
"results array length should match count",
|
|
|
|
)
|
|
|
|
|
2022-12-09 12:36:32 -05:00
|
|
|
def test_postgres_web_search_for_phrase(self) -> None:
    """
    Test searching for phrases using typical web search syntax, as per postgres' websearch_to_tsquery.

    The common cases and the postgres-specific cases are both run through
    `_check_test_cases`. This test is skipped when the main datastore's
    database engine is not postgres.

    See https://www.postgresql.org/docs/current/textsearch-controls.html
    """
    main_store = self.hs.get_datastores().main

    # Nothing to verify unless the main datastore is backed by postgres.
    running_on_postgres = isinstance(main_store.database_engine, PostgresEngine)
    if not running_on_postgres:
        raise SkipTest("Test only applies when postgres is used as the database")

    all_cases = self.COMMON_CASES + self.POSTGRES_CASES
    self._check_test_cases(main_store, all_cases)
|
Unified search query syntax using the full-text search capabilities of the underlying DB. (#11635)
Support a unified search query syntax which leverages more of the full-text
search of each database supported by Synapse.
Supports, with the same syntax across Postgresql 11+ and Sqlite:
- quoted "search terms"
- `AND`, `OR`, `-` (negation) operators
- Matching words based on their stem, e.g. searches for "dog" matches
documents containing "dogs".
This is achieved as follows:
- On PostgreSQL 11+, the user input is passed to `websearch_to_tsquery`.
- On SQLite, the query is parsed manually and transformed into the SQLite-specific
query syntax.
Note that postgresql 10, which is close to end-of-life, falls back to using
`phraseto_tsquery`, which only supports a subset of the features.
Multiple terms separated by a space are implicitly ANDed.
Note that:
1. There is no escaping of full-text syntax that might be supported by the database;
e.g. `NOT`, `NEAR`, `*` in sqlite. This runs the risk that people might discover this
as accidental functionality and depend on something we don't guarantee.
2. English text is assumed for stemming. To support other languages, either the target
language needs to be known at the time of indexing the message (via room metadata,
or otherwise), or a separate index for each language supported could be created.
Sqlite docs: https://www.sqlite.org/fts3.html#full_text_index_queries
Postgres docs: https://www.postgresql.org/docs/11/textsearch-controls.html
2022-10-25 19:05:22 +01:00
|
|
|
|
2022-12-09 12:36:32 -05:00
|
|
|
def test_sqlite_search(self) -> None:
    """
    Test sqlite searching for phrases.

    Only the common cases are run through `_check_test_cases`. This test is
    skipped when the main datastore's database engine is not sqlite.
    """
    main_store = self.hs.get_datastores().main
    engine = main_store.database_engine

    if isinstance(engine, Sqlite3Engine):
        self._check_test_cases(main_store, self.COMMON_CASES)
    else:
        raise SkipTest("Test only applies when sqlite is used as the database")
|