Refactor oEmbed previews (#10814)

The major change is moving the decision of whether to use oEmbed
further up the call-stack. This reverts the _download_url method to
being a "dumb" function which takes a single URL and downloads it
(as it was before #7920).

This also makes more minor refactorings:

* Renames internal variables for clarity.
* Factors out shared code between the HTML and rich oEmbed
  previews.
* Fixes tests to preview an oEmbed image.
This commit is contained in:
Patrick Cloke 2021-09-21 12:09:57 -04:00 committed by GitHub
parent 2843058a8b
commit ba7a91aea5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 299 additions and 220 deletions

View file

@ -0,0 +1 @@
Improve oEmbed previews by processing the author name, photo, and video information.

View file

@ -25,16 +25,14 @@ When Synapse is asked to preview a URL it does the following:
3. Kicks off a background process to generate a preview:
1. Checks the database cache by URL and timestamp and returns the result if it
has not expired and was successful (a 2xx return code).
2. Checks if the URL matches an oEmbed pattern. If it does, fetch the oEmbed
response. If this is an image, replace the URL to fetch and continue. If
it is HTML content, use the HTML as the document and continue.
3. If it doesn't match an oEmbed pattern, downloads the URL and stores it
into a file via the media storage provider and saves the local media
metadata.
5. If the media is an image:
2. Checks if the URL matches an [oEmbed](https://oembed.com/) pattern. If it
does, update the URL to download.
3. Downloads the URL and stores it into a file via the media storage provider
and saves the local media metadata.
4. If the media is an image:
1. Generates thumbnails.
2. Generates an Open Graph response based on image properties.
6. If the media is HTML:
5. If the media is HTML:
1. Decodes the HTML via the stored file.
2. Generates an Open Graph response from the HTML.
3. If an image exists in the Open Graph response:
@ -42,6 +40,13 @@ When Synapse is asked to preview a URL it does the following:
provider and saves the local media metadata.
2. Generates thumbnails.
3. Updates the Open Graph response based on image properties.
6. If the media is JSON and an oEmbed URL was found:
1. Convert the oEmbed response to an Open Graph response.
2. If a thumbnail or image is in the oEmbed response:
1. Downloads the URL and stores it into a file via the media storage
provider and saves the local media metadata.
2. Generates thumbnails.
3. Updates the Open Graph response based on image properties.
7. Stores the result in the database cache.
4. Returns the result.

View file

@ -12,11 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import urllib.parse
from typing import TYPE_CHECKING, Optional
import attr
from synapse.http.client import SimpleHttpClient
from synapse.types import JsonDict
from synapse.util import json_decoder
if TYPE_CHECKING:
from synapse.server import HomeServer
@ -24,18 +27,15 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
@attr.s(slots=True, frozen=True, auto_attribs=True)
class OEmbedResult:
    """The result of parsing an oEmbed response into Open Graph data."""

    # The Open Graph result (converted from the oEmbed result).
    open_graph_result: JsonDict

    # Number of seconds to cache the content, according to the oEmbed response.
    #
    # This will be None if no cache-age is provided in the oEmbed response (or
    # if the oEmbed response cannot be turned into an Open Graph response).
    cache_age: Optional[int]
class OEmbedProvider:
@ -81,75 +81,106 @@ class OEmbedProvider:
"""
for url_pattern, endpoint in self._oembed_patterns.items():
if url_pattern.fullmatch(url):
return endpoint
# TODO Specify max height / width.
# Note that only the JSON format is supported, some endpoints want
# this in the URL, others want it as an argument.
endpoint = endpoint.replace("{format}", "json")
args = {"url": url, "format": "json"}
query_str = urllib.parse.urlencode(args, True)
return f"{endpoint}?{query_str}"
# No match.
return None
def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
    """
    Parse the oEmbed response into an Open Graph response.

    Args:
        url: The URL which is being previewed (not the one which was
            requested).
        raw_body: The oEmbed response as JSON encoded as bytes.

    Returns:
        An OEmbedResult wrapping the json-encoded Open Graph data and the
        cache age (None on parse failure).
    """
    try:
        # oEmbed responses *must* be UTF-8 according to the spec.
        oembed = json_decoder.decode(raw_body.decode("utf-8"))

        # Ensure there's a version of 1.0.
        oembed_version = oembed["version"]
        if oembed_version != "1.0":
            raise RuntimeError(f"Invalid version: {oembed_version}")

        # Ensure the cache age is None or an int.
        cache_age = oembed.get("cache_age")
        if cache_age:
            cache_age = int(cache_age)

        # The results.
        open_graph_response = {"og:title": oembed.get("title")}

        # If a thumbnail exists, use it. Note that dimensions will be calculated later.
        if "thumbnail_url" in oembed:
            open_graph_response["og:image"] = oembed["thumbnail_url"]

        # Process each type separately.
        oembed_type = oembed["type"]
        if oembed_type == "rich":
            calc_description_and_urls(open_graph_response, oembed["html"])

        elif oembed_type == "photo":
            # If this is a photo, use the full image, not the thumbnail.
            open_graph_response["og:image"] = oembed["url"]

        else:
            raise RuntimeError(f"Unknown oEmbed type: {oembed_type}")

    except Exception as e:
        # Trap any exception and let the code follow as usual.
        # Note: use lazy %-style logging with %r rather than an f-string —
        # "{e:r}" is an invalid format spec and would itself raise here.
        logger.warning("Error parsing oEmbed metadata from %s: %r", url, e)
        open_graph_response = {}
        cache_age = None

    return OEmbedResult(open_graph_response, cache_age)
def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
    """
    Calculate description for an HTML document.

    This uses lxml to convert the HTML document into plaintext. If errors
    occur during processing of the document, an empty response is returned.

    Args:
        open_graph_response: The current Open Graph summary. This is updated
            with additional fields.
        html_body: The HTML document, as a string.
    """
    # If there's no body, nothing useful is going to be found.
    if not html_body:
        return

    from lxml import etree

    # Create an HTML parser. If this fails, log and return no metadata.
    parser = etree.HTMLParser(recover=True, encoding="utf-8")

    # Attempt to parse the body. If this fails, log and return no metadata.
    tree = etree.fromstring(html_body, parser)

    # The data was successfully parsed, but no tree was found.
    if tree is None:
        return

    # Imported here to avoid a circular import with preview_url_resource,
    # which itself imports this module.
    from synapse.rest.media.v1.preview_url_resource import _calc_description

    description = _calc_description(tree)
    if description:
        open_graph_response["og:description"] = description

View file

@ -44,7 +44,7 @@ from synapse.logging.context import make_deferred_yieldable, run_in_background
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.rest.media.v1._base import get_filename_from_headers
from synapse.rest.media.v1.media_storage import MediaStorage
from synapse.rest.media.v1.oembed import OEmbedError, OEmbedProvider
from synapse.rest.media.v1.oembed import OEmbedProvider
from synapse.types import JsonDict
from synapse.util import json_encoder
from synapse.util.async_helpers import ObservableDeferred
@ -73,6 +73,7 @@ OG_TAG_NAME_MAXLEN = 50
OG_TAG_VALUE_MAXLEN = 1000
ONE_HOUR = 60 * 60 * 1000
ONE_DAY = 24 * ONE_HOUR
@attr.s(slots=True, frozen=True, auto_attribs=True)
@ -255,10 +256,19 @@ class PreviewUrlResource(DirectServeJsonResource):
og = og.encode("utf8")
return og
media_info = await self._download_url(url, user)
# If this URL can be accessed via oEmbed, use that instead.
url_to_download = url
oembed_url = self._oembed.get_oembed_url(url)
if oembed_url:
url_to_download = oembed_url
media_info = await self._download_url(url_to_download, user)
logger.debug("got media_info of '%s'", media_info)
# The number of milliseconds that the response should be considered valid.
expiration_ms = media_info.expires
if _is_media(media_info.media_type):
file_id = media_info.filesystem_id
dims = await self.media_repo._generate_thumbnails(
@ -288,34 +298,22 @@ class PreviewUrlResource(DirectServeJsonResource):
encoding = get_html_media_encoding(body, media_info.media_type)
og = decode_and_calc_og(body, media_info.uri, encoding)
# pre-cache the image for posterity
# FIXME: it might be cleaner to use the same flow as the main /preview_url
# request itself and benefit from the same caching etc. But for now we
# just rely on the caching on the master request to speed things up.
if "og:image" in og and og["og:image"]:
image_info = await self._download_url(
_rebase_url(og["og:image"], media_info.uri), user
)
await self._precache_image_url(user, media_info, og)
if _is_media(image_info.media_type):
# TODO: make sure we don't choke on white-on-transparent images
file_id = image_info.filesystem_id
dims = await self.media_repo._generate_thumbnails(
None, file_id, file_id, image_info.media_type, url_cache=True
)
if dims:
og["og:image:width"] = dims["width"]
og["og:image:height"] = dims["height"]
else:
logger.warning("Couldn't get dims for %s", og["og:image"])
elif oembed_url and _is_json(media_info.media_type):
# Handle an oEmbed response.
with open(media_info.filename, "rb") as file:
body = file.read()
oembed_response = self._oembed.parse_oembed_response(media_info.uri, body)
og = oembed_response.open_graph_result
# Use the cache age from the oEmbed result, instead of the HTTP response.
if oembed_response.cache_age is not None:
expiration_ms = oembed_response.cache_age
await self._precache_image_url(user, media_info, og)
og[
"og:image"
] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
og["og:image:type"] = image_info.media_type
og["matrix:image:size"] = image_info.media_length
else:
del og["og:image"]
else:
logger.warning("Failed to find any OG data in %s", url)
og = {}
@ -336,12 +334,15 @@ class PreviewUrlResource(DirectServeJsonResource):
jsonog = json_encoder.encode(og)
# Cap the amount of time to consider a response valid.
expiration_ms = min(expiration_ms, ONE_DAY)
# store OG in history-aware DB cache
await self.store.store_url_cache(
url,
media_info.response_code,
media_info.etag,
media_info.expires + media_info.created_ts_ms,
media_info.created_ts_ms + expiration_ms,
jsonog,
media_info.filesystem_id,
media_info.created_ts_ms,
@ -358,88 +359,52 @@ class PreviewUrlResource(DirectServeJsonResource):
file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)
# If this URL can be accessed via oEmbed, use that instead.
url_to_download: Optional[str] = url
oembed_url = self._oembed.get_oembed_url(url)
if oembed_url:
# The result might be a new URL to download, or it might be HTML content.
with self.media_storage.store_into_file(file_info) as (f, fname, finish):
try:
oembed_result = await self._oembed.get_oembed_content(oembed_url, url)
if oembed_result.url:
url_to_download = oembed_result.url
elif oembed_result.html:
url_to_download = None
except OEmbedError:
# If an error occurs, try doing a normal preview.
pass
if url_to_download:
with self.media_storage.store_into_file(file_info) as (f, fname, finish):
try:
logger.debug("Trying to get preview for url '%s'", url_to_download)
length, headers, uri, code = await self.client.get_file(
url_to_download,
output_stream=f,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url_to_download, e)
raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)
await finish()
if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
else:
media_type = "application/octet-stream"
download_name = get_filename_from_headers(headers)
# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
expires = ONE_HOUR
etag = (
headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
logger.debug("Trying to get preview for url '%s'", url)
length, headers, uri, code = await self.client.get_file(
url,
output_stream=f,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
)
else:
# we can only get here if we did an oembed request and have an oembed_result.html
assert oembed_result.html is not None
assert oembed_url is not None
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500.
raise
except DNSLookupError:
# DNS lookup returned no results
# Note: This will also be the case if one of the resolved IP
# addresses is blacklisted
raise SynapseError(
502,
"DNS resolution failure during URL preview generation",
Codes.UNKNOWN,
)
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url, e)
html_bytes = oembed_result.html.encode("utf-8")
with self.media_storage.store_into_file(file_info) as (f, fname, finish):
f.write(html_bytes)
await finish()
raise SynapseError(
500,
"Failed to download content: %s"
% (traceback.format_exception_only(sys.exc_info()[0], e),),
Codes.UNKNOWN,
)
await finish()
media_type = "text/html"
download_name = oembed_result.title
length = len(html_bytes)
# If a specific cache age was not given, assume 1 hour.
expires = oembed_result.cache_age or ONE_HOUR
uri = oembed_url
code = 200
etag = None
if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii")
else:
media_type = "application/octet-stream"
download_name = get_filename_from_headers(headers)
# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
expires = ONE_HOUR
etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
try:
time_now_ms = self.clock.time_msec()
@ -474,6 +439,46 @@ class PreviewUrlResource(DirectServeJsonResource):
etag=etag,
)
async def _precache_image_url(
    self, user: str, media_info: MediaInfo, og: JsonDict
) -> None:
    """
    Pre-cache the image (if one exists) for posterity

    Args:
        user: The user requesting the preview.
        media_info: The media being previewed.
        og: The Open Graph dictionary. This is modified with image information.
    """
    # Nothing to do when the Open Graph data carries no (non-empty) image URL.
    image_url = og.get("og:image")
    if not image_url:
        return

    # FIXME: it might be cleaner to use the same flow as the main /preview_url
    # request itself and benefit from the same caching etc. But for now we
    # just rely on the caching on the master request to speed things up.
    image_info = await self._download_url(
        _rebase_url(image_url, media_info.uri), user
    )

    if not _is_media(image_info.media_type):
        # Not a usable image type; drop the entry from the response.
        del og["og:image"]
        return

    # TODO: make sure we don't choke on white-on-transparent images
    file_id = image_info.filesystem_id
    dims = await self.media_repo._generate_thumbnails(
        None, file_id, file_id, image_info.media_type, url_cache=True
    )
    if dims:
        og["og:image:width"] = dims["width"]
        og["og:image:height"] = dims["height"]
    else:
        logger.warning("Couldn't get dims for %s", og["og:image"])

    # Point the Open Graph image at the locally cached copy.
    og["og:image"] = f"mxc://{self.server_name}/{file_id}"
    og["og:image:type"] = image_info.media_type
    og["matrix:image:size"] = image_info.media_length
def _start_expire_url_cache_data(self) -> Deferred:
return run_as_background_process(
"expire_url_cache_data", self._expire_url_cache_data
@ -527,7 +532,7 @@ class PreviewUrlResource(DirectServeJsonResource):
# These may be cached for a bit on the client (i.e., they
# may have a room open with a preview url thing open).
# So we wait a couple of days before deleting, just in case.
expire_before = now - 2 * 24 * ONE_HOUR
expire_before = now - 2 * ONE_DAY
media_ids = await self.store.get_url_cache_media_before(expire_before)
removed_media = []
@ -669,7 +674,18 @@ def decode_and_calc_og(
def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
# suck our tree into lxml and define our OG response.
"""
Calculate metadata for an HTML document.
This uses lxml to search the HTML document for Open Graph data.
Args:
tree: The parsed HTML document.
media_url: The URI used to download the body.
Returns:
The Open Graph response as a dictionary.
"""
# if we see any image URLs in the OG response, then spider them
# (although the client could choose to do this by asking for previews of those
@ -743,35 +759,7 @@ def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
if meta_description:
og["og:description"] = meta_description[0]
else:
# grab any text nodes which are inside the <body/> tag...
# unless they are within an HTML5 semantic markup tag...
# <header/>, <nav/>, <aside/>, <footer/>
# ...or if they are within a <script/> or <style/> tag.
# This is a very very very coarse approximation to a plain text
# render of the page.
# We don't just use XPATH here as that is slow on some machines.
from lxml import etree
TAGS_TO_REMOVE = (
"header",
"nav",
"aside",
"footer",
"script",
"noscript",
"style",
etree.Comment,
)
# Split all the text nodes into paragraphs (by splitting on new
# lines)
text_nodes = (
re.sub(r"\s+", "\n", el).strip()
for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE)
)
og["og:description"] = summarize_paragraphs(text_nodes)
og["og:description"] = _calc_description(tree)
elif og["og:description"]:
# This must be a non-empty string at this point.
assert isinstance(og["og:description"], str)
@ -782,6 +770,46 @@ def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
return og
def _calc_description(tree: "etree.Element") -> Optional[str]:
    """
    Calculate a text description based on an HTML document.

    Grabs any text nodes which are inside the <body/> tag, unless they are within
    an HTML5 semantic markup tag (<header/>, <nav/>, <aside/>, <footer/>), or
    if they are within a <script/> or <style/> tag.

    This is a very very very coarse approximation to a plain text render of the page.

    Args:
        tree: The parsed HTML document.

    Returns:
        The plain text description, or None if one cannot be generated.
    """
    # We don't just use XPATH here as that is slow on some machines.
    from lxml import etree

    # Tags whose contents never contribute to a readable description.
    ignored_tags = (
        "header",
        "nav",
        "aside",
        "footer",
        "script",
        "noscript",
        "style",
        etree.Comment,
    )

    # Collapse runs of whitespace in each text node to single newlines so
    # the nodes split cleanly into paragraphs.
    paragraphs = (
        re.sub(r"\s+", "\n", text).strip()
        for text in _iterate_over_text(tree.find("body"), *ignored_tags)
    )

    return summarize_paragraphs(paragraphs)
def _iterate_over_text(
tree: "etree.Element", *tags_to_ignore: Iterable[Union[str, "etree.Comment"]]
) -> Generator[str, None, None]:
@ -841,11 +869,25 @@ def _is_html(content_type: str) -> bool:
)
def _is_json(content_type: str) -> bool:
return content_type.lower().startswith("application/json")
def summarize_paragraphs(
text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
) -> Optional[str]:
# Try to get a summary of between 200 and 500 words, respecting
# first paragraph and then word boundaries.
"""
Try to get a summary respecting first paragraph and then word boundaries.
Args:
text_nodes: The paragraphs to summarize.
min_size: The minimum number of words to include.
max_size: The maximum number of words to include.
Returns:
A summary of the text nodes, or None if that was not possible.
"""
# TODO: Respect sentences?
description = ""
@ -868,7 +910,7 @@ def summarize_paragraphs(
new_desc = ""
# This splits the paragraph into words, but keeping the
# (preceeding) whitespace intact so we can easily concat
# (preceding) whitespace intact so we can easily concat
# words back together.
for match in re.finditer(r"\s*\S+", description):
word = match.group()

View file

@ -24,6 +24,7 @@ from synapse.config.oembed import OEmbedEndpointConfig
from tests import unittest
from tests.server import FakeTransport
from tests.test_utils import SMALL_PNG
try:
import lxml
@ -576,13 +577,6 @@ class URLPreviewTests(unittest.HomeserverTestCase):
}
oembed_content = json.dumps(result).encode("utf-8")
end_content = (
b"<html><head>"
b"<title>Some Title</title>"
b'<meta property="og:description" content="hi" />'
b"</head></html>"
)
channel = self.make_request(
"GET",
"preview_url?url=http://twitter.com/matrixdotorg/status/12345",
@ -606,6 +600,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
self.pump()
# Ensure a second request is made to the photo URL.
client = self.reactor.tcpClients[1][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
@ -613,18 +608,23 @@ class URLPreviewTests(unittest.HomeserverTestCase):
client.dataReceived(
(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
b'Content-Type: text/html; charset="utf8"\r\n\r\n'
b"Content-Type: image/png\r\n\r\n"
)
% (len(end_content),)
+ end_content
% (len(SMALL_PNG),)
+ SMALL_PNG
)
self.pump()
# Ensure the URL is what was requested.
self.assertIn(b"/matrixdotorg", server.data)
self.assertEqual(channel.code, 200)
self.assertEqual(
channel.json_body, {"og:title": "Some Title", "og:description": "hi"}
)
self.assertIsNone(channel.json_body["og:title"])
self.assertTrue(channel.json_body["og:image"].startswith("mxc://"))
self.assertEqual(channel.json_body["og:image:height"], 1)
self.assertEqual(channel.json_body["og:image:width"], 1)
self.assertEqual(channel.json_body["og:image:type"], "image/png")
def test_oembed_rich(self):
"""Test an oEmbed endpoint which returns HTML content via the 'rich' type."""