mirror of
https://mau.dev/maunium/synapse.git
synced 2024-11-12 04:52:26 +01:00
Clean-up logic for rebasing URLs during URL preview. (#12219)
By using urljoin from the standard library and reducing the number of places URLs are rebased.
This commit is contained in:
parent
dda9b7fc4d
commit
4587b35929
4 changed files with 26 additions and 91 deletions
1
changelog.d/12219.misc
Normal file
1
changelog.d/12219.misc
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Clean-up logic around rebasing URLs for URL image previews.
|
|
@ -16,7 +16,6 @@ import itertools
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
|
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union
|
||||||
from urllib import parse as urlparse
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
@ -144,9 +143,7 @@ def decode_body(
|
||||||
return etree.fromstring(body, parser)
|
return etree.fromstring(body, parser)
|
||||||
|
|
||||||
|
|
||||||
def parse_html_to_open_graph(
|
def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
|
||||||
tree: "etree.Element", media_uri: str
|
|
||||||
) -> Dict[str, Optional[str]]:
|
|
||||||
"""
|
"""
|
||||||
Parse the HTML document into an Open Graph response.
|
Parse the HTML document into an Open Graph response.
|
||||||
|
|
||||||
|
@ -155,7 +152,6 @@ def parse_html_to_open_graph(
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
tree: The parsed HTML document.
|
tree: The parsed HTML document.
|
||||||
media_uri: The URI used to download the body.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
The Open Graph response as a dictionary.
|
The Open Graph response as a dictionary.
|
||||||
|
@ -209,7 +205,7 @@ def parse_html_to_open_graph(
|
||||||
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
|
"//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
|
||||||
)
|
)
|
||||||
if meta_image:
|
if meta_image:
|
||||||
og["og:image"] = rebase_url(meta_image[0], media_uri)
|
og["og:image"] = meta_image[0]
|
||||||
else:
|
else:
|
||||||
# TODO: consider inlined CSS styles as well as width & height attribs
|
# TODO: consider inlined CSS styles as well as width & height attribs
|
||||||
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
|
images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
|
||||||
|
@ -320,37 +316,6 @@ def _iterate_over_text(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def rebase_url(url: str, base: str) -> str:
|
|
||||||
"""
|
|
||||||
Resolves a potentially relative `url` against an absolute `base` URL.
|
|
||||||
|
|
||||||
For example:
|
|
||||||
|
|
||||||
>>> rebase_url("subpage", "https://example.com/foo/")
|
|
||||||
'https://example.com/foo/subpage'
|
|
||||||
>>> rebase_url("sibling", "https://example.com/foo")
|
|
||||||
'https://example.com/sibling'
|
|
||||||
>>> rebase_url("/bar", "https://example.com/foo/")
|
|
||||||
'https://example.com/bar'
|
|
||||||
>>> rebase_url("https://alice.com/a/", "https://example.com/foo/")
|
|
||||||
'https://alice.com/a/'
|
|
||||||
"""
|
|
||||||
base_parts = urlparse.urlparse(base)
|
|
||||||
# Convert the parsed URL to a list for (potential) modification.
|
|
||||||
url_parts = list(urlparse.urlparse(url))
|
|
||||||
# Add a scheme, if one does not exist.
|
|
||||||
if not url_parts[0]:
|
|
||||||
url_parts[0] = base_parts.scheme or "http"
|
|
||||||
# Fix up the hostname, if this is not a data URL.
|
|
||||||
if url_parts[0] != "data" and not url_parts[1]:
|
|
||||||
url_parts[1] = base_parts.netloc
|
|
||||||
# If the path does not start with a /, nest it under the base path's last
|
|
||||||
# directory.
|
|
||||||
if not url_parts[2].startswith("/"):
|
|
||||||
url_parts[2] = re.sub(r"/[^/]+$", "/", base_parts.path) + url_parts[2]
|
|
||||||
return urlparse.urlunparse(url_parts)
|
|
||||||
|
|
||||||
|
|
||||||
def summarize_paragraphs(
|
def summarize_paragraphs(
|
||||||
text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
|
text_nodes: Iterable[str], min_size: int = 200, max_size: int = 500
|
||||||
) -> Optional[str]:
|
) -> Optional[str]:
|
||||||
|
|
|
@ -22,7 +22,7 @@ import shutil
|
||||||
import sys
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
|
from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
|
||||||
from urllib import parse as urlparse
|
from urllib.parse import urljoin, urlparse, urlsplit
|
||||||
from urllib.request import urlopen
|
from urllib.request import urlopen
|
||||||
|
|
||||||
import attr
|
import attr
|
||||||
|
@ -44,11 +44,7 @@ from synapse.metrics.background_process_metrics import run_as_background_process
|
||||||
from synapse.rest.media.v1._base import get_filename_from_headers
|
from synapse.rest.media.v1._base import get_filename_from_headers
|
||||||
from synapse.rest.media.v1.media_storage import MediaStorage
|
from synapse.rest.media.v1.media_storage import MediaStorage
|
||||||
from synapse.rest.media.v1.oembed import OEmbedProvider
|
from synapse.rest.media.v1.oembed import OEmbedProvider
|
||||||
from synapse.rest.media.v1.preview_html import (
|
from synapse.rest.media.v1.preview_html import decode_body, parse_html_to_open_graph
|
||||||
decode_body,
|
|
||||||
parse_html_to_open_graph,
|
|
||||||
rebase_url,
|
|
||||||
)
|
|
||||||
from synapse.types import JsonDict, UserID
|
from synapse.types import JsonDict, UserID
|
||||||
from synapse.util import json_encoder
|
from synapse.util import json_encoder
|
||||||
from synapse.util.async_helpers import ObservableDeferred
|
from synapse.util.async_helpers import ObservableDeferred
|
||||||
|
@ -187,7 +183,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
ts = self.clock.time_msec()
|
ts = self.clock.time_msec()
|
||||||
|
|
||||||
# XXX: we could move this into _do_preview if we wanted.
|
# XXX: we could move this into _do_preview if we wanted.
|
||||||
url_tuple = urlparse.urlsplit(url)
|
url_tuple = urlsplit(url)
|
||||||
for entry in self.url_preview_url_blacklist:
|
for entry in self.url_preview_url_blacklist:
|
||||||
match = True
|
match = True
|
||||||
for attrib in entry:
|
for attrib in entry:
|
||||||
|
@ -322,7 +318,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
|
|
||||||
# Parse Open Graph information from the HTML in case the oEmbed
|
# Parse Open Graph information from the HTML in case the oEmbed
|
||||||
# response failed or is incomplete.
|
# response failed or is incomplete.
|
||||||
og_from_html = parse_html_to_open_graph(tree, media_info.uri)
|
og_from_html = parse_html_to_open_graph(tree)
|
||||||
|
|
||||||
# Compile the Open Graph response by using the scraped
|
# Compile the Open Graph response by using the scraped
|
||||||
# information from the HTML and overlaying any information
|
# information from the HTML and overlaying any information
|
||||||
|
@ -588,12 +584,17 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
if "og:image" not in og or not og["og:image"]:
|
if "og:image" not in og or not og["og:image"]:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# The image URL from the HTML might be relative to the previewed page,
|
||||||
|
# convert it to a URL which can be requested directly.
|
||||||
|
image_url = og["og:image"]
|
||||||
|
url_parts = urlparse(image_url)
|
||||||
|
if url_parts.scheme != "data":
|
||||||
|
image_url = urljoin(media_info.uri, image_url)
|
||||||
|
|
||||||
# FIXME: it might be cleaner to use the same flow as the main /preview_url
|
# FIXME: it might be cleaner to use the same flow as the main /preview_url
|
||||||
# request itself and benefit from the same caching etc. But for now we
|
# request itself and benefit from the same caching etc. But for now we
|
||||||
# just rely on the caching on the master request to speed things up.
|
# just rely on the caching on the master request to speed things up.
|
||||||
image_info = await self._handle_url(
|
image_info = await self._handle_url(image_url, user, allow_data_urls=True)
|
||||||
rebase_url(og["og:image"], media_info.uri), user, allow_data_urls=True
|
|
||||||
)
|
|
||||||
|
|
||||||
if _is_media(image_info.media_type):
|
if _is_media(image_info.media_type):
|
||||||
# TODO: make sure we don't choke on white-on-transparent images
|
# TODO: make sure we don't choke on white-on-transparent images
|
||||||
|
|
|
@ -16,7 +16,6 @@ from synapse.rest.media.v1.preview_html import (
|
||||||
_get_html_media_encodings,
|
_get_html_media_encodings,
|
||||||
decode_body,
|
decode_body,
|
||||||
parse_html_to_open_graph,
|
parse_html_to_open_graph,
|
||||||
rebase_url,
|
|
||||||
summarize_paragraphs,
|
summarize_paragraphs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -161,7 +160,7 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tree = decode_body(html, "http://example.com/test.html")
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
og = parse_html_to_open_graph(tree)
|
||||||
|
|
||||||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||||
|
|
||||||
|
@ -177,7 +176,7 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tree = decode_body(html, "http://example.com/test.html")
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
og = parse_html_to_open_graph(tree)
|
||||||
|
|
||||||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||||
|
|
||||||
|
@ -196,7 +195,7 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tree = decode_body(html, "http://example.com/test.html")
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
og = parse_html_to_open_graph(tree)
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
og,
|
og,
|
||||||
|
@ -218,7 +217,7 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tree = decode_body(html, "http://example.com/test.html")
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
og = parse_html_to_open_graph(tree)
|
||||||
|
|
||||||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||||
|
|
||||||
|
@ -232,7 +231,7 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tree = decode_body(html, "http://example.com/test.html")
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
og = parse_html_to_open_graph(tree)
|
||||||
|
|
||||||
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
||||||
|
|
||||||
|
@ -247,7 +246,7 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tree = decode_body(html, "http://example.com/test.html")
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
og = parse_html_to_open_graph(tree)
|
||||||
|
|
||||||
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
|
||||||
|
|
||||||
|
@ -262,7 +261,7 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tree = decode_body(html, "http://example.com/test.html")
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
og = parse_html_to_open_graph(tree)
|
||||||
|
|
||||||
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
||||||
|
|
||||||
|
@ -290,7 +289,7 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
<head><title>Foo</title></head><body>Some text.</body></html>
|
<head><title>Foo</title></head><body>Some text.</body></html>
|
||||||
""".strip()
|
""".strip()
|
||||||
tree = decode_body(html, "http://example.com/test.html")
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
og = parse_html_to_open_graph(tree)
|
||||||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||||
|
|
||||||
def test_invalid_encoding(self) -> None:
|
def test_invalid_encoding(self) -> None:
|
||||||
|
@ -304,7 +303,7 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
|
tree = decode_body(html, "http://example.com/test.html", "invalid-encoding")
|
||||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
og = parse_html_to_open_graph(tree)
|
||||||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||||
|
|
||||||
def test_invalid_encoding2(self) -> None:
|
def test_invalid_encoding2(self) -> None:
|
||||||
|
@ -319,7 +318,7 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
tree = decode_body(html, "http://example.com/test.html")
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
og = parse_html_to_open_graph(tree)
|
||||||
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
|
||||||
|
|
||||||
def test_windows_1252(self) -> None:
|
def test_windows_1252(self) -> None:
|
||||||
|
@ -333,7 +332,7 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
tree = decode_body(html, "http://example.com/test.html")
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
og = parse_html_to_open_graph(tree, "http://example.com/test.html")
|
og = parse_html_to_open_graph(tree)
|
||||||
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
|
||||||
|
|
||||||
|
|
||||||
|
@ -448,34 +447,3 @@ class MediaEncodingTestCase(unittest.TestCase):
|
||||||
'text/html; charset="invalid"',
|
'text/html; charset="invalid"',
|
||||||
)
|
)
|
||||||
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
|
self.assertEqual(list(encodings), ["utf-8", "cp1252"])
|
||||||
|
|
||||||
|
|
||||||
class RebaseUrlTestCase(unittest.TestCase):
|
|
||||||
def test_relative(self) -> None:
|
|
||||||
"""Relative URLs should be resolved based on the context of the base URL."""
|
|
||||||
self.assertEqual(
|
|
||||||
rebase_url("subpage", "https://example.com/foo/"),
|
|
||||||
"https://example.com/foo/subpage",
|
|
||||||
)
|
|
||||||
self.assertEqual(
|
|
||||||
rebase_url("sibling", "https://example.com/foo"),
|
|
||||||
"https://example.com/sibling",
|
|
||||||
)
|
|
||||||
self.assertEqual(
|
|
||||||
rebase_url("/bar", "https://example.com/foo/"),
|
|
||||||
"https://example.com/bar",
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_absolute(self) -> None:
|
|
||||||
"""Absolute URLs should not be modified."""
|
|
||||||
self.assertEqual(
|
|
||||||
rebase_url("https://alice.com/a/", "https://example.com/foo/"),
|
|
||||||
"https://alice.com/a/",
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_data(self) -> None:
|
|
||||||
"""Data URLs should not be modified."""
|
|
||||||
self.assertEqual(
|
|
||||||
rebase_url("data:,Hello%2C%20World%21", "https://example.com/foo/"),
|
|
||||||
"data:,Hello%2C%20World%21",
|
|
||||||
)
|
|
||||||
|
|
Loading…
Reference in a new issue