forked from MirrorHub/synapse
Handle additional errors when previewing URLs. (#9333)
* Handle the case of lxml not finding a document tree. * Parse the document encoding from the XML tag.
This commit is contained in:
parent
b0b2cac057
commit
0963d39ea6
3 changed files with 145 additions and 30 deletions
1
changelog.d/9333.bugfix
Normal file
1
changelog.d/9333.bugfix
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Fix additional errors when previewing URLs: "AttributeError 'NoneType' object has no attribute 'xpath'" and "ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.".
|
|
@ -58,7 +58,10 @@ if TYPE_CHECKING:
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
|
_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
|
||||||
|
_xml_encoding_match = re.compile(
|
||||||
|
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
|
||||||
|
)
|
||||||
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
|
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
|
||||||
|
|
||||||
OG_TAG_NAME_MAXLEN = 50
|
OG_TAG_NAME_MAXLEN = 50
|
||||||
|
@ -300,24 +303,7 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
with open(media_info["filename"], "rb") as file:
|
with open(media_info["filename"], "rb") as file:
|
||||||
body = file.read()
|
body = file.read()
|
||||||
|
|
||||||
encoding = None
|
encoding = get_html_media_encoding(body, media_info["media_type"])
|
||||||
|
|
||||||
# Let's try and figure out if it has an encoding set in a meta tag.
|
|
||||||
# Limit it to the first 1kb, since it ought to be in the meta tags
|
|
||||||
# at the top.
|
|
||||||
match = _charset_match.search(body[:1000])
|
|
||||||
|
|
||||||
# If we find a match, it should take precedence over the
|
|
||||||
# Content-Type header, so set it here.
|
|
||||||
if match:
|
|
||||||
encoding = match.group(1).decode("ascii")
|
|
||||||
|
|
||||||
# If we don't find a match, we'll look at the HTTP Content-Type, and
|
|
||||||
# if that doesn't exist, we'll fall back to UTF-8.
|
|
||||||
if not encoding:
|
|
||||||
content_match = _content_type_match.match(media_info["media_type"])
|
|
||||||
encoding = content_match.group(1) if content_match else "utf-8"
|
|
||||||
|
|
||||||
og = decode_and_calc_og(body, media_info["uri"], encoding)
|
og = decode_and_calc_og(body, media_info["uri"], encoding)
|
||||||
|
|
||||||
# pre-cache the image for posterity
|
# pre-cache the image for posterity
|
||||||
|
@ -689,6 +675,48 @@ class PreviewUrlResource(DirectServeJsonResource):
|
||||||
logger.debug("No media removed from url cache")
|
logger.debug("No media removed from url cache")
|
||||||
|
|
||||||
|
|
||||||
|
def get_html_media_encoding(body: bytes, content_type: str) -> str:
|
||||||
|
"""
|
||||||
|
Get the encoding of the body based on the (presumably) HTML body or media_type.
|
||||||
|
|
||||||
|
The precedence used for finding a character encoding is:
|
||||||
|
|
||||||
|
1. meta tag with a charset declared.
|
||||||
|
2. The XML document's character encoding attribute.
|
||||||
|
3. The Content-Type header.
|
||||||
|
4. Fallback to UTF-8.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
body: The HTML document, as bytes.
|
||||||
|
content_type: The Content-Type header.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The character encoding of the body, as a string.
|
||||||
|
"""
|
||||||
|
# Limit searches to the first 1kb, since it ought to be at the top.
|
||||||
|
body_start = body[:1024]
|
||||||
|
|
||||||
|
# Let's try and figure out if it has an encoding set in a meta tag.
|
||||||
|
match = _charset_match.search(body_start)
|
||||||
|
if match:
|
||||||
|
return match.group(1).decode("ascii")
|
||||||
|
|
||||||
|
# TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
|
||||||
|
|
||||||
|
# If we didn't find a match, see if it an XML document with an encoding.
|
||||||
|
match = _xml_encoding_match.match(body_start)
|
||||||
|
if match:
|
||||||
|
return match.group(1).decode("ascii")
|
||||||
|
|
||||||
|
# If we don't find a match, we'll look at the HTTP Content-Type, and
|
||||||
|
# if that doesn't exist, we'll fall back to UTF-8.
|
||||||
|
content_match = _content_type_match.match(content_type)
|
||||||
|
if content_match:
|
||||||
|
return content_match.group(1)
|
||||||
|
|
||||||
|
return "utf-8"
|
||||||
|
|
||||||
|
|
||||||
def decode_and_calc_og(
|
def decode_and_calc_og(
|
||||||
body: bytes, media_uri: str, request_encoding: Optional[str] = None
|
body: bytes, media_uri: str, request_encoding: Optional[str] = None
|
||||||
) -> Dict[str, Optional[str]]:
|
) -> Dict[str, Optional[str]]:
|
||||||
|
@ -725,6 +753,11 @@ def decode_and_calc_og(
|
||||||
def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
|
def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
|
||||||
# Attempt to parse the body. If this fails, log and return no metadata.
|
# Attempt to parse the body. If this fails, log and return no metadata.
|
||||||
tree = etree.fromstring(body_attempt, parser)
|
tree = etree.fromstring(body_attempt, parser)
|
||||||
|
|
||||||
|
# The data was successfully parsed, but no tree was found.
|
||||||
|
if tree is None:
|
||||||
|
return {}
|
||||||
|
|
||||||
return _calc_og(tree, media_uri)
|
return _calc_og(tree, media_uri)
|
||||||
|
|
||||||
# Attempt to parse the body. If this fails, log and return no metadata.
|
# Attempt to parse the body. If this fails, log and return no metadata.
|
||||||
|
|
|
@ -15,6 +15,7 @@
|
||||||
|
|
||||||
from synapse.rest.media.v1.preview_url_resource import (
|
from synapse.rest.media.v1.preview_url_resource import (
|
||||||
decode_and_calc_og,
|
decode_and_calc_og,
|
||||||
|
get_html_media_encoding,
|
||||||
summarize_paragraphs,
|
summarize_paragraphs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -26,7 +27,7 @@ except ImportError:
|
||||||
lxml = None
|
lxml = None
|
||||||
|
|
||||||
|
|
||||||
class PreviewTestCase(unittest.TestCase):
|
class SummarizeTestCase(unittest.TestCase):
|
||||||
if not lxml:
|
if not lxml:
|
||||||
skip = "url preview feature requires lxml"
|
skip = "url preview feature requires lxml"
|
||||||
|
|
||||||
|
@ -144,12 +145,12 @@ class PreviewTestCase(unittest.TestCase):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class PreviewUrlTestCase(unittest.TestCase):
|
class CalcOgTestCase(unittest.TestCase):
|
||||||
if not lxml:
|
if not lxml:
|
||||||
skip = "url preview feature requires lxml"
|
skip = "url preview feature requires lxml"
|
||||||
|
|
||||||
def test_simple(self):
|
def test_simple(self):
|
||||||
html = """
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
<head><title>Foo</title></head>
|
<head><title>Foo</title></head>
|
||||||
<body>
|
<body>
|
||||||
|
@ -163,7 +164,7 @@ class PreviewUrlTestCase(unittest.TestCase):
|
||||||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||||
|
|
||||||
def test_comment(self):
|
def test_comment(self):
|
||||||
html = """
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
<head><title>Foo</title></head>
|
<head><title>Foo</title></head>
|
||||||
<body>
|
<body>
|
||||||
|
@ -178,7 +179,7 @@ class PreviewUrlTestCase(unittest.TestCase):
|
||||||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||||
|
|
||||||
def test_comment2(self):
|
def test_comment2(self):
|
||||||
html = """
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
<head><title>Foo</title></head>
|
<head><title>Foo</title></head>
|
||||||
<body>
|
<body>
|
||||||
|
@ -202,7 +203,7 @@ class PreviewUrlTestCase(unittest.TestCase):
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_script(self):
|
def test_script(self):
|
||||||
html = """
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
<head><title>Foo</title></head>
|
<head><title>Foo</title></head>
|
||||||
<body>
|
<body>
|
||||||
|
@ -217,7 +218,7 @@ class PreviewUrlTestCase(unittest.TestCase):
|
||||||
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||||
|
|
||||||
def test_missing_title(self):
|
def test_missing_title(self):
|
||||||
html = """
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
<body>
|
<body>
|
||||||
Some text.
|
Some text.
|
||||||
|
@ -230,7 +231,7 @@ class PreviewUrlTestCase(unittest.TestCase):
|
||||||
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
||||||
|
|
||||||
def test_h1_as_title(self):
|
def test_h1_as_title(self):
|
||||||
html = """
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
<meta property="og:description" content="Some text."/>
|
<meta property="og:description" content="Some text."/>
|
||||||
<body>
|
<body>
|
||||||
|
@ -244,7 +245,7 @@ class PreviewUrlTestCase(unittest.TestCase):
|
||||||
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
|
||||||
|
|
||||||
def test_missing_title_and_broken_h1(self):
|
def test_missing_title_and_broken_h1(self):
|
||||||
html = """
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
<body>
|
<body>
|
||||||
<h1><a href="foo"/></h1>
|
<h1><a href="foo"/></h1>
|
||||||
|
@ -258,13 +259,20 @@ class PreviewUrlTestCase(unittest.TestCase):
|
||||||
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
|
||||||
|
|
||||||
def test_empty(self):
|
def test_empty(self):
|
||||||
html = ""
|
"""Test a body with no data in it."""
|
||||||
|
html = b""
|
||||||
|
og = decode_and_calc_og(html, "http://example.com/test.html")
|
||||||
|
self.assertEqual(og, {})
|
||||||
|
|
||||||
|
def test_no_tree(self):
|
||||||
|
"""A valid body with no tree in it."""
|
||||||
|
html = b"\x00"
|
||||||
og = decode_and_calc_og(html, "http://example.com/test.html")
|
og = decode_and_calc_og(html, "http://example.com/test.html")
|
||||||
self.assertEqual(og, {})
|
self.assertEqual(og, {})
|
||||||
|
|
||||||
def test_invalid_encoding(self):
|
def test_invalid_encoding(self):
|
||||||
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
|
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
|
||||||
html = """
|
html = b"""
|
||||||
<html>
|
<html>
|
||||||
<head><title>Foo</title></head>
|
<head><title>Foo</title></head>
|
||||||
<body>
|
<body>
|
||||||
|
@ -290,3 +298,76 @@ class PreviewUrlTestCase(unittest.TestCase):
|
||||||
"""
|
"""
|
||||||
og = decode_and_calc_og(html, "http://example.com/test.html")
|
og = decode_and_calc_og(html, "http://example.com/test.html")
|
||||||
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
|
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
|
||||||
|
|
||||||
|
|
||||||
|
class MediaEncodingTestCase(unittest.TestCase):
|
||||||
|
def test_meta_charset(self):
|
||||||
|
"""A character encoding is found via the meta tag."""
|
||||||
|
encoding = get_html_media_encoding(
|
||||||
|
b"""
|
||||||
|
<html>
|
||||||
|
<head><meta charset="ascii">
|
||||||
|
</head>
|
||||||
|
</html>
|
||||||
|
""",
|
||||||
|
"text/html",
|
||||||
|
)
|
||||||
|
self.assertEqual(encoding, "ascii")
|
||||||
|
|
||||||
|
# A less well-formed version.
|
||||||
|
encoding = get_html_media_encoding(
|
||||||
|
b"""
|
||||||
|
<html>
|
||||||
|
<head>< meta charset = ascii>
|
||||||
|
</head>
|
||||||
|
</html>
|
||||||
|
""",
|
||||||
|
"text/html",
|
||||||
|
)
|
||||||
|
self.assertEqual(encoding, "ascii")
|
||||||
|
|
||||||
|
def test_xml_encoding(self):
|
||||||
|
"""A character encoding is found via the meta tag."""
|
||||||
|
encoding = get_html_media_encoding(
|
||||||
|
b"""
|
||||||
|
<?xml version="1.0" encoding="ascii"?>
|
||||||
|
<html>
|
||||||
|
</html>
|
||||||
|
""",
|
||||||
|
"text/html",
|
||||||
|
)
|
||||||
|
self.assertEqual(encoding, "ascii")
|
||||||
|
|
||||||
|
def test_meta_xml_encoding(self):
|
||||||
|
"""Meta tags take precedence over XML encoding."""
|
||||||
|
encoding = get_html_media_encoding(
|
||||||
|
b"""
|
||||||
|
<?xml version="1.0" encoding="ascii"?>
|
||||||
|
<html>
|
||||||
|
<head><meta charset="UTF-16">
|
||||||
|
</head>
|
||||||
|
</html>
|
||||||
|
""",
|
||||||
|
"text/html",
|
||||||
|
)
|
||||||
|
self.assertEqual(encoding, "UTF-16")
|
||||||
|
|
||||||
|
def test_content_type(self):
|
||||||
|
"""A character encoding is found via the Content-Type header."""
|
||||||
|
# Test a few variations of the header.
|
||||||
|
headers = (
|
||||||
|
'text/html; charset="ascii";',
|
||||||
|
"text/html;charset=ascii;",
|
||||||
|
'text/html; charset="ascii"',
|
||||||
|
"text/html; charset=ascii",
|
||||||
|
'text/html; charset="ascii;',
|
||||||
|
'text/html; charset=ascii";',
|
||||||
|
)
|
||||||
|
for header in headers:
|
||||||
|
encoding = get_html_media_encoding(b"", header)
|
||||||
|
self.assertEqual(encoding, "ascii")
|
||||||
|
|
||||||
|
def test_fallback(self):
|
||||||
|
"""A character encoding cannot be found in the body or header."""
|
||||||
|
encoding = get_html_media_encoding(b"", "text/html")
|
||||||
|
self.assertEqual(encoding, "utf-8")
|
||||||
|
|
Loading…
Reference in a new issue