Mirror of https://mau.dev/maunium/synapse.git (synced 2024-11-13 21:41:30 +01:00)

Handle additional errors when previewing URLs. (#9333)

* Handle the case of lxml not finding a document tree.
* Parse the document encoding from the XML tag.

parent b0b2cac057
commit 0963d39ea6

3 changed files with 145 additions and 30 deletions
changelog.d/9333.bugfix (new file)

@@ -0,0 +1 @@
+Fix additional errors when previewing URLs: "AttributeError 'NoneType' object has no attribute 'xpath'" and "ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.".
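For context, the second error quoted above is raised by lxml itself whenever a str document that still carries an XML encoding declaration is handed to etree.fromstring; bytes input does not trigger it. A minimal standalone sketch of that behaviour (assumes lxml is installed; this snippet is illustration only, not part of the diff):

    from lxml import etree

    parser = etree.HTMLParser(recover=True, encoding="utf-8")
    document = '<?xml version="1.0" encoding="utf-8"?><html><body>hi</body></html>'

    try:
        # str input containing an encoding declaration is rejected by lxml.
        etree.fromstring(document, parser)
    except ValueError as e:
        print(e)  # "Unicode strings with encoding declaration are not supported. ..."

    # The same document passed as bytes parses fine.
    tree = etree.fromstring(document.encode("utf-8"), parser)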
synapse/rest/media/v1/preview_url_resource.py

@@ -58,7 +58,10 @@ if TYPE_CHECKING:

 logger = logging.getLogger(__name__)

-_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
+_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
+_xml_encoding_match = re.compile(
+    br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
+)
 _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)

 OG_TAG_NAME_MAXLEN = 50
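The loosened _charset_match (a quoted charset value is now accepted as well as a bare one) and the new _xml_encoding_match can be checked in isolation; a small standalone sketch using the patterns exactly as defined in the hunk above:

    import re

    _charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
    _xml_encoding_match = re.compile(
        br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
    )

    # Quoted and unquoted meta charsets both match now.
    assert _charset_match.search(b'<meta charset="utf-8">').group(1) == b"utf-8"
    assert _charset_match.search(b"<meta charset=utf-8>").group(1) == b"utf-8"

    # The XML pattern picks the encoding out of an XML declaration.
    xml_match = _xml_encoding_match.match(b'<?xml version="1.0" encoding="ascii"?>')
    assert xml_match.group(1) == b"ascii"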
@@ -300,24 +303,7 @@ class PreviewUrlResource(DirectServeJsonResource):
             with open(media_info["filename"], "rb") as file:
                 body = file.read()

-            encoding = None
-
-            # Let's try and figure out if it has an encoding set in a meta tag.
-            # Limit it to the first 1kb, since it ought to be in the meta tags
-            # at the top.
-            match = _charset_match.search(body[:1000])
-
-            # If we find a match, it should take precedence over the
-            # Content-Type header, so set it here.
-            if match:
-                encoding = match.group(1).decode("ascii")
-
-            # If we don't find a match, we'll look at the HTTP Content-Type, and
-            # if that doesn't exist, we'll fall back to UTF-8.
-            if not encoding:
-                content_match = _content_type_match.match(media_info["media_type"])
-                encoding = content_match.group(1) if content_match else "utf-8"
-
+            encoding = get_html_media_encoding(body, media_info["media_type"])
             og = decode_and_calc_og(body, media_info["uri"], encoding)

             # pre-cache the image for posterity
@@ -689,6 +675,48 @@ class PreviewUrlResource(DirectServeJsonResource):
             logger.debug("No media removed from url cache")


+def get_html_media_encoding(body: bytes, content_type: str) -> str:
+    """
+    Get the encoding of the body based on the (presumably) HTML body or media_type.
+
+    The precedence used for finding a character encoding is:
+
+    1. meta tag with a charset declared.
+    2. The XML document's character encoding attribute.
+    3. The Content-Type header.
+    4. Fallback to UTF-8.
+
+    Args:
+        body: The HTML document, as bytes.
+        content_type: The Content-Type header.
+
+    Returns:
+        The character encoding of the body, as a string.
+    """
+    # Limit searches to the first 1kb, since it ought to be at the top.
+    body_start = body[:1024]
+
+    # Let's try and figure out if it has an encoding set in a meta tag.
+    match = _charset_match.search(body_start)
+    if match:
+        return match.group(1).decode("ascii")
+
+    # TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+
+    # If we didn't find a match, see if it an XML document with an encoding.
+    match = _xml_encoding_match.match(body_start)
+    if match:
+        return match.group(1).decode("ascii")
+
+    # If we don't find a match, we'll look at the HTTP Content-Type, and
+    # if that doesn't exist, we'll fall back to UTF-8.
+    content_match = _content_type_match.match(content_type)
+    if content_match:
+        return content_match.group(1)
+
+    return "utf-8"
+
+
 def decode_and_calc_og(
     body: bytes, media_uri: str, request_encoding: Optional[str] = None
 ) -> Dict[str, Optional[str]]:
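Given the function above, the documented precedence can be seen from a few direct calls; a minimal sketch (assumes the module is importable, and mirrors the new MediaEncodingTestCase added further down):

    from synapse.rest.media.v1.preview_url_resource import get_html_media_encoding

    # 1. A meta charset wins over everything else.
    assert get_html_media_encoding(b'<meta charset="ascii">', "text/html") == "ascii"

    # 2. Otherwise, the encoding attribute of an XML declaration.
    body = b'<?xml version="1.0" encoding="ascii"?><html></html>'
    assert get_html_media_encoding(body, "text/html") == "ascii"

    # 3. Otherwise, the charset parameter of the Content-Type header.
    assert get_html_media_encoding(b"", 'text/html; charset="ascii"') == "ascii"

    # 4. Finally, fall back to UTF-8.
    assert get_html_media_encoding(b"", "text/html") == "utf-8"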
@@ -725,6 +753,11 @@ def decode_and_calc_og(
     def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
         # Attempt to parse the body. If this fails, log and return no metadata.
         tree = etree.fromstring(body_attempt, parser)
+
+        # The data was successfully parsed, but no tree was found.
+        if tree is None:
+            return {}
+
         return _calc_og(tree, media_uri)

     # Attempt to parse the body. If this fails, log and return no metadata.
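The tree-is-None guard covers the first error quoted in the changelog: lxml's recovering HTML parser can "succeed" while producing no document at all, and the subsequent xpath calls in _calc_og then failed with AttributeError. A standalone sketch of that behaviour (assumes lxml is installed; the parser options only approximate what decode_and_calc_og builds):

    from lxml import etree

    # A recovering HTML parser, roughly as decode_and_calc_og constructs one.
    parser = etree.HTMLParser(recover=True, encoding="utf-8")

    # A body such as a lone NUL byte (the input used by the new test_no_tree case)
    # parses without raising but yields no document tree.
    tree = etree.fromstring(b"\x00", parser)
    print(tree)  # None -- previously led to "'NoneType' object has no attribute 'xpath'"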
tests/test_preview.py

@@ -15,6 +15,7 @@
 from synapse.rest.media.v1.preview_url_resource import (
     decode_and_calc_og,
+    get_html_media_encoding,
     summarize_paragraphs,
 )
@@ -26,7 +27,7 @@ except ImportError:
     lxml = None


-class PreviewTestCase(unittest.TestCase):
+class SummarizeTestCase(unittest.TestCase):
     if not lxml:
         skip = "url preview feature requires lxml"
@@ -144,12 +145,12 @@ class PreviewTestCase(unittest.TestCase):
         )


-class PreviewUrlTestCase(unittest.TestCase):
+class CalcOgTestCase(unittest.TestCase):
     if not lxml:
         skip = "url preview feature requires lxml"

     def test_simple(self):
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -163,7 +164,7 @@ class PreviewUrlTestCase(unittest.TestCase):
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

     def test_comment(self):
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -178,7 +179,7 @@ class PreviewUrlTestCase(unittest.TestCase):
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

     def test_comment2(self):
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -202,7 +203,7 @@ class PreviewUrlTestCase(unittest.TestCase):
         )

     def test_script(self):
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -217,7 +218,7 @@ class PreviewUrlTestCase(unittest.TestCase):
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})

     def test_missing_title(self):
-        html = """
+        html = b"""
         <html>
         <body>
         Some text.
@@ -230,7 +231,7 @@ class PreviewUrlTestCase(unittest.TestCase):
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})

     def test_h1_as_title(self):
-        html = """
+        html = b"""
         <html>
         <meta property="og:description" content="Some text."/>
         <body>
@@ -244,7 +245,7 @@ class PreviewUrlTestCase(unittest.TestCase):
         self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})

     def test_missing_title_and_broken_h1(self):
-        html = """
+        html = b"""
         <html>
         <body>
         <h1><a href="foo"/></h1>
@@ -258,13 +259,20 @@ class PreviewUrlTestCase(unittest.TestCase):
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})

     def test_empty(self):
-        html = ""
+        """Test a body with no data in it."""
+        html = b""
+        og = decode_and_calc_og(html, "http://example.com/test.html")
+        self.assertEqual(og, {})
+
+    def test_no_tree(self):
+        """A valid body with no tree in it."""
+        html = b"\x00"
         og = decode_and_calc_og(html, "http://example.com/test.html")
         self.assertEqual(og, {})

     def test_invalid_encoding(self):
         """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
-        html = """
+        html = b"""
         <html>
         <head><title>Foo</title></head>
         <body>
@@ -290,3 +298,76 @@ class PreviewUrlTestCase(unittest.TestCase):
         """
         og = decode_and_calc_og(html, "http://example.com/test.html")
         self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})
+
+
+class MediaEncodingTestCase(unittest.TestCase):
+    def test_meta_charset(self):
+        """A character encoding is found via the meta tag."""
+        encoding = get_html_media_encoding(
+            b"""
+        <html>
+        <head><meta charset="ascii">
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "ascii")
+
+        # A less well-formed version.
+        encoding = get_html_media_encoding(
+            b"""
+        <html>
+        <head>< meta charset = ascii>
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "ascii")
+
+    def test_xml_encoding(self):
+        """A character encoding is found via the meta tag."""
+        encoding = get_html_media_encoding(
+            b"""
+        <?xml version="1.0" encoding="ascii"?>
+        <html>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "ascii")
+
+    def test_meta_xml_encoding(self):
+        """Meta tags take precedence over XML encoding."""
+        encoding = get_html_media_encoding(
+            b"""
+        <?xml version="1.0" encoding="ascii"?>
+        <html>
+        <head><meta charset="UTF-16">
+        </head>
+        </html>
+        """,
+            "text/html",
+        )
+        self.assertEqual(encoding, "UTF-16")
+
+    def test_content_type(self):
+        """A character encoding is found via the Content-Type header."""
+        # Test a few variations of the header.
+        headers = (
+            'text/html; charset="ascii";',
+            "text/html;charset=ascii;",
+            'text/html; charset="ascii"',
+            "text/html; charset=ascii",
+            'text/html; charset="ascii;',
+            'text/html; charset=ascii";',
+        )
+        for header in headers:
+            encoding = get_html_media_encoding(b"", header)
+            self.assertEqual(encoding, "ascii")
+
+    def test_fallback(self):
+        """A character encoding cannot be found in the body or header."""
+        encoding = get_html_media_encoding(b"", "text/html")
+        self.assertEqual(encoding, "utf-8")