Fix URL preview errors when previewing XML documents. (#11196)

2021-10-27 10:48:02 -04:00 · 2021-10-27 10:48:02 -04:00 · b3e843be88
parent e0ef8fe58d
commit b3e843be88
3 changed files with 22 additions and 3 deletions
--- a/changelog.d/11196.bugfix
+++ b/changelog.d/11196.bugfix
@@ -0,0 +1 @@
+Fix a bug introduced in v1.46.0rc1 where URL previews of some XML documents would fail.
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -718,9 +718,12 @@ def decode_body(
    if not body:
        return None

+    # The idea here is that multiple encodings are tried until one works.
+    # Unfortunately the result is never used and then LXML will decode the string
+    # again with the found encoding.
    for encoding in get_html_media_encodings(body, content_type):
        try:
-            body_str = body.decode(encoding)
+            body.decode(encoding)
        except Exception:
            pass
        else:
@@ -732,11 +735,11 @@ def decode_body(
    from lxml import etree

    # Create an HTML parser.
-    parser = etree.HTMLParser(recover=True, encoding="utf-8")
+    parser = etree.HTMLParser(recover=True, encoding=encoding)

    # Attempt to parse the body. Returns None if the body was successfully
    # parsed, but no tree was found.
-    return etree.fromstring(body_str, parser)
+    return etree.fromstring(body, parser)


 def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
--- a/tests/test_preview.py
+++ b/tests/test_preview.py
@@ -277,6 +277,21 @@ class CalcOgTestCase(unittest.TestCase):
        tree = decode_body(html, "http://example.com/test.html")
        self.assertIsNone(tree)

+    def test_xml(self):
+        """Test decoding XML and ensure it works properly."""
+        # Note that the strip() call is important to ensure the xml tag starts
+        # at the initial byte.
+        html = b"""
+        <?xml version="1.0" encoding="UTF-8"?>
+
+        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+        <head><title>Foo</title></head><body>Some text.</body></html>
+        """.strip()
+        tree = decode_body(html, "http://example.com/test.html")
+        og = _calc_og(tree, "http://example.com/test.html")
+        self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
+
    def test_invalid_encoding(self):
        """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
        html = b"""
				`@ -0,0 +1 @@`
				`Fix a bug introduced in v1.46.0rc1 where URL previews of some XML documents would fail.`