Improve URL previews for some pages (#12951)

* Skip `og` and `meta` tags where the value is empty. * Fallback to the favicon if there are no other images. * Ignore tags meant for navigation.
2022-06-03 12:09:12 -04:00 · 2022-06-03 12:09:12 -04:00 · 01df5bacac
commit 01df5bacac
parent 888a29f412
3 changed files with 72 additions and 18 deletions
--- a/changelog.d/12951.feature
+++ b/changelog.d/12951.feature
@ -0,0 +1 @@
 Improve URL previews for pages with empty elements.
--- a/synapse/rest/media/v1/preview_html.py
+++ b/synapse/rest/media/v1/preview_html.py
@ -30,6 +30,9 @@ _xml_encoding_match = re.compile(
 )
 _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
 # Certain elements aren't meant for display.
 ARIA_ROLES_TO_IGNORE = {"directory", "menu", "menubar", "toolbar"}
 def _normalise_encoding(encoding: str) -> Optional[str]:
    """Use the Python codec's name as the normalised entry."""
@ -174,12 +177,14 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
    # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3",
    og: Dict[str, Optional[str]] = {}
-    for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
+    for tag in tree.xpath(
-        if "content" in tag.attrib:
+        "//*/meta[starts-with(@property, 'og:')][@content][not(@content='')]"
    ):
        # if we've got more than 50 tags, someone is taking the piss
        if len(og) >= 50:
            logger.warning("Skipping OG for page with too many 'og:' tags")
            return {}
        og[tag.attrib["property"]] = tag.attrib["content"]
    # TODO: grab article: meta tags too, e.g.:
@ -192,21 +197,23 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
    # "article:modified_time" content="2016-04-01T18:31:53+00:00" />
    if "og:title" not in og:
-        # do some basic spidering of the HTML
+        # Attempt to find a title from the title tag, or the biggest header on the page.
-        title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]")
+        title = tree.xpath("((//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1])/text()")
-        if title and title[0].text is not None:
+        if title:
-            og["og:title"] = title[0].text.strip()
+            og["og:title"] = title[0].strip()
        else:
            og["og:title"] = None
    if "og:image" not in og:
        # TODO: extract a favicon failing all else
        meta_image = tree.xpath(
-            "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content"
+            "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image'][not(@content='')]/@content[1]"
        )
        # If a meta image is found, use it.
        if meta_image:
            og["og:image"] = meta_image[0]
        else:
            # Try to find images which are larger than 10px by 10px.
            #
            # TODO: consider inlined CSS styles as well as width & height attribs
            images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]")
            images = sorted(
@ -215,17 +222,24 @@ def parse_html_to_open_graph(tree: "etree.Element") -> Dict[str, Optional[str]]:
                    -1 * float(i.attrib["width"]) * float(i.attrib["height"])
                ),
            )
            # If no images were found, try to find *any* images.
            if not images:
-                images = tree.xpath("//img[@src]")
+                images = tree.xpath("//img[@src][1]")
            if images:
                og["og:image"] = images[0].attrib["src"]
            # Finally, fallback to the favicon if nothing else.
            else:
                favicons = tree.xpath("//link[@href][contains(@rel, 'icon')]/@href[1]")
                if favicons:
                    og["og:image"] = favicons[0]
    if "og:description" not in og:
        # Check the first meta description tag for content.
        meta_description = tree.xpath(
-            "//*/meta"
+            "//*/meta[translate(@name, 'DESCRIPTION', 'description')='description'][not(@content='')]/@content[1]"
            "[translate(@name, 'DESCRIPTION', 'description')='description']"
            "/@content"
        )
        # If a meta description is found with content, use it.
        if meta_description:
            og["og:description"] = meta_description[0]
        else:
@ -306,6 +320,10 @@ def _iterate_over_text(
        if isinstance(el, str):
            yield el
        elif el.tag not in tags_to_ignore:
            # If the element isn't meant for display, ignore it.
            if el.get("role") in ARIA_ROLES_TO_IGNORE:
                continue
            # el.text is the text before the first child, so we can immediately
            # return it if the text exists.
            if el.text:
--- a/tests/rest/media/v1/test_html_preview.py
+++ b/tests/rest/media/v1/test_html_preview.py
@ -145,7 +145,7 @@ class SummarizeTestCase(unittest.TestCase):
        )
-class CalcOgTestCase(unittest.TestCase):
+class OpenGraphFromHtmlTestCase(unittest.TestCase):
    if not lxml:
        skip = "url preview feature requires lxml"
@ -235,6 +235,21 @@ class CalcOgTestCase(unittest.TestCase):
        self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
        # Another variant is a title with no content.
        html = b"""
        <html>
        <head><title></title></head>
        <body>
        <h1>Title</h1>
        </body>
        </html>
        """
        tree = decode_body(html, "http://example.com/test.html")
        og = parse_html_to_open_graph(tree)
        self.assertEqual(og, {"og:title": "Title", "og:description": "Title"})
    def test_h1_as_title(self) -> None:
        html = b"""
        <html>
@ -250,6 +265,26 @@ class CalcOgTestCase(unittest.TestCase):
        self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
    def test_empty_description(self) -> None:
        """Description tags with empty content should be ignored."""
        html = b"""
        <html>
        <meta property="og:description" content=""/>
        <meta property="og:description"/>
        <meta name="description" content=""/>
        <meta name="description"/>
        <meta name="description" content="Finally!"/>
        <body>
        <h1>Title</h1>
        </body>
        </html>
        """
        tree = decode_body(html, "http://example.com/test.html")
        og = parse_html_to_open_graph(tree)
        self.assertEqual(og, {"og:title": "Title", "og:description": "Finally!"})
    def test_missing_title_and_broken_h1(self) -> None:
        html = b"""
        <html>
		`@ -0,0 +1 @@`
							`Improve URL previews for pages with empty elements.`