Include more information in oEmbed previews. (#10819)

* Improve titles (fall back to the author name if there's no title) and include the site name.
* Handle photo/video payloads.
* Include the original URL in the Open Graph response.
* Fix the expiration time (by properly converting from seconds to milliseconds).
2021-09-22 09:45:20 -04:00 · 2021-09-22 09:45:20 -04:00 · 6fc8be9a1b
parent 9391de3f37
commit 6fc8be9a1b
4 changed files with 68 additions and 14 deletions
--- a/changelog.d/10819.feature
+++ b/changelog.d/10819.feature
@ -0,0 +1 @@
+Improve oEmbed previews by processing the author name, photo, and video information.
--- a/synapse/rest/media/v1/oembed.py
+++ b/synapse/rest/media/v1/oembed.py
@ -13,7 +13,7 @@
 #  limitations under the License.
 import logging
 import urllib.parse
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING, List, Optional

 import attr

@ -22,6 +22,8 @@ from synapse.types import JsonDict
 from synapse.util import json_decoder

 if TYPE_CHECKING:
+    from lxml import etree
+
    from synapse.server import HomeServer

 logger = logging.getLogger(__name__)
@ -31,7 +33,7 @@ logger = logging.getLogger(__name__)
 class OEmbedResult:
    # The Open Graph result (converted from the oEmbed result).
    open_graph_result: JsonDict
-    # Number of seconds to cache the content, according to the oEmbed response.
+    # Number of milliseconds to cache the content, according to the oEmbed response.
    #
    # This will be None if no cache-age is provided in the oEmbed response (or
    # if the oEmbed response cannot be turned into an Open Graph response).
@ -119,10 +121,22 @@ class OEmbedProvider:
            # Ensure the cache age is None or an int.
            cache_age = oembed.get("cache_age")
            if cache_age:
-                cache_age = int(cache_age)
+                cache_age = int(cache_age) * 1000

            # The results.
-            open_graph_response = {"og:title": oembed.get("title")}
+            open_graph_response = {
+                "og:url": url,
+            }
+
+            # Use either title or author's name as the title.
+            title = oembed.get("title") or oembed.get("author_name")
+            if title:
+                open_graph_response["og:title"] = title
+
+            # Use the provider name as the site.
+            provider_name = oembed.get("provider_name")
+            if provider_name:
+                open_graph_response["og:site_name"] = provider_name

            # If a thumbnail exists, use it. Note that dimensions will be calculated later.
            if "thumbnail_url" in oembed:
@ -137,6 +151,15 @@ class OEmbedProvider:
                # If this is a photo, use the full image, not the thumbnail.
                open_graph_response["og:image"] = oembed["url"]

+            elif oembed_type == "video":
+                open_graph_response["og:type"] = "video.other"
+                calc_description_and_urls(open_graph_response, oembed["html"])
+                open_graph_response["og:video:width"] = oembed["width"]
+                open_graph_response["og:video:height"] = oembed["height"]
+
+            elif oembed_type == "link":
+                open_graph_response["og:type"] = "website"
+
            else:
                raise RuntimeError(f"Unknown oEmbed type: {oembed_type}")

@ -149,6 +172,14 @@ class OEmbedProvider:
        return OEmbedResult(open_graph_response, cache_age)


+def _fetch_urls(tree: "etree.Element", tag_name: str) -> List[str]:
+    results = []
+    for tag in tree.xpath("//*/" + tag_name):
+        if "src" in tag.attrib:
+            results.append(tag.attrib["src"])
+    return results
+
+
 def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) -> None:
    """
    Calculate description for an HTML document.
@ -179,6 +210,16 @@ def calc_description_and_urls(open_graph_response: JsonDict, html_body: str) ->
    if tree is None:
        return

+    # Attempt to find interesting URLs (images, videos, embeds).
+    if "og:image" not in open_graph_response:
+        image_urls = _fetch_urls(tree, "img")
+        if image_urls:
+            open_graph_response["og:image"] = image_urls[0]
+
+    video_urls = _fetch_urls(tree, "video") + _fetch_urls(tree, "embed")
+    if video_urls:
+        open_graph_response["og:video"] = video_urls[0]
+
    from synapse.rest.media.v1.preview_url_resource import _calc_description

    description = _calc_description(tree)
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@ -305,7 +305,7 @@ class PreviewUrlResource(DirectServeJsonResource):
            with open(media_info.filename, "rb") as file:
                body = file.read()

-            oembed_response = self._oembed.parse_oembed_response(media_info.uri, body)
+            oembed_response = self._oembed.parse_oembed_response(url, body)
            og = oembed_response.open_graph_result

            # Use the cache age from the oEmbed result, instead of the HTTP response.
--- a/tests/rest/media/v1/test_url_preview.py
+++ b/tests/rest/media/v1/test_url_preview.py
@ -620,11 +620,12 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        self.assertIn(b"/matrixdotorg", server.data)

        self.assertEqual(channel.code, 200)
-        self.assertIsNone(channel.json_body["og:title"])
-        self.assertTrue(channel.json_body["og:image"].startswith("mxc://"))
-        self.assertEqual(channel.json_body["og:image:height"], 1)
-        self.assertEqual(channel.json_body["og:image:width"], 1)
-        self.assertEqual(channel.json_body["og:image:type"], "image/png")
+        body = channel.json_body
+        self.assertEqual(body["og:url"], "http://twitter.com/matrixdotorg/status/12345")
+        self.assertTrue(body["og:image"].startswith("mxc://"))
+        self.assertEqual(body["og:image:height"], 1)
+        self.assertEqual(body["og:image:width"], 1)
+        self.assertEqual(body["og:image:type"], "image/png")

    def test_oembed_rich(self):
        """Test an oEmbed endpoint which returns HTML content via the 'rich' type."""
@ -633,6 +634,8 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        result = {
            "version": "1.0",
            "type": "rich",
+            # Note that this provides the author, not the title.
+            "author_name": "Alice",
            "html": "<div>Content Preview</div>",
        }
        end_content = json.dumps(result).encode("utf-8")
@ -660,9 +663,14 @@ class URLPreviewTests(unittest.HomeserverTestCase):

        self.pump()
        self.assertEqual(channel.code, 200)
+        body = channel.json_body
        self.assertEqual(
-            channel.json_body,
-            {"og:title": None, "og:description": "Content Preview"},
+            body,
+            {
+                "og:url": "http://twitter.com/matrixdotorg/status/12345",
+                "og:title": "Alice",
+                "og:description": "Content Preview",
+            },
        )

    def test_oembed_format(self):
@ -705,7 +713,11 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        self.assertIn(b"format=json", server.data)

        self.assertEqual(channel.code, 200)
+        body = channel.json_body
        self.assertEqual(
-            channel.json_body,
-            {"og:title": None, "og:description": "Content Preview"},
+            body,
+            {
+                "og:url": "http://www.hulu.com/watch/12345",
+                "og:description": "Content Preview",
+            },
        )
				`@ -0,0 +1 @@`
				`Improve oEmbed previews by processing the author name, photo, and video information.`