From 7dd0c1730a1ea5962a77b9bbb883c1690b25b686 Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Sun, 24 Jan 2016 18:47:27 -0500 Subject: [PATCH] initial WIP of a tentative preview_url endpoint - incomplete, untested, experimental, etc. just putting it here for safekeeping for now --- docs/url_previews.rst | 74 ++++++++ synapse/config/repository.py | 6 +- synapse/http/client.py | 81 +++++++++ synapse/rest/media/v1/media_repository.py | 3 + synapse/rest/media/v1/preview_url_resource.py | 164 ++++++++++++++++++ 5 files changed, 327 insertions(+), 1 deletion(-) create mode 100644 docs/url_previews.rst create mode 100644 synapse/rest/media/v1/preview_url_resource.py diff --git a/docs/url_previews.rst b/docs/url_previews.rst new file mode 100644 index 000000000..1dc6ee0c4 --- /dev/null +++ b/docs/url_previews.rst @@ -0,0 +1,74 @@ +URL Previews +============ + +Design notes on a URL previewing service for Matrix: + +Options are: + + 1. Have an AS which listens for URLs, downloads them, and inserts an event that describes their metadata. + * Pros: + * Decouples the implementation entirely from Synapse. + * Uses existing Matrix events & content repo to store the metadata. + * Cons: + * Which AS should provide this service for a room, and why should you trust it? + * Doesn't work well with E2E; you'd have to cut the AS into every room + * the AS would end up subscribing to every room anyway. + + 2. Have a generic preview API (nothing to do with Matrix) that provides a previewing service: + * Pros: + * Simple and flexible; can be used by any clients at any point + * Cons: + * If each HS provides one of these independently, all the HSes in a room may needlessly DoS the target URI + * We need somewhere to store the URL metadata rather than just using Matrix itself + * We can't piggyback on matrix to distribute the metadata between HSes. + + 3. Make the synapse of the sending user responsible for spidering the URL and inserting an event asynchronously which describes the metadata. + * Pros: + * Works transparently for all clients + * Piggy-backs nicely on using Matrix for distributing the metadata. + * No confusion as to which AS + * Cons: + * Doesn't work with E2E + * We might want to decouple the implementation of the spider from the HS, given spider behaviour can be quite complicated and evolve much more rapidly than the HS. It's more like a bot than a core part of the server. + + 4. Make the sending client use the preview API and insert the event itself when successful. + * Pros: + * Works well with E2E + * No custom server functionality + * Lets the client customise the preview that they send (like on FB) + * Cons: + * Entirely specific to the sending client, whereas it'd be nice if /any/ URL was correctly previewed if clients support it. + + 5. Have the option of specifying a shared (centralised) previewing service used by a room, to avoid all the different HSes in the room DoSing the target. + +Best solution is probably a combination of both 2 and 4. + * Sending clients do their best to create and send a preview at the point of sending the message, perhaps delaying the message until the preview is computed? (This also lets the user validate the preview before sending) + * Receiving clients have the option of going and creating their own preview if one doesn't arrive soon enough (or if the original sender didn't create one) + +This is a bit magical though in that the preview could come from two entirely different sources - the sending HS or your local one. However, this can always be exposed to users: "Generate your own URL previews if none are available?" + +This is tantamount also to senders calculating their own thumbnails for sending in advance of the main content - we are trusting the sender not to lie about the content in the thumbnail. Whereas currently thumbnails are calculated by the receiving homeserver to avoid this attack. + +However, this kind of phishing attack does exist whether we let senders pick their thumbnails or not, in that a malicious sender can send normal text messages around the attachment claiming it to be legitimate. We could rely on (future) reputation/abuse management to punish users who phish (be it with bogus metadata or bogus descriptions). Bogus metadata is particularly bad though, especially if it's avoidable. + +As a first cut, let's do #2 and have the receiver hit the API to calculate its own previews (as it does currently for image thumbnails). We can then extend/optimise this to option 4 as a special extra if needed. + +API +--- + +GET /_matrix/media/r0/previewUrl?url=http://wherever.com +200 OK +{ + "og:type" : "article" + "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" + "og:title" : "Matrix on Twitter" + "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" + "og:description" : "“Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP”" + "og:site_name" : "Twitter" +} + +* Downloads the URL + * If HTML, just stores it in RAM and parses it for OG meta tags + * Download any media OG meta tags to the media repo, and refer to them in the OG via mxc:// URIs. + * If a media filetype we know we can thumbnail: store it on disk, and hand it to the thumbnailer. Generate OG meta tags from the thumbnailer contents. + * Otherwise, don't bother downloading further. diff --git a/synapse/config/repository.py b/synapse/config/repository.py index 2fcf87244..33fff5616 100644 --- a/synapse/config/repository.py +++ b/synapse/config/repository.py @@ -53,6 +53,7 @@ class ContentRepositoryConfig(Config): def read_config(self, config): self.max_upload_size = self.parse_size(config["max_upload_size"]) self.max_image_pixels = self.parse_size(config["max_image_pixels"]) + self.max_spider_size = self.parse_size(config["max_spider_size"]) self.media_store_path = self.ensure_directory(config["media_store_path"]) self.uploads_path = self.ensure_directory(config["uploads_path"]) self.dynamic_thumbnails = config["dynamic_thumbnails"] @@ -73,6 +74,9 @@ class ContentRepositoryConfig(Config): # The largest allowed upload size in bytes max_upload_size: "10M" + # The largest allowed URL preview spidering size in bytes + max_spider_size: "10M" + # Maximum number of pixels that will be thumbnailed max_image_pixels: "32M" @@ -80,7 +84,7 @@ class ContentRepositoryConfig(Config): # the resolution requested by the client. If true then whenever # a new resolution is requested by the client the server will # generate a new thumbnail. If false the server will pick a thumbnail - # from a precalcualted list. + # from a precalculated list. dynamic_thumbnails: false # List of thumbnail to precalculate when an image is uploaded. diff --git a/synapse/http/client.py b/synapse/http/client.py index fdd90b1c3..25d319f12 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -238,6 +238,87 @@ class SimpleHttpClient(object): else: raise CodeMessageException(response.code, body) + # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient. + # The two should be factored out. + + @defer.inlineCallbacks + def get_file(self, url, output_stream, args={}, max_size=None): + """GETs a file from a given URL + Args: + url (str): The URL to GET + output_stream (file): File to write the response body to. + Returns: + A (int,dict) tuple of the file length and a dict of the response + headers. + """ + + def body_callback(method, url_bytes, headers_dict): + self.sign_request(destination, method, url_bytes, headers_dict) + return None + + response = yield self.request( + "GET", + url.encode("ascii"), + headers=Headers({ + b"User-Agent": [self.user_agent], + }) + ) + + headers = dict(response.headers.getAllRawHeaders()) + + if headers['Content-Length'] > max_size: + logger.warn("Requested URL is too large > %r bytes" % (self.max_size,)) + # XXX: do we want to explicitly drop the connection here somehow? if so, how? + raise # what should we be raising here? + + # TODO: if our Content-Type is HTML or something, just read the first + # N bytes into RAM rather than saving it all to disk only to read it + # straight back in again + + try: + length = yield preserve_context_over_fn( + _readBodyToFile, + response, output_stream, max_size + ) + except: + logger.exception("Failed to download body") + raise + + defer.returnValue((length, headers)) + + +# XXX: FIXME: This is horribly copy-pasted from matrixfederationclient. +# The two should be factored out. + +class _ReadBodyToFileProtocol(protocol.Protocol): + def __init__(self, stream, deferred, max_size): + self.stream = stream + self.deferred = deferred + self.length = 0 + self.max_size = max_size + + def dataReceived(self, data): + self.stream.write(data) + self.length += len(data) + if self.max_size is not None and self.length >= self.max_size: + logger.warn("Requested URL is too large > %r bytes" % (self.max_size,)) + self.deferred = defer.Deferred() + self.transport.loseConnection() + + def connectionLost(self, reason): + if reason.check(ResponseDone): + self.deferred.callback(self.length) + else: + self.deferred.errback(reason) + + +# XXX: FIXME: This is horribly copy-pasted from matrixfederationclient. +# The two should be factored out. + +def _readBodyToFile(response, stream, max_size): + d = defer.Deferred() + response.deliverBody(_ReadBodyToFileProtocol(stream, d, max_size)) + return d class CaptchaServerHttpClient(SimpleHttpClient): """ diff --git a/synapse/rest/media/v1/media_repository.py b/synapse/rest/media/v1/media_repository.py index 7dfb027dd..8f3491b91 100644 --- a/synapse/rest/media/v1/media_repository.py +++ b/synapse/rest/media/v1/media_repository.py @@ -17,6 +17,7 @@ from .upload_resource import UploadResource from .download_resource import DownloadResource from .thumbnail_resource import ThumbnailResource from .identicon_resource import IdenticonResource +from .preview_url_resource import PreviewUrlResource from .filepath import MediaFilePaths from twisted.web.resource import Resource @@ -78,3 +79,5 @@ class MediaRepositoryResource(Resource): self.putChild("download", DownloadResource(hs, filepaths)) self.putChild("thumbnail", ThumbnailResource(hs, filepaths)) self.putChild("identicon", IdenticonResource()) + self.putChild("preview_url", PreviewUrlResource(hs, filepaths)) + diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py new file mode 100644 index 000000000..fb8ab3096 --- /dev/null +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -0,0 +1,164 @@ +# Copyright 2016 OpenMarket Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from twisted.web.resource import Resource +from lxml import html +from synapse.http.client import SimpleHttpClient +from synapse.http.server import respond_with_json_bytes +from simplejson import json + +import logging +logger = logging.getLogger(__name__) + +class PreviewUrlResource(Resource): + isLeaf = True + + def __init__(self, hs, filepaths): + Resource.__init__(self) + self.client = SimpleHttpClient(hs) + self.filepaths = filepaths + self.max_spider_size = hs.config.max_spider_size + self.server_name = hs.hostname + self.clock = hs.get_clock() + + def render_GET(self, request): + self._async_render_GET(request) + return NOT_DONE_YET + + @request_handler + @defer.inlineCallbacks + def _async_render_GET(self, request): + url = request.args.get("url") + + try: + # TODO: keep track of whether there's an ongoing request for this preview + # and block and return their details if there is one. + + media_info = self._download_url(url) + except: + os.remove(fname) + raise + + if self._is_media(media_type): + dims = yield self._generate_local_thumbnails( + media_info.filesystem_id, media_info + ) + + og = { + "og:description" : media_info.download_name, + "og:image" : "mxc://%s/%s" % (self.server_name, media_info.filesystem_id), + "og:image:type" : media_info.media_type, + "og:image:width" : dims.width, + "og:image:height" : dims.height, + } + + # define our OG response for this media + elif self._is_html(media_type): + tree = html.parse(media_info.filename) + + # suck it up into lxml and define our OG response. + # if we see any URLs in the OG response, then spider them + # (although the client could choose to do this by asking for previews of those URLs to avoid DoSing the server) + + # "og:type" : "article" + # "og:url" : "https://twitter.com/matrixdotorg/status/684074366691356672" + # "og:title" : "Matrix on Twitter" + # "og:image" : "https://pbs.twimg.com/profile_images/500400952029888512/yI0qtFi7_400x400.png" + # "og:description" : "“Synapse 0.12 is out! Lots of polishing, performance & bugfixes: /sync API, /r0 prefix, fulltext search, 3PID invites https://t.co/5alhXLLEGP”" + # "og:site_name" : "Twitter" + + og = {} + for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): + og[tag.attrib['property']] = tag.attrib['content'] + + # TODO: store our OG details in a cache (and expire them when stale) + # TODO: delete the content to stop diskfilling, as we only ever cared about its OG + + respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) + + def _download_url(url): + requester = yield self.auth.get_user_by_req(request) + + # XXX: horrible duplication with base_resource's _download_remote_file() + file_id = random_string(24) + + fname = self.filepaths.local_media_filepath(file_id) + self._makedirs(fname) + + try: + with open(fname, "wb") as f: + length, headers = yield self.client.get_file( + url, output_stream=f, max_size=self.max_spider_size, + ) + media_type = headers["Content-Type"][0] + time_now_ms = self.clock.time_msec() + + content_disposition = headers.get("Content-Disposition", None) + if content_disposition: + _, params = cgi.parse_header(content_disposition[0],) + download_name = None + + # First check if there is a valid UTF-8 filename + download_name_utf8 = params.get("filename*", None) + if download_name_utf8: + if download_name_utf8.lower().startswith("utf-8''"): + download_name = download_name_utf8[7:] + + # If there isn't check for an ascii name. + if not download_name: + download_name_ascii = params.get("filename", None) + if download_name_ascii and is_ascii(download_name_ascii): + download_name = download_name_ascii + + if download_name: + download_name = urlparse.unquote(download_name) + try: + download_name = download_name.decode("utf-8") + except UnicodeDecodeError: + download_name = None + else: + download_name = None + + yield self.store.store_local_media( + media_id=fname, + media_type=media_type, + time_now_ms=self.clock.time_msec(), + upload_name=download_name, + media_length=length, + user_id=requester.user, + ) + + except: + os.remove(fname) + raise + + return { + "media_type": media_type, + "media_length": length, + "download_name": download_name, + "created_ts": time_now_ms, + "filesystem_id": file_id, + "filename": fname, + } + + + def _is_media(content_type): + if content_type.lower().startswith("image/"): + return True + + def _is_html(content_type): + content_type = content_type.lower() + if content_type == "text/html" or + content_type.startswith("application/xhtml"): + return True