Fix URL preview bugs (type error when loading cache from db, content-type including quotes) (#4157)

Amber Brown 2018-11-08 01:37:43 +11:00 committed by GitHub
parent c8ba79327b
commit b3708830b8
5 changed files with 187 additions and 10 deletions

changelog.d/4157.bugfix (new file)

@@ -0,0 +1 @@
Loading URL previews from the DB cache on Postgres will no longer cause Unicode type errors when responding to the request, and URL previews will no longer fail if the remote server returns a Content-Type header with the charset in quotes.
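As a quick illustration of the second bug (a hypothetical header value, not taken from the patch): the old charset pattern captured the quotes along with the name, yielding an encoding name no codec lookup accepts, while the patched pattern makes the quotes optional.

import re

# A Content-Type value as some remote servers send it: charset in quotes.
header = 'text/html; charset="utf8"'

old = re.match(r'.*; *charset=(.*?)(;|$)', header, re.I)
new = re.match(r'.*; *charset="?(.*?)"?(;|$)', header, re.I)

print(old.group(1))  # '"utf8"' -- quotes included, so decoding blows up
print(new.group(1))  # 'utf8'   -- a usable codec name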

synapse/http/server.py

@@ -468,13 +468,13 @@ def set_cors_headers(request):
     Args:
         request (twisted.web.http.Request): The http request to add CORs to.
     """
-    request.setHeader("Access-Control-Allow-Origin", "*")
+    request.setHeader(b"Access-Control-Allow-Origin", b"*")
     request.setHeader(
-        "Access-Control-Allow-Methods", "GET, POST, PUT, DELETE, OPTIONS"
+        b"Access-Control-Allow-Methods", b"GET, POST, PUT, DELETE, OPTIONS"
     )
     request.setHeader(
-        "Access-Control-Allow-Headers",
-        "Origin, X-Requested-With, Content-Type, Accept, Authorization"
+        b"Access-Control-Allow-Headers",
+        b"Origin, X-Requested-With, Content-Type, Accept, Authorization"
     )
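A minimal sketch of the convention this hunk adopts, using twisted.web.http_headers.Headers standalone (a full Request would be heavyweight here): Twisted's HTTP layer traffics in bytes, so header names and values are written as bytes literals rather than relying on implicit encoding.

from twisted.web.http_headers import Headers

# Header names and values are raw bytes at the HTTP layer.
headers = Headers()
headers.addRawHeader(b"Access-Control-Allow-Origin", b"*")
assert headers.getRawHeaders(b"Access-Control-Allow-Origin") == [b"*"]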

synapse/rest/media/v1/preview_url_resource.py

@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import cgi
 import datetime
 import errno
@@ -24,6 +25,7 @@ import shutil
 import sys
 import traceback
 
+import six
 from six import string_types
 from six.moves import urllib_parse as urlparse
@@ -98,7 +100,7 @@ class PreviewUrlResource(Resource):
         # XXX: if get_user_by_req fails, what should we do in an async render?
         requester = yield self.auth.get_user_by_req(request)
         url = parse_string(request, "url")
-        if "ts" in request.args:
+        if b"ts" in request.args:
             ts = parse_integer(request, "ts")
         else:
             ts = self.clock.time_msec()
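Why the key must be bytes, in a minimal sketch: Twisted builds request.args by parsing the raw query bytes, so on Python 3 the dict is keyed by bytes and a str lookup silently misses.

from urllib.parse import parse_qs

# Roughly what Twisted stores in request.args for ?url=matrix.org&ts=1000
args = parse_qs(b"url=matrix.org&ts=1000")

assert b"ts" in args       # the bytes key is there...
assert "ts" not in args    # ...but the old str lookup never matched it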
@@ -180,7 +182,12 @@
             cache_result["expires_ts"] > ts and
             cache_result["response_code"] / 100 == 2
         ):
-            defer.returnValue(cache_result["og"])
+            # It may be stored as text in the database, not as bytes (such as
+            # PostgreSQL). If so, encode it back before handing it on.
+            og = cache_result["og"]
+            if isinstance(og, six.text_type):
+                og = og.encode('utf8')
+            defer.returnValue(og)
             return
 
         media_info = yield self._download_url(url, user)
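The same normalisation in isolation, as a self-contained sketch (the cached value here is assumed, standing in for what psycopg2 hands back for a TEXT column):

import six

# On Postgres the cached OpenGraph JSON comes back as text (unicode on
# Python 2, str on Python 3), but the response writer expects bytes.
cached_og = u'{"og:title": "~matrix~"}'  # assumed cached value

if isinstance(cached_og, six.text_type):
    cached_og = cached_og.encode('utf8')

assert isinstance(cached_og, bytes)  # now safe to write to the request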
@@ -213,14 +220,17 @@
         elif _is_html(media_info['media_type']):
             # TODO: somehow stop a big HTML tree from exploding synapse's RAM
-            file = open(media_info['filename'])
-            body = file.read()
-            file.close()
+            with open(media_info['filename'], 'rb') as file:
+                body = file.read()
 
             # clobber the encoding from the content-type, or default to utf-8
             # XXX: this overrides any <meta/> or XML charset headers in the body
             # which may pose problems, but so far seems to work okay.
-            match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
+            match = re.match(
+                r'.*; *charset="?(.*?)"?(;|$)',
+                media_info['media_type'],
+                re.I
+            )
             encoding = match.group(1) if match else "utf-8"
 
             og = decode_and_calc_og(body, media_info['uri'], encoding)
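For comparison only (not what the patch does): the stdlib's cgi.parse_header, already imported in this module, strips the quotes itself; the patch instead keeps the existing regex approach and makes the quotes optional.

import cgi

# cgi.parse_header copes with quoted and bare charset parameters alike.
ctype, params = cgi.parse_header('text/html; charset="utf8"')
assert ctype == 'text/html'
assert params['charset'] == 'utf8'  # quotes already stripped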

tests/rest/media/v1/test_url_preview.py (new file)

@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
# Copyright 2018 New Vector Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from mock import Mock

from twisted.internet.defer import Deferred

from synapse.config.repository import MediaStorageProviderConfig
from synapse.util.module_loader import load_module

from tests import unittest


class URLPreviewTests(unittest.HomeserverTestCase):

    hijack_auth = True
    user_id = "@test:user"

    def make_homeserver(self, reactor, clock):

        self.storage_path = self.mktemp()
        os.mkdir(self.storage_path)

        config = self.default_config()
        config.url_preview_enabled = True
        config.max_spider_size = 9999999
        config.url_preview_url_blacklist = []
        config.media_store_path = self.storage_path

        provider_config = {
            "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
            "store_local": True,
            "store_synchronous": False,
            "store_remote": True,
            "config": {"directory": self.storage_path},
        }

        loaded = list(load_module(provider_config)) + [
            MediaStorageProviderConfig(False, False, False)
        ]

        config.media_storage_providers = [loaded]

        hs = self.setup_test_homeserver(config=config)

        return hs

    def prepare(self, reactor, clock, hs):

        self.fetches = []

        def get_file(url, output_stream, max_size):
            """
            Returns tuple[int,dict,str,int] of file length, response headers,
            absolute URI, and response code.
            """

            def write_to(r):
                data, response = r
                output_stream.write(data)
                return response

            d = Deferred()
            d.addCallback(write_to)
            self.fetches.append((d, url))
            return d

        client = Mock()
        client.get_file = get_file

        self.media_repo = hs.get_media_repository_resource()
        preview_url = self.media_repo.children[b'preview_url']
        preview_url.client = client
        self.preview_url = preview_url

    def test_cache_returns_correct_type(self):

        request, channel = self.make_request(
            "GET", "url_preview?url=matrix.org", shorthand=False
        )
        request.render(self.preview_url)
        self.pump()

        # We've made one fetch
        self.assertEqual(len(self.fetches), 1)

        end_content = (
            b'<html><head>'
            b'<meta property="og:title" content="~matrix~" />'
            b'<meta property="og:description" content="hi" />'
            b'</head></html>'
        )

        self.fetches[0][0].callback(
            (
                end_content,
                (
                    len(end_content),
                    {
                        b"Content-Length": [b"%d" % (len(end_content))],
                        b"Content-Type": [b'text/html; charset="utf8"'],
                    },
                    "https://example.com",
                    200,
                ),
            )
        )

        self.pump()
        self.assertEqual(channel.code, 200)
        self.assertEqual(
            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
        )

        # Check the cache returns the correct response
        request, channel = self.make_request(
            "GET", "url_preview?url=matrix.org", shorthand=False
        )
        request.render(self.preview_url)
        self.pump()

        # Only one fetch, still, since we'll lean on the cache
        self.assertEqual(len(self.fetches), 1)

        # Check the cache response has the same content
        self.assertEqual(channel.code, 200)
        self.assertEqual(
            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
        )

        # Clear the in-memory cache
        self.assertIn("matrix.org", self.preview_url._cache)
        self.preview_url._cache.pop("matrix.org")
        self.assertNotIn("matrix.org", self.preview_url._cache)

        # Check the database cache returns the correct response
        request, channel = self.make_request(
            "GET", "url_preview?url=matrix.org", shorthand=False
        )
        request.render(self.preview_url)
        self.pump()

        # Only one fetch, still, since we'll lean on the cache
        self.assertEqual(len(self.fetches), 1)

        # Check the cache response has the same content
        self.assertEqual(channel.code, 200)
        self.assertEqual(
            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
        )

tests/server.py

@@ -57,6 +57,8 @@ class FakeChannel(object):
         self.result["headers"] = headers
 
     def write(self, content):
+        assert isinstance(content, bytes), "Should be bytes! " + repr(content)
+
         if "body" not in self.result:
             self.result["body"] = b""