forked from MirrorHub/synapse
Fix URL preview bugs (type error when loading cache from db, content-type including quotes) (#4157)
This commit is contained in:
parent
c8ba79327b
commit
b3708830b8
5 changed files with 187 additions and 10 deletions
1
changelog.d/4157.bugfix
Normal file
1
changelog.d/4157.bugfix
Normal file
|
@ -0,0 +1 @@
|
|||
Loading URL previews from the DB cache on Postgres will no longer cause Unicode type errors when responding to the request, and URL previews will no longer fail if the remote server returns a Content-Type header with the charset in quotes.
|
|
@ -468,13 +468,13 @@ def set_cors_headers(request):
|
|||
Args:
|
||||
request (twisted.web.http.Request): The http request to add CORs to.
|
||||
"""
|
||||
request.setHeader("Access-Control-Allow-Origin", "*")
|
||||
request.setHeader(b"Access-Control-Allow-Origin", b"*")
|
||||
request.setHeader(
|
||||
"Access-Control-Allow-Methods", "GET, POST, PUT, DELETE, OPTIONS"
|
||||
b"Access-Control-Allow-Methods", b"GET, POST, PUT, DELETE, OPTIONS"
|
||||
)
|
||||
request.setHeader(
|
||||
"Access-Control-Allow-Headers",
|
||||
"Origin, X-Requested-With, Content-Type, Accept, Authorization"
|
||||
b"Access-Control-Allow-Headers",
|
||||
b"Origin, X-Requested-With, Content-Type, Accept, Authorization"
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import cgi
|
||||
import datetime
|
||||
import errno
|
||||
|
@ -24,6 +25,7 @@ import shutil
|
|||
import sys
|
||||
import traceback
|
||||
|
||||
import six
|
||||
from six import string_types
|
||||
from six.moves import urllib_parse as urlparse
|
||||
|
||||
|
@ -98,7 +100,7 @@ class PreviewUrlResource(Resource):
|
|||
# XXX: if get_user_by_req fails, what should we do in an async render?
|
||||
requester = yield self.auth.get_user_by_req(request)
|
||||
url = parse_string(request, "url")
|
||||
if "ts" in request.args:
|
||||
if b"ts" in request.args:
|
||||
ts = parse_integer(request, "ts")
|
||||
else:
|
||||
ts = self.clock.time_msec()
|
||||
|
@ -180,7 +182,12 @@ class PreviewUrlResource(Resource):
|
|||
cache_result["expires_ts"] > ts and
|
||||
cache_result["response_code"] / 100 == 2
|
||||
):
|
||||
defer.returnValue(cache_result["og"])
|
||||
# It may be stored as text in the database, not as bytes (such as
|
||||
# PostgreSQL). If so, encode it back before handing it on.
|
||||
og = cache_result["og"]
|
||||
if isinstance(og, six.text_type):
|
||||
og = og.encode('utf8')
|
||||
defer.returnValue(og)
|
||||
return
|
||||
|
||||
media_info = yield self._download_url(url, user)
|
||||
|
@ -213,14 +220,17 @@ class PreviewUrlResource(Resource):
|
|||
elif _is_html(media_info['media_type']):
|
||||
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
|
||||
|
||||
file = open(media_info['filename'])
|
||||
body = file.read()
|
||||
file.close()
|
||||
with open(media_info['filename'], 'rb') as file:
|
||||
body = file.read()
|
||||
|
||||
# clobber the encoding from the content-type, or default to utf-8
|
||||
# XXX: this overrides any <meta/> or XML charset headers in the body
|
||||
# which may pose problems, but so far seems to work okay.
|
||||
match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
|
||||
match = re.match(
|
||||
r'.*; *charset="?(.*?)"?(;|$)',
|
||||
media_info['media_type'],
|
||||
re.I
|
||||
)
|
||||
encoding = match.group(1) if match else "utf-8"
|
||||
|
||||
og = decode_and_calc_og(body, media_info['uri'], encoding)
|
||||
|
|
164
tests/rest/media/v1/test_url_preview.py
Normal file
164
tests/rest/media/v1/test_url_preview.py
Normal file
|
@ -0,0 +1,164 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
# Copyright 2018 New Vector Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
|
||||
from mock import Mock
|
||||
|
||||
from twisted.internet.defer import Deferred
|
||||
|
||||
from synapse.config.repository import MediaStorageProviderConfig
|
||||
from synapse.util.module_loader import load_module
|
||||
|
||||
from tests import unittest
|
||||
|
||||
|
||||
class URLPreviewTests(unittest.HomeserverTestCase):
    """Tests for the media repository's URL preview resource.

    Exercises the fix for #4157: previews served from the database cache
    must be returned as bytes (not unicode), and a Content-Type header
    with a quoted charset (e.g. ``charset="utf8"``) must be parsed.
    """

    # Bypass real authentication; all requests are attributed to user_id.
    hijack_auth = True
    user_id = "@test:user"

    def make_homeserver(self, reactor, clock):
        """Build a test homeserver with URL previews enabled and media
        stored in a fresh temporary directory.

        Returns:
            The configured test HomeServer.
        """

        # Temporary on-disk media store for this test run.
        self.storage_path = self.mktemp()
        os.mkdir(self.storage_path)

        config = self.default_config()
        config.url_preview_enabled = True
        # Effectively unlimited download size so the fixture HTML is accepted.
        config.max_spider_size = 9999999
        config.url_preview_url_blacklist = []
        config.media_store_path = self.storage_path

        # File-backed storage provider pointing at the same temp directory.
        provider_config = {
            "module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
            "store_local": True,
            "store_synchronous": False,
            "store_remote": True,
            "config": {"directory": self.storage_path},
        }

        # load_module returns (class, parsed config); append the
        # MediaStorageProviderConfig flags expected by the media repo.
        loaded = list(load_module(provider_config)) + [
            MediaStorageProviderConfig(False, False, False)
        ]

        config.media_storage_providers = [loaded]

        hs = self.setup_test_homeserver(config=config)

        return hs

    def prepare(self, reactor, clock, hs):
        """Replace the preview resource's HTTP client with a mock that
        records every outbound fetch so the test can count requests and
        feed back canned responses."""

        # Each entry is (Deferred, url); firing the Deferred with
        # (body, (length, headers, uri, code)) completes the "download".
        self.fetches = []

        def get_file(url, output_stream, max_size):
            """
            Returns tuple[int,dict,str,int] of file length, response headers,
            absolute URI, and response code.
            """

            def write_to(r):
                # Write the canned body into the stream the resource
                # handed us, then pass the response metadata onward.
                data, response = r
                output_stream.write(data)
                return response

            d = Deferred()
            d.addCallback(write_to)
            self.fetches.append((d, url))
            return d

        client = Mock()
        client.get_file = get_file

        self.media_repo = hs.get_media_repository_resource()
        # Resource tree keys are bytes on the twisted side.
        preview_url = self.media_repo.children[b'preview_url']
        preview_url.client = client
        self.preview_url = preview_url

    def test_cache_returns_correct_type(self):
        """Fetch a preview three times: fresh download, in-memory cache,
        then database cache (after evicting the in-memory entry). All
        three must yield the same 200 JSON response."""

        request, channel = self.make_request(
            "GET", "url_preview?url=matrix.org", shorthand=False
        )
        request.render(self.preview_url)
        self.pump()

        # We've made one fetch
        self.assertEqual(len(self.fetches), 1)

        end_content = (
            b'<html><head>'
            b'<meta property="og:title" content="~matrix~" />'
            b'<meta property="og:description" content="hi" />'
            b'</head></html>'
        )

        # Note the quoted charset in Content-Type — this exercises the
        # relaxed charset regex from the fix.
        self.fetches[0][0].callback(
            (
                end_content,
                (
                    len(end_content),
                    {
                        b"Content-Length": [b"%d" % (len(end_content))],
                        b"Content-Type": [b'text/html; charset="utf8"'],
                    },
                    "https://example.com",
                    200,
                ),
            )
        )

        self.pump()
        self.assertEqual(channel.code, 200)
        self.assertEqual(
            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
        )

        # Check the cache returns the correct response
        request, channel = self.make_request(
            "GET", "url_preview?url=matrix.org", shorthand=False
        )
        request.render(self.preview_url)
        self.pump()

        # Only one fetch, still, since we'll lean on the cache
        self.assertEqual(len(self.fetches), 1)

        # Check the cache response has the same content
        self.assertEqual(channel.code, 200)
        self.assertEqual(
            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
        )

        # Clear the in-memory cache
        self.assertIn("matrix.org", self.preview_url._cache)
        self.preview_url._cache.pop("matrix.org")
        self.assertNotIn("matrix.org", self.preview_url._cache)

        # Check the database cache returns the correct response
        request, channel = self.make_request(
            "GET", "url_preview?url=matrix.org", shorthand=False
        )
        request.render(self.preview_url)
        self.pump()

        # Only one fetch, still, since we'll lean on the cache
        self.assertEqual(len(self.fetches), 1)

        # Check the cache response has the same content — on Postgres this
        # would previously fail with a Unicode type error (the og data came
        # back as text, not bytes).
        self.assertEqual(channel.code, 200)
        self.assertEqual(
            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
        )
|
@ -57,6 +57,8 @@ class FakeChannel(object):
|
|||
self.result["headers"] = headers
|
||||
|
||||
def write(self, content):
|
||||
assert isinstance(content, bytes), "Should be bytes! " + repr(content)
|
||||
|
||||
if "body" not in self.result:
|
||||
self.result["body"] = b""
|
||||
|
||||
|
|
Loading…
Reference in a new issue