Fix Content-Disposition in media repository (#4176)

This commit is contained in:
Amber Brown 2018-11-15 15:55:58 -06:00 committed by GitHub
parent 835779f7fb
commit 8b1affe7d5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 271 additions and 90 deletions

1
changelog.d/4176.bugfix Normal file
View file

@ -0,0 +1 @@
The media repository now no longer fails to decode UTF-8 filenames when downloading remote media.

View file

@ -16,6 +16,7 @@
import logging
import os
from six import PY3
from six.moves import urllib
from twisted.internet import defer
@ -48,26 +49,21 @@ def parse_media_id(request):
return server_name, media_id, file_name
except Exception:
raise SynapseError(
404,
"Invalid media id token %r" % (request.postpath,),
Codes.UNKNOWN,
404, "Invalid media id token %r" % (request.postpath,), Codes.UNKNOWN
)
def respond_404(request):
respond_with_json(
request, 404,
cs_error(
"Not found %r" % (request.postpath,),
code=Codes.NOT_FOUND,
),
send_cors=True
request,
404,
cs_error("Not found %r" % (request.postpath,), code=Codes.NOT_FOUND),
send_cors=True,
)
@defer.inlineCallbacks
def respond_with_file(request, media_type, file_path,
file_size=None, upload_name=None):
def respond_with_file(request, media_type, file_path, file_size=None, upload_name=None):
logger.debug("Responding with %r", file_path)
if os.path.isfile(file_path):
@ -97,31 +93,26 @@ def add_file_headers(request, media_type, file_size, upload_name):
file_size (int): Size in bytes of the media, if known.
upload_name (str): The name of the requested file, if any.
"""
def _quote(x):
return urllib.parse.quote(x.encode("utf-8"))
request.setHeader(b"Content-Type", media_type.encode("UTF-8"))
if upload_name:
if is_ascii(upload_name):
disposition = ("inline; filename=%s" % (_quote(upload_name),)).encode("ascii")
disposition = "inline; filename=%s" % (_quote(upload_name),)
else:
disposition = (
"inline; filename*=utf-8''%s" % (_quote(upload_name),)).encode("ascii")
disposition = "inline; filename*=utf-8''%s" % (_quote(upload_name),)
request.setHeader(b"Content-Disposition", disposition)
request.setHeader(b"Content-Disposition", disposition.encode('ascii'))
# cache for at least a day.
# XXX: we might want to turn this off for data we don't want to
# recommend caching as it's sensitive or private - or at least
# select private. don't bother setting Expires as all our
# clients are smart enough to be happy with Cache-Control
request.setHeader(
b"Cache-Control", b"public,max-age=86400,s-maxage=86400"
)
request.setHeader(
b"Content-Length", b"%d" % (file_size,)
)
request.setHeader(b"Cache-Control", b"public,max-age=86400,s-maxage=86400")
request.setHeader(b"Content-Length", b"%d" % (file_size,))
@defer.inlineCallbacks
@ -153,6 +144,7 @@ class Responder(object):
Responder is a context manager which *must* be used, so that any resources
held can be cleaned up.
"""
def write_to_consumer(self, consumer):
"""Stream response into consumer
@ -186,9 +178,18 @@ class FileInfo(object):
thumbnail_method (str)
thumbnail_type (str): Content type of thumbnail, e.g. image/png
"""
def __init__(self, server_name, file_id, url_cache=False,
thumbnail=False, thumbnail_width=None, thumbnail_height=None,
thumbnail_method=None, thumbnail_type=None):
def __init__(
self,
server_name,
file_id,
url_cache=False,
thumbnail=False,
thumbnail_width=None,
thumbnail_height=None,
thumbnail_method=None,
thumbnail_type=None,
):
self.server_name = server_name
self.file_id = file_id
self.url_cache = url_cache
@ -197,3 +198,74 @@ class FileInfo(object):
self.thumbnail_height = thumbnail_height
self.thumbnail_method = thumbnail_method
self.thumbnail_type = thumbnail_type
def get_filename_from_headers(headers):
"""
Get the filename of the downloaded file by inspecting the
Content-Disposition HTTP header.
Args:
headers (twisted.web.http_headers.Headers): The HTTP
request headers.
Returns:
A Unicode string of the filename, or None.
"""
content_disposition = headers.get(b"Content-Disposition", [b''])
# No header, bail out.
if not content_disposition[0]:
return
# dict of unicode: bytes, corresponding to the key value sections of the
# Content-Disposition header.
params = {}
parts = content_disposition[0].split(b";")
for i in parts:
# Split into key-value pairs, if able
# We don't care about things like `inline`, so throw it out
if b"=" not in i:
continue
key, value = i.strip().split(b"=")
params[key.decode('ascii')] = value
upload_name = None
# First check if there is a valid UTF-8 filename
upload_name_utf8 = params.get("filename*", None)
if upload_name_utf8:
if upload_name_utf8.lower().startswith(b"utf-8''"):
upload_name_utf8 = upload_name_utf8[7:]
# We have a filename*= section. This MUST be ASCII, and any UTF-8
# bytes are %-quoted.
if PY3:
try:
# Once it is decoded, we can then unquote the %-encoded
# parts strictly into a unicode string.
upload_name = urllib.parse.unquote(
upload_name_utf8.decode('ascii'), errors="strict"
)
except UnicodeDecodeError:
# Incorrect UTF-8.
pass
else:
# On Python 2, we first unquote the %-encoded parts and then
# decode it strictly using UTF-8.
try:
upload_name = urllib.parse.unquote(upload_name_utf8).decode('utf8')
except UnicodeDecodeError:
pass
# If there isn't check for an ascii name.
if not upload_name:
upload_name_ascii = params.get("filename", None)
if upload_name_ascii and is_ascii(upload_name_ascii):
# Make sure there's no %-quoted bytes. If there is, reject it as
# non-valid ASCII.
if b"%" not in upload_name_ascii:
upload_name = upload_name_ascii.decode('ascii')
# This may be None here, indicating we did not find a matching name.
return upload_name

View file

@ -14,14 +14,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import cgi
import errno
import logging
import os
import shutil
from six import PY3, iteritems
from six.moves.urllib import parse as urlparse
from six import iteritems
import twisted.internet.error
import twisted.web.http
@ -34,14 +32,18 @@ from synapse.api.errors import (
NotFoundError,
SynapseError,
)
from synapse.http.matrixfederationclient import MatrixFederationHttpClient
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.util import logcontext
from synapse.util.async_helpers import Linearizer
from synapse.util.retryutils import NotRetryingDestination
from synapse.util.stringutils import is_ascii, random_string
from synapse.util.stringutils import random_string
from ._base import FileInfo, respond_404, respond_with_responder
from ._base import (
FileInfo,
get_filename_from_headers,
respond_404,
respond_with_responder,
)
from .config_resource import MediaConfigResource
from .download_resource import DownloadResource
from .filepath import MediaFilePaths
@ -62,7 +64,7 @@ class MediaRepository(object):
def __init__(self, hs):
self.hs = hs
self.auth = hs.get_auth()
self.client = MatrixFederationHttpClient(hs)
self.client = hs.get_http_client()
self.clock = hs.get_clock()
self.server_name = hs.hostname
self.store = hs.get_datastore()
@ -397,39 +399,9 @@ class MediaRepository(object):
yield finish()
media_type = headers[b"Content-Type"][0].decode('ascii')
upload_name = get_filename_from_headers(headers)
time_now_ms = self.clock.time_msec()
content_disposition = headers.get(b"Content-Disposition", None)
if content_disposition:
_, params = cgi.parse_header(content_disposition[0].decode('ascii'),)
upload_name = None
# First check if there is a valid UTF-8 filename
upload_name_utf8 = params.get("filename*", None)
if upload_name_utf8:
if upload_name_utf8.lower().startswith("utf-8''"):
upload_name = upload_name_utf8[7:]
# If there isn't check for an ascii name.
if not upload_name:
upload_name_ascii = params.get("filename", None)
if upload_name_ascii and is_ascii(upload_name_ascii):
upload_name = upload_name_ascii
if upload_name:
if PY3:
upload_name = urlparse.unquote(upload_name)
else:
upload_name = urlparse.unquote(upload_name.encode('ascii'))
try:
if isinstance(upload_name, bytes):
upload_name = upload_name.decode("utf-8")
except UnicodeDecodeError:
upload_name = None
else:
upload_name = None
logger.info("Stored remote media in file %r", fname)
yield self.store.store_cached_remote_media(

View file

@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import cgi
import datetime
import errno
import fnmatch
@ -44,10 +43,11 @@ from synapse.http.server import (
)
from synapse.http.servlet import parse_integer, parse_string
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.rest.media.v1._base import get_filename_from_headers
from synapse.util.async_helpers import ObservableDeferred
from synapse.util.caches.expiringcache import ExpiringCache
from synapse.util.logcontext import make_deferred_yieldable, run_in_background
from synapse.util.stringutils import is_ascii, random_string
from synapse.util.stringutils import random_string
from ._base import FileInfo
@ -336,31 +336,7 @@ class PreviewUrlResource(Resource):
media_type = "application/octet-stream"
time_now_ms = self.clock.time_msec()
content_disposition = headers.get(b"Content-Disposition", None)
if content_disposition:
_, params = cgi.parse_header(content_disposition[0],)
download_name = None
# First check if there is a valid UTF-8 filename
download_name_utf8 = params.get("filename*", None)
if download_name_utf8:
if download_name_utf8.lower().startswith("utf-8''"):
download_name = download_name_utf8[7:]
# If there isn't check for an ascii name.
if not download_name:
download_name_ascii = params.get("filename", None)
if download_name_ascii and is_ascii(download_name_ascii):
download_name = download_name_ascii
if download_name:
download_name = urlparse.unquote(download_name)
try:
download_name = download_name.decode("utf-8")
except UnicodeDecodeError:
download_name = None
else:
download_name = None
download_name = get_filename_from_headers(headers)
yield self.store.store_local_media(
media_id=file_id,

View file

@ -17,15 +17,20 @@
import os
import shutil
import tempfile
from binascii import unhexlify
from mock import Mock
from six.moves.urllib import parse
from twisted.internet import defer, reactor
from twisted.internet.defer import Deferred
from synapse.config.repository import MediaStorageProviderConfig
from synapse.rest.media.v1._base import FileInfo
from synapse.rest.media.v1.filepath import MediaFilePaths
from synapse.rest.media.v1.media_storage import MediaStorage
from synapse.rest.media.v1.storage_provider import FileStorageProviderBackend
from synapse.util.module_loader import load_module
from tests import unittest
@ -83,3 +88,143 @@ class MediaStorageTests(unittest.TestCase):
body = f.read()
self.assertEqual(test_body, body)
class MediaRepoTests(unittest.HomeserverTestCase):
hijack_auth = True
user_id = "@test:user"
def make_homeserver(self, reactor, clock):
self.fetches = []
def get_file(destination, path, output_stream, args=None, max_size=None):
"""
Returns tuple[int,dict,str,int] of file length, response headers,
absolute URI, and response code.
"""
def write_to(r):
data, response = r
output_stream.write(data)
return response
d = Deferred()
d.addCallback(write_to)
self.fetches.append((d, destination, path, args))
return d
client = Mock()
client.get_file = get_file
self.storage_path = self.mktemp()
os.mkdir(self.storage_path)
config = self.default_config()
config.media_store_path = self.storage_path
config.thumbnail_requirements = {}
config.max_image_pixels = 2000000
provider_config = {
"module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
"store_local": True,
"store_synchronous": False,
"store_remote": True,
"config": {"directory": self.storage_path},
}
loaded = list(load_module(provider_config)) + [
MediaStorageProviderConfig(False, False, False)
]
config.media_storage_providers = [loaded]
hs = self.setup_test_homeserver(config=config, http_client=client)
return hs
def prepare(self, reactor, clock, hs):
self.media_repo = hs.get_media_repository_resource()
self.download_resource = self.media_repo.children[b'download']
# smol png
self.end_content = unhexlify(
b"89504e470d0a1a0a0000000d4948445200000001000000010806"
b"0000001f15c4890000000a49444154789c63000100000500010d"
b"0a2db40000000049454e44ae426082"
)
def _req(self, content_disposition):
request, channel = self.make_request(
"GET", "example.com/12345", shorthand=False
)
request.render(self.download_resource)
self.pump()
# We've made one fetch, to example.com, using the media URL, and asking
# the other server not to do a remote fetch
self.assertEqual(len(self.fetches), 1)
self.assertEqual(self.fetches[0][1], "example.com")
self.assertEqual(
self.fetches[0][2], "/_matrix/media/v1/download/example.com/12345"
)
self.assertEqual(self.fetches[0][3], {"allow_remote": "false"})
headers = {
b"Content-Length": [b"%d" % (len(self.end_content))],
b"Content-Type": [b'image/png'],
}
if content_disposition:
headers[b"Content-Disposition"] = [content_disposition]
self.fetches[0][0].callback(
(self.end_content, (len(self.end_content), headers))
)
self.pump()
self.assertEqual(channel.code, 200)
return channel
def test_disposition_filename_ascii(self):
"""
If the filename is filename=<ascii> then Synapse will decode it as an
ASCII string, and use filename= in the response.
"""
channel = self._req(b"inline; filename=out.png")
headers = channel.headers
self.assertEqual(headers.getRawHeaders(b"Content-Type"), [b"image/png"])
self.assertEqual(
headers.getRawHeaders(b"Content-Disposition"), [b"inline; filename=out.png"]
)
def test_disposition_filenamestar_utf8escaped(self):
"""
If the filename is filename=*utf8''<utf8 escaped> then Synapse will
correctly decode it as the UTF-8 string, and use filename* in the
response.
"""
filename = parse.quote(u"\u2603".encode('utf8')).encode('ascii')
channel = self._req(b"inline; filename*=utf-8''" + filename + b".png")
headers = channel.headers
self.assertEqual(headers.getRawHeaders(b"Content-Type"), [b"image/png"])
self.assertEqual(
headers.getRawHeaders(b"Content-Disposition"),
[b"inline; filename*=utf-8''" + filename + b".png"],
)
def test_disposition_none(self):
"""
If there is no filename, one isn't passed on in the Content-Disposition
of the request.
"""
channel = self._req(None)
headers = channel.headers
self.assertEqual(headers.getRawHeaders(b"Content-Type"), [b"image/png"])
self.assertEqual(headers.getRawHeaders(b"Content-Disposition"), None)

View file

@ -14,6 +14,8 @@ from twisted.internet.error import DNSLookupError
from twisted.internet.interfaces import IReactorPluggableNameResolver
from twisted.python.failure import Failure
from twisted.test.proto_helpers import MemoryReactorClock
from twisted.web.http import unquote
from twisted.web.http_headers import Headers
from synapse.http.site import SynapseRequest
from synapse.util import Clock
@ -50,6 +52,15 @@ class FakeChannel(object):
raise Exception("No result yet.")
return int(self.result["code"])
@property
def headers(self):
if not self.result:
raise Exception("No result yet.")
h = Headers()
for i in self.result["headers"]:
h.addRawHeader(*i)
return h
def writeHeaders(self, version, code, reason, headers):
self.result["version"] = version
self.result["code"] = code
@ -152,6 +163,9 @@ def make_request(
path = b"/_matrix/client/r0/" + path
path = path.replace(b"//", b"/")
if not path.startswith(b"/"):
path = b"/" + path
if isinstance(content, text_type):
content = content.encode('utf8')
@ -161,6 +175,7 @@ def make_request(
req = request(site, channel)
req.process = lambda: b""
req.content = BytesIO(content)
req.postpath = list(map(unquote, path[1:].split(b'/')))
if access_token:
req.requestHeaders.addRawHeader(