Implement a content type allow list for URL previews (#11936)

This implements an allow list for content types for which Synapse will attempt URL preview. If a URL resolves to a resource with a content type which isn't in the list, the download will terminate immediately.

This makes sense given that Synapse would never successfully generate a URL preview for such files in the first place, and helps prevent issues with streaming media servers, such as #8302.

Signed-off-by: Denis Kasak dkasak@termina.org.uk
This commit is contained in:
Denis Kasak 2022-02-10 15:43:01 +00:00 committed by GitHub
parent 06e5a76322
commit 337f38cac3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 99 additions and 0 deletions

1
changelog.d/11936.bugfix Normal file
View file

@ -0,0 +1 @@
Implement an allow list of content types for which we will attempt to preview a URL. This prevents Synapse from making useless longer-lived connections to streaming media servers.

View file

@ -20,6 +20,7 @@ from typing import (
TYPE_CHECKING,
Any,
BinaryIO,
Callable,
Dict,
Iterable,
List,
@ -693,12 +694,18 @@ class SimpleHttpClient:
output_stream: BinaryIO,
max_size: Optional[int] = None,
headers: Optional[RawHeaders] = None,
is_allowed_content_type: Optional[Callable[[str], bool]] = None,
) -> Tuple[int, Dict[bytes, List[bytes]], str, int]:
"""GETs a file from a given URL
Args:
url: The URL to GET
output_stream: File to write the response body to.
headers: A map from header name to a list of values for that header
is_allowed_content_type: A predicate to determine whether the
content type of the file we're downloading is allowed. If set and
it evaluates to False when called with the content type, the
request will be terminated before completing the download by
raising SynapseError.
Returns:
A tuple of the file length, dict of the response
headers, absolute URI of the response and HTTP response code.
@ -726,6 +733,17 @@ class SimpleHttpClient:
HTTPStatus.BAD_GATEWAY, "Got error %d" % (response.code,), Codes.UNKNOWN
)
if is_allowed_content_type and b"Content-Type" in resp_headers:
content_type = resp_headers[b"Content-Type"][0].decode("ascii")
if not is_allowed_content_type(content_type):
raise SynapseError(
HTTPStatus.BAD_GATEWAY,
(
"Requested file's content type not allowed for this operation: %s"
% content_type
),
)
# TODO: if our Content-Type is HTML or something, just read the first
# N bytes into RAM rather than saving it all to disk only to read it
# straight back in again

View file

@ -403,6 +403,7 @@ class PreviewUrlResource(DirectServeJsonResource):
output_stream=output_stream,
max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language},
is_allowed_content_type=_is_previewable,
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
@ -761,3 +762,10 @@ def _is_html(content_type: str) -> bool:
def _is_json(content_type: str) -> bool:
return content_type.lower().startswith("application/json")
def _is_previewable(content_type: str) -> bool:
"""Returns True for content types for which we will perform URL preview and False
otherwise."""
return _is_html(content_type) or _is_media(content_type) or _is_json(content_type)

View file

@ -243,6 +243,78 @@ class URLPreviewTests(unittest.HomeserverTestCase):
self.assertEqual(channel.code, 200)
self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
def test_video_rejected(self):
self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
end_content = b"anything"
channel = self.make_request(
"GET",
"preview_url?url=http://matrix.org",
shorthand=False,
await_result=False,
)
self.pump()
client = self.reactor.tcpClients[0][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
b"Content-Type: video/mp4\r\n\r\n"
)
% (len(end_content))
+ end_content
)
self.pump()
self.assertEqual(channel.code, 502)
self.assertEqual(
channel.json_body,
{
"errcode": "M_UNKNOWN",
"error": "Requested file's content type not allowed for this operation: video/mp4",
},
)
def test_audio_rejected(self):
self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
end_content = b"anything"
channel = self.make_request(
"GET",
"preview_url?url=http://matrix.org",
shorthand=False,
await_result=False,
)
self.pump()
client = self.reactor.tcpClients[0][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
b"Content-Type: audio/aac\r\n\r\n"
)
% (len(end_content))
+ end_content
)
self.pump()
self.assertEqual(channel.code, 502)
self.assertEqual(
channel.json_body,
{
"errcode": "M_UNKNOWN",
"error": "Requested file's content type not allowed for this operation: audio/aac",
},
)
def test_non_ascii_preview_content_type(self):
self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]