Add url_preview_enabled config option to turn on/off preview_url endpoint. defaults to off.

Add url_preview_ip_range_blacklist to let admins specify internal IP ranges that must not be spidered.
Add url_preview_url_blacklist to let admins specify URL patterns that must not be spidered.
Implement a custom SpiderEndpoint and associated support classes to implement url_preview_ip_range_blacklist
Add commentary and generally address PR feedback
This commit is contained in:
Matthew Hodgson 2016-04-08 18:37:15 +01:00
parent d6e7333ae4
commit dafef5a688
6 changed files with 214 additions and 31 deletions

View file

@ -16,6 +16,8 @@
from ._base import Config from ._base import Config
from collections import namedtuple from collections import namedtuple
import sys
ThumbnailRequirement = namedtuple( ThumbnailRequirement = namedtuple(
"ThumbnailRequirement", ["width", "height", "method", "media_type"] "ThumbnailRequirement", ["width", "height", "method", "media_type"]
) )
@ -23,7 +25,7 @@ ThumbnailRequirement = namedtuple(
def parse_thumbnail_requirements(thumbnail_sizes): def parse_thumbnail_requirements(thumbnail_sizes):
""" Takes a list of dictionaries with "width", "height", and "method" keys """ Takes a list of dictionaries with "width", "height", and "method" keys
and creates a map from image media types to the thumbnail size, thumnailing and creates a map from image media types to the thumbnail size, thumbnailing
method, and thumbnail media type to precalculate method, and thumbnail media type to precalculate
Args: Args:
@ -60,6 +62,18 @@ class ContentRepositoryConfig(Config):
self.thumbnail_requirements = parse_thumbnail_requirements( self.thumbnail_requirements = parse_thumbnail_requirements(
config["thumbnail_sizes"] config["thumbnail_sizes"]
) )
self.url_preview_enabled = config["url_preview_enabled"]
if self.url_preview_enabled:
try:
from netaddr import IPSet
if "url_preview_ip_range_blacklist" in config:
self.url_preview_ip_range_blacklist = IPSet(
config["url_preview_ip_range_blacklist"]
)
if "url_preview_url_blacklist" in config:
self.url_preview_url_blacklist = config["url_preview_url_blacklist"]
except ImportError:
sys.stderr.write("\nmissing netaddr dep - disabling preview_url API\n")
def default_config(self, **kwargs): def default_config(self, **kwargs):
media_store = self.default_path("media_store") media_store = self.default_path("media_store")
@ -74,9 +88,6 @@ class ContentRepositoryConfig(Config):
# The largest allowed upload size in bytes # The largest allowed upload size in bytes
max_upload_size: "10M" max_upload_size: "10M"
# The largest allowed URL preview spidering size in bytes
max_spider_size: "10M"
# Maximum number of pixels that will be thumbnailed # Maximum number of pixels that will be thumbnailed
max_image_pixels: "32M" max_image_pixels: "32M"
@ -104,4 +115,62 @@ class ContentRepositoryConfig(Config):
- width: 800 - width: 800
height: 600 height: 600
method: scale method: scale
# Is the preview URL API enabled? If enabled, you *must* specify
# an explicit url_preview_ip_range_blacklist of IPs that the spider is
# denied from accessing.
url_preview_enabled: False
# List of IP address CIDR ranges that the URL preview spider is denied
# from accessing. There are no defaults: you must explicitly
# specify a list for URL previewing to work. You should specify any
# internal services in your network that you do not want synapse to try
# to connect to, otherwise anyone in any Matrix room could cause your
# synapse to issue arbitrary GET requests to your internal services,
# causing serious security issues.
#
# url_preview_ip_range_blacklist:
# - '127.0.0.0/8'
# - '10.0.0.0/8'
# - '172.16.0.0/12'
# - '192.168.0.0/16'
# Optional list of URL matches that the URL preview spider is
# denied from accessing. You should use url_preview_ip_range_blacklist
# in preference to this, otherwise someone could define a public DNS
# entry that points to a private IP address and circumvent the blacklist.
# This is more useful if you know there is an entire shape of URL that
# you know that will never want synapse to try to spider.
#
# Each list entry is a dictionary of url component attributes as returned
# by urlparse.urlsplit as applied to the absolute form of the URL. See
# https://docs.python.org/2/library/urlparse.html#urlparse.urlsplit
# The values of the dictionary are treated as an filename match pattern
# applied to that component of URLs, unless they start with a ^ in which
# case they are treated as a regular expression match. If all the
# specified component matches for a given list item succeed, the URL is
# blacklisted.
#
# url_preview_url_blacklist:
# # blacklist any URL with a username in its URI
# - username: '*''
#
# # blacklist all *.google.com URLs
# - netloc: 'google.com'
# - netloc: '*.google.com'
#
# # blacklist all plain HTTP URLs
# - scheme: 'http'
#
# # blacklist http(s)://www.acme.com/foo
# - netloc: 'www.acme.com'
# path: '/foo'
#
# # blacklist any URL with a literal IPv4 address
# - netloc: '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'
# The largest allowed URL preview spidering size in bytes
max_spider_size: "10M"
""" % locals() """ % locals()

View file

@ -20,10 +20,12 @@ from synapse.api.errors import (
) )
from synapse.util.logcontext import preserve_context_over_fn from synapse.util.logcontext import preserve_context_over_fn
import synapse.metrics import synapse.metrics
from synapse.http.endpoint import SpiderEndpoint
from canonicaljson import encode_canonical_json from canonicaljson import encode_canonical_json
from twisted.internet import defer, reactor, ssl, protocol from twisted.internet import defer, reactor, ssl, protocol
from twisted.internet.endpoints import SSL4ClientEndpoint, TCP4ClientEndpoint
from twisted.web.client import ( from twisted.web.client import (
BrowserLikeRedirectAgent, ContentDecoderAgent, GzipDecoder, Agent, BrowserLikeRedirectAgent, ContentDecoderAgent, GzipDecoder, Agent,
readBody, FileBodyProducer, PartialDownloadError, readBody, FileBodyProducer, PartialDownloadError,
@ -364,6 +366,35 @@ class CaptchaServerHttpClient(SimpleHttpClient):
defer.returnValue(e.response) defer.returnValue(e.response)
class SpiderEndpointFactory(object):
def __init__(self, hs):
self.blacklist = hs.config.url_preview_ip_range_blacklist
self.policyForHTTPS = hs.get_http_client_context_factory()
def endpointForURI(self, uri):
logger.info("Getting endpoint for %s", uri.toBytes())
if uri.scheme == "http":
return SpiderEndpoint(
reactor, uri.host, uri.port, self.blacklist,
endpoint=TCP4ClientEndpoint,
endpoint_kw_args={
'timeout': 15
},
)
elif uri.scheme == "https":
tlsPolicy = self.policyForHTTPS.creatorForNetloc(uri.host, uri.port)
return SpiderEndpoint(
reactor, uri.host, uri.port, self.blacklist,
endpoint=SSL4ClientEndpoint,
endpoint_kw_args={
'sslContextFactory': tlsPolicy,
'timeout': 15
},
)
else:
logger.warn("Can't get endpoint for unrecognised scheme %s", uri.scheme)
class SpiderHttpClient(SimpleHttpClient): class SpiderHttpClient(SimpleHttpClient):
""" """
Separate HTTP client for spidering arbitrary URLs. Separate HTTP client for spidering arbitrary URLs.
@ -375,11 +406,14 @@ class SpiderHttpClient(SimpleHttpClient):
def __init__(self, hs): def __init__(self, hs):
SimpleHttpClient.__init__(self, hs) SimpleHttpClient.__init__(self, hs)
# clobber the base class's agent and UA: # clobber the base class's agent and UA:
self.agent = ContentDecoderAgent(BrowserLikeRedirectAgent(Agent( self.agent = ContentDecoderAgent(
reactor, BrowserLikeRedirectAgent(
connectTimeout=15, Agent.usingEndpointFactory(
contextFactory=hs.get_http_client_context_factory() reactor,
)), [('gzip', GzipDecoder)]) SpiderEndpointFactory(hs)
)
), [('gzip', GzipDecoder)]
)
# We could look like Chrome: # We could look like Chrome:
# self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) # self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko)
# Chrome Safari" % hs.version_string) # Chrome Safari" % hs.version_string)

View file

@ -74,6 +74,37 @@ def matrix_federation_endpoint(reactor, destination, ssl_context_factory=None,
return transport_endpoint(reactor, domain, port, **endpoint_kw_args) return transport_endpoint(reactor, domain, port, **endpoint_kw_args)
class SpiderEndpoint(object):
"""An endpoint which refuses to connect to blacklisted IP addresses
Implements twisted.internet.interfaces.IStreamClientEndpoint.
"""
def __init__(self, reactor, host, port, blacklist,
endpoint=TCP4ClientEndpoint, endpoint_kw_args={}):
self.reactor = reactor
self.host = host
self.port = port
self.blacklist = blacklist
self.endpoint = endpoint
self.endpoint_kw_args = endpoint_kw_args
@defer.inlineCallbacks
def connect(self, protocolFactory):
address = yield self.reactor.resolve(self.host)
from netaddr import IPAddress
if IPAddress(address) in self.blacklist:
raise ConnectError(
"Refusing to spider blacklisted IP address %s" % address
)
logger.info("Connecting to %s:%s", address, self.port)
endpoint = self.endpoint(
self.reactor, address, self.port, **self.endpoint_kw_args
)
connection = yield endpoint.connect(protocolFactory)
defer.returnValue(connection)
class SRVClientEndpoint(object): class SRVClientEndpoint(object):
"""An endpoint which looks up SRV records for a service. """An endpoint which looks up SRV records for a service.
Cycles through the list of servers starting with each call to connect Cycles through the list of servers starting with each call to connect
@ -118,7 +149,7 @@ class SRVClientEndpoint(object):
return self.default_server return self.default_server
else: else:
raise ConnectError( raise ConnectError(
"Not server available for %s", self.service_name "Not server available for %s" % self.service_name
) )
min_priority = self.servers[0].priority min_priority = self.servers[0].priority
@ -166,7 +197,7 @@ def resolve_service(service_name, dns_client=client, cache=SERVER_CACHE):
and answers[0].type == dns.SRV and answers[0].type == dns.SRV
and answers[0].payload and answers[0].payload
and answers[0].payload.target == dns.Name('.')): and answers[0].payload.target == dns.Name('.')):
raise ConnectError("Service %s unavailable", service_name) raise ConnectError("Service %s unavailable" % service_name)
for answer in answers: for answer in answers:
if answer.type != dns.SRV or not answer.payload: if answer.type != dns.SRV or not answer.payload:

View file

@ -36,13 +36,16 @@ REQUIREMENTS = {
"blist": ["blist"], "blist": ["blist"],
"pysaml2>=3.0.0,<4.0.0": ["saml2>=3.0.0,<4.0.0"], "pysaml2>=3.0.0,<4.0.0": ["saml2>=3.0.0,<4.0.0"],
"pymacaroons-pynacl": ["pymacaroons"], "pymacaroons-pynacl": ["pymacaroons"],
"lxml>=3.6.0": ["lxml"],
"pyjwt": ["jwt"], "pyjwt": ["jwt"],
} }
CONDITIONAL_REQUIREMENTS = { CONDITIONAL_REQUIREMENTS = {
"web_client": { "web_client": {
"matrix_angular_sdk>=0.6.8": ["syweb>=0.6.8"], "matrix_angular_sdk>=0.6.8": ["syweb>=0.6.8"],
} },
"preview_url": {
"lxml>=3.6.0": ["lxml"],
"netaddr>=0.7.18": ["netaddr"],
},
} }

View file

@ -79,4 +79,9 @@ class MediaRepositoryResource(Resource):
self.putChild("download", DownloadResource(hs, filepaths)) self.putChild("download", DownloadResource(hs, filepaths))
self.putChild("thumbnail", ThumbnailResource(hs, filepaths)) self.putChild("thumbnail", ThumbnailResource(hs, filepaths))
self.putChild("identicon", IdenticonResource()) self.putChild("identicon", IdenticonResource())
self.putChild("preview_url", PreviewUrlResource(hs, filepaths)) if hs.config.url_preview_enabled:
try:
self.putChild("preview_url", PreviewUrlResource(hs, filepaths))
except Exception as e:
logger.warn("Failed to mount preview_url")
logger.exception(e)

View file

@ -17,34 +17,52 @@ from .base_resource import BaseMediaResource
from twisted.web.server import NOT_DONE_YET from twisted.web.server import NOT_DONE_YET
from twisted.internet import defer from twisted.internet import defer
from lxml import html from urlparse import urlparse, urlsplit, urlunparse
from urlparse import urlparse, urlunparse
from synapse.api.errors import Codes
from synapse.util.stringutils import random_string from synapse.util.stringutils import random_string
from synapse.util.caches.expiringcache import ExpiringCache from synapse.util.caches.expiringcache import ExpiringCache
from synapse.http.client import SpiderHttpClient from synapse.http.client import SpiderHttpClient
from synapse.http.server import ( from synapse.http.server import (
request_handler, respond_with_json, respond_with_json_bytes request_handler, respond_with_json_bytes
) )
from synapse.util.async import ObservableDeferred from synapse.util.async import ObservableDeferred
from synapse.util.stringutils import is_ascii from synapse.util.stringutils import is_ascii
import os import os
import re import re
import fnmatch
import cgi import cgi
import ujson as json import ujson as json
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
try:
from lxml import html
except ImportError:
pass
class PreviewUrlResource(BaseMediaResource): class PreviewUrlResource(BaseMediaResource):
isLeaf = True isLeaf = True
def __init__(self, hs, filepaths): def __init__(self, hs, filepaths):
if not html:
logger.warn("Disabling PreviewUrlResource as lxml not available")
raise
if not hasattr(hs.config, "url_preview_ip_range_blacklist"):
logger.warn(
"For security, you must specify an explicit target IP address "
"blacklist in url_preview_ip_range_blacklist for url previewing "
"to work"
)
raise
BaseMediaResource.__init__(self, hs, filepaths) BaseMediaResource.__init__(self, hs, filepaths)
self.client = SpiderHttpClient(hs) self.client = SpiderHttpClient(hs)
if hasattr(hs.config, "url_preview_url_blacklist"):
self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist
# simple memory cache mapping urls to OG metadata # simple memory cache mapping urls to OG metadata
self.cache = ExpiringCache( self.cache = ExpiringCache(
@ -74,6 +92,36 @@ class PreviewUrlResource(BaseMediaResource):
else: else:
ts = self.clock.time_msec() ts = self.clock.time_msec()
# impose the URL pattern blacklist
if hasattr(self, "url_preview_url_blacklist"):
url_tuple = urlsplit(url)
for entry in self.url_preview_url_blacklist:
match = True
for attrib in entry:
pattern = entry[attrib]
value = getattr(url_tuple, attrib)
logger.debug("Matching attrib '%s' with value '%s' against pattern '%s'" % (
attrib, value, pattern
))
if value is None:
match = False
continue
if pattern.startswith('^'):
if not re.match(pattern, getattr(url_tuple, attrib)):
match = False
continue
else:
if not fnmatch.fnmatch(getattr(url_tuple, attrib), pattern):
match = False
continue
if match:
logger.warn(
"URL %s blocked by url_blacklist entry %s", url, entry
)
raise
# first check the memory cache - good to handle all the clients on this # first check the memory cache - good to handle all the clients on this
# HS thundering away to preview the same URL at the same time. # HS thundering away to preview the same URL at the same time.
try: try:
@ -177,17 +225,6 @@ class PreviewUrlResource(BaseMediaResource):
respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
except: except:
# XXX: if we don't explicitly respond here, the request never returns.
# isn't this what server.py's wrapper is meant to be doing for us?
respond_with_json(
request,
500,
{
"error": "Internal server error",
"errcode": Codes.UNKNOWN,
},
send_cors=True
)
raise raise
@defer.inlineCallbacks @defer.inlineCallbacks
@ -282,8 +319,12 @@ class PreviewUrlResource(BaseMediaResource):
if meta_description: if meta_description:
og['og:description'] = meta_description[0] og['og:description'] = meta_description[0]
else: else:
# text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | " # grab any text nodes which are inside the <body/> tag...
# "//p/text() | //div/text() | //span/text() | //a/text()") # unless they are within an HTML5 semantic markup tag...
# <header/>, <nav/>, <aside/>, <footer/>
# ...or if they are within a <script/> or <style/> tag.
# This is a very very very coarse approximation to a plain text
# render of the page.
text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | " text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | "
"ancestor::aside | ancestor::footer | " "ancestor::aside | ancestor::footer | "
"ancestor::script | ancestor::style)]" + "ancestor::script | ancestor::style)]" +