Add url_preview_ip_range_blacklist to let admins specify internal IP ranges that must not be spidered. Add url_preview_url_blacklist to let admins specify URL patterns that must not be spidered. Implement a custom SpiderEndpoint and associated support classes to enforce url_preview_ip_range_blacklist. Add commentary and generally address PR feedback.
@@ -16,6 +16,8 @@
from ._base import Config
from collections import namedtuple
import sys

ThumbnailRequirement = namedtuple(
    "ThumbnailRequirement", ["width", "height", "method", "media_type"]
)
@@ -23,7 +25,7 @@ ThumbnailRequirement = namedtuple(
def parse_thumbnail_requirements(thumbnail_sizes):
    """ Takes a list of dictionaries with "width", "height", and "method" keys
    and creates a map from image media types to the thumbnail size, thumnailing
    and creates a map from image media types to the thumbnail size, thumbnailing
    method, and thumbnail media type to precalculate

    Args:
@@ -60,6 +62,18 @@ class ContentRepositoryConfig(Config):
        self.thumbnail_requirements = parse_thumbnail_requirements(
            config["thumbnail_sizes"]
        )
        self.url_preview_enabled = config["url_preview_enabled"]
        if self.url_preview_enabled:
            try:
                from netaddr import IPSet
                if "url_preview_ip_range_blacklist" in config:
                    self.url_preview_ip_range_blacklist = IPSet(
                        config["url_preview_ip_range_blacklist"]
                    )
                if "url_preview_url_blacklist" in config:
                    self.url_preview_url_blacklist = config["url_preview_url_blacklist"]
            except ImportError:
                sys.stderr.write("\nmissing netaddr dep - disabling preview_url API\n")

    def default_config(self, **kwargs):
        media_store = self.default_path("media_store")
@@ -74,9 +88,6 @@ class ContentRepositoryConfig(Config):
        # The largest allowed upload size in bytes
        max_upload_size: "10M"

        # The largest allowed URL preview spidering size in bytes
        max_spider_size: "10M"

        # Maximum number of pixels that will be thumbnailed
        max_image_pixels: "32M"
@@ -104,4 +115,62 @@ class ContentRepositoryConfig(Config):
        - width: 800
          height: 600
          method: scale

        # Is the preview URL API enabled? If enabled, you *must* specify
        # an explicit url_preview_ip_range_blacklist of IPs that the spider is
        # denied from accessing.
        url_preview_enabled: False

        # List of IP address CIDR ranges that the URL preview spider is denied
        # from accessing. There are no defaults: you must explicitly
        # specify a list for URL previewing to work. You should specify any
        # internal services in your network that you do not want synapse to try
        # to connect to, otherwise anyone in any Matrix room could cause your
        # synapse to issue arbitrary GET requests to your internal services,
        # causing serious security issues.
        #
        # url_preview_ip_range_blacklist:
        # - '127.0.0.0/8'
        # - '10.0.0.0/8'
        # - '172.16.0.0/12'
        # - '192.168.0.0/16'

        # Optional list of URL matches that the URL preview spider is
        # denied from accessing. You should use url_preview_ip_range_blacklist
        # in preference to this, otherwise someone could define a public DNS
        # entry that points to a private IP address and circumvent the blacklist.
        # This is more useful if there is an entire shape of URL that you know
        # you will never want synapse to try to spider.
        #
        # Each list entry is a dictionary of url component attributes as returned
        # by urlparse.urlsplit as applied to the absolute form of the URL. See
        # https://docs.python.org/2/library/urlparse.html#urlparse.urlsplit
        # The values of the dictionary are treated as a filename match pattern
        # applied to that component of URLs, unless they start with a ^ in which
        # case they are treated as a regular expression match. If all the
        # specified component matches for a given list item succeed, the URL is
        # blacklisted.
        #
        # url_preview_url_blacklist:
        # # blacklist any URL with a username in its URI
        # - username: '*'
        #
        # # blacklist all *.google.com URLs
        # - netloc: 'google.com'
        # - netloc: '*.google.com'
        #
        # # blacklist all plain HTTP URLs
        # - scheme: 'http'
        #
        # # blacklist http(s)://www.acme.com/foo
        # - netloc: 'www.acme.com'
        #   path: '/foo'
        #
        # # blacklist any URL with a literal IPv4 address
        # - netloc: '^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$'

        # The largest allowed URL preview spidering size in bytes
        max_spider_size: "10M"
        """ % locals()
@@ -20,10 +20,12 @@ from synapse.api.errors import (
)
from synapse.util.logcontext import preserve_context_over_fn
import synapse.metrics
from synapse.http.endpoint import SpiderEndpoint

from canonicaljson import encode_canonical_json

from twisted.internet import defer, reactor, ssl, protocol
from twisted.internet.endpoints import SSL4ClientEndpoint, TCP4ClientEndpoint
from twisted.web.client import (
    BrowserLikeRedirectAgent, ContentDecoderAgent, GzipDecoder, Agent,
    readBody, FileBodyProducer, PartialDownloadError,
@@ -364,6 +366,35 @@ class CaptchaServerHttpClient(SimpleHttpClient):
        defer.returnValue(e.response)

class SpiderEndpointFactory(object):
    """Creates SpiderEndpoints, so that every outbound spider connection is
    checked against the configured IP range blacklist."""

    def __init__(self, hs):
        self.blacklist = hs.config.url_preview_ip_range_blacklist
        self.policyForHTTPS = hs.get_http_client_context_factory()

    def endpointForURI(self, uri):
        logger.info("Getting endpoint for %s", uri.toBytes())
        if uri.scheme == "http":
            return SpiderEndpoint(
                reactor, uri.host, uri.port, self.blacklist,
                endpoint=TCP4ClientEndpoint,
                endpoint_kw_args={
                    'timeout': 15
                },
            )
        elif uri.scheme == "https":
            tlsPolicy = self.policyForHTTPS.creatorForNetloc(uri.host, uri.port)
            return SpiderEndpoint(
                reactor, uri.host, uri.port, self.blacklist,
                endpoint=SSL4ClientEndpoint,
                endpoint_kw_args={
                    'sslContextFactory': tlsPolicy,
                    'timeout': 15
                },
            )
        else:
            logger.warn("Can't get endpoint for unrecognised scheme %s", uri.scheme)

class SpiderHttpClient(SimpleHttpClient):
    """
    Separate HTTP client for spidering arbitrary URLs.
@@ -375,11 +406,14 @@ class SpiderHttpClient(SimpleHttpClient):
    def __init__(self, hs):
        SimpleHttpClient.__init__(self, hs)
        # clobber the base class's agent and UA:
        self.agent = ContentDecoderAgent(BrowserLikeRedirectAgent(Agent(
            reactor,
            connectTimeout=15,
            contextFactory=hs.get_http_client_context_factory()
        )), [('gzip', GzipDecoder)])
        self.agent = ContentDecoderAgent(
            BrowserLikeRedirectAgent(
                Agent.usingEndpointFactory(
                    reactor,
                    SpiderEndpointFactory(hs)
                )
            ), [('gzip', GzipDecoder)]
        )
        # We could look like Chrome:
        # self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko)
        #   Chrome Safari" % hs.version_string)
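
A note on this design choice: Agent.usingEndpointFactory (Twisted 14.0+) consults the factory's endpointForURI for the initial request and again for each redirect that BrowserLikeRedirectAgent follows, so redirect targets get the same IP blacklist check. A rough sketch of the wiring, assuming hs is a configured HomeServer whose blacklist covers 10.0.0.0/8:

from twisted.internet import reactor
from twisted.web.client import Agent, BrowserLikeRedirectAgent

# Every connection attempt - including redirect hops - is routed through
# SpiderEndpointFactory.endpointForURI, and hence through SpiderEndpoint's
# blacklist check.
agent = BrowserLikeRedirectAgent(
    Agent.usingEndpointFactory(reactor, SpiderEndpointFactory(hs))
)
d = agent.request('GET', 'http://10.1.2.3/admin')
# The deferred errbacks with ConnectError("Refusing to spider blacklisted
# IP address 10.1.2.3") before any bytes are sent.
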
@@ -74,6 +74,37 @@ def matrix_federation_endpoint(reactor, destination, ssl_context_factory=None,
    return transport_endpoint(reactor, domain, port, **endpoint_kw_args)

class SpiderEndpoint(object):
    """An endpoint which refuses to connect to blacklisted IP addresses.
    Implements twisted.internet.interfaces.IStreamClientEndpoint.
    """
    def __init__(self, reactor, host, port, blacklist,
                 endpoint=TCP4ClientEndpoint, endpoint_kw_args={}):
        self.reactor = reactor
        self.host = host
        self.port = port
        self.blacklist = blacklist
        self.endpoint = endpoint
        self.endpoint_kw_args = endpoint_kw_args

    @defer.inlineCallbacks
    def connect(self, protocolFactory):
        address = yield self.reactor.resolve(self.host)

        from netaddr import IPAddress
        if IPAddress(address) in self.blacklist:
            raise ConnectError(
                "Refusing to spider blacklisted IP address %s" % address
            )

        logger.info("Connecting to %s:%s", address, self.port)
        endpoint = self.endpoint(
            self.reactor, address, self.port, **self.endpoint_kw_args
        )
        connection = yield endpoint.connect(protocolFactory)
        defer.returnValue(connection)
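
The check in connect() relies on netaddr's containment test: IPSet holds the configured CIDR ranges, and `IPAddress(x) in ipset` is True when the resolved address falls inside any of them. For example:

from netaddr import IPAddress, IPSet

blacklist = IPSet(['127.0.0.0/8', '10.0.0.0/8', '172.16.0.0/12', '192.168.0.0/16'])
print IPAddress('10.1.2.3') in blacklist       # True: connect() refuses
print IPAddress('93.184.216.34') in blacklist  # False: connection proceeds
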

class SRVClientEndpoint(object):
    """An endpoint which looks up SRV records for a service.
    Cycles through the list of servers starting with each call to connect
@@ -118,7 +149,7 @@ class SRVClientEndpoint(object):
                return self.default_server
            else:
                raise ConnectError(
                    "Not server available for %s", self.service_name
                    "No server available for %s" % self.service_name
                )

        min_priority = self.servers[0].priority
@@ -166,7 +197,7 @@ def resolve_service(service_name, dns_client=client, cache=SERVER_CACHE):
            and answers[0].type == dns.SRV
            and answers[0].payload
            and answers[0].payload.target == dns.Name('.')):
        raise ConnectError("Service %s unavailable", service_name)
        raise ConnectError("Service %s unavailable" % service_name)

    for answer in answers:
        if answer.type != dns.SRV or not answer.payload:
@@ -36,13 +36,16 @@ REQUIREMENTS = {
    "blist": ["blist"],
    "pysaml2>=3.0.0,<4.0.0": ["saml2>=3.0.0,<4.0.0"],
    "pymacaroons-pynacl": ["pymacaroons"],
    "lxml>=3.6.0": ["lxml"],
    "pyjwt": ["jwt"],
}

CONDITIONAL_REQUIREMENTS = {
    "web_client": {
        "matrix_angular_sdk>=0.6.8": ["syweb>=0.6.8"],
    }
    },
    "preview_url": {
        "lxml>=3.6.0": ["lxml"],
        "netaddr>=0.7.18": ["netaddr"],
    },
}
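
With lxml and netaddr moved into the optional preview_url group, a deployment that never enables URL previews no longer needs them installed. As a rough illustration (not synapse's actual startup check), verifying that the optional group is importable might look like:

import importlib

# mirrors the modules listed in CONDITIONAL_REQUIREMENTS["preview_url"] above
PREVIEW_URL_MODULES = ['lxml', 'netaddr']

def preview_url_deps_available():
    """Return True if every optional url-preview dependency imports."""
    for module in PREVIEW_URL_MODULES:
        try:
            importlib.import_module(module)
        except ImportError:
            return False
    return True

print preview_url_deps_available()
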
@@ -79,4 +79,9 @@ class MediaRepositoryResource(Resource):
        self.putChild("download", DownloadResource(hs, filepaths))
        self.putChild("thumbnail", ThumbnailResource(hs, filepaths))
        self.putChild("identicon", IdenticonResource())
        self.putChild("preview_url", PreviewUrlResource(hs, filepaths))
        if hs.config.url_preview_enabled:
            try:
                self.putChild("preview_url", PreviewUrlResource(hs, filepaths))
            except Exception as e:
                logger.warn("Failed to mount preview_url")
                logger.exception(e)
@@ -17,34 +17,52 @@ from .base_resource import BaseMediaResource
from twisted.web.server import NOT_DONE_YET
from twisted.internet import defer
from lxml import html
from urlparse import urlparse, urlunparse
from urlparse import urlparse, urlsplit, urlunparse
from synapse.api.errors import Codes, SynapseError
from synapse.util.stringutils import random_string
from synapse.util.caches.expiringcache import ExpiringCache
from synapse.http.client import SpiderHttpClient
from synapse.http.server import (
    request_handler, respond_with_json, respond_with_json_bytes
    request_handler, respond_with_json_bytes
)
from synapse.util.async import ObservableDeferred
from synapse.util.stringutils import is_ascii

import os
import re
import fnmatch
import cgi
import ujson as json
import logging

logger = logging.getLogger(__name__)

try:
    from lxml import html
except ImportError:
    html = None  # checked in PreviewUrlResource.__init__ before use

class PreviewUrlResource(BaseMediaResource):
    isLeaf = True

    def __init__(self, hs, filepaths):
        if not html:
            logger.warn("Disabling PreviewUrlResource as lxml not available")
            raise RuntimeError("Disabling PreviewUrlResource as lxml not available")

        if not hasattr(hs.config, "url_preview_ip_range_blacklist"):
            logger.warn(
                "For security, you must specify an explicit target IP address "
                "blacklist in url_preview_ip_range_blacklist for url previewing "
                "to work"
            )
            raise RuntimeError(
                "url_preview_ip_range_blacklist is required when URL previews "
                "are enabled"
            )

        BaseMediaResource.__init__(self, hs, filepaths)
        self.client = SpiderHttpClient(hs)
        if hasattr(hs.config, "url_preview_url_blacklist"):
            self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist

        # simple memory cache mapping urls to OG metadata
        self.cache = ExpiringCache(
@@ -74,6 +92,36 @@ class PreviewUrlResource(BaseMediaResource):
        else:
            ts = self.clock.time_msec()

        # impose the URL pattern blacklist
        if hasattr(self, "url_preview_url_blacklist"):
            url_tuple = urlsplit(url)
            for entry in self.url_preview_url_blacklist:
                match = True
                for attrib in entry:
                    pattern = entry[attrib]
                    value = getattr(url_tuple, attrib)
                    logger.debug(
                        "Matching attrib '%s' with value '%s' against pattern '%s'",
                        attrib, value, pattern
                    )

                    if value is None:
                        match = False
                        continue

                    if pattern.startswith('^'):
                        if not re.match(pattern, value):
                            match = False
                            continue
                    else:
                        if not fnmatch.fnmatch(value, pattern):
                            match = False
                            continue
                if match:
                    logger.warn(
                        "URL %s blocked by url_blacklist entry %s", url, entry
                    )
                    raise SynapseError(
                        403, "URL blocked by url pattern blacklist entry",
                        Codes.UNKNOWN
                    )

        # first check the memory cache - good to handle all the clients on this
        # HS thundering away to preview the same URL at the same time.
        try:
@@ -177,17 +225,6 @@ class PreviewUrlResource(BaseMediaResource):
            respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True)
        except:
            # XXX: if we don't explicitly respond here, the request never returns.
            # isn't this what server.py's wrapper is meant to be doing for us?
            respond_with_json(
                request,
                500,
                {
                    "error": "Internal server error",
                    "errcode": Codes.UNKNOWN,
                },
                send_cors=True
            )
            raise

    @defer.inlineCallbacks
@@ -282,8 +319,12 @@ class PreviewUrlResource(BaseMediaResource):
            if meta_description:
                og['og:description'] = meta_description[0]
            else:
                # text_nodes = tree.xpath("//h1/text() | //h2/text() | //h3/text() | "
                #                         "//p/text() | //div/text() | //span/text() | //a/text()")
                # grab any text nodes which are inside the <body/> tag...
                # unless they are within an HTML5 semantic markup tag...
                # <header/>, <nav/>, <aside/>, <footer/>
                # ...or if they are within a <script/> or <style/> tag.
                # This is a very very very coarse approximation to a plain text
                # render of the page.
                text_nodes = tree.xpath("//text()[not(ancestor::header | ancestor::nav | "
                                        "ancestor::aside | ancestor::footer | "
                                        "ancestor::script | ancestor::style)]" +