You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

1156 lines
39 KiB

  1. # Copyright 2014-2016 OpenMarket Ltd
  2. # Copyright 2018 New Vector Ltd
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import logging
  16. import urllib.parse
  17. from http import HTTPStatus
  18. from io import BytesIO
  19. from typing import (
  20. TYPE_CHECKING,
  21. Any,
  22. BinaryIO,
  23. Callable,
  24. Dict,
  25. List,
  26. Mapping,
  27. Optional,
  28. Tuple,
  29. Union,
  30. )
  31. import treq
  32. from canonicaljson import encode_canonical_json
  33. from netaddr import AddrFormatError, IPAddress, IPSet
  34. from prometheus_client import Counter
  35. from typing_extensions import Protocol
  36. from zope.interface import implementer, provider
  37. from OpenSSL import SSL
  38. from OpenSSL.SSL import VERIFY_NONE
  39. from twisted.internet import defer, error as twisted_error, protocol, ssl
  40. from twisted.internet.address import IPv4Address, IPv6Address
  41. from twisted.internet.interfaces import (
  42. IAddress,
  43. IDelayedCall,
  44. IHostResolution,
  45. IOpenSSLContextFactory,
  46. IReactorCore,
  47. IReactorPluggableNameResolver,
  48. IReactorTime,
  49. IResolutionReceiver,
  50. ITCPTransport,
  51. )
  52. from twisted.internet.protocol import connectionDone
  53. from twisted.internet.task import Cooperator
  54. from twisted.python.failure import Failure
  55. from twisted.web._newclient import ResponseDone
  56. from twisted.web.client import (
  57. Agent,
  58. HTTPConnectionPool,
  59. ResponseNeverReceived,
  60. readBody,
  61. )
  62. from twisted.web.http import PotentialDataLoss
  63. from twisted.web.http_headers import Headers
  64. from twisted.web.iweb import (
  65. UNKNOWN_LENGTH,
  66. IAgent,
  67. IBodyProducer,
  68. IPolicyForHTTPS,
  69. IResponse,
  70. )
  71. from synapse.api.errors import Codes, HttpResponseException, SynapseError
  72. from synapse.http import QuieterFileBodyProducer, RequestTimedOutError, redact_uri
  73. from synapse.http.proxyagent import ProxyAgent
  74. from synapse.http.replicationagent import ReplicationAgent
  75. from synapse.http.types import QueryParams
  76. from synapse.logging.context import make_deferred_yieldable, run_in_background
  77. from synapse.logging.opentracing import set_tag, start_active_span, tags
  78. from synapse.types import ISynapseReactor
  79. from synapse.util import json_decoder
  80. from synapse.util.async_helpers import timeout_deferred
  81. if TYPE_CHECKING:
  82. from synapse.server import HomeServer
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Prometheus counter of outgoing HTTP client requests, labelled by HTTP method.
outgoing_requests_counter = Counter("synapse_http_client_requests", "", ["method"])
# Prometheus counter of the outcomes of those requests, labelled by HTTP method
# and response code (the request methods in this file record "ERR" as the code
# when the request raised instead of producing a response).
incoming_responses_counter = Counter(
    "synapse_http_client_responses", "", ["method", "code"]
)
# The type of the headers map, to be passed to the t.w.h.Headers.
#
# The actual type accepted by Twisted is
#   Mapping[Union[str, bytes], Sequence[Union[str, bytes]]],
# allowing us to mix and match str and bytes freely. However: any str is also a
# Sequence[str]; passing a header value which is a standalone str is therefore
# interpreted as a sequence of 1-codepoint strings. This is a disastrous footgun.
# We use a narrower value type (RawHeaderValue) to avoid this footgun.
#
# We also simplify the keys to be either all str or all bytes. This helps because
# Dict[K, V] is invariant in K (and indeed V).
RawHeaders = Union[Mapping[str, "RawHeaderValue"], Mapping[bytes, "RawHeaderValue"]]


# The value actually has to be a List, but List is invariant, so we can't write a
# single List type whose entries may each be str or bytes (NOTE(review): the
# original comment said "Lists or bytes"; str-or-bytes entries is presumably what
# was meant). Instead each accepted list/tuple shape is enumerated explicitly.
RawHeaderValue = Union[
    List[str],
    List[bytes],
    List[Union[str, bytes]],
    Tuple[str, ...],
    Tuple[bytes, ...],
    Tuple[Union[str, bytes], ...],
]
  110. def _is_ip_blocked(
  111. ip_address: IPAddress, allowlist: Optional[IPSet], blocklist: IPSet
  112. ) -> bool:
  113. """
  114. Compares an IP address to allowed and disallowed IP sets.
  115. Args:
  116. ip_address: The IP address to check
  117. allowlist: Allowed IP addresses.
  118. blocklist: Disallowed IP addresses.
  119. Returns:
  120. True if the IP address is in the blocklist and not in the allowlist.
  121. """
  122. if ip_address in blocklist:
  123. if allowlist is None or ip_address not in allowlist:
  124. return True
  125. return False
  126. _EPSILON = 0.00000001
  127. def _make_scheduler(
  128. reactor: IReactorTime,
  129. ) -> Callable[[Callable[[], object]], IDelayedCall]:
  130. """Makes a schedular suitable for a Cooperator using the given reactor.
  131. (This is effectively just a copy from `twisted.internet.task`)
  132. """
  133. def _scheduler(x: Callable[[], object]) -> IDelayedCall:
  134. return reactor.callLater(_EPSILON, x)
  135. return _scheduler
  136. class _IPBlockingResolver:
  137. """
  138. A proxy for reactor.nameResolver which only produces non-blocklisted IP
  139. addresses, preventing DNS rebinding attacks.
  140. """
  141. def __init__(
  142. self,
  143. reactor: IReactorPluggableNameResolver,
  144. ip_allowlist: Optional[IPSet],
  145. ip_blocklist: IPSet,
  146. ):
  147. """
  148. Args:
  149. reactor: The twisted reactor.
  150. ip_allowlist: IP addresses to allow.
  151. ip_blocklist: IP addresses to disallow.
  152. """
  153. self._reactor = reactor
  154. self._ip_allowlist = ip_allowlist
  155. self._ip_blocklist = ip_blocklist
  156. def resolveHostName(
  157. self, recv: IResolutionReceiver, hostname: str, portNumber: int = 0
  158. ) -> IResolutionReceiver:
  159. addresses: List[IAddress] = []
  160. def _callback() -> None:
  161. has_bad_ip = False
  162. for address in addresses:
  163. # We only expect IPv4 and IPv6 addresses since only A/AAAA lookups
  164. # should go through this path.
  165. if not isinstance(address, (IPv4Address, IPv6Address)):
  166. continue
  167. ip_address = IPAddress(address.host)
  168. if _is_ip_blocked(ip_address, self._ip_allowlist, self._ip_blocklist):
  169. logger.info(
  170. "Blocked %s from DNS resolution to %s" % (ip_address, hostname)
  171. )
  172. has_bad_ip = True
  173. # if we have a blocked IP, we'd like to raise an error to block the
  174. # request, but all we can really do from here is claim that there were no
  175. # valid results.
  176. if not has_bad_ip:
  177. for address in addresses:
  178. recv.addressResolved(address)
  179. recv.resolutionComplete()
  180. @provider(IResolutionReceiver)
  181. class EndpointReceiver:
  182. @staticmethod
  183. def resolutionBegan(resolutionInProgress: IHostResolution) -> None:
  184. recv.resolutionBegan(resolutionInProgress)
  185. @staticmethod
  186. def addressResolved(address: IAddress) -> None:
  187. addresses.append(address)
  188. @staticmethod
  189. def resolutionComplete() -> None:
  190. _callback()
  191. self._reactor.nameResolver.resolveHostName(
  192. EndpointReceiver, hostname, portNumber=portNumber
  193. )
  194. return recv
  195. # ISynapseReactor implies IReactorCore, but explicitly marking it this as an implementer
  196. # of IReactorCore seems to keep mypy-zope happier.
  197. @implementer(IReactorCore, ISynapseReactor)
  198. class BlocklistingReactorWrapper:
  199. """
  200. A Reactor wrapper which will prevent DNS resolution to blocked IP
  201. addresses, to prevent DNS rebinding.
  202. """
  203. def __init__(
  204. self,
  205. reactor: IReactorPluggableNameResolver,
  206. ip_allowlist: Optional[IPSet],
  207. ip_blocklist: IPSet,
  208. ):
  209. self._reactor = reactor
  210. # We need to use a DNS resolver which filters out blocked IP
  211. # addresses, to prevent DNS rebinding.
  212. self._nameResolver = _IPBlockingResolver(
  213. self._reactor, ip_allowlist, ip_blocklist
  214. )
  215. def __getattr__(self, attr: str) -> Any:
  216. # Passthrough to the real reactor except for the DNS resolver.
  217. if attr == "nameResolver":
  218. return self._nameResolver
  219. else:
  220. return getattr(self._reactor, attr)
  221. class BlocklistingAgentWrapper(Agent):
  222. """
  223. An Agent wrapper which will prevent access to IP addresses being accessed
  224. directly (without an IP address lookup).
  225. """
  226. def __init__(
  227. self,
  228. agent: IAgent,
  229. ip_blocklist: IPSet,
  230. ip_allowlist: Optional[IPSet] = None,
  231. ):
  232. """
  233. Args:
  234. agent: The Agent to wrap.
  235. ip_allowlist: IP addresses to allow.
  236. ip_blocklist: IP addresses to disallow.
  237. """
  238. self._agent = agent
  239. self._ip_allowlist = ip_allowlist
  240. self._ip_blocklist = ip_blocklist
  241. def request(
  242. self,
  243. method: bytes,
  244. uri: bytes,
  245. headers: Optional[Headers] = None,
  246. bodyProducer: Optional[IBodyProducer] = None,
  247. ) -> defer.Deferred:
  248. h = urllib.parse.urlparse(uri.decode("ascii"))
  249. try:
  250. # h.hostname is Optional[str], None raises an AddrFormatError, so
  251. # this is safe even though IPAddress requires a str.
  252. ip_address = IPAddress(h.hostname) # type: ignore[arg-type]
  253. except AddrFormatError:
  254. # Not an IP
  255. pass
  256. else:
  257. if _is_ip_blocked(ip_address, self._ip_allowlist, self._ip_blocklist):
  258. logger.info("Blocking access to %s" % (ip_address,))
  259. e = SynapseError(HTTPStatus.FORBIDDEN, "IP address blocked")
  260. return defer.fail(Failure(e))
  261. return self._agent.request(
  262. method, uri, headers=headers, bodyProducer=bodyProducer
  263. )
  264. class BaseHttpClient:
  265. """
  266. A simple, no-frills HTTP client with methods that wrap up common ways of
  267. using HTTP in Matrix. Does not come with a default Agent, subclasses will need to
  268. define their own.
  269. Args:
  270. hs: The HomeServer instance to pass in
  271. treq_args: Extra keyword arguments to be given to treq.request.
  272. """
  273. agent: IAgent
  274. def __init__(
  275. self,
  276. hs: "HomeServer",
  277. treq_args: Optional[Dict[str, Any]] = None,
  278. ):
  279. self.hs = hs
  280. self.reactor = hs.get_reactor()
  281. self._extra_treq_args = treq_args or {}
  282. self.clock = hs.get_clock()
  283. user_agent = hs.version_string
  284. if hs.config.server.user_agent_suffix:
  285. user_agent = "%s %s" % (
  286. user_agent,
  287. hs.config.server.user_agent_suffix,
  288. )
  289. self.user_agent = user_agent.encode("ascii")
  290. # We use this for our body producers to ensure that they use the correct
  291. # reactor.
  292. self._cooperator = Cooperator(scheduler=_make_scheduler(hs.get_reactor()))
  293. async def request(
  294. self,
  295. method: str,
  296. uri: str,
  297. data: Optional[bytes] = None,
  298. headers: Optional[Headers] = None,
  299. ) -> IResponse:
  300. """
  301. Args:
  302. method: HTTP method to use.
  303. uri: URI to query.
  304. data: Data to send in the request body, if applicable.
  305. headers: Request headers.
  306. Returns:
  307. Response object, once the headers have been read.
  308. Raises:
  309. RequestTimedOutError if the request times out before the headers are read
  310. """
  311. outgoing_requests_counter.labels(method).inc()
  312. # log request but strip `access_token` (AS requests for example include this)
  313. logger.debug("Sending request %s %s", method, redact_uri(uri))
  314. with start_active_span(
  315. "outgoing-client-request",
  316. tags={
  317. tags.SPAN_KIND: tags.SPAN_KIND_RPC_CLIENT,
  318. tags.HTTP_METHOD: method,
  319. tags.HTTP_URL: uri,
  320. },
  321. finish_on_close=True,
  322. ):
  323. try:
  324. body_producer = None
  325. if data is not None:
  326. body_producer = QuieterFileBodyProducer(
  327. BytesIO(data),
  328. cooperator=self._cooperator,
  329. )
  330. request_deferred: defer.Deferred = treq.request(
  331. method,
  332. uri,
  333. agent=self.agent,
  334. data=body_producer,
  335. headers=headers,
  336. # Avoid buffering the body in treq since we do not reuse
  337. # response bodies.
  338. unbuffered=True,
  339. **self._extra_treq_args,
  340. )
  341. # we use our own timeout mechanism rather than treq's as a workaround
  342. # for https://twistedmatrix.com/trac/ticket/9534.
  343. request_deferred = timeout_deferred(
  344. request_deferred,
  345. 60,
  346. self.hs.get_reactor(),
  347. )
  348. # turn timeouts into RequestTimedOutErrors
  349. request_deferred.addErrback(_timeout_to_request_timed_out_error)
  350. response = await make_deferred_yieldable(request_deferred)
  351. incoming_responses_counter.labels(method, response.code).inc()
  352. logger.info(
  353. "Received response to %s %s: %s",
  354. method,
  355. redact_uri(uri),
  356. response.code,
  357. )
  358. return response
  359. except Exception as e:
  360. incoming_responses_counter.labels(method, "ERR").inc()
  361. logger.info(
  362. "Error sending request to %s %s: %s %s",
  363. method,
  364. redact_uri(uri),
  365. type(e).__name__,
  366. e.args[0],
  367. )
  368. set_tag(tags.ERROR, True)
  369. set_tag("error_reason", e.args[0])
  370. raise
  371. async def post_urlencoded_get_json(
  372. self,
  373. uri: str,
  374. args: Optional[Mapping[str, Union[str, List[str]]]] = None,
  375. headers: Optional[RawHeaders] = None,
  376. ) -> Any:
  377. """
  378. Args:
  379. uri: uri to query
  380. args: parameters to be url-encoded in the body
  381. headers: a map from header name to a list of values for that header
  382. Returns:
  383. parsed json
  384. Raises:
  385. RequestTimedOutError: if there is a timeout before the response headers
  386. are received. Note there is currently no timeout on reading the response
  387. body.
  388. HttpResponseException: On a non-2xx HTTP response.
  389. ValueError: if the response was not JSON
  390. """
  391. # TODO: Do we ever want to log message contents?
  392. logger.debug("post_urlencoded_get_json args: %s", args)
  393. query_bytes = encode_query_args(args)
  394. actual_headers = {
  395. b"Content-Type": [b"application/x-www-form-urlencoded"],
  396. b"User-Agent": [self.user_agent],
  397. b"Accept": [b"application/json"],
  398. }
  399. if headers:
  400. actual_headers.update(headers) # type: ignore
  401. response = await self.request(
  402. "POST", uri, headers=Headers(actual_headers), data=query_bytes
  403. )
  404. body = await make_deferred_yieldable(readBody(response))
  405. if 200 <= response.code < 300:
  406. return json_decoder.decode(body.decode("utf-8"))
  407. else:
  408. raise HttpResponseException(
  409. response.code, response.phrase.decode("ascii", errors="replace"), body
  410. )
  411. async def post_json_get_json(
  412. self, uri: str, post_json: Any, headers: Optional[RawHeaders] = None
  413. ) -> Any:
  414. """
  415. Args:
  416. uri: URI to query.
  417. post_json: request body, to be encoded as json
  418. headers: a map from header name to a list of values for that header
  419. Returns:
  420. parsed json
  421. Raises:
  422. RequestTimedOutError: if there is a timeout before the response headers
  423. are received. Note there is currently no timeout on reading the response
  424. body.
  425. HttpResponseException: On a non-2xx HTTP response.
  426. ValueError: if the response was not JSON
  427. """
  428. json_str = encode_canonical_json(post_json)
  429. logger.debug("HTTP POST %s -> %s", json_str, uri)
  430. actual_headers = {
  431. b"Content-Type": [b"application/json"],
  432. b"User-Agent": [self.user_agent],
  433. b"Accept": [b"application/json"],
  434. }
  435. if headers:
  436. actual_headers.update(headers) # type: ignore
  437. response = await self.request(
  438. "POST", uri, headers=Headers(actual_headers), data=json_str
  439. )
  440. body = await make_deferred_yieldable(readBody(response))
  441. if 200 <= response.code < 300:
  442. return json_decoder.decode(body.decode("utf-8"))
  443. else:
  444. raise HttpResponseException(
  445. response.code, response.phrase.decode("ascii", errors="replace"), body
  446. )
  447. async def get_json(
  448. self,
  449. uri: str,
  450. args: Optional[QueryParams] = None,
  451. headers: Optional[RawHeaders] = None,
  452. ) -> Any:
  453. """Gets some json from the given URI.
  454. Args:
  455. uri: The URI to request, not including query parameters
  456. args: A dictionary used to create query string
  457. headers: a map from header name to a list of values for that header
  458. Returns:
  459. Succeeds when we get a 2xx HTTP response, with the HTTP body as JSON.
  460. Raises:
  461. RequestTimedOutError: if there is a timeout before the response headers
  462. are received. Note there is currently no timeout on reading the response
  463. body.
  464. HttpResponseException On a non-2xx HTTP response.
  465. ValueError: if the response was not JSON
  466. """
  467. actual_headers = {b"Accept": [b"application/json"]}
  468. if headers:
  469. actual_headers.update(headers) # type: ignore
  470. body = await self.get_raw(uri, args, headers=actual_headers)
  471. return json_decoder.decode(body.decode("utf-8"))
  472. async def put_json(
  473. self,
  474. uri: str,
  475. json_body: Any,
  476. args: Optional[QueryParams] = None,
  477. headers: Optional[RawHeaders] = None,
  478. ) -> Any:
  479. """Puts some json to the given URI.
  480. Args:
  481. uri: The URI to request, not including query parameters
  482. json_body: The JSON to put in the HTTP body,
  483. args: A dictionary used to create query strings
  484. headers: a map from header name to a list of values for that header
  485. Returns:
  486. Succeeds when we get a 2xx HTTP response, with the HTTP body as JSON.
  487. Raises:
  488. RequestTimedOutError: if there is a timeout before the response headers
  489. are received. Note there is currently no timeout on reading the response
  490. body.
  491. HttpResponseException On a non-2xx HTTP response.
  492. ValueError: if the response was not JSON
  493. """
  494. if args:
  495. query_str = urllib.parse.urlencode(args, True)
  496. uri = "%s?%s" % (uri, query_str)
  497. json_str = encode_canonical_json(json_body)
  498. actual_headers = {
  499. b"Content-Type": [b"application/json"],
  500. b"User-Agent": [self.user_agent],
  501. b"Accept": [b"application/json"],
  502. }
  503. if headers:
  504. actual_headers.update(headers) # type: ignore
  505. response = await self.request(
  506. "PUT", uri, headers=Headers(actual_headers), data=json_str
  507. )
  508. body = await make_deferred_yieldable(readBody(response))
  509. if 200 <= response.code < 300:
  510. return json_decoder.decode(body.decode("utf-8"))
  511. else:
  512. raise HttpResponseException(
  513. response.code, response.phrase.decode("ascii", errors="replace"), body
  514. )
  515. async def get_raw(
  516. self,
  517. uri: str,
  518. args: Optional[QueryParams] = None,
  519. headers: Optional[RawHeaders] = None,
  520. ) -> bytes:
  521. """Gets raw text from the given URI.
  522. Args:
  523. uri: The URI to request, not including query parameters
  524. args: A dictionary used to create query strings
  525. headers: a map from header name to a list of values for that header
  526. Returns:
  527. Succeeds when we get a 2xx HTTP response, with the
  528. HTTP body as bytes.
  529. Raises:
  530. RequestTimedOutError: if there is a timeout before the response headers
  531. are received. Note there is currently no timeout on reading the response
  532. body.
  533. HttpResponseException on a non-2xx HTTP response.
  534. """
  535. if args:
  536. query_str = urllib.parse.urlencode(args, True)
  537. uri = "%s?%s" % (uri, query_str)
  538. actual_headers = {b"User-Agent": [self.user_agent]}
  539. if headers:
  540. actual_headers.update(headers) # type: ignore
  541. response = await self.request("GET", uri, headers=Headers(actual_headers))
  542. body = await make_deferred_yieldable(readBody(response))
  543. if 200 <= response.code < 300:
  544. return body
  545. else:
  546. raise HttpResponseException(
  547. response.code, response.phrase.decode("ascii", errors="replace"), body
  548. )
  549. # XXX: FIXME: This is horribly copy-pasted from matrixfederationclient.
  550. # The two should be factored out.
  551. async def get_file(
  552. self,
  553. url: str,
  554. output_stream: BinaryIO,
  555. max_size: Optional[int] = None,
  556. headers: Optional[RawHeaders] = None,
  557. is_allowed_content_type: Optional[Callable[[str], bool]] = None,
  558. ) -> Tuple[int, Dict[bytes, List[bytes]], str, int]:
  559. """GETs a file from a given URL
  560. Args:
  561. url: The URL to GET
  562. output_stream: File to write the response body to.
  563. headers: A map from header name to a list of values for that header
  564. is_allowed_content_type: A predicate to determine whether the
  565. content type of the file we're downloading is allowed. If set and
  566. it evaluates to False when called with the content type, the
  567. request will be terminated before completing the download by
  568. raising SynapseError.
  569. Returns:
  570. A tuple of the file length, dict of the response
  571. headers, absolute URI of the response and HTTP response code.
  572. Raises:
  573. RequestTimedOutError: if there is a timeout before the response headers
  574. are received. Note there is currently no timeout on reading the response
  575. body.
  576. SynapseError: if the response is not a 2xx, the remote file is too large, or
  577. another exception happens during the download.
  578. """
  579. actual_headers = {b"User-Agent": [self.user_agent]}
  580. if headers:
  581. actual_headers.update(headers) # type: ignore
  582. response = await self.request("GET", url, headers=Headers(actual_headers))
  583. resp_headers = dict(response.headers.getAllRawHeaders())
  584. if response.code > 299:
  585. logger.warning("Got %d when downloading %s" % (response.code, url))
  586. raise SynapseError(
  587. HTTPStatus.BAD_GATEWAY, "Got error %d" % (response.code,), Codes.UNKNOWN
  588. )
  589. if is_allowed_content_type and b"Content-Type" in resp_headers:
  590. content_type = resp_headers[b"Content-Type"][0].decode("ascii")
  591. if not is_allowed_content_type(content_type):
  592. raise SynapseError(
  593. HTTPStatus.BAD_GATEWAY,
  594. (
  595. "Requested file's content type not allowed for this operation: %s"
  596. % content_type
  597. ),
  598. )
  599. # TODO: if our Content-Type is HTML or something, just read the first
  600. # N bytes into RAM rather than saving it all to disk only to read it
  601. # straight back in again
  602. try:
  603. d = read_body_with_max_size(response, output_stream, max_size)
  604. # Ensure that the body is not read forever.
  605. d = timeout_deferred(d, 30, self.hs.get_reactor())
  606. length = await make_deferred_yieldable(d)
  607. except BodyExceededMaxSize:
  608. raise SynapseError(
  609. HTTPStatus.BAD_GATEWAY,
  610. "Requested file is too large > %r bytes" % (max_size,),
  611. Codes.TOO_LARGE,
  612. )
  613. except defer.TimeoutError:
  614. raise SynapseError(
  615. HTTPStatus.BAD_GATEWAY,
  616. "Requested file took too long to download",
  617. Codes.TOO_LARGE,
  618. )
  619. except Exception as e:
  620. raise SynapseError(
  621. HTTPStatus.BAD_GATEWAY, ("Failed to download remote body: %s" % e)
  622. ) from e
  623. return (
  624. length,
  625. resp_headers,
  626. response.request.absoluteURI.decode("ascii"),
  627. response.code,
  628. )
  629. class SimpleHttpClient(BaseHttpClient):
  630. """
  631. An HTTP client capable of crossing a proxy and respecting a block/allow list.
  632. This also configures a larger / longer lasting HTTP connection pool.
  633. Args:
  634. hs: The HomeServer instance to pass in
  635. treq_args: Extra keyword arguments to be given to treq.request.
  636. ip_blocklist: The IP addresses that we may not request.
  637. ip_allowlist: The allowed IP addresses, that we can
  638. request if it were otherwise caught in a blocklist.
  639. use_proxy: Whether proxy settings should be discovered and used
  640. from conventional environment variables.
  641. """
  642. def __init__(
  643. self,
  644. hs: "HomeServer",
  645. treq_args: Optional[Dict[str, Any]] = None,
  646. ip_allowlist: Optional[IPSet] = None,
  647. ip_blocklist: Optional[IPSet] = None,
  648. use_proxy: bool = False,
  649. ):
  650. super().__init__(hs, treq_args=treq_args)
  651. self._ip_allowlist = ip_allowlist
  652. self._ip_blocklist = ip_blocklist
  653. if self._ip_blocklist:
  654. # If we have an IP blocklist, we need to use a DNS resolver which
  655. # filters out blocked IP addresses, to prevent DNS rebinding.
  656. self.reactor: ISynapseReactor = BlocklistingReactorWrapper(
  657. self.reactor, self._ip_allowlist, self._ip_blocklist
  658. )
  659. # the pusher makes lots of concurrent SSL connections to Sygnal, and tends to
  660. # do so in batches, so we need to allow the pool to keep lots of idle
  661. # connections around.
  662. pool = HTTPConnectionPool(self.reactor)
  663. # XXX: The justification for using the cache factor here is that larger
  664. # instances will need both more cache and more connections.
  665. # Still, this should probably be a separate dial
  666. pool.maxPersistentPerHost = max(int(100 * hs.config.caches.global_factor), 5)
  667. pool.cachedConnectionTimeout = 2 * 60
  668. self.agent: IAgent = ProxyAgent(
  669. self.reactor,
  670. hs.get_reactor(),
  671. connectTimeout=15,
  672. contextFactory=self.hs.get_http_client_context_factory(),
  673. pool=pool,
  674. use_proxy=use_proxy,
  675. )
  676. if self._ip_blocklist:
  677. # If we have an IP blocklist, we then install the Agent which prevents
  678. # direct access to IP addresses, that are not caught by the DNS resolution.
  679. self.agent = BlocklistingAgentWrapper(
  680. self.agent,
  681. ip_blocklist=self._ip_blocklist,
  682. ip_allowlist=self._ip_allowlist,
  683. )
  684. class ReplicationClient(BaseHttpClient):
  685. """Client for connecting to replication endpoints via HTTP and HTTPS.
  686. Attributes:
  687. agent: The custom Twisted Agent used for constructing the connection.
  688. """
  689. def __init__(
  690. self,
  691. hs: "HomeServer",
  692. ):
  693. """
  694. Args:
  695. hs: The HomeServer instance to pass in
  696. """
  697. super().__init__(hs)
  698. # Use a pool, but a very small one.
  699. pool = HTTPConnectionPool(self.reactor)
  700. pool.maxPersistentPerHost = 5
  701. pool.cachedConnectionTimeout = 2 * 60
  702. self.agent: IAgent = ReplicationAgent(
  703. hs.get_reactor(),
  704. hs.config.worker.instance_map,
  705. contextFactory=hs.get_http_client_context_factory(),
  706. pool=pool,
  707. )
  708. async def request(
  709. self,
  710. method: str,
  711. uri: str,
  712. data: Optional[bytes] = None,
  713. headers: Optional[Headers] = None,
  714. ) -> IResponse:
  715. """
  716. Make a request, differs from BaseHttpClient.request in that it does not use treq.
  717. Args:
  718. method: HTTP method to use.
  719. uri: URI to query.
  720. data: Data to send in the request body, if applicable.
  721. headers: Request headers.
  722. Returns:
  723. Response object, once the headers have been read.
  724. Raises:
  725. RequestTimedOutError if the request times out before the headers are read
  726. """
  727. outgoing_requests_counter.labels(method).inc()
  728. logger.debug("Sending request %s %s", method, uri)
  729. with start_active_span(
  730. "outgoing-replication-request",
  731. tags={
  732. tags.SPAN_KIND: tags.SPAN_KIND_RPC_CLIENT,
  733. tags.HTTP_METHOD: method,
  734. tags.HTTP_URL: uri,
  735. },
  736. finish_on_close=True,
  737. ):
  738. try:
  739. body_producer = None
  740. if data is not None:
  741. body_producer = QuieterFileBodyProducer(
  742. BytesIO(data),
  743. cooperator=self._cooperator,
  744. )
  745. # Skip the fancy treq stuff, we don't need cookie handling, redirects,
  746. # or buffered response bodies.
  747. method_bytes = method.encode("ascii")
  748. uri_bytes = uri.encode("ascii")
  749. # To preserve the logging context, the timeout is treated
  750. # in a similar way to `defer.gatherResults`:
  751. # * Each logging context-preserving fork is wrapped in
  752. # `run_in_background`. In this case there is only one,
  753. # since the timeout fork is not logging-context aware.
  754. # * The `Deferred` that joins the forks back together is
  755. # wrapped in `make_deferred_yieldable` to restore the
  756. # logging context regardless of the path taken.
  757. # (The logic/comments for this came from MatrixFederationHttpClient)
  758. request_deferred = run_in_background(
  759. self.agent.request,
  760. method_bytes,
  761. uri_bytes,
  762. headers,
  763. bodyProducer=body_producer,
  764. )
  765. # we use our own timeout mechanism rather than twisted's as a workaround
  766. # for https://twistedmatrix.com/trac/ticket/9534.
  767. # (Updated url https://github.com/twisted/twisted/issues/9534)
  768. request_deferred = timeout_deferred(
  769. request_deferred,
  770. 60,
  771. self.hs.get_reactor(),
  772. )
  773. # turn timeouts into RequestTimedOutErrors
  774. request_deferred.addErrback(_timeout_to_request_timed_out_error)
  775. response = await make_deferred_yieldable(request_deferred)
  776. incoming_responses_counter.labels(method, response.code).inc()
  777. logger.info(
  778. "Received response to %s %s: %s",
  779. method,
  780. uri,
  781. response.code,
  782. )
  783. return response
  784. except Exception as e:
  785. incoming_responses_counter.labels(method, "ERR").inc()
  786. logger.info(
  787. "Error sending request to %s %s: %s %s",
  788. method,
  789. uri,
  790. type(e).__name__,
  791. e.args[0],
  792. )
  793. set_tag(tags.ERROR, True)
  794. set_tag("error_reason", e.args[0])
  795. raise
  796. def _timeout_to_request_timed_out_error(f: Failure) -> Failure:
  797. if f.check(twisted_error.TimeoutError, twisted_error.ConnectingCancelledError):
  798. # The TCP connection has its own timeout (set by the 'connectTimeout' param
  799. # on the Agent), which raises twisted_error.TimeoutError exception.
  800. raise RequestTimedOutError("Timeout connecting to remote server")
  801. elif f.check(defer.TimeoutError, ResponseNeverReceived):
  802. # this one means that we hit our overall timeout on the request
  803. raise RequestTimedOutError("Timeout waiting for response from remote server")
  804. return f
  805. class ByteWriteable(Protocol):
  806. """The type of object which must be passed into read_body_with_max_size.
  807. Typically this is a file object.
  808. """
  809. def write(self, data: bytes) -> int:
  810. pass
  811. class BodyExceededMaxSize(Exception):
  812. """The maximum allowed size of the HTTP body was exceeded."""
  class _DiscardBodyWithMaxSizeProtocol(protocol.Protocol):
      """A protocol which fails as soon as any body data, or a disconnect, arrives."""

      transport: Optional[ITCPTransport] = None

      def __init__(self, deferred: defer.Deferred):
          # Fired with a BodyExceededMaxSize failure on the first event seen.
          self.deferred = deferred

      def _maybe_fail(self) -> None:
          """
          Fail the deferred with BodyExceededMaxSize and abort the connection.

          Only the first call has any effect; once the deferred has been called,
          later calls do nothing.
          """
          if self.deferred.called:
              return

          self.deferred.errback(BodyExceededMaxSize())
          # Nothing received would be kept anyway, so drop the connection
          # forcefully rather than waiting for the rest of the body.
          assert self.transport is not None
          self.transport.abortConnection()

      def dataReceived(self, data: bytes) -> None:
          self._maybe_fail()

      def connectionLost(self, reason: Failure = connectionDone) -> None:
          self._maybe_fail()
  class _ReadBodyWithMaxSizeProtocol(protocol.Protocol):
      """A protocol which copies a body into a stream, failing once a size cap is hit.

      The deferred handed to the constructor fires with the number of bytes
      written on success. It fires with a failure if writing to the stream
      raises, if the size cap is reached, or if the connection is lost for a
      reason other than ResponseDone / PotentialDataLoss.
      """

      transport: Optional[ITCPTransport] = None

      def __init__(
          self, stream: ByteWriteable, deferred: defer.Deferred, max_size: Optional[int]
      ):
          # Running total of the bytes written to the stream so far.
          self.length = 0
          # None disables the size cap.
          self.max_size = max_size
          self.stream = stream
          self.deferred = deferred

      def dataReceived(self, data: bytes) -> None:
          # Once the deferred has fired, any further data is ignored.
          if self.deferred.called:
              return

          try:
              self.stream.write(data)
          except Exception:
              # Called from inside the handler so that the argument-less errback()
              # reports the exception currently being handled.
              self.deferred.errback()
              return

          self.length += len(data)

          limit = self.max_size
          if limit is None or self.length < limit:
              return

          # The first time the maximum size is hit, error and cancel the
          # connection. dataReceived might be called again if data was received
          # in the meantime; the check at the top takes care of that.
          # NOTE(review): the comparison is inclusive, so a body of exactly
          # `max_size` bytes is rejected as well -- confirm that is intended.
          self.deferred.errback(BodyExceededMaxSize())
          # Close the connection (forcefully) since all the data will get
          # discarded anyway.
          assert self.transport is not None
          self.transport.abortConnection()

      def connectionLost(self, reason: Failure = connectionDone) -> None:
          # If the deferred already fired (e.g. the maximum size was hit), there
          # is nothing left to report.
          if self.deferred.called:
              return

          # ResponseDone is the normal end of a body.
          #
          # PotentialDataLoss applies to responses which set neither
          # `Content-Length` nor a `Transfer-Encoding`: the end of the response is
          # then indicated by the connection being closed, an event which may also
          # be due to a transient network problem or other error. But since this
          # behavior is expected of some servers (like YouTube), it is treated as
          # success too.
          # Stolen from https://github.com/twisted/treq/pull/49/files
          # http://twistedmatrix.com/trac/ticket/4840
          if reason.check(ResponseDone, PotentialDataLoss):
              self.deferred.callback(self.length)
          else:
              self.deferred.errback(reason)
  def read_body_with_max_size(
      response: IResponse, stream: ByteWriteable, max_size: Optional[int]
  ) -> "defer.Deferred[int]":
      """
      Stream the body of a HTTP response into a file-object, optionally capping
      its size.

      If the maximum file size is reached, the returned Deferred will resolve to a
      Failure with a BodyExceededMaxSize exception.

      Args:
          response: The HTTP response to read from.
          stream: The file-object to write to.
          max_size: The maximum file size to allow, or None for no limit.

      Returns:
          A Deferred which resolves to the length of the read body.
      """
      result: "defer.Deferred[int]" = defer.Deferred()

      # If the Content-Length header gives a size larger than the maximum allowed
      # size, do not bother downloading the body.
      declared_too_large = (
          max_size is not None
          and response.length != UNKNOWN_LENGTH
          and response.length > max_size
      )

      body_protocol: protocol.Protocol
      if declared_too_large:
          body_protocol = _DiscardBodyWithMaxSizeProtocol(result)
      else:
          body_protocol = _ReadBodyWithMaxSizeProtocol(stream, result, max_size)

      response.deliverBody(body_protocol)
      return result
  def encode_query_args(args: Optional[QueryParams]) -> bytes:
      """
      Serialise query arguments into the bytes form which can be appended to a URL.

      Args:
          args: The query arguments, a mapping of string to string or list of
              strings. None means there are no arguments.

      Returns:
          The URL-encoded query arguments as UTF-8 bytes; empty when args is None.
      """
      if args is not None:
          # doseq=True expands each list value into repeated `key=value` pairs.
          return urllib.parse.urlencode(args, doseq=True).encode("utf8")

      return b""
  @implementer(IPolicyForHTTPS)
  class InsecureInterceptableContextFactory(ssl.ContextFactory):
      """
      Factory for PyOpenSSL SSL contexts which accepts any certificate for any domain.

      Do not use this since it allows an attacker to intercept your communications.
      """

      def __init__(self) -> None:
          # A single context is built once and handed out for every connection.
          context = SSL.Context(SSL.SSLv23_METHOD)
          context.set_verify(VERIFY_NONE, lambda *_: False)
          self._context = context

      def getContext(self) -> SSL.Context:
          return self._context

      def creatorForNetloc(self, hostname: bytes, port: int) -> IOpenSSLContextFactory:
          # The same factory is used whatever the host and port are.
          return self
  def is_unknown_endpoint(
      e: HttpResponseException, synapse_error: Optional[SynapseError] = None
  ) -> bool:
      """
      Decide whether an error response means the remote endpoint is unimplemented.

      Args:
          e: The error response received from the remote server.
          synapse_error: The above error converted to a SynapseError. This is
              automatically generated if not provided.

      Returns:
          True if the response looks like it came from an unknown endpoint (or
          an unknown method on a known endpoint), False otherwise.
      """
      if synapse_error is None:
          synapse_error = e.to_synapse_error()

      # Matrix v1.6 specifies that servers should return a 404 or 405 with an errcode
      # of M_UNRECOGNIZED when they receive a request to an unknown endpoint or
      # to an unknown method, respectively.
      #
      # Older versions of servers don't return proper errors, so be graceful. But,
      # also handle that some endpoints truly do return 404 errors.

      # 404 is an unknown endpoint, 405 is a known endpoint, but unknown method.
      if e.code in (404, 405):
          body = e.response
          # Consider empty body or non-JSON bodies to be unrecognised (matches
          # older Dendrites & Conduits).
          if not body or not body.startswith(b"{"):
              return True
          # The proper response JSON with M_UNRECOGNIZED errcode.
          return synapse_error.errcode == Codes.UNRECOGNIZED

      # Older Synapses returned a 400 error.
      return e.code == 400 and synapse_error.errcode == Codes.UNRECOGNIZED