# Copyright 2014-2016 OpenMarket Ltd
# Copyright 2019-2021 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import os
import urllib
from abc import ABC, abstractmethod
from types import TracebackType
from typing import Awaitable, Dict, Generator, List, Optional, Tuple, Type

import attr

from twisted.internet.interfaces import IConsumer
from twisted.protocols.basic import FileSender
from twisted.web.server import Request

from synapse.api.errors import Codes, cs_error
from synapse.http.server import finish_request, respond_with_json
from synapse.http.site import SynapseRequest
from synapse.logging.context import make_deferred_yieldable
from synapse.util.stringutils import is_ascii

logger = logging.getLogger(__name__)

# list all text content types that will have the charset default to UTF-8 when
# none is given
TEXT_CONTENT_TYPES = [
    "text/css",
    "text/csv",
    "text/html",
    "text/calendar",
    "text/plain",
    "text/javascript",
    "application/json",
    "application/ld+json",
    "application/rtf",
    "image/svg+xml",
    "text/xml",
]

# A list of all content types that are "safe" to be rendered inline in a browser.
INLINE_CONTENT_TYPES = [
    "text/css",
    "text/plain",
    "text/csv",
    "application/json",
    "application/ld+json",
    # We allow some media files deemed as safe, which comes from the matrix-react-sdk.
    # https://github.com/matrix-org/matrix-react-sdk/blob/a70fcfd0bcf7f8c85986da18001ea11597989a7c/src/utils/blobs.ts#L51
    # SVGs are *intentionally* omitted.
    "image/jpeg",
    "image/gif",
    "image/png",
    "image/apng",
    "image/webp",
    "image/avif",
    "video/mp4",
    "video/webm",
    "video/ogg",
    "video/quicktime",
    "audio/mp4",
    "audio/webm",
    "audio/aac",
    "audio/mpeg",
    "audio/ogg",
    "audio/wave",
    "audio/wav",
    "audio/x-wav",
    "audio/x-pn-wav",
    "audio/flac",
    "audio/x-flac",
]


def respond_404(request: SynapseRequest) -> None:
    assert request.path is not None
    respond_with_json(
        request,
        404,
        cs_error("Not found '%s'" % (request.path.decode(),), code=Codes.NOT_FOUND),
        send_cors=True,
    )


async def respond_with_file(
    request: SynapseRequest,
    media_type: str,
    file_path: str,
    file_size: Optional[int] = None,
    upload_name: Optional[str] = None,
) -> None:
    logger.debug("Responding with %r", file_path)

    if os.path.isfile(file_path):
        if file_size is None:
            stat = os.stat(file_path)
            file_size = stat.st_size

        add_file_headers(request, media_type, file_size, upload_name)

        with open(file_path, "rb") as f:
            await make_deferred_yieldable(FileSender().beginFileTransfer(f, request))

        finish_request(request)
    else:
        respond_404(request)


def add_file_headers(
    request: Request,
    media_type: str,
    file_size: Optional[int],
    upload_name: Optional[str],
) -> None:
    """Adds the correct response headers in preparation for responding with the
    media.

    Args:
        request
        media_type: The media/content type.
        file_size: Size in bytes of the media, if known.
        upload_name: The name of the requested file, if any.
    """

    def _quote(x: str) -> str:
        return urllib.parse.quote(x.encode("utf-8"))

    # Default to a UTF-8 charset for text content types.
    # ex, uses UTF-8 for 'text/css' but not 'text/css; charset=UTF-16'
    if media_type.lower() in TEXT_CONTENT_TYPES:
        content_type = media_type + "; charset=UTF-8"
    else:
        content_type = media_type

    request.setHeader(b"Content-Type", content_type.encode("UTF-8"))

    # A strict subset of content types is allowed to be inlined so that they may
    # be viewed directly in a browser. Other file types are forced to be downloads.
    if media_type.lower() in INLINE_CONTENT_TYPES:
        disposition = "inline"
    else:
        disposition = "attachment"

    if upload_name:
        # RFC6266 section 4.1 [1] defines both `filename` and `filename*`.
        #
        # `filename` is defined to be a `value`, which is defined by RFC2616
        # section 3.6 [2] to be a `token` or a `quoted-string`, where a `token`
        # is (essentially) a single US-ASCII word, and a `quoted-string` is a
        # US-ASCII string surrounded by double-quotes, using backslash as an
        # escape character. Note that %-encoding is *not* permitted.
        #
        # `filename*` is defined to be an `ext-value`, which is defined in
        # RFC5987 section 3.2.1 [3] to be `charset "'" [ language ] "'" value-chars`,
        # where `value-chars` is essentially a %-encoded string in the given charset.
        #
        # [1]: https://tools.ietf.org/html/rfc6266#section-4.1
        # [2]: https://tools.ietf.org/html/rfc2616#section-3.6
        # [3]: https://tools.ietf.org/html/rfc5987#section-3.2.1

        # We avoid the quoted-string version of `filename`, because (a) synapse didn't
        # correctly interpret those as of 0.99.2 and (b) they are a bit of a pain and we
        # may as well just do the filename* version.
        if _can_encode_filename_as_token(upload_name):
            disposition = "%s; filename=%s" % (
                disposition,
                upload_name,
            )
        else:
            disposition = "%s; filename*=utf-8''%s" % (
                disposition,
                _quote(upload_name),
            )

    request.setHeader(b"Content-Disposition", disposition.encode("ascii"))
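
    # Illustrative examples (not part of the upstream code) of how the filename
    # portion is encoded under the rules described above, assuming an
    # "attachment" disposition:
    #
    #   upload_name = "report.txt"      (encodable as an RFC2616 token)
    #       Content-Disposition: attachment; filename=report.txt
    #
    #   upload_name = "café menu.pdf"   (space and non-ASCII char, so the
    #                                    RFC5987 %-encoded `filename*` form is used)
    #       Content-Disposition: attachment; filename*=utf-8''caf%C3%A9%20menu.pdf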

    # cache for at least a day.
    # XXX: we might want to turn this off for data we don't want to
    # recommend caching as it's sensitive or private - or at least
    # select private. don't bother setting Expires as all our
    # clients are smart enough to be happy with Cache-Control
    request.setHeader(b"Cache-Control", b"public,max-age=86400,s-maxage=86400")
    if file_size is not None:
        request.setHeader(b"Content-Length", b"%d" % (file_size,))

    # Tell web crawlers to not index, archive, or follow links in media. This
    # should help to prevent things in the media repo from showing up in web
    # search results.
    request.setHeader(b"X-Robots-Tag", "noindex, nofollow, noarchive, noimageindex")


# separators as defined in RFC2616. SP and HT are handled separately.
# see _can_encode_filename_as_token.
_FILENAME_SEPARATOR_CHARS = {
    "(",
    ")",
    "<",
    ">",
    "@",
    ",",
    ";",
    ":",
    "\\",
    '"',
    "/",
    "[",
    "]",
    "?",
    "=",
    "{",
    "}",
}


def _can_encode_filename_as_token(x: str) -> bool:
    for c in x:
        # from RFC2616:
        #
        #   token          = 1*<any CHAR except CTLs or separators>
        #
        #   separators     = "(" | ")" | "<" | ">" | "@"
        #                  | "," | ";" | ":" | "\" | <">
        #                  | "/" | "[" | "]" | "?" | "="
        #                  | "{" | "}" | SP | HT
        #
        #   CHAR           = <any US-ASCII character (octets 0 - 127)>
        #
        #   CTL            = <any US-ASCII control character
        #                    (octets 0 - 31) and DEL (127)>
        #
        if ord(c) >= 127 or ord(c) <= 32 or c in _FILENAME_SEPARATOR_CHARS:
            return False
    return True
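
# Illustrative behaviour (not part of the upstream code):
#
#   _can_encode_filename_as_token("report.txt")    -> True
#   _can_encode_filename_as_token("my report.txt") -> False  (contains SP)
#   _can_encode_filename_as_token("café.png")      -> False  (non-ASCII char)
#   _can_encode_filename_as_token('a"b.txt')       -> False  (separator char)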


async def respond_with_responder(
    request: SynapseRequest,
    responder: "Optional[Responder]",
    media_type: str,
    file_size: Optional[int],
    upload_name: Optional[str] = None,
) -> None:
    """Responds to the request with given responder. If responder is None then
    returns 404.

    Args:
        request
        responder
        media_type: The media/content type.
        file_size: Size in bytes of the media. If not known it should be None
        upload_name: The name of the requested file, if any.
    """
    if not responder:
        respond_404(request)
        return

    # If we have a responder we *must* use it as a context manager.
    with responder:
        if request._disconnected:
            logger.warning(
                "Not sending response to request %s, already disconnected.", request
            )
            return

        logger.debug("Responding to media request with responder %s", responder)
        add_file_headers(request, media_type, file_size, upload_name)
        try:
            await responder.write_to_consumer(request)
        except Exception as e:
            # The majority of the time this will be due to the client having gone
            # away. Unfortunately, Twisted simply throws a generic exception at us
            # in that case.
            logger.warning("Failed to write to consumer: %s %s", type(e), e)

            # Unregister the producer, if it has one, so Twisted doesn't complain
            if request.producer:
                request.unregisterProducer()

    finish_request(request)
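
# Illustrative usage sketch (assumed, not from this file): a download handler
# would typically obtain a Responder from the media storage layer and hand it
# to respond_with_responder, e.g.
#
#   responder = await media_storage.fetch_media(file_info)
#   await respond_with_responder(
#       request, responder, media_info.media_type, media_info.media_length
#   )
#
# where `media_storage` and `media_info` come from the surrounding media
# repository code.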


class Responder(ABC):
    """Represents a response that can be streamed to the requester.

    Responder is a context manager which *must* be used, so that any resources
    held can be cleaned up.
    """

    @abstractmethod
    def write_to_consumer(self, consumer: IConsumer) -> Awaitable:
        """Stream response into consumer

        Args:
            consumer: The consumer to stream into.

        Returns:
            Resolves once the response has finished being written
        """
        raise NotImplementedError()

    def __enter__(self) -> None:  # noqa: B027
        pass

    def __exit__(  # noqa: B027
        self,
        exc_type: Optional[Type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> None:
        pass
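
# Illustrative sketch (assumed, not part of this module) of a minimal Responder
# that streams an already-open file and closes it when the context exits:
#
#   class _FileResponder(Responder):
#       def __init__(self, open_file):
#           self.open_file = open_file
#
#       def write_to_consumer(self, consumer: IConsumer) -> Awaitable:
#           return make_deferred_yieldable(
#               FileSender().beginFileTransfer(self.open_file, consumer)
#           )
#
#       def __exit__(self, exc_type, exc_val, exc_tb) -> None:
#           self.open_file.close()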


@attr.s(slots=True, frozen=True, auto_attribs=True)
class ThumbnailInfo:
    """Details about a generated thumbnail."""

    width: int
    height: int
    method: str
    # Content type of thumbnail, e.g. image/png
    type: str
    # The size of the media file, in bytes.
    length: Optional[int] = None


@attr.s(slots=True, frozen=True, auto_attribs=True)
class FileInfo:
    """Details about a requested/uploaded file."""

    # The server name where the media originated from, or None if local.
    server_name: Optional[str]
    # The local ID of the file. For local files this is the same as the media_id
    file_id: str
    # If the file is for the url preview cache
    url_cache: bool = False
    # Whether the file is a thumbnail or not.
    thumbnail: Optional[ThumbnailInfo] = None

    # The below properties exist to maintain compatibility with third-party modules.
    @property
    def thumbnail_width(self) -> Optional[int]:
        if not self.thumbnail:
            return None
        return self.thumbnail.width

    @property
    def thumbnail_height(self) -> Optional[int]:
        if not self.thumbnail:
            return None
        return self.thumbnail.height

    @property
    def thumbnail_method(self) -> Optional[str]:
        if not self.thumbnail:
            return None
        return self.thumbnail.method

    @property
    def thumbnail_type(self) -> Optional[str]:
        if not self.thumbnail:
            return None
        return self.thumbnail.type

    @property
    def thumbnail_length(self) -> Optional[int]:
        if not self.thumbnail:
            return None
        return self.thumbnail.length


def get_filename_from_headers(headers: Dict[bytes, List[bytes]]) -> Optional[str]:
    """
    Get the filename of the downloaded file by inspecting the
    Content-Disposition HTTP header.

    Args:
        headers: The HTTP request headers.

    Returns:
        The filename, or None.
    """
    content_disposition = headers.get(b"Content-Disposition", [b""])

    # No header, bail out.
    if not content_disposition[0]:
        return None

    _, params = _parse_header(content_disposition[0])

    upload_name = None

    # First check if there is a valid UTF-8 filename
    upload_name_utf8 = params.get(b"filename*", None)
    if upload_name_utf8:
        if upload_name_utf8.lower().startswith(b"utf-8''"):
            upload_name_utf8 = upload_name_utf8[7:]
            # We have a filename*= section. This MUST be ASCII, and any UTF-8
            # bytes are %-quoted.
            try:
                # Once it is decoded, we can then unquote the %-encoded
                # parts strictly into a unicode string.
                upload_name = urllib.parse.unquote(
                    upload_name_utf8.decode("ascii"), errors="strict"
                )
            except UnicodeDecodeError:
                # Incorrect UTF-8.
                pass

    # If there isn't check for an ascii name.
    if not upload_name:
        upload_name_ascii = params.get(b"filename", None)
        if upload_name_ascii and is_ascii(upload_name_ascii):
            upload_name = upload_name_ascii.decode("ascii")

    # This may be None here, indicating we did not find a matching name.
    return upload_name
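
# Illustrative examples (not part of the upstream code):
#
#   get_filename_from_headers(
#       {b"Content-Disposition": [b'attachment; filename="spam.png"']}
#   )                                                   -> "spam.png"
#
#   get_filename_from_headers(
#       {b"Content-Disposition": [b"attachment; filename*=utf-8''caf%C3%A9.png"]}
#   )                                                   -> "café.png"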


def _parse_header(line: bytes) -> Tuple[bytes, Dict[bytes, bytes]]:
    """Parse a Content-type like header.

    Cargo-culted from `cgi`, but works on bytes rather than strings.

    Args:
        line: header to be parsed

    Returns:
        The main content-type, followed by the parameter dictionary
    """
    parts = _parseparam(b";" + line)
    key = next(parts)
    pdict = {}
    for p in parts:
        i = p.find(b"=")
        if i >= 0:
            name = p[:i].strip().lower()
            value = p[i + 1 :].strip()

            # strip double-quotes
            if len(value) >= 2 and value[0:1] == value[-1:] == b'"':
                value = value[1:-1]
                value = value.replace(b"\\\\", b"\\").replace(b'\\"', b'"')
            pdict[name] = value

    return key, pdict
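
# Illustrative example (not part of the upstream code):
#
#   _parse_header(b'form-data; name=upload; filename="spam.png"')
#       -> (b"form-data", {b"name": b"upload", b"filename": b"spam.png"})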


def _parseparam(s: bytes) -> Generator[bytes, None, None]:
    """Generator which splits the input on ;, respecting double-quoted sequences

    Cargo-culted from `cgi`, but works on bytes rather than strings.

    Args:
        s: header to be parsed

    Returns:
        The split input
    """
    while s[:1] == b";":
        s = s[1:]

        # look for the next ;
        end = s.find(b";")

        # if there is an odd number of " marks between here and the next ;, skip to the
        # next ; instead
        while end > 0 and (s.count(b'"', 0, end) - s.count(b'\\"', 0, end)) % 2:
            end = s.find(b";", end + 1)

        if end < 0:
            end = len(s)

        f = s[:end]
        yield f.strip()

        s = s[end:]
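
# Illustrative example (not part of the upstream code): quoted sections are kept
# intact even when they contain a ';':
#
#   list(_parseparam(b'; a=1; b="x;y"'))  ->  [b'a=1', b'b="x;y"']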