Você não pode selecionar mais de 25 tópicos. Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 
 
 
 

1255 linhas
45 KiB

  1. # Copyright 2014-2016 OpenMarket Ltd
  2. # Copyright 2018-2021 The Matrix.org Foundation C.I.C.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import errno
  16. import logging
  17. import os
  18. import shutil
  19. from io import BytesIO
  20. from typing import IO, TYPE_CHECKING, Dict, List, Optional, Set, Tuple
  21. import attr
  22. from matrix_common.types.mxc_uri import MXCUri
  23. import twisted.internet.error
  24. import twisted.web.http
  25. from twisted.internet.defer import Deferred
  26. from synapse.api.errors import (
  27. Codes,
  28. FederationDeniedError,
  29. HttpResponseException,
  30. NotFoundError,
  31. RequestSendFailed,
  32. SynapseError,
  33. cs_error,
  34. )
  35. from synapse.config.repository import ThumbnailRequirement
  36. from synapse.http.server import respond_with_json
  37. from synapse.http.site import SynapseRequest
  38. from synapse.logging.context import defer_to_thread
  39. from synapse.logging.opentracing import trace
  40. from synapse.media._base import (
  41. FileInfo,
  42. Responder,
  43. ThumbnailInfo,
  44. get_filename_from_headers,
  45. respond_404,
  46. respond_with_responder,
  47. )
  48. from synapse.media.filepath import MediaFilePaths
  49. from synapse.media.media_storage import MediaStorage
  50. from synapse.media.storage_provider import StorageProviderWrapper
  51. from synapse.media.thumbnailer import Thumbnailer, ThumbnailError
  52. from synapse.media.url_previewer import UrlPreviewer
  53. from synapse.metrics.background_process_metrics import run_as_background_process
  54. from synapse.storage.databases.main.media_repository import LocalMedia, RemoteMedia
  55. from synapse.types import UserID
  56. from synapse.util.async_helpers import Linearizer
  57. from synapse.util.retryutils import NotRetryingDestination
  58. from synapse.util.stringutils import random_string
  59. if TYPE_CHECKING:
  60. from synapse.server import HomeServer
  61. logger = logging.getLogger(__name__)
  62. # How often to run the background job to update the "recently accessed"
  63. # attribute of local and remote media.
  64. UPDATE_RECENTLY_ACCESSED_TS = 60 * 1000 # 1 minute
  65. # How often to run the background job to check for local and remote media
  66. # that should be purged according to the configured media retention settings.
  67. MEDIA_RETENTION_CHECK_PERIOD_MS = 60 * 60 * 1000 # 1 hour
  68. class MediaRepository:
  69. def __init__(self, hs: "HomeServer"):
  70. self.hs = hs
  71. self.auth = hs.get_auth()
  72. self.client = hs.get_federation_client()
  73. self.clock = hs.get_clock()
  74. self.server_name = hs.hostname
  75. self.store = hs.get_datastores().main
  76. self.max_upload_size = hs.config.media.max_upload_size
  77. self.max_image_pixels = hs.config.media.max_image_pixels
  78. self.unused_expiration_time = hs.config.media.unused_expiration_time
  79. self.max_pending_media_uploads = hs.config.media.max_pending_media_uploads
  80. Thumbnailer.set_limits(self.max_image_pixels)
  81. self.primary_base_path: str = hs.config.media.media_store_path
  82. self.filepaths: MediaFilePaths = MediaFilePaths(self.primary_base_path)
  83. self.dynamic_thumbnails = hs.config.media.dynamic_thumbnails
  84. self.thumbnail_requirements = hs.config.media.thumbnail_requirements
  85. self.remote_media_linearizer = Linearizer(name="media_remote")
  86. self.recently_accessed_remotes: Set[Tuple[str, str]] = set()
  87. self.recently_accessed_locals: Set[str] = set()
  88. self.federation_domain_whitelist = (
  89. hs.config.federation.federation_domain_whitelist
  90. )
  91. self.prevent_media_downloads_from = hs.config.media.prevent_media_downloads_from
  92. # List of StorageProviders where we should search for media and
  93. # potentially upload to.
  94. storage_providers = []
  95. for (
  96. clz,
  97. provider_config,
  98. wrapper_config,
  99. ) in hs.config.media.media_storage_providers:
  100. backend = clz(hs, provider_config)
  101. provider = StorageProviderWrapper(
  102. backend,
  103. store_local=wrapper_config.store_local,
  104. store_remote=wrapper_config.store_remote,
  105. store_synchronous=wrapper_config.store_synchronous,
  106. )
  107. storage_providers.append(provider)
  108. self.media_storage: MediaStorage = MediaStorage(
  109. self.hs, self.primary_base_path, self.filepaths, storage_providers
  110. )
  111. self.clock.looping_call(
  112. self._start_update_recently_accessed, UPDATE_RECENTLY_ACCESSED_TS
  113. )
  114. # Media retention configuration options
  115. self._media_retention_local_media_lifetime_ms = (
  116. hs.config.media.media_retention_local_media_lifetime_ms
  117. )
  118. self._media_retention_remote_media_lifetime_ms = (
  119. hs.config.media.media_retention_remote_media_lifetime_ms
  120. )
  121. # Check whether local or remote media retention is configured
  122. if (
  123. hs.config.media.media_retention_local_media_lifetime_ms is not None
  124. or hs.config.media.media_retention_remote_media_lifetime_ms is not None
  125. ):
  126. # Run the background job to apply media retention rules routinely,
  127. # with the duration between runs dictated by the homeserver config.
  128. self.clock.looping_call(
  129. self._start_apply_media_retention_rules,
  130. MEDIA_RETENTION_CHECK_PERIOD_MS,
  131. )
  132. if hs.config.media.url_preview_enabled:
  133. self.url_previewer: Optional[UrlPreviewer] = UrlPreviewer(
  134. hs, self, self.media_storage
  135. )
  136. else:
  137. self.url_previewer = None
  138. def _start_update_recently_accessed(self) -> Deferred:
  139. return run_as_background_process(
  140. "update_recently_accessed_media", self._update_recently_accessed
  141. )
  142. def _start_apply_media_retention_rules(self) -> Deferred:
  143. return run_as_background_process(
  144. "apply_media_retention_rules", self._apply_media_retention_rules
  145. )
  146. async def _update_recently_accessed(self) -> None:
  147. remote_media = self.recently_accessed_remotes
  148. self.recently_accessed_remotes = set()
  149. local_media = self.recently_accessed_locals
  150. self.recently_accessed_locals = set()
  151. await self.store.update_cached_last_access_time(
  152. local_media, remote_media, self.clock.time_msec()
  153. )
  154. def mark_recently_accessed(self, server_name: Optional[str], media_id: str) -> None:
  155. """Mark the given media as recently accessed.
  156. Args:
  157. server_name: Origin server of media, or None if local
  158. media_id: The media ID of the content
  159. """
  160. if server_name:
  161. self.recently_accessed_remotes.add((server_name, media_id))
  162. else:
  163. self.recently_accessed_locals.add(media_id)
  164. @trace
  165. async def create_media_id(self, auth_user: UserID) -> Tuple[str, int]:
  166. """Create and store a media ID for a local user and return the MXC URI and its
  167. expiration.
  168. Args:
  169. auth_user: The user_id of the uploader
  170. Returns:
  171. A tuple containing the MXC URI of the stored content and the timestamp at
  172. which the MXC URI expires.
  173. """
  174. media_id = random_string(24)
  175. now = self.clock.time_msec()
  176. await self.store.store_local_media_id(
  177. media_id=media_id,
  178. time_now_ms=now,
  179. user_id=auth_user,
  180. )
  181. return f"mxc://{self.server_name}/{media_id}", now + self.unused_expiration_time
  182. @trace
  183. async def reached_pending_media_limit(self, auth_user: UserID) -> Tuple[bool, int]:
  184. """Check if the user is over the limit for pending media uploads.
  185. Args:
  186. auth_user: The user_id of the uploader
  187. Returns:
  188. A tuple with a boolean and an integer indicating whether the user has too
  189. many pending media uploads and the timestamp at which the first pending
  190. media will expire, respectively.
  191. """
  192. pending, first_expiration_ts = await self.store.count_pending_media(
  193. user_id=auth_user
  194. )
  195. return pending >= self.max_pending_media_uploads, first_expiration_ts
  196. @trace
  197. async def verify_can_upload(self, media_id: str, auth_user: UserID) -> None:
  198. """Verify that the media ID can be uploaded to by the given user. This
  199. function checks that:
  200. * the media ID exists
  201. * the media ID does not already have content
  202. * the user uploading is the same as the one who created the media ID
  203. * the media ID has not expired
  204. Args:
  205. media_id: The media ID to verify
  206. auth_user: The user_id of the uploader
  207. """
  208. media = await self.store.get_local_media(media_id)
  209. if media is None:
  210. raise SynapseError(404, "Unknow media ID", errcode=Codes.NOT_FOUND)
  211. if media.user_id != auth_user.to_string():
  212. raise SynapseError(
  213. 403,
  214. "Only the creator of the media ID can upload to it",
  215. errcode=Codes.FORBIDDEN,
  216. )
  217. if media.media_length is not None:
  218. raise SynapseError(
  219. 409,
  220. "Media ID already has content",
  221. errcode=Codes.CANNOT_OVERWRITE_MEDIA,
  222. )
  223. expired_time_ms = self.clock.time_msec() - self.unused_expiration_time
  224. if media.created_ts < expired_time_ms:
  225. raise NotFoundError("Media ID has expired")
  226. @trace
  227. async def update_content(
  228. self,
  229. media_id: str,
  230. media_type: str,
  231. upload_name: Optional[str],
  232. content: IO,
  233. content_length: int,
  234. auth_user: UserID,
  235. ) -> None:
  236. """Update the content of the given media ID.
  237. Args:
  238. media_id: The media ID to replace.
  239. media_type: The content type of the file.
  240. upload_name: The name of the file, if provided.
  241. content: A file like object that is the content to store
  242. content_length: The length of the content
  243. auth_user: The user_id of the uploader
  244. """
  245. file_info = FileInfo(server_name=None, file_id=media_id)
  246. fname = await self.media_storage.store_file(content, file_info)
  247. logger.info("Stored local media in file %r", fname)
  248. await self.store.update_local_media(
  249. media_id=media_id,
  250. media_type=media_type,
  251. upload_name=upload_name,
  252. media_length=content_length,
  253. user_id=auth_user,
  254. )
  255. try:
  256. await self._generate_thumbnails(None, media_id, media_id, media_type)
  257. except Exception as e:
  258. logger.info("Failed to generate thumbnails: %s", e)
  259. @trace
  260. async def create_content(
  261. self,
  262. media_type: str,
  263. upload_name: Optional[str],
  264. content: IO,
  265. content_length: int,
  266. auth_user: UserID,
  267. ) -> MXCUri:
  268. """Store uploaded content for a local user and return the mxc URL
  269. Args:
  270. media_type: The content type of the file.
  271. upload_name: The name of the file, if provided.
  272. content: A file like object that is the content to store
  273. content_length: The length of the content
  274. auth_user: The user_id of the uploader
  275. Returns:
  276. The mxc url of the stored content
  277. """
  278. media_id = random_string(24)
  279. file_info = FileInfo(server_name=None, file_id=media_id)
  280. fname = await self.media_storage.store_file(content, file_info)
  281. logger.info("Stored local media in file %r", fname)
  282. await self.store.store_local_media(
  283. media_id=media_id,
  284. media_type=media_type,
  285. time_now_ms=self.clock.time_msec(),
  286. upload_name=upload_name,
  287. media_length=content_length,
  288. user_id=auth_user,
  289. )
  290. try:
  291. await self._generate_thumbnails(None, media_id, media_id, media_type)
  292. except Exception as e:
  293. logger.info("Failed to generate thumbnails: %s", e)
  294. return MXCUri(self.server_name, media_id)
  295. def respond_not_yet_uploaded(self, request: SynapseRequest) -> None:
  296. respond_with_json(
  297. request,
  298. 504,
  299. cs_error("Media has not been uploaded yet", code=Codes.NOT_YET_UPLOADED),
  300. send_cors=True,
  301. )
  302. async def get_local_media_info(
  303. self, request: SynapseRequest, media_id: str, max_timeout_ms: int
  304. ) -> Optional[LocalMedia]:
  305. """Gets the info dictionary for given local media ID. If the media has
  306. not been uploaded yet, this function will wait up to ``max_timeout_ms``
  307. milliseconds for the media to be uploaded.
  308. Args:
  309. request: The incoming request.
  310. media_id: The media ID of the content. (This is the same as
  311. the file_id for local content.)
  312. max_timeout_ms: the maximum number of milliseconds to wait for the
  313. media to be uploaded.
  314. Returns:
  315. Either the info dictionary for the given local media ID or
  316. ``None``. If ``None``, then no further processing is necessary as
  317. this function will send the necessary JSON response.
  318. """
  319. wait_until = self.clock.time_msec() + max_timeout_ms
  320. while True:
  321. # Get the info for the media
  322. media_info = await self.store.get_local_media(media_id)
  323. if not media_info:
  324. logger.info("Media %s is unknown", media_id)
  325. respond_404(request)
  326. return None
  327. if media_info.quarantined_by:
  328. logger.info("Media %s is quarantined", media_id)
  329. respond_404(request)
  330. return None
  331. # The file has been uploaded, so stop looping
  332. if media_info.media_length is not None:
  333. return media_info
  334. # Check if the media ID has expired and still hasn't been uploaded to.
  335. now = self.clock.time_msec()
  336. expired_time_ms = now - self.unused_expiration_time
  337. if media_info.created_ts < expired_time_ms:
  338. logger.info("Media %s has expired without being uploaded", media_id)
  339. respond_404(request)
  340. return None
  341. if now >= wait_until:
  342. break
  343. await self.clock.sleep(0.5)
  344. logger.info("Media %s has not yet been uploaded", media_id)
  345. self.respond_not_yet_uploaded(request)
  346. return None
  347. async def get_local_media(
  348. self,
  349. request: SynapseRequest,
  350. media_id: str,
  351. name: Optional[str],
  352. max_timeout_ms: int,
  353. ) -> None:
  354. """Responds to requests for local media, if exists, or returns 404.
  355. Args:
  356. request: The incoming request.
  357. media_id: The media ID of the content. (This is the same as
  358. the file_id for local content.)
  359. name: Optional name that, if specified, will be used as
  360. the filename in the Content-Disposition header of the response.
  361. max_timeout_ms: the maximum number of milliseconds to wait for the
  362. media to be uploaded.
  363. Returns:
  364. Resolves once a response has successfully been written to request
  365. """
  366. media_info = await self.get_local_media_info(request, media_id, max_timeout_ms)
  367. if not media_info:
  368. return
  369. self.mark_recently_accessed(None, media_id)
  370. media_type = media_info.media_type
  371. if not media_type:
  372. media_type = "application/octet-stream"
  373. media_length = media_info.media_length
  374. upload_name = name if name else media_info.upload_name
  375. url_cache = media_info.url_cache
  376. file_info = FileInfo(None, media_id, url_cache=bool(url_cache))
  377. responder = await self.media_storage.fetch_media(file_info)
  378. await respond_with_responder(
  379. request, responder, media_type, media_length, upload_name
  380. )
  381. async def get_remote_media(
  382. self,
  383. request: SynapseRequest,
  384. server_name: str,
  385. media_id: str,
  386. name: Optional[str],
  387. max_timeout_ms: int,
  388. ) -> None:
  389. """Respond to requests for remote media.
  390. Args:
  391. request: The incoming request.
  392. server_name: Remote server_name where the media originated.
  393. media_id: The media ID of the content (as defined by the remote server).
  394. name: Optional name that, if specified, will be used as
  395. the filename in the Content-Disposition header of the response.
  396. max_timeout_ms: the maximum number of milliseconds to wait for the
  397. media to be uploaded.
  398. Returns:
  399. Resolves once a response has successfully been written to request
  400. """
  401. if (
  402. self.federation_domain_whitelist is not None
  403. and server_name not in self.federation_domain_whitelist
  404. ):
  405. raise FederationDeniedError(server_name)
  406. # Don't let users download media from domains listed in the config, even
  407. # if we might have the media to serve. This is Trust & Safety tooling to
  408. # block some servers' media from being accessible to local users.
  409. # See `prevent_media_downloads_from` config docs for more info.
  410. if server_name in self.prevent_media_downloads_from:
  411. respond_404(request)
  412. return
  413. self.mark_recently_accessed(server_name, media_id)
  414. # We linearize here to ensure that we don't try and download remote
  415. # media multiple times concurrently
  416. key = (server_name, media_id)
  417. async with self.remote_media_linearizer.queue(key):
  418. responder, media_info = await self._get_remote_media_impl(
  419. server_name, media_id, max_timeout_ms
  420. )
  421. # We deliberately stream the file outside the lock
  422. if responder and media_info:
  423. upload_name = name if name else media_info.upload_name
  424. await respond_with_responder(
  425. request,
  426. responder,
  427. media_info.media_type,
  428. media_info.media_length,
  429. upload_name,
  430. )
  431. else:
  432. respond_404(request)
  433. async def get_remote_media_info(
  434. self, server_name: str, media_id: str, max_timeout_ms: int
  435. ) -> RemoteMedia:
  436. """Gets the media info associated with the remote file, downloading
  437. if necessary.
  438. Args:
  439. server_name: Remote server_name where the media originated.
  440. media_id: The media ID of the content (as defined by the remote server).
  441. max_timeout_ms: the maximum number of milliseconds to wait for the
  442. media to be uploaded.
  443. Returns:
  444. The media info of the file
  445. """
  446. if (
  447. self.federation_domain_whitelist is not None
  448. and server_name not in self.federation_domain_whitelist
  449. ):
  450. raise FederationDeniedError(server_name)
  451. # We linearize here to ensure that we don't try and download remote
  452. # media multiple times concurrently
  453. key = (server_name, media_id)
  454. async with self.remote_media_linearizer.queue(key):
  455. responder, media_info = await self._get_remote_media_impl(
  456. server_name, media_id, max_timeout_ms
  457. )
  458. # Ensure we actually use the responder so that it releases resources
  459. if responder:
  460. with responder:
  461. pass
  462. return media_info
  463. async def _get_remote_media_impl(
  464. self, server_name: str, media_id: str, max_timeout_ms: int
  465. ) -> Tuple[Optional[Responder], RemoteMedia]:
  466. """Looks for media in local cache, if not there then attempt to
  467. download from remote server.
  468. Args:
  469. server_name: Remote server_name where the media originated.
  470. media_id: The media ID of the content (as defined by the
  471. remote server).
  472. max_timeout_ms: the maximum number of milliseconds to wait for the
  473. media to be uploaded.
  474. Returns:
  475. A tuple of responder and the media info of the file.
  476. """
  477. media_info = await self.store.get_cached_remote_media(server_name, media_id)
  478. # file_id is the ID we use to track the file locally. If we've already
  479. # seen the file then reuse the existing ID, otherwise generate a new
  480. # one.
  481. # If we have an entry in the DB, try and look for it
  482. if media_info:
  483. file_id = media_info.filesystem_id
  484. file_info = FileInfo(server_name, file_id)
  485. if media_info.quarantined_by:
  486. logger.info("Media is quarantined")
  487. raise NotFoundError()
  488. if not media_info.media_type:
  489. media_info = attr.evolve(
  490. media_info, media_type="application/octet-stream"
  491. )
  492. responder = await self.media_storage.fetch_media(file_info)
  493. if responder:
  494. return responder, media_info
  495. # Failed to find the file anywhere, lets download it.
  496. try:
  497. media_info = await self._download_remote_file(
  498. server_name, media_id, max_timeout_ms
  499. )
  500. except SynapseError:
  501. raise
  502. except Exception as e:
  503. # An exception may be because we downloaded media in another
  504. # process, so let's check if we magically have the media.
  505. media_info = await self.store.get_cached_remote_media(server_name, media_id)
  506. if not media_info:
  507. raise e
  508. file_id = media_info.filesystem_id
  509. if not media_info.media_type:
  510. media_info = attr.evolve(media_info, media_type="application/octet-stream")
  511. file_info = FileInfo(server_name, file_id)
  512. # We generate thumbnails even if another process downloaded the media
  513. # as a) it's conceivable that the other download request dies before it
  514. # generates thumbnails, but mainly b) we want to be sure the thumbnails
  515. # have finished being generated before responding to the client,
  516. # otherwise they'll request thumbnails and get a 404 if they're not
  517. # ready yet.
  518. await self._generate_thumbnails(
  519. server_name, media_id, file_id, media_info.media_type
  520. )
  521. responder = await self.media_storage.fetch_media(file_info)
  522. return responder, media_info
  523. async def _download_remote_file(
  524. self,
  525. server_name: str,
  526. media_id: str,
  527. max_timeout_ms: int,
  528. ) -> RemoteMedia:
  529. """Attempt to download the remote file from the given server name,
  530. using the given file_id as the local id.
  531. Args:
  532. server_name: Originating server
  533. media_id: The media ID of the content (as defined by the
  534. remote server). This is different than the file_id, which is
  535. locally generated.
  536. max_timeout_ms: the maximum number of milliseconds to wait for the
  537. media to be uploaded.
  538. Returns:
  539. The media info of the file.
  540. """
  541. file_id = random_string(24)
  542. file_info = FileInfo(server_name=server_name, file_id=file_id)
  543. with self.media_storage.store_into_file(file_info) as (f, fname, finish):
  544. try:
  545. length, headers = await self.client.download_media(
  546. server_name,
  547. media_id,
  548. output_stream=f,
  549. max_size=self.max_upload_size,
  550. max_timeout_ms=max_timeout_ms,
  551. )
  552. except RequestSendFailed as e:
  553. logger.warning(
  554. "Request failed fetching remote media %s/%s: %r",
  555. server_name,
  556. media_id,
  557. e,
  558. )
  559. raise SynapseError(502, "Failed to fetch remote media")
  560. except HttpResponseException as e:
  561. logger.warning(
  562. "HTTP error fetching remote media %s/%s: %s",
  563. server_name,
  564. media_id,
  565. e.response,
  566. )
  567. if e.code == twisted.web.http.NOT_FOUND:
  568. raise e.to_synapse_error()
  569. raise SynapseError(502, "Failed to fetch remote media")
  570. except SynapseError:
  571. logger.warning(
  572. "Failed to fetch remote media %s/%s", server_name, media_id
  573. )
  574. raise
  575. except NotRetryingDestination:
  576. logger.warning("Not retrying destination %r", server_name)
  577. raise SynapseError(502, "Failed to fetch remote media")
  578. except Exception:
  579. logger.exception(
  580. "Failed to fetch remote media %s/%s", server_name, media_id
  581. )
  582. raise SynapseError(502, "Failed to fetch remote media")
  583. await finish()
  584. if b"Content-Type" in headers:
  585. media_type = headers[b"Content-Type"][0].decode("ascii")
  586. else:
  587. media_type = "application/octet-stream"
  588. upload_name = get_filename_from_headers(headers)
  589. time_now_ms = self.clock.time_msec()
  590. # Multiple remote media download requests can race (when using
  591. # multiple media repos), so this may throw a violation constraint
  592. # exception. If it does we'll delete the newly downloaded file from
  593. # disk (as we're in the ctx manager).
  594. #
  595. # However: we've already called `finish()` so we may have also
  596. # written to the storage providers. This is preferable to the
  597. # alternative where we call `finish()` *after* this, where we could
  598. # end up having an entry in the DB but fail to write the files to
  599. # the storage providers.
  600. await self.store.store_cached_remote_media(
  601. origin=server_name,
  602. media_id=media_id,
  603. media_type=media_type,
  604. time_now_ms=time_now_ms,
  605. upload_name=upload_name,
  606. media_length=length,
  607. filesystem_id=file_id,
  608. )
  609. logger.info("Stored remote media in file %r", fname)
  610. return RemoteMedia(
  611. media_origin=server_name,
  612. media_id=media_id,
  613. media_type=media_type,
  614. media_length=length,
  615. upload_name=upload_name,
  616. created_ts=time_now_ms,
  617. filesystem_id=file_id,
  618. last_access_ts=time_now_ms,
  619. quarantined_by=None,
  620. )
  621. def _get_thumbnail_requirements(
  622. self, media_type: str
  623. ) -> Tuple[ThumbnailRequirement, ...]:
  624. scpos = media_type.find(";")
  625. if scpos > 0:
  626. media_type = media_type[:scpos]
  627. return self.thumbnail_requirements.get(media_type, ())
  628. def _generate_thumbnail(
  629. self,
  630. thumbnailer: Thumbnailer,
  631. t_width: int,
  632. t_height: int,
  633. t_method: str,
  634. t_type: str,
  635. ) -> Optional[BytesIO]:
  636. m_width = thumbnailer.width
  637. m_height = thumbnailer.height
  638. if m_width * m_height >= self.max_image_pixels:
  639. logger.info(
  640. "Image too large to thumbnail %r x %r > %r",
  641. m_width,
  642. m_height,
  643. self.max_image_pixels,
  644. )
  645. return None
  646. if thumbnailer.transpose_method is not None:
  647. m_width, m_height = thumbnailer.transpose()
  648. if t_method == "crop":
  649. return thumbnailer.crop(t_width, t_height, t_type)
  650. elif t_method == "scale":
  651. t_width, t_height = thumbnailer.aspect(t_width, t_height)
  652. t_width = min(m_width, t_width)
  653. t_height = min(m_height, t_height)
  654. return thumbnailer.scale(t_width, t_height, t_type)
  655. return None
  656. async def generate_local_exact_thumbnail(
  657. self,
  658. media_id: str,
  659. t_width: int,
  660. t_height: int,
  661. t_method: str,
  662. t_type: str,
  663. url_cache: bool,
  664. ) -> Optional[str]:
  665. input_path = await self.media_storage.ensure_media_is_in_local_cache(
  666. FileInfo(None, media_id, url_cache=url_cache)
  667. )
  668. try:
  669. thumbnailer = Thumbnailer(input_path)
  670. except ThumbnailError as e:
  671. logger.warning(
  672. "Unable to generate a thumbnail for local media %s using a method of %s and type of %s: %s",
  673. media_id,
  674. t_method,
  675. t_type,
  676. e,
  677. )
  678. return None
  679. with thumbnailer:
  680. t_byte_source = await defer_to_thread(
  681. self.hs.get_reactor(),
  682. self._generate_thumbnail,
  683. thumbnailer,
  684. t_width,
  685. t_height,
  686. t_method,
  687. t_type,
  688. )
  689. if t_byte_source:
  690. try:
  691. file_info = FileInfo(
  692. server_name=None,
  693. file_id=media_id,
  694. url_cache=url_cache,
  695. thumbnail=ThumbnailInfo(
  696. width=t_width,
  697. height=t_height,
  698. method=t_method,
  699. type=t_type,
  700. length=t_byte_source.tell(),
  701. ),
  702. )
  703. output_path = await self.media_storage.store_file(
  704. t_byte_source, file_info
  705. )
  706. finally:
  707. t_byte_source.close()
  708. logger.info("Stored thumbnail in file %r", output_path)
  709. t_len = os.path.getsize(output_path)
  710. await self.store.store_local_thumbnail(
  711. media_id, t_width, t_height, t_type, t_method, t_len
  712. )
  713. return output_path
  714. # Could not generate thumbnail.
  715. return None
  716. async def generate_remote_exact_thumbnail(
  717. self,
  718. server_name: str,
  719. file_id: str,
  720. media_id: str,
  721. t_width: int,
  722. t_height: int,
  723. t_method: str,
  724. t_type: str,
  725. ) -> Optional[str]:
  726. input_path = await self.media_storage.ensure_media_is_in_local_cache(
  727. FileInfo(server_name, file_id)
  728. )
  729. try:
  730. thumbnailer = Thumbnailer(input_path)
  731. except ThumbnailError as e:
  732. logger.warning(
  733. "Unable to generate a thumbnail for remote media %s from %s using a method of %s and type of %s: %s",
  734. media_id,
  735. server_name,
  736. t_method,
  737. t_type,
  738. e,
  739. )
  740. return None
  741. with thumbnailer:
  742. t_byte_source = await defer_to_thread(
  743. self.hs.get_reactor(),
  744. self._generate_thumbnail,
  745. thumbnailer,
  746. t_width,
  747. t_height,
  748. t_method,
  749. t_type,
  750. )
  751. if t_byte_source:
  752. try:
  753. file_info = FileInfo(
  754. server_name=server_name,
  755. file_id=file_id,
  756. thumbnail=ThumbnailInfo(
  757. width=t_width,
  758. height=t_height,
  759. method=t_method,
  760. type=t_type,
  761. length=t_byte_source.tell(),
  762. ),
  763. )
  764. output_path = await self.media_storage.store_file(
  765. t_byte_source, file_info
  766. )
  767. finally:
  768. t_byte_source.close()
  769. logger.info("Stored thumbnail in file %r", output_path)
  770. t_len = os.path.getsize(output_path)
  771. await self.store.store_remote_media_thumbnail(
  772. server_name,
  773. media_id,
  774. file_id,
  775. t_width,
  776. t_height,
  777. t_type,
  778. t_method,
  779. t_len,
  780. )
  781. return output_path
  782. # Could not generate thumbnail.
  783. return None
  @trace
  async def _generate_thumbnails(
      self,
      server_name: Optional[str],
      media_id: str,
      file_id: str,
      media_type: str,
      url_cache: bool = False,
  ) -> Optional[dict]:
      """Generate and store thumbnails for an image.

      Every thumbnail is written through `media_storage.store_into_file` and
      then recorded in the database (remote table when `server_name` is set,
      local table otherwise).

      Args:
          server_name: The server name if remote media, else None if local
          media_id: The media ID of the content. (This is the same as
              the file_id for local content)
          file_id: Local file ID
          media_type: The content type of the file
          url_cache: If we are thumbnailing images downloaded for the URL cache,
              used exclusively by the url previewer

      Returns:
          Dict with "width" and "height" keys of original image (as reported by
          `thumbnailer.transpose` when a transpose method is set), or None if
          the media cannot be thumbnailed: no thumbnail requirements for the
          media type, the thumbnailer could not be created, or the image has
          at least `max_image_pixels` pixels.

      Raises:
          Exception: whatever `store_remote_media_thumbnail` raised, if the
              thumbnail row still does not exist after the failed insert.
      """
      requirements = self._get_thumbnail_requirements(media_type)
      if not requirements:
          return None

      input_path = await self.media_storage.ensure_media_is_in_local_cache(
          FileInfo(server_name, file_id, url_cache=url_cache)
      )

      try:
          thumbnailer = Thumbnailer(input_path)
      except ThumbnailError as e:
          # Local media has no server name; do not describe it as
          # "remote media ... from None".
          if server_name:
              logger.warning(
                  "Unable to generate thumbnails for remote media %s from %s of type %s: %s",
                  media_id,
                  server_name,
                  media_type,
                  e,
              )
          else:
              logger.warning(
                  "Unable to generate thumbnails for local media %s of type %s: %s",
                  media_id,
                  media_type,
                  e,
              )
          return None

      with thumbnailer:
          m_width = thumbnailer.width
          m_height = thumbnailer.height

          if m_width * m_height >= self.max_image_pixels:
              logger.info(
                  "Image too large to thumbnail %r x %r > %r",
                  m_width,
                  m_height,
                  self.max_image_pixels,
              )
              return None

          if thumbnailer.transpose_method is not None:
              m_width, m_height = await defer_to_thread(
                  self.hs.get_reactor(), thumbnailer.transpose
              )

          # We deduplicate the thumbnail sizes by ignoring the cropped versions if
          # they have the same dimensions of a scaled one.
          thumbnails: Dict[Tuple[int, int, str], str] = {}
          for requirement in requirements:
              if requirement.method == "crop":
                  # setdefault: never displace a scaled entry of the same size.
                  thumbnails.setdefault(
                      (requirement.width, requirement.height, requirement.media_type),
                      requirement.method,
                  )
              elif requirement.method == "scale":
                  t_width, t_height = thumbnailer.aspect(
                      requirement.width, requirement.height
                  )
                  # Never ask for a scaled thumbnail larger than the image.
                  t_width = min(m_width, t_width)
                  t_height = min(m_height, t_height)
                  thumbnails[
                      (t_width, t_height, requirement.media_type)
                  ] = requirement.method

          # Now we generate the thumbnails for each dimension, store it
          for (t_width, t_height, t_type), t_method in thumbnails.items():
              # Pick the thumbnailer operation for this method.
              if t_method == "crop":
                  generate = thumbnailer.crop
              elif t_method == "scale":
                  generate = thumbnailer.scale
              else:
                  logger.error("Unrecognized method: %r", t_method)
                  continue

              # Generate the thumbnail off the reactor thread.
              t_byte_source = await defer_to_thread(
                  self.hs.get_reactor(),
                  generate,
                  t_width,
                  t_height,
                  t_type,
              )

              if not t_byte_source:
                  continue

              file_info = FileInfo(
                  server_name=server_name,
                  file_id=file_id,
                  url_cache=url_cache,
                  thumbnail=ThumbnailInfo(
                      width=t_width,
                      height=t_height,
                      method=t_method,
                      type=t_type,
                      length=t_byte_source.tell(),
                  ),
              )

              with self.media_storage.store_into_file(file_info) as (
                  f,
                  fname,
                  finish,
              ):
                  try:
                      await self.media_storage.write_to_file(t_byte_source, f)
                      await finish()
                  finally:
                      t_byte_source.close()

                  t_len = os.path.getsize(fname)

                  # Write to database
                  if server_name:
                      # Multiple remote media download requests can race (when
                      # using multiple media repos), so this may throw a violation
                      # constraint exception. If it does we'll delete the newly
                      # generated thumbnail from disk (as we're in the ctx
                      # manager).
                      #
                      # However: we've already called `finish()` so we may have
                      # also written to the storage providers. This is preferable
                      # to the alternative where we call `finish()` *after* this,
                      # where we could end up having an entry in the DB but fail
                      # to write the files to the storage providers.
                      try:
                          await self.store.store_remote_media_thumbnail(
                              server_name,
                              media_id,
                              file_id,
                              t_width,
                              t_height,
                              t_type,
                              t_method,
                              t_len,
                          )
                      except Exception:
                          thumbnail_exists = (
                              await self.store.get_remote_media_thumbnail(
                                  server_name,
                                  media_id,
                                  t_width,
                                  t_height,
                                  t_type,
                              )
                          )
                          # Only swallow the failure if the row is already
                          # there; otherwise propagate the original error.
                          if not thumbnail_exists:
                              raise
                  else:
                      await self.store.store_local_thumbnail(
                          media_id, t_width, t_height, t_type, t_method, t_len
                      )

      return {"width": m_width, "height": m_height}
  async def _apply_media_retention_rules(self) -> None:
      """
      Purge old local and remote media according to the media retention rules
      defined in the homeserver config.

      Remote and local media are handled independently: each is purged only
      when its configured lifetime is not None. Remote media is purged first.
      """
      # Purge remote media
      remote_lifetime_ms = self._media_retention_remote_media_lifetime_ms
      if remote_lifetime_ms is not None:
          # Calculate a threshold timestamp derived from the configured lifetime. Any
          # media that has not been accessed since this timestamp will be removed.
          remote_media_threshold_timestamp_ms = (
              self.clock.time_msec() - remote_lifetime_ms
          )

          # Lazy %-style arguments, matching the other log calls in this file.
          logger.info(
              "Purging remote media last accessed before %s",
              remote_media_threshold_timestamp_ms,
          )

          await self.delete_old_remote_media(
              before_ts=remote_media_threshold_timestamp_ms
          )

      # And now do the same for local media
      local_lifetime_ms = self._media_retention_local_media_lifetime_ms
      if local_lifetime_ms is not None:
          # This works the same as the remote media threshold
          local_media_threshold_timestamp_ms = (
              self.clock.time_msec() - local_lifetime_ms
          )

          logger.info(
              "Purging local media last accessed before %s",
              local_media_threshold_timestamp_ms,
          )

          # Retention purges never touch profile media, quarantined media or
          # protected media.
          await self.delete_old_local_media(
              before_ts=local_media_threshold_timestamp_ms,
              keep_profiles=True,
              delete_quarantined_media=False,
              delete_protected_media=False,
          )
  async def delete_old_remote_media(self, before_ts: int) -> Dict[str, int]:
      """Delete remote media files, their thumbnails and their database rows.

      The candidates are whatever `store.get_remote_media_ids` returns for
      `before_ts`, with quarantined media excluded. Each item is processed
      while holding the remote media linearizer for its (origin, media ID).

      A media file that is already missing on disk is still cleaned up
      (thumbnails and database row). Any other failure to remove the file
      skips that item, so it is not counted as deleted.

      Args:
          before_ts: Timestamp in ms passed to the store to select old media.

      Returns:
          A dict with a single "deleted" key holding the number of media
          items removed.
      """
      old_media = await self.store.get_remote_media_ids(
          before_ts, include_quarantined_media=False
      )

      deleted = 0

      for origin, media_id, file_id in old_media:
          key = (origin, media_id)

          logger.info("Deleting: %r", key)

          # TODO: Should we delete from the backup store

          async with self.remote_media_linearizer.queue(key):
              full_path = self.filepaths.remote_media_filepath(origin, file_id)
              try:
                  os.remove(full_path)
              except OSError as e:
                  # Include the error, as the local media deletion path does.
                  logger.warning("Failed to remove file: %r: %s", full_path, e)
                  if e.errno != errno.ENOENT:
                      # The file may still exist: leave the DB row alone.
                      continue

              thumbnail_dir = self.filepaths.remote_media_thumbnail_dir(
                  origin, file_id
              )
              shutil.rmtree(thumbnail_dir, ignore_errors=True)

              await self.store.delete_remote_media(origin, media_id)
              deleted += 1

      return {"deleted": deleted}
  async def delete_local_media_ids(
      self, media_ids: List[str]
  ) -> Tuple[List[str], int]:
      """
      Delete the given media IDs from this server.

      This is a thin public wrapper: all of the work is delegated to
      `_remove_local_media_from_disk`.

      Args:
          media_ids: The media IDs to delete.

      Returns:
          A tuple of (list of deleted media IDs, total deleted media IDs).
      """
      removal_result = await self._remove_local_media_from_disk(media_ids)
      return removal_result
  async def delete_old_local_media(
      self,
      before_ts: int,
      size_gt: int = 0,
      keep_profiles: bool = True,
      delete_quarantined_media: bool = False,
      delete_protected_media: bool = False,
  ) -> Tuple[List[str], int]:
      """
      Delete media from this server by size and timestamp. Removes
      media files, any thumbnails and cached URLs.

      The media IDs are selected by `store.get_local_media_ids` and then
      handed to `_remove_local_media_from_disk`.

      Args:
          before_ts: Unix timestamp in ms.
              Files that were last used before this timestamp will be deleted.
          size_gt: Size of the media in bytes. Files that are larger will be deleted.
          keep_profiles: Switch to delete also files that are still used in image data
              (e.g user profile, room avatar). If false these files will be deleted.
          delete_quarantined_media: If True, media marked as quarantined will be deleted.
          delete_protected_media: If True, media marked as protected will be deleted.

      Returns:
          A tuple of (list of deleted media IDs, total deleted media IDs).
      """
      # The store speaks of "including" media; callers speak of "deleting" it.
      inclusion_flags = {
          "include_quarantined_media": delete_quarantined_media,
          "include_protected_media": delete_protected_media,
      }
      candidate_ids = await self.store.get_local_media_ids(
          before_ts, size_gt, keep_profiles, **inclusion_flags
      )
      purge_result = await self._remove_local_media_from_disk(candidate_ids)
      return purge_result
  async def _remove_local_media_from_disk(
      self, media_ids: List[str]
  ) -> Tuple[List[str], int]:
      """
      Delete media from this server. Removes media files, any thumbnails
      and cached URLs.

      A media file that is already missing on disk is still cleaned up.
      Any other failure to remove the file skips that media ID, so it is
      not reported as deleted.

      Args:
          media_ids: List of media_id to delete

      Returns:
          A tuple of (list of deleted media IDs, total deleted media IDs).
      """
      deleted_ids = []

      for media_id in media_ids:
          logger.info("Deleting media with ID '%s'", media_id)
          media_path = self.filepaths.local_media_filepath(media_id)

          try:
              os.remove(media_path)
          except OSError as err:
              logger.warning("Failed to remove file: %r: %s", media_path, err)
              if err.errno != errno.ENOENT:
                  # The file may still be on disk: keep its records.
                  continue

          shutil.rmtree(
              self.filepaths.local_media_thumbnail_dir(media_id),
              ignore_errors=True,
          )

          # NOTE(review): this calls the *remote* media deletion on the store
          # with our own server name; confirm that is intended for local media.
          await self.store.delete_remote_media(self.server_name, media_id)

          url_cache_key = (media_id,)
          await self.store.delete_url_cache(url_cache_key)
          await self.store.delete_url_cache_media(url_cache_key)

          deleted_ids.append(media_id)

      return deleted_ids, len(deleted_ids)