您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
 
 
 
 
 
 

519 行
21 KiB

  1. # Copyright 2015, 2016 OpenMarket Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """
This module controls the reliability for application service transactions.

The nominal flow through this module looks like:
              __________
1---ASa[e]-->|  Service |--> Queue ASa[f]
2----ASb[e]->|  Queuer  |
3--ASa[f]--->|__________|-----------+ ASa[e], ASb[e]
                                    V
      -````````-            +------------+
      |````````|<--StoreTxn-|Transaction |
      |Database|            | Controller |---> SEND TO AS
      `--------`            +------------+

What happens on SEND TO AS depends on the state of the Application Service:
 - If the AS is marked as DOWN, do nothing.
 - If the AS is marked as UP, send the transaction.
     * SUCCESS : Increment where the AS is up to txn-wise and nuke the txn
                 contents from the db.
     * FAILURE : Mark AS as DOWN and start Recoverer.

Recoverer attempts to recover ASes that have died. The flow for this looks like:
                ,--------------------- backoff++ --------------.
               V                                               |
  START ---> Wait exp ------> Get oldest txn ID from ----> FAILURE
             backoff           DB and try to send it
                                 ^                |___________
Mark AS as                       |                            V
UP & quit           +---------- YES                       SUCCESS
    |               |                                         |
    NO <--- Have more txns? <------ Mark txn success & nuke <-+
                                      from db; incr AS pos.
                                         Reset backoff.

This is all tied together by the ApplicationServiceScheduler which DIs the
required components.
  46. """
  47. import logging
  48. from typing import (
  49. TYPE_CHECKING,
  50. Awaitable,
  51. Callable,
  52. Collection,
  53. Dict,
  54. Iterable,
  55. List,
  56. Optional,
  57. Sequence,
  58. Set,
  59. Tuple,
  60. )
  61. from synapse.appservice import (
  62. ApplicationService,
  63. ApplicationServiceState,
  64. TransactionOneTimeKeysCount,
  65. TransactionUnusedFallbackKeys,
  66. )
  67. from synapse.appservice.api import ApplicationServiceApi
  68. from synapse.events import EventBase
  69. from synapse.logging.context import run_in_background
  70. from synapse.metrics.background_process_metrics import run_as_background_process
  71. from synapse.storage.databases.main import DataStore
  72. from synapse.types import DeviceListUpdates, JsonMapping
  73. from synapse.util import Clock
  74. if TYPE_CHECKING:
  75. from synapse.server import HomeServer
  76. logger = logging.getLogger(__name__)
  77. # Maximum number of events to provide in an AS transaction.
  78. MAX_PERSISTENT_EVENTS_PER_TRANSACTION = 100
  79. # Maximum number of ephemeral events to provide in an AS transaction.
  80. MAX_EPHEMERAL_EVENTS_PER_TRANSACTION = 100
  81. # Maximum number of to-device messages to provide in an AS transaction.
  82. MAX_TO_DEVICE_MESSAGES_PER_TRANSACTION = 100
  83. class ApplicationServiceScheduler:
  84. """Public facing API for this module. Does the required DI to tie the
  85. components together. This also serves as the "event_pool", which in this
  86. case is a simple array.
  87. """
  88. def __init__(self, hs: "HomeServer"):
  89. self.clock = hs.get_clock()
  90. self.store = hs.get_datastores().main
  91. self.as_api = hs.get_application_service_api()
  92. self.txn_ctrl = _TransactionController(self.clock, self.store, self.as_api)
  93. self.queuer = _ServiceQueuer(self.txn_ctrl, self.clock, hs)
  94. async def start(self) -> None:
  95. logger.info("Starting appservice scheduler")
  96. # check for any DOWN ASes and start recoverers for them.
  97. services = await self.store.get_appservices_by_state(
  98. ApplicationServiceState.DOWN
  99. )
  100. for service in services:
  101. self.txn_ctrl.start_recoverer(service)
  102. def enqueue_for_appservice(
  103. self,
  104. appservice: ApplicationService,
  105. events: Optional[Collection[EventBase]] = None,
  106. ephemeral: Optional[Collection[JsonMapping]] = None,
  107. to_device_messages: Optional[Collection[JsonMapping]] = None,
  108. device_list_summary: Optional[DeviceListUpdates] = None,
  109. ) -> None:
  110. """
  111. Enqueue some data to be sent off to an application service.
  112. Args:
  113. appservice: The application service to create and send a transaction to.
  114. events: The persistent room events to send.
  115. ephemeral: The ephemeral events to send.
  116. to_device_messages: The to-device messages to send. These differ from normal
  117. to-device messages sent to clients, as they have 'to_device_id' and
  118. 'to_user_id' fields.
  119. device_list_summary: A summary of users that the application service either needs
  120. to refresh the device lists of, or those that the application service need no
  121. longer track the device lists of.
  122. """
  123. # We purposefully allow this method to run with empty events/ephemeral
  124. # collections, so that callers do not need to check iterable size themselves.
  125. if (
  126. not events
  127. and not ephemeral
  128. and not to_device_messages
  129. and not device_list_summary
  130. ):
  131. return
  132. if events:
  133. self.queuer.queued_events.setdefault(appservice.id, []).extend(events)
  134. if ephemeral:
  135. self.queuer.queued_ephemeral.setdefault(appservice.id, []).extend(ephemeral)
  136. if to_device_messages:
  137. self.queuer.queued_to_device_messages.setdefault(appservice.id, []).extend(
  138. to_device_messages
  139. )
  140. if device_list_summary:
  141. self.queuer.queued_device_list_summaries.setdefault(
  142. appservice.id, []
  143. ).append(device_list_summary)
  144. # Kick off a new application service transaction
  145. self.queuer.start_background_request(appservice)
  146. class _ServiceQueuer:
  147. """Queue of events waiting to be sent to appservices.
  148. Groups events into transactions per-appservice, and sends them on to the
  149. TransactionController. Makes sure that we only have one transaction in flight per
  150. appservice at a given time.
  151. """
  152. def __init__(
  153. self, txn_ctrl: "_TransactionController", clock: Clock, hs: "HomeServer"
  154. ):
  155. # dict of {service_id: [events]}
  156. self.queued_events: Dict[str, List[EventBase]] = {}
  157. # dict of {service_id: [events]}
  158. self.queued_ephemeral: Dict[str, List[JsonMapping]] = {}
  159. # dict of {service_id: [to_device_message_json]}
  160. self.queued_to_device_messages: Dict[str, List[JsonMapping]] = {}
  161. # dict of {service_id: [device_list_summary]}
  162. self.queued_device_list_summaries: Dict[str, List[DeviceListUpdates]] = {}
  163. # the appservices which currently have a transaction in flight
  164. self.requests_in_flight: Set[str] = set()
  165. self.txn_ctrl = txn_ctrl
  166. self.clock = clock
  167. self._msc3202_transaction_extensions_enabled: bool = (
  168. hs.config.experimental.msc3202_transaction_extensions
  169. )
  170. self._store = hs.get_datastores().main
  171. def start_background_request(self, service: ApplicationService) -> None:
  172. # start a sender for this appservice if we don't already have one
  173. if service.id in self.requests_in_flight:
  174. return
  175. run_as_background_process("as-sender", self._send_request, service)
  176. async def _send_request(self, service: ApplicationService) -> None:
  177. # sanity-check: we shouldn't get here if this service already has a sender
  178. # running.
  179. assert service.id not in self.requests_in_flight
  180. self.requests_in_flight.add(service.id)
  181. try:
  182. while True:
  183. all_events = self.queued_events.get(service.id, [])
  184. events = all_events[:MAX_PERSISTENT_EVENTS_PER_TRANSACTION]
  185. del all_events[:MAX_PERSISTENT_EVENTS_PER_TRANSACTION]
  186. all_events_ephemeral = self.queued_ephemeral.get(service.id, [])
  187. ephemeral = all_events_ephemeral[:MAX_EPHEMERAL_EVENTS_PER_TRANSACTION]
  188. del all_events_ephemeral[:MAX_EPHEMERAL_EVENTS_PER_TRANSACTION]
  189. all_to_device_messages = self.queued_to_device_messages.get(
  190. service.id, []
  191. )
  192. to_device_messages_to_send = all_to_device_messages[
  193. :MAX_TO_DEVICE_MESSAGES_PER_TRANSACTION
  194. ]
  195. del all_to_device_messages[:MAX_TO_DEVICE_MESSAGES_PER_TRANSACTION]
  196. # Consolidate any pending device list summaries into a single, up-to-date
  197. # summary.
  198. # Note: this code assumes that in a single DeviceListUpdates, a user will
  199. # never be in both "changed" and "left" sets.
  200. device_list_summary = DeviceListUpdates()
  201. for summary in self.queued_device_list_summaries.get(service.id, []):
  202. # For every user in the incoming "changed" set:
  203. # * Remove them from the existing "left" set if necessary
  204. # (as we need to start tracking them again)
  205. # * Add them to the existing "changed" set if necessary.
  206. device_list_summary.left.difference_update(summary.changed)
  207. device_list_summary.changed.update(summary.changed)
  208. # For every user in the incoming "left" set:
  209. # * Remove them from the existing "changed" set if necessary
  210. # (we no longer need to track them)
  211. # * Add them to the existing "left" set if necessary.
  212. device_list_summary.changed.difference_update(summary.left)
  213. device_list_summary.left.update(summary.left)
  214. self.queued_device_list_summaries.clear()
  215. if (
  216. not events
  217. and not ephemeral
  218. and not to_device_messages_to_send
  219. # DeviceListUpdates is True if either the 'changed' or 'left' sets have
  220. # at least one entry, otherwise False
  221. and not device_list_summary
  222. ):
  223. return
  224. one_time_keys_count: Optional[TransactionOneTimeKeysCount] = None
  225. unused_fallback_keys: Optional[TransactionUnusedFallbackKeys] = None
  226. if (
  227. self._msc3202_transaction_extensions_enabled
  228. and service.msc3202_transaction_extensions
  229. ):
  230. # Compute the one-time key counts and fallback key usage states
  231. # for the users which are mentioned in this transaction,
  232. # as well as the appservice's sender.
  233. (
  234. one_time_keys_count,
  235. unused_fallback_keys,
  236. ) = await self._compute_msc3202_otk_counts_and_fallback_keys(
  237. service, events, ephemeral, to_device_messages_to_send
  238. )
  239. try:
  240. await self.txn_ctrl.send(
  241. service,
  242. events,
  243. ephemeral,
  244. to_device_messages_to_send,
  245. one_time_keys_count,
  246. unused_fallback_keys,
  247. device_list_summary,
  248. )
  249. except Exception:
  250. logger.exception("AS request failed")
  251. finally:
  252. self.requests_in_flight.discard(service.id)
  253. async def _compute_msc3202_otk_counts_and_fallback_keys(
  254. self,
  255. service: ApplicationService,
  256. events: Iterable[EventBase],
  257. ephemerals: Iterable[JsonMapping],
  258. to_device_messages: Iterable[JsonMapping],
  259. ) -> Tuple[TransactionOneTimeKeysCount, TransactionUnusedFallbackKeys]:
  260. """
  261. Given a list of the events, ephemeral messages and to-device messages,
  262. - first computes a list of application services users that may have
  263. interesting updates to the one-time key counts or fallback key usage.
  264. - then computes one-time key counts and fallback key usages for those users.
  265. Given a list of application service users that are interesting,
  266. compute one-time key counts and fallback key usages for the users.
  267. """
  268. # Set of 'interesting' users who may have updates
  269. users: Set[str] = set()
  270. # The sender is always included
  271. users.add(service.sender)
  272. # All AS users that would receive the PDUs or EDUs sent to these rooms
  273. # are classed as 'interesting'.
  274. rooms_of_interesting_users: Set[str] = set()
  275. # PDUs
  276. rooms_of_interesting_users.update(event.room_id for event in events)
  277. # EDUs
  278. rooms_of_interesting_users.update(
  279. ephemeral["room_id"]
  280. for ephemeral in ephemerals
  281. if ephemeral.get("room_id") is not None
  282. )
  283. # Look up the AS users in those rooms
  284. for room_id in rooms_of_interesting_users:
  285. users.update(
  286. await self._store.get_app_service_users_in_room(room_id, service)
  287. )
  288. # Add recipients of to-device messages.
  289. users.update(
  290. device_message["to_user_id"] for device_message in to_device_messages
  291. )
  292. # Compute and return the counts / fallback key usage states
  293. otk_counts = await self._store.count_bulk_e2e_one_time_keys_for_as(users)
  294. unused_fbks = await self._store.get_e2e_bulk_unused_fallback_key_types(users)
  295. return otk_counts, unused_fbks
  296. class _TransactionController:
  297. """Transaction manager.
  298. Builds AppServiceTransactions and runs their lifecycle. Also starts a Recoverer
  299. if a transaction fails.
  300. (Note we have only have one of these in the homeserver.)
  301. """
  302. def __init__(self, clock: Clock, store: DataStore, as_api: ApplicationServiceApi):
  303. self.clock = clock
  304. self.store = store
  305. self.as_api = as_api
  306. # map from service id to recoverer instance
  307. self.recoverers: Dict[str, "_Recoverer"] = {}
  308. # for UTs
  309. self.RECOVERER_CLASS = _Recoverer
  310. async def send(
  311. self,
  312. service: ApplicationService,
  313. events: Sequence[EventBase],
  314. ephemeral: Optional[List[JsonMapping]] = None,
  315. to_device_messages: Optional[List[JsonMapping]] = None,
  316. one_time_keys_count: Optional[TransactionOneTimeKeysCount] = None,
  317. unused_fallback_keys: Optional[TransactionUnusedFallbackKeys] = None,
  318. device_list_summary: Optional[DeviceListUpdates] = None,
  319. ) -> None:
  320. """
  321. Create a transaction with the given data and send to the provided
  322. application service.
  323. Args:
  324. service: The application service to send the transaction to.
  325. events: The persistent events to include in the transaction.
  326. ephemeral: The ephemeral events to include in the transaction.
  327. to_device_messages: The to-device messages to include in the transaction.
  328. one_time_keys_count: Counts of remaining one-time keys for relevant
  329. appservice devices in the transaction.
  330. unused_fallback_keys: Lists of unused fallback keys for relevant
  331. appservice devices in the transaction.
  332. device_list_summary: The device list summary to include in the transaction.
  333. """
  334. try:
  335. service_is_up = await self._is_service_up(service)
  336. # Don't create empty txns when in recovery mode (ephemeral events are dropped)
  337. if not service_is_up and not events:
  338. return
  339. txn = await self.store.create_appservice_txn(
  340. service=service,
  341. events=events,
  342. ephemeral=ephemeral or [],
  343. to_device_messages=to_device_messages or [],
  344. one_time_keys_count=one_time_keys_count or {},
  345. unused_fallback_keys=unused_fallback_keys or {},
  346. device_list_summary=device_list_summary or DeviceListUpdates(),
  347. )
  348. if service_is_up:
  349. sent = await txn.send(self.as_api)
  350. if sent:
  351. await txn.complete(self.store)
  352. else:
  353. run_in_background(self._on_txn_fail, service)
  354. except Exception:
  355. logger.exception("Error creating appservice transaction")
  356. run_in_background(self._on_txn_fail, service)
  357. async def on_recovered(self, recoverer: "_Recoverer") -> None:
  358. logger.info(
  359. "Successfully recovered application service AS ID %s", recoverer.service.id
  360. )
  361. self.recoverers.pop(recoverer.service.id)
  362. logger.info("Remaining active recoverers: %s", len(self.recoverers))
  363. await self.store.set_appservice_state(
  364. recoverer.service, ApplicationServiceState.UP
  365. )
  366. async def _on_txn_fail(self, service: ApplicationService) -> None:
  367. try:
  368. await self.store.set_appservice_state(service, ApplicationServiceState.DOWN)
  369. self.start_recoverer(service)
  370. except Exception:
  371. logger.exception("Error starting AS recoverer")
  372. def start_recoverer(self, service: ApplicationService) -> None:
  373. """Start a Recoverer for the given service
  374. Args:
  375. service:
  376. """
  377. logger.info("Starting recoverer for AS ID %s", service.id)
  378. assert service.id not in self.recoverers
  379. recoverer = self.RECOVERER_CLASS(
  380. self.clock, self.store, self.as_api, service, self.on_recovered
  381. )
  382. self.recoverers[service.id] = recoverer
  383. recoverer.recover()
  384. logger.info("Now %i active recoverers", len(self.recoverers))
  385. async def _is_service_up(self, service: ApplicationService) -> bool:
  386. state = await self.store.get_appservice_state(service)
  387. return state == ApplicationServiceState.UP or state is None
  388. class _Recoverer:
  389. """Manages retries and backoff for a DOWN appservice.
  390. We have one of these for each appservice which is currently considered DOWN.
  391. Args:
  392. clock (synapse.util.Clock):
  393. store (synapse.storage.DataStore):
  394. as_api (synapse.appservice.api.ApplicationServiceApi):
  395. service (synapse.appservice.ApplicationService): the service we are managing
  396. callback (callable[_Recoverer]): called once the service recovers.
  397. """
  398. def __init__(
  399. self,
  400. clock: Clock,
  401. store: DataStore,
  402. as_api: ApplicationServiceApi,
  403. service: ApplicationService,
  404. callback: Callable[["_Recoverer"], Awaitable[None]],
  405. ):
  406. self.clock = clock
  407. self.store = store
  408. self.as_api = as_api
  409. self.service = service
  410. self.callback = callback
  411. self.backoff_counter = 1
  412. def recover(self) -> None:
  413. delay = 2**self.backoff_counter
  414. logger.info("Scheduling retries on %s in %fs", self.service.id, delay)
  415. self.clock.call_later(
  416. delay, run_as_background_process, "as-recoverer", self.retry
  417. )
  418. def _backoff(self) -> None:
  419. # cap the backoff to be around 8.5min => (2^9) = 512 secs
  420. if self.backoff_counter < 9:
  421. self.backoff_counter += 1
  422. self.recover()
  423. async def retry(self) -> None:
  424. logger.info("Starting retries on %s", self.service.id)
  425. try:
  426. while True:
  427. txn = await self.store.get_oldest_unsent_txn(self.service)
  428. if not txn:
  429. # nothing left: we're done!
  430. await self.callback(self)
  431. return
  432. logger.info(
  433. "Retrying transaction %s for AS ID %s", txn.id, txn.service.id
  434. )
  435. sent = await txn.send(self.as_api)
  436. if not sent:
  437. break
  438. await txn.complete(self.store)
  439. # reset the backoff counter and then process the next transaction
  440. self.backoff_counter = 1
  441. except Exception:
  442. logger.exception("Unexpected error running retries")
  443. # we didn't manage to send all of the transactions before we got an error of
  444. # some flavour: reschedule the next retry.
  445. self._backoff()