You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

439 lines
17 KiB

  1. # Copyright 2018 New Vector
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import logging
  15. from typing import TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, cast
  16. from synapse.metrics.background_process_metrics import wrap_as_background_process
  17. from synapse.storage.database import (
  18. DatabasePool,
  19. LoggingDatabaseConnection,
  20. LoggingTransaction,
  21. make_in_list_sql_clause,
  22. )
  23. from synapse.storage.databases.main.registration import RegistrationWorkerStore
  24. from synapse.util.caches.descriptors import cached
  25. from synapse.util.threepids import canonicalise_email
  26. if TYPE_CHECKING:
  27. from synapse.server import HomeServer
  28. logger = logging.getLogger(__name__)
  29. # Granularity, in milliseconds, of the timestamp stored in monthly_active_users.
  30. # An existing row is only rewritten once it is older than this, so the table need not be updated on every request.
  31. LAST_SEEN_GRANULARITY = 60 * 60 * 1000
  32. class MonthlyActiveUsersWorkerStore(RegistrationWorkerStore):
  def __init__(
      self,
      database: DatabasePool,
      db_conn: LoggingDatabaseConnection,
      hs: "HomeServer",
  ):
      """Set up MAU tracking state and, on the updating worker, seed reserved users.

      Args:
          database: passed through to the parent store's constructor
          db_conn: passed through to the parent store's constructor and used for
              the start-up transaction that registers reserved users
          hs: the homeserver, from which the clock and MAU settings are read
      """
      super().__init__(database, db_conn, hs)

      self.hs = hs
      self._clock = hs.get_clock()

      # Keep local copies of the MAU-related server settings.
      server_config = hs.config.server
      self._limit_usage_by_mau = server_config.limit_usage_by_mau
      self._max_mau_value = server_config.max_mau_value
      self._mau_stats_only = server_config.mau_stats_only

      # If we're using Redis, we can shift this update process off to the
      # background worker; if we're NOT using Redis, this must be handled by
      # the master.
      self._update_on_this_worker = (
          hs.config.worker.run_background_tasks
          if hs.config.redis.redis_enabled
          else hs.get_instance_name() == "master"
      )

      if not self._update_on_this_worker:
          return

      # Do not add more reserved users than the total allowable number
      reserved_threepids = server_config.mau_limits_reserved_threepids[
          : self._max_mau_value
      ]
      self.db_pool.new_transaction(
          db_conn,
          "initialise_mau_threepids",
          [],
          [],
          [],
          self._initialise_reserved_users,
          reserved_threepids,
      )
  @cached(num_args=0)
  async def get_monthly_active_count(self) -> int:
      """Count the monthly active users, leaving out app service users.

      Returns:
          Number of current monthly active users
      """

      def _count_non_appservice_users_txn(txn: LoggingTransaction) -> int:
          # Exclude app service users: only rows whose joined `users` entry has
          # a NULL or empty appservice_id are counted.
          txn.execute(
              """
              SELECT COUNT(*)
              FROM monthly_active_users
                  LEFT JOIN users
                  ON monthly_active_users.user_id=users.name
              WHERE (users.appservice_id IS NULL OR users.appservice_id = '');
              """
          )
          row = cast(Tuple[int], txn.fetchone())
          (total,) = row
          return total

      return await self.db_pool.runInteraction(
          "count_users", _count_non_appservice_users_txn
      )
  @cached(num_args=0)
  async def get_monthly_active_count_by_service(self) -> Mapping[str, int]:
      """Count the monthly active users, grouped by the service they belong to.

      A service is typically an appservice but also includes native matrix users.
      Since the `monthly_active_users` table is populated from the `user_ips` table
      `config.appservice.track_appservice_user_ips` must be set to `true` for this
      method to return anything other than native matrix users.

      Returns:
          A mapping between app_service_id and the number of occurrences. Users
          with no appservice_id are reported under the key "native".
      """

      def _count_by_service_txn(txn: LoggingTransaction) -> Dict[str, int]:
          txn.execute(
              """
              SELECT COALESCE(appservice_id, 'native'), COUNT(*)
              FROM monthly_active_users
              LEFT JOIN users ON monthly_active_users.user_id=users.name
              GROUP BY appservice_id;
              """
          )
          rows = cast(List[Tuple[str, int]], txn.fetchall())
          # Each row is a (service, count) pair.
          return {service: user_count for service, user_count in rows}

      return await self.db_pool.runInteraction(
          "count_users_by_service", _count_by_service_txn
      )
  async def get_monthly_active_users_by_service(
      self, start_timestamp: Optional[int] = None, end_timestamp: Optional[int] = None
  ) -> List[Tuple[str, str]]:
      """List the monthly active users together with the service each belongs to.

      Please see "get_monthly_active_count_by_service" docstring for more details
      about services.

      Arguments:
          start_timestamp: If specified, only include users that were first active
              at or after this point
          end_timestamp: If specified, only include users that were first active
              at or before this point

      Returns:
          A list of tuples (appservice_id, user_id). "native" is emitted as the
          appservice for users that don't come from appservices (i.e. native Matrix
          users).
      """
      # Collect one condition (and its bound parameter) per supplied bound, in
      # start-then-end order so the placeholders line up with the parameters.
      conditions: List[str] = []
      query_params: List[int] = []
      if start_timestamp is not None:
          conditions.append('"timestamp" >= ?')
          query_params.append(start_timestamp)
      if end_timestamp is not None:
          conditions.append('"timestamp" <= ?')
          query_params.append(end_timestamp)

      # No bounds at all means no WHERE clause.
      where_clause = ("WHERE " + " and ".join(conditions)) if conditions else ""

      def _list_users_txn(txn: LoggingTransaction) -> List[Tuple[str, str]]:
          txn.execute(
              f"""
                  SELECT COALESCE(appservice_id, 'native'), user_id
                  FROM monthly_active_users
                  LEFT JOIN users ON monthly_active_users.user_id=users.name
                  {where_clause};
              """,
              query_params,
          )
          return cast(List[Tuple[str, str]], txn.fetchall())

      return await self.db_pool.runInteraction("list_users", _list_users_txn)
  async def get_registered_reserved_users(self) -> List[str]:
      """Resolve the reserved threepids from config to the users that own them.

      Only the first `max_mau_value` configured threepids are considered, and
      threepids that do not map to a user are skipped.

      Returns:
          User IDs of actual users that are reserved
      """
      server_config = self.hs.config.server
      reserved_threepids = server_config.mau_limits_reserved_threepids[
          : server_config.max_mau_value
      ]

      registered: List[str] = []
      for threepid in reserved_threepids:
          address = canonicalise_email(threepid["address"])
          user_id = await self.hs.get_datastores().main.get_user_id_by_threepid(
              threepid["medium"], address
          )
          if not user_id:
              continue
          registered.append(user_id)
      return registered
  @cached(num_args=1)
  async def user_last_seen_monthly_active(self, user_id: str) -> Optional[int]:
      """Look up the timestamp stored for a user in `monthly_active_users`.

      Arguments:
          user_id: user to look up

      Return:
          The stored `timestamp` value for the user, None if never seen
      """
      last_seen = await self.db_pool.simple_select_one_onecol(
          table="monthly_active_users",
          keyvalues={"user_id": user_id},
          retcol="timestamp",
          allow_none=True,
          desc="user_last_seen_monthly_active",
      )
      return last_seen
  @wrap_as_background_process("reap_monthly_active_users")
  async def reap_monthly_active_users(self) -> None:
      """Cleans out monthly active user table to ensure that no stale
      entries exist.

      Rows with a timestamp more than thirty days old are deleted. When
      `limit_usage_by_mau` is set, the table is additionally trimmed so that at
      most `max_mau_value - len(reserved_users)` non-reserved rows remain,
      keeping the most recently active ones. Reserved users are excluded from
      both deletions. The MAU caches are invalidated afterwards.
      """

      def _reap_users(txn: LoggingTransaction, reserved_users: List[str]) -> None:
          """
          Args:
              txn: transaction in which the deletions and invalidations run
              reserved_users: reserved users to preserve
          """

          thirty_days_ago = int(self._clock.time_msec()) - (1000 * 60 * 60 * 24 * 30)

          in_clause, in_clause_args = make_in_list_sql_clause(
              self.database_engine, "user_id", reserved_users
          )

          txn.execute(
              "DELETE FROM monthly_active_users WHERE timestamp < ? AND NOT %s"
              % (in_clause,),
              [thirty_days_ago] + in_clause_args,
          )

          if self._limit_usage_by_mau:
              # If MAU user count still exceeds the MAU threshold, then delete on
              # a least recently active basis.
              # Note it is not possible to write this query using OFFSET due to
              # incompatibilities in how sqlite and postgres support the feature.
              # Sqlite requires 'LIMIT -1 OFFSET ?', the LIMIT must be present,
              # while Postgres does not require 'LIMIT', but also does not support
              # negative LIMIT values. So there is no way to write it that both can
              # support

              # This is the number of non-reserved rows that survive (it feeds
              # the inner SELECT's LIMIT). Limit must be >= 0 for postgres.
              num_of_non_reserved_users_to_keep = max(
                  self._max_mau_value - len(reserved_users), 0
              )

              # It is important to filter reserved users twice to guard
              # against the case where the reserved user is present in the
              # SELECT, meaning that a legitimate mau is deleted.
              sql = """
                  DELETE FROM monthly_active_users
                  WHERE user_id NOT IN (
                      SELECT user_id FROM monthly_active_users
                      WHERE NOT %s
                      ORDER BY timestamp DESC
                      LIMIT ?
                  )
                  AND NOT %s
              """ % (
                  in_clause,
                  in_clause,
              )

              # Parameter order mirrors the placeholders: inner IN-list, LIMIT,
              # outer IN-list.
              query_args = (
                  in_clause_args
                  + [num_of_non_reserved_users_to_keep]
                  + in_clause_args
              )
              txn.execute(sql, query_args)

          # It seems poor to invalidate the whole cache. Postgres supports
          # 'Returning' which would allow me to invalidate only the
          # specific users, but sqlite has no way to do this and instead
          # I would need to SELECT and the DELETE which without locking
          # is racy.
          # Have resolved to invalidate the whole cache for now and do
          # something about it if and when the perf becomes significant
          self._invalidate_all_cache_and_stream(
              txn, self.user_last_seen_monthly_active
          )
          self._invalidate_cache_and_stream(txn, self.get_monthly_active_count, ())
          # Deleting rows changes the per-service breakdown as well, so that
          # cache must be dropped too (as upsert_monthly_active_user_txn does).
          self._invalidate_cache_and_stream(
              txn, self.get_monthly_active_count_by_service, ()
          )

      reserved_users = await self.get_registered_reserved_users()
      await self.db_pool.runInteraction(
          "reap_monthly_active_users", _reap_users, reserved_users
      )
  def _initialise_reserved_users(
      self, txn: LoggingTransaction, threepids: List[dict]
  ) -> None:
      """Upsert a `monthly_active_users` row for each registered reserved
      threepid; should be called on start up.

      Threepids that do not resolve to a user are logged and skipped, and
      support users are skipped silently.

      Args:
          txn: transaction in which the lookups and upserts run
          threepids: List of threepid dicts to reserve
      """
      assert (
          self._update_on_this_worker
      ), "This worker is not designated to update MAUs"

      # XXX what is this function trying to achieve? It upserts into
      # monthly_active_users for each *registered* reserved mau user, but why?
      #
      #  - shouldn't there already be an entry for each reserved user (at least
      #    if they have been active recently)?
      #
      #  - if it's important that the timestamp is kept up to date, why do we only
      #    run this at startup?

      for threepid in threepids:
          user_id = self.get_user_id_by_threepid_txn(
              txn, threepid["medium"], threepid["address"]
          )

          if not user_id:
              logger.warning(
                  "mau limit reserved threepid %s not found in db" % threepid
              )
              continue

          # Support users should not appear in the MAU table.
          if self.is_support_user_txn(txn, user_id):
              continue

          # We do this manually here to avoid hitting https://github.com/matrix-org/synapse/issues/6791
          now_ms = int(self._clock.time_msec())
          self.db_pool.simple_upsert_txn(
              txn,
              table="monthly_active_users",
              keyvalues={"user_id": user_id},
              values={"timestamp": now_ms},
          )
  async def upsert_monthly_active_user(self, user_id: str) -> None:
      """Record the user in the monthly active user table, which is used to
      track the current MAU usage of the server. Support users are ignored.

      Args:
          user_id: user to add/update
      """
      assert (
          self._update_on_this_worker
      ), "This worker is not designated to update MAUs"

      # Support user never to be included in MAU stats. Note I can't easily call this
      # from upsert_monthly_active_user_txn because then I need a _txn form of
      # is_support_user which is complicated because I want to cache the result.
      # Therefore I call it here and ignore the case where
      # upsert_monthly_active_user_txn is called directly from
      # _initialise_reserved_users reasoning that it would be very strange to
      # include a support user in this context.
      if await self.is_support_user(user_id):
          return

      await self.db_pool.runInteraction(
          "upsert_monthly_active_user",
          self.upsert_monthly_active_user_txn,
          user_id,
      )
  def upsert_monthly_active_user_txn(
      self, txn: LoggingTransaction, user_id: str
  ) -> None:
      """Upsert the user's row in `monthly_active_users` with the current time
      and invalidate the MAU caches.

      We consciously do not call is_support_txn from this method because it
      is not possible to cache the response. is_support_txn will be false in
      almost all cases, so it seems reasonable to call it only for
      upsert_monthly_active_user and to call is_support_txn manually
      for cases where upsert_monthly_active_user_txn is called directly,
      like _initialise_reserved_users

      In short, don't call this method with support users. (Support users
      should not appear in the MAU stats).

      Args:
          txn: transaction in which the upsert and invalidations run
          user_id: user to add/update
      """
      assert (
          self._update_on_this_worker
      ), "This worker is not designated to update MAUs"

      # Am consciously deciding to lock the table on the basis that is ought
      # never be a big table and alternative approaches (batching multiple
      # upserts into a single txn) introduced a lot of extra complexity.
      # See https://github.com/matrix-org/synapse/issues/3854 for more
      now_ms = int(self._clock.time_msec())
      self.db_pool.simple_upsert_txn(
          txn,
          table="monthly_active_users",
          keyvalues={"user_id": user_id},
          values={"timestamp": now_ms},
      )

      # Both counts take no arguments; the last-seen lookup is keyed by user.
      stale_entries = (
          (self.get_monthly_active_count, ()),
          (self.get_monthly_active_count_by_service, ()),
          (self.user_last_seen_monthly_active, (user_id,)),
      )
      for cached_method, cache_key in stale_entries:
          self._invalidate_cache_and_stream(txn, cached_method, cache_key)
  async def populate_monthly_active_users(self, user_id: str) -> None:
      """Decide whether the user should be written to the monthly active table
      and, if so, upsert them.

      Nothing happens unless `limit_usage_by_mau` or `mau_stats_only` is set.
      Guests and trial users are never added.

      Args:
          user_id: the user_id to query
      """
      assert (
          self._update_on_this_worker
      ), "This worker is not designated to update MAUs"

      if not (self._limit_usage_by_mau or self._mau_stats_only):
          return

      # Trial users and guests should not be included as part of MAU group
      if await self.is_guest(user_id):
          return
      if await self.is_trial_user(user_id):
          return

      last_seen_timestamp = await self.user_last_seen_monthly_active(user_id)
      now = self.hs.get_clock().time_msec()

      # We want to reduce to the total number of db writes, and are happy
      # to trade accuracy of timestamp in order to lighten load. This means
      # We always insert new users (where MAU threshold has not been reached),
      # but only update if we have not previously seen the user for
      # LAST_SEEN_GRANULARITY ms
      if last_seen_timestamp is not None:
          if now - last_seen_timestamp > LAST_SEEN_GRANULARITY:
              await self.upsert_monthly_active_user(user_id)
          return

      # In the case where mau_stats_only is True and limit_usage_by_mau is
      # False, there is no point in checking get_monthly_active_count - it
      # adds no value and will break the logic if max_mau_value is exceeded.
      if self._limit_usage_by_mau:
          current_count = await self.get_monthly_active_count()
          if current_count >= self._max_mau_value:
              return

      await self.upsert_monthly_active_user(user_id)