You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

362 lines
14 KiB

  1. # -*- coding: utf-8 -*-
  2. # Copyright 2018 New Vector
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import logging
  16. from typing import List
  17. from twisted.internet import defer
  18. from synapse.storage._base import SQLBaseStore
  19. from synapse.storage.database import DatabasePool, make_in_list_sql_clause
  20. from synapse.util.caches.descriptors import cached
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Granularity, in msec, at which the monthly_active_users timestamp is stored
# (60 * 60 * 1000 msec, i.e. one hour). An existing row is only refreshed once
# it is older than this, so the table does not need updating on every request.
LAST_SEEN_GRANULARITY = 60 * 60 * 1000
  25. class MonthlyActiveUsersWorkerStore(SQLBaseStore):
  26. def __init__(self, database: DatabasePool, db_conn, hs):
  27. super(MonthlyActiveUsersWorkerStore, self).__init__(database, db_conn, hs)
  28. self._clock = hs.get_clock()
  29. self.hs = hs
  30. @cached(num_args=0)
  31. def get_monthly_active_count(self):
  32. """Generates current count of monthly active users
  33. Returns:
  34. Defered[int]: Number of current monthly active users
  35. """
  36. def _count_users(txn):
  37. sql = "SELECT COALESCE(count(*), 0) FROM monthly_active_users"
  38. txn.execute(sql)
  39. (count,) = txn.fetchone()
  40. return count
  41. return self.db_pool.runInteraction("count_users", _count_users)
  42. @cached(num_args=0)
  43. def get_monthly_active_count_by_service(self):
  44. """Generates current count of monthly active users broken down by service.
  45. A service is typically an appservice but also includes native matrix users.
  46. Since the `monthly_active_users` table is populated from the `user_ips` table
  47. `config.track_appservice_user_ips` must be set to `true` for this
  48. method to return anything other than native matrix users.
  49. Returns:
  50. Deferred[dict]: dict that includes a mapping between app_service_id
  51. and the number of occurrences.
  52. """
  53. def _count_users_by_service(txn):
  54. sql = """
  55. SELECT COALESCE(appservice_id, 'native'), COALESCE(count(*), 0)
  56. FROM monthly_active_users
  57. LEFT JOIN users ON monthly_active_users.user_id=users.name
  58. GROUP BY appservice_id;
  59. """
  60. txn.execute(sql)
  61. result = txn.fetchall()
  62. return dict(result)
  63. return self.db_pool.runInteraction(
  64. "count_users_by_service", _count_users_by_service
  65. )
  66. async def get_registered_reserved_users(self) -> List[str]:
  67. """Of the reserved threepids defined in config, retrieve those that are associated
  68. with registered users
  69. Returns:
  70. User IDs of actual users that are reserved
  71. """
  72. users = []
  73. for tp in self.hs.config.mau_limits_reserved_threepids[
  74. : self.hs.config.max_mau_value
  75. ]:
  76. user_id = await self.hs.get_datastore().get_user_id_by_threepid(
  77. tp["medium"], tp["address"]
  78. )
  79. if user_id:
  80. users.append(user_id)
  81. return users
  82. @cached(num_args=1)
  83. def user_last_seen_monthly_active(self, user_id):
  84. """
  85. Checks if a given user is part of the monthly active user group
  86. Arguments:
  87. user_id (str): user to add/update
  88. Return:
  89. Deferred[int] : timestamp since last seen, None if never seen
  90. """
  91. return self.db_pool.simple_select_one_onecol(
  92. table="monthly_active_users",
  93. keyvalues={"user_id": user_id},
  94. retcol="timestamp",
  95. allow_none=True,
  96. desc="user_last_seen_monthly_active",
  97. )
  98. class MonthlyActiveUsersStore(MonthlyActiveUsersWorkerStore):
  99. def __init__(self, database: DatabasePool, db_conn, hs):
  100. super(MonthlyActiveUsersStore, self).__init__(database, db_conn, hs)
  101. self._limit_usage_by_mau = hs.config.limit_usage_by_mau
  102. self._mau_stats_only = hs.config.mau_stats_only
  103. self._max_mau_value = hs.config.max_mau_value
  104. # Do not add more reserved users than the total allowable number
  105. # cur = LoggingTransaction(
  106. self.db_pool.new_transaction(
  107. db_conn,
  108. "initialise_mau_threepids",
  109. [],
  110. [],
  111. self._initialise_reserved_users,
  112. hs.config.mau_limits_reserved_threepids[: self._max_mau_value],
  113. )
  114. def _initialise_reserved_users(self, txn, threepids):
  115. """Ensures that reserved threepids are accounted for in the MAU table, should
  116. be called on start up.
  117. Args:
  118. txn (cursor):
  119. threepids (list[dict]): List of threepid dicts to reserve
  120. """
  121. # XXX what is this function trying to achieve? It upserts into
  122. # monthly_active_users for each *registered* reserved mau user, but why?
  123. #
  124. # - shouldn't there already be an entry for each reserved user (at least
  125. # if they have been active recently)?
  126. #
  127. # - if it's important that the timestamp is kept up to date, why do we only
  128. # run this at startup?
  129. for tp in threepids:
  130. user_id = self.get_user_id_by_threepid_txn(txn, tp["medium"], tp["address"])
  131. if user_id:
  132. is_support = self.is_support_user_txn(txn, user_id)
  133. if not is_support:
  134. # We do this manually here to avoid hitting #6791
  135. self.db_pool.simple_upsert_txn(
  136. txn,
  137. table="monthly_active_users",
  138. keyvalues={"user_id": user_id},
  139. values={"timestamp": int(self._clock.time_msec())},
  140. )
  141. else:
  142. logger.warning("mau limit reserved threepid %s not found in db" % tp)
  143. async def reap_monthly_active_users(self):
  144. """Cleans out monthly active user table to ensure that no stale
  145. entries exist.
  146. """
  147. def _reap_users(txn, reserved_users):
  148. """
  149. Args:
  150. reserved_users (tuple): reserved users to preserve
  151. """
  152. thirty_days_ago = int(self._clock.time_msec()) - (1000 * 60 * 60 * 24 * 30)
  153. in_clause, in_clause_args = make_in_list_sql_clause(
  154. self.database_engine, "user_id", reserved_users
  155. )
  156. txn.execute(
  157. "DELETE FROM monthly_active_users WHERE timestamp < ? AND NOT %s"
  158. % (in_clause,),
  159. [thirty_days_ago] + in_clause_args,
  160. )
  161. if self._limit_usage_by_mau:
  162. # If MAU user count still exceeds the MAU threshold, then delete on
  163. # a least recently active basis.
  164. # Note it is not possible to write this query using OFFSET due to
  165. # incompatibilities in how sqlite and postgres support the feature.
  166. # Sqlite requires 'LIMIT -1 OFFSET ?', the LIMIT must be present,
  167. # while Postgres does not require 'LIMIT', but also does not support
  168. # negative LIMIT values. So there is no way to write it that both can
  169. # support
  170. # Limit must be >= 0 for postgres
  171. num_of_non_reserved_users_to_remove = max(
  172. self._max_mau_value - len(reserved_users), 0
  173. )
  174. # It is important to filter reserved users twice to guard
  175. # against the case where the reserved user is present in the
  176. # SELECT, meaning that a legitimate mau is deleted.
  177. sql = """
  178. DELETE FROM monthly_active_users
  179. WHERE user_id NOT IN (
  180. SELECT user_id FROM monthly_active_users
  181. WHERE NOT %s
  182. ORDER BY timestamp DESC
  183. LIMIT ?
  184. )
  185. AND NOT %s
  186. """ % (
  187. in_clause,
  188. in_clause,
  189. )
  190. query_args = (
  191. in_clause_args
  192. + [num_of_non_reserved_users_to_remove]
  193. + in_clause_args
  194. )
  195. txn.execute(sql, query_args)
  196. # It seems poor to invalidate the whole cache. Postgres supports
  197. # 'Returning' which would allow me to invalidate only the
  198. # specific users, but sqlite has no way to do this and instead
  199. # I would need to SELECT and the DELETE which without locking
  200. # is racy.
  201. # Have resolved to invalidate the whole cache for now and do
  202. # something about it if and when the perf becomes significant
  203. self._invalidate_all_cache_and_stream(
  204. txn, self.user_last_seen_monthly_active
  205. )
  206. self._invalidate_cache_and_stream(txn, self.get_monthly_active_count, ())
  207. reserved_users = await self.get_registered_reserved_users()
  208. await self.db_pool.runInteraction(
  209. "reap_monthly_active_users", _reap_users, reserved_users
  210. )
  211. @defer.inlineCallbacks
  212. def upsert_monthly_active_user(self, user_id):
  213. """Updates or inserts the user into the monthly active user table, which
  214. is used to track the current MAU usage of the server
  215. Args:
  216. user_id (str): user to add/update
  217. Returns:
  218. Deferred
  219. """
  220. # Support user never to be included in MAU stats. Note I can't easily call this
  221. # from upsert_monthly_active_user_txn because then I need a _txn form of
  222. # is_support_user which is complicated because I want to cache the result.
  223. # Therefore I call it here and ignore the case where
  224. # upsert_monthly_active_user_txn is called directly from
  225. # _initialise_reserved_users reasoning that it would be very strange to
  226. # include a support user in this context.
  227. is_support = yield self.is_support_user(user_id)
  228. if is_support:
  229. return
  230. yield self.db_pool.runInteraction(
  231. "upsert_monthly_active_user", self.upsert_monthly_active_user_txn, user_id
  232. )
  233. def upsert_monthly_active_user_txn(self, txn, user_id):
  234. """Updates or inserts monthly active user member
  235. We consciously do not call is_support_txn from this method because it
  236. is not possible to cache the response. is_support_txn will be false in
  237. almost all cases, so it seems reasonable to call it only for
  238. upsert_monthly_active_user and to call is_support_txn manually
  239. for cases where upsert_monthly_active_user_txn is called directly,
  240. like _initialise_reserved_users
  241. In short, don't call this method with support users. (Support users
  242. should not appear in the MAU stats).
  243. Args:
  244. txn (cursor):
  245. user_id (str): user to add/update
  246. Returns:
  247. bool: True if a new entry was created, False if an
  248. existing one was updated.
  249. """
  250. # Am consciously deciding to lock the table on the basis that is ought
  251. # never be a big table and alternative approaches (batching multiple
  252. # upserts into a single txn) introduced a lot of extra complexity.
  253. # See https://github.com/matrix-org/synapse/issues/3854 for more
  254. is_insert = self.db_pool.simple_upsert_txn(
  255. txn,
  256. table="monthly_active_users",
  257. keyvalues={"user_id": user_id},
  258. values={"timestamp": int(self._clock.time_msec())},
  259. )
  260. self._invalidate_cache_and_stream(txn, self.get_monthly_active_count, ())
  261. self._invalidate_cache_and_stream(
  262. txn, self.get_monthly_active_count_by_service, ()
  263. )
  264. self._invalidate_cache_and_stream(
  265. txn, self.user_last_seen_monthly_active, (user_id,)
  266. )
  267. return is_insert
  268. @defer.inlineCallbacks
  269. def populate_monthly_active_users(self, user_id):
  270. """Checks on the state of monthly active user limits and optionally
  271. add the user to the monthly active tables
  272. Args:
  273. user_id(str): the user_id to query
  274. """
  275. if self._limit_usage_by_mau or self._mau_stats_only:
  276. # Trial users and guests should not be included as part of MAU group
  277. is_guest = yield self.is_guest(user_id)
  278. if is_guest:
  279. return
  280. is_trial = yield self.is_trial_user(user_id)
  281. if is_trial:
  282. return
  283. last_seen_timestamp = yield self.user_last_seen_monthly_active(user_id)
  284. now = self.hs.get_clock().time_msec()
  285. # We want to reduce to the total number of db writes, and are happy
  286. # to trade accuracy of timestamp in order to lighten load. This means
  287. # We always insert new users (where MAU threshold has not been reached),
  288. # but only update if we have not previously seen the user for
  289. # LAST_SEEN_GRANULARITY ms
  290. if last_seen_timestamp is None:
  291. # In the case where mau_stats_only is True and limit_usage_by_mau is
  292. # False, there is no point in checking get_monthly_active_count - it
  293. # adds no value and will break the logic if max_mau_value is exceeded.
  294. if not self._limit_usage_by_mau:
  295. yield self.upsert_monthly_active_user(user_id)
  296. else:
  297. count = yield self.get_monthly_active_count()
  298. if count < self._max_mau_value:
  299. yield self.upsert_monthly_active_user(user_id)
  300. elif now - last_seen_timestamp > LAST_SEEN_GRANULARITY:
  301. yield self.upsert_monthly_active_user(user_id)