Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 
 
 
 

399 lignes
15 KiB

  1. # Copyright 2023 The Matrix.org Foundation C.I.C.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. import logging
  15. from typing import TYPE_CHECKING, Awaitable, Callable, Dict, List, Optional, Set, Tuple
  16. from twisted.python.failure import Failure
  17. from synapse.logging.context import nested_logging_context
  18. from synapse.metrics import LaterGauge
  19. from synapse.metrics.background_process_metrics import (
  20. run_as_background_process,
  21. wrap_as_background_process,
  22. )
  23. from synapse.types import JsonMapping, ScheduledTask, TaskStatus
  24. from synapse.util.stringutils import random_string
  25. if TYPE_CHECKING:
  26. from synapse.server import HomeServer
  27. logger = logging.getLogger(__name__)
  28. class TaskScheduler:
  29. """
  30. This is a simple task sheduler aimed at resumable tasks: usually we use `run_in_background`
  31. to launch a background task, or Twisted `deferLater` if we want to do so later on.
  32. The problem with that is that the tasks will just stop and never be resumed if synapse
  33. is stopped for whatever reason.
  34. How this works:
  35. - A function mapped to a named action should first be registered with `register_action`.
  36. This function will be called when trying to resuming tasks after a synapse shutdown,
  37. so this registration should happen when synapse is initialised, NOT right before scheduling
  38. a task.
  39. - A task can then be launched using this named action with `schedule_task`. A `params` dict
  40. can be passed, and it will be available to the registered function when launched. This task
  41. can be launch either now-ish, or later on by giving a `timestamp` parameter.
  42. The function may call `update_task` at any time to update the `result` of the task,
  43. and this can be used to resume the task at a specific point and/or to convey a result to
  44. the code launching the task.
  45. You can also specify the `result` (and/or an `error`) when returning from the function.
  46. The reconciliation loop runs every minute, so this is not a precise scheduler.
  47. There is a limit of 10 concurrent tasks, so tasks may be delayed if the pool is already
  48. full. In this regard, please take great care that scheduled tasks can actually finished.
  49. For now there is no mechanism to stop a running task if it is stuck.
  50. Tasks will be run on the worker specified with `run_background_tasks_on` config,
  51. or the main one by default.
  52. """
  53. # Precision of the scheduler, evaluation of tasks to run will only happen
  54. # every `SCHEDULE_INTERVAL_MS` ms
  55. SCHEDULE_INTERVAL_MS = 1 * 60 * 1000 # 1mn
  56. # How often to clean up old tasks.
  57. CLEANUP_INTERVAL_MS = 30 * 60 * 1000
  58. # Time before a complete or failed task is deleted from the DB
  59. KEEP_TASKS_FOR_MS = 7 * 24 * 60 * 60 * 1000 # 1 week
  60. # Maximum number of tasks that can run at the same time
  61. MAX_CONCURRENT_RUNNING_TASKS = 5
  62. # Time from the last task update after which we will log a warning
  63. LAST_UPDATE_BEFORE_WARNING_MS = 24 * 60 * 60 * 1000 # 24hrs
  64. def __init__(self, hs: "HomeServer"):
  65. self._hs = hs
  66. self._store = hs.get_datastores().main
  67. self._clock = hs.get_clock()
  68. self._running_tasks: Set[str] = set()
  69. # A map between action names and their registered function
  70. self._actions: Dict[
  71. str,
  72. Callable[
  73. [ScheduledTask],
  74. Awaitable[Tuple[TaskStatus, Optional[JsonMapping], Optional[str]]],
  75. ],
  76. ] = {}
  77. self._run_background_tasks = hs.config.worker.run_background_tasks
  78. # Flag to make sure we only try and launch new tasks once at a time.
  79. self._launching_new_tasks = False
  80. if self._run_background_tasks:
  81. self._clock.looping_call(
  82. self._launch_scheduled_tasks,
  83. TaskScheduler.SCHEDULE_INTERVAL_MS,
  84. )
  85. self._clock.looping_call(
  86. self._clean_scheduled_tasks,
  87. TaskScheduler.SCHEDULE_INTERVAL_MS,
  88. )
  89. LaterGauge(
  90. "synapse_scheduler_running_tasks",
  91. "The number of concurrent running tasks handled by the TaskScheduler",
  92. labels=None,
  93. caller=lambda: len(self._running_tasks),
  94. )
  95. def register_action(
  96. self,
  97. function: Callable[
  98. [ScheduledTask],
  99. Awaitable[Tuple[TaskStatus, Optional[JsonMapping], Optional[str]]],
  100. ],
  101. action_name: str,
  102. ) -> None:
  103. """Register a function to be executed when an action is scheduled with
  104. the specified action name.
  105. Actions need to be registered as early as possible so that a resumed action
  106. can find its matching function. It's usually better to NOT do that right before
  107. calling `schedule_task` but rather in an `__init__` method.
  108. Args:
  109. function: The function to be executed for this action. The parameter
  110. passed to the function when launched is the `ScheduledTask` being run.
  111. The function should return a tuple of new `status`, `result`
  112. and `error` as specified in `ScheduledTask`.
  113. action_name: The name of the action to be associated with the function
  114. """
  115. self._actions[action_name] = function
  116. async def schedule_task(
  117. self,
  118. action: str,
  119. *,
  120. resource_id: Optional[str] = None,
  121. timestamp: Optional[int] = None,
  122. params: Optional[JsonMapping] = None,
  123. ) -> str:
  124. """Schedule a new potentially resumable task. A function matching the specified
  125. `action` should have be registered with `register_action` before the task is run.
  126. Args:
  127. action: the name of a previously registered action
  128. resource_id: a task can be associated with a resource id to facilitate
  129. getting all tasks associated with a specific resource
  130. timestamp: if `None`, the task will be launched as soon as possible, otherwise it
  131. will be launch as soon as possible after the `timestamp` value.
  132. Note that this scheduler is not meant to be precise, and the scheduling
  133. could be delayed if too many tasks are already running
  134. params: a set of parameters that can be easily accessed from inside the
  135. executed function
  136. Returns:
  137. The id of the scheduled task
  138. """
  139. status = TaskStatus.SCHEDULED
  140. if timestamp is None or timestamp < self._clock.time_msec():
  141. timestamp = self._clock.time_msec()
  142. status = TaskStatus.ACTIVE
  143. task = ScheduledTask(
  144. random_string(16),
  145. action,
  146. status,
  147. timestamp,
  148. resource_id,
  149. params,
  150. result=None,
  151. error=None,
  152. )
  153. await self._store.insert_scheduled_task(task)
  154. if status == TaskStatus.ACTIVE:
  155. if self._run_background_tasks:
  156. await self._launch_task(task)
  157. else:
  158. self._hs.get_replication_command_handler().send_new_active_task(task.id)
  159. return task.id
  160. async def update_task(
  161. self,
  162. id: str,
  163. *,
  164. timestamp: Optional[int] = None,
  165. status: Optional[TaskStatus] = None,
  166. result: Optional[JsonMapping] = None,
  167. error: Optional[str] = None,
  168. ) -> bool:
  169. """Update some task associated values. This is exposed publicly so it can
  170. be used inside task functions, mainly to update the result and be able to
  171. resume a task at a specific step after a restart of synapse.
  172. It can also be used to stage a task, by setting the `status` to `SCHEDULED` with
  173. a new timestamp.
  174. The `status` can only be set to `ACTIVE` or `SCHEDULED`, `COMPLETE` and `FAILED`
  175. are terminal status and can only be set by returning it in the function.
  176. Args:
  177. id: the id of the task to update
  178. timestamp: useful to schedule a new stage of the task at a later date
  179. status: the new `TaskStatus` of the task
  180. result: the new result of the task
  181. error: the new error of the task
  182. """
  183. if status == TaskStatus.COMPLETE or status == TaskStatus.FAILED:
  184. raise Exception(
  185. "update_task can't be called with a FAILED or COMPLETE status"
  186. )
  187. if timestamp is None:
  188. timestamp = self._clock.time_msec()
  189. return await self._store.update_scheduled_task(
  190. id,
  191. timestamp,
  192. status=status,
  193. result=result,
  194. error=error,
  195. )
  196. async def get_task(self, id: str) -> Optional[ScheduledTask]:
  197. """Get a specific task description by id.
  198. Args:
  199. id: the id of the task to retrieve
  200. Returns:
  201. The task information or `None` if it doesn't exist or it has
  202. already been removed because it's too old.
  203. """
  204. return await self._store.get_scheduled_task(id)
  205. async def get_tasks(
  206. self,
  207. *,
  208. actions: Optional[List[str]] = None,
  209. resource_id: Optional[str] = None,
  210. statuses: Optional[List[TaskStatus]] = None,
  211. max_timestamp: Optional[int] = None,
  212. limit: Optional[int] = None,
  213. ) -> List[ScheduledTask]:
  214. """Get a list of tasks. Returns all the tasks if no args is provided.
  215. If an arg is `None` all tasks matching the other args will be selected.
  216. If an arg is an empty list, the corresponding value of the task needs
  217. to be `None` to be selected.
  218. Args:
  219. actions: Limit the returned tasks to those specific action names
  220. resource_id: Limit the returned tasks to the specific resource id, if specified
  221. statuses: Limit the returned tasks to the specific statuses
  222. max_timestamp: Limit the returned tasks to the ones that have
  223. a timestamp inferior to the specified one
  224. limit: Only return `limit` number of rows if set.
  225. Returns
  226. A list of `ScheduledTask`, ordered by increasing timestamps
  227. """
  228. return await self._store.get_scheduled_tasks(
  229. actions=actions,
  230. resource_id=resource_id,
  231. statuses=statuses,
  232. max_timestamp=max_timestamp,
  233. limit=limit,
  234. )
  235. async def delete_task(self, id: str) -> None:
  236. """Delete a task. Running tasks can't be deleted.
  237. Can only be called from the worker handling the task scheduling.
  238. Args:
  239. id: id of the task to delete
  240. """
  241. task = await self.get_task(id)
  242. if task is None:
  243. raise Exception(f"Task {id} does not exist")
  244. if task.status == TaskStatus.ACTIVE:
  245. raise Exception(f"Task {id} is currently ACTIVE and can't be deleted")
  246. await self._store.delete_scheduled_task(id)
  247. def launch_task_by_id(self, id: str) -> None:
  248. """Try launching the task with the given ID."""
  249. # Don't bother trying to launch new tasks if we're already at capacity.
  250. if len(self._running_tasks) >= TaskScheduler.MAX_CONCURRENT_RUNNING_TASKS:
  251. return
  252. run_as_background_process("launch_task_by_id", self._launch_task_by_id, id)
  253. async def _launch_task_by_id(self, id: str) -> None:
  254. """Helper async function for `launch_task_by_id`."""
  255. task = await self.get_task(id)
  256. if task:
  257. await self._launch_task(task)
  258. @wrap_as_background_process("launch_scheduled_tasks")
  259. async def _launch_scheduled_tasks(self) -> None:
  260. """Retrieve and launch scheduled tasks that should be running at that time."""
  261. # Don't bother trying to launch new tasks if we're already at capacity.
  262. if len(self._running_tasks) >= TaskScheduler.MAX_CONCURRENT_RUNNING_TASKS:
  263. return
  264. if self._launching_new_tasks:
  265. return
  266. self._launching_new_tasks = True
  267. try:
  268. for task in await self.get_tasks(
  269. statuses=[TaskStatus.ACTIVE], limit=self.MAX_CONCURRENT_RUNNING_TASKS
  270. ):
  271. await self._launch_task(task)
  272. for task in await self.get_tasks(
  273. statuses=[TaskStatus.SCHEDULED],
  274. max_timestamp=self._clock.time_msec(),
  275. limit=self.MAX_CONCURRENT_RUNNING_TASKS,
  276. ):
  277. await self._launch_task(task)
  278. finally:
  279. self._launching_new_tasks = False
  280. @wrap_as_background_process("clean_scheduled_tasks")
  281. async def _clean_scheduled_tasks(self) -> None:
  282. """Clean old complete or failed jobs to avoid clutter the DB."""
  283. now = self._clock.time_msec()
  284. for task in await self._store.get_scheduled_tasks(
  285. statuses=[TaskStatus.FAILED, TaskStatus.COMPLETE],
  286. max_timestamp=now - TaskScheduler.KEEP_TASKS_FOR_MS,
  287. ):
  288. # FAILED and COMPLETE tasks should never be running
  289. assert task.id not in self._running_tasks
  290. await self._store.delete_scheduled_task(task.id)
  291. async def _launch_task(self, task: ScheduledTask) -> None:
  292. """Launch a scheduled task now.
  293. Args:
  294. task: the task to launch
  295. """
  296. assert self._run_background_tasks
  297. if task.action not in self._actions:
  298. raise Exception(
  299. f"No function associated with action {task.action} of the scheduled task {task.id}"
  300. )
  301. function = self._actions[task.action]
  302. async def wrapper() -> None:
  303. with nested_logging_context(task.id):
  304. try:
  305. (status, result, error) = await function(task)
  306. except Exception:
  307. f = Failure()
  308. logger.error(
  309. f"scheduled task {task.id} failed",
  310. exc_info=(f.type, f.value, f.getTracebackObject()),
  311. )
  312. status = TaskStatus.FAILED
  313. result = None
  314. error = f.getErrorMessage()
  315. await self._store.update_scheduled_task(
  316. task.id,
  317. self._clock.time_msec(),
  318. status=status,
  319. result=result,
  320. error=error,
  321. )
  322. self._running_tasks.remove(task.id)
  323. # Try launch a new task since we've finished with this one.
  324. self._clock.call_later(0.1, self._launch_scheduled_tasks)
  325. if len(self._running_tasks) >= TaskScheduler.MAX_CONCURRENT_RUNNING_TASKS:
  326. return
  327. if (
  328. self._clock.time_msec()
  329. > task.timestamp + TaskScheduler.LAST_UPDATE_BEFORE_WARNING_MS
  330. ):
  331. logger.warn(
  332. f"Task {task.id} (action {task.action}) has seen no update for more than 24h and may be stuck"
  333. )
  334. if task.id in self._running_tasks:
  335. return
  336. self._running_tasks.add(task.id)
  337. await self.update_task(task.id, status=TaskStatus.ACTIVE)
  338. run_as_background_process(f"task-{task.action}", wrapper)