Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.
 
 
 
 
 
 

258 řádky
8.2 KiB

  1. # Copyright 2014-2016 OpenMarket Ltd
  2. # Copyright 2018 New Vector Ltd
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. import logging
  16. import threading
  17. import traceback
  18. from typing import Dict, Mapping, Set, Tuple
  19. from prometheus_client.core import Counter, Histogram
  20. from synapse.logging.context import current_context
  21. from synapse.metrics import LaterGauge
  22. logger = logging.getLogger(__name__)
  23. # total number of responses served, split by method/servlet/tag
  24. response_count = Counter(
  25. "synapse_http_server_response_count", "", ["method", "servlet", "tag"]
  26. )
  27. requests_counter = Counter(
  28. "synapse_http_server_requests_received", "", ["method", "servlet"]
  29. )
  30. outgoing_responses_counter = Counter(
  31. "synapse_http_server_responses", "", ["method", "code"]
  32. )
  33. response_timer = Histogram(
  34. "synapse_http_server_response_time_seconds",
  35. "sec",
  36. ["method", "servlet", "tag", "code"],
  37. )
  38. response_ru_utime = Counter(
  39. "synapse_http_server_response_ru_utime_seconds", "sec", ["method", "servlet", "tag"]
  40. )
  41. response_ru_stime = Counter(
  42. "synapse_http_server_response_ru_stime_seconds", "sec", ["method", "servlet", "tag"]
  43. )
  44. response_db_txn_count = Counter(
  45. "synapse_http_server_response_db_txn_count", "", ["method", "servlet", "tag"]
  46. )
  47. # seconds spent waiting for db txns, excluding scheduling time, when processing
  48. # this request
  49. response_db_txn_duration = Counter(
  50. "synapse_http_server_response_db_txn_duration_seconds",
  51. "",
  52. ["method", "servlet", "tag"],
  53. )
  54. # seconds spent waiting for a db connection, when processing this request
  55. response_db_sched_duration = Counter(
  56. "synapse_http_server_response_db_sched_duration_seconds",
  57. "",
  58. ["method", "servlet", "tag"],
  59. )
  60. # size in bytes of the response written
  61. response_size = Counter(
  62. "synapse_http_server_response_size", "", ["method", "servlet", "tag"]
  63. )
  64. # In flight metrics are incremented while the requests are in flight, rather
  65. # than when the response was written.
  66. in_flight_requests_ru_utime = Counter(
  67. "synapse_http_server_in_flight_requests_ru_utime_seconds", "", ["method", "servlet"]
  68. )
  69. in_flight_requests_ru_stime = Counter(
  70. "synapse_http_server_in_flight_requests_ru_stime_seconds", "", ["method", "servlet"]
  71. )
  72. in_flight_requests_db_txn_count = Counter(
  73. "synapse_http_server_in_flight_requests_db_txn_count", "", ["method", "servlet"]
  74. )
  75. # seconds spent waiting for db txns, excluding scheduling time, when processing
  76. # this request
  77. in_flight_requests_db_txn_duration = Counter(
  78. "synapse_http_server_in_flight_requests_db_txn_duration_seconds",
  79. "",
  80. ["method", "servlet"],
  81. )
  82. # seconds spent waiting for a db connection, when processing this request
  83. in_flight_requests_db_sched_duration = Counter(
  84. "synapse_http_server_in_flight_requests_db_sched_duration_seconds",
  85. "",
  86. ["method", "servlet"],
  87. )
  88. _in_flight_requests: Set["RequestMetrics"] = set()
  89. # Protects the _in_flight_requests set from concurrent access
  90. _in_flight_requests_lock = threading.Lock()
  91. def _get_in_flight_counts() -> Mapping[Tuple[str, ...], int]:
  92. """Returns a count of all in flight requests by (method, server_name)"""
  93. # Cast to a list to prevent it changing while the Prometheus
  94. # thread is collecting metrics
  95. with _in_flight_requests_lock:
  96. reqs = list(_in_flight_requests)
  97. for rm in reqs:
  98. rm.update_metrics()
  99. # Map from (method, name) -> int, the number of in flight requests of that
  100. # type. The key type is Tuple[str, str], but we leave the length unspecified
  101. # for compatability with LaterGauge's annotations.
  102. counts: Dict[Tuple[str, ...], int] = {}
  103. for rm in reqs:
  104. key = (rm.method, rm.name)
  105. counts[key] = counts.get(key, 0) + 1
  106. return counts
  107. LaterGauge(
  108. "synapse_http_server_in_flight_requests_count",
  109. "",
  110. ["method", "servlet"],
  111. _get_in_flight_counts,
  112. )
  113. class RequestMetrics:
  114. def start(self, time_sec: float, name: str, method: str) -> None:
  115. self.start_ts = time_sec
  116. self.start_context = current_context()
  117. self.name = name
  118. self.method = method
  119. if self.start_context:
  120. # _request_stats records resource usage that we have already added
  121. # to the "in flight" metrics.
  122. self._request_stats = self.start_context.get_resource_usage()
  123. else:
  124. logger.error(
  125. "Tried to start a RequestMetric from the sentinel context.\n%s",
  126. "".join(traceback.format_stack()),
  127. )
  128. with _in_flight_requests_lock:
  129. _in_flight_requests.add(self)
  130. def stop(self, time_sec: float, response_code: int, sent_bytes: int) -> None:
  131. with _in_flight_requests_lock:
  132. _in_flight_requests.discard(self)
  133. context = current_context()
  134. tag = ""
  135. if context:
  136. tag = context.tag
  137. if context != self.start_context:
  138. logger.error(
  139. "Context have unexpectedly changed %r, %r",
  140. context,
  141. self.start_context,
  142. )
  143. return
  144. else:
  145. logger.error(
  146. "Trying to stop RequestMetrics in the sentinel context.\n%s",
  147. "".join(traceback.format_stack()),
  148. )
  149. return
  150. response_code_str = str(response_code)
  151. outgoing_responses_counter.labels(self.method, response_code_str).inc()
  152. response_count.labels(self.method, self.name, tag).inc()
  153. response_timer.labels(self.method, self.name, tag, response_code_str).observe(
  154. time_sec - self.start_ts
  155. )
  156. resource_usage = context.get_resource_usage()
  157. response_ru_utime.labels(self.method, self.name, tag).inc(
  158. resource_usage.ru_utime
  159. )
  160. response_ru_stime.labels(self.method, self.name, tag).inc(
  161. resource_usage.ru_stime
  162. )
  163. response_db_txn_count.labels(self.method, self.name, tag).inc(
  164. resource_usage.db_txn_count
  165. )
  166. response_db_txn_duration.labels(self.method, self.name, tag).inc(
  167. resource_usage.db_txn_duration_sec
  168. )
  169. response_db_sched_duration.labels(self.method, self.name, tag).inc(
  170. resource_usage.db_sched_duration_sec
  171. )
  172. response_size.labels(self.method, self.name, tag).inc(sent_bytes)
  173. # We always call this at the end to ensure that we update the metrics
  174. # regardless of whether a call to /metrics while the request was in
  175. # flight.
  176. self.update_metrics()
  177. def update_metrics(self) -> None:
  178. """Updates the in flight metrics with values from this request."""
  179. if not self.start_context:
  180. logger.error(
  181. "Tried to update a RequestMetric from the sentinel context.\n%s",
  182. "".join(traceback.format_stack()),
  183. )
  184. return
  185. new_stats = self.start_context.get_resource_usage()
  186. diff = new_stats - self._request_stats
  187. self._request_stats = new_stats
  188. # max() is used since rapid use of ru_stime/ru_utime can end up with the
  189. # count going backwards due to NTP, time smearing, fine-grained
  190. # correction, or floating points. Who knows, really?
  191. in_flight_requests_ru_utime.labels(self.method, self.name).inc(
  192. max(diff.ru_utime, 0)
  193. )
  194. in_flight_requests_ru_stime.labels(self.method, self.name).inc(
  195. max(diff.ru_stime, 0)
  196. )
  197. in_flight_requests_db_txn_count.labels(self.method, self.name).inc(
  198. diff.db_txn_count
  199. )
  200. in_flight_requests_db_txn_duration.labels(self.method, self.name).inc(
  201. diff.db_txn_duration_sec
  202. )
  203. in_flight_requests_db_sched_duration.labels(self.method, self.name).inc(
  204. diff.db_sched_duration_sec
  205. )