diff --git a/CHANGES.md b/CHANGES.md index 14fafc260..a39fe661c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,12 @@ +Synapse 1.66.0rc2 (2022-08-30) +============================== + +Bugfixes +-------- + +- Fix a bug introduced in Synapse 1.66.0rc1 where the new rate limit metrics were misreported (`synapse_rate_limit_sleep_affected_hosts`, `synapse_rate_limit_reject_affected_hosts`). ([\#13649](https://github.com/matrix-org/synapse/issues/13649)) + + Synapse 1.66.0rc1 (2022-08-23) ============================== diff --git a/debian/changelog b/debian/changelog index 3257adef7..0709ae24e 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -matrix-synapse-py3 (1.66.0~rc1+nmu1) UNRELEASED; urgency=medium +matrix-synapse-py3 (1.66.0~rc2+nmu1) UNRELEASED; urgency=medium * Update debhelper to compatibility level 12. * Drop the preinst script stopping synapse. @@ -6,6 +6,12 @@ matrix-synapse-py3 (1.66.0~rc1+nmu1) UNRELEASED; urgency=medium -- Jörg Behrmann Tue, 23 Aug 2022 17:17:00 +0100 +matrix-synapse-py3 (1.66.0~rc2) stable; urgency=medium + + * New Synapse release 1.66.0rc2. + + -- Synapse Packaging team Tue, 30 Aug 2022 12:25:19 +0100 + matrix-synapse-py3 (1.66.0~rc1) stable; urgency=medium * New Synapse release 1.66.0rc1. diff --git a/pyproject.toml b/pyproject.toml index 745b6067a..714689783 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,7 @@ skip_gitignore = true [tool.poetry] name = "matrix-synapse" -version = "1.66.0rc1" +version = "1.66.0rc2" description = "Homeserver for the Matrix decentralised comms protocol" authors = ["Matrix.org Team and Contributors "] license = "Apache-2.0" diff --git a/synapse/server.py b/synapse/server.py index 181984a1a..c2e55bf0b 100644 --- a/synapse/server.py +++ b/synapse/server.py @@ -756,7 +756,9 @@ class HomeServer(metaclass=abc.ABCMeta): @cache_in_self def get_federation_ratelimiter(self) -> FederationRateLimiter: return FederationRateLimiter( - self.get_clock(), config=self.config.ratelimiting.rc_federation + self.get_clock(), + config=self.config.ratelimiting.rc_federation, + metrics_name="federation_servlets", ) @cache_in_self diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index f678b52cb..9f64fed0d 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -15,10 +15,23 @@ import collections import contextlib import logging +import threading import typing -from typing import Any, DefaultDict, Iterator, List, Set +from typing import ( + Any, + Callable, + DefaultDict, + Dict, + Iterator, + List, + Mapping, + Optional, + Set, + Tuple, +) from prometheus_client.core import Counter +from typing_extensions import ContextManager from twisted.internet import defer @@ -40,12 +53,20 @@ logger = logging.getLogger(__name__) # Track how much the ratelimiter is affecting requests -rate_limit_sleep_counter = Counter("synapse_rate_limit_sleep", "") -rate_limit_reject_counter = Counter("synapse_rate_limit_reject", "") +rate_limit_sleep_counter = Counter( + "synapse_rate_limit_sleep", + "Number of requests slept by the rate limiter", + ["rate_limiter_name"], +) +rate_limit_reject_counter = Counter( + "synapse_rate_limit_reject", + "Number of requests rejected by the rate limiter", + ["rate_limiter_name"], +) queue_wait_timer = Histogram( "synapse_rate_limit_queue_wait_time_seconds", - "sec", - [], + "Amount of time spent waiting for the rate limiter to let our request through.", + ["rate_limiter_name"], buckets=( 0.005, 0.01, @@ -65,35 +86,92 @@ queue_wait_timer = Histogram( ) +_rate_limiter_instances: Set["FederationRateLimiter"] = set() +# Protects the _rate_limiter_instances set from concurrent access +_rate_limiter_instances_lock = threading.Lock() + + +def _get_counts_from_rate_limiter_instance( + count_func: Callable[["FederationRateLimiter"], int] +) -> Mapping[Tuple[str, ...], int]: + """Returns a count of something (slept/rejected hosts) by (metrics_name)""" + # Cast to a list to prevent it changing while the Prometheus + # thread is collecting metrics + with _rate_limiter_instances_lock: + rate_limiter_instances = list(_rate_limiter_instances) + + # Map from (metrics_name,) -> int, the number of something like slept hosts + # or rejected hosts. The key type is Tuple[str], but we leave the length + # unspecified for compatability with LaterGauge's annotations. + counts: Dict[Tuple[str, ...], int] = {} + for rate_limiter_instance in rate_limiter_instances: + # Only track metrics if they provided a `metrics_name` to + # differentiate this instance of the rate limiter. + if rate_limiter_instance.metrics_name: + key = (rate_limiter_instance.metrics_name,) + counts[key] = count_func(rate_limiter_instance) + + return counts + + +# We track the number of affected hosts per time-period so we can +# differentiate one really noisy homeserver from a general +# ratelimit tuning problem across the federation. +LaterGauge( + "synapse_rate_limit_sleep_affected_hosts", + "Number of hosts that had requests put to sleep", + ["rate_limiter_name"], + lambda: _get_counts_from_rate_limiter_instance( + lambda rate_limiter_instance: sum( + ratelimiter.should_sleep() + for ratelimiter in rate_limiter_instance.ratelimiters.values() + ) + ), +) +LaterGauge( + "synapse_rate_limit_reject_affected_hosts", + "Number of hosts that had requests rejected", + ["rate_limiter_name"], + lambda: _get_counts_from_rate_limiter_instance( + lambda rate_limiter_instance: sum( + ratelimiter.should_reject() + for ratelimiter in rate_limiter_instance.ratelimiters.values() + ) + ), +) + + class FederationRateLimiter: - def __init__(self, clock: Clock, config: FederationRatelimitSettings): + """Used to rate limit request per-host.""" + + def __init__( + self, + clock: Clock, + config: FederationRatelimitSettings, + metrics_name: Optional[str] = None, + ): + """ + Args: + clock + config + metrics_name: The name of the rate limiter so we can differentiate it + from the rest in the metrics. If `None`, we don't track metrics + for this rate limiter. + + """ + self.metrics_name = metrics_name + def new_limiter() -> "_PerHostRatelimiter": - return _PerHostRatelimiter(clock=clock, config=config) + return _PerHostRatelimiter( + clock=clock, config=config, metrics_name=metrics_name + ) self.ratelimiters: DefaultDict[ str, "_PerHostRatelimiter" ] = collections.defaultdict(new_limiter) - # We track the number of affected hosts per time-period so we can - # differentiate one really noisy homeserver from a general - # ratelimit tuning problem across the federation. - LaterGauge( - "synapse_rate_limit_sleep_affected_hosts", - "Number of hosts that had requests put to sleep", - [], - lambda: sum( - ratelimiter.should_sleep() for ratelimiter in self.ratelimiters.values() - ), - ) - LaterGauge( - "synapse_rate_limit_reject_affected_hosts", - "Number of hosts that had requests rejected", - [], - lambda: sum( - ratelimiter.should_reject() - for ratelimiter in self.ratelimiters.values() - ), - ) + with _rate_limiter_instances_lock: + _rate_limiter_instances.add(self) def ratelimit(self, host: str) -> "_GeneratorContextManager[defer.Deferred[None]]": """Used to ratelimit an incoming request from a given host @@ -114,13 +192,23 @@ class FederationRateLimiter: class _PerHostRatelimiter: - def __init__(self, clock: Clock, config: FederationRatelimitSettings): + def __init__( + self, + clock: Clock, + config: FederationRatelimitSettings, + metrics_name: Optional[str] = None, + ): """ Args: clock config + metrics_name: The name of the rate limiter so we can differentiate it + from the rest in the metrics. If `None`, we don't track metrics + for this rate limiter. + from the rest in the metrics """ self.clock = clock + self.metrics_name = metrics_name self.window_size = config.window_size self.sleep_limit = config.sleep_limit @@ -178,7 +266,10 @@ class _PerHostRatelimiter: return len(self.request_times) > self.sleep_limit async def _on_enter_with_tracing(self, request_id: object) -> None: - with start_active_span("ratelimit wait"), queue_wait_timer.time(): + maybe_metrics_cm: ContextManager = contextlib.nullcontext() + if self.metrics_name: + maybe_metrics_cm = queue_wait_timer.labels(self.metrics_name).time() + with start_active_span("ratelimit wait"), maybe_metrics_cm: await self._on_enter(request_id) def _on_enter(self, request_id: object) -> "defer.Deferred[None]": @@ -193,7 +284,8 @@ class _PerHostRatelimiter: # sleeping or in the ready queue). if self.should_reject(): logger.debug("Ratelimiter(%s): rejecting request", self.host) - rate_limit_reject_counter.inc() + if self.metrics_name: + rate_limit_reject_counter.labels(self.metrics_name).inc() raise LimitExceededError( retry_after_ms=int(self.window_size / self.sleep_limit) ) @@ -228,7 +320,8 @@ class _PerHostRatelimiter: id(request_id), self.sleep_sec, ) - rate_limit_sleep_counter.inc() + if self.metrics_name: + rate_limit_sleep_counter.labels(self.metrics_name).inc() ret_defer = run_in_background(self.clock.sleep, self.sleep_sec) self.sleeping_requests.add(request_id)