From a2d7195e0111ee6b2fedaabb0f02cfae648cd347 Mon Sep 17 00:00:00 2001 From: David Robertson Date: Wed, 22 Sep 2021 10:59:52 +0100 Subject: [PATCH] Track why we're evicting from caches (#10829) So we can see distinguish between "evicting because the cache is too big" and "evicting because the cache entries haven't been recently used". --- changelog.d/10829.misc | 1 + synapse/util/caches/__init__.py | 31 +++++++++++++++++++++------- synapse/util/caches/expiringcache.py | 10 ++++----- synapse/util/caches/lrucache.py | 4 ++-- 4 files changed, 31 insertions(+), 15 deletions(-) create mode 100644 changelog.d/10829.misc diff --git a/changelog.d/10829.misc b/changelog.d/10829.misc new file mode 100644 index 000000000..ac5fd6b04 --- /dev/null +++ b/changelog.d/10829.misc @@ -0,0 +1 @@ +Track cache eviction rates more finely in Prometheus' monitoring. \ No newline at end of file diff --git a/synapse/util/caches/__init__.py b/synapse/util/caches/__init__.py index cab1bf0c1..df4d61e4b 100644 --- a/synapse/util/caches/__init__.py +++ b/synapse/util/caches/__init__.py @@ -12,8 +12,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import collections import logging +import typing +from enum import Enum, auto from sys import intern from typing import Callable, Dict, Optional, Sized @@ -34,7 +36,7 @@ collectors_by_name: Dict[str, "CacheMetric"] = {} cache_size = Gauge("synapse_util_caches_cache:size", "", ["name"]) cache_hits = Gauge("synapse_util_caches_cache:hits", "", ["name"]) -cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name"]) +cache_evicted = Gauge("synapse_util_caches_cache:evicted_size", "", ["name", "reason"]) cache_total = Gauge("synapse_util_caches_cache:total", "", ["name"]) cache_max_size = Gauge("synapse_util_caches_cache_max_size", "", ["name"]) cache_memory_usage = Gauge( @@ -46,11 +48,16 @@ cache_memory_usage = Gauge( response_cache_size = Gauge("synapse_util_caches_response_cache:size", "", ["name"]) response_cache_hits = Gauge("synapse_util_caches_response_cache:hits", "", ["name"]) response_cache_evicted = Gauge( - "synapse_util_caches_response_cache:evicted_size", "", ["name"] + "synapse_util_caches_response_cache:evicted_size", "", ["name", "reason"] ) response_cache_total = Gauge("synapse_util_caches_response_cache:total", "", ["name"]) +class EvictionReason(Enum): + size = auto() + time = auto() + + @attr.s(slots=True) class CacheMetric: @@ -61,7 +68,9 @@ class CacheMetric: hits = attr.ib(default=0) misses = attr.ib(default=0) - evicted_size = attr.ib(default=0) + eviction_size_by_reason: typing.Counter[EvictionReason] = attr.ib( + factory=collections.Counter + ) memory_usage = attr.ib(default=None) def inc_hits(self) -> None: @@ -70,8 +79,8 @@ class CacheMetric: def inc_misses(self) -> None: self.misses += 1 - def inc_evictions(self, size: int = 1) -> None: - self.evicted_size += size + def inc_evictions(self, reason: EvictionReason, size: int = 1) -> None: + self.eviction_size_by_reason[reason] += size def inc_memory_usage(self, memory: int) -> None: if self.memory_usage is None: @@ -94,14 +103,20 @@ class CacheMetric: if self._cache_type == "response_cache": response_cache_size.labels(self._cache_name).set(len(self._cache)) response_cache_hits.labels(self._cache_name).set(self.hits) - response_cache_evicted.labels(self._cache_name).set(self.evicted_size) + for reason in EvictionReason: + response_cache_evicted.labels(self._cache_name, reason.name).set( + self.eviction_size_by_reason[reason] + ) response_cache_total.labels(self._cache_name).set( self.hits + self.misses ) else: cache_size.labels(self._cache_name).set(len(self._cache)) cache_hits.labels(self._cache_name).set(self.hits) - cache_evicted.labels(self._cache_name).set(self.evicted_size) + for reason in EvictionReason: + cache_evicted.labels(self._cache_name, reason.name).set( + self.eviction_size_by_reason[reason] + ) cache_total.labels(self._cache_name).set(self.hits + self.misses) if getattr(self._cache, "max_size", None): cache_max_size.labels(self._cache_name).set(self._cache.max_size) diff --git a/synapse/util/caches/expiringcache.py b/synapse/util/caches/expiringcache.py index bde16b857..c3f72aa06 100644 --- a/synapse/util/caches/expiringcache.py +++ b/synapse/util/caches/expiringcache.py @@ -22,7 +22,7 @@ from typing_extensions import Literal from synapse.config import cache as cache_config from synapse.metrics.background_process_metrics import run_as_background_process from synapse.util import Clock -from synapse.util.caches import register_cache +from synapse.util.caches import EvictionReason, register_cache logger = logging.getLogger(__name__) @@ -98,9 +98,9 @@ class ExpiringCache(Generic[KT, VT]): while self._max_size and len(self) > self._max_size: _key, value = self._cache.popitem(last=False) if self.iterable: - self.metrics.inc_evictions(len(value.value)) + self.metrics.inc_evictions(EvictionReason.size, len(value.value)) else: - self.metrics.inc_evictions() + self.metrics.inc_evictions(EvictionReason.size) def __getitem__(self, key: KT) -> VT: try: @@ -175,9 +175,9 @@ class ExpiringCache(Generic[KT, VT]): for k in keys_to_delete: value = self._cache.pop(k) if self.iterable: - self.metrics.inc_evictions(len(value.value)) + self.metrics.inc_evictions(EvictionReason.time, len(value.value)) else: - self.metrics.inc_evictions() + self.metrics.inc_evictions(EvictionReason.time) logger.debug( "[%s] _prune_cache before: %d, after len: %d", diff --git a/synapse/util/caches/lrucache.py b/synapse/util/caches/lrucache.py index 39dce9dd4..ea6e8dc8d 100644 --- a/synapse/util/caches/lrucache.py +++ b/synapse/util/caches/lrucache.py @@ -40,7 +40,7 @@ from twisted.internet.interfaces import IReactorTime from synapse.config import cache as cache_config from synapse.metrics.background_process_metrics import wrap_as_background_process from synapse.util import Clock, caches -from synapse.util.caches import CacheMetric, register_cache +from synapse.util.caches import CacheMetric, EvictionReason, register_cache from synapse.util.caches.treecache import TreeCache, iterate_tree_cache_entry from synapse.util.linked_list import ListNode @@ -403,7 +403,7 @@ class LruCache(Generic[KT, VT]): evicted_len = delete_node(node) cache.pop(node.key, None) if metrics: - metrics.inc_evictions(evicted_len) + metrics.inc_evictions(EvictionReason.size, evicted_len) def synchronized(f: FT) -> FT: @wraps(f)