forked from MirrorHub/synapse
Improve reactor_tick_time
metric (#11724)
The existing implementation of the `python_twisted_reactor_tick_time` metric is pretty useless, because it *only* measures the time taken to execute timed calls and callbacks from threads. That neglects everything that happens off the back of I/O, which is obviously quite a lot for us. To improve this, I've hooked into a different place in the reactor - in particular, where it calls `epoll`. That call is the only place it should wait for something to happen - the rest of the loop *should* be quick. I've also removed `python_twisted_reactor_pending_calls`, because I don't believe anyone ever looks at it, and it's a nuisance to populate.
This commit is contained in:
parent
6b241f5286
commit
6a78ede569
5 changed files with 86 additions and 109 deletions
1
changelog.d/11724.misc
Normal file
1
changelog.d/11724.misc
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Improve accuracy of `python_twisted_reactor_tick_time` prometheus metric.
|
1
changelog.d/11724.removal
Normal file
1
changelog.d/11724.removal
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Remove `python_twisted_reactor_pending_calls` prometheus metric.
|
|
@ -92,22 +92,6 @@ new PromConsole.Graph({
|
||||||
})
|
})
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<h3>Pending calls per tick</h3>
|
|
||||||
<div id="reactor_pending_calls"></div>
|
|
||||||
<script>
|
|
||||||
new PromConsole.Graph({
|
|
||||||
node: document.querySelector("#reactor_pending_calls"),
|
|
||||||
expr: "rate(python_twisted_reactor_pending_calls_sum[30s]) / rate(python_twisted_reactor_pending_calls_count[30s])",
|
|
||||||
name: "[[job]]-[[index]]",
|
|
||||||
min: 0,
|
|
||||||
renderer: "line",
|
|
||||||
height: 150,
|
|
||||||
yAxisFormatter: PromConsole.NumberFormatter.humanize,
|
|
||||||
yHoverFormatter: PromConsole.NumberFormatter.humanize,
|
|
||||||
yTitle: "Pending Calls"
|
|
||||||
})
|
|
||||||
</script>
|
|
||||||
|
|
||||||
<h1>Storage</h1>
|
<h1>Storage</h1>
|
||||||
|
|
||||||
<h3>Queries</h3>
|
<h3>Queries</h3>
|
||||||
|
|
|
@ -12,15 +12,12 @@
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
import functools
|
|
||||||
import itertools
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
import threading
|
import threading
|
||||||
import time
|
|
||||||
from typing import (
|
from typing import (
|
||||||
Any,
|
|
||||||
Callable,
|
Callable,
|
||||||
Dict,
|
Dict,
|
||||||
Generic,
|
Generic,
|
||||||
|
@ -33,7 +30,6 @@ from typing import (
|
||||||
Type,
|
Type,
|
||||||
TypeVar,
|
TypeVar,
|
||||||
Union,
|
Union,
|
||||||
cast,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
import attr
|
import attr
|
||||||
|
@ -44,11 +40,9 @@ from prometheus_client.core import (
|
||||||
GaugeMetricFamily,
|
GaugeMetricFamily,
|
||||||
)
|
)
|
||||||
|
|
||||||
from twisted.internet import reactor
|
|
||||||
from twisted.internet.base import ReactorBase
|
|
||||||
from twisted.python.threadpool import ThreadPool
|
from twisted.python.threadpool import ThreadPool
|
||||||
|
|
||||||
import synapse
|
import synapse.metrics._reactor_metrics
|
||||||
from synapse.metrics._exposition import (
|
from synapse.metrics._exposition import (
|
||||||
MetricsResource,
|
MetricsResource,
|
||||||
generate_latest,
|
generate_latest,
|
||||||
|
@ -368,21 +362,6 @@ class CPUMetrics:
|
||||||
REGISTRY.register(CPUMetrics())
|
REGISTRY.register(CPUMetrics())
|
||||||
|
|
||||||
|
|
||||||
#
|
|
||||||
# Twisted reactor metrics
|
|
||||||
#
|
|
||||||
|
|
||||||
tick_time = Histogram(
|
|
||||||
"python_twisted_reactor_tick_time",
|
|
||||||
"Tick time of the Twisted reactor (sec)",
|
|
||||||
buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5],
|
|
||||||
)
|
|
||||||
pending_calls_metric = Histogram(
|
|
||||||
"python_twisted_reactor_pending_calls",
|
|
||||||
"Pending calls",
|
|
||||||
buckets=[1, 2, 5, 10, 25, 50, 100, 250, 500, 1000],
|
|
||||||
)
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Federation Metrics
|
# Federation Metrics
|
||||||
#
|
#
|
||||||
|
@ -434,8 +413,6 @@ build_info.labels(
|
||||||
" ".join([platform.system(), platform.release()]),
|
" ".join([platform.system(), platform.release()]),
|
||||||
).set(1)
|
).set(1)
|
||||||
|
|
||||||
last_ticked = time.time()
|
|
||||||
|
|
||||||
# 3PID send info
|
# 3PID send info
|
||||||
threepid_send_requests = Histogram(
|
threepid_send_requests = Histogram(
|
||||||
"synapse_threepid_send_requests_with_tries",
|
"synapse_threepid_send_requests_with_tries",
|
||||||
|
@ -483,75 +460,6 @@ def register_threadpool(name: str, threadpool: ThreadPool) -> None:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class ReactorLastSeenMetric:
|
|
||||||
def collect(self) -> Iterable[Metric]:
|
|
||||||
cm = GaugeMetricFamily(
|
|
||||||
"python_twisted_reactor_last_seen",
|
|
||||||
"Seconds since the Twisted reactor was last seen",
|
|
||||||
)
|
|
||||||
cm.add_metric([], time.time() - last_ticked)
|
|
||||||
yield cm
|
|
||||||
|
|
||||||
|
|
||||||
REGISTRY.register(ReactorLastSeenMetric())
|
|
||||||
|
|
||||||
F = TypeVar("F", bound=Callable[..., Any])
|
|
||||||
|
|
||||||
|
|
||||||
def runUntilCurrentTimer(reactor: ReactorBase, func: F) -> F:
|
|
||||||
@functools.wraps(func)
|
|
||||||
def f(*args: Any, **kwargs: Any) -> Any:
|
|
||||||
now = reactor.seconds()
|
|
||||||
num_pending = 0
|
|
||||||
|
|
||||||
# _newTimedCalls is one long list of *all* pending calls. Below loop
|
|
||||||
# is based off of impl of reactor.runUntilCurrent
|
|
||||||
for delayed_call in reactor._newTimedCalls:
|
|
||||||
if delayed_call.time > now:
|
|
||||||
break
|
|
||||||
|
|
||||||
if delayed_call.delayed_time > 0:
|
|
||||||
continue
|
|
||||||
|
|
||||||
num_pending += 1
|
|
||||||
|
|
||||||
num_pending += len(reactor.threadCallQueue)
|
|
||||||
start = time.time()
|
|
||||||
ret = func(*args, **kwargs)
|
|
||||||
end = time.time()
|
|
||||||
|
|
||||||
# record the amount of wallclock time spent running pending calls.
|
|
||||||
# This is a proxy for the actual amount of time between reactor polls,
|
|
||||||
# since about 25% of time is actually spent running things triggered by
|
|
||||||
# I/O events, but that is harder to capture without rewriting half the
|
|
||||||
# reactor.
|
|
||||||
tick_time.observe(end - start)
|
|
||||||
pending_calls_metric.observe(num_pending)
|
|
||||||
|
|
||||||
# Update the time we last ticked, for the metric to test whether
|
|
||||||
# Synapse's reactor has frozen
|
|
||||||
global last_ticked
|
|
||||||
last_ticked = end
|
|
||||||
|
|
||||||
return ret
|
|
||||||
|
|
||||||
return cast(F, f)
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Ensure the reactor has all the attributes we expect
|
|
||||||
reactor.seconds # type: ignore
|
|
||||||
reactor.runUntilCurrent # type: ignore
|
|
||||||
reactor._newTimedCalls # type: ignore
|
|
||||||
reactor.threadCallQueue # type: ignore
|
|
||||||
|
|
||||||
# runUntilCurrent is called when we have pending calls. It is called once
|
|
||||||
# per iteratation after fd polling.
|
|
||||||
reactor.runUntilCurrent = runUntilCurrentTimer(reactor, reactor.runUntilCurrent) # type: ignore
|
|
||||||
except AttributeError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"MetricsResource",
|
"MetricsResource",
|
||||||
"generate_latest",
|
"generate_latest",
|
||||||
|
|
83
synapse/metrics/_reactor_metrics.py
Normal file
83
synapse/metrics/_reactor_metrics.py
Normal file
|
@ -0,0 +1,83 @@
|
||||||
|
# Copyright 2022 The Matrix.org Foundation C.I.C.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import select
|
||||||
|
import time
|
||||||
|
from typing import Any, Iterable, List, Tuple
|
||||||
|
|
||||||
|
from prometheus_client import Histogram, Metric
|
||||||
|
from prometheus_client.core import REGISTRY, GaugeMetricFamily
|
||||||
|
|
||||||
|
from twisted.internet import reactor
|
||||||
|
|
||||||
|
#
|
||||||
|
# Twisted reactor metrics
|
||||||
|
#
|
||||||
|
|
||||||
|
tick_time = Histogram(
|
||||||
|
"python_twisted_reactor_tick_time",
|
||||||
|
"Tick time of the Twisted reactor (sec)",
|
||||||
|
buckets=[0.001, 0.002, 0.005, 0.01, 0.025, 0.05, 0.1, 0.2, 0.5, 1, 2, 5],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class EpollWrapper:
|
||||||
|
"""a wrapper for an epoll object which records the time between polls"""
|
||||||
|
|
||||||
|
def __init__(self, poller: "select.epoll"):
|
||||||
|
self.last_polled = time.time()
|
||||||
|
self._poller = poller
|
||||||
|
|
||||||
|
def poll(self, *args, **kwargs) -> List[Tuple[int, int]]: # type: ignore[no-untyped-def]
|
||||||
|
# record the time since poll() was last called. This gives a good proxy for
|
||||||
|
# how long it takes to run everything in the reactor - ie, how long anything
|
||||||
|
# waiting for the next tick will have to wait.
|
||||||
|
tick_time.observe(time.time() - self.last_polled)
|
||||||
|
|
||||||
|
ret = self._poller.poll(*args, **kwargs)
|
||||||
|
|
||||||
|
self.last_polled = time.time()
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def __getattr__(self, item: str) -> Any:
|
||||||
|
return getattr(self._poller, item)
|
||||||
|
|
||||||
|
|
||||||
|
class ReactorLastSeenMetric:
|
||||||
|
def __init__(self, epoll_wrapper: EpollWrapper):
|
||||||
|
self._epoll_wrapper = epoll_wrapper
|
||||||
|
|
||||||
|
def collect(self) -> Iterable[Metric]:
|
||||||
|
cm = GaugeMetricFamily(
|
||||||
|
"python_twisted_reactor_last_seen",
|
||||||
|
"Seconds since the Twisted reactor was last seen",
|
||||||
|
)
|
||||||
|
cm.add_metric([], time.time() - self._epoll_wrapper.last_polled)
|
||||||
|
yield cm
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
# if the reactor has a `_poller` attribute, which is an `epoll` object
|
||||||
|
# (ie, it's an EPollReactor), we wrap the `epoll` with a thing that will
|
||||||
|
# measure the time between ticks
|
||||||
|
from select import epoll
|
||||||
|
|
||||||
|
poller = reactor._poller # type: ignore[attr-defined]
|
||||||
|
except (AttributeError, ImportError):
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
if isinstance(poller, epoll):
|
||||||
|
poller = EpollWrapper(poller)
|
||||||
|
reactor._poller = poller # type: ignore[attr-defined]
|
||||||
|
REGISTRY.register(ReactorLastSeenMetric(poller))
|
Loading…
Reference in a new issue