mirror of
https://mau.dev/maunium/synapse.git
synced 2024-11-15 22:42:23 +01:00
Refactor logcontext resource usage tracking (#3501)
Factor the resource usage tracking out into a separate object, which can be passed around and copied independently of the logcontext itself.
This commit is contained in:
parent
16b10666e7
commit
55370331da
5 changed files with 146 additions and 130 deletions
0
changelog.d/3501.misc
Normal file
0
changelog.d/3501.misc
Normal file
|
@ -149,7 +149,9 @@ class RequestMetrics(object):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.method = method
|
self.method = method
|
||||||
|
|
||||||
self._request_stats = _RequestStats.from_context(self.start_context)
|
# _request_stats records resource usage that we have already added
|
||||||
|
# to the "in flight" metrics.
|
||||||
|
self._request_stats = self.start_context.get_resource_usage()
|
||||||
|
|
||||||
_in_flight_requests.add(self)
|
_in_flight_requests.add(self)
|
||||||
|
|
||||||
|
@ -177,18 +179,22 @@ class RequestMetrics(object):
|
||||||
time_sec - self.start
|
time_sec - self.start
|
||||||
)
|
)
|
||||||
|
|
||||||
ru_utime, ru_stime = context.get_resource_usage()
|
resource_usage = context.get_resource_usage()
|
||||||
|
|
||||||
response_ru_utime.labels(request.method, self.name, tag).inc(ru_utime)
|
response_ru_utime.labels(request.method, self.name, tag).inc(
|
||||||
response_ru_stime.labels(request.method, self.name, tag).inc(ru_stime)
|
resource_usage.ru_utime,
|
||||||
|
)
|
||||||
|
response_ru_stime.labels(request.method, self.name, tag).inc(
|
||||||
|
resource_usage.ru_stime,
|
||||||
|
)
|
||||||
response_db_txn_count.labels(request.method, self.name, tag).inc(
|
response_db_txn_count.labels(request.method, self.name, tag).inc(
|
||||||
context.db_txn_count
|
resource_usage.db_txn_count
|
||||||
)
|
)
|
||||||
response_db_txn_duration.labels(request.method, self.name, tag).inc(
|
response_db_txn_duration.labels(request.method, self.name, tag).inc(
|
||||||
context.db_txn_duration_sec
|
resource_usage.db_txn_duration_sec
|
||||||
)
|
)
|
||||||
response_db_sched_duration.labels(request.method, self.name, tag).inc(
|
response_db_sched_duration.labels(request.method, self.name, tag).inc(
|
||||||
context.db_sched_duration_sec
|
resource_usage.db_sched_duration_sec
|
||||||
)
|
)
|
||||||
|
|
||||||
response_size.labels(request.method, self.name, tag).inc(request.sentLength)
|
response_size.labels(request.method, self.name, tag).inc(request.sentLength)
|
||||||
|
@ -201,7 +207,10 @@ class RequestMetrics(object):
|
||||||
def update_metrics(self):
|
def update_metrics(self):
|
||||||
"""Updates the in flight metrics with values from this request.
|
"""Updates the in flight metrics with values from this request.
|
||||||
"""
|
"""
|
||||||
diff = self._request_stats.update(self.start_context)
|
new_stats = self.start_context.get_resource_usage()
|
||||||
|
|
||||||
|
diff = new_stats - self._request_stats
|
||||||
|
self._request_stats = new_stats
|
||||||
|
|
||||||
in_flight_requests_ru_utime.labels(self.method, self.name).inc(diff.ru_utime)
|
in_flight_requests_ru_utime.labels(self.method, self.name).inc(diff.ru_utime)
|
||||||
in_flight_requests_ru_stime.labels(self.method, self.name).inc(diff.ru_stime)
|
in_flight_requests_ru_stime.labels(self.method, self.name).inc(diff.ru_stime)
|
||||||
|
@ -217,61 +226,3 @@ class RequestMetrics(object):
|
||||||
in_flight_requests_db_sched_duration.labels(self.method, self.name).inc(
|
in_flight_requests_db_sched_duration.labels(self.method, self.name).inc(
|
||||||
diff.db_sched_duration_sec
|
diff.db_sched_duration_sec
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class _RequestStats(object):
|
|
||||||
"""Keeps tracks of various metrics for an in flight request.
|
|
||||||
"""
|
|
||||||
|
|
||||||
__slots__ = [
|
|
||||||
"ru_utime",
|
|
||||||
"ru_stime",
|
|
||||||
"db_txn_count",
|
|
||||||
"db_txn_duration_sec",
|
|
||||||
"db_sched_duration_sec",
|
|
||||||
]
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, ru_utime, ru_stime, db_txn_count, db_txn_duration_sec, db_sched_duration_sec
|
|
||||||
):
|
|
||||||
self.ru_utime = ru_utime
|
|
||||||
self.ru_stime = ru_stime
|
|
||||||
self.db_txn_count = db_txn_count
|
|
||||||
self.db_txn_duration_sec = db_txn_duration_sec
|
|
||||||
self.db_sched_duration_sec = db_sched_duration_sec
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def from_context(context):
|
|
||||||
ru_utime, ru_stime = context.get_resource_usage()
|
|
||||||
|
|
||||||
return _RequestStats(
|
|
||||||
ru_utime, ru_stime,
|
|
||||||
context.db_txn_count,
|
|
||||||
context.db_txn_duration_sec,
|
|
||||||
context.db_sched_duration_sec,
|
|
||||||
)
|
|
||||||
|
|
||||||
def update(self, context):
|
|
||||||
"""Updates the current values and returns the difference between the
|
|
||||||
old and new values.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
_RequestStats: The difference between the old and new values
|
|
||||||
"""
|
|
||||||
new = _RequestStats.from_context(context)
|
|
||||||
|
|
||||||
diff = _RequestStats(
|
|
||||||
new.ru_utime - self.ru_utime,
|
|
||||||
new.ru_stime - self.ru_stime,
|
|
||||||
new.db_txn_count - self.db_txn_count,
|
|
||||||
new.db_txn_duration_sec - self.db_txn_duration_sec,
|
|
||||||
new.db_sched_duration_sec - self.db_sched_duration_sec,
|
|
||||||
)
|
|
||||||
|
|
||||||
self.ru_utime = new.ru_utime
|
|
||||||
self.ru_stime = new.ru_stime
|
|
||||||
self.db_txn_count = new.db_txn_count
|
|
||||||
self.db_txn_duration_sec = new.db_txn_duration_sec
|
|
||||||
self.db_sched_duration_sec = new.db_sched_duration_sec
|
|
||||||
|
|
||||||
return diff
|
|
||||||
|
|
|
@ -20,7 +20,7 @@ from twisted.web.server import Request, Site
|
||||||
|
|
||||||
from synapse.http import redact_uri
|
from synapse.http import redact_uri
|
||||||
from synapse.http.request_metrics import RequestMetrics
|
from synapse.http.request_metrics import RequestMetrics
|
||||||
from synapse.util.logcontext import LoggingContext
|
from synapse.util.logcontext import LoggingContext, ContextResourceUsage
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
@ -95,15 +95,9 @@ class SynapseRequest(Request):
|
||||||
def _finished_processing(self):
|
def _finished_processing(self):
|
||||||
try:
|
try:
|
||||||
context = LoggingContext.current_context()
|
context = LoggingContext.current_context()
|
||||||
ru_utime, ru_stime = context.get_resource_usage()
|
usage = context.get_resource_usage()
|
||||||
db_txn_count = context.db_txn_count
|
|
||||||
db_txn_duration_sec = context.db_txn_duration_sec
|
|
||||||
db_sched_duration_sec = context.db_sched_duration_sec
|
|
||||||
evt_db_fetch_count = context.evt_db_fetch_count
|
|
||||||
except Exception:
|
except Exception:
|
||||||
ru_utime, ru_stime = (0, 0)
|
usage = ContextResourceUsage()
|
||||||
db_txn_count, db_txn_duration_sec = (0, 0)
|
|
||||||
evt_db_fetch_count = 0
|
|
||||||
|
|
||||||
end_time = time.time()
|
end_time = time.time()
|
||||||
|
|
||||||
|
@ -130,18 +124,18 @@ class SynapseRequest(Request):
|
||||||
self.site.site_tag,
|
self.site.site_tag,
|
||||||
authenticated_entity,
|
authenticated_entity,
|
||||||
end_time - self.start_time,
|
end_time - self.start_time,
|
||||||
ru_utime,
|
usage.ru_utime,
|
||||||
ru_stime,
|
usage.ru_stime,
|
||||||
db_sched_duration_sec,
|
usage.db_sched_duration_sec,
|
||||||
db_txn_duration_sec,
|
usage.db_txn_duration_sec,
|
||||||
int(db_txn_count),
|
int(usage.db_txn_count),
|
||||||
self.sentLength,
|
self.sentLength,
|
||||||
self.code,
|
self.code,
|
||||||
self.method,
|
self.method,
|
||||||
self.get_redacted_uri(),
|
self.get_redacted_uri(),
|
||||||
self.clientproto,
|
self.clientproto,
|
||||||
user_agent,
|
user_agent,
|
||||||
evt_db_fetch_count,
|
usage.evt_db_fetch_count,
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
@ -49,6 +49,90 @@ except Exception:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class ContextResourceUsage(object):
|
||||||
|
"""Object for tracking the resources used by a log context
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
ru_utime (float): user CPU time (in seconds)
|
||||||
|
ru_stime (float): system CPU time (in seconds)
|
||||||
|
db_txn_count (int): number of database transactions done
|
||||||
|
db_sched_duration_sec (float): amount of time spent waiting for a
|
||||||
|
database connection
|
||||||
|
db_txn_duration_sec (float): amount of time spent doing database
|
||||||
|
transactions (excluding scheduling time)
|
||||||
|
evt_db_fetch_count (int): number of events requested from the database
|
||||||
|
"""
|
||||||
|
|
||||||
|
__slots__ = [
|
||||||
|
"ru_stime", "ru_utime",
|
||||||
|
"db_txn_count", "db_txn_duration_sec", "db_sched_duration_sec",
|
||||||
|
"evt_db_fetch_count",
|
||||||
|
]
|
||||||
|
|
||||||
|
def __init__(self, copy_from=None):
|
||||||
|
"""Create a new ContextResourceUsage
|
||||||
|
|
||||||
|
Args:
|
||||||
|
copy_from (ContextResourceUsage|None): if not None, an object to
|
||||||
|
copy stats from
|
||||||
|
"""
|
||||||
|
if copy_from is None:
|
||||||
|
self.reset()
|
||||||
|
else:
|
||||||
|
self.ru_utime = copy_from.ru_utime
|
||||||
|
self.ru_stime = copy_from.ru_stime
|
||||||
|
self.db_txn_count = copy_from.db_txn_count
|
||||||
|
|
||||||
|
self.db_txn_duration_sec = copy_from.db_txn_duration_sec
|
||||||
|
self.db_sched_duration_sec = copy_from.db_sched_duration_sec
|
||||||
|
self.evt_db_fetch_count = copy_from.evt_db_fetch_count
|
||||||
|
|
||||||
|
def copy(self):
|
||||||
|
return ContextResourceUsage(copy_from=self)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.ru_stime = 0.
|
||||||
|
self.ru_utime = 0.
|
||||||
|
self.db_txn_count = 0
|
||||||
|
|
||||||
|
self.db_txn_duration_sec = 0
|
||||||
|
self.db_sched_duration_sec = 0
|
||||||
|
self.evt_db_fetch_count = 0
|
||||||
|
|
||||||
|
def __iadd__(self, other):
|
||||||
|
"""Add another ContextResourceUsage's stats to this one's.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
other (ContextResourceUsage): the other resource usage object
|
||||||
|
"""
|
||||||
|
self.ru_utime += other.ru_utime
|
||||||
|
self.ru_stime += other.ru_stime
|
||||||
|
self.db_txn_count += other.db_txn_count
|
||||||
|
self.db_txn_duration_sec += other.db_txn_duration_sec
|
||||||
|
self.db_sched_duration_sec += other.db_sched_duration_sec
|
||||||
|
self.evt_db_fetch_count += other.evt_db_fetch_count
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __isub__(self, other):
|
||||||
|
self.ru_utime -= other.ru_utime
|
||||||
|
self.ru_stime -= other.ru_stime
|
||||||
|
self.db_txn_count -= other.db_txn_count
|
||||||
|
self.db_txn_duration_sec -= other.db_txn_duration_sec
|
||||||
|
self.db_sched_duration_sec -= other.db_sched_duration_sec
|
||||||
|
self.evt_db_fetch_count -= other.evt_db_fetch_count
|
||||||
|
return self
|
||||||
|
|
||||||
|
def __add__(self, other):
|
||||||
|
res = ContextResourceUsage(copy_from=self)
|
||||||
|
res += other
|
||||||
|
return res
|
||||||
|
|
||||||
|
def __sub__(self, other):
|
||||||
|
res = ContextResourceUsage(copy_from=self)
|
||||||
|
res -= other
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
class LoggingContext(object):
|
class LoggingContext(object):
|
||||||
"""Additional context for log formatting. Contexts are scoped within a
|
"""Additional context for log formatting. Contexts are scoped within a
|
||||||
"with" block.
|
"with" block.
|
||||||
|
@ -58,9 +142,8 @@ class LoggingContext(object):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
__slots__ = [
|
__slots__ = [
|
||||||
"previous_context", "name", "ru_stime", "ru_utime",
|
"previous_context", "name",
|
||||||
"db_txn_count", "db_txn_duration_sec", "db_sched_duration_sec",
|
"_resource_usage",
|
||||||
"evt_db_fetch_count",
|
|
||||||
"usage_start",
|
"usage_start",
|
||||||
"main_thread", "alive",
|
"main_thread", "alive",
|
||||||
"request", "tag",
|
"request", "tag",
|
||||||
|
@ -103,18 +186,9 @@ class LoggingContext(object):
|
||||||
def __init__(self, name=None):
|
def __init__(self, name=None):
|
||||||
self.previous_context = LoggingContext.current_context()
|
self.previous_context = LoggingContext.current_context()
|
||||||
self.name = name
|
self.name = name
|
||||||
self.ru_stime = 0.
|
|
||||||
self.ru_utime = 0.
|
|
||||||
self.db_txn_count = 0
|
|
||||||
|
|
||||||
# sec spent waiting for db txns, excluding scheduling time
|
# track the resources used by this context so far
|
||||||
self.db_txn_duration_sec = 0
|
self._resource_usage = ContextResourceUsage()
|
||||||
|
|
||||||
# sec spent waiting for db txns to be scheduled
|
|
||||||
self.db_sched_duration_sec = 0
|
|
||||||
|
|
||||||
# number of events this thread has fetched from the db
|
|
||||||
self.evt_db_fetch_count = 0
|
|
||||||
|
|
||||||
# If alive has the thread resource usage when the logcontext last
|
# If alive has the thread resource usage when the logcontext last
|
||||||
# became active.
|
# became active.
|
||||||
|
@ -207,39 +281,43 @@ class LoggingContext(object):
|
||||||
logger.warning("Stopped logcontext %s on different thread", self)
|
logger.warning("Stopped logcontext %s on different thread", self)
|
||||||
return
|
return
|
||||||
|
|
||||||
# When we stop, let's record the resource used since we started
|
# When we stop, let's record the cpu used since we started
|
||||||
if self.usage_start:
|
if not self.usage_start:
|
||||||
usage_end = get_thread_resource_usage()
|
logger.warning(
|
||||||
|
"Called stop on logcontext %s without calling start", self,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
self.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime
|
usage_end = get_thread_resource_usage()
|
||||||
self.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime
|
|
||||||
|
|
||||||
self.usage_start = None
|
self._resource_usage.ru_utime += usage_end.ru_utime - self.usage_start.ru_utime
|
||||||
else:
|
self._resource_usage.ru_stime += usage_end.ru_stime - self.usage_start.ru_stime
|
||||||
logger.warning("Called stop on logcontext %s without calling start", self)
|
|
||||||
|
self.usage_start = None
|
||||||
|
|
||||||
def get_resource_usage(self):
|
def get_resource_usage(self):
|
||||||
"""Get CPU time used by this logcontext so far.
|
"""Get resources used by this logcontext so far.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
tuple[float, float]: The user and system CPU usage in seconds
|
ContextResourceUsage: a *copy* of the object tracking resource
|
||||||
|
usage so far
|
||||||
"""
|
"""
|
||||||
ru_utime = self.ru_utime
|
# we always return a copy, for consistency
|
||||||
ru_stime = self.ru_stime
|
res = self._resource_usage.copy()
|
||||||
|
|
||||||
# If we are on the correct thread and we're currently running then we
|
# If we are on the correct thread and we're currently running then we
|
||||||
# can include resource usage so far.
|
# can include resource usage so far.
|
||||||
is_main_thread = threading.current_thread() is self.main_thread
|
is_main_thread = threading.current_thread() is self.main_thread
|
||||||
if self.alive and self.usage_start and is_main_thread:
|
if self.alive and self.usage_start and is_main_thread:
|
||||||
current = get_thread_resource_usage()
|
current = get_thread_resource_usage()
|
||||||
ru_utime += current.ru_utime - self.usage_start.ru_utime
|
res.ru_utime += current.ru_utime - self.usage_start.ru_utime
|
||||||
ru_stime += current.ru_stime - self.usage_start.ru_stime
|
res.ru_stime += current.ru_stime - self.usage_start.ru_stime
|
||||||
|
|
||||||
return ru_utime, ru_stime
|
return res
|
||||||
|
|
||||||
def add_database_transaction(self, duration_sec):
|
def add_database_transaction(self, duration_sec):
|
||||||
self.db_txn_count += 1
|
self._resource_usage.db_txn_count += 1
|
||||||
self.db_txn_duration_sec += duration_sec
|
self._resource_usage.db_txn_duration_sec += duration_sec
|
||||||
|
|
||||||
def add_database_scheduled(self, sched_sec):
|
def add_database_scheduled(self, sched_sec):
|
||||||
"""Record a use of the database pool
|
"""Record a use of the database pool
|
||||||
|
@ -248,7 +326,7 @@ class LoggingContext(object):
|
||||||
sched_sec (float): number of seconds it took us to get a
|
sched_sec (float): number of seconds it took us to get a
|
||||||
connection
|
connection
|
||||||
"""
|
"""
|
||||||
self.db_sched_duration_sec += sched_sec
|
self._resource_usage.db_sched_duration_sec += sched_sec
|
||||||
|
|
||||||
def record_event_fetch(self, event_count):
|
def record_event_fetch(self, event_count):
|
||||||
"""Record a number of events being fetched from the db
|
"""Record a number of events being fetched from the db
|
||||||
|
@ -256,7 +334,7 @@ class LoggingContext(object):
|
||||||
Args:
|
Args:
|
||||||
event_count (int): number of events being fetched
|
event_count (int): number of events being fetched
|
||||||
"""
|
"""
|
||||||
self.evt_db_fetch_count += event_count
|
self._resource_usage.evt_db_fetch_count += event_count
|
||||||
|
|
||||||
|
|
||||||
class LoggingContextFilter(logging.Filter):
|
class LoggingContextFilter(logging.Filter):
|
||||||
|
|
|
@ -60,10 +60,9 @@ def measure_func(name):
|
||||||
|
|
||||||
class Measure(object):
|
class Measure(object):
|
||||||
__slots__ = [
|
__slots__ = [
|
||||||
"clock", "name", "start_context", "start", "new_context", "ru_utime",
|
"clock", "name", "start_context", "start",
|
||||||
"ru_stime",
|
|
||||||
"db_txn_count", "db_txn_duration_sec", "db_sched_duration_sec",
|
|
||||||
"created_context",
|
"created_context",
|
||||||
|
"start_usage",
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, clock, name):
|
def __init__(self, clock, name):
|
||||||
|
@ -81,10 +80,7 @@ class Measure(object):
|
||||||
self.start_context.__enter__()
|
self.start_context.__enter__()
|
||||||
self.created_context = True
|
self.created_context = True
|
||||||
|
|
||||||
self.ru_utime, self.ru_stime = self.start_context.get_resource_usage()
|
self.start_usage = self.start_context.get_resource_usage()
|
||||||
self.db_txn_count = self.start_context.db_txn_count
|
|
||||||
self.db_txn_duration_sec = self.start_context.db_txn_duration_sec
|
|
||||||
self.db_sched_duration_sec = self.start_context.db_sched_duration_sec
|
|
||||||
|
|
||||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||||
if isinstance(exc_type, Exception) or not self.start_context:
|
if isinstance(exc_type, Exception) or not self.start_context:
|
||||||
|
@ -108,15 +104,12 @@ class Measure(object):
|
||||||
logger.warn("Expected context. (%r)", self.name)
|
logger.warn("Expected context. (%r)", self.name)
|
||||||
return
|
return
|
||||||
|
|
||||||
ru_utime, ru_stime = context.get_resource_usage()
|
usage = context.get_resource_usage() - self.start_usage
|
||||||
|
block_ru_utime.labels(self.name).inc(usage.ru_utime)
|
||||||
block_ru_utime.labels(self.name).inc(ru_utime - self.ru_utime)
|
block_ru_stime.labels(self.name).inc(usage.ru_stime)
|
||||||
block_ru_stime.labels(self.name).inc(ru_stime - self.ru_stime)
|
block_db_txn_count.labels(self.name).inc(usage.db_txn_count)
|
||||||
block_db_txn_count.labels(self.name).inc(context.db_txn_count - self.db_txn_count)
|
block_db_txn_duration.labels(self.name).inc(usage.db_txn_duration_sec)
|
||||||
block_db_txn_duration.labels(self.name).inc(
|
block_db_sched_duration.labels(self.name).inc(usage.db_sched_duration_sec)
|
||||||
context.db_txn_duration_sec - self.db_txn_duration_sec)
|
|
||||||
block_db_sched_duration.labels(self.name).inc(
|
|
||||||
context.db_sched_duration_sec - self.db_sched_duration_sec)
|
|
||||||
|
|
||||||
if self.created_context:
|
if self.created_context:
|
||||||
self.start_context.__exit__(exc_type, exc_val, exc_tb)
|
self.start_context.__exit__(exc_type, exc_val, exc_tb)
|
||||||
|
|
Loading…
Reference in a new issue