Merge pull request #2785 from matrix-org/rav/reorganise_metrics_again

Reorganise request and block metrics
This commit is contained in:
Richard van der Hoff 2018-01-16 14:10:21 +00:00 committed by GitHub
commit febdca4b37
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 209 additions and 47 deletions

View file

@ -2,7 +2,10 @@ Unreleased
==========
synctl no longer starts the main synapse when using ``-a`` option with workers.
A new worker file should be added with ``worker_app: synapse.app.homeserver``
A new worker file should be added with ``worker_app: synapse.app.homeserver``.
This release also begins the process of renaming a number of the metrics
reported to prometheus. See `docs/metrics-howto.rst <docs/metrics-howto.rst#block-and-response-metrics-renamed-for-0-27-0>`_.
Changes in synapse v0.26.0 (2018-01-05)

View file

@ -16,7 +16,7 @@ How to monitor Synapse metrics using Prometheus
metrics_port: 9092
Also ensure that ``enable_metrics`` is set to ``True``.
Restart synapse.
3. Add a prometheus target for synapse.
@ -28,11 +28,58 @@ How to monitor Synapse metrics using Prometheus
static_configs:
- targets: ["my.server.here:9092"]
If your prometheus is older than 1.5.2, you will need to replace
If your prometheus is older than 1.5.2, you will need to replace
``static_configs`` in the above with ``target_groups``.
Restart prometheus.
Block and response metrics renamed for 0.27.0
---------------------------------------------
Synapse 0.27.0 begins the process of rationalising the duplicate ``*:count``
metrics reported for the resource tracking for code blocks and HTTP requests.
At the same time, the corresponding ``*:total`` metrics are being renamed, as
the ``:total`` suffix no longer makes sense in the absence of a corresponding
``:count`` metric.
To enable a graceful migration path, this release just adds new names for the
metrics being renamed. A future release will remove the old ones.
The following table shows the new metrics, and the old metrics which they are
replacing.
==================================================== ===================================================
New name Old name
==================================================== ===================================================
synapse_util_metrics_block_count synapse_util_metrics_block_timer:count
synapse_util_metrics_block_count synapse_util_metrics_block_ru_utime:count
synapse_util_metrics_block_count synapse_util_metrics_block_ru_stime:count
synapse_util_metrics_block_count synapse_util_metrics_block_db_txn_count:count
synapse_util_metrics_block_count synapse_util_metrics_block_db_txn_duration:count
synapse_util_metrics_block_time_seconds synapse_util_metrics_block_timer:total
synapse_util_metrics_block_ru_utime_seconds synapse_util_metrics_block_ru_utime:total
synapse_util_metrics_block_ru_stime_seconds synapse_util_metrics_block_ru_stime:total
synapse_util_metrics_block_db_txn_count synapse_util_metrics_block_db_txn_count:total
synapse_util_metrics_block_db_txn_duration_seconds synapse_util_metrics_block_db_txn_duration:total
synapse_http_server_response_count synapse_http_server_requests
synapse_http_server_response_count synapse_http_server_response_time:count
synapse_http_server_response_count synapse_http_server_response_ru_utime:count
synapse_http_server_response_count synapse_http_server_response_ru_stime:count
synapse_http_server_response_count synapse_http_server_response_db_txn_count:count
synapse_http_server_response_count synapse_http_server_response_db_txn_duration:count
synapse_http_server_response_time_seconds synapse_http_server_response_time:total
synapse_http_server_response_ru_utime_seconds synapse_http_server_response_ru_utime:total
synapse_http_server_response_ru_stime_seconds synapse_http_server_response_ru_stime:total
synapse_http_server_response_db_txn_count synapse_http_server_response_db_txn_count:total
synapse_http_server_response_db_txn_duration_seconds synapse_http_server_response_db_txn_duration:total
==================================================== ===================================================
Standard Metric Names
---------------------
@ -42,7 +89,7 @@ have been changed to seconds, from miliseconds.
================================== =============================
New name Old name
---------------------------------- -----------------------------
================================== =============================
process_cpu_user_seconds_total process_resource_utime / 1000
process_cpu_system_seconds_total process_resource_stime / 1000
process_open_fds (no 'type' label) process_fds
@ -52,8 +99,8 @@ The python-specific counts of garbage collector performance have been renamed.
=========================== ======================
New name Old name
--------------------------- ----------------------
python_gc_time reactor_gc_time
=========================== ======================
python_gc_time reactor_gc_time
python_gc_unreachable_total reactor_gc_unreachable
python_gc_counts reactor_gc_counts
=========================== ======================
@ -62,7 +109,7 @@ The twisted-specific reactor metrics have been renamed.
==================================== =====================
New name Old name
------------------------------------ ---------------------
==================================== =====================
python_twisted_reactor_pending_calls reactor_pending_calls
python_twisted_reactor_tick_time reactor_tick_time
==================================== =====================

View file

@ -42,34 +42,62 @@ logger = logging.getLogger(__name__)
metrics = synapse.metrics.get_metrics_for(__name__)
incoming_requests_counter = metrics.register_counter(
"requests",
# total number of responses served, split by method/servlet/tag
response_count = metrics.register_counter(
"response_count",
labels=["method", "servlet", "tag"],
alternative_names=(
# the following are all deprecated aliases for the same metric
metrics.name_prefix + x for x in (
"_requests",
"_response_time:count",
"_response_ru_utime:count",
"_response_ru_stime:count",
"_response_db_txn_count:count",
"_response_db_txn_duration:count",
)
)
)
outgoing_responses_counter = metrics.register_counter(
"responses",
labels=["method", "code"],
)
response_timer = metrics.register_distribution(
"response_time",
labels=["method", "servlet", "tag"]
response_timer = metrics.register_counter(
"response_time_seconds",
labels=["method", "servlet", "tag"],
alternative_names=(
metrics.name_prefix + "_response_time:total",
),
)
response_ru_utime = metrics.register_distribution(
"response_ru_utime", labels=["method", "servlet", "tag"]
response_ru_utime = metrics.register_counter(
"response_ru_utime_seconds", labels=["method", "servlet", "tag"],
alternative_names=(
metrics.name_prefix + "_response_ru_utime:total",
),
)
response_ru_stime = metrics.register_distribution(
"response_ru_stime", labels=["method", "servlet", "tag"]
response_ru_stime = metrics.register_counter(
"response_ru_stime_seconds", labels=["method", "servlet", "tag"],
alternative_names=(
metrics.name_prefix + "_response_ru_stime:total",
),
)
response_db_txn_count = metrics.register_distribution(
"response_db_txn_count", labels=["method", "servlet", "tag"]
response_db_txn_count = metrics.register_counter(
"response_db_txn_count", labels=["method", "servlet", "tag"],
alternative_names=(
metrics.name_prefix + "_response_db_txn_count:total",
),
)
response_db_txn_duration = metrics.register_distribution(
"response_db_txn_duration", labels=["method", "servlet", "tag"]
response_db_txn_duration = metrics.register_counter(
"response_db_txn_duration_seconds", labels=["method", "servlet", "tag"],
alternative_names=(
metrics.name_prefix + "_response_db_txn_duration:total",
),
)
@ -330,7 +358,7 @@ class RequestMetrics(object):
)
return
incoming_requests_counter.inc(request.method, self.name, tag)
response_count.inc(request.method, self.name, tag)
response_timer.inc_by(
clock.time_msec() - self.start, request.method,

View file

@ -17,16 +17,33 @@
from itertools import chain
# TODO(paul): I can't believe Python doesn't have one of these
def map_concat(func, items):
# flatten a list-of-lists
return list(chain.from_iterable(map(func, items)))
def flatten(items):
"""Flatten a list of lists
Args:
items: iterable[iterable[X]]
Returns:
list[X]: flattened list
"""
return list(chain.from_iterable(items))
class BaseMetric(object):
"""Base class for metrics which report a single value per label set
"""
def __init__(self, name, labels=[]):
self.name = name
def __init__(self, name, labels=[], alternative_names=[]):
"""
Args:
name (str): principal name for this metric
labels (list(str)): names of the labels which will be reported
for this metric
alternative_names (iterable(str)): list of alternative names for
this metric. This can be useful to provide a migration path
when renaming metrics.
"""
self._names = [name] + list(alternative_names)
self.labels = labels # OK not to clone as we never write it
def dimension(self):
@ -36,7 +53,7 @@ class BaseMetric(object):
return not len(self.labels)
def _render_labelvalue(self, value):
# TODO: some kind of value escape
# TODO: escape backslashes, quotes and newlines
return '"%s"' % (value)
def _render_key(self, values):
@ -47,6 +64,36 @@ class BaseMetric(object):
for k, v in zip(self.labels, values)])
)
def _render_for_labels(self, label_values, value):
"""Render this metric for a single set of labels
Args:
label_values (list[str]): values for each of the labels
value: value of the metric at with these labels
Returns:
iterable[str]: rendered metric
"""
rendered_labels = self._render_key(label_values)
return (
"%s%s %.12g" % (name, rendered_labels, value)
for name in self._names
)
def render(self):
"""Render this metric
Each metric is rendered as:
name{label1="val1",label2="val2"} value
https://prometheus.io/docs/instrumenting/exposition_formats/#text-format-details
Returns:
iterable[str]: rendered metrics
"""
raise NotImplementedError()
class CounterMetric(BaseMetric):
"""The simplest kind of metric; one that stores a monotonically-increasing
@ -62,6 +109,10 @@ class CounterMetric(BaseMetric):
def __init__(self, *args, **kwargs):
super(CounterMetric, self).__init__(*args, **kwargs)
# dict[list[str]]: value for each set of label values. the keys are the
# label values, in the same order as the labels in self.labels.
#
# (if the metric is a scalar, the (single) key is the empty list).
self.counts = {}
# Scalar metrics are never empty
@ -84,11 +135,11 @@ class CounterMetric(BaseMetric):
def inc(self, *values):
self.inc_by(1, *values)
def render_item(self, k):
return ["%s%s %.12g" % (self.name, self._render_key(k), self.counts[k])]
def render(self):
return map_concat(self.render_item, sorted(self.counts.keys()))
return flatten(
self._render_for_labels(k, self.counts[k])
for k in sorted(self.counts.keys())
)
class CallbackMetric(BaseMetric):
@ -105,10 +156,12 @@ class CallbackMetric(BaseMetric):
value = self.callback()
if self.is_scalar():
return ["%s %.12g" % (self.name, value)]
return list(self._render_for_labels([], value))
return ["%s%s %.12g" % (self.name, self._render_key(k), value[k])
for k in sorted(value.keys())]
return flatten(
self._render_for_labels(k, value[k])
for k in sorted(value.keys())
)
class DistributionMetric(object):

View file

@ -27,25 +27,56 @@ logger = logging.getLogger(__name__)
metrics = synapse.metrics.get_metrics_for(__name__)
block_timer = metrics.register_distribution(
"block_timer",
labels=["block_name"]
# total number of times we have hit this block
response_count = metrics.register_counter(
"block_count",
labels=["block_name"],
alternative_names=(
# the following are all deprecated aliases for the same metric
metrics.name_prefix + x for x in (
"_block_timer:count",
"_block_ru_utime:count",
"_block_ru_stime:count",
"_block_db_txn_count:count",
"_block_db_txn_duration:count",
)
)
)
block_ru_utime = metrics.register_distribution(
"block_ru_utime", labels=["block_name"]
block_timer = metrics.register_counter(
"block_time_seconds",
labels=["block_name"],
alternative_names=(
metrics.name_prefix + "_block_timer:total",
),
)
block_ru_stime = metrics.register_distribution(
"block_ru_stime", labels=["block_name"]
block_ru_utime = metrics.register_counter(
"block_ru_utime_seconds", labels=["block_name"],
alternative_names=(
metrics.name_prefix + "_block_ru_utime:total",
),
)
block_db_txn_count = metrics.register_distribution(
"block_db_txn_count", labels=["block_name"]
block_ru_stime = metrics.register_counter(
"block_ru_stime_seconds", labels=["block_name"],
alternative_names=(
metrics.name_prefix + "_block_ru_stime:total",
),
)
block_db_txn_duration = metrics.register_distribution(
"block_db_txn_duration", labels=["block_name"]
block_db_txn_count = metrics.register_counter(
"block_db_txn_count", labels=["block_name"],
alternative_names=(
metrics.name_prefix + "_block_db_txn_count:total",
),
)
block_db_txn_duration = metrics.register_counter(
"block_db_txn_duration_seconds", labels=["block_name"],
alternative_names=(
metrics.name_prefix + "_block_db_txn_count:total",
),
)