Merge pull request #1193 from matrix-org/paul/metrics

More updates to Prometheus metrics exposition
This commit is contained in:
Paul Evans 2016-11-03 17:22:15 +00:00 committed by GitHub
commit 7fc2b5c063
3 changed files with 18 additions and 71 deletions

View file

@ -51,9 +51,9 @@ python_gc_counts reactor_gc_counts
The twisted-specific reactor metrics have been renamed. The twisted-specific reactor metrics have been renamed.
==================================== ================= ==================================== =====================
New name Old name New name Old name
------------------------------------ ----------------- ------------------------------------ ---------------------
python_twisted_reactor_pending_calls reactor_tick_time python_twisted_reactor_pending_calls reactor_pending_calls
python_twisted_reactor_tick_time reactor_tick_time python_twisted_reactor_tick_time reactor_tick_time
==================================== ================= ==================================== =====================

View file

@ -111,18 +111,20 @@ def render_all():
return "\n".join(strs) return "\n".join(strs)
reactor_metrics = get_metrics_for("reactor") register_process_collector(get_metrics_for("process"))
tick_time = reactor_metrics.register_distribution("tick_time")
pending_calls_metric = reactor_metrics.register_distribution("pending_calls")
gc_time = reactor_metrics.register_distribution("gc_time", labels=["gen"])
gc_unreachable = reactor_metrics.register_counter("gc_unreachable", labels=["gen"])
reactor_metrics.register_callback( python_metrics = get_metrics_for("python")
gc_time = python_metrics.register_distribution("gc_time", labels=["gen"])
gc_unreachable = python_metrics.register_counter("gc_unreachable_total", labels=["gen"])
python_metrics.register_callback(
"gc_counts", lambda: {(i,): v for i, v in enumerate(gc.get_count())}, labels=["gen"] "gc_counts", lambda: {(i,): v for i, v in enumerate(gc.get_count())}, labels=["gen"]
) )
register_process_collector(get_metrics_for("process")) reactor_metrics = get_metrics_for("python.twisted.reactor")
tick_time = reactor_metrics.register_distribution("tick_time")
pending_calls_metric = reactor_metrics.register_distribution("pending_calls")
def runUntilCurrentTimer(func): def runUntilCurrentTimer(func):

View file

@ -13,12 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# Because otherwise 'resource' collides with synapse.metrics.resource
from __future__ import absolute_import
import os import os
import stat
from resource import getrusage, RUSAGE_SELF
TICKS_PER_SEC = 100 TICKS_PER_SEC = 100
@ -29,16 +24,6 @@ HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat")
HAVE_PROC_SELF_LIMITS = os.path.exists("/proc/self/limits") HAVE_PROC_SELF_LIMITS = os.path.exists("/proc/self/limits")
HAVE_PROC_SELF_FD = os.path.exists("/proc/self/fd") HAVE_PROC_SELF_FD = os.path.exists("/proc/self/fd")
TYPES = {
stat.S_IFSOCK: "SOCK",
stat.S_IFLNK: "LNK",
stat.S_IFREG: "REG",
stat.S_IFBLK: "BLK",
stat.S_IFDIR: "DIR",
stat.S_IFCHR: "CHR",
stat.S_IFIFO: "FIFO",
}
# Field indexes from /proc/self/stat, taken from the proc(5) manpage # Field indexes from /proc/self/stat, taken from the proc(5) manpage
STAT_FIELDS = { STAT_FIELDS = {
"utime": 14, "utime": 14,
@ -49,9 +34,7 @@ STAT_FIELDS = {
} }
rusage = None
stats = {} stats = {}
fd_counts = None
# In order to report process_start_time_seconds we need to know the # In order to report process_start_time_seconds we need to know the
# machine's boot time, because the value in /proc/self/stat is relative to # machine's boot time, because the value in /proc/self/stat is relative to
@ -65,9 +48,6 @@ if HAVE_PROC_STAT:
def update_resource_metrics(): def update_resource_metrics():
global rusage
rusage = getrusage(RUSAGE_SELF)
if HAVE_PROC_SELF_STAT: if HAVE_PROC_SELF_STAT:
global stats global stats
with open("/proc/self/stat") as s: with open("/proc/self/stat") as s:
@ -80,52 +60,17 @@ def update_resource_metrics():
# we've lost the first two fields in PID and COMMAND above # we've lost the first two fields in PID and COMMAND above
stats[name] = int(raw_stats[index - 3]) stats[name] = int(raw_stats[index - 3])
global fd_counts
fd_counts = _process_fds()
def _process_fds():
counts = {(k,): 0 for k in TYPES.values()}
counts[("other",)] = 0
def _count_fds():
# Not every OS will have a /proc/self/fd directory # Not every OS will have a /proc/self/fd directory
if not HAVE_PROC_SELF_FD: if not HAVE_PROC_SELF_FD:
return counts return 0
for fd in os.listdir("/proc/self/fd"): return len(os.listdir("/proc/self/fd"))
try:
s = os.stat("/proc/self/fd/%s" % (fd))
fmt = stat.S_IFMT(s.st_mode)
if fmt in TYPES:
t = TYPES[fmt]
else:
t = "other"
counts[(t,)] += 1
except OSError:
# the dirh itself used by listdir() is usually missing by now
pass
return counts
def register_process_collector(process_metrics): def register_process_collector(process_metrics):
# Legacy synapse-invented metric names process_metrics.register_collector(update_resource_metrics)
resource_metrics = process_metrics.make_subspace("resource")
resource_metrics.register_collector(update_resource_metrics)
# msecs
resource_metrics.register_callback("utime", lambda: rusage.ru_utime * 1000)
resource_metrics.register_callback("stime", lambda: rusage.ru_stime * 1000)
# kilobytes
resource_metrics.register_callback("maxrss", lambda: rusage.ru_maxrss * 1024)
process_metrics.register_callback("fds", _process_fds, labels=["type"])
# New prometheus-standard metric names
if HAVE_PROC_SELF_STAT: if HAVE_PROC_SELF_STAT:
process_metrics.register_callback( process_metrics.register_callback(
@ -158,7 +103,7 @@ def register_process_collector(process_metrics):
if HAVE_PROC_SELF_FD: if HAVE_PROC_SELF_FD:
process_metrics.register_callback( process_metrics.register_callback(
"open_fds", "open_fds",
lambda: sum(fd_counts.values()) lambda: _count_fds()
) )
if HAVE_PROC_SELF_LIMITS: if HAVE_PROC_SELF_LIMITS: