Track DB scheduling delay per-request

For each request, track the amount of time spent waiting for a db
connection. This entails adding it to the LoggingContext and we may as well add
metrics for it while we are passing.
This commit is contained in:
Richard van der Hoff 2018-01-12 00:27:14 +00:00
parent 0f5d2cc37c
commit 3d12d97415
5 changed files with 43 additions and 4 deletions

View file

@ -102,6 +102,10 @@ response_db_txn_duration = metrics.register_counter(
), ),
) )
# seconds spent waiting for a db connection, when processing this request
response_db_sched_duration = metrics.register_counter(
"response_db_sched_duration_seconds", labels=["method", "servlet", "tag"]
)
_next_request_id = 0 _next_request_id = 0
@ -381,6 +385,9 @@ class RequestMetrics(object):
response_db_txn_duration.inc_by( response_db_txn_duration.inc_by(
context.db_txn_duration_ms / 1000., request.method, self.name, tag context.db_txn_duration_ms / 1000., request.method, self.name, tag
) )
response_db_sched_duration.inc_by(
context.db_sched_duration_ms / 1000., request.method, self.name, tag
)
class RootRedirect(resource.Resource): class RootRedirect(resource.Resource):

View file

@ -67,13 +67,14 @@ class SynapseRequest(Request):
ru_utime, ru_stime = context.get_resource_usage() ru_utime, ru_stime = context.get_resource_usage()
db_txn_count = context.db_txn_count db_txn_count = context.db_txn_count
db_txn_duration_ms = context.db_txn_duration_ms db_txn_duration_ms = context.db_txn_duration_ms
db_sched_duration_ms = context.db_sched_duration_ms
except Exception: except Exception:
ru_utime, ru_stime = (0, 0) ru_utime, ru_stime = (0, 0)
db_txn_count, db_txn_duration_ms = (0, 0) db_txn_count, db_txn_duration_ms = (0, 0)
self.site.access_logger.info( self.site.access_logger.info(
"%s - %s - {%s}" "%s - %s - {%s}"
" Processed request: %dms (%dms, %dms) (%dms/%d)" " Processed request: %dms (%dms, %dms) (%dms/%dms/%d)"
" %sB %s \"%s %s %s\" \"%s\"", " %sB %s \"%s %s %s\" \"%s\"",
self.getClientIP(), self.getClientIP(),
self.site.site_tag, self.site.site_tag,
@ -81,6 +82,7 @@ class SynapseRequest(Request):
int(time.time() * 1000) - self.start_time, int(time.time() * 1000) - self.start_time,
int(ru_utime * 1000), int(ru_utime * 1000),
int(ru_stime * 1000), int(ru_stime * 1000),
db_sched_duration_ms,
db_txn_duration_ms, db_txn_duration_ms,
int(db_txn_count), int(db_txn_count),
self.sentLength, self.sentLength,

View file

@ -347,7 +347,9 @@ class SQLBaseStore(object):
def inner_func(conn, *args, **kwargs): def inner_func(conn, *args, **kwargs):
with LoggingContext("runWithConnection") as context: with LoggingContext("runWithConnection") as context:
sql_scheduling_timer.inc_by(time.time() * 1000 - start_time) sched_duration_ms = time.time() * 1000 - start_time
sql_scheduling_timer.inc_by(sched_duration_ms)
current_context.add_database_scheduled(sched_duration_ms)
if self.database_engine.is_connection_closed(conn): if self.database_engine.is_connection_closed(conn):
logger.debug("Reconnecting closed database connection") logger.debug("Reconnecting closed database connection")

View file

@ -59,7 +59,8 @@ class LoggingContext(object):
__slots__ = [ __slots__ = [
"previous_context", "name", "ru_stime", "ru_utime", "previous_context", "name", "ru_stime", "ru_utime",
"db_txn_count", "db_txn_duration_ms", "usage_start", "usage_end", "db_txn_count", "db_txn_duration_ms", "db_sched_duration_ms",
"usage_start", "usage_end",
"main_thread", "alive", "main_thread", "alive",
"request", "tag", "request", "tag",
] ]
@ -86,6 +87,9 @@ class LoggingContext(object):
def add_database_transaction(self, duration_ms): def add_database_transaction(self, duration_ms):
pass pass
def add_database_scheduled(self, sched_ms):
pass
def __nonzero__(self): def __nonzero__(self):
return False return False
@ -101,6 +105,9 @@ class LoggingContext(object):
# ms spent waiting for db txns, excluding scheduling time # ms spent waiting for db txns, excluding scheduling time
self.db_txn_duration_ms = 0 self.db_txn_duration_ms = 0
# ms spent waiting for db txns to be scheduled
self.db_sched_duration_ms = 0
self.usage_start = None self.usage_start = None
self.usage_end = None self.usage_end = None
self.main_thread = threading.current_thread() self.main_thread = threading.current_thread()
@ -210,6 +217,15 @@ class LoggingContext(object):
self.db_txn_count += 1 self.db_txn_count += 1
self.db_txn_duration_ms += duration_ms self.db_txn_duration_ms += duration_ms
def add_database_scheduled(self, sched_ms):
"""Record a use of the database pool
Args:
sched_ms (int): number of milliseconds it took us to get a
connection
"""
self.db_sched_duration_ms += sched_ms
class LoggingContextFilter(logging.Filter): class LoggingContextFilter(logging.Filter):
"""Logging filter that adds values from the current logging context to each """Logging filter that adds values from the current logging context to each

View file

@ -80,6 +80,11 @@ block_db_txn_duration = metrics.register_counter(
), ),
) )
# seconds spent waiting for a db connection, in this block
block_db_sched_duration = metrics.register_counter(
"block_db_sched_duration_seconds", labels=["block_name"],
)
def measure_func(name): def measure_func(name):
def wrapper(func): def wrapper(func):
@ -96,7 +101,9 @@ def measure_func(name):
class Measure(object): class Measure(object):
__slots__ = [ __slots__ = [
"clock", "name", "start_context", "start", "new_context", "ru_utime", "clock", "name", "start_context", "start", "new_context", "ru_utime",
"ru_stime", "db_txn_count", "db_txn_duration_ms", "created_context" "ru_stime",
"db_txn_count", "db_txn_duration_ms", "db_sched_duration_ms",
"created_context",
] ]
def __init__(self, clock, name): def __init__(self, clock, name):
@ -117,6 +124,7 @@ class Measure(object):
self.ru_utime, self.ru_stime = self.start_context.get_resource_usage() self.ru_utime, self.ru_stime = self.start_context.get_resource_usage()
self.db_txn_count = self.start_context.db_txn_count self.db_txn_count = self.start_context.db_txn_count
self.db_txn_duration_ms = self.start_context.db_txn_duration_ms self.db_txn_duration_ms = self.start_context.db_txn_duration_ms
self.db_sched_duration_ms = self.start_context.db_sched_duration_ms
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
if isinstance(exc_type, Exception) or not self.start_context: if isinstance(exc_type, Exception) or not self.start_context:
@ -149,6 +157,10 @@ class Measure(object):
(context.db_txn_duration_ms - self.db_txn_duration_ms) / 1000., (context.db_txn_duration_ms - self.db_txn_duration_ms) / 1000.,
self.name self.name
) )
block_db_sched_duration.inc_by(
(context.db_sched_duration_ms - self.db_sched_duration_ms) / 1000.,
self.name
)
if self.created_context: if self.created_context:
self.start_context.__exit__(exc_type, exc_val, exc_tb) self.start_context.__exit__(exc_type, exc_val, exc_tb)