Track DB scheduling delay per-request

For each request, track the amount of time spent waiting for a db connection. This entails adding it to the LoggingContext and we may as well add metrics for it while we are passing.
2018-01-12 00:27:14 +00:00 · 2018-01-12 00:27:14 +00:00 · 3d12d97415
commit 3d12d97415
parent 0f5d2cc37c
5 changed files with 43 additions and 4 deletions
--- a/synapse/http/server.py
+++ b/synapse/http/server.py
@ -102,6 +102,10 @@ response_db_txn_duration = metrics.register_counter(
    ),
 )

+# seconds spent waiting for a db connection, when processing this request
+response_db_sched_duration = metrics.register_counter(
+    "response_db_sched_duration_seconds", labels=["method", "servlet", "tag"]
+)

 _next_request_id = 0

@ -381,6 +385,9 @@ class RequestMetrics(object):
        response_db_txn_duration.inc_by(
            context.db_txn_duration_ms / 1000., request.method, self.name, tag
        )
+        response_db_sched_duration.inc_by(
+            context.db_sched_duration_ms / 1000., request.method, self.name, tag
+        )


 class RootRedirect(resource.Resource):
--- a/synapse/http/site.py
+++ b/synapse/http/site.py
@ -67,13 +67,14 @@ class SynapseRequest(Request):
            ru_utime, ru_stime = context.get_resource_usage()
            db_txn_count = context.db_txn_count
            db_txn_duration_ms = context.db_txn_duration_ms
+            db_sched_duration_ms = context.db_sched_duration_ms
        except Exception:
            ru_utime, ru_stime = (0, 0)
            db_txn_count, db_txn_duration_ms = (0, 0)

        self.site.access_logger.info(
            "%s - %s - {%s}"
-            " Processed request: %dms (%dms, %dms) (%dms/%d)"
+            " Processed request: %dms (%dms, %dms) (%dms/%dms/%d)"
            " %sB %s \"%s %s %s\" \"%s\"",
            self.getClientIP(),
            self.site.site_tag,
@ -81,6 +82,7 @@ class SynapseRequest(Request):
            int(time.time() * 1000) - self.start_time,
            int(ru_utime * 1000),
            int(ru_stime * 1000),
+            db_sched_duration_ms,
            db_txn_duration_ms,
            int(db_txn_count),
            self.sentLength,
--- a/synapse/storage/_base.py
+++ b/synapse/storage/_base.py
@ -347,7 +347,9 @@ class SQLBaseStore(object):

        def inner_func(conn, *args, **kwargs):
            with LoggingContext("runWithConnection") as context:
-                sql_scheduling_timer.inc_by(time.time() * 1000 - start_time)
+                sched_duration_ms = time.time() * 1000 - start_time
+                sql_scheduling_timer.inc_by(sched_duration_ms)
+                current_context.add_database_scheduled(sched_duration_ms)

                if self.database_engine.is_connection_closed(conn):
                    logger.debug("Reconnecting closed database connection")
--- a/synapse/util/logcontext.py
+++ b/synapse/util/logcontext.py
@ -59,7 +59,8 @@ class LoggingContext(object):

    __slots__ = [
        "previous_context", "name", "ru_stime", "ru_utime",
-        "db_txn_count", "db_txn_duration_ms", "usage_start", "usage_end",
+        "db_txn_count", "db_txn_duration_ms", "db_sched_duration_ms",
+        "usage_start", "usage_end",
        "main_thread", "alive",
        "request", "tag",
    ]
@ -86,6 +87,9 @@ class LoggingContext(object):
        def add_database_transaction(self, duration_ms):
            pass

+        def add_database_scheduled(self, sched_ms):
+            pass
+
        def __nonzero__(self):
            return False

@ -101,6 +105,9 @@ class LoggingContext(object):
        # ms spent waiting for db txns, excluding scheduling time
        self.db_txn_duration_ms = 0

+        # ms spent waiting for db txns to be scheduled
+        self.db_sched_duration_ms = 0
+
        self.usage_start = None
        self.usage_end = None
        self.main_thread = threading.current_thread()
@ -210,6 +217,15 @@ class LoggingContext(object):
        self.db_txn_count += 1
        self.db_txn_duration_ms += duration_ms

+    def add_database_scheduled(self, sched_ms):
+        """Record a use of the database pool
+
+        Args:
+            sched_ms (int): number of milliseconds it took us to get a
+                connection
+        """
+        self.db_sched_duration_ms += sched_ms
+

 class LoggingContextFilter(logging.Filter):
    """Logging filter that adds values from the current logging context to each
--- a/synapse/util/metrics.py
+++ b/synapse/util/metrics.py
@ -80,6 +80,11 @@ block_db_txn_duration = metrics.register_counter(
    ),
 )

+# seconds spent waiting for a db connection, in this block
+block_db_sched_duration = metrics.register_counter(
+    "block_db_sched_duration_seconds", labels=["block_name"],
+)
+

 def measure_func(name):
    def wrapper(func):
@ -96,7 +101,9 @@ def measure_func(name):
 class Measure(object):
    __slots__ = [
        "clock", "name", "start_context", "start", "new_context", "ru_utime",
-        "ru_stime", "db_txn_count", "db_txn_duration_ms", "created_context"
+        "ru_stime",
+        "db_txn_count", "db_txn_duration_ms", "db_sched_duration_ms",
+        "created_context",
    ]

    def __init__(self, clock, name):
@ -117,6 +124,7 @@ class Measure(object):
        self.ru_utime, self.ru_stime = self.start_context.get_resource_usage()
        self.db_txn_count = self.start_context.db_txn_count
        self.db_txn_duration_ms = self.start_context.db_txn_duration_ms
+        self.db_sched_duration_ms = self.start_context.db_sched_duration_ms

    def __exit__(self, exc_type, exc_val, exc_tb):
        if isinstance(exc_type, Exception) or not self.start_context:
@ -149,6 +157,10 @@ class Measure(object):
            (context.db_txn_duration_ms - self.db_txn_duration_ms) / 1000.,
            self.name
        )
+        block_db_sched_duration.inc_by(
+            (context.db_sched_duration_ms - self.db_sched_duration_ms) / 1000.,
+            self.name
+        )

        if self.created_context:
            self.start_context.__exit__(exc_type, exc_val, exc_tb)