mirror of
https://mau.dev/maunium/synapse.git
synced 2024-11-15 14:32:30 +01:00
Add 'failure_ts' column to 'destinations' table (#6016)
Track the time that a server started failing at, for general analysis purposes.
This commit is contained in:
parent
850dcfd2d3
commit
1e19ce00bf
7 changed files with 195 additions and 12 deletions
1
changelog.d/6016.misc
Normal file
1
changelog.d/6016.misc
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Add a 'failure_ts' column to the 'destinations' database table.
|
25
synapse/storage/schema/delta/56/destinations_failure_ts.sql
Normal file
25
synapse/storage/schema/delta/56/destinations_failure_ts.sql
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
/* Copyright 2019 The Matrix.org Foundation C.I.C
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Record the timestamp when a given server started failing
|
||||||
|
*/
|
||||||
|
ALTER TABLE destinations ADD failure_ts BIGINT;
|
||||||
|
|
||||||
|
/* as a rough approximation, we assume that the server started failing at
|
||||||
|
* retry_interval before the last retry
|
||||||
|
*/
|
||||||
|
UPDATE destinations SET failure_ts = retry_last_ts - retry_interval
|
||||||
|
WHERE retry_last_ts > 0;
|
|
@ -165,7 +165,7 @@ class TransactionStore(SQLBaseStore):
|
||||||
txn,
|
txn,
|
||||||
table="destinations",
|
table="destinations",
|
||||||
keyvalues={"destination": destination},
|
keyvalues={"destination": destination},
|
||||||
retcols=("destination", "retry_last_ts", "retry_interval"),
|
retcols=("destination", "failure_ts", "retry_last_ts", "retry_interval"),
|
||||||
allow_none=True,
|
allow_none=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -174,12 +174,15 @@ class TransactionStore(SQLBaseStore):
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def set_destination_retry_timings(self, destination, retry_last_ts, retry_interval):
|
def set_destination_retry_timings(
|
||||||
|
self, destination, failure_ts, retry_last_ts, retry_interval
|
||||||
|
):
|
||||||
"""Sets the current retry timings for a given destination.
|
"""Sets the current retry timings for a given destination.
|
||||||
Both timings should be zero if retrying is no longer occuring.
|
Both timings should be zero if retrying is no longer occuring.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
destination (str)
|
destination (str)
|
||||||
|
failure_ts (int|None) - when the server started failing (ms since epoch)
|
||||||
retry_last_ts (int) - time of last retry attempt in unix epoch ms
|
retry_last_ts (int) - time of last retry attempt in unix epoch ms
|
||||||
retry_interval (int) - how long until next retry in ms
|
retry_interval (int) - how long until next retry in ms
|
||||||
"""
|
"""
|
||||||
|
@ -189,12 +192,13 @@ class TransactionStore(SQLBaseStore):
|
||||||
"set_destination_retry_timings",
|
"set_destination_retry_timings",
|
||||||
self._set_destination_retry_timings,
|
self._set_destination_retry_timings,
|
||||||
destination,
|
destination,
|
||||||
|
failure_ts,
|
||||||
retry_last_ts,
|
retry_last_ts,
|
||||||
retry_interval,
|
retry_interval,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _set_destination_retry_timings(
|
def _set_destination_retry_timings(
|
||||||
self, txn, destination, retry_last_ts, retry_interval
|
self, txn, destination, failure_ts, retry_last_ts, retry_interval
|
||||||
):
|
):
|
||||||
|
|
||||||
if self.database_engine.can_native_upsert:
|
if self.database_engine.can_native_upsert:
|
||||||
|
@ -202,9 +206,12 @@ class TransactionStore(SQLBaseStore):
|
||||||
# resetting it) or greater than the existing retry interval.
|
# resetting it) or greater than the existing retry interval.
|
||||||
|
|
||||||
sql = """
|
sql = """
|
||||||
INSERT INTO destinations (destination, retry_last_ts, retry_interval)
|
INSERT INTO destinations (
|
||||||
VALUES (?, ?, ?)
|
destination, failure_ts, retry_last_ts, retry_interval
|
||||||
|
)
|
||||||
|
VALUES (?, ?, ?, ?)
|
||||||
ON CONFLICT (destination) DO UPDATE SET
|
ON CONFLICT (destination) DO UPDATE SET
|
||||||
|
failure_ts = EXCLUDED.failure_ts,
|
||||||
retry_last_ts = EXCLUDED.retry_last_ts,
|
retry_last_ts = EXCLUDED.retry_last_ts,
|
||||||
retry_interval = EXCLUDED.retry_interval
|
retry_interval = EXCLUDED.retry_interval
|
||||||
WHERE
|
WHERE
|
||||||
|
@ -212,7 +219,7 @@ class TransactionStore(SQLBaseStore):
|
||||||
OR destinations.retry_interval < EXCLUDED.retry_interval
|
OR destinations.retry_interval < EXCLUDED.retry_interval
|
||||||
"""
|
"""
|
||||||
|
|
||||||
txn.execute(sql, (destination, retry_last_ts, retry_interval))
|
txn.execute(sql, (destination, failure_ts, retry_last_ts, retry_interval))
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -225,7 +232,7 @@ class TransactionStore(SQLBaseStore):
|
||||||
txn,
|
txn,
|
||||||
table="destinations",
|
table="destinations",
|
||||||
keyvalues={"destination": destination},
|
keyvalues={"destination": destination},
|
||||||
retcols=("retry_last_ts", "retry_interval"),
|
retcols=("failure_ts", "retry_last_ts", "retry_interval"),
|
||||||
allow_none=True,
|
allow_none=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -235,6 +242,7 @@ class TransactionStore(SQLBaseStore):
|
||||||
table="destinations",
|
table="destinations",
|
||||||
values={
|
values={
|
||||||
"destination": destination,
|
"destination": destination,
|
||||||
|
"failure_ts": failure_ts,
|
||||||
"retry_last_ts": retry_last_ts,
|
"retry_last_ts": retry_last_ts,
|
||||||
"retry_interval": retry_interval,
|
"retry_interval": retry_interval,
|
||||||
},
|
},
|
||||||
|
@ -245,6 +253,7 @@ class TransactionStore(SQLBaseStore):
|
||||||
"destinations",
|
"destinations",
|
||||||
keyvalues={"destination": destination},
|
keyvalues={"destination": destination},
|
||||||
updatevalues={
|
updatevalues={
|
||||||
|
"failure_ts": failure_ts,
|
||||||
"retry_last_ts": retry_last_ts,
|
"retry_last_ts": retry_last_ts,
|
||||||
"retry_interval": retry_interval,
|
"retry_interval": retry_interval,
|
||||||
},
|
},
|
||||||
|
|
|
@ -80,11 +80,13 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False, **kwargs)
|
||||||
# We aren't ready to retry that destination.
|
# We aren't ready to retry that destination.
|
||||||
raise
|
raise
|
||||||
"""
|
"""
|
||||||
|
failure_ts = None
|
||||||
retry_last_ts, retry_interval = (0, 0)
|
retry_last_ts, retry_interval = (0, 0)
|
||||||
|
|
||||||
retry_timings = yield store.get_destination_retry_timings(destination)
|
retry_timings = yield store.get_destination_retry_timings(destination)
|
||||||
|
|
||||||
if retry_timings:
|
if retry_timings:
|
||||||
|
failure_ts = retry_timings["failure_ts"]
|
||||||
retry_last_ts, retry_interval = (
|
retry_last_ts, retry_interval = (
|
||||||
retry_timings["retry_last_ts"],
|
retry_timings["retry_last_ts"],
|
||||||
retry_timings["retry_interval"],
|
retry_timings["retry_interval"],
|
||||||
|
@ -108,6 +110,7 @@ def get_retry_limiter(destination, clock, store, ignore_backoff=False, **kwargs)
|
||||||
destination,
|
destination,
|
||||||
clock,
|
clock,
|
||||||
store,
|
store,
|
||||||
|
failure_ts,
|
||||||
retry_interval,
|
retry_interval,
|
||||||
backoff_on_failure=backoff_on_failure,
|
backoff_on_failure=backoff_on_failure,
|
||||||
**kwargs
|
**kwargs
|
||||||
|
@ -120,6 +123,7 @@ class RetryDestinationLimiter(object):
|
||||||
destination,
|
destination,
|
||||||
clock,
|
clock,
|
||||||
store,
|
store,
|
||||||
|
failure_ts,
|
||||||
retry_interval,
|
retry_interval,
|
||||||
backoff_on_404=False,
|
backoff_on_404=False,
|
||||||
backoff_on_failure=True,
|
backoff_on_failure=True,
|
||||||
|
@ -133,6 +137,8 @@ class RetryDestinationLimiter(object):
|
||||||
destination (str)
|
destination (str)
|
||||||
clock (Clock)
|
clock (Clock)
|
||||||
store (DataStore)
|
store (DataStore)
|
||||||
|
failure_ts (int|None): when this destination started failing (in ms since
|
||||||
|
the epoch), or zero if the last request was successful
|
||||||
retry_interval (int): The next retry interval taken from the
|
retry_interval (int): The next retry interval taken from the
|
||||||
database in milliseconds, or zero if the last request was
|
database in milliseconds, or zero if the last request was
|
||||||
successful.
|
successful.
|
||||||
|
@ -145,6 +151,7 @@ class RetryDestinationLimiter(object):
|
||||||
self.store = store
|
self.store = store
|
||||||
self.destination = destination
|
self.destination = destination
|
||||||
|
|
||||||
|
self.failure_ts = failure_ts
|
||||||
self.retry_interval = retry_interval
|
self.retry_interval = retry_interval
|
||||||
self.backoff_on_404 = backoff_on_404
|
self.backoff_on_404 = backoff_on_404
|
||||||
self.backoff_on_failure = backoff_on_failure
|
self.backoff_on_failure = backoff_on_failure
|
||||||
|
@ -186,6 +193,7 @@ class RetryDestinationLimiter(object):
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Connection to %s was successful; clearing backoff", self.destination
|
"Connection to %s was successful; clearing backoff", self.destination
|
||||||
)
|
)
|
||||||
|
self.failure_ts = None
|
||||||
retry_last_ts = 0
|
retry_last_ts = 0
|
||||||
self.retry_interval = 0
|
self.retry_interval = 0
|
||||||
elif not self.backoff_on_failure:
|
elif not self.backoff_on_failure:
|
||||||
|
@ -211,11 +219,17 @@ class RetryDestinationLimiter(object):
|
||||||
)
|
)
|
||||||
retry_last_ts = int(self.clock.time_msec())
|
retry_last_ts = int(self.clock.time_msec())
|
||||||
|
|
||||||
|
if self.failure_ts is None:
|
||||||
|
self.failure_ts = retry_last_ts
|
||||||
|
|
||||||
@defer.inlineCallbacks
|
@defer.inlineCallbacks
|
||||||
def store_retry_timings():
|
def store_retry_timings():
|
||||||
try:
|
try:
|
||||||
yield self.store.set_destination_retry_timings(
|
yield self.store.set_destination_retry_timings(
|
||||||
self.destination, retry_last_ts, self.retry_interval
|
self.destination,
|
||||||
|
self.failure_ts,
|
||||||
|
retry_last_ts,
|
||||||
|
self.retry_interval,
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception("Failed to store destination_retry_timings")
|
logger.exception("Failed to store destination_retry_timings")
|
||||||
|
|
|
@ -99,7 +99,12 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
|
||||||
self.event_source = hs.get_event_sources().sources["typing"]
|
self.event_source = hs.get_event_sources().sources["typing"]
|
||||||
|
|
||||||
self.datastore = hs.get_datastore()
|
self.datastore = hs.get_datastore()
|
||||||
retry_timings_res = {"destination": "", "retry_last_ts": 0, "retry_interval": 0}
|
retry_timings_res = {
|
||||||
|
"destination": "",
|
||||||
|
"retry_last_ts": 0,
|
||||||
|
"retry_interval": 0,
|
||||||
|
"failure_ts": None,
|
||||||
|
}
|
||||||
self.datastore.get_destination_retry_timings.return_value = defer.succeed(
|
self.datastore.get_destination_retry_timings.return_value = defer.succeed(
|
||||||
retry_timings_res
|
retry_timings_res
|
||||||
)
|
)
|
||||||
|
|
|
@ -29,17 +29,19 @@ class TransactionStoreTestCase(HomeserverTestCase):
|
||||||
r = self.get_success(d)
|
r = self.get_success(d)
|
||||||
self.assertIsNone(r)
|
self.assertIsNone(r)
|
||||||
|
|
||||||
d = self.store.set_destination_retry_timings("example.com", 50, 100)
|
d = self.store.set_destination_retry_timings("example.com", 1000, 50, 100)
|
||||||
self.get_success(d)
|
self.get_success(d)
|
||||||
|
|
||||||
d = self.store.get_destination_retry_timings("example.com")
|
d = self.store.get_destination_retry_timings("example.com")
|
||||||
r = self.get_success(d)
|
r = self.get_success(d)
|
||||||
|
|
||||||
self.assert_dict({"retry_last_ts": 50, "retry_interval": 100}, r)
|
self.assert_dict(
|
||||||
|
{"retry_last_ts": 50, "retry_interval": 100, "failure_ts": 1000}, r
|
||||||
|
)
|
||||||
|
|
||||||
def test_initial_set_transactions(self):
|
def test_initial_set_transactions(self):
|
||||||
"""Tests that we can successfully set the destination retries (there
|
"""Tests that we can successfully set the destination retries (there
|
||||||
was a bug around invalidating the cache that broke this)
|
was a bug around invalidating the cache that broke this)
|
||||||
"""
|
"""
|
||||||
d = self.store.set_destination_retry_timings("example.com", 50, 100)
|
d = self.store.set_destination_retry_timings("example.com", 1000, 50, 100)
|
||||||
self.get_success(d)
|
self.get_success(d)
|
||||||
|
|
127
tests/util/test_retryutils.py
Normal file
127
tests/util/test_retryutils.py
Normal file
|
@ -0,0 +1,127 @@
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright 2019 The Matrix.org Foundation C.I.C.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
from synapse.util.retryutils import (
|
||||||
|
MIN_RETRY_INTERVAL,
|
||||||
|
RETRY_MULTIPLIER,
|
||||||
|
NotRetryingDestination,
|
||||||
|
get_retry_limiter,
|
||||||
|
)
|
||||||
|
|
||||||
|
from tests.unittest import HomeserverTestCase
|
||||||
|
|
||||||
|
|
||||||
|
class RetryLimiterTestCase(HomeserverTestCase):
|
||||||
|
def test_new_destination(self):
|
||||||
|
"""A happy-path case with a new destination and a successful operation"""
|
||||||
|
store = self.hs.get_datastore()
|
||||||
|
d = get_retry_limiter("test_dest", self.clock, store)
|
||||||
|
self.pump()
|
||||||
|
limiter = self.successResultOf(d)
|
||||||
|
|
||||||
|
# advance the clock a bit before making the request
|
||||||
|
self.pump(1)
|
||||||
|
|
||||||
|
with limiter:
|
||||||
|
pass
|
||||||
|
|
||||||
|
d = store.get_destination_retry_timings("test_dest")
|
||||||
|
self.pump()
|
||||||
|
new_timings = self.successResultOf(d)
|
||||||
|
self.assertIsNone(new_timings)
|
||||||
|
|
||||||
|
def test_limiter(self):
|
||||||
|
"""General test case which walks through the process of a failing request"""
|
||||||
|
store = self.hs.get_datastore()
|
||||||
|
|
||||||
|
d = get_retry_limiter("test_dest", self.clock, store)
|
||||||
|
self.pump()
|
||||||
|
limiter = self.successResultOf(d)
|
||||||
|
|
||||||
|
self.pump(1)
|
||||||
|
try:
|
||||||
|
with limiter:
|
||||||
|
self.pump(1)
|
||||||
|
failure_ts = self.clock.time_msec()
|
||||||
|
raise AssertionError("argh")
|
||||||
|
except AssertionError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# wait for the update to land
|
||||||
|
self.pump()
|
||||||
|
|
||||||
|
d = store.get_destination_retry_timings("test_dest")
|
||||||
|
self.pump()
|
||||||
|
new_timings = self.successResultOf(d)
|
||||||
|
self.assertEqual(new_timings["failure_ts"], failure_ts)
|
||||||
|
self.assertEqual(new_timings["retry_last_ts"], failure_ts)
|
||||||
|
self.assertEqual(new_timings["retry_interval"], MIN_RETRY_INTERVAL)
|
||||||
|
|
||||||
|
# now if we try again we should get a failure
|
||||||
|
d = get_retry_limiter("test_dest", self.clock, store)
|
||||||
|
self.pump()
|
||||||
|
self.failureResultOf(d, NotRetryingDestination)
|
||||||
|
|
||||||
|
#
|
||||||
|
# advance the clock and try again
|
||||||
|
#
|
||||||
|
|
||||||
|
self.pump(MIN_RETRY_INTERVAL)
|
||||||
|
d = get_retry_limiter("test_dest", self.clock, store)
|
||||||
|
self.pump()
|
||||||
|
limiter = self.successResultOf(d)
|
||||||
|
|
||||||
|
self.pump(1)
|
||||||
|
try:
|
||||||
|
with limiter:
|
||||||
|
self.pump(1)
|
||||||
|
retry_ts = self.clock.time_msec()
|
||||||
|
raise AssertionError("argh")
|
||||||
|
except AssertionError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# wait for the update to land
|
||||||
|
self.pump()
|
||||||
|
|
||||||
|
d = store.get_destination_retry_timings("test_dest")
|
||||||
|
self.pump()
|
||||||
|
new_timings = self.successResultOf(d)
|
||||||
|
self.assertEqual(new_timings["failure_ts"], failure_ts)
|
||||||
|
self.assertEqual(new_timings["retry_last_ts"], retry_ts)
|
||||||
|
self.assertGreaterEqual(
|
||||||
|
new_timings["retry_interval"], MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 0.5
|
||||||
|
)
|
||||||
|
self.assertLessEqual(
|
||||||
|
new_timings["retry_interval"], MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0
|
||||||
|
)
|
||||||
|
|
||||||
|
#
|
||||||
|
# one more go, with success
|
||||||
|
#
|
||||||
|
self.pump(MIN_RETRY_INTERVAL * RETRY_MULTIPLIER * 2.0)
|
||||||
|
d = get_retry_limiter("test_dest", self.clock, store)
|
||||||
|
self.pump()
|
||||||
|
limiter = self.successResultOf(d)
|
||||||
|
|
||||||
|
self.pump(1)
|
||||||
|
with limiter:
|
||||||
|
self.pump(1)
|
||||||
|
|
||||||
|
# wait for the update to land
|
||||||
|
self.pump()
|
||||||
|
|
||||||
|
d = store.get_destination_retry_timings("test_dest")
|
||||||
|
self.pump()
|
||||||
|
new_timings = self.successResultOf(d)
|
||||||
|
self.assertIsNone(new_timings)
|
Loading…
Reference in a new issue