Reduce user_ips bloat during dedupe background update
The background update to remove duplicate rows naively deleted and reinserted the duplicates. For large tables with a large number of duplicates this causes a lot of bloat on postgres: the reinserted rows are appended to the end of the table, because the space freed by the deleted rows is not reused until a VACUUM has happened.

This should hopefully also help ensure that the query in the last batch uses the correct index, as inserting a large number of new rows without analyzing will upset the query planner.
parent 46b8a79b3a
commit 362d80b770

1 changed file with 60 additions and 3 deletions
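For context (not part of this commit): the bloat described in the message is observable from postgres itself. The following is a rough sketch of how one might watch it while the old background update runs, assuming a postgres homeserver database and the psycopg2 driver; the DSN is a placeholder, and only the user_ips table name comes from the schema this update touches.

    import psycopg2

    # Connection string is a placeholder; point it at the synapse database.
    conn = psycopg2.connect("dbname=synapse")

    with conn, conn.cursor() as cur:
        # Dead tuples left behind by DELETEs; the space is only reclaimed
        # once VACUUM (or autovacuum) has run.
        cur.execute(
            "SELECT n_live_tup, n_dead_tup FROM pg_stat_user_tables"
            " WHERE relname = 'user_ips'"
        )
        print("live/dead tuples:", cur.fetchone())

        # On-disk size of the table plus its indexes; with the old behaviour
        # this keeps growing, because reinserted rows are appended past the
        # dead ones.
        cur.execute("SELECT pg_size_pretty(pg_total_relation_size('user_ips'))")
        print("total size:", cur.fetchone()[0])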
@@ -167,12 +167,16 @@ class ClientIpStore(background_updates.BackgroundUpdateStore):
                 clause = "? <= last_seen AND last_seen < ?"
                 args = (begin_last_seen, end_last_seen)
 
+            # (Note: The DISTINCT in the inner query is important to ensure that
+            # the COUNT(*) is accurate, otherwise double counting may happen due
+            # to the join effectively being a cross product)
             txn.execute(
                 """
                 SELECT user_id, access_token, ip,
-                       MAX(device_id), MAX(user_agent), MAX(last_seen)
+                       MAX(device_id), MAX(user_agent), MAX(last_seen),
+                       COUNT(*)
                 FROM (
-                    SELECT user_id, access_token, ip
+                    SELECT DISTINCT user_id, access_token, ip
                     FROM user_ips
                     WHERE {}
                 ) c
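As a side note on the DISTINCT added above, here is a minimal, self-contained sketch (not from this commit) of the double counting the new comment warns about, using an in-memory sqlite3 table with a cut-down user_ips schema. Without DISTINCT, the join back onto user_ips is effectively a cross product per duplicated tuple, so COUNT(*) reports n*n rather than n.

    import sqlite3

    db = sqlite3.connect(":memory:")
    db.execute(
        "CREATE TABLE user_ips (user_id TEXT, access_token TEXT,"
        " ip TEXT, last_seen INTEGER)"
    )
    # Three duplicate rows for the same (user_id, access_token, ip) tuple.
    db.executemany(
        "INSERT INTO user_ips VALUES (?, ?, ?, ?)",
        [("@a:hs", "tok", "10.0.0.1", ts) for ts in (100, 200, 300)],
    )

    query = """
        SELECT user_id, access_token, ip, MAX(last_seen), COUNT(*)
        FROM (SELECT {} user_id, access_token, ip FROM user_ips) c
        INNER JOIN user_ips USING (user_id, access_token, ip)
        GROUP BY user_id, access_token, ip
        HAVING count(*) > 1
    """

    print(db.execute(query.format("DISTINCT")).fetchall())  # COUNT(*) == 3
    print(db.execute(query.format("")).fetchall())           # COUNT(*) == 9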
@@ -186,7 +190,60 @@ class ClientIpStore(background_updates.BackgroundUpdateStore):
 
             # We've got some duplicates
             for i in res:
-                user_id, access_token, ip, device_id, user_agent, last_seen = i
+                user_id, access_token, ip, device_id, user_agent, last_seen, count = i
 
+                # We want to delete the duplicates so we end up with only a
+                # single row.
+                #
+                # The naive way of doing this would be just to delete all rows
+                # and reinsert a constructed row. However, if there are a lot of
+                # duplicate rows this can cause the table to grow a lot, which
+                # can be problematic in two ways:
+                #   1. If user_ips is already large then this can cause the
+                #      table to rapidly grow, potentially filling the disk.
+                #   2. Reinserting a lot of rows can confuse the table
+                #      statistics for postgres, causing it to not use the
+                #      correct indices for the query above, resulting in a full
+                #      table scan. This is incredibly slow for large tables and
+                #      can kill database performance. (This seems to mainly
+                #      happen for the last query where the clause is simply `? <
+                #      last_seen`)
+                #
+                # So instead we want to delete all but *one* of the duplicate
+                # rows. That is hard to do reliably, so we cheat and do a two
+                # step process:
+                #   1. Delete all rows with a last_seen strictly less than the
+                #      max last_seen. This hopefully results in deleting all but
+                #      one row the majority of the time, but there may be
+                #      duplicate last_seen
+                #   2. If multiple rows remain, we fall back to the naive method
+                #      and simply delete all rows and reinsert.
+                #
+                # Note that this relies on no new duplicate rows being inserted,
+                # but if that is happening then this entire process is futile
+                # anyway.
+
+                # Do step 1:
+
+                txn.execute(
+                    """
+                    DELETE FROM user_ips
+                    WHERE user_id = ? AND access_token = ? AND ip = ? AND last_seen < ?
+                    """,
+                    (user_id, access_token, ip, last_seen)
+                )
+                if txn.rowcount == count - 1:
+                    # We deleted all but one of the duplicate rows, i.e. there
+                    # is exactly one remaining and so there is nothing left to
+                    # do.
+                    continue
+                elif txn.rowcount >= count:
+                    raise Exception(
+                        "We deleted more duplicate rows from 'user_ips' than expected",
+                    )
+
+                # The previous step didn't delete enough rows, so we fallback to
+                # step 2:
+
                 # Drop all the duplicates
                 txn.execute(
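To make the control flow above concrete, here is a minimal sketch (not the commit's code) of the two step dedupe against an in-memory sqlite3 table, with a deliberately tied max last_seen so that step 1 alone cannot get down to a single row and the step 2 fallback runs. The table and column names mirror user_ips; the sample values are invented.

    import sqlite3

    db = sqlite3.connect(":memory:")
    db.execute(
        "CREATE TABLE user_ips (user_id TEXT, access_token TEXT, ip TEXT,"
        " device_id TEXT, user_agent TEXT, last_seen INTEGER)"
    )
    # Two of the three duplicates share the max last_seen, so step 1 alone
    # cannot reduce the tuple to a single row.
    db.executemany(
        "INSERT INTO user_ips VALUES (?, ?, ?, ?, ?, ?)",
        [
            ("@a:hs", "tok", "10.0.0.1", "DEV", "UA", 100),
            ("@a:hs", "tok", "10.0.0.1", "DEV", "UA", 300),
            ("@a:hs", "tok", "10.0.0.1", "DEV", "UA", 300),
        ],
    )

    # Values the dedupe query would have returned for this tuple.
    user_id, access_token, ip, device_id, user_agent, last_seen, count = (
        "@a:hs", "tok", "10.0.0.1", "DEV", "UA", 300, 3
    )

    # Step 1: delete everything strictly older than the max last_seen.
    cur = db.execute(
        "DELETE FROM user_ips WHERE user_id = ? AND access_token = ? AND ip = ?"
        " AND last_seen < ?",
        (user_id, access_token, ip, last_seen),
    )
    if cur.rowcount != count - 1:
        # Step 2 (the old, naive fallback): drop the remaining duplicates and
        # reinsert a single constructed row.
        db.execute(
            "DELETE FROM user_ips WHERE user_id = ? AND access_token = ? AND ip = ?",
            (user_id, access_token, ip),
        )
        db.execute(
            "INSERT INTO user_ips VALUES (?, ?, ?, ?, ?, ?)",
            (user_id, access_token, ip, device_id, user_agent, last_seen),
        )

    print(db.execute("SELECT COUNT(*) FROM user_ips").fetchone()[0])  # -> 1

The real update in the diff additionally raises if step 1 deleted more rows than expected; the sketch omits that check for brevity.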