From 1fc1bc2a51b025e46e3d44056ea1836dfc48cc6e Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 27 Oct 2016 14:14:44 +0100 Subject: [PATCH] Fix user_threepids schema delta The delta `37/user_threepids.sql` aimed to update all the email addresses to be lower case, however duplicate emails may exist in the table already. This commit adds a step where the delta moves the duplicate emails to a new `medium` `email_old`. Only the most recently used account keeps the binding intact. We move rather than delete so that we retain some record of which emails were associated with which account. --- .../schema/delta/37/user_threepids.sql | 39 ++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/synapse/storage/schema/delta/37/user_threepids.sql b/synapse/storage/schema/delta/37/user_threepids.sql index ef8813e72..5f483a2ed 100644 --- a/synapse/storage/schema/delta/37/user_threepids.sql +++ b/synapse/storage/schema/delta/37/user_threepids.sql @@ -17,7 +17,44 @@ * Update any email addresses that were stored with mixed case into all * lowercase */ -UPDATE user_threepids SET address = LOWER(address) where medium = 'email'; + + -- There may be "duplicate" emails (with different case) already in the table, + -- so we find them and move all but the most recently used account. + UPDATE user_threepids + SET medium = 'email_old' + WHERE medium = 'email' + AND address IN ( + -- `user_last_seen` maps user_ids to the last time we saw them + WITH user_last_seen AS ( + SELECT user_id, max(last_seen) AS ts FROM user_ips GROUP BY user_id + ), + -- `duplicate_addresses` is a table of all the email addresses that + -- appear multiple times and the most recently we saw any of their users + duplicate_addresses AS ( + SELECT lower(u1.address) AS address, max(ts.ts) AS max_ts + FROM user_threepids AS u1 + INNER JOIN user_threepids AS u2 ON u1.medium = u2.medium AND lower(u1.address) = lower(u2.address) AND u1.address != u2.address + INNER JOIN user_last_seen as ts ON ts.user_id = u1.user_id + WHERE u1.medium = 'email' AND u2.medium = 'email' + GROUP BY lower(u1.address) + ) + -- We select all the addresses that are linked to the user_id that is NOT + -- the most recently seen. + SELECT u.address + FROM + user_threepids AS u, + duplicate_addresses, + user_last_seen AS ts + WHERE + lower(u.address) = duplicate_addresses.address + AND u.user_id = ts.user_id + AND ts.ts != max_ts -- NOT the most recently used + ); + + +-- This update is now safe since we've removed the duplicate addresses. +UPDATE user_threepids SET address = LOWER(address) WHERE medium = 'email'; + /* Add an index for the select we do on passwored reset */ CREATE INDEX user_threepids_medium_address on user_threepids (medium, address);