From 8a57cc3123d36606fa2cf014b541e39c68cff72c Mon Sep 17 00:00:00 2001 From: Benjamin Saunders Date: Sun, 14 Aug 2016 11:50:22 -0700 Subject: [PATCH 1/7] Add missing database corruption recovery case Signed-off-by: Benjamin Saunders --- synapse/storage/events.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index d2feee8db..ad026b5e0 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -600,7 +600,8 @@ class EventsStore(SQLBaseStore): "rejections", "redactions", "room_memberships", - "state_events" + "state_events", + "topics" ): txn.executemany( "DELETE FROM %s WHERE event_id = ?" % (table,), From dc3a00f24f301ab08750fdd8ca6ae040ba290e1e Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 15 Aug 2016 17:04:39 +0100 Subject: [PATCH 2/7] Refactor user_delete_access_tokens. Invalidate get_user_by_access_token to slaves. --- synapse/handlers/auth.py | 6 +-- synapse/push/pusherpool.py | 8 ++-- synapse/storage/registration.py | 78 +++++++++++++++------------------ 3 files changed, 43 insertions(+), 49 deletions(-) diff --git a/synapse/handlers/auth.py b/synapse/handlers/auth.py index a582d6334..6986930c0 100644 --- a/synapse/handlers/auth.py +++ b/synapse/handlers/auth.py @@ -741,7 +741,7 @@ class AuthHandler(BaseHandler): def set_password(self, user_id, newpassword, requester=None): password_hash = self.hash(newpassword) - except_access_token_ids = [requester.access_token_id] if requester else [] + except_access_token_id = requester.access_token_id if requester else None try: yield self.store.user_set_password_hash(user_id, password_hash) @@ -750,10 +750,10 @@ class AuthHandler(BaseHandler): raise SynapseError(404, "Unknown user", Codes.NOT_FOUND) raise e yield self.store.user_delete_access_tokens( - user_id, except_access_token_ids + user_id, except_access_token_id ) yield self.hs.get_pusherpool().remove_pushers_by_user( - user_id, except_access_token_ids + user_id, except_access_token_id ) @defer.inlineCallbacks diff --git a/synapse/push/pusherpool.py b/synapse/push/pusherpool.py index 5853ec36a..54c0f1b84 100644 --- a/synapse/push/pusherpool.py +++ b/synapse/push/pusherpool.py @@ -102,14 +102,14 @@ class PusherPool: yield self.remove_pusher(p['app_id'], p['pushkey'], p['user_name']) @defer.inlineCallbacks - def remove_pushers_by_user(self, user_id, except_token_ids=[]): + def remove_pushers_by_user(self, user_id, except_access_token_id=None): all = yield self.store.get_all_pushers() logger.info( - "Removing all pushers for user %s except access tokens ids %r", - user_id, except_token_ids + "Removing all pushers for user %s except access tokens id %r", + user_id, except_access_token_id ) for p in all: - if p['user_name'] == user_id and p['access_token'] not in except_token_ids: + if p['user_name'] == user_id and p['access_token'] != except_access_token_id: logger.info( "Removing pusher for app id %s, pushkey %s, user %s", p['app_id'], p['pushkey'], p['user_name'] diff --git a/synapse/storage/registration.py b/synapse/storage/registration.py index 7e7d32eb6..19cb3b31c 100644 --- a/synapse/storage/registration.py +++ b/synapse/storage/registration.py @@ -251,7 +251,7 @@ class RegistrationStore(background_updates.BackgroundUpdateStore): self.get_user_by_id.invalidate((user_id,)) @defer.inlineCallbacks - def user_delete_access_tokens(self, user_id, except_token_ids=[], + def user_delete_access_tokens(self, user_id, except_token_id=None, device_id=None, delete_refresh_tokens=False): """ @@ -259,7 +259,7 @@ class RegistrationStore(background_updates.BackgroundUpdateStore): Args: user_id (str): ID of user the tokens belong to - except_token_ids (list[str]): list of access_tokens which should + except_token_id (str): list of access_tokens IDs which should *not* be deleted device_id (str|None): ID of device the tokens are associated with. If None, tokens associated with any device (or no device) will @@ -269,53 +269,45 @@ class RegistrationStore(background_updates.BackgroundUpdateStore): Returns: defer.Deferred: """ - def f(txn, table, except_tokens, call_after_delete): - sql = "SELECT token FROM %s WHERE user_id = ?" % table - clauses = [user_id] - + def f(txn): + keyvalues = { + "user_id": user_id, + } if device_id is not None: - sql += " AND device_id = ?" - clauses.append(device_id) + keyvalues["device_id"] = device_id - if except_tokens: - sql += " AND id NOT IN (%s)" % ( - ",".join(["?" for _ in except_tokens]), - ) - clauses += except_tokens - - txn.execute(sql, clauses) - - rows = txn.fetchall() - - n = 100 - chunks = [rows[i:i + n] for i in xrange(0, len(rows), n)] - for chunk in chunks: - if call_after_delete: - for row in chunk: - txn.call_after(call_after_delete, (row[0],)) - - txn.execute( - "DELETE FROM %s WHERE token in (%s)" % ( - table, - ",".join(["?" for _ in chunk]), - ), [r[0] for r in chunk] + if delete_refresh_tokens: + self._simple_delete_txn( + txn, + table="refresh_tokens", + keyvalues=keyvalues, ) - # delete refresh tokens first, to stop new access tokens being - # allocated while our backs are turned - if delete_refresh_tokens: - yield self.runInteraction( - "user_delete_access_tokens", f, - table="refresh_tokens", - except_tokens=[], - call_after_delete=None, + items = keyvalues.items() + where_clause = " AND ".join(k + " = ?" for k, _ in items) + values = [v for _, v in items] + if except_token_id: + where_clause += " AND id != ?" + values.append(except_token_id) + + txn.execute( + "SELECT token FROM access_tokens WHERE %s" % where_clause, + values + ) + rows = self.cursor_to_dict(txn) + + for row in rows: + self._invalidate_cache_and_stream( + txn, self.get_user_by_access_token, (row["token"],) + ) + + txn.execute( + "DELETE FROM access_tokens WHERE %s" % where_clause, + values ) yield self.runInteraction( "user_delete_access_tokens", f, - table="access_tokens", - except_tokens=except_token_ids, - call_after_delete=self.get_user_by_access_token.invalidate, ) def delete_access_token(self, access_token): @@ -328,7 +320,9 @@ class RegistrationStore(background_updates.BackgroundUpdateStore): }, ) - txn.call_after(self.get_user_by_access_token.invalidate, (access_token,)) + self._invalidate_cache_and_stream( + txn, self.get_user_by_access_token, (access_token,) + ) return self.runInteraction("delete_access_token", f) From a2427981b798eb157d48b4e695a0d025357a2b01 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 15 Aug 2016 18:10:54 +0100 Subject: [PATCH 3/7] Use cached get_user_by_access_token in slaves --- synapse/replication/slave/storage/_base.py | 2 +- synapse/replication/slave/storage/registration.py | 2 +- synapse/storage/_base.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/synapse/replication/slave/storage/_base.py b/synapse/replication/slave/storage/_base.py index d839d169a..f19540d6b 100644 --- a/synapse/replication/slave/storage/_base.py +++ b/synapse/replication/slave/storage/_base.py @@ -51,6 +51,6 @@ class BaseSlavedStore(SQLBaseStore): try: getattr(self, cache_func).invalidate(tuple(keys)) except AttributeError: - logger.warn("Got unexpected cache_func: %r", cache_func) + logger.info("Got unexpected cache_func: %r", cache_func) self._cache_id_gen.advance(int(stream["position"])) return defer.succeed(None) diff --git a/synapse/replication/slave/storage/registration.py b/synapse/replication/slave/storage/registration.py index 307833f9e..38b78b97f 100644 --- a/synapse/replication/slave/storage/registration.py +++ b/synapse/replication/slave/storage/registration.py @@ -25,6 +25,6 @@ class SlavedRegistrationStore(BaseSlavedStore): # TODO: use the cached version and invalidate deleted tokens get_user_by_access_token = RegistrationStore.__dict__[ "get_user_by_access_token" - ].orig + ] _query_for_auth = DataStore._query_for_auth.__func__ diff --git a/synapse/storage/_base.py b/synapse/storage/_base.py index b0923a9ca..0a2e78fd8 100644 --- a/synapse/storage/_base.py +++ b/synapse/storage/_base.py @@ -880,6 +880,7 @@ class SQLBaseStore(object): ctx = self._cache_id_gen.get_next() stream_id = ctx.__enter__() txn.call_after(ctx.__exit__, None, None, None) + txn.call_after(self.hs.get_notifier().on_new_replication_data) self._simple_insert_txn( txn, From 2ee1bd124c2d9ac7f11d0b32c9e40a842db14fe9 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 16 Aug 2016 11:34:36 +0100 Subject: [PATCH 4/7] Limit number of extremeties in backfill request This works around a bug where if we make a backfill request with too many extremeties it causes the request URI to be too long. --- synapse/handlers/federation.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index ff6bb475b..328f8f484 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -274,7 +274,7 @@ class FederationHandler(BaseHandler): @log_function @defer.inlineCallbacks - def backfill(self, dest, room_id, limit, extremities=[]): + def backfill(self, dest, room_id, limit, extremities): """ Trigger a backfill request to `dest` for the given `room_id` This will attempt to get more events from the remote. This may return @@ -284,9 +284,6 @@ class FederationHandler(BaseHandler): if dest == self.server_name: raise SynapseError(400, "Can't backfill from self.") - if not extremities: - extremities = yield self.store.get_oldest_events_in_room(room_id) - events = yield self.replication_layer.backfill( dest, room_id, @@ -455,6 +452,10 @@ class FederationHandler(BaseHandler): ) max_depth = sorted_extremeties_tuple[0][1] + # We don't want to specify too many extremities as it causes the backfill + # request URI to be too long. + extremities = dict(sorted_extremeties_tuple[:5]) + if current_depth > max_depth: logger.debug( "Not backfilling as we don't need to. %d < %d", From 48b5829aea007bce620ad3db7bfd1dc25cbf9837 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 16 Aug 2016 14:53:18 +0100 Subject: [PATCH 5/7] Fix up preview URL API. Add tests. This includes: - Splitting out methods of a class into stand alone functions, to make them easier to test. - Adding unit tests to split out functions, testing HTML -> preview. - Handle the fact that elements in lxml may have tail text. --- synapse/rest/media/v1/preview_url_resource.py | 353 ++++++++++-------- tests/test_preview.py | 80 +++- 2 files changed, 275 insertions(+), 158 deletions(-) diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index bdd0e60c5..beafc8e73 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -163,7 +163,7 @@ class PreviewUrlResource(Resource): logger.debug("got media_info of '%s'" % media_info) - if self._is_media(media_info['media_type']): + if _is_media(media_info['media_type']): dims = yield self.media_repo._generate_local_thumbnails( media_info['filesystem_id'], media_info ) @@ -184,11 +184,9 @@ class PreviewUrlResource(Resource): logger.warn("Couldn't get dims for %s" % url) # define our OG response for this media - elif self._is_html(media_info['media_type']): + elif _is_html(media_info['media_type']): # TODO: somehow stop a big HTML tree from exploding synapse's RAM - from lxml import etree - file = open(media_info['filename']) body = file.read() file.close() @@ -199,17 +197,35 @@ class PreviewUrlResource(Resource): match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I) encoding = match.group(1) if match else "utf-8" - try: - parser = etree.HTMLParser(recover=True, encoding=encoding) - tree = etree.fromstring(body, parser) - og = yield self._calc_og(tree, media_info, requester) - except UnicodeDecodeError: - # blindly try decoding the body as utf-8, which seems to fix - # the charset mismatches on https://google.com - parser = etree.HTMLParser(recover=True, encoding=encoding) - tree = etree.fromstring(body.decode('utf-8', 'ignore'), parser) - og = yield self._calc_og(tree, media_info, requester) + og = decode_and_calc_og(body, media_info['uri'], encoding) + # pre-cache the image for posterity + # FIXME: it might be cleaner to use the same flow as the main /preview_url + # request itself and benefit from the same caching etc. But for now we + # just rely on the caching on the master request to speed things up. + if 'og:image' in og and og['og:image']: + image_info = yield self._download_url( + _rebase_url(og['og:image'], media_info['uri']), requester.user + ) + + if _is_media(image_info['media_type']): + # TODO: make sure we don't choke on white-on-transparent images + dims = yield self.media_repo._generate_local_thumbnails( + image_info['filesystem_id'], image_info + ) + if dims: + og["og:image:width"] = dims['width'] + og["og:image:height"] = dims['height'] + else: + logger.warn("Couldn't get dims for %s" % og["og:image"]) + + og["og:image"] = "mxc://%s/%s" % ( + self.server_name, image_info['filesystem_id'] + ) + og["og:image:type"] = image_info['media_type'] + og["matrix:image:size"] = image_info['media_length'] + else: + del og["og:image"] else: logger.warn("Failed to find any OG data in %s", url) og = {} @@ -232,139 +248,6 @@ class PreviewUrlResource(Resource): respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) - @defer.inlineCallbacks - def _calc_og(self, tree, media_info, requester): - # suck our tree into lxml and define our OG response. - - # if we see any image URLs in the OG response, then spider them - # (although the client could choose to do this by asking for previews of those - # URLs to avoid DoSing the server) - - # "og:type" : "video", - # "og:url" : "https://www.youtube.com/watch?v=LXDBoHyjmtw", - # "og:site_name" : "YouTube", - # "og:video:type" : "application/x-shockwave-flash", - # "og:description" : "Fun stuff happening here", - # "og:title" : "RemoteJam - Matrix team hack for Disrupt Europe Hackathon", - # "og:image" : "https://i.ytimg.com/vi/LXDBoHyjmtw/maxresdefault.jpg", - # "og:video:url" : "http://www.youtube.com/v/LXDBoHyjmtw?version=3&autohide=1", - # "og:video:width" : "1280" - # "og:video:height" : "720", - # "og:video:secure_url": "https://www.youtube.com/v/LXDBoHyjmtw?version=3", - - og = {} - for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"): - if 'content' in tag.attrib: - og[tag.attrib['property']] = tag.attrib['content'] - - # TODO: grab article: meta tags too, e.g.: - - # "article:publisher" : "https://www.facebook.com/thethudonline" /> - # "article:author" content="https://www.facebook.com/thethudonline" /> - # "article:tag" content="baby" /> - # "article:section" content="Breaking News" /> - # "article:published_time" content="2016-03-31T19:58:24+00:00" /> - # "article:modified_time" content="2016-04-01T18:31:53+00:00" /> - - if 'og:title' not in og: - # do some basic spidering of the HTML - title = tree.xpath("(//title)[1] | (//h1)[1] | (//h2)[1] | (//h3)[1]") - og['og:title'] = title[0].text.strip() if title else None - - if 'og:image' not in og: - # TODO: extract a favicon failing all else - meta_image = tree.xpath( - "//*/meta[translate(@itemprop, 'IMAGE', 'image')='image']/@content" - ) - if meta_image: - og['og:image'] = self._rebase_url(meta_image[0], media_info['uri']) - else: - # TODO: consider inlined CSS styles as well as width & height attribs - images = tree.xpath("//img[@src][number(@width)>10][number(@height)>10]") - images = sorted(images, key=lambda i: ( - -1 * float(i.attrib['width']) * float(i.attrib['height']) - )) - if not images: - images = tree.xpath("//img[@src]") - if images: - og['og:image'] = images[0].attrib['src'] - - # pre-cache the image for posterity - # FIXME: it might be cleaner to use the same flow as the main /preview_url - # request itself and benefit from the same caching etc. But for now we - # just rely on the caching on the master request to speed things up. - if 'og:image' in og and og['og:image']: - image_info = yield self._download_url( - self._rebase_url(og['og:image'], media_info['uri']), requester.user - ) - - if self._is_media(image_info['media_type']): - # TODO: make sure we don't choke on white-on-transparent images - dims = yield self.media_repo._generate_local_thumbnails( - image_info['filesystem_id'], image_info - ) - if dims: - og["og:image:width"] = dims['width'] - og["og:image:height"] = dims['height'] - else: - logger.warn("Couldn't get dims for %s" % og["og:image"]) - - og["og:image"] = "mxc://%s/%s" % ( - self.server_name, image_info['filesystem_id'] - ) - og["og:image:type"] = image_info['media_type'] - og["matrix:image:size"] = image_info['media_length'] - else: - del og["og:image"] - - if 'og:description' not in og: - meta_description = tree.xpath( - "//*/meta" - "[translate(@name, 'DESCRIPTION', 'description')='description']" - "/@content") - if meta_description: - og['og:description'] = meta_description[0] - else: - # grab any text nodes which are inside the tag... - # unless they are within an HTML5 semantic markup tag... - #
,