forked from MirrorHub/synapse
fix cyrillic URL previews by hardcoding all page decoding to UTF-8 for now, rather than relying on lxml's heuristics which seem to get it wrong
This commit is contained in:
parent
737aee9295
commit
84f9cac4d0
1 changed file with 8 additions and 16 deletions
|
@@ -181,22 +181,14 @@ class PreviewUrlResource(BaseMediaResource):

        from lxml import html

        try:
-            tree = html.parse(media_info['filename'])
-            og = yield self._calc_og(tree, media_info, requester)
-        except UnicodeDecodeError:
-            # XXX: evil evil bodge
-            # Empirically, sites like google.com mix Latin-1 and utf-8
-            # encodings in the same page. The rogue Latin-1 characters
-            # cause lxml to choke with a UnicodeDecodeError, so if we
-            # see this we go and do a manual decode of the HTML before
-            # handing it to lxml as utf-8 encoding, counter-intuitively,
-            # which seems to make it happier...
-            file = open(media_info['filename'])
-            body = file.read()
-            file.close()
-            tree = html.fromstring(body.decode('utf-8', 'ignore'))
-            og = yield self._calc_og(tree, media_info, requester)
+            # XXX: always manually try to decode body as utf-8 first, which
+            # seems to help with most character encoding woes.
+            # XXX: handle non-utf-8 encodings?
+            file = open(media_info['filename'])
+            body = file.read()
+            file.close()
+            tree = html.fromstring(body.decode('utf-8', 'ignore'))
+            og = yield self._calc_og(tree, media_info, requester)

        else:
            logger.warn("Failed to find any OG data in %s", url)
Loading…
Reference in a new issue