Support underscores (in addition to hyphens) for charset detection. (#10410)

This commit is contained in:
sri-vidyut 2021-07-28 02:29:42 +09:00 committed by GitHub
parent 5b22d5ee03
commit 8e1febc6a1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 18 additions and 2 deletions

1
changelog.d/10410.bugfix Normal file
View file

@ -0,0 +1 @@
Improve character set detection in URL previews by supporting underscores (in addition to hyphens) in character set names. Contributed by @srividyut.

View file

@ -58,9 +58,11 @@ if TYPE_CHECKING:
logger = logging.getLogger(__name__)
_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
_charset_match = re.compile(
br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
)
_xml_encoding_match = re.compile(
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)

View file

@ -325,6 +325,19 @@ class MediaEncodingTestCase(unittest.TestCase):
)
self.assertEqual(encoding, "ascii")
def test_meta_charset_underscores(self):
"""A charset name containing an underscore (e.g. Shift_JIS) is detected from the meta tag."""
encoding = get_html_media_encoding(
b"""
<html>
<head><meta charset="Shift_JIS">
</head>
</html>
""",
"text/html",
)
self.assertEqual(encoding, "Shift_JIS")
def test_xml_encoding(self):
"""A character encoding is found via the meta tag."""
encoding = get_html_media_encoding(