Merge pull request from GHSA-22p3-qrh9-cx32

* Make _iterate_over_text easier to read by using simple data structures

* Prefer a set of tags to ignore

In my tests, it's 4x faster to check for containment in a set of this size than in a tuple

* Add a stack size limit to _iterate_over_text

* Continue accepting the case where there is no body element

* Use an early return for None instead

Co-authored-by: Richard van der Hoff <richard@matrix.org>
This commit is contained in:
reivilibre 2022-06-28 14:29:08 +01:00 committed by GitHub
parent 21e6c0ed64
commit fa13080618
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 56 additions and 24 deletions

View file

@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import codecs import codecs
import itertools
import logging import logging
import re import re
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Set, Union from typing import TYPE_CHECKING, Dict, Generator, Iterable, List, Optional, Set, Union
if TYPE_CHECKING: if TYPE_CHECKING:
from lxml import etree from lxml import etree
@ -276,7 +275,7 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
from lxml import etree from lxml import etree
TAGS_TO_REMOVE = ( TAGS_TO_REMOVE = {
"header", "header",
"nav", "nav",
"aside", "aside",
@ -291,31 +290,42 @@ def parse_html_description(tree: "etree.Element") -> Optional[str]:
"img", "img",
"picture", "picture",
etree.Comment, etree.Comment,
) }
# Split all the text nodes into paragraphs (by splitting on new # Split all the text nodes into paragraphs (by splitting on new
# lines) # lines)
text_nodes = ( text_nodes = (
re.sub(r"\s+", "\n", el).strip() re.sub(r"\s+", "\n", el).strip()
for el in _iterate_over_text(tree.find("body"), *TAGS_TO_REMOVE) for el in _iterate_over_text(tree.find("body"), TAGS_TO_REMOVE)
) )
return summarize_paragraphs(text_nodes) return summarize_paragraphs(text_nodes)
def _iterate_over_text( def _iterate_over_text(
tree: "etree.Element", *tags_to_ignore: Union[str, "etree.Comment"] tree: Optional["etree.Element"],
tags_to_ignore: Set[Union[str, "etree.Comment"]],
stack_limit: int = 1024,
) -> Generator[str, None, None]: ) -> Generator[str, None, None]:
"""Iterate over the tree returning text nodes in a depth first fashion, """Iterate over the tree returning text nodes in a depth first fashion,
skipping text nodes inside certain tags. skipping text nodes inside certain tags.
Args:
tree: The parent element to iterate. Can be None if there isn't one.
tags_to_ignore: Set of tags to ignore
stack_limit: Maximum stack size limit for depth-first traversal.
Nodes will be dropped if this limit is hit, which may truncate the
textual result.
Intended to limit the maximum working memory when generating a preview.
""" """
# This is basically a stack that we extend using itertools.chain.
# This will either consist of an element to iterate over *or* a string if tree is None:
return
# This is a stack whose items are elements to iterate over *or* strings
# to be returned. # to be returned.
elements = iter([tree]) elements: List[Union[str, "etree.Element"]] = [tree]
while True: while elements:
el = next(elements, None) el = elements.pop()
if el is None:
return
if isinstance(el, str): if isinstance(el, str):
yield el yield el
@ -329,17 +339,22 @@ def _iterate_over_text(
if el.text: if el.text:
yield el.text yield el.text
# We add to the stack all the elements children, interspersed with # We add to the stack all the element's children, interspersed with
# each child's tail text (if it exists). The tail text of a node # each child's tail text (if it exists).
# is text that comes *after* the node, so we always include it even #
# if we ignore the child node. # We iterate in reverse order so that earlier pieces of text appear
elements = itertools.chain( # closer to the top of the stack.
itertools.chain.from_iterable( # Basically a flatmap for child in el.iterchildren(reversed=True):
[child, child.tail] if child.tail else [child] if len(elements) > stack_limit:
for child in el.iterchildren() # We've hit our limit for working memory
), break
elements,
) if child.tail:
# The tail text of a node is text that comes *after* the node,
# so we always include it even if we ignore the child node.
elements.append(child.tail)
elements.append(child)
def summarize_paragraphs( def summarize_paragraphs(

View file

@ -370,6 +370,23 @@ class OpenGraphFromHtmlTestCase(unittest.TestCase):
og = parse_html_to_open_graph(tree) og = parse_html_to_open_graph(tree)
self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."}) self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."})
def test_nested_nodes(self) -> None:
"""A body with some nested nodes. Tests that we iterate over children
in the right order (and don't reverse the order of the text)."""
html = b"""
<a href="somewhere">Welcome <b>the bold <u>and underlined text <svg>
with a cheeky SVG</svg></u> and <strong>some</strong> tail text</b></a>
"""
tree = decode_body(html, "http://example.com/test.html")
og = parse_html_to_open_graph(tree)
self.assertEqual(
og,
{
"og:title": None,
"og:description": "Welcome\n\nthe bold\n\nand underlined text\n\nand\n\nsome\n\ntail text",
},
)
class MediaEncodingTestCase(unittest.TestCase): class MediaEncodingTestCase(unittest.TestCase):
def test_meta_charset(self) -> None: def test_meta_charset(self) -> None: