Skip to content

Commit

Permalink
Improve memory efficiency by discarding references to objects immedia…
Browse files Browse the repository at this point in the history
…tely

When "killing" or "removing" elements from a tree, it's important
not to keep references to them in the lists where they're
collected.

Without this, the memory consumption was quadratic because the
tails in this case were combined into parents'/previous'
elements' tails and the original ones could not be removed as we
kept the references to the elements in the _kill and _remove lists.

For more info see: https://bugs.launchpad.net/lxml/+bug/1889653

Co-authored-by: Miro Hrončok <miro@hroncok.cz>
  • Loading branch information
frenzymadness and hroncok committed Aug 28, 2024
1 parent da9d66c commit b03160f
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 11 deletions.
19 changes: 10 additions & 9 deletions lxml_html_clean/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import copy
import re
from collections import deque
from urllib.parse import urlsplit, unquote_plus

from lxml import etree
Expand Down Expand Up @@ -383,8 +384,8 @@ def __call__(self, doc):
if self.annoying_tags:
remove_tags.update(('blink', 'marquee'))

_remove = []
_kill = []
_remove = deque()
_kill = deque()
for el in doc.iter():
if el.tag in kill_tags:
if self.allow_element(el):
Expand All @@ -398,22 +399,22 @@ def __call__(self, doc):
if _remove and _remove[0] == doc:
# We have to drop the parent-most tag, which we can't
# do. Instead we'll rewrite it:
el = _remove.pop(0)
el = _remove.popleft()
el.tag = 'div'
el.attrib.clear()
elif _kill and _kill[0] == doc:
# We have to drop the parent-most element, which we can't
# do. Instead we'll clear it:
el = _kill.pop(0)
el = _kill.popleft()
if el.tag != 'html':
el.tag = 'div'
el.clear()

_kill.reverse() # start with innermost tags
for el in _kill:
el.drop_tree()
for el in _remove:
el.drop_tag()
while _kill:
_kill.popleft().drop_tree() # popleft to start with innermost elements

while _remove:
_remove.pop().drop_tag()

if self.remove_unknown_tags:
if allow_tags:
Expand Down
13 changes: 13 additions & 0 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import lxml.html
from lxml_html_clean import Cleaner, clean_html
from .utils import peak_memory_usage


class CleanerTest(unittest.TestCase):
Expand Down Expand Up @@ -333,3 +334,15 @@ def test_ascii_control_chars_removed(self):
expected = """<a href="">Link</a>"""
cleaner = Cleaner()
self.assertEqual(expected, cleaner.clean_html(html))

def test_memory_usage_many_elements_with_long_tails(self):
comment = "<!-- foo bar baz -->\n"
empty_line = "\t" * 10 + "\n"
element = comment + empty_line * 10
content = element * 5_000
html = f"<html>{content}</html>"

cleaner = Cleaner()
mem = peak_memory_usage(cleaner.clean_html, html)

self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB")
18 changes: 18 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import unittest


def peak_memory_usage(func, *args, **kwargs):
"""
Monitor the memory usage of a function and return the peak memory used, in MiB.
"""
try:
from memory_profiler import memory_usage # type: ignore
except ImportError:
raise unittest.SkipTest("memory-profiler is not available")

try:
mem_usage = memory_usage((func, args, kwargs), interval=0.1, timeout=None)
except MemoryError:
return float("inf")
peak_memory = max(mem_usage) - min(mem_usage)
return peak_memory
6 changes: 4 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ skipsdist = True

[testenv]
commands =
python -m unittest tests.test_clean
python -m unittest -v tests.test_clean
python -m doctest tests/test_clean_embed.txt tests/test_clean.txt tests/test_autolink.txt
deps = lxml
deps =
lxml
memory_profiler

[testenv:mypy]
commands =
Expand Down

0 comments on commit b03160f

Please sign in to comment.