From 89b2b3d184c48d08c3f1b1c632e0bcac6c3b520b Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Wed, 28 Aug 2024 13:09:44 +0200 Subject: [PATCH] Improve memory efficiency by discarding references to objects immediately MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When "killing" or "removing" elements from a tree, it's important not to keep references to them in the lists where they're collected. Without this, the memory consumption was quadratic because the tails in this case were combined into parents'/previous' elements' tails and the original ones could not be removed as we kept the references to the elements in the _kill and _remove lists. For more info see: https://bugs.launchpad.net/lxml/+bug/1889653 Co-authored-by: Miro HronĨok --- lxml_html_clean/clean.py | 19 ++++++++++--------- tests/test_clean.py | 13 +++++++++++++ tests/utils.py | 18 ++++++++++++++++++ tox.ini | 6 ++++-- 4 files changed, 45 insertions(+), 11 deletions(-) create mode 100644 tests/utils.py diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py index 531c7f2..1c63cfa 100644 --- a/lxml_html_clean/clean.py +++ b/lxml_html_clean/clean.py @@ -8,6 +8,7 @@ import copy import re +from collections import deque from urllib.parse import urlsplit, unquote_plus from lxml import etree @@ -383,8 +384,8 @@ def __call__(self, doc): if self.annoying_tags: remove_tags.update(('blink', 'marquee')) - _remove = [] - _kill = [] + _remove = deque() + _kill = deque() for el in doc.iter(): if el.tag in kill_tags: if self.allow_element(el): @@ -398,22 +399,22 @@ def __call__(self, doc): if _remove and _remove[0] == doc: # We have to drop the parent-most tag, which we can't # do. Instead we'll rewrite it: - el = _remove.pop(0) + el = _remove.popleft() el.tag = 'div' el.attrib.clear() elif _kill and _kill[0] == doc: # We have to drop the parent-most element, which we can't # do. Instead we'll clear it: - el = _kill.pop(0) + el = _kill.popleft() if el.tag != 'html': el.tag = 'div' el.clear() - _kill.reverse() # start with innermost tags - for el in _kill: - el.drop_tree() - for el in _remove: - el.drop_tag() + while _kill: + _kill.popleft().drop_tree() # popleft to start with innermost elements + + while _remove: + _remove.pop().drop_tag() if self.remove_unknown_tags: if allow_tags: diff --git a/tests/test_clean.py b/tests/test_clean.py index 079d8c1..701cab8 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -5,6 +5,7 @@ import lxml.html from lxml_html_clean import Cleaner, clean_html +from .utils import peak_memory_usage class CleanerTest(unittest.TestCase): @@ -333,3 +334,15 @@ def test_ascii_control_chars_removed(self): expected = """Link""" cleaner = Cleaner() self.assertEqual(expected, cleaner.clean_html(html)) + + def test_memory_usage_many_elements_with_long_tails(self): + comment = "\n" + empty_line = "\t" * 10 + "\n" + element = comment + empty_line * 10 + content = element * 5_000 + html = f"{content}" + + cleaner = Cleaner() + mem = peak_memory_usage(cleaner.clean_html, html) + + self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB") diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..b744a41 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,18 @@ +import unittest + + +def peak_memory_usage(func, *args, **kwargs): + """ + Monitor the memory usage of a function and return the peak memory used, in MiB. + """ + try: + from memory_profiler import memory_usage # type: ignore + except ImportError: + raise unittest.SkipTest("memory-profiler is not available") + + try: + mem_usage = memory_usage((func, args, kwargs), interval=0.1, timeout=None) + except MemoryError: + return float("inf") + peak_memory = max(mem_usage) - min(mem_usage) + return peak_memory diff --git a/tox.ini b/tox.ini index b364bb8..ce95032 100644 --- a/tox.ini +++ b/tox.ini @@ -4,9 +4,11 @@ skipsdist = True [testenv] commands = - python -m unittest tests.test_clean + python -m unittest -v tests.test_clean python -m doctest tests/test_clean_embed.txt tests/test_clean.txt tests/test_autolink.txt -deps = lxml +deps = + lxml + memory_profiler [testenv:mypy] commands =