fedora-python · frenzymadness · Aug 29, 2024 · Aug 28, 2024
diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py
@@ -8,6 +8,7 @@
 
 import copy
 import re
+from collections import deque
 from urllib.parse import urlsplit, unquote_plus
 
 from lxml import etree
@@ -383,8 +384,8 @@ def __call__(self, doc):
         if self.annoying_tags:
             remove_tags.update(('blink', 'marquee'))
 
-        _remove = []
-        _kill = []
+        _remove = deque()
+        _kill = deque()
         for el in doc.iter():
             if el.tag in kill_tags:
                 if self.allow_element(el):
@@ -398,22 +399,22 @@ def __call__(self, doc):
         if _remove and _remove[0] == doc:
             # We have to drop the parent-most tag, which we can't
             # do.  Instead we'll rewrite it:
-            el = _remove.pop(0)
+            el = _remove.popleft()
             el.tag = 'div'
             el.attrib.clear()
         elif _kill and _kill[0] == doc:
             # We have to drop the parent-most element, which we can't
             # do.  Instead we'll clear it:
-            el = _kill.pop(0)
+            el = _kill.popleft()
             if el.tag != 'html':
                 el.tag = 'div'
             el.clear()
 
-        _kill.reverse() # start with innermost tags
-        for el in _kill:
-            el.drop_tree()
-        for el in _remove:
-            el.drop_tag()
+        while _kill:
+            _kill.popleft().drop_tree()  # popleft to start with innermost elements
+
+        while _remove:
+            _remove.pop().drop_tag()
 
         if self.remove_unknown_tags:
             if allow_tags:

diff --git a/tests/test_clean.py b/tests/test_clean.py
@@ -5,6 +5,7 @@
 
 import lxml.html
 from lxml_html_clean import Cleaner, clean_html
+from .utils import peak_memory_usage
 
 
 class CleanerTest(unittest.TestCase):
@@ -333,3 +334,15 @@ def test_ascii_control_chars_removed(self):
         expected = """<a href="">Link</a>"""
         cleaner = Cleaner()
         self.assertEqual(expected, cleaner.clean_html(html))
+
+    def test_memory_usage_many_elements_with_long_tails(self):
+        comment = "<!-- foo bar baz -->\n"
+        empty_line = "\t" * 10 + "\n"
+        element = comment + empty_line * 10
+        content = element * 5_000
+        html = f"<html>{content}</html>"
+
+        cleaner = Cleaner()
+        mem = peak_memory_usage(cleaner.clean_html, html)
+
+        self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB")
diff --git a/tests/utils.py b/tests/utils.py
@@ -0,0 +1,18 @@
+import unittest
+
+
+def peak_memory_usage(func, *args, **kwargs):
+    """
+    Monitor the memory usage of a function and return the peak memory used, in MiB.
+    """
+    try:
+        from memory_profiler import memory_usage  # type: ignore
+    except ImportError:
+        raise unittest.SkipTest("memory-profiler is not available")
+
+    try:
+        mem_usage = memory_usage((func, args, kwargs), interval=0.1, timeout=None)
+    except MemoryError:
+        return float("inf")
+    peak_memory = max(mem_usage) - min(mem_usage)
+    return peak_memory
diff --git a/tox.ini b/tox.ini
@@ -4,9 +4,11 @@ skipsdist = True
 
 [testenv]
 commands =
-  python -m unittest tests.test_clean
+  python -m unittest -v tests.test_clean
   python -m doctest tests/test_clean_embed.txt tests/test_clean.txt tests/test_autolink.txt
-deps = lxml
+deps =
+  lxml
+  memory_profiler
 
 [testenv:mypy]
 commands =