Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve memory efficiency by discarding references to objects immediately #14

Merged
merged 1 commit into from
Aug 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions lxml_html_clean/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import copy
import re
from collections import deque
from urllib.parse import urlsplit, unquote_plus

from lxml import etree
Expand Down Expand Up @@ -383,8 +384,8 @@ def __call__(self, doc):
if self.annoying_tags:
remove_tags.update(('blink', 'marquee'))

_remove = []
_kill = []
_remove = deque()
_kill = deque()
for el in doc.iter():
if el.tag in kill_tags:
if self.allow_element(el):
Expand All @@ -398,22 +399,22 @@ def __call__(self, doc):
if _remove and _remove[0] == doc:
# We have to drop the parent-most tag, which we can't
# do. Instead we'll rewrite it:
el = _remove.pop(0)
el = _remove.popleft()
el.tag = 'div'
el.attrib.clear()
elif _kill and _kill[0] == doc:
# We have to drop the parent-most element, which we can't
# do. Instead we'll clear it:
el = _kill.pop(0)
el = _kill.popleft()
if el.tag != 'html':
el.tag = 'div'
el.clear()

_kill.reverse() # start with innermost tags
for el in _kill:
el.drop_tree()
for el in _remove:
el.drop_tag()
while _kill:
_kill.popleft().drop_tree() # popleft to start with innermost elements

while _remove:
_remove.pop().drop_tag()

if self.remove_unknown_tags:
if allow_tags:
Expand Down
13 changes: 13 additions & 0 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import lxml.html
from lxml_html_clean import Cleaner, clean_html
from .utils import peak_memory_usage


class CleanerTest(unittest.TestCase):
Expand Down Expand Up @@ -333,3 +334,15 @@ def test_ascii_control_chars_removed(self):
expected = """<a href="">Link</a>"""
cleaner = Cleaner()
self.assertEqual(expected, cleaner.clean_html(html))

def test_memory_usage_many_elements_with_long_tails(self):
comment = "<!-- foo bar baz -->\n"
empty_line = "\t" * 10 + "\n"
element = comment + empty_line * 10
content = element * 5_000
html = f"<html>{content}</html>"

cleaner = Cleaner()
mem = peak_memory_usage(cleaner.clean_html, html)

self.assertTrue(mem < 10, f"Used {mem} MiB memory, expected at most 10 MiB")
18 changes: 18 additions & 0 deletions tests/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import unittest


def peak_memory_usage(func, *args, **kwargs):
"""
Monitor the memory usage of a function and return the peak memory used, in MiB.
"""
try:
from memory_profiler import memory_usage # type: ignore
except ImportError:
raise unittest.SkipTest("memory-profiler is not available")

try:
mem_usage = memory_usage((func, args, kwargs), interval=0.1, timeout=None)
except MemoryError:
return float("inf")
peak_memory = max(mem_usage) - min(mem_usage)
return peak_memory
6 changes: 4 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ skipsdist = True

[testenv]
commands =
python -m unittest tests.test_clean
python -m unittest -v tests.test_clean
python -m doctest tests/test_clean_embed.txt tests/test_clean.txt tests/test_autolink.txt
deps = lxml
deps =
lxml
memory_profiler

[testenv:mypy]
commands =
Expand Down
Loading