Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scan for JS code also in CSS comments #19

Merged
merged 3 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@ lxml_html_clean changelog
Unreleased
==========

0.4.0 (2024-11-12)
==================

Bugs fixed
----------

* The ``Cleaner()`` now scans for hidden JavaScript code embedded
within CSS comments. In certain contexts, such as within ``<svg>`` or ``<math>`` tags,
``<style>`` tags may lose their intended function, allowing comments
like ``/* foo */`` to potentially be executed by the browser.
If a suspicious content is detected, only the comment is removed.

0.3.1 (2024-10-09)
==================

Expand Down
29 changes: 26 additions & 3 deletions lxml_html_clean/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,8 +366,11 @@ def __call__(self, doc):
new = _replace_css_import('', new)
if self._has_sneaky_javascript(new):
# Something tricky is going on...
el.text = '/* deleted */'
elif new != old:
new = '/* deleted */'
else:
new = self._remove_sneaky_css_comments(new)

if new != old:
el.text = new
if self.comments:
kill_tags.add(etree.Comment)
Expand Down Expand Up @@ -568,7 +571,9 @@ def _remove_javascript_link(self, link):
return ''
return link

_substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
_comments_re = re.compile(r'/\*.*?\*/', re.S)
_find_comments = _comments_re.finditer
_substitute_comments = _comments_re.sub

def _has_sneaky_javascript(self, style):
"""
Expand Down Expand Up @@ -599,6 +604,24 @@ def _has_sneaky_javascript(self, style):
return True
return False

def _remove_sneaky_css_comments(self, style):
"""
Look for suspicious code in CSS comment and if found,
remove the entire comment from the given style.

Browsers might parse <style> as an ordinary HTML tag
in some specific context and that might cause code in CSS
comments to run.
"""
for match in self._find_comments(style):
comment = match.group(0)
print("f", comment)
Copy link
Member

@hroncok hroncok Nov 14, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like a leftover debug print.

EDIT: #20

if _has_javascript_scheme(comment) or _looks_like_tag_content(comment):
style = style.replace(comment, "/* deleted */")
print("f", style)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another one is here.


return style

def clean_html(self, html):
result_type = type(html)
if isinstance(html, (str, bytes)):
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = lxml_html_clean
version = 0.3.1
version = 0.4.0
description = HTML cleaner from lxml project
long_description = file:README.md
long_description_content_type = text/markdown
Expand Down
19 changes: 19 additions & 0 deletions tests/test_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,25 @@ def test_sneaky_js_in_math_style(self):
b'<math><style>/* deleted */</style></math>',
lxml.html.tostring(clean_html(s)))

def test_sneaky_js_in_style_comment_math_svg(self):
for tag in "svg", "math":
html = f'<{tag}><style>p {{color: red;}}/*<img src onerror=alert(origin)>*/h2 {{color: blue;}}</style></{tag}>'
s = lxml.html.fragment_fromstring(html)

expected = f'<{tag}><style>p {{color: red;}}/* deleted */h2 {{color: blue;}}</style></{tag}>'.encode()

self.assertEqual(
expected,
lxml.html.tostring(clean_html(s)))

def test_sneaky_js_in_style_comment_noscript(self):
html = '<noscript><style>p {{color: red;}}/*</noscript><img src onerror=alert(origin)>*/h2 {{color: blue;}}</style></noscript>'
s = lxml.html.fragment_fromstring(html)

self.assertEqual(
b'<noscript><style>p {{color: red;}}/* deleted */h2 {{color: blue;}}</style></noscript>',
lxml.html.tostring(clean_html(s)))

def test_sneaky_import_in_style(self):
# Prevent "@@importimport" -> "@import" replacement etc.
style_codes = [
Expand Down
Loading