fedora-python · frenzymadness · Nov 14, 2024 · Nov 12, 2024 · Nov 12, 2024 · Nov 13, 2024
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -6,6 +6,18 @@ lxml_html_clean changelog
 Unreleased
 ==========
 
+0.4.0 (2024-11-12)
+==================
+
+Bugs fixed
+----------
+
+* The ``Cleaner()`` now scans for hidden JavaScript code embedded
+  within CSS comments. In certain contexts, such as within ``<svg>`` or ``<math>`` tags,
+  ``<style>`` tags may lose their intended function, allowing comments
+  like ``/* foo */`` to potentially be executed by the browser.
+  If suspicious content is detected, only the affected comment is replaced with ``/* deleted */``.
+
 0.3.1 (2024-10-09)
 ==================
 

diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py
@@ -366,8 +366,11 @@ def __call__(self, doc):
                     new = _replace_css_import('', new)
                     if self._has_sneaky_javascript(new):
                         # Something tricky is going on...
-                        el.text = '/* deleted */'
-                    elif new != old:
+                        new = '/* deleted */'
+                    else:
+                        new = self._remove_sneaky_css_comments(new)
+
+                    if new != old:
                         el.text = new
         if self.comments:
             kill_tags.add(etree.Comment)
@@ -568,7 +571,9 @@ def _remove_javascript_link(self, link):
             return ''
         return link
 
-    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
+    _comments_re = re.compile(r'/\*.*?\*/', re.S)  # one CSS /* ... */ comment; non-greedy, may span lines (re.S)
+    _find_comments = _comments_re.finditer  # iterate over the comment matches in a style string
+    _substitute_comments = _comments_re.sub  # replace every comment match in a style string
 
     def _has_sneaky_javascript(self, style):
         """
@@ -599,6 +604,24 @@ def _has_sneaky_javascript(self, style):
             return True
         return False
 
+    def _remove_sneaky_css_comments(self, style):
+        """
+        Replace every suspicious CSS comment in ``style`` with the
+        neutral ``/* deleted */`` placeholder and return the result.
+
+        A comment is suspicious when ``_has_javascript_scheme`` or
+        ``_looks_like_tag_content`` flags its text. Browsers might parse
+        <style> as an ordinary HTML tag in some specific context and
+        that might cause code in CSS comments to run.
+        """
+        for match in self._find_comments(style):
+            comment = match.group(0)
+            if _has_javascript_scheme(comment) or _looks_like_tag_content(comment):
+                # finditer scans the original string, so rebinding is safe.
+                style = style.replace(comment, "/* deleted */")
+
+        return style
+
     def clean_html(self, html):
         result_type = type(html)
         if isinstance(html, (str, bytes)):

diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = lxml_html_clean
-version = 0.3.1
+version = 0.4.0
 description = HTML cleaner from lxml project
 long_description = file:README.md
 long_description_content_type = text/markdown

diff --git a/tests/test_clean.py b/tests/test_clean.py
@@ -127,6 +127,25 @@ def test_sneaky_js_in_math_style(self):
             b'<math><style>/* deleted */</style></math>',
             lxml.html.tostring(clean_html(s)))
 
+    def test_sneaky_js_in_style_comment_math_svg(self):
+        # <style> inside <svg>/<math> may be parsed as ordinary markup, so
+        # a tag hidden in a CSS comment must be neutralised; other rules stay.
+        for tag in "svg", "math":
+            markup = f'<{tag}><style>p {{color: red;}}/*<img src onerror=alert(origin)>*/h2 {{color: blue;}}</style></{tag}>'
+            wanted = f'<{tag}><style>p {{color: red;}}/* deleted */h2 {{color: blue;}}</style></{tag}>'.encode()
+
+            fragment = lxml.html.fragment_fromstring(markup)
+            cleaned = lxml.html.tostring(clean_html(fragment))
+            self.assertEqual(wanted, cleaned)
+
+    def test_sneaky_js_in_style_comment_noscript(self):
+        # Plain (non-f) strings: CSS braces are written once, not doubled.
+        html = '<noscript><style>p {color: red;}/*</noscript><img src onerror=alert(origin)>*/h2 {color: blue;}</style></noscript>'
+        s = lxml.html.fragment_fromstring(html)
+        self.assertEqual(
+            b'<noscript><style>p {color: red;}/* deleted */h2 {color: blue;}</style></noscript>',
+            lxml.html.tostring(clean_html(s)))
+
     def test_sneaky_import_in_style(self):
         # Prevent "@@importimport" -> "@import" replacement etc.
         style_codes = [