diff --git a/lxml_html_clean/clean.py b/lxml_html_clean/clean.py
index fdc96ab..a62b710 100644
--- a/lxml_html_clean/clean.py
+++ b/lxml_html_clean/clean.py
@@ -54,7 +54,7 @@
 # All kinds of schemes besides just javascript: that can cause
 # execution:
 _find_image_dataurls = re.compile(
-    r'data:image/(.+);base64,', re.I).findall
+    r'data:image/(.+?);base64,', re.I).findall
 _possibly_malicious_schemes = re.compile(
     r'(javascript|jscript|livescript|vbscript|data|about|mocha):',
     re.I).findall
diff --git a/tests/test_clean.py b/tests/test_clean.py
index 2ec492d..9bd63eb 100644
--- a/tests/test_clean.py
+++ b/tests/test_clean.py
@@ -255,6 +255,31 @@ def test_image_data_links_in_style(self):
                 cleaned,
                 "%s -> %s" % (url, cleaned))
 
+    def test_image_data_links_in_inline_style(self):
+        safe_attrs = set(lxml.html.defs.safe_attrs)
+        safe_attrs.add('style')
+
+        cleaner = Cleaner(
+            safe_attrs_only=True,
+            safe_attrs=safe_attrs)
+
+        data = b'123'
+        data_b64 = base64.b64encode(data).decode('ASCII')
+        url = "url(data:image/jpeg;base64,%s)" % data_b64
+        styles = [
+            "background: %s" % url,
+            "background: %s; background-image: %s" % (url, url),
+        ]
+        for style in styles:
+            html = '<div style="%s"></div>' % style
+            s = lxml.html.fragment_fromstring(html)
+
+            cleaned = lxml.html.tostring(cleaner.clean_html(s))
+            self.assertEqual(
+                html.encode("UTF-8"),
+                cleaned,
+                "%s -> %s" % (style, cleaned))
+
     def test_formaction_attribute_in_button_input(self):
         # The formaction attribute overrides the form's action and should be
         # treated as a malicious link attribute