From 69a747356655158fdf9abaecea5feafb3bd6b5f5 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Sat, 11 Dec 2021 12:19:21 +0100 Subject: [PATCH] Cleaner: cover some more cases where scripts could sneak through in specially crafted style content. --- src/lxml/html/clean.py | 20 ++++++------ src/lxml/html/tests/test_clean.py | 65 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 73 insertions(+), 12 deletions(-) diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py index 4df10c2..0e96627 100644 --- a/src/lxml/html/clean.py +++ b/src/lxml/html/clean.py @@ -74,22 +74,20 @@ _looks_like_tag_content = re.compile( # All kinds of schemes besides just javascript: that can cause # execution: _find_image_dataurls = re.compile( - r'^data:image/(.+);base64,', re.I).findall -_is_possibly_malicious_scheme = re.compile( + r'data:image/(.+);base64,', re.I).findall +_possibly_malicious_schemes = re.compile( r'(javascript|jscript|livescript|vbscript|data|about|mocha):', re.I).findall # SVG images can contain script content -_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).findall +_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search -def _is_javascript_scheme(s): - is_image_url = False +def _has_javascript_scheme(s): + safe_image_urls = 0 for image_type in _find_image_dataurls(s): - is_image_url = True if _is_unsafe_image_type(image_type): return True - if is_image_url: - return False - return bool(_is_possibly_malicious_scheme(s)) + safe_image_urls += 1 + return len(_possibly_malicious_schemes(s)) > safe_image_urls _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub @@ -521,7 +519,7 @@ class Cleaner(object): def _remove_javascript_link(self, link): # links like "j a v a s c r i p t:" might be interpreted in IE new = _substitute_whitespace('', unquote_plus(link)) - if _is_javascript_scheme(new): + if _has_javascript_scheme(new): # FIXME: should this be None to delete? return '' return link @@ -543,7 +541,7 @@ class Cleaner(object): style = style.replace('\\', '') style = _substitute_whitespace('', style) style = style.lower() - if 'javascript:' in style: + if _has_javascript_scheme(style): return True if 'expression(' in style: return True diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py index a05d967..aec87cd 100644 --- a/src/lxml/html/tests/test_clean.py +++ b/src/lxml/html/tests/test_clean.py @@ -126,7 +126,7 @@ class CleanerTest(unittest.TestCase): lxml.html.tostring(clean_html(s))) def test_sneaky_import_in_style(self): - # Prevent "@@importimport" -> "@import" replacement. + # Prevent "@@importimport" -> "@import" replacement etc. style_codes = [ "@@importimport(extstyle.css)", "@ @ import import(extstyle.css)", @@ -134,6 +134,11 @@ class CleanerTest(unittest.TestCase): "@@ import import(extstyle.css)", "@ @import import(extstyle.css)", "@@importimport()", + "@@importimport() ()", + "@/* ... */import()", + "@im/* ... */port()", + "@ @import/* ... */import()", + "@ /* ... */ import()", ] for style_code in style_codes: html = '' % style_code @@ -145,6 +150,41 @@ class CleanerTest(unittest.TestCase): cleaned, "%s -> %s" % (style_code, cleaned)) + def test_sneaky_schemes_in_style(self): + style_codes = [ + "javasjavascript:cript:", + "javascriptjavascript::", + "javascriptjavascript:: :", + "vbjavascript:cript:", + ] + for style_code in style_codes: + html = '' % style_code + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + b'', + cleaned, + "%s -> %s" % (style_code, cleaned)) + + def test_sneaky_urls_in_style(self): + style_codes = [ + "url(data:image/svg+xml;base64,...)", + "url(javasjavascript:cript:)", + "url(javasjavascript:cript: ::)", + "url(vbjavascript:cript:)", + "url(vbjavascript:cript: :)", + ] + for style_code in style_codes: + html = '' % style_code + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + b'', + cleaned, + "%s -> %s" % (style_code, cleaned)) + def test_svg_data_links(self): # Remove SVG images with potentially insecure content. svg = b'' @@ -188,6 +228,29 @@ class CleanerTest(unittest.TestCase): cleaned, "%s -> %s" % (url, cleaned)) + def test_image_data_links_in_style(self): + data = b'123' + data_b64 = base64.b64encode(data).decode('ASCII') + urls = [ + "data:image/jpeg;base64," + data_b64, + "data:image/apng;base64," + data_b64, + "data:image/png;base64," + data_b64, + "data:image/gif;base64," + data_b64, + "data:image/webp;base64," + data_b64, + "data:image/bmp;base64," + data_b64, + "data:image/tiff;base64," + data_b64, + "data:image/x-icon;base64," + data_b64, + ] + for url in urls: + html = '' % url + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + html.encode("UTF-8"), + cleaned, + "%s -> %s" % (url, cleaned)) + def test_formaction_attribute_in_button_input(self): # The formaction attribute overrides the form's action and should be # treated as a malicious link attribute -- 2.13.7