Compare commits

...

11 Commits

Author SHA1 Message Date
openeuler-ci-bot
72883bff79
!96 Fixed changalog date not sorted
From: @jackssir 
Reviewed-by: @dillon_chen 
Signed-off-by: @dillon_chen
2024-08-21 02:20:37 +00:00
lvfei
d31c18ab7d
update python-lxml.spec.
Signed-off-by: lvfei <lvfei@kylinos.cn>
2024-08-21 01:48:35 +00:00
lvfei
69fd3ee9bc Fixed changalog date not sorted 2024-08-19 15:17:03 +08:00
openeuler-ci-bot
ba231e969f
!89 fix CVE-2024-37388
From: @zhuofeng6 
Reviewed-by: @hubin95, @gaoruoshu 
Signed-off-by: @hubin95, @gaoruoshu
2024-06-14 02:05:03 +00:00
zhuofeng
76da917069 fix CVE-2024-37388 2024-06-13 15:22:52 +08:00
openeuler-ci-bot
a48a35752f
!52 删除源码包中etree.c等文件,构建过程自动生成
From: @tong_1001 
Reviewed-by: @xiezhipeng1 
Signed-off-by: @xiezhipeng1
2022-07-25 08:48:59 +00:00
shixuantong
bddafeb6e5 Remove pregenerated Cython C sources 2022-07-25 16:10:14 +08:00
openeuler-ci-bot
6479fbc60b
!48 Fix CVE-2022-2309
From: @renxichen 
Reviewed-by: @xiezhipeng1 
Signed-off-by: @xiezhipeng1
2022-07-21 09:14:24 +00:00
rwx403335
a1ea60f874 Fix CVE-2022-2309 2022-07-21 15:52:11 +08:00
openeuler-ci-bot
fdc6554a9f
!42 [sync] PR-39: Cleaner: cover some more cases where scripts could sneak through in specially crafted style content.
Merge pull request !42 from openeuler-sync-bot/sync-pr39-openEuler-20.03-LTS-Next-to-openEuler-20.03-LTS-SP3
2022-01-28 01:57:09 +00:00
shixuantong
3735c909d9 Cleaner: cover some more cases where scripts could sneak through in specially crafted style content.
(cherry picked from commit b4f89041f532543487ec5a1b916faf83d98b6b6b)
2022-01-27 10:35:35 +08:00
5 changed files with 706 additions and 2 deletions

View File

@@ -0,0 +1,92 @@
From 86368e9cf70a0ad23cccd5ee32de847149af0c6f Mon Sep 17 00:00:00 2001
From: Stefan Behnel <stefan_ml@behnel.de>
Date: Fri, 1 Jul 2022 21:06:10 +0200
Subject: [PATCH] Fix a crash when incorrect parser input occurs together with
usages of iterwalk() on trees generated by the same parser.
---
src/lxml/apihelpers.pxi | 7 ++++---
src/lxml/iterparse.pxi | 11 ++++++-----
src/lxml/tests/test_etree.py | 20 ++++++++++++++++++++
3 files changed, 30 insertions(+), 8 deletions(-)
diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi
index c166276..9fae9fb 100644
--- a/src/lxml/apihelpers.pxi
+++ b/src/lxml/apihelpers.pxi
@@ -246,9 +246,10 @@ cdef dict _build_nsmap(xmlNode* c_node):
while c_node is not NULL and c_node.type == tree.XML_ELEMENT_NODE:
c_ns = c_node.nsDef
while c_ns is not NULL:
- prefix = funicodeOrNone(c_ns.prefix)
- if prefix not in nsmap:
- nsmap[prefix] = funicodeOrNone(c_ns.href)
+ if c_ns.prefix or c_ns.href:
+ prefix = funicodeOrNone(c_ns.prefix)
+ if prefix not in nsmap:
+ nsmap[prefix] = funicodeOrNone(c_ns.href)
c_ns = c_ns.next
c_node = c_node.parent
return nsmap
diff --git a/src/lxml/iterparse.pxi b/src/lxml/iterparse.pxi
index 138c23a..a7299da 100644
--- a/src/lxml/iterparse.pxi
+++ b/src/lxml/iterparse.pxi
@@ -420,7 +420,7 @@ cdef int _countNsDefs(xmlNode* c_node):
count = 0
c_ns = c_node.nsDef
while c_ns is not NULL:
- count += 1
+ count += (c_ns.href is not NULL)
c_ns = c_ns.next
return count
@@ -431,9 +431,10 @@ cdef int _appendStartNsEvents(xmlNode* c_node, list event_list) except -1:
count = 0
c_ns = c_node.nsDef
while c_ns is not NULL:
- ns_tuple = (funicode(c_ns.prefix) if c_ns.prefix is not NULL else '',
- funicode(c_ns.href))
- event_list.append( (u"start-ns", ns_tuple) )
- count += 1
+ if c_ns.href:
+ ns_tuple = (funicodeOrEmpty(c_ns.prefix),
+ funicode(c_ns.href))
+ event_list.append( (u"start-ns", ns_tuple) )
+ count += 1
c_ns = c_ns.next
return count
diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
index e5f0846..285313f 100644
--- a/src/lxml/tests/test_etree.py
+++ b/src/lxml/tests/test_etree.py
@@ -1460,6 +1460,26 @@ class ETreeOnlyTestCase(HelperTestCase):
[1,2,1,4],
counts)
+ def test_walk_after_parse_failure(self):
+ # This used to be an issue because libxml2 can leak empty namespaces
+ # between failed parser runs. iterwalk() failed to handle such a tree.
+ try:
+ etree.XML('''<anot xmlns="1">''')
+ except etree.XMLSyntaxError:
+ pass
+ else:
+ assert False, "invalid input did not fail to parse"
+
+ et = etree.XML('''<root> </root>''')
+ try:
+ ns = next(etree.iterwalk(et, events=('start-ns',)))
+ except StopIteration:
+ # This would be the expected result, because there was no namespace
+ pass
+ else:
+ # This is a bug in libxml2
+ assert not ns, repr(ns)
+
def test_itertext_comment_pi(self):
# https://bugs.launchpad.net/lxml/+bug/1844674
XML = self.etree.XML
--
1.8.3.1

View File

@@ -0,0 +1,372 @@
From b38cebf2f846e92bd63de4488fd3d1c8b568f397 Mon Sep 17 00:00:00 2001
From: scoder <stefan_ml@behnel.de>
Date: Fri, 29 Dec 2023 14:21:23 +0100
Subject: [PATCH] Disable external entity resolution (XXE) by default (GH-391)
This prevents security risks that would allow loading arbitrary external files.
Closes https://bugs.launchpad.net/lxml/+bug/1742885
Supersedes https://github.com/lxml/lxml/pull/130
---
doc/FAQ.txt | 12 +++--
src/lxml/includes/xmlparser.pxd | 18 +++++++-
src/lxml/parser.pxi | 70 ++++++++++++++++++++++++++--
src/lxml/tests/test_etree.py | 81 +++++++++++++++++++++++++++++++++
4 files changed, 170 insertions(+), 11 deletions(-)
diff --git a/doc/FAQ.txt b/doc/FAQ.txt
index 48f69a6..7f3a524 100644
--- a/doc/FAQ.txt
+++ b/doc/FAQ.txt
@@ -1107,9 +1107,9 @@ useless for the data commonly sent through web services and
can simply be disabled, which rules out several types of
denial of service attacks at once. This also involves an attack
that reads local files from the server, as XML entities can be
-defined to expand into their content. Consequently, version
-1.2 of the SOAP standard explicitly disallows entity references
-in the XML stream.
+defined to expand into the content of external resources.
+Consequently, version 1.2 of the SOAP standard explicitly
+disallows entity references in the XML stream.
To disable entity expansion, use an XML parser that is configured
with the option ``resolve_entities=False``. Then, after (or
@@ -1117,7 +1117,11 @@ while) parsing the document, use ``root.iter(etree.Entity)`` to
recursively search for entity references. If it contains any,
reject the entire input document with a suitable error response.
In lxml 3.x, you can also use the new DTD introspection API to
-apply your own restrictions on input documents.
+apply your own restrictions on input documents. Since version 5.x,
+lxml disables the expansion of external entities (XXE) by default.
+If you really want to allow loading external files into XML documents
+using this functionality, you have to explicitly set
+``resolve_entities=True``.
Another attack to consider is compression bombs. If you allow
compressed input into your web service, attackers can try to send
diff --git a/src/lxml/includes/xmlparser.pxd b/src/lxml/includes/xmlparser.pxd
index 45acfc8..3945495 100644
--- a/src/lxml/includes/xmlparser.pxd
+++ b/src/lxml/includes/xmlparser.pxd
@@ -1,9 +1,9 @@
from libc.string cimport const_char
from lxml.includes.tree cimport (
- xmlDoc, xmlNode, xmlDict, xmlDtd, xmlChar, const_xmlChar)
+ xmlDoc, xmlNode, xmlEntity, xmlDict, xmlDtd, xmlChar, const_xmlChar)
from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback
-from lxml.includes.xmlerror cimport xmlError, xmlStructuredErrorFunc
+from lxml.includes.xmlerror cimport xmlError, xmlStructuredErrorFunc, xmlErrorLevel
cdef extern from "libxml/parser.h":
@@ -47,11 +47,14 @@ cdef extern from "libxml/parser.h":
ctypedef void (*referenceSAXFunc)(void * ctx, const_xmlChar* name)
+ ctypedef xmlEntity* (*getEntitySAXFunc)(void* ctx, const_xmlChar* name)
+
cdef int XML_SAX2_MAGIC
cdef extern from "libxml/tree.h":
ctypedef struct xmlParserInput:
int line
+ int col
int length
const_xmlChar* base
const_xmlChar* cur
@@ -76,6 +79,7 @@ cdef extern from "libxml/tree.h":
charactersSAXFunc characters
cdataBlockSAXFunc cdataBlock
referenceSAXFunc reference
+ getEntitySAXFunc getEntity
commentSAXFunc comment
processingInstructionSAXFunc processingInstruction
startDocumentSAXFunc startDocument
@@ -150,6 +154,8 @@ cdef extern from "libxml/parser.h":
int inSubset
int charset
xmlParserInput* input
+ int inputNr
+ xmlParserInput** inputTab
ctypedef enum xmlParserOption:
XML_PARSE_RECOVER = 1 # recover on errors
@@ -212,6 +218,12 @@ cdef extern from "libxml/parser.h":
char* filename, const_char* encoding,
int options) nogil
+ cdef void xmlErrParser(xmlParserCtxt* ctxt, xmlNode* node,
+ int domain, int code, xmlErrorLevel level,
+ const xmlChar *str1, const xmlChar *str2, const xmlChar *str3,
+ int int1, const char *msg, ...)
+
+
# iterparse:
cdef xmlParserCtxt* xmlCreatePushParserCtxt(xmlSAXHandler* sax,
@@ -233,6 +245,8 @@ cdef extern from "libxml/parser.h":
cdef xmlExternalEntityLoader xmlGetExternalEntityLoader() nogil
cdef void xmlSetExternalEntityLoader(xmlExternalEntityLoader f) nogil
+ cdef xmlEntity* xmlSAX2GetEntity(void* ctxt, const_xmlChar* name) nogil
+
# DTDs:
cdef xmlDtd* xmlParseDTD(const_xmlChar* ExternalID, const_xmlChar* SystemID) nogil
diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi
index 3187a38..2f0ce80 100644
--- a/src/lxml/parser.pxi
+++ b/src/lxml/parser.pxi
@@ -794,6 +794,7 @@ cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
c_attr = c_attr.next
return 0
+
@cython.internal
cdef class _BaseParser:
cdef ElementClassLookup _class_lookup
@@ -806,6 +807,7 @@ cdef class _BaseParser:
cdef bint _remove_pis
cdef bint _strip_cdata
cdef bint _collect_ids
+ cdef bint _resolve_external_entities
cdef XMLSchema _schema
cdef bytes _filename
cdef readonly object target
@@ -814,7 +816,7 @@ cdef class _BaseParser:
def __init__(self, int parse_options, bint for_html, XMLSchema schema,
remove_comments, remove_pis, strip_cdata, collect_ids,
- target, encoding):
+ target, encoding, bint resolve_external_entities=True):
cdef tree.xmlCharEncodingHandler* enchandler
cdef int c_encoding
if not isinstance(self, (XMLParser, HTMLParser)):
@@ -827,6 +829,7 @@ cdef class _BaseParser:
self._remove_pis = remove_pis
self._strip_cdata = strip_cdata
self._collect_ids = collect_ids
+ self._resolve_external_entities = resolve_external_entities
self._schema = schema
self._resolvers = _ResolverRegistry()
@@ -906,6 +909,8 @@ cdef class _BaseParser:
if self._strip_cdata:
# hard switch-off for CDATA nodes => makes them plain text
pctxt.sax.cdataBlock = NULL
+ if not self._resolve_external_entities:
+ pctxt.sax.getEntity = _getInternalEntityOnly
cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
@@ -1206,6 +1211,56 @@ cdef class _BaseParser:
finally:
context.cleanup()
+cdef tree.xmlEntity* _getInternalEntityOnly(void* ctxt, const_xmlChar* name):
+ """
+ Callback function to intercept the entity resolution when external entity loading is disabled.
+ """
+ cdef tree.xmlEntity* entity = xmlparser.xmlSAX2GetEntity(ctxt, name)
+ if not entity:
+ return NULL
+ if entity.etype not in (
+ tree.xmlEntityType.XML_EXTERNAL_GENERAL_PARSED_ENTITY,
+ tree.xmlEntityType.XML_EXTERNAL_GENERAL_UNPARSED_ENTITY,
+ tree.xmlEntityType.XML_EXTERNAL_PARAMETER_ENTITY):
+ return entity
+
+ # Reject all external entities and fail the parsing instead. There is currently
+ # no way in libxml2 to just prevent the entity resolution in this case.
+ cdef xmlerror.xmlError c_error
+ cdef xmlerror.xmlStructuredErrorFunc err_func
+ cdef xmlparser.xmlParserInput* parser_input
+ cdef void* err_context
+
+ c_ctxt = <xmlparser.xmlParserCtxt *> ctxt
+ err_func = xmlerror.xmlStructuredError
+ if err_func:
+ parser_input = c_ctxt.input
+ # Copied from xmlVErrParser() in libxml2: get current input from stack.
+ if parser_input and parser_input.filename is NULL and c_ctxt.inputNr > 1:
+ parser_input = c_ctxt.inputTab[c_ctxt.inputNr - 2]
+
+ c_error = xmlerror.xmlError(
+ domain=xmlerror.xmlErrorDomain.XML_FROM_PARSER,
+ code=xmlerror.xmlParserErrors.XML_ERR_EXT_ENTITY_STANDALONE,
+ level=xmlerror.xmlErrorLevel.XML_ERR_FATAL,
+ message=b"External entity resolution is disabled for security reasons "
+ b"when resolving '&%s;'. Use 'XMLParser(resolve_entities=True)' "
+ b"if you consider it safe to enable it.",
+ file=parser_input.filename,
+ node=entity,
+ str1=<char*> name,
+ str2=NULL,
+ str3=NULL,
+ line=parser_input.line if parser_input else 0,
+ int1=0,
+ int2=parser_input.col if parser_input else 0,
+ )
+ err_context = xmlerror.xmlStructuredErrorContext
+ err_func(err_context, &c_error)
+
+ c_ctxt.wellFormed = 0
+ # The entity was looked up and does not need to be freed.
+ return NULL
cdef void _initSaxDocument(void* ctxt) with gil:
xmlparser.xmlSAX2StartDocument(ctxt)
@@ -1508,12 +1563,14 @@ cdef class XMLParser(_FeedParser):
- strip_cdata - replace CDATA sections by normal text content (default: True)
- compact - save memory for short text content (default: True)
- collect_ids - use a hash table of XML IDs for fast access (default: True, always True with DTD validation)
- - resolve_entities - replace entities by their text value (default: True)
- huge_tree - disable security restrictions and support very deep trees
and very long text content (only affects libxml2 2.7+)
Other keyword arguments:
-
+ - resolve_entities - replace entities by their text value: False for keeping the
+ entity references, True for resolving them, and 'internal' for resolving
+ internal definitions only (no external file/URL access).
+ The default used to be True and was changed to 'internal' in lxml 5.0.
- encoding - override the document encoding
- target - a parser target object that will receive the parse events
- schema - an XMLSchema to validate against
@@ -1525,10 +1582,11 @@ cdef class XMLParser(_FeedParser):
def __init__(self, *, encoding=None, attribute_defaults=False,
dtd_validation=False, load_dtd=False, no_network=True,
ns_clean=False, recover=False, XMLSchema schema=None,
- huge_tree=False, remove_blank_text=False, resolve_entities=True,
+ huge_tree=False, remove_blank_text=False, resolve_entities='internal',
remove_comments=False, remove_pis=False, strip_cdata=True,
collect_ids=True, target=None, compact=True):
cdef int parse_options
+ cdef bint resolve_external = True
parse_options = _XML_DEFAULT_PARSE_OPTIONS
if load_dtd:
parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
@@ -1553,12 +1611,14 @@ cdef class XMLParser(_FeedParser):
parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT
if not resolve_entities:
parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
+ elif resolve_entities == 'internal':
+ resolve_external = False
if not strip_cdata:
parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA
_BaseParser.__init__(self, parse_options, 0, schema,
remove_comments, remove_pis, strip_cdata,
- collect_ids, target, encoding)
+ collect_ids, target, encoding, resolve_external)
cdef class XMLPullParser(XMLParser):
diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
index 14b21f7..bc7548f 100644
--- a/src/lxml/tests/test_etree.py
+++ b/src/lxml/tests/test_etree.py
@@ -12,11 +12,14 @@ from __future__ import absolute_import
from collections import OrderedDict
import os.path
import unittest
+import contextlib
import copy
import sys
import re
import gc
import operator
+import shutil
+import tempfile
import textwrap
import zlib
import gzip
@@ -1675,6 +1678,84 @@ class ETreeOnlyTestCase(HelperTestCase):
self.assertEqual(_bytes('<doc>&myentity;</doc>'),
tostring(root))
+ @contextlib.contextmanager
+ def _xml_test_file(self, name, content=b'<evil>XML</evil>'):
+ temp_dir = tempfile.mkdtemp()
+ try:
+ xml_file = os.path.join(temp_dir, name)
+ with open(xml_file, 'wb') as tmpfile:
+ tmpfile.write(content)
+ yield xml_file
+ finally:
+ shutil.rmtree(temp_dir)
+
+ def test_entity_parse_external(self):
+ fromstring = self.etree.fromstring
+ tostring = self.etree.tostring
+ parser = self.etree.XMLParser(resolve_entities=True)
+
+ with self._xml_test_file("entity.xml") as entity_file:
+ xml = '''
+ <!DOCTYPE doc [
+ <!ENTITY my_external_entity SYSTEM "%s">
+ ]>
+ <doc>&my_external_entity;</doc>
+ ''' % path2url(entity_file)
+ root = fromstring(xml, parser)
+
+ self.assertEqual(_bytes('<doc><evil>XML</evil></doc>'),
+ tostring(root))
+ self.assertEqual(root.tag, 'doc')
+ self.assertEqual(root[0].tag, 'evil')
+ self.assertEqual(root[0].text, 'XML')
+ self.assertEqual(root[0].tail, None)
+
+ def test_entity_parse_external_no_resolve(self):
+ fromstring = self.etree.fromstring
+ parser = self.etree.XMLParser(resolve_entities=False)
+ Entity = self.etree.Entity
+
+ with self._xml_test_file("entity.xml") as entity_file:
+ xml = '''
+ <!DOCTYPE doc [
+ <!ENTITY my_external_entity SYSTEM "%s">
+ ]>
+ <doc>&my_external_entity;</doc>
+ ''' % path2url(entity_file)
+ root = fromstring(xml, parser)
+
+ self.assertEqual(root[0].tag, Entity)
+ self.assertEqual(root[0].text, "&my_external_entity;")
+
+ def test_entity_parse_no_external_default(self):
+ fromstring = self.etree.fromstring
+
+ with self._xml_test_file("entity.xml") as entity_file:
+ xml = '''
+ <!DOCTYPE doc [
+ <!ENTITY my_failing_external_entity SYSTEM "%s">
+ ]>
+ <doc>&my_failing_external_entity;</doc>
+ ''' % path2url(entity_file)
+
+ try:
+ fromstring(xml)
+ except self.etree.XMLSyntaxError as exc:
+ exception = exc
+ else:
+ self.assertTrue(False, "XMLSyntaxError was not raised")
+
+ self.assertIn("my_failing_external_entity", str(exception))
+ self.assertTrue(exception.error_log)
+ # Depending on the libxml2 version, we get different errors here,
+ # not necessarily the one that lxml produced. But it should fail either way.
+ for error in exception.error_log:
+ if "my_failing_external_entity" in error.message:
+ self.assertEqual(5, error.line)
+ break
+ else:
+ self.assertFalse("entity error not found in parser error log")
+
def test_entity_restructure(self):
xml = _bytes('''<!DOCTYPE root [ <!ENTITY nbsp "&#160;"> ]>
<root>
--
2.33.0

View File

@@ -0,0 +1,163 @@
From 69a747356655158fdf9abaecea5feafb3bd6b5f5 Mon Sep 17 00:00:00 2001
From: Stefan Behnel <stefan_ml@behnel.de>
Date: Sat, 11 Dec 2021 12:19:21 +0100
Subject: [PATCH] Cleaner: cover some more cases where scripts could sneak
through in specially crafted style content.
---
src/lxml/html/clean.py | 20 ++++++------
src/lxml/html/tests/test_clean.py | 65 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 73 insertions(+), 12 deletions(-)
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index 4df10c2..0e96627 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -74,22 +74,20 @@ _looks_like_tag_content = re.compile(
# All kinds of schemes besides just javascript: that can cause
# execution:
_find_image_dataurls = re.compile(
- r'^data:image/(.+);base64,', re.I).findall
-_is_possibly_malicious_scheme = re.compile(
+ r'data:image/(.+);base64,', re.I).findall
+_possibly_malicious_schemes = re.compile(
r'(javascript|jscript|livescript|vbscript|data|about|mocha):',
re.I).findall
# SVG images can contain script content
-_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).findall
+_is_unsafe_image_type = re.compile(r"(xml|svg)", re.I).search
-def _is_javascript_scheme(s):
- is_image_url = False
+def _has_javascript_scheme(s):
+ safe_image_urls = 0
for image_type in _find_image_dataurls(s):
- is_image_url = True
if _is_unsafe_image_type(image_type):
return True
- if is_image_url:
- return False
- return bool(_is_possibly_malicious_scheme(s))
+ safe_image_urls += 1
+ return len(_possibly_malicious_schemes(s)) > safe_image_urls
_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
@@ -521,7 +519,7 @@ class Cleaner(object):
def _remove_javascript_link(self, link):
# links like "j a v a s c r i p t:" might be interpreted in IE
new = _substitute_whitespace('', unquote_plus(link))
- if _is_javascript_scheme(new):
+ if _has_javascript_scheme(new):
# FIXME: should this be None to delete?
return ''
return link
@@ -543,7 +541,7 @@ class Cleaner(object):
style = style.replace('\\', '')
style = _substitute_whitespace('', style)
style = style.lower()
- if 'javascript:' in style:
+ if _has_javascript_scheme(style):
return True
if 'expression(' in style:
return True
diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py
index a05d967..aec87cd 100644
--- a/src/lxml/html/tests/test_clean.py
+++ b/src/lxml/html/tests/test_clean.py
@@ -126,7 +126,7 @@ class CleanerTest(unittest.TestCase):
lxml.html.tostring(clean_html(s)))
def test_sneaky_import_in_style(self):
- # Prevent "@@importimport" -> "@import" replacement.
+ # Prevent "@@importimport" -> "@import" replacement etc.
style_codes = [
"@@importimport(extstyle.css)",
"@ @ import import(extstyle.css)",
@@ -134,6 +134,11 @@ class CleanerTest(unittest.TestCase):
"@@ import import(extstyle.css)",
"@ @import import(extstyle.css)",
"@@importimport()",
+ "@@importimport() ()",
+ "@/* ... */import()",
+ "@im/* ... */port()",
+ "@ @import/* ... */import()",
+ "@ /* ... */ import()",
]
for style_code in style_codes:
html = '<style>%s</style>' % style_code
@@ -145,6 +150,41 @@ class CleanerTest(unittest.TestCase):
cleaned,
"%s -> %s" % (style_code, cleaned))
+ def test_sneaky_schemes_in_style(self):
+ style_codes = [
+ "javasjavascript:cript:",
+ "javascriptjavascript::",
+ "javascriptjavascript:: :",
+ "vbjavascript:cript:",
+ ]
+ for style_code in style_codes:
+ html = '<style>%s</style>' % style_code
+ s = lxml.html.fragment_fromstring(html)
+
+ cleaned = lxml.html.tostring(clean_html(s))
+ self.assertEqual(
+ b'<style>/* deleted */</style>',
+ cleaned,
+ "%s -> %s" % (style_code, cleaned))
+
+ def test_sneaky_urls_in_style(self):
+ style_codes = [
+ "url(data:image/svg+xml;base64,...)",
+ "url(javasjavascript:cript:)",
+ "url(javasjavascript:cript: ::)",
+ "url(vbjavascript:cript:)",
+ "url(vbjavascript:cript: :)",
+ ]
+ for style_code in style_codes:
+ html = '<style>%s</style>' % style_code
+ s = lxml.html.fragment_fromstring(html)
+
+ cleaned = lxml.html.tostring(clean_html(s))
+ self.assertEqual(
+ b'<style>url()</style>',
+ cleaned,
+ "%s -> %s" % (style_code, cleaned))
+
def test_svg_data_links(self):
# Remove SVG images with potentially insecure content.
svg = b'<svg onload="alert(123)" />'
@@ -188,6 +228,29 @@ class CleanerTest(unittest.TestCase):
cleaned,
"%s -> %s" % (url, cleaned))
+ def test_image_data_links_in_style(self):
+ data = b'123'
+ data_b64 = base64.b64encode(data).decode('ASCII')
+ urls = [
+ "data:image/jpeg;base64," + data_b64,
+ "data:image/apng;base64," + data_b64,
+ "data:image/png;base64," + data_b64,
+ "data:image/gif;base64," + data_b64,
+ "data:image/webp;base64," + data_b64,
+ "data:image/bmp;base64," + data_b64,
+ "data:image/tiff;base64," + data_b64,
+ "data:image/x-icon;base64," + data_b64,
+ ]
+ for url in urls:
+ html = '<style> url(%s) </style>' % url
+ s = lxml.html.fragment_fromstring(html)
+
+ cleaned = lxml.html.tostring(clean_html(s))
+ self.assertEqual(
+ html.encode("UTF-8"),
+ cleaned,
+ "%s -> %s" % (url, cleaned))
+
def test_formaction_attribute_in_button_input(self):
# The formaction attribute overrides the form's action and should be
# treated as a malicious link attribute
--
2.13.7

View File

@@ -0,0 +1,56 @@
From c742576c105f40fc8b754fcae56fee4aa35840a3 Mon Sep 17 00:00:00 2001
From: Stefan Behnel <stefan_ml@behnel.de>
Date: Tue, 19 Jul 2022 08:25:20 +0200
Subject: [PATCH] Work around libxml2 bug in affected versions that failed to
reset the namespace count in the parser context.
See https://gitlab.gnome.org/GNOME/libxml2/-/issues/378
---
src/lxml/includes/xmlparser.pxd | 1 +
src/lxml/parser.pxi | 3 +++
src/lxml/tests/test_etree.py | 3 +--
3 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/src/lxml/includes/xmlparser.pxd b/src/lxml/includes/xmlparser.pxd
index a196e34..45acfc8 100644
--- a/src/lxml/includes/xmlparser.pxd
+++ b/src/lxml/includes/xmlparser.pxd
@@ -144,6 +144,7 @@ cdef extern from "libxml/parser.h":
void* userData
int* spaceTab
int spaceMax
+ int nsNr
bint html
bint progressive
int inSubset
diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi
index f5baf29..f0c8c6b 100644
--- a/src/lxml/parser.pxi
+++ b/src/lxml/parser.pxi
@@ -569,6 +569,9 @@ cdef class _ParserContext(_ResolverContext):
self._c_ctxt.disableSAX = 0 # work around bug in libxml2
else:
xmlparser.xmlClearParserCtxt(self._c_ctxt)
+ # work around bug in libxml2 [2.9.10 .. 2.9.14]:
+ # https://gitlab.gnome.org/GNOME/libxml2/-/issues/378
+ self._c_ctxt.nsNr = 0
cdef int prepare(self, bint set_document_loader=True) except -1:
cdef int result
diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
index 8bf82c0..0339796 100644
--- a/src/lxml/tests/test_etree.py
+++ b/src/lxml/tests/test_etree.py
@@ -1491,8 +1491,7 @@ class ETreeOnlyTestCase(HelperTestCase):
# This would be the expected result, because there was no namespace
pass
else:
- # This is a bug in libxml2
- assert not ns, repr(ns)
+ assert False, "Found unexpected namespace '%s'" % ns
def test_itertext_comment_pi(self):
# https://bugs.launchpad.net/lxml/+bug/1844674
--
1.8.3.1

View File

@@ -7,7 +7,7 @@ The latest release works with all CPython versions from 2.7 to 3.7.
Name: python-%{modname}
Version: 4.5.2
-Release: 5
+Release: 9
Summary: XML processing library combining libxml2/libxslt with the ElementTree API
License: BSD
URL: http://lxml.de
@@ -18,6 +18,10 @@ Patch6001: backport-CVE-2020-27783-2.patch
Patch6002: backport-CVE-2021-28957.patch
Patch6003: backport-0001-CVE-2021-43818.patch
Patch6004: backport-0002-CVE-2021-43818.patch
+Patch6005: backport-Cleaner-cover-some-more-cases-where-scripts-could-sn.patch
+Patch6006: backport-CVE-2022-2309.patch
+Patch6007: backport-Work-around-libxml2-bug-in-affected-versions.patch
+Patch6008: backport-CVE-2024-37388.patch
BuildRequires: gcc libxml2-devel libxslt-devel
@@ -41,6 +45,8 @@ BuildRequires: python3-devel python3-setuptools python3-Cython
%prep
%autosetup -n %{modname}-%{version} -p1
+# Remove pregenerated Cython C sources
+find -type f -name '*.c' -print -delete
%build
export WITH_CYTHON=true
@@ -68,6 +74,21 @@ make test3
%doc README.rst src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt
%changelog
+* Wed Jun 12 2024 zhuofeng <zhuofeng2@huawei.com> - 4.5.2-9
+- Type:CVE
+- CVE:CVE-2024-37388
+- SUG:NA
+- DESC:fix CVE-2024-37388
+* Mon Jul 25 2022 shixuantong <shixuantong@h-partners.com> - 4.5.2-8
+- Remove pregenerated Cython C sources
+* Thu Jul 21 2022 renhongxun <renhongxun@h-partners.com> - 4.5.2-7
+- fix CVE-2022-2309
+* Sat Jan 22 2022 shixuantong <shixuantong@huawei.com> - 4.5.2-6
+- Cleaner: cover some more cases where scripts could sneak through in specially crafted style content.
* Wed Jan 19 2022 shixuantong <shixuantong@huawei.com> - 4.5.2-5
- enable check
@@ -80,7 +101,7 @@ make test3
* Fri Feb 05 2021 shixuantong <shixuantong@huawei.com> - 4.5.2-2
- fix CVE-2020-27783
-* Tue Jan 05 2020 shixuantong <shixuantong@huawei.com> - 4.5.2-1
+* Tue Jan 05 2021 shixuantong <shixuantong@huawei.com> - 4.5.2-1
- update version to 4.5.2
* Fri Aug 21 2020 shixuantong <shixuantong@huawei.com> - 4.2.3-5