From 8623260fb0949d368376a128bee2189ec0a67ae5 Mon Sep 17 00:00:00 2001
From: nkrapp
Date: Mon, 22 Jul 2024 09:43:08 +0200
Subject: [PATCH] [PATCH] [4.2.x] Fixed CVE-2024-38875 -- Mitigated potential DoS in urlize and urlizetrunc template filters.

---
Review notes (text between "---" and the first "diff --git" is not applied):
- CountsDict.__init__ forwards keyword arguments to dict() with **kwargs.
  The previous *kwargs unpacked the keyword names as positional arguments.
- trim_punctuation prepends newly stripped closing wrapping punctuation to
  `trail`, like the other trims in the loop, so punctuation stripped in an
  earlier pass (e.g. the "." in "http://example.com).") is not dropped.

 django/utils/html.py           | 72 +++++++++++++++++++++++++++++-----
 tests/utils_tests/test_html.py | 21 ++++++----
 2 files changed, 75 insertions(+), 17 deletions(-)

diff --git a/django/utils/html.py b/django/utils/html.py
index 7a33d5f68d..1dbe39ccd1 100644
--- a/django/utils/html.py
+++ b/django/utils/html.py
@@ -1,5 +1,6 @@
 """HTML utilities suitable for global use."""
 
+import html
 import json
 import re
 from html.parser import HTMLParser
@@ -235,6 +235,16 @@ def smart_urlquote(url):
     return urlunsplit((scheme, netloc, path, query, fragment))
 
 
+class CountsDict(dict):
+    def __init__(self, *args, word, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.word = word
+
+    def __missing__(self, key):
+        self[key] = self.word.count(key)
+        return self[key]
+
+
 @keep_lazy_text
 def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
     """
@@ -259,6 +269,15 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
             return x
         return '%s…' % x[:max(0, limit - 1)]
 
+    def wrapping_punctuation_openings():
+        return "".join(dict(WRAPPING_PUNCTUATION).keys())
+
+    def trailing_punctuation_chars_no_semicolon():
+        return TRAILING_PUNCTUATION_CHARS.replace(";", "")
+
+    def trailing_punctuation_chars_has_semicolon():
+        return ";" in TRAILING_PUNCTUATION_CHARS
+
     def unescape(text):
         """
         If input URL is HTML-escaped, unescape it so that it can be safely fed
@@ -273,21 +292,53 @@ def urlize(text, trim_url_limit=None, nofollow=False, autoescape=False):
         Trim trailing and wrapping punctuation from `middle`. Return the items
         of the new state.
         """
+        # Strip all opening wrapping punctuation.
+        middle = word.lstrip(wrapping_punctuation_openings())
+        lead = word[: len(word) - len(middle)]
+        trail = ""
+
         # Continue trimming until middle remains unchanged.
         trimmed_something = True
-        while trimmed_something:
+        counts = CountsDict(word=middle)
+        while trimmed_something and middle:
             trimmed_something = False
             # Trim wrapping punctuation.
             for opening, closing in WRAPPING_PUNCTUATION:
-                if middle.startswith(opening):
-                    middle = middle[len(opening):]
-                    lead += opening
-                    trimmed_something = True
-                # Keep parentheses at the end only if they're balanced.
-                if (middle.endswith(closing) and
-                        middle.count(closing) == middle.count(opening) + 1):
-                    middle = middle[:-len(closing)]
-                    trail = closing + trail
+                if counts[opening] < counts[closing]:
+                    rstripped = middle.rstrip(closing)
+                    if rstripped != middle:
+                        strip = counts[closing] - counts[opening]
+                        trail = middle[-strip:] + trail
+                        middle = middle[:-strip]
+                        trimmed_something = True
+                        counts[closing] -= strip
+
+            rstripped = middle.rstrip(trailing_punctuation_chars_no_semicolon())
+            if rstripped != middle:
+                trail = middle[len(rstripped) :] + trail
+                middle = rstripped
+                trimmed_something = True
+
+            if trailing_punctuation_chars_has_semicolon() and middle.endswith(";"):
+                # Only strip if not part of an HTML entity.
+                amp = middle.rfind("&")
+                if amp == -1:
+                    can_strip = True
+                else:
+                    potential_entity = middle[amp:]
+                    escaped = html.unescape(potential_entity)
+                    can_strip = (escaped == potential_entity) or escaped.endswith(";")
+
+                if can_strip:
+                    rstripped = middle.rstrip(";")
+                    amount_stripped = len(middle) - len(rstripped)
+                    if amp > -1 and amount_stripped > 1:
+                        # Leave a trailing semicolon as might be an entity.
+                        trail = middle[len(rstripped) + 1 :] + trail
+                        middle = rstripped + ";"
+                    else:
+                        trail = middle[len(rstripped) :] + trail
+                        middle = rstripped
                     trimmed_something = True
             # Trim trailing punctuation (after trimming wrapping punctuation,
             # as encoded entities contain ';'). Unescape entites to avoid
diff --git a/tests/utils_tests/test_html.py b/tests/utils_tests/test_html.py
index 5cc2d9b95d..dc89009b63 100644
--- a/tests/utils_tests/test_html.py
+++ b/tests/utils_tests/test_html.py
@@ -260,13 +260,20 @@ class TestUtilsHtml(SimpleTestCase):
 
     def test_urlize_unchanged_inputs(self):
         tests = (
-            ('a' + '@a' * 50000) + 'a',  # simple_email_re catastrophic test
-            ('a' + '.' * 1000000) + 'a',  # trailing_punctuation catastrophic test
-            'foo@',
-            '@foo.com',
-            'foo@.example.com',
-            'foo@localhost',
-            'foo@localhost.',
+            ("a" + "@a" * 50000) + "a",  # simple_email_re catastrophic test
+            ("a" + "." * 1000000) + "a",  # trailing_punctuation catastrophic test
+            "foo@",
+            "@foo.com",
+            "foo@.example.com",
+            "foo@localhost",
+            "foo@localhost.",
+            # trim_punctuation catastrophic tests
+            "(" * 100_000 + ":" + ")" * 100_000,
+            "(" * 100_000 + "&:" + ")" * 100_000,
+            "([" * 100_000 + ":" + "])" * 100_000,
+            "[(" * 100_000 + ":" + ")]" * 100_000,
+            "([[" * 100_000 + ":" + "]])" * 100_000,
+            "&:" + ";" * 100_000,
         )
         for value in tests:
             with self.subTest(value=value):
-- 
2.45.2