From ef658dc3b6389b091d608e710a810ce8b87995b3 Mon Sep 17 00:00:00 2001 From: David Lord Date: Sun, 31 Jan 2021 07:54:40 -0800 Subject: [PATCH] speed up urlize matching --- CHANGES.rst | 10 +++++ src/jinja2/utils.py | 107 +++++++++++++++++++++++--------------------- 2 files changed, 66 insertions(+), 51 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 9b8b24ee0..6dfe91258 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,5 +1,15 @@ .. currentmodule:: jinja2 +Version 2.11.3 +-------------- + +Unreleased + +- Improve the speed of the ``urlize`` filter by reducing regex + backtracking. Email matching requires a word character at the start + of the domain part, and only word characters in the TLD. :pr:`1343` + + Version 2.11.2 -------------- diff --git a/src/jinja2/utils.py b/src/jinja2/utils.py index b422ba968..6afca8105 100644 --- a/src/jinja2/utils.py +++ b/src/jinja2/utils.py @@ -6,6 +6,8 @@ from collections import deque from random import choice from random import randrange +from string import ascii_letters as _letters +from string import digits as _digits from threading import Lock from markupsafe import escape @@ -16,20 +18,6 @@ from ._compat import text_type from ._compat import url_quote -_word_split_re = re.compile(r"(\s+)") -_punctuation_re = re.compile( - "^(?P(?:%s)*)(?P.*?)(?P(?:%s)*)$" - % ( - "|".join(map(re.escape, ("(", "<", "<"))), - "|".join(map(re.escape, (".", ",", ")", ">", "\n", ">"))), - ) -) -_simple_email_re = re.compile(r"^\S+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9._-]+$") -_striptags_re = re.compile(r"(|<[^>]*>)") -_entity_re = re.compile(r"&([^;]+);") -_letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" -_digits = "0123456789" - # special singleton representing missing values for the runtime missing = type("MissingType", (), {"__repr__": lambda x: "missing"})() @@ -210,48 +198,65 @@ def urlize(text, trim_url_limit=None, rel=None, target=None): and (x[:limit] + (len(x) >= limit and "..." or "")) or x ) - words = _word_split_re.split(text_type(escape(text))) + words = re.split(r"(\s+)", text_type(escape(text))) rel_attr = rel and ' rel="%s"' % text_type(escape(rel)) or "" target_attr = target and ' target="%s"' % escape(target) or "" for i, word in enumerate(words): - match = _punctuation_re.match(word) + head, middle, tail = "", word, "" + match = re.match(r"^([(<]|<)+", middle) + if match: - lead, middle, trail = match.groups() - if middle.startswith("www.") or ( - "@" not in middle - and not middle.startswith("http://") - and not middle.startswith("https://") - and len(middle) > 0 - and middle[0] in _letters + _digits - and ( - middle.endswith(".org") - or middle.endswith(".net") - or middle.endswith(".com") - ) - ): - middle = '%s' % ( - middle, - rel_attr, - target_attr, - trim_url(middle), - ) - if middle.startswith("http://") or middle.startswith("https://"): - middle = '%s' % ( - middle, - rel_attr, - target_attr, - trim_url(middle), - ) - if ( - "@" in middle - and not middle.startswith("www.") - and ":" not in middle - and _simple_email_re.match(middle) - ): - middle = '%s' % (middle, middle) - if lead + middle + trail != word: - words[i] = lead + middle + trail + head = match.group() + middle = middle[match.end() :] + + # Unlike lead, which is anchored to the start of the string, + # need to check that the string ends with any of the characters + # before trying to match all of them, to avoid backtracking. + if middle.endswith((")", ">", ".", ",", "\n", ">")): + match = re.search(r"([)>.,\n]|>)+$", middle) + + if match: + tail = match.group() + middle = middle[: match.start()] + + if middle.startswith("www.") or ( + "@" not in middle + and not middle.startswith("http://") + and not middle.startswith("https://") + and len(middle) > 0 + and middle[0] in _letters + _digits + and ( + middle.endswith(".org") + or middle.endswith(".net") + or middle.endswith(".com") + ) + ): + middle = '%s' % ( + middle, + rel_attr, + target_attr, + trim_url(middle), + ) + + if middle.startswith("http://") or middle.startswith("https://"): + middle = '%s' % ( + middle, + rel_attr, + target_attr, + trim_url(middle), + ) + + if ( + "@" in middle + and not middle.startswith("www.") + and ":" not in middle + and re.match(r"^\S+@\w[\w.-]*\.\w+$", middle) + ): + middle = '%s' % (middle, middle) + + words[i] = head + middle + tail + return u"".join(words)