Skip to content

Commit

Permalink
Review URL percent escaping sets, from whatwg. (#3371)
Browse files Browse the repository at this point in the history
  • Loading branch information
tomchristie authored Oct 28, 2024
1 parent 489fef4 commit d293374
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 27 deletions.
92 changes: 68 additions & 24 deletions httpx/_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,66 @@

PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}")

# https://url.spec.whatwg.org/#percent-encoded-bytes
#
# The WHATWG URL spec defines, per URL component, a "percent-encode set":
# the characters that must be escaped in that component. The `*_SAFE`
# constants below are the complement of those sets within printable ASCII
# (U+0020 to U+007E inclusive), and are passed as the `safe` argument when
# quoting each component. Each encode set is built on top of the previous
# one, mirroring the way the spec itself defines them, so that a character
# only ever has to be listed once.


def _safe_chars(percent_encode_set: str) -> str:
    """Return the printable ASCII characters not in `percent_encode_set`.

    The result is ordered by code point, from U+0020 to U+007E.
    """
    return "".join(
        char
        for char in map(chr, range(0x20, 0x7F))
        if char not in percent_encode_set
    )


# The fragment percent-encode set is the C0 control percent-encode set
# and U+0020 SPACE, U+0022 ("), U+003C (<), U+003E (>), and U+0060 (`).
_FRAGMENT_ENCODE_SET = ' "<>`'
FRAG_SAFE = _safe_chars(_FRAGMENT_ENCODE_SET)

# The query percent-encode set is the C0 control percent-encode set
# and U+0020 SPACE, U+0022 ("), U+0023 (#), U+003C (<), and U+003E (>).
_QUERY_ENCODE_SET = ' "#<>'
QUERY_SAFE = _safe_chars(_QUERY_ENCODE_SET)

# The path percent-encode set is the query percent-encode set
# and U+003F (?), U+0060 (`), U+007B ({), and U+007D (}).
_PATH_ENCODE_SET = _QUERY_ENCODE_SET + "?`{}"
PATH_SAFE = _safe_chars(_PATH_ENCODE_SET)

# The userinfo percent-encode set is the path percent-encode set
# and U+002F (/), U+003A (:), U+003B (;), U+003D (=), U+0040 (@),
# U+005B ([) to U+005E (^), inclusive, and U+007C (|).
_USERINFO_ENCODE_SET = _PATH_ENCODE_SET + "/:;=@[\\]^|"
USERNAME_SAFE = _safe_chars(_USERINFO_ENCODE_SET)
# Usernames and passwords share the same percent-encode set.
PASSWORD_SAFE = USERNAME_SAFE

# Note... The terminology 'userinfo' percent-encode set in the WHATWG document
# is used for the username and password quoting. For the joint userinfo
# component ("username:password") the U+003A (:) separator must be left
# unescaped, so we drop it from the percent-encode set, which makes it
# part of the safe set.
USERINFO_SAFE = _safe_chars(_USERINFO_ENCODE_SET.replace(":", ""))

# {scheme}: (optional)
# //{authority} (optional)
Expand Down Expand Up @@ -182,8 +242,8 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:

# Replace "username" and/or "password" with "userinfo".
if "username" in kwargs or "password" in kwargs:
username = quote(kwargs.pop("username", "") or "")
password = quote(kwargs.pop("password", "") or "")
username = quote(kwargs.pop("username", "") or "", safe=USERNAME_SAFE)
password = quote(kwargs.pop("password", "") or "", safe=PASSWORD_SAFE)
kwargs["userinfo"] = f"{username}:{password}" if password else username

# Replace "raw_path" with "path" and "query".
Expand Down Expand Up @@ -238,7 +298,7 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
authority = kwargs.get("authority", url_dict["authority"]) or ""
path = kwargs.get("path", url_dict["path"]) or ""
query = kwargs.get("query", url_dict["query"])
fragment = kwargs.get("fragment", url_dict["fragment"])
frag = kwargs.get("fragment", url_dict["fragment"])

# The AUTHORITY_REGEX will always match, but may have empty components.
authority_match = AUTHORITY_REGEX.match(authority)
Expand All @@ -255,7 +315,7 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
# We end up with a parsed representation of the URL,
# with components that are plain ASCII bytestrings.
parsed_scheme: str = scheme.lower()
parsed_userinfo: str = quote(userinfo, safe=SUB_DELIMS + ":")
parsed_userinfo: str = quote(userinfo, safe=USERINFO_SAFE)
parsed_host: str = encode_host(host)
parsed_port: int | None = normalize_port(port, scheme)

Expand All @@ -267,25 +327,9 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
if has_scheme or has_authority:
path = normalize_path(path)

# The GEN_DELIMS set is... : / ? # [ ] @
# These do not need to be percent-quoted unless they serve as delimiters for the
# specific component.
WHATWG_SAFE = '`{}%|^\\"'

# For 'path' we need to drop ? and # from the GEN_DELIMS set.
parsed_path: str = quote(path, safe=SUB_DELIMS + WHATWG_SAFE + ":/[]@")
# For 'query' we need to drop '#' from the GEN_DELIMS set.
parsed_query: str | None = (
None
if query is None
else quote(query, safe=SUB_DELIMS + WHATWG_SAFE + ":/?[]@")
)
# For 'fragment' we can include all of the GEN_DELIMS set.
parsed_fragment: str | None = (
None
if fragment is None
else quote(fragment, safe=SUB_DELIMS + WHATWG_SAFE + ":/?#[]@")
)
parsed_path: str = quote(path, safe=PATH_SAFE)
parsed_query: str | None = None if query is None else quote(query, safe=QUERY_SAFE)
parsed_frag: str | None = None if frag is None else quote(frag, safe=FRAG_SAFE)

# The parsed ASCII bytestrings are our canonical form.
# All properties of the URL are derived from these.
Expand All @@ -296,7 +340,7 @@ def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
parsed_port,
parsed_path,
parsed_query,
parsed_fragment,
parsed_frag,
)


Expand Down
6 changes: 3 additions & 3 deletions tests/models/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ def test_param_with_space():
assert str(url) == "http://webservice?u=with%20spaces"


def test_param_does_not_require_encoding():
def test_param_requires_encoding():
# Params passed as form key-value pairs should be escaped.
url = httpx.URL("http://webservice", params={"u": "%"})
assert str(url) == "http://webservice?u=%25"
Expand Down Expand Up @@ -614,10 +614,10 @@ def test_url_copywith_userinfo_subcomponents():
}
url = httpx.URL("https://example.org")
new = url.copy_with(**copy_with_kwargs)
assert str(new) == "https://tom%40example.org:abc123%40%20%25@example.org"
assert str(new) == "https://tom%40example.org:abc123%40%20%@example.org"
assert new.username == "tom@example.org"
assert new.password == "abc123@ %"
assert new.userinfo == b"tom%40example.org:abc123%40%20%25"
assert new.userinfo == b"tom%40example.org:abc123%40%20%"


def test_url_copywith_invalid_component():
Expand Down

0 comments on commit d293374

Please sign in to comment.