@@ -9,10 +9,8 @@
 import json
 import logging
 import os
-import re
 import urllib.parse
 import urllib.request
-import xml.etree.ElementTree
 from html.parser import HTMLParser
 from optparse import Values
 from typing import (
@@ -39,7 +37,7 @@
 from pip._internal.network.session import PipSession
 from pip._internal.network.utils import raise_for_status
 from pip._internal.utils.filetypes import is_archive_file
-from pip._internal.utils.misc import pairwise, redact_auth_from_url
+from pip._internal.utils.misc import redact_auth_from_url
 from pip._internal.vcs import vcs

 from .sources import CandidatesFromPage, LinkSource, build_source
@@ -51,7 +49,6 @@

 logger = logging.getLogger(__name__)

-HTMLElement = xml.etree.ElementTree.Element
 ResponseHeaders = MutableMapping[str, str]

@@ -191,94 +188,6 @@ def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
     return None


-def _clean_url_path_part(part: str) -> str:
-    """
-    Clean a "part" of a URL path (i.e. after splitting on "@" characters).
-    """
-    # We unquote prior to quoting to make sure nothing is double quoted.
-    return urllib.parse.quote(urllib.parse.unquote(part))
-
-
-def _clean_file_url_path(part: str) -> str:
-    """
-    Clean the first part of a URL path that corresponds to a local
-    filesystem path (i.e. the first part after splitting on "@" characters).
-    """
-    # We unquote prior to quoting to make sure nothing is double quoted.
-    # Also, on Windows the path part might contain a drive letter which
-    # should not be quoted. On Linux where drive letters do not
-    # exist, the colon should be quoted. We rely on urllib.request
-    # to do the right thing here.
-    return urllib.request.pathname2url(urllib.request.url2pathname(part))
-
-
-# percent-encoded:                   /
-_reserved_chars_re = re.compile("(@|%2F)", re.IGNORECASE)
-
-
-def _clean_url_path(path: str, is_local_path: bool) -> str:
-    """
-    Clean the path portion of a URL.
-    """
-    if is_local_path:
-        clean_func = _clean_file_url_path
-    else:
-        clean_func = _clean_url_path_part
-
-    # Split on the reserved characters prior to cleaning so that
-    # revision strings in VCS URLs are properly preserved.
-    parts = _reserved_chars_re.split(path)
-
-    cleaned_parts = []
-    for to_clean, reserved in pairwise(itertools.chain(parts, [""])):
-        cleaned_parts.append(clean_func(to_clean))
-        # Normalize %xx escapes (e.g. %2f -> %2F)
-        cleaned_parts.append(reserved.upper())
-
-    return "".join(cleaned_parts)
-
-
-def _clean_link(url: str) -> str:
-    """
-    Make sure a link is fully quoted.
-    For example, if ' ' occurs in the URL, it will be replaced with "%20",
-    and without double-quoting other characters.
-    """
-    # Split the URL into parts according to the general structure
-    # `scheme://netloc/path;parameters?query#fragment`.
-    result = urllib.parse.urlparse(url)
-    # If the netloc is empty, then the URL refers to a local filesystem path.
-    is_local_path = not result.netloc
-    path = _clean_url_path(result.path, is_local_path=is_local_path)
-    return urllib.parse.urlunparse(result._replace(path=path))
-
-
-def _create_link_from_element(
-    element_attribs: Dict[str, Optional[str]],
-    page_url: str,
-    base_url: str,
-) -> Optional[Link]:
-    """
-    Convert an anchor element's attributes in a simple repository page to a Link.
-    """
-    href = element_attribs.get("href")
-    if not href:
-        return None
-
-    url = _clean_link(urllib.parse.urljoin(base_url, href))
-    pyrequire = element_attribs.get("data-requires-python")
-    yanked_reason = element_attribs.get("data-yanked")
-
-    link = Link(
-        url,
-        comes_from=page_url,
-        requires_python=pyrequire,
-        yanked_reason=yanked_reason,
-    )
-
-    return link
-
-
 class CacheablePageContent:
     def __init__(self, page: "IndexContent") -> None:
         assert page.cache_link_parsing
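For reference, the two tricks the removed helpers relied on can be seen with the stdlib alone. A minimal standalone sketch; the names here are illustrative, not pip's API:

import re
import urllib.parse

def clean_part(part: str) -> str:
    # Unquote before quoting so an already-encoded "%20" is not
    # re-encoded to "%2520" (the "nothing is double quoted" comment above).
    return urllib.parse.quote(urllib.parse.unquote(part))

print(clean_part("a b"))    # -> a%20b
print(clean_part("a%20b"))  # -> a%20b  (no double quoting)

# The split that keeps "@" revision separators in VCS URLs intact,
# mirroring the removed _reserved_chars_re:
reserved = re.compile("(@|%2F)", re.IGNORECASE)
print(reserved.split("/repo.git@v1.0"))  # -> ['/repo.git', '@', 'v1.0']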
@@ -326,25 +235,10 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
         data = json.loads(page.content)
         for file in data.get("files", []):
-            file_url = file.get("url")
-            if file_url is None:
+            link = Link.from_json(file, page.url)
+            if link is None:
                 continue
-
-            # The Link.yanked_reason expects an empty string instead of a boolean.
-            yanked_reason = file.get("yanked")
-            if yanked_reason and not isinstance(yanked_reason, str):
-                yanked_reason = ""
-            # The Link.yanked_reason expects None instead of False
-            elif not yanked_reason:
-                yanked_reason = None
-
-            yield Link(
-                _clean_link(urllib.parse.urljoin(page.url, file_url)),
-                comes_from=page.url,
-                requires_python=file.get("requires-python"),
-                yanked_reason=yanked_reason,
-                hashes=file.get("hashes", {}),
-            )
+            yield link
         return

     parser = HTMLLinkParser(page.url)
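The inlined branch removed above normalised the PEP 691 "yanked" field, which may be a boolean or a reason string, before building a Link. A sketch of that normalisation, assuming Link.from_json now does the equivalent internally; normalize_yanked is a hypothetical name:

from typing import Optional, Union

def normalize_yanked(value: Union[bool, str, None]) -> Optional[str]:
    # Link.yanked_reason expects a string when yanked, None when not.
    if value and not isinstance(value, str):
        return ""    # yanked: true, but no reason given
    if not value:
        return None  # yanked: false or absent -> not yanked
    return value     # yanked: "<reason>"

assert normalize_yanked(True) == ""
assert normalize_yanked(False) is None
assert normalize_yanked("broken wheel") == "broken wheel"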
@@ -354,11 +248,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]:
     url = page.url
     base_url = parser.base_url or url
     for anchor in parser.anchors:
-        link = _create_link_from_element(
-            anchor,
-            page_url=url,
-            base_url=base_url,
-        )
+        link = Link.from_element(anchor, page_url=url, base_url=base_url)
         if link is None:
             continue
         yield link
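As the removed _create_link_from_element showed, each anchor's href is resolved against the page's base URL with urllib.parse.urljoin before the Link is built. A quick illustration with made-up URLs:

import urllib.parse

print(urllib.parse.urljoin("https://pypi.org/simple/pip/", "pip-22.2.tar.gz"))
# -> https://pypi.org/simple/pip/pip-22.2.tar.gz
print(urllib.parse.urljoin("https://pypi.org/simple/pip/", "/other/loc.whl"))
# -> https://pypi.org/other/loc.whl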