comparison venv/lib/python2.7/site-packages/pip/index.py @ 0:d67268158946 draft

planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author bcclaywell
date Mon, 12 Oct 2015 17:43:33 -0400
1 """Routines related to PyPI, indexes"""
2 from __future__ import absolute_import
3
4 import logging
5 import cgi
6 import sys
7 import os
8 import re
9 import mimetypes
10 import posixpath
11 import warnings
12
13 from pip._vendor.six.moves.urllib import parse as urllib_parse
14 from pip._vendor.six.moves.urllib import request as urllib_request
15
16 from pip.compat import ipaddress
17 from pip.utils import (
18 Inf, cached_property, normalize_name, splitext, normalize_path)
19 from pip.utils.deprecation import RemovedInPip7Warning, RemovedInPip8Warning
20 from pip.utils.logging import indent_log
21 from pip.exceptions import (
22 DistributionNotFound, BestVersionAlreadyInstalled, InvalidWheelFilename,
23 UnsupportedWheel,
24 )
25 from pip.download import url_to_path, path_to_url
26 from pip.models import PyPI
27 from pip.wheel import Wheel, wheel_ext
28 from pip.pep425tags import supported_tags, supported_tags_noarch, get_platform
29 from pip.req.req_requirement import InstallationCandidate
30 from pip._vendor import html5lib, requests, pkg_resources, six
31 from pip._vendor.packaging.version import parse as parse_version
32 from pip._vendor.requests.exceptions import SSLError
33
34
35 __all__ = ['PackageFinder']
36
37
38 # Taken from Chrome's list of secure origins (See: http://bit.ly/1qrySKC)
39 SECURE_ORIGINS = [
40 # protocol, hostname, port
41 ("https", "*", "*"),
42 ("*", "localhost", "*"),
43 ("*", "127.0.0.0/8", "*"),
44 ("*", "::1/128", "*"),
45 ("file", "*", None),
46 ]
47
48
49 logger = logging.getLogger(__name__)
50
51
52 class PackageFinder(object):
53 """This finds packages.
54
55 This is meant to match easy_install's technique for looking for
56 packages, by reading pages and looking for appropriate links.
57 """
58
59 def __init__(self, find_links, index_urls,
60 use_wheel=True, allow_external=(), allow_unverified=(),
61 allow_all_external=False, allow_all_prereleases=False,
62 trusted_hosts=None, process_dependency_links=False,
63 session=None):
64 if session is None:
65 raise TypeError(
66 "PackageFinder() missing 1 required keyword argument: "
67 "'session'"
68 )
69
70 # Build find_links. If an argument starts with ~, it may be
71 # a local file relative to a home directory. So try normalizing
72 # it and if it exists, use the normalized version.
73 # This is deliberately conservative - it might be fine just to
74 # blindly normalize anything starting with a ~...
75 self.find_links = []
76 for link in find_links:
77 if link.startswith('~'):
78 new_link = normalize_path(link)
79 if os.path.exists(new_link):
80 link = new_link
81 self.find_links.append(link)
82
83 self.index_urls = index_urls
84 self.dependency_links = []
85
86 # These are boring links that have already been logged somehow:
87 self.logged_links = set()
88
89 self.use_wheel = use_wheel
90
91 # Do we allow (safe and verifiable) externally hosted files?
92 self.allow_external = set(normalize_name(n) for n in allow_external)
93
94 # Which names are allowed to install insecure and unverifiable files?
95 self.allow_unverified = set(
96 normalize_name(n) for n in allow_unverified
97 )
98
99 # Anything that is allowed unverified is also allowed external
100 self.allow_external |= self.allow_unverified
101
102 # Do we allow all (safe and verifiable) externally hosted files?
103 self.allow_all_external = allow_all_external
104
105 # Domains that we won't emit warnings for when not using HTTPS
106 self.secure_origins = [
107 ("*", host, "*")
108 for host in (trusted_hosts if trusted_hosts else [])
109 ]
110
111 # Stores if we ignored any external links so that we can instruct
112 # end users how to install them if no distributions are available
113 self.need_warn_external = False
114
115 # Stores if we ignored any unsafe links so that we can instruct
116 # end users how to install them if no distributions are available
117 self.need_warn_unverified = False
118
119 # Do we want to allow _all_ pre-releases?
120 self.allow_all_prereleases = allow_all_prereleases
121
122 # Do we process dependency links?
123 self.process_dependency_links = process_dependency_links
124
125 # The Session we'll use to make requests
126 self.session = session
127
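# Illustrative sketch (annotation, not part of pip itself): constructing a
# PackageFinder requires an explicit session, as the TypeError above enforces.
# PipSession and the index URL below are assumptions used only for
# illustration.
#
# >>> from pip.download import PipSession
# >>> from pip.index import PackageFinder
# >>> finder = PackageFinder(find_links=[],
# ...                        index_urls=["https://pypi.python.org/simple/"],
# ...                        session=PipSession())
# >>> PackageFinder(find_links=[], index_urls=[])
# Traceback (most recent call last):
#     ...
# TypeError: PackageFinder() missing 1 required keyword argument: 'session'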
128 def add_dependency_links(self, links):
129 # # FIXME: this shouldn't be a global list; it should only
130 # # apply to requirements of the package that specifies the
131 # # dependency_links value
132 # # FIXME: also, we should track comes_from (i.e., use Link)
133 if self.process_dependency_links:
134 warnings.warn(
135 "Dependency Links processing has been deprecated and will be "
136 "removed in a future release.",
137 RemovedInPip7Warning,
138 )
139 self.dependency_links.extend(links)
140
141 def _sort_locations(self, locations):
142 """
143 Sort locations into "files" (archives) and "urls", and return
144 a pair of lists (files, urls)
145 """
146 files = []
147 urls = []
148
149 # puts the url for the given file path into the appropriate list
150 def sort_path(path):
151 url = path_to_url(path)
152 if mimetypes.guess_type(url, strict=False)[0] == 'text/html':
153 urls.append(url)
154 else:
155 files.append(url)
156
157 for url in locations:
158
159 is_local_path = os.path.exists(url)
160 is_file_url = url.startswith('file:')
161 is_find_link = url in self.find_links
162
163 if is_local_path or is_file_url:
164 if is_local_path:
165 path = url
166 else:
167 path = url_to_path(url)
168 if is_find_link and os.path.isdir(path):
169 path = os.path.realpath(path)
170 for item in os.listdir(path):
171 sort_path(os.path.join(path, item))
172 elif is_file_url and os.path.isdir(path):
173 urls.append(url)
174 elif os.path.isfile(path):
175 sort_path(path)
176 else:
177 urls.append(url)
178
179 return files, urls
180
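# Illustrative sketch (annotation, not part of pip itself): the split between
# "files" and "urls" above hinges on the guessed MIME type of the path; the
# example paths are hypothetical.
#
# >>> import mimetypes
# >>> mimetypes.guess_type("file:///srv/index/simple.html", strict=False)[0]
# 'text/html'
# >>> mimetypes.guess_type("file:///srv/index/pip-6.0.8.tar.gz", strict=False)[0]
# 'application/x-tar'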
181 def _candidate_sort_key(self, candidate):
182 """
183 Function used to generate the sort key for installation candidates.
184 The greater the return value, the more preferred it is.
185 If wheels are not in use, candidates are sorted by version only.
186 If wheels are in use, the sort order is by version, then:
187 1. existing installs
188 2. wheels ordered via Wheel.support_index_min()
189 3. source archives
190 Note: it was considered to embed this logic into the Link
191 comparison operators, but then different sdist links
192 with the same version would have to be considered equal
193 """
194 if self.use_wheel:
195 support_num = len(supported_tags)
196 if candidate.location == INSTALLED_VERSION:
197 pri = 1
198 elif candidate.location.is_wheel:
199 # can raise InvalidWheelFilename
200 wheel = Wheel(candidate.location.filename)
201 if not wheel.supported():
202 raise UnsupportedWheel(
203 "%s is not a supported wheel for this platform. It "
204 "can't be sorted." % wheel.filename
205 )
206 pri = -(wheel.support_index_min())
207 else: # sdist
208 pri = -(support_num)
209 return (candidate.version, pri)
210 else:
211 return candidate.version
212
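# Illustrative sketch (annotation, not part of pip itself): with wheels
# enabled the key is a (version, priority) tuple, so for equal versions an
# existing install (pri = 1) beats any wheel (pri = -support_index_min(), at
# most 0), which in turn beats an sdist (pri = -len(supported_tags)). The
# numbers below are hypothetical.
#
# >>> installed, wheel, sdist = ("1.2", 1), ("1.2", -3), ("1.2", -30)
# >>> sorted([sdist, installed, wheel], reverse=True)[0]
# ('1.2', 1)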
213 def _sort_versions(self, applicable_versions):
214 """
215 Bring the latest version (and wheels) to the front, but maintain the
216 existing ordering as secondary. See the docstring for `_candidate_sort_key`
217 for details. This function is isolated for easier unit testing.
218 """
219 return sorted(
220 applicable_versions,
221 key=self._candidate_sort_key,
222 reverse=True
223 )
224
225 def _validate_secure_origin(self, logger, location):
226 # Determine if this url used a secure transport mechanism
227 parsed = urllib_parse.urlparse(str(location))
228 origin = (parsed.scheme, parsed.hostname, parsed.port)
229
230 # Determine if our origin is a secure origin by looking through our
231 # hardcoded list of secure origins, as well as any additional ones
232 # configured on this PackageFinder instance.
233 for secure_origin in (SECURE_ORIGINS + self.secure_origins):
234 # Check to see if the protocol matches
235 if origin[0] != secure_origin[0] and secure_origin[0] != "*":
236 continue
237
238 try:
239 # We need to do this decode dance to ensure that we have a
240 # unicode object, even on Python 2.x.
241 addr = ipaddress.ip_address(
242 origin[1]
243 if (
244 isinstance(origin[1], six.text_type) or
245 origin[1] is None
246 )
247 else origin[1].decode("utf8")
248 )
249 network = ipaddress.ip_network(
250 secure_origin[1]
251 if isinstance(secure_origin[1], six.text_type)
252 else secure_origin[1].decode("utf8")
253 )
254 except ValueError:
255 # We don't have both a valid address or a valid network, so
256 # we'll check this origin against hostnames.
257 if origin[1] != secure_origin[1] and secure_origin[1] != "*":
258 continue
259 else:
260 # We have a valid address and network, so see if the address
261 # is contained within the network.
262 if addr not in network:
263 continue
264
265 # Check to see if the port matches
266 if (origin[2] != secure_origin[2] and
267 secure_origin[2] != "*" and
268 secure_origin[2] is not None):
269 continue
270
271 # If we've gotten here, then this origin matches the current
272 # secure origin and we should break out of the loop and continue
273 # on.
274 break
275 else:
276 # If the loop successfully completed without a break, that means
277 # that the origin we are testing is not a secure origin.
278 logger.warning(
279 "This repository located at %s is not a trusted host, if "
280 "this repository is available via HTTPS it is recommend to "
281 "use HTTPS instead, otherwise you may silence this warning "
282 "with '--trusted-host %s'.",
283 parsed.hostname,
284 parsed.hostname,
285 )
286
287 warnings.warn(
288 "Implicitly allowing locations which are not hosted at a "
289 "secure origin is deprecated and will require the use of "
290 "--trusted-host in the future.",
291 RemovedInPip7Warning,
292 )
293
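# Illustrative sketch (annotation, not part of pip itself): the network
# entries in SECURE_ORIGINS, such as ("*", "127.0.0.0/8", "*"), are matched
# via the ipaddress module, so a plain-HTTP index on the loopback network
# raises no warning. Shown here with the stdlib ipaddress rather than the
# pip.compat wrapper.
#
# >>> import ipaddress
# >>> ipaddress.ip_address(u"127.0.0.1") in ipaddress.ip_network(u"127.0.0.0/8")
# True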
294 def _get_index_urls_locations(self, project_name):
295 """Returns the locations found via self.index_urls
296
297 Checks the url_name on the main (first in the list) index and
298 uses this url_name to produce all locations
299 """
300
301 def mkurl_pypi_url(url):
302 loc = posixpath.join(url, project_url_name)
303 # For maximum compatibility with easy_install, ensure the path
304 # ends in a trailing slash. Although this isn't in the spec
305 # (and PyPI can handle it without the slash) some other index
306 # implementations might break if they relied on easy_install's
307 # behavior.
308 if not loc.endswith('/'):
309 loc = loc + '/'
310 return loc
311
312 project_url_name = urllib_parse.quote(project_name.lower())
313
314 if self.index_urls:
315 # Check that we have the url_name correctly spelled:
316
317 # Only check main index if index URL is given
318 main_index_url = Link(
319 mkurl_pypi_url(self.index_urls[0]),
320 trusted=True,
321 )
322
323 page = self._get_page(main_index_url)
324 if page is None and PyPI.netloc not in str(main_index_url):
325 warnings.warn(
326 "Failed to find %r at %s. It is suggested to upgrade "
327 "your index to support normalized names as the name in "
328 "/simple/{name}." % (project_name, main_index_url),
329 RemovedInPip8Warning,
330 )
331
332 project_url_name = self._find_url_name(
333 Link(self.index_urls[0], trusted=True),
334 project_url_name,
335 ) or project_url_name
336
337 if project_url_name is not None:
338 return [mkurl_pypi_url(url) for url in self.index_urls]
339 return []
340
341 def _find_all_versions(self, project_name):
342 """Find all available versions for project_name
343
344 This checks index_urls, find_links and dependency_links
345 All versions found are returned
346
347 See _link_package_versions for details on which files are accepted
348 """
349 index_locations = self._get_index_urls_locations(project_name)
350 file_locations, url_locations = self._sort_locations(index_locations)
351 fl_file_loc, fl_url_loc = self._sort_locations(self.find_links)
352 file_locations.extend(fl_file_loc)
353 url_locations.extend(fl_url_loc)
354
355 _flocations, _ulocations = self._sort_locations(self.dependency_links)
356 file_locations.extend(_flocations)
357
358 # We trust every url that the user has given us whether it was given
359 # via --index-url or --find-links
360 locations = [Link(url, trusted=True) for url in url_locations]
361
362 # We explicitly do not trust links that came from dependency_links
363 locations.extend([Link(url) for url in _ulocations])
364
365 logger.debug('%d location(s) to search for versions of %s:',
366 len(locations), project_name)
367 for location in locations:
368 logger.debug('* %s', location)
369 self._validate_secure_origin(logger, location)
370
371 find_links_versions = list(self._package_versions(
372 # We trust every directly linked archive in find_links
373 (Link(url, '-f', trusted=True) for url in self.find_links),
374 project_name.lower()
375 ))
376
377 page_versions = []
378 for page in self._get_pages(locations, project_name):
379 logger.debug('Analyzing links from page %s', page.url)
380 with indent_log():
381 page_versions.extend(
382 self._package_versions(page.links, project_name.lower())
383 )
384
385 dependency_versions = list(self._package_versions(
386 (Link(url) for url in self.dependency_links), project_name.lower()
387 ))
388 if dependency_versions:
389 logger.debug(
390 'dependency_links found: %s',
391 ', '.join([
392 version.location.url for version in dependency_versions
393 ])
394 )
395
396 file_versions = list(
397 self._package_versions(
398 (Link(url) for url in file_locations),
399 project_name.lower()
400 )
401 )
402 if file_versions:
403 file_versions.sort(reverse=True)
404 logger.debug(
405 'Local files found: %s',
406 ', '.join([
407 url_to_path(candidate.location.url)
408 for candidate in file_versions
409 ])
410 )
411
412 # This is an intentional priority ordering
413 return (
414 file_versions + find_links_versions + page_versions +
415 dependency_versions
416 )
417
418 def find_requirement(self, req, upgrade):
419 """Try to find an InstallationCandidate for req
420
421 Expects req, an InstallRequirement, and upgrade, a boolean
422 Returns an InstallationCandidate or None
423 May raise DistributionNotFound or BestVersionAlreadyInstalled
424 """
425 all_versions = self._find_all_versions(req.name)
426 # Filter out anything which doesn't match our specifier
427
428 _versions = set(
429 req.specifier.filter(
430 [x.version for x in all_versions],
431 prereleases=(
432 self.allow_all_prereleases
433 if self.allow_all_prereleases else None
434 ),
435 )
436 )
437 applicable_versions = [
438 x for x in all_versions if x.version in _versions
439 ]
440
441 if req.satisfied_by is not None:
442 # Finally add our existing versions to the front of our versions.
443 applicable_versions.insert(
444 0,
445 InstallationCandidate(
446 req.name,
447 req.satisfied_by.version,
448 INSTALLED_VERSION,
449 )
450 )
451 existing_applicable = True
452 else:
453 existing_applicable = False
454
455 applicable_versions = self._sort_versions(applicable_versions)
456
457 if not upgrade and existing_applicable:
458 if applicable_versions[0].location is INSTALLED_VERSION:
459 logger.debug(
460 'Existing installed version (%s) is most up-to-date and '
461 'satisfies requirement',
462 req.satisfied_by.version,
463 )
464 else:
465 logger.debug(
466 'Existing installed version (%s) satisfies requirement '
467 '(most up-to-date version is %s)',
468 req.satisfied_by.version,
469 applicable_versions[0][2],
470 )
471 return None
472
473 if not applicable_versions:
474 logger.critical(
475 'Could not find a version that satisfies the requirement %s '
476 '(from versions: %s)',
477 req,
478 ', '.join(
479 sorted(
480 set(str(i.version) for i in all_versions),
481 key=parse_version,
482 )
483 )
484 )
485
486 if self.need_warn_external:
487 logger.warning(
488 "Some externally hosted files were ignored as access to "
489 "them may be unreliable (use --allow-external %s to "
490 "allow).",
491 req.name,
492 )
493
494 if self.need_warn_unverified:
495 logger.warning(
496 "Some insecure and unverifiable files were ignored"
497 " (use --allow-unverified %s to allow).",
498 req.name,
499 )
500
501 raise DistributionNotFound(
502 'No matching distribution found for %s' % req
503 )
504
505 if applicable_versions[0].location is INSTALLED_VERSION:
506 # We have an existing version, and it's the best version
507 logger.debug(
508 'Installed version (%s) is most up-to-date (past versions: '
509 '%s)',
510 req.satisfied_by.version,
511 ', '.join(str(i.version) for i in applicable_versions[1:]) or
512 "none",
513 )
514 raise BestVersionAlreadyInstalled
515
516 if len(applicable_versions) > 1:
517 logger.debug(
518 'Using version %s (newest of versions: %s)',
519 applicable_versions[0].version,
520 ', '.join(str(i.version) for i in applicable_versions)
521 )
522
523 selected_version = applicable_versions[0].location
524
525 if (selected_version.verifiable is not None and not
526 selected_version.verifiable):
527 logger.warning(
528 "%s is potentially insecure and unverifiable.", req.name,
529 )
530
531 if selected_version._deprecated_regex:
532 warnings.warn(
533 "%s discovered using a deprecated method of parsing, in the "
534 "future it will no longer be discovered." % req.name,
535 RemovedInPip7Warning,
536 )
537
538 return selected_version
539
540 def _find_url_name(self, index_url, url_name):
541 """
542 Finds the true URL name of a package, when the given name isn't quite
543 correct.
544 This is usually used to implement case-insensitivity.
545 """
546 if not index_url.url.endswith('/'):
547 # Vaguely part of the PyPI API... weird but true.
548 # FIXME: bad to modify this?
549 index_url.url += '/'
550 page = self._get_page(index_url)
551 if page is None:
552 logger.critical('Cannot fetch index base URL %s', index_url)
553 return
554 norm_name = normalize_name(url_name)
555 for link in page.links:
556 base = posixpath.basename(link.path.rstrip('/'))
557 if norm_name == normalize_name(base):
558 logger.debug(
559 'Real name of requirement %s is %s', url_name, base,
560 )
561 return base
562 return None
563
564 def _get_pages(self, locations, project_name):
565 """
566 Yields HTMLPage objects from the given locations, skipping
567 locations that have errors, and crawling download/homepage rel links
568 """
569 all_locations = list(locations)
570 seen = set()
571 normalized = normalize_name(project_name)
572
573 while all_locations:
574 location = all_locations.pop(0)
575 if location in seen:
576 continue
577 seen.add(location)
578
579 page = self._get_page(location)
580 if page is None:
581 continue
582
583 yield page
584
585 for link in page.rel_links():
586
587 if (normalized not in self.allow_external and not
588 self.allow_all_external):
589 self.need_warn_external = True
590 logger.debug(
591 "Not searching %s for files because external "
592 "urls are disallowed.",
593 link,
594 )
595 continue
596
597 if (link.trusted is not None and not
598 link.trusted and
599 normalized not in self.allow_unverified):
600 logger.debug(
601 "Not searching %s for urls, it is an "
602 "untrusted link and cannot produce safe or "
603 "verifiable files.",
604 link,
605 )
606 self.need_warn_unverified = True
607 continue
608
609 all_locations.append(link)
610
611 _egg_fragment_re = re.compile(r'#egg=([^&]*)')
612 _egg_info_re = re.compile(r'([a-z0-9_.]+)-([a-z0-9_.!+-]+)', re.I)
613 _py_version_re = re.compile(r'-py([123]\.?[0-9]?)$')
614
615 def _sort_links(self, links):
616 """
617 Returns elements of links in order, non-egg links first, egg links
618 second, while eliminating duplicates
619 """
620 eggs, no_eggs = [], []
621 seen = set()
622 for link in links:
623 if link not in seen:
624 seen.add(link)
625 if link.egg_fragment:
626 eggs.append(link)
627 else:
628 no_eggs.append(link)
629 return no_eggs + eggs
630
631 def _package_versions(self, links, search_name):
632 for link in self._sort_links(links):
633 v = self._link_package_versions(link, search_name)
634 if v is not None:
635 yield v
636
637 def _known_extensions(self):
638 extensions = ('.tar.gz', '.tar.bz2', '.tar', '.tgz', '.zip')
639 if self.use_wheel:
640 return extensions + (wheel_ext,)
641 return extensions
642
643 def _link_package_versions(self, link, search_name):
644 """Return an InstallationCandidate or None"""
645 platform = get_platform()
646
647 version = None
648 if link.egg_fragment:
649 egg_info = link.egg_fragment
650 else:
651 egg_info, ext = link.splitext()
652 if not ext:
653 if link not in self.logged_links:
654 logger.debug('Skipping link %s; not a file', link)
655 self.logged_links.add(link)
656 return
657 if egg_info.endswith('.tar'):
658 # Special double-extension case:
659 egg_info = egg_info[:-4]
660 ext = '.tar' + ext
661 if ext not in self._known_extensions():
662 if link not in self.logged_links:
663 logger.debug(
664 'Skipping link %s; unknown archive format: %s',
665 link,
666 ext,
667 )
668 self.logged_links.add(link)
669 return
670 if "macosx10" in link.path and ext == '.zip':
671 if link not in self.logged_links:
672 logger.debug('Skipping link %s; macosx10 one', link)
673 self.logged_links.add(link)
674 return
675 if ext == wheel_ext:
676 try:
677 wheel = Wheel(link.filename)
678 except InvalidWheelFilename:
679 logger.debug(
680 'Skipping %s because the wheel filename is invalid',
681 link
682 )
683 return
684 if (pkg_resources.safe_name(wheel.name).lower() !=
685 pkg_resources.safe_name(search_name).lower()):
686 logger.debug(
687 'Skipping link %s; wrong project name (not %s)',
688 link,
689 search_name,
690 )
691 return
692 if not wheel.supported():
693 logger.debug(
694 'Skipping %s because it is not compatible with this '
695 'Python',
696 link,
697 )
698 return
699 # This is a dirty hack to prevent installing Binary Wheels from
700 # PyPI unless it is a Windows or Mac Binary Wheel. This is
701 # paired with a change to PyPI disabling uploads for the
702 # same. Once we have a mechanism for enabling support for
703 # binary wheels on linux that deals with the inherent problems
704 # of binary distribution this can be removed.
705 comes_from = getattr(link, "comes_from", None)
706 if (
707 (
708 not platform.startswith('win') and not
709 platform.startswith('macosx') and not
710 platform == 'cli'
711 ) and
712 comes_from is not None and
713 urllib_parse.urlparse(
714 comes_from.url
715 ).netloc.endswith(PyPI.netloc)):
716 if not wheel.supported(tags=supported_tags_noarch):
717 logger.debug(
718 "Skipping %s because it is a pypi-hosted binary "
719 "Wheel on an unsupported platform",
720 link,
721 )
722 return
723 version = wheel.version
724
725 if not version:
726 version = self._egg_info_matches(egg_info, search_name, link)
727 if version is None:
728 logger.debug(
729 'Skipping link %s; wrong project name (not %s)',
730 link,
731 search_name,
732 )
733 return
734
735 if (link.internal is not None and not
736 link.internal and not
737 normalize_name(search_name).lower()
738 in self.allow_external and not
739 self.allow_all_external):
740 # We have a link that we are sure is external, so we should skip
741 # it unless we are allowing externals
742 logger.debug("Skipping %s because it is externally hosted.", link)
743 self.need_warn_external = True
744 return
745
746 if (link.verifiable is not None and not
747 link.verifiable and not
748 (normalize_name(search_name).lower()
749 in self.allow_unverified)):
750 # We have a link whose integrity we are sure we cannot verify,
751 # so we should skip it unless we are allowing unsafe installs
752 # for this requirement.
753 logger.debug(
754 "Skipping %s because it is an insecure and unverifiable file.",
755 link,
756 )
757 self.need_warn_unverified = True
758 return
759
760 match = self._py_version_re.search(version)
761 if match:
762 version = version[:match.start()]
763 py_version = match.group(1)
764 if py_version != sys.version[:3]:
765 logger.debug(
766 'Skipping %s because Python version is incorrect', link
767 )
768 return
769 logger.debug('Found link %s, version: %s', link, version)
770
771 return InstallationCandidate(search_name, version, link)
772
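# Illustrative sketch (annotation, not part of pip itself): the -pyX.Y suffix
# handled in _link_package_versions is stripped from the version string and
# compared against sys.version[:3]. The version string below is hypothetical.
#
# >>> import re
# >>> py_version_re = re.compile(r'-py([123]\.?[0-9]?)$')
# >>> m = py_version_re.search('6.0.8-py2.7')
# >>> '6.0.8-py2.7'[:m.start()], m.group(1)
# ('6.0.8', '2.7')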
773 def _egg_info_matches(self, egg_info, search_name, link):
774 match = self._egg_info_re.search(egg_info)
775 if not match:
776 logger.debug('Could not parse version from link: %s', link)
777 return None
778 name = match.group(0).lower()
779 # To match the "safe" name that pkg_resources creates:
780 name = name.replace('_', '-')
781 # project name and version must be separated by a dash
782 look_for = search_name.lower() + "-"
783 if name.startswith(look_for):
784 return match.group(0)[len(look_for):]
785 else:
786 return None
787
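# Illustrative sketch (annotation, not part of pip itself): what
# _egg_info_matches extracts, using the same pattern as _egg_info_re above;
# the project name and version are hypothetical.
#
# >>> import re
# >>> m = re.search(r'([a-z0-9_.]+)-([a-z0-9_.!+-]+)', 'pip-6.0.8', re.I)
# >>> m.group(0)[len('pip-'):]
# '6.0.8'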
788 def _get_page(self, link):
789 return HTMLPage.get_page(link, session=self.session)
790
791
792 class HTMLPage(object):
793 """Represents one page, along with its URL"""
794
795 # FIXME: these regexes are horrible hacks:
796 _homepage_re = re.compile(b'<th>\\s*home\\s*page', re.I)
797 _download_re = re.compile(b'<th>\\s*download\\s+url', re.I)
798 _href_re = re.compile(
799 b'href=(?:"([^"]*)"|\'([^\']*)\'|([^>\\s\\n]*))',
800 re.I | re.S
801 )
802
803 def __init__(self, content, url, headers=None, trusted=None):
804 # Determine if we have any encoding information in our headers
805 encoding = None
806 if headers and "Content-Type" in headers:
807 content_type, params = cgi.parse_header(headers["Content-Type"])
808
809 if "charset" in params:
810 encoding = params['charset']
811
812 self.content = content
813 self.parsed = html5lib.parse(
814 self.content,
815 encoding=encoding,
816 namespaceHTMLElements=False,
817 )
818 self.url = url
819 self.headers = headers
820 self.trusted = trusted
821
822 def __str__(self):
823 return self.url
824
825 @classmethod
826 def get_page(cls, link, skip_archives=True, session=None):
827 if session is None:
828 raise TypeError(
829 "get_page() missing 1 required keyword argument: 'session'"
830 )
831
832 url = link.url
833 url = url.split('#', 1)[0]
834
835 # Check for VCS schemes that do not support lookup as web pages.
836 from pip.vcs import VcsSupport
837 for scheme in VcsSupport.schemes:
838 if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
839 logger.debug('Cannot look at %s URL %s', scheme, link)
840 return None
841
842 try:
843 if skip_archives:
844 filename = link.filename
845 for bad_ext in ['.tar', '.tar.gz', '.tar.bz2', '.tgz', '.zip']:
846 if filename.endswith(bad_ext):
847 content_type = cls._get_content_type(
848 url, session=session,
849 )
850 if content_type.lower().startswith('text/html'):
851 break
852 else:
853 logger.debug(
854 'Skipping page %s because of Content-Type: %s',
855 link,
856 content_type,
857 )
858 return
859
860 logger.debug('Getting page %s', url)
861
862 # Tack index.html onto file:// URLs that point to directories
863 (scheme, netloc, path, params, query, fragment) = \
864 urllib_parse.urlparse(url)
865 if (scheme == 'file' and
866 os.path.isdir(urllib_request.url2pathname(path))):
867 # add trailing slash if not present so urljoin doesn't trim
868 # final segment
869 if not url.endswith('/'):
870 url += '/'
871 url = urllib_parse.urljoin(url, 'index.html')
872 logger.debug(' file: URL is directory, getting %s', url)
873
874 resp = session.get(
875 url,
876 headers={
877 "Accept": "text/html",
878 "Cache-Control": "max-age=600",
879 },
880 )
881 resp.raise_for_status()
882
883 # The check for archives above only works if the url ends with
884 # something that looks like an archive. However that is not a
885 # requirement of a url. Unless we issue a HEAD request on every
886 # url we cannot know ahead of time for sure if something is HTML
887 # or not. However we can check after we've downloaded it.
888 content_type = resp.headers.get('Content-Type', 'unknown')
889 if not content_type.lower().startswith("text/html"):
890 logger.debug(
891 'Skipping page %s because of Content-Type: %s',
892 link,
893 content_type,
894 )
895 return
896
897 inst = cls(
898 resp.content, resp.url, resp.headers,
899 trusted=link.trusted,
900 )
901 except requests.HTTPError as exc:
902 level = 2 if exc.response.status_code == 404 else 1
903 cls._handle_fail(link, exc, url, level=level)
904 except requests.ConnectionError as exc:
905 cls._handle_fail(link, "connection error: %s" % exc, url)
906 except requests.Timeout:
907 cls._handle_fail(link, "timed out", url)
908 except SSLError as exc:
909 reason = ("There was a problem confirming the ssl certificate: "
910 "%s" % exc)
911 cls._handle_fail(link, reason, url, level=2, meth=logger.info)
912 else:
913 return inst
914
915 @staticmethod
916 def _handle_fail(link, reason, url, level=1, meth=None):
917 if meth is None:
918 meth = logger.debug
919
920 meth("Could not fetch URL %s: %s - skipping", link, reason)
921
922 @staticmethod
923 def _get_content_type(url, session):
924 """Get the Content-Type of the given url, using a HEAD request"""
925 scheme, netloc, path, query, fragment = urllib_parse.urlsplit(url)
926 if scheme not in ('http', 'https'):
927 # FIXME: some warning or something?
928 # assertion error?
929 return ''
930
931 resp = session.head(url, allow_redirects=True)
932 resp.raise_for_status()
933
934 return resp.headers.get("Content-Type", "")
935
936 @cached_property
937 def api_version(self):
938 metas = [
939 x for x in self.parsed.findall(".//meta")
940 if x.get("name", "").lower() == "api-version"
941 ]
942 if metas:
943 try:
944 return int(metas[0].get("value", None))
945 except (TypeError, ValueError):
946 pass
947
948 return None
949
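# Illustrative sketch (annotation, not part of pip itself): a
# <meta name="api-version"> tag in an index page is parsed into an integer.
# The HTML snippet and URL below are hypothetical.
#
# >>> page = HTMLPage(b'<meta name="api-version" value="2">',
# ...                 'https://pypi.python.org/simple/pip/')
# >>> page.api_version
# 2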
950 @cached_property
951 def base_url(self):
952 bases = [
953 x for x in self.parsed.findall(".//base")
954 if x.get("href") is not None
955 ]
956 if bases and bases[0].get("href"):
957 return bases[0].get("href")
958 else:
959 return self.url
960
961 @property
962 def links(self):
963 """Yields all links in the page"""
964 for anchor in self.parsed.findall(".//a"):
965 if anchor.get("href"):
966 href = anchor.get("href")
967 url = self.clean_link(
968 urllib_parse.urljoin(self.base_url, href)
969 )
970
971 # Determine if this link is internal. If that distinction
972 # doesn't make sense in this context, then we don't make
973 # any distinction.
974 internal = None
975 if self.api_version and self.api_version >= 2:
976 # Only api_versions >= 2 have a distinction between
977 # external and internal links
978 internal = bool(
979 anchor.get("rel") and
980 "internal" in anchor.get("rel").split()
981 )
982
983 yield Link(url, self, internal=internal)
984
985 def rel_links(self):
986 for url in self.explicit_rel_links():
987 yield url
988 for url in self.scraped_rel_links():
989 yield url
990
991 def explicit_rel_links(self, rels=('homepage', 'download')):
992 """Yields all links with the given relations"""
993 rels = set(rels)
994
995 for anchor in self.parsed.findall(".//a"):
996 if anchor.get("rel") and anchor.get("href"):
997 found_rels = set(anchor.get("rel").split())
998 # Determine the intersection between what rels were found and
999 # what rels were being looked for
1000 if found_rels & rels:
1001 href = anchor.get("href")
1002 url = self.clean_link(
1003 urllib_parse.urljoin(self.base_url, href)
1004 )
1005 yield Link(url, self, trusted=False)
1006
1007 def scraped_rel_links(self):
1008 # Can we get rid of this horrible horrible method?
1009 for regex in (self._homepage_re, self._download_re):
1010 match = regex.search(self.content)
1011 if not match:
1012 continue
1013 href_match = self._href_re.search(self.content, pos=match.end())
1014 if not href_match:
1015 continue
1016 url = (
1017 href_match.group(1) or
1018 href_match.group(2) or
1019 href_match.group(3)
1020 )
1021 if not url:
1022 continue
1023 try:
1024 url = url.decode("ascii")
1025 except UnicodeDecodeError:
1026 continue
1027 url = self.clean_link(urllib_parse.urljoin(self.base_url, url))
1028 yield Link(url, self, trusted=False, _deprecated_regex=True)
1029
1030 _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)
1031
1032 def clean_link(self, url):
1033 """Makes sure a link is fully encoded. That is, if a ' ' shows up in
1034 the link, it will be rewritten to %20 (while not over-quoting
1035 % or other characters)."""
1036 return self._clean_re.sub(
1037 lambda match: '%%%2x' % ord(match.group(0)), url)
1038
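# Illustrative sketch (annotation, not part of pip itself): clean_link's
# percent-encoding, using the same pattern and substitution as above; the URL
# is hypothetical.
#
# >>> import re
# >>> clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)
# >>> clean_re.sub(lambda match: '%%%2x' % ord(match.group(0)),
# ...              'https://example.com/some archive.tar.gz')
# 'https://example.com/some%20archive.tar.gz'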
1039
1040 class Link(object):
1041
1042 def __init__(self, url, comes_from=None, internal=None, trusted=None,
1043 _deprecated_regex=False):
1044
1045 # url can be a UNC windows share
1046 if url != Inf and url.startswith('\\\\'):
1047 url = path_to_url(url)
1048
1049 self.url = url
1050 self.comes_from = comes_from
1051 self.internal = internal
1052 self.trusted = trusted
1053 self._deprecated_regex = _deprecated_regex
1054
1055 def __str__(self):
1056 if self.comes_from:
1057 return '%s (from %s)' % (self.url, self.comes_from)
1058 else:
1059 return str(self.url)
1060
1061 def __repr__(self):
1062 return '<Link %s>' % self
1063
1064 def __eq__(self, other):
1065 if not isinstance(other, Link):
1066 return NotImplemented
1067 return self.url == other.url
1068
1069 def __ne__(self, other):
1070 if not isinstance(other, Link):
1071 return NotImplemented
1072 return self.url != other.url
1073
1074 def __lt__(self, other):
1075 if not isinstance(other, Link):
1076 return NotImplemented
1077 return self.url < other.url
1078
1079 def __le__(self, other):
1080 if not isinstance(other, Link):
1081 return NotImplemented
1082 return self.url <= other.url
1083
1084 def __gt__(self, other):
1085 if not isinstance(other, Link):
1086 return NotImplemented
1087 return self.url > other.url
1088
1089 def __ge__(self, other):
1090 if not isinstance(other, Link):
1091 return NotImplemented
1092 return self.url >= other.url
1093
1094 def __hash__(self):
1095 return hash(self.url)
1096
1097 @property
1098 def filename(self):
1099 _, netloc, path, _, _ = urllib_parse.urlsplit(self.url)
1100 name = posixpath.basename(path.rstrip('/')) or netloc
1101 name = urllib_parse.unquote(name)
1102 assert name, ('URL %r produced no filename' % self.url)
1103 return name
1104
1105 @property
1106 def scheme(self):
1107 return urllib_parse.urlsplit(self.url)[0]
1108
1109 @property
1110 def netloc(self):
1111 return urllib_parse.urlsplit(self.url)[1]
1112
1113 @property
1114 def path(self):
1115 return urllib_parse.unquote(urllib_parse.urlsplit(self.url)[2])
1116
1117 def splitext(self):
1118 return splitext(posixpath.basename(self.path.rstrip('/')))
1119
1120 @property
1121 def ext(self):
1122 return self.splitext()[1]
1123
1124 @property
1125 def url_without_fragment(self):
1126 scheme, netloc, path, query, fragment = urllib_parse.urlsplit(self.url)
1127 return urllib_parse.urlunsplit((scheme, netloc, path, query, None))
1128
1129 _egg_fragment_re = re.compile(r'#egg=([^&]*)')
1130
1131 @property
1132 def egg_fragment(self):
1133 match = self._egg_fragment_re.search(self.url)
1134 if not match:
1135 return None
1136 return match.group(1)
1137
1138 _hash_re = re.compile(
1139 r'(sha1|sha224|sha384|sha256|sha512|md5)=([a-f0-9]+)'
1140 )
1141
1142 @property
1143 def hash(self):
1144 match = self._hash_re.search(self.url)
1145 if match:
1146 return match.group(2)
1147 return None
1148
1149 @property
1150 def hash_name(self):
1151 match = self._hash_re.search(self.url)
1152 if match:
1153 return match.group(1)
1154 return None
1155
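# Illustrative sketch (annotation, not part of pip itself): the fragment
# parsing done by egg_fragment, hash and hash_name; the URL is hypothetical.
#
# >>> link = Link("https://example.com/pip-6.0.8.tar.gz"
# ...             "#egg=pip&md5=0123456789abcdef0123456789abcdef")
# >>> link.egg_fragment, link.hash_name, link.hash
# ('pip', 'md5', '0123456789abcdef0123456789abcdef')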
1156 @property
1157 def show_url(self):
1158 return posixpath.basename(self.url.split('#', 1)[0].split('?', 1)[0])
1159
1160 @property
1161 def verifiable(self):
1162 """
1163 Returns True if this link can be verified after download, False if it
1164 cannot, and None if we cannot determine.
1165 """
1166 trusted = self.trusted or getattr(self.comes_from, "trusted", None)
1167 if trusted is not None and trusted:
1168 # This link came from a trusted source. It *may* be verifiable but
1169 # first we need to see if this page is operating under the new
1170 # API version.
1171 try:
1172 api_version = getattr(self.comes_from, "api_version", None)
1173 api_version = int(api_version)
1174 except (ValueError, TypeError):
1175 api_version = None
1176
1177 if api_version is None or api_version <= 1:
1178 # This link is either trusted, or it came from a trusted
1179 # source; however, it is not operating under API version 2,
1180 # so we can't make any claims about whether it's safe or not
1181 return
1182
1183 if self.hash:
1184 # This link came from a trusted source and it has a hash, so we
1185 # can consider it safe.
1186 return True
1187 else:
1188 # This link came from a trusted source, using the new API
1189 # version, and it does not have a hash. It is NOT verifiable
1190 return False
1191 elif trusted is not None:
1192 # This link came from an untrusted source and we cannot trust it
1193 return False
1194
1195 @property
1196 def is_wheel(self):
1197 return self.ext == wheel_ext
1198
1199
1200 # An object to represent the "link" for the installed version of a requirement.
1201 # Using Inf as the url makes it sort higher.
1202 INSTALLED_VERSION = Link(Inf)