sources.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224
  1. import logging
  2. import mimetypes
  3. import os
  4. import pathlib
  5. from typing import Callable, Iterable, Optional, Tuple
  6. from pip._internal.models.candidate import InstallationCandidate
  7. from pip._internal.models.link import Link
  8. from pip._internal.utils.urls import path_to_url, url_to_path
  9. from pip._internal.vcs import is_url
  10. logger = logging.getLogger(__name__)
  11. FoundCandidates = Iterable[InstallationCandidate]
  12. FoundLinks = Iterable[Link]
  13. CandidatesFromPage = Callable[[Link], Iterable[InstallationCandidate]]
  14. PageValidator = Callable[[Link], bool]
  15. class LinkSource:
  16. @property
  17. def link(self) -> Optional[Link]:
  18. """Returns the underlying link, if there's one."""
  19. raise NotImplementedError()
  20. def page_candidates(self) -> FoundCandidates:
  21. """Candidates found by parsing an archive listing HTML file."""
  22. raise NotImplementedError()
  23. def file_links(self) -> FoundLinks:
  24. """Links found by specifying archives directly."""
  25. raise NotImplementedError()
  26. def _is_html_file(file_url: str) -> bool:
  27. return mimetypes.guess_type(file_url, strict=False)[0] == "text/html"
  28. class _FlatDirectorySource(LinkSource):
  29. """Link source specified by ``--find-links=<path-to-dir>``.
  30. This looks the content of the directory, and returns:
  31. * ``page_candidates``: Links listed on each HTML file in the directory.
  32. * ``file_candidates``: Archives in the directory.
  33. """
  34. def __init__(
  35. self,
  36. candidates_from_page: CandidatesFromPage,
  37. path: str,
  38. ) -> None:
  39. self._candidates_from_page = candidates_from_page
  40. self._path = pathlib.Path(os.path.realpath(path))
  41. @property
  42. def link(self) -> Optional[Link]:
  43. return None
  44. def page_candidates(self) -> FoundCandidates:
  45. for path in self._path.iterdir():
  46. url = path_to_url(str(path))
  47. if not _is_html_file(url):
  48. continue
  49. yield from self._candidates_from_page(Link(url))
  50. def file_links(self) -> FoundLinks:
  51. for path in self._path.iterdir():
  52. url = path_to_url(str(path))
  53. if _is_html_file(url):
  54. continue
  55. yield Link(url)
  56. class _LocalFileSource(LinkSource):
  57. """``--find-links=<path-or-url>`` or ``--[extra-]index-url=<path-or-url>``.
  58. If a URL is supplied, it must be a ``file:`` URL. If a path is supplied to
  59. the option, it is converted to a URL first. This returns:
  60. * ``page_candidates``: Links listed on an HTML file.
  61. * ``file_candidates``: The non-HTML file.
  62. """
  63. def __init__(
  64. self,
  65. candidates_from_page: CandidatesFromPage,
  66. link: Link,
  67. ) -> None:
  68. self._candidates_from_page = candidates_from_page
  69. self._link = link
  70. @property
  71. def link(self) -> Optional[Link]:
  72. return self._link
  73. def page_candidates(self) -> FoundCandidates:
  74. if not _is_html_file(self._link.url):
  75. return
  76. yield from self._candidates_from_page(self._link)
  77. def file_links(self) -> FoundLinks:
  78. if _is_html_file(self._link.url):
  79. return
  80. yield self._link
  81. class _RemoteFileSource(LinkSource):
  82. """``--find-links=<url>`` or ``--[extra-]index-url=<url>``.
  83. This returns:
  84. * ``page_candidates``: Links listed on an HTML file.
  85. * ``file_candidates``: The non-HTML file.
  86. """
  87. def __init__(
  88. self,
  89. candidates_from_page: CandidatesFromPage,
  90. page_validator: PageValidator,
  91. link: Link,
  92. ) -> None:
  93. self._candidates_from_page = candidates_from_page
  94. self._page_validator = page_validator
  95. self._link = link
  96. @property
  97. def link(self) -> Optional[Link]:
  98. return self._link
  99. def page_candidates(self) -> FoundCandidates:
  100. if not self._page_validator(self._link):
  101. return
  102. yield from self._candidates_from_page(self._link)
  103. def file_links(self) -> FoundLinks:
  104. yield self._link
  105. class _IndexDirectorySource(LinkSource):
  106. """``--[extra-]index-url=<path-to-directory>``.
  107. This is treated like a remote URL; ``candidates_from_page`` contains logic
  108. for this by appending ``index.html`` to the link.
  109. """
  110. def __init__(
  111. self,
  112. candidates_from_page: CandidatesFromPage,
  113. link: Link,
  114. ) -> None:
  115. self._candidates_from_page = candidates_from_page
  116. self._link = link
  117. @property
  118. def link(self) -> Optional[Link]:
  119. return self._link
  120. def page_candidates(self) -> FoundCandidates:
  121. yield from self._candidates_from_page(self._link)
  122. def file_links(self) -> FoundLinks:
  123. return ()
  124. def build_source(
  125. location: str,
  126. *,
  127. candidates_from_page: CandidatesFromPage,
  128. page_validator: PageValidator,
  129. expand_dir: bool,
  130. cache_link_parsing: bool,
  131. ) -> Tuple[Optional[str], Optional[LinkSource]]:
  132. path: Optional[str] = None
  133. url: Optional[str] = None
  134. if os.path.exists(location): # Is a local path.
  135. url = path_to_url(location)
  136. path = location
  137. elif location.startswith("file:"): # A file: URL.
  138. url = location
  139. path = url_to_path(location)
  140. elif is_url(location):
  141. url = location
  142. if url is None:
  143. msg = (
  144. "Location '%s' is ignored: "
  145. "it is either a non-existing path or lacks a specific scheme."
  146. )
  147. logger.warning(msg, location)
  148. return (None, None)
  149. if path is None:
  150. source: LinkSource = _RemoteFileSource(
  151. candidates_from_page=candidates_from_page,
  152. page_validator=page_validator,
  153. link=Link(url, cache_link_parsing=cache_link_parsing),
  154. )
  155. return (url, source)
  156. if os.path.isdir(path):
  157. if expand_dir:
  158. source = _FlatDirectorySource(
  159. candidates_from_page=candidates_from_page,
  160. path=path,
  161. )
  162. else:
  163. source = _IndexDirectorySource(
  164. candidates_from_page=candidates_from_page,
  165. link=Link(url, cache_link_parsing=cache_link_parsing),
  166. )
  167. return (url, source)
  168. elif os.path.isfile(path):
  169. source = _LocalFileSource(
  170. candidates_from_page=candidates_from_page,
  171. link=Link(url, cache_link_parsing=cache_link_parsing),
  172. )
  173. return (url, source)
  174. logger.warning(
  175. "Location '%s' is ignored: it is neither a file nor a directory.",
  176. location,
  177. )
  178. return (url, None)