from __future__ import absolute_import, division, unicode_literals
from future.builtins import str
""" robotparser.py

    Copyright (C) 2000 Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""

# Was: import urllib.parse, urllib.request
from future.backports import urllib
from future.backports.urllib import parse as _parse, request as _request
urllib.parse = _parse
urllib.request = _request

__all__ = ["RobotFileParser"]
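
# Minimal usage sketch (not part of the original module); the host name and
# agent string below are only illustrations:
#
#     rp = RobotFileParser()
#     rp.set_url("http://www.example.com/robots.txt")
#     rp.read()
#     if rp.can_fetch("MyCrawler/1.0", "http://www.example.com/private/page.html"):
#         ...  # robots.txt permits fetching this URL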

class RobotFileParser(object):
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.
    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.
        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.
        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400:
                self.allow_all = True
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
        if state == 2:
            self._add_entry(entry)
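
    # Illustrative sketch (not in the original source): given these lines,
    #
    #     User-agent: *
    #     Disallow: /private/
    #     Allow: /private/public.html
    #
    # parse() builds a single Entry whose useragents list is ["*"]; because
    # "*" is the catch-all agent, _add_entry() stores it as self.default_entry
    # rather than appending it to self.entries.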

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
        url = urllib.parse.urlunparse(('', '', parsed_url.path,
                                       parsed_url.params, parsed_url.query,
                                       parsed_url.fragment))
        url = urllib.parse.quote(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return ''.join([str(entry) + "\n" for entry in self.entries])
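
# Illustrative note (not in the original source): can_fetch() drops the scheme
# and netloc from the candidate URL and re-quotes the remainder before
# matching, so "http://www.example.com/a%20b.html" is compared against rule
# paths as "/a%20b.html"; an empty path is treated as "/".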

class RuleLine(object):
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
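
# Illustrative sketch (not in the original source): rule paths match by plain
# string prefix, and an empty Disallow value becomes an allow-all rule:
#
#     RuleLine("/private/", False).applies_to("/private/a.html")  # True
#     RuleLine("", False).allowance                                # True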

class Entry(object):
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.extend(["User-agent: ", agent, "\n"])
        for line in self.rulelines:
            ret.extend([str(line), "\n"])
        return ''.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True
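
# Illustrative sketch (not in the original source): agent matching is a
# case-insensitive substring test against the product token before "/", so an
# entry listing "googlebot" applies to the full agent string used below:
#
#     e = Entry()
#     e.useragents.append("googlebot")
#     e.rulelines.append(RuleLine("/search", False))
#     e.applies_to("Googlebot/2.1 (+http://www.google.com/bot.html)")  # True
#     e.allowance("/search/about")                                     # False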