parser.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536
  1. """A parser for HTML and XHTML.
  2. Backported for python-future from Python 3.3.
  3. """
  4. # This file is based on sgmllib.py, but the API is slightly different.
  5. # XXX There should be a way to distinguish between PCDATA (parsed
  6. # character data -- the normal case), RCDATA (replaceable character
  7. # data -- only char and entity references and end tags are special)
  8. # and CDATA (character data -- only end tags are special).
  9. from __future__ import (absolute_import, division,
  10. print_function, unicode_literals)
  11. from future.builtins import *
  12. from future.backports import _markupbase
  13. import re
  14. import warnings
  15. # Regular expressions used for parsing
  16. interesting_normal = re.compile('[&<]')
  17. incomplete = re.compile('&[a-zA-Z#]')
  18. entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  19. charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
  20. starttagopen = re.compile('<[a-zA-Z]')
  21. piclose = re.compile('>')
  22. commentclose = re.compile(r'--\s*>')
  23. tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
  24. # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
  25. # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
  26. tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
  27. # Note:
  28. # 1) the strict attrfind isn't really strict, but we can't make it
  29. # correctly strict without breaking backward compatibility;
  30. # 2) if you change attrfind remember to update locatestarttagend too;
  31. # 3) if you change attrfind and/or locatestarttagend the parser will
  32. # explode, so don't do it.
  33. attrfind = re.compile(
  34. r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
  35. r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
  36. attrfind_tolerant = re.compile(
  37. r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
  38. r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
  39. locatestarttagend = re.compile(r"""
  40. <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
  41. (?:\s+ # whitespace before attribute name
  42. (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
  43. (?:\s*=\s* # value indicator
  44. (?:'[^']*' # LITA-enclosed value
  45. |\"[^\"]*\" # LIT-enclosed value
  46. |[^'\">\s]+ # bare value
  47. )
  48. )?
  49. )
  50. )*
  51. \s* # trailing whitespace
  52. """, re.VERBOSE)
  53. locatestarttagend_tolerant = re.compile(r"""
  54. <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
  55. (?:[\s/]* # optional whitespace before attribute name
  56. (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
  57. (?:\s*=+\s* # value indicator
  58. (?:'[^']*' # LITA-enclosed value
  59. |"[^"]*" # LIT-enclosed value
  60. |(?!['"])[^>\s]* # bare value
  61. )
  62. (?:\s*,)* # possibly followed by a comma
  63. )?(?:\s|/(?!>))*
  64. )*
  65. )?
  66. \s* # trailing whitespace
  67. """, re.VERBOSE)
  68. endendtag = re.compile('>')
  69. # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
  70. # </ and the tag name, so maybe this should be fixed
  71. endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  72. class HTMLParseError(Exception):
  73. """Exception raised for all parse errors."""
  74. def __init__(self, msg, position=(None, None)):
  75. assert msg
  76. self.msg = msg
  77. self.lineno = position[0]
  78. self.offset = position[1]
  79. def __str__(self):
  80. result = self.msg
  81. if self.lineno is not None:
  82. result = result + ", at line %d" % self.lineno
  83. if self.offset is not None:
  84. result = result + ", column %d" % (self.offset + 1)
  85. return result
  86. class HTMLParser(_markupbase.ParserBase):
  87. """Find tags and other markup and call handler functions.
  88. Usage:
  89. p = HTMLParser()
  90. p.feed(data)
  91. ...
  92. p.close()
  93. Start tags are handled by calling self.handle_starttag() or
  94. self.handle_startendtag(); end tags by self.handle_endtag(). The
  95. data between tags is passed from the parser to the derived class
  96. by calling self.handle_data() with the data as argument (the data
  97. may be split up in arbitrary chunks). Entity references are
  98. passed by calling self.handle_entityref() with the entity
  99. reference as the argument. Numeric character references are
  100. passed to self.handle_charref() with the string containing the
  101. reference as the argument.
  102. """
  103. CDATA_CONTENT_ELEMENTS = ("script", "style")
  104. def __init__(self, strict=False):
  105. """Initialize and reset this instance.
  106. If strict is set to False (the default) the parser will parse invalid
  107. markup, otherwise it will raise an error. Note that the strict mode
  108. is deprecated.
  109. """
  110. if strict:
  111. warnings.warn("The strict mode is deprecated.",
  112. DeprecationWarning, stacklevel=2)
  113. self.strict = strict
  114. self.reset()
  115. def reset(self):
  116. """Reset this instance. Loses all unprocessed data."""
  117. self.rawdata = ''
  118. self.lasttag = '???'
  119. self.interesting = interesting_normal
  120. self.cdata_elem = None
  121. _markupbase.ParserBase.reset(self)
  122. def feed(self, data):
  123. r"""Feed data to the parser.
  124. Call this as often as you want, with as little or as much text
  125. as you want (may include '\n').
  126. """
  127. self.rawdata = self.rawdata + data
  128. self.goahead(0)
  129. def close(self):
  130. """Handle any buffered data."""
  131. self.goahead(1)
  132. def error(self, message):
  133. raise HTMLParseError(message, self.getpos())
  134. __starttag_text = None
  135. def get_starttag_text(self):
  136. """Return full source of start tag: '<...>'."""
  137. return self.__starttag_text
  138. def set_cdata_mode(self, elem):
  139. self.cdata_elem = elem.lower()
  140. self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
  141. def clear_cdata_mode(self):
  142. self.interesting = interesting_normal
  143. self.cdata_elem = None
  144. # Internal -- handle data as far as reasonable. May leave state
  145. # and data to be processed by a subsequent call. If 'end' is
  146. # true, force handling all data as if followed by EOF marker.
  147. def goahead(self, end):
  148. rawdata = self.rawdata
  149. i = 0
  150. n = len(rawdata)
  151. while i < n:
  152. match = self.interesting.search(rawdata, i) # < or &
  153. if match:
  154. j = match.start()
  155. else:
  156. if self.cdata_elem:
  157. break
  158. j = n
  159. if i < j: self.handle_data(rawdata[i:j])
  160. i = self.updatepos(i, j)
  161. if i == n: break
  162. startswith = rawdata.startswith
  163. if startswith('<', i):
  164. if starttagopen.match(rawdata, i): # < + letter
  165. k = self.parse_starttag(i)
  166. elif startswith("</", i):
  167. k = self.parse_endtag(i)
  168. elif startswith("<!--", i):
  169. k = self.parse_comment(i)
  170. elif startswith("<?", i):
  171. k = self.parse_pi(i)
  172. elif startswith("<!", i):
  173. if self.strict:
  174. k = self.parse_declaration(i)
  175. else:
  176. k = self.parse_html_declaration(i)
  177. elif (i + 1) < n:
  178. self.handle_data("<")
  179. k = i + 1
  180. else:
  181. break
  182. if k < 0:
  183. if not end:
  184. break
  185. if self.strict:
  186. self.error("EOF in middle of construct")
  187. k = rawdata.find('>', i + 1)
  188. if k < 0:
  189. k = rawdata.find('<', i + 1)
  190. if k < 0:
  191. k = i + 1
  192. else:
  193. k += 1
  194. self.handle_data(rawdata[i:k])
  195. i = self.updatepos(i, k)
  196. elif startswith("&#", i):
  197. match = charref.match(rawdata, i)
  198. if match:
  199. name = match.group()[2:-1]
  200. self.handle_charref(name)
  201. k = match.end()
  202. if not startswith(';', k-1):
  203. k = k - 1
  204. i = self.updatepos(i, k)
  205. continue
  206. else:
  207. if ";" in rawdata[i:]: #bail by consuming &#
  208. self.handle_data(rawdata[0:2])
  209. i = self.updatepos(i, 2)
  210. break
  211. elif startswith('&', i):
  212. match = entityref.match(rawdata, i)
  213. if match:
  214. name = match.group(1)
  215. self.handle_entityref(name)
  216. k = match.end()
  217. if not startswith(';', k-1):
  218. k = k - 1
  219. i = self.updatepos(i, k)
  220. continue
  221. match = incomplete.match(rawdata, i)
  222. if match:
  223. # match.group() will contain at least 2 chars
  224. if end and match.group() == rawdata[i:]:
  225. if self.strict:
  226. self.error("EOF in middle of entity or char ref")
  227. else:
  228. if k <= i:
  229. k = n
  230. i = self.updatepos(i, i + 1)
  231. # incomplete
  232. break
  233. elif (i + 1) < n:
  234. # not the end of the buffer, and can't be confused
  235. # with some other construct
  236. self.handle_data("&")
  237. i = self.updatepos(i, i + 1)
  238. else:
  239. break
  240. else:
  241. assert 0, "interesting.search() lied"
  242. # end while
  243. if end and i < n and not self.cdata_elem:
  244. self.handle_data(rawdata[i:n])
  245. i = self.updatepos(i, n)
  246. self.rawdata = rawdata[i:]
    # Internal -- parse html declarations, return end index or -1 if not
    # terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
  250. def parse_html_declaration(self, i):
  251. rawdata = self.rawdata
  252. assert rawdata[i:i+2] == '<!', ('unexpected call to '
  253. 'parse_html_declaration()')
  254. if rawdata[i:i+4] == '<!--':
  255. # this case is actually already handled in goahead()
  256. return self.parse_comment(i)
  257. elif rawdata[i:i+3] == '<![':
  258. return self.parse_marked_section(i)
  259. elif rawdata[i:i+9].lower() == '<!doctype':
  260. # find the closing >
  261. gtpos = rawdata.find('>', i+9)
  262. if gtpos == -1:
  263. return -1
  264. self.handle_decl(rawdata[i+2:gtpos])
  265. return gtpos+1
  266. else:
  267. return self.parse_bogus_comment(i)
    # Internal -- parse bogus comment, return end index or -1 if not
    # terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
  270. def parse_bogus_comment(self, i, report=1):
  271. rawdata = self.rawdata
  272. assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
  273. 'parse_comment()')
  274. pos = rawdata.find('>', i+2)
  275. if pos == -1:
  276. return -1
  277. if report:
  278. self.handle_comment(rawdata[i+2:pos])
  279. return pos + 1
  280. # Internal -- parse processing instr, return end or -1 if not terminated
  281. def parse_pi(self, i):
  282. rawdata = self.rawdata
  283. assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
  284. match = piclose.search(rawdata, i+2) # >
  285. if not match:
  286. return -1
  287. j = match.start()
  288. self.handle_pi(rawdata[i+2: j])
  289. j = match.end()
  290. return j
  291. # Internal -- handle starttag, return end or -1 if not terminated
  292. def parse_starttag(self, i):
  293. self.__starttag_text = None
  294. endpos = self.check_for_whole_start_tag(i)
  295. if endpos < 0:
  296. return endpos
  297. rawdata = self.rawdata
  298. self.__starttag_text = rawdata[i:endpos]
  299. # Now parse the data between i+1 and j into a tag and attrs
  300. attrs = []
  301. match = tagfind.match(rawdata, i+1)
  302. assert match, 'unexpected call to parse_starttag()'
  303. k = match.end()
  304. self.lasttag = tag = match.group(1).lower()
  305. while k < endpos:
  306. if self.strict:
  307. m = attrfind.match(rawdata, k)
  308. else:
  309. m = attrfind_tolerant.match(rawdata, k)
  310. if not m:
  311. break
  312. attrname, rest, attrvalue = m.group(1, 2, 3)
  313. if not rest:
  314. attrvalue = None
  315. elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
  316. attrvalue[:1] == '"' == attrvalue[-1:]:
  317. attrvalue = attrvalue[1:-1]
  318. if attrvalue:
  319. attrvalue = self.unescape(attrvalue)
  320. attrs.append((attrname.lower(), attrvalue))
  321. k = m.end()
  322. end = rawdata[k:endpos].strip()
  323. if end not in (">", "/>"):
  324. lineno, offset = self.getpos()
  325. if "\n" in self.__starttag_text:
  326. lineno = lineno + self.__starttag_text.count("\n")
  327. offset = len(self.__starttag_text) \
  328. - self.__starttag_text.rfind("\n")
  329. else:
  330. offset = offset + len(self.__starttag_text)
  331. if self.strict:
  332. self.error("junk characters in start tag: %r"
  333. % (rawdata[k:endpos][:20],))
  334. self.handle_data(rawdata[i:endpos])
  335. return endpos
  336. if end.endswith('/>'):
  337. # XHTML-style empty tag: <span attr="value" />
  338. self.handle_startendtag(tag, attrs)
  339. else:
  340. self.handle_starttag(tag, attrs)
  341. if tag in self.CDATA_CONTENT_ELEMENTS:
  342. self.set_cdata_mode(tag)
  343. return endpos
  344. # Internal -- check to see if we have a complete starttag; return end
  345. # or -1 if incomplete.
  346. def check_for_whole_start_tag(self, i):
  347. rawdata = self.rawdata
  348. if self.strict:
  349. m = locatestarttagend.match(rawdata, i)
  350. else:
  351. m = locatestarttagend_tolerant.match(rawdata, i)
  352. if m:
  353. j = m.end()
  354. next = rawdata[j:j+1]
  355. if next == ">":
  356. return j + 1
  357. if next == "/":
  358. if rawdata.startswith("/>", j):
  359. return j + 2
  360. if rawdata.startswith("/", j):
  361. # buffer boundary
  362. return -1
  363. # else bogus input
  364. if self.strict:
  365. self.updatepos(i, j + 1)
  366. self.error("malformed empty start tag")
  367. if j > i:
  368. return j
  369. else:
  370. return i + 1
  371. if next == "":
  372. # end of input
  373. return -1
  374. if next in ("abcdefghijklmnopqrstuvwxyz=/"
  375. "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
  376. # end of input in or before attribute value, or we have the
  377. # '/' from a '/>' ending
  378. return -1
  379. if self.strict:
  380. self.updatepos(i, j)
  381. self.error("malformed start tag")
  382. if j > i:
  383. return j
  384. else:
  385. return i + 1
  386. raise AssertionError("we should not get here!")
  387. # Internal -- parse endtag, return end or -1 if incomplete
  388. def parse_endtag(self, i):
  389. rawdata = self.rawdata
  390. assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
  391. match = endendtag.search(rawdata, i+1) # >
  392. if not match:
  393. return -1
  394. gtpos = match.end()
  395. match = endtagfind.match(rawdata, i) # </ + tag + >
  396. if not match:
  397. if self.cdata_elem is not None:
  398. self.handle_data(rawdata[i:gtpos])
  399. return gtpos
  400. if self.strict:
  401. self.error("bad end tag: %r" % (rawdata[i:gtpos],))
  402. # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
  403. namematch = tagfind_tolerant.match(rawdata, i+2)
  404. if not namematch:
  405. # w3.org/TR/html5/tokenization.html#end-tag-open-state
  406. if rawdata[i:i+3] == '</>':
  407. return i+3
  408. else:
  409. return self.parse_bogus_comment(i)
  410. tagname = namematch.group().lower()
  411. # consume and ignore other stuff between the name and the >
  412. # Note: this is not 100% correct, since we might have things like
  413. # </tag attr=">">, but looking for > after tha name should cover
  414. # most of the cases and is much simpler
  415. gtpos = rawdata.find('>', namematch.end())
  416. self.handle_endtag(tagname)
  417. return gtpos+1
  418. elem = match.group(1).lower() # script or style
  419. if self.cdata_elem is not None:
  420. if elem != self.cdata_elem:
  421. self.handle_data(rawdata[i:gtpos])
  422. return gtpos
  423. self.handle_endtag(elem.lower())
  424. self.clear_cdata_mode()
  425. return gtpos
  426. # Overridable -- finish processing of start+end tag: <tag.../>
  427. def handle_startendtag(self, tag, attrs):
  428. self.handle_starttag(tag, attrs)
  429. self.handle_endtag(tag)
  430. # Overridable -- handle start tag
  431. def handle_starttag(self, tag, attrs):
  432. pass
  433. # Overridable -- handle end tag
  434. def handle_endtag(self, tag):
  435. pass
  436. # Overridable -- handle character reference
  437. def handle_charref(self, name):
  438. pass
  439. # Overridable -- handle entity reference
  440. def handle_entityref(self, name):
  441. pass
  442. # Overridable -- handle data
  443. def handle_data(self, data):
  444. pass
  445. # Overridable -- handle comment
  446. def handle_comment(self, data):
  447. pass
  448. # Overridable -- handle declaration
  449. def handle_decl(self, decl):
  450. pass
  451. # Overridable -- handle processing instruction
  452. def handle_pi(self, data):
  453. pass
  454. def unknown_decl(self, data):
  455. if self.strict:
  456. self.error("unknown declaration: %r" % (data,))
  457. # Internal -- helper to remove special character quoting
  458. def unescape(self, s):
  459. if '&' not in s:
  460. return s
  461. def replaceEntities(s):
  462. s = s.groups()[0]
  463. try:
  464. if s[0] == "#":
  465. s = s[1:]
  466. if s[0] in ['x','X']:
  467. c = int(s[1:].rstrip(';'), 16)
  468. else:
  469. c = int(s.rstrip(';'))
  470. return chr(c)
  471. except ValueError:
  472. return '&#' + s
  473. else:
  474. from future.backports.html.entities import html5
  475. if s in html5:
  476. return html5[s]
  477. elif s.endswith(';'):
  478. return '&' + s
  479. for x in range(2, len(s)):
  480. if s[:x] in html5:
  481. return html5[s[:x]] + s[x:]
  482. else:
  483. return '&' + s
  484. return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
  485. replaceEntities, s)