header.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581
  1. # Copyright (C) 2002-2007 Python Software Foundation
  2. # Author: Ben Gertzfield, Barry Warsaw
  3. # Contact: email-sig@python.org
  4. """Header encoding and decoding functionality."""
  5. from __future__ import unicode_literals
  6. from __future__ import division
  7. from __future__ import absolute_import
  8. from future.builtins import bytes, range, str, super, zip
  # Public API of this module: the names exported by a star-import.
  9. __all__ = [
  10. 'Header',
  11. 'decode_header',
  12. 'make_header',
  13. ]
  14. import re
  15. import binascii
  16. from future.backports import email
  17. from future.backports.email import base64mime
  18. from future.backports.email.errors import HeaderParseError
  19. import future.backports.email.charset as _charset
  20. # Helpers
  21. from future.backports.email.quoprimime import _max_append, header_decode
  # Short alias for the Charset class; used below for isinstance() checks and
  # to turn charset names into Charset instances.
  22. Charset = _charset.Charset
  # Default line separator used by _ValueFormatter.__str__().
  23. NL = '\n'
  # Separator used when joining text chunks (Header.__str__, Header._normalize).
  24. SPACE = ' '
  # Bytes separator placed by decode_header() between merged unencoded words.
  25. BSPACE = b' '
  # NOTE(review): SPACE8 is not referenced by any code visible in this file.
  26. SPACE8 = ' ' * 8
  27. EMPTYSTRING = ''
  # Line length Header.__init__() falls back to when maxlinelen is None.
  28. MAXLINELEN = 78
  # Characters _ValueFormatter._ascii_split() treats as folding whitespace.
  29. FWS = ' \t'
  # Charset Header.__init__() uses when no charset is given.
  30. USASCII = Charset('us-ascii')
  # Charset Header.append() switches to when text cannot be encoded as us-ascii.
  31. UTF8 = Charset('utf-8')
  32. # Match encoded-word strings in the form =?charset?q?Hello_World?=
  # decode_header() uses .search() to detect encoded words and .split() to
  # break a line into (unencoded, charset, encoding, encoded, ...) pieces.
  33. ecre = re.compile(r'''
  34. =\? # literal =?
  35. (?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
  36. \? # literal ?
  37. (?P<encoding>[qb]) # either a "q" or a "b", case insensitive
  38. \? # literal ?
  39. (?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
  40. \?= # literal ?=
  41. ''', re.VERBOSE | re.IGNORECASE | re.MULTILINE)
  42. # Field name regexp, including trailing colon, but not separating whitespace,
  43. # according to RFC 2822. Character range is from exclamation mark to tilde.
  44. # For use with .match()
  45. fcre = re.compile(r'[\041-\176]+:$')
  # NOTE(review): fcre is not referenced by any code visible in this file.
  46. # Find a header embedded in a putative header value. Used to check for
  47. # header injection attack.
  # Header.encode() searches its encoded output with this pattern and raises
  # HeaderParseError when it matches.
  48. _embeded_header = re.compile(r'\n[^ \t]+:')
  def decode_header(header):
      """Decode a message header value without converting charset.

      Returns a list of (string, charset) pairs containing each of the decoded
      parts of the header.  Charset is None for non-encoded parts of the header,
      otherwise a lower-case string containing the name of the character set
      specified in the encoded string.

      header may be a string that may or may not contain RFC2047 encoded words,
      or it may be a Header object.

      When header is a string without any encoded word the result is
      [(header, None)] and the string is returned unchanged.  Otherwise every
      decoded part that is a str is converted to bytes with the
      'raw-unicode-escape' codec, and neighbouring parts that share a charset
      are merged into a single pair.

      An email.errors.HeaderParseError may be raised when certain decoding error
      occurs (e.g. a base64 decoding exception).
      """
      # If it is a Header object, we can just return the encoded chunks.
      if hasattr(header, '_chunks'):
          return [(_charset._encode(string, str(charset)), str(charset))
                  for string, charset in header._chunks]
      # If no encoding, just return the header with no charset.
      if not ecre.search(header):
          return [(header, None)]
      # First step is to parse all the encoded parts into triplets of the form
      # (encoded_string, encoding, charset).  For unencoded strings, the last
      # two parts will be None.
      words = []
      for line in header.splitlines():
          # ecre has three groups, so split() yields
          # [unencoded, charset, encoding, encoded, unencoded, ...].
          parts = ecre.split(line)
          first = True
          while parts:
              unencoded = parts.pop(0)
              if first:
                  # Only the text at the very start of a line loses its
                  # leading whitespace.
                  unencoded = unencoded.lstrip()
                  first = False
              if unencoded:
                  words.append((unencoded, None, None))
              if parts:
                  charset = parts.pop(0).lower()
                  encoding = parts.pop(0).lower()
                  encoded = parts.pop(0)
                  words.append((encoded, encoding, charset))
      # Now loop over words and remove words that consist of whitespace
      # between two encoded strings.
      droplist = []
      for n, w in enumerate(words):
          if n > 1 and w[1] and words[n-2][1] and words[n-1][0].isspace():
              droplist.append(n-1)
      # Delete from the end so the remaining indexes stay valid.
      for d in reversed(droplist):
          del words[d]
      # The next step is to decode each encoded word by applying the reverse
      # base64 or quopri transformation.  decoded_words is now a list of the
      # form (decoded_word, charset).
      decoded_words = []
      for encoded_string, encoding, charset in words:
          if encoding is None:
              # This is an unencoded word.
              decoded_words.append((encoded_string, charset))
          elif encoding == 'q':
              word = header_decode(encoded_string)
              decoded_words.append((word, charset))
          elif encoding == 'b':
              # Postel's law: add missing padding
              paderr = len(encoded_string) % 4
              if paderr:
                  encoded_string += '==='[:4 - paderr]
              try:
                  word = base64mime.decode(encoded_string)
              except binascii.Error:
                  raise HeaderParseError('Base64 decoding error')
              else:
                  decoded_words.append((word, charset))
          else:
              # ecre only accepts "q" or "b", so this should be unreachable.
              raise AssertionError('Unexpected encoding: ' + encoding)
      # Now convert all words to bytes and collapse consecutive runs of
      # similarly encoded words.
      collapsed = []
      last_word = last_charset = None
      for word, charset in decoded_words:
          if isinstance(word, str):
              word = bytes(word, 'raw-unicode-escape')
          if last_word is None:
              last_word = word
              last_charset = charset
          elif charset != last_charset:
              collapsed.append((last_word, last_charset))
              last_word = word
              last_charset = charset
          elif last_charset is None:
              # Adjacent unencoded words are joined with a single space.
              last_word += BSPACE + word
          else:
              # Adjacent encoded words of one charset are joined directly.
              last_word += word
      collapsed.append((last_word, last_charset))
      return collapsed
  def make_header(decoded_seq, maxlinelen=None, header_name=None,
                  continuation_ws=' '):
      """Build a Header from (decoded_string, charset) pairs.

      decoded_seq is a sequence of pairs in the format returned by
      decode_header(): each pair holds a decoded string and its charset,
      where the charset is None, a character set name or a Charset instance.

      Optional maxlinelen, header_name, and continuation_ws are handed to the
      Header constructor unchanged.  Returns the populated Header instance.
      """
      header = Header(maxlinelen=maxlinelen, header_name=header_name,
                      continuation_ws=continuation_ws)
      for text, cs in decoded_seq:
          # A charset of None is forwarded untouched so that append() applies
          # the header's own default; anything else that is not yet a Charset
          # is wrapped in one first.
          if not (cs is None or isinstance(cs, Charset)):
              cs = Charset(cs)
          header.append(text, cs)
      return header
  class Header(object):
      """A header value built up from (string, charset) chunks.

      Chunks are added with append().  str() gives the unencoded text and
      encode() gives the encoded, line-wrapped form.
      """

      def __init__(self, s=None, charset=None,
                   maxlinelen=None, header_name=None,
                   continuation_ws=' ', errors='strict'):
          """Create a MIME-compliant header that can contain many character sets.

          Optional s is the initial header value.  If None, the initial header
          value is not set.  You can later append to the header with .append()
          method calls.  s may be a byte string or a Unicode string, but see the
          .append() documentation for semantics.

          Optional charset serves two purposes: it has the same meaning as the
          charset argument to the .append() method.  It also sets the default
          character set for all subsequent .append() calls that omit the charset
          argument.  If charset is not provided in the constructor, the us-ascii
          charset is used both as s's initial charset and as the default for
          subsequent .append() calls.

          The maximum line length can be specified explicitly via maxlinelen.  For
          splitting the first line to a shorter value (to account for the field
          header which isn't included in s, e.g. `Subject') pass in the name of
          the field in header_name.  The default maxlinelen is 78 as recommended
          by RFC 2822.

          continuation_ws must be RFC 2822 compliant folding whitespace (usually
          either a space or a hard tab) which will be prepended to continuation
          lines.

          errors is passed through to the .append() call.
          """
          if charset is None:
              charset = USASCII
          elif not isinstance(charset, Charset):
              charset = Charset(charset)
          self._charset = charset
          self._continuation_ws = continuation_ws
          # List of (unicode string, Charset) pairs in append order.
          self._chunks = []
          if s is not None:
              self.append(s, charset, errors)
          if maxlinelen is None:
              maxlinelen = MAXLINELEN
          self._maxlinelen = maxlinelen
          if header_name is None:
              self._headerlen = 0
          else:
              # Take the separating colon and space into account.
              self._headerlen = len(header_name) + 2

      def __str__(self):
          """Return the string value of the header."""
          self._normalize()
          uchunks = []
          lastcs = None
          lastspace = None
          for string, charset in self._chunks:
              # We must preserve spaces between encoded and non-encoded word
              # boundaries, which means for us we need to add a space when we go
              # from a charset to None/us-ascii, or from None/us-ascii to a
              # charset.  Only do this for the second and subsequent chunks.
              # Don't add a space if the None/us-ascii string already has
              # a space (trailing or leading depending on transition)
              nextcs = charset
              if nextcs == _charset.UNKNOWN8BIT:
                  # Round-trip through ascii so that surrogate-escaped bytes
                  # come back as replacement characters.
                  original_bytes = string.encode('ascii', 'surrogateescape')
                  string = original_bytes.decode('ascii', 'replace')
              if uchunks:
                  hasspace = string and self._nonctext(string[0])
                  if lastcs not in (None, 'us-ascii'):
                      if nextcs in (None, 'us-ascii') and not hasspace:
                          uchunks.append(SPACE)
                          nextcs = None
                  elif nextcs not in (None, 'us-ascii') and not lastspace:
                      uchunks.append(SPACE)
              lastspace = string and self._nonctext(string[-1])
              lastcs = nextcs
              uchunks.append(string)
          return EMPTYSTRING.join(uchunks)

      # Rich comparison operators for equality only.  BAW: does it make sense to
      # have or explicitly disable <, <=, >, >= operators?
      def __eq__(self, other):
          """Compare the unencoded string value of this header with other."""
          # other may be a Header or a string.  Both are fine so coerce
          # ourselves to a unicode (of the unencoded header value), swap the
          # args and do another comparison.
          return other == str(self)

      def __ne__(self, other):
          """Return the negation of the == comparison."""
          return not self == other

      def append(self, s, charset=None, errors='strict'):
          """Append a string to the MIME header.

          Optional charset, if given, should be a Charset instance or the name
          of a character set (which will be converted to a Charset instance).  A
          value of None (the default) means that the charset given in the
          constructor is used.

          s may be a byte string or a Unicode string.  If it is a byte string
          (i.e. isinstance(s, str) is false), then charset is the encoding of
          that byte string, and a UnicodeError will be raised if the string
          cannot be decoded with that charset.  If s is a Unicode string, then
          charset is a hint specifying the character set of the characters in
          the string.  In either case, when producing an RFC 2822 compliant
          header using RFC 2047 rules, the string will be encoded using the
          output codec of the charset.  If the string cannot be encoded to the
          output codec, a UnicodeError will be raised.

          Optional `errors' is passed as the errors argument to the decode
          call if s is a byte string.
          """
          if charset is None:
              charset = self._charset
          elif not isinstance(charset, Charset):
              charset = Charset(charset)
          if not isinstance(s, str):
              input_charset = charset.input_codec or 'us-ascii'
              if input_charset == _charset.UNKNOWN8BIT:
                  s = s.decode('us-ascii', 'surrogateescape')
              else:
                  s = s.decode(input_charset, errors)
          # Ensure that the bytes we're storing can be decoded to the output
          # character set, otherwise an early error is raised.
          output_charset = charset.output_codec or 'us-ascii'
          if output_charset != _charset.UNKNOWN8BIT:
              try:
                  s.encode(output_charset, errors)
              except UnicodeEncodeError:
                  if output_charset != 'us-ascii':
                      raise
                  # Text that does not fit us-ascii is stored as utf-8 instead
                  # of being rejected.
                  charset = UTF8
          self._chunks.append((s, charset))

      def _nonctext(self, s):
          """True if string s is not a ctext character of RFC822.
          """
          return s.isspace() or s in ('(', ')', '\\')

      def encode(self, splitchars=';, \t', maxlinelen=None, linesep='\n'):
          r"""Encode a message header into an RFC-compliant format.

          There are many issues involved in converting a given string for use in
          an email header.  Only certain character sets are readable in most
          email clients, and as header strings can only contain a subset of
          7-bit ASCII, care must be taken to properly convert and encode (with
          Base64 or quoted-printable) header strings.  In addition, there is a
          75-character length limit on any given encoded header field, so
          line-wrapping must be performed, even with double-byte character sets.

          Optional maxlinelen specifies the maximum length of each generated
          line, exclusive of the linesep string.  Individual lines may be longer
          than maxlinelen if a folding point cannot be found.  The first line
          will be shorter by the length of the header name plus ": " if a header
          name was specified at Header construction time.  The default value for
          maxlinelen is determined at header construction time.

          Optional splitchars is a string containing characters which should be
          given extra weight by the splitting algorithm during normal header
          wrapping.  This is in very rough support of RFC 2822's `higher level
          syntactic breaks':  split points preceded by a splitchar are preferred
          during line splitting, with the characters preferred in the order in
          which they appear in the string.  Space and tab may be included in the
          string to indicate whether preference should be given to one over the
          other as a split point when other split chars do not appear in the line
          being split.  Splitchars does not affect RFC 2047 encoded lines.

          Optional linesep is a string to be used to separate the lines of
          the value.  The default value is the most useful for typical
          Python applications, but it can be set to \r\n to produce RFC-compliant
          line separators when needed.

          Raises HeaderParseError if the encoded value looks like it contains
          an embedded header line.
          """
          self._normalize()
          if maxlinelen is None:
              maxlinelen = self._maxlinelen
          # A maxlinelen of 0 means don't wrap.  For all practical purposes,
          # choosing a huge number here accomplishes that and makes the
          # _ValueFormatter algorithm much simpler.
          if maxlinelen == 0:
              maxlinelen = 1000000
          formatter = _ValueFormatter(self._headerlen, maxlinelen,
                                      self._continuation_ws, splitchars)
          lastcs = None
          # hasspace is None only before the first chunk; it is set to False at
          # the end of every iteration, so the transition logic below runs
          # from the second chunk onwards.
          hasspace = lastspace = None
          for string, charset in self._chunks:
              if hasspace is not None:
                  hasspace = string and self._nonctext(string[0])
                  if lastcs not in (None, 'us-ascii'):
                      if not hasspace or charset not in (None, 'us-ascii'):
                          formatter.add_transition()
                  elif charset not in (None, 'us-ascii') and not lastspace:
                      formatter.add_transition()
              lastspace = string and self._nonctext(string[-1])
              lastcs = charset
              hasspace = False
              lines = string.splitlines()
              if lines:
                  formatter.feed('', lines[0], charset)
              else:
                  formatter.feed('', '', charset)
              for line in lines[1:]:
                  formatter.newline()
                  if charset.header_encoding is not None:
                      formatter.feed(self._continuation_ws, ' ' + line.lstrip(),
                                     charset)
                  else:
                      # Keep the line's own leading whitespace as its folding
                      # whitespace.
                      sline = line.lstrip()
                      fws = line[:len(line)-len(sline)]
                      formatter.feed(fws, sline, charset)
              if len(lines) > 1:
                  formatter.newline()
          if self._chunks:
              formatter.add_transition()
          value = formatter._str(linesep)
          if _embeded_header.search(value):
              raise HeaderParseError("header value appears to contain "
                  "an embedded header: {!r}".format(value))
          return value

      def _normalize(self):
          """Merge consecutive chunks that share a charset, in place."""
          # Step 1: Normalize the chunks so that all runs of identical charsets
          # get collapsed into a single unicode string.
          chunks = []
          last_charset = None
          last_chunk = []
          for string, charset in self._chunks:
              if charset == last_charset:
                  last_chunk.append(string)
              else:
                  if last_charset is not None:
                      chunks.append((SPACE.join(last_chunk), last_charset))
                  last_chunk = [string]
                  last_charset = charset
          if last_chunk:
              chunks.append((SPACE.join(last_chunk), last_charset))
          self._chunks = chunks
  class _ValueFormatter(object):
      """Collect header text into a list of lines, folding long lines.

      Finished lines are kept in self._lines; the line being built is an
      _Accumulator of (fws, text) parts in self._current_line.
      """

      def __init__(self, headerlen, maxlen, continuation_ws, splitchars):
          """Set up an empty formatter.

          headerlen is the size already taken on the first line, maxlen the
          line length to aim for, continuation_ws the whitespace put in front
          of continuation lines of encoded text, and splitchars the preferred
          split characters in order of preference.
          """
          self._maxlen = maxlen
          self._continuation_ws = continuation_ws
          self._continuation_ws_len = len(continuation_ws)
          self._splitchars = splitchars
          self._lines = []
          self._current_line = _Accumulator(headerlen)

      def _str(self, linesep):
          """Flush the current line and return all lines joined by linesep."""
          self.newline()
          return linesep.join(self._lines)

      def __str__(self):
          return self._str(NL)

      def newline(self):
          """Finish the current line and start an empty one.

          A trailing (' ', '') transition marker is discarded.  Remaining
          content that is only whitespace is attached to the previous line
          when there is one; otherwise the content becomes a new line.
          """
          end_of_line = self._current_line.pop()
          if end_of_line != (' ', ''):
              self._current_line.push(*end_of_line)
          if len(self._current_line) > 0:
              # Only merge into the previous line when one exists; without
              # this guard a whitespace-only value raised IndexError here.
              if self._current_line.is_onlyws() and self._lines:
                  self._lines[-1] += str(self._current_line)
              else:
                  self._lines.append(str(self._current_line))
          self._current_line.reset()

      def add_transition(self):
          """Push a single-space marker separating two chunks."""
          self._current_line.push(' ', '')

      def feed(self, fws, string, charset):
          """Add string, preceded by whitespace fws, using charset's rules."""
          # If the charset has no header encoding (i.e. it is an ASCII encoding)
          # then we must split the header at the "highest level syntactic break"
          # possible.  Note that we don't have a lot of smarts about field
          # syntax; we just try to break on semi-colons, then commas, then
          # whitespace.  Eventually, this should be pluggable.
          if charset.header_encoding is None:
              self._ascii_split(fws, string, self._splitchars)
              return
          # Otherwise, we're doing either a Base64 or a quoted-printable
          # encoding which means we don't need to split the line on syntactic
          # breaks.  We can basically just find enough characters to fit on the
          # current line, minus the RFC 2047 chrome.  What makes this trickier
          # though is that we have to split at octet boundaries, not character
          # boundaries but it's only safe to split at character boundaries so at
          # best we can only get close.
          encoded_lines = charset.header_encode_lines(string, self._maxlengths())
          # The first element extends the current line, but if it's None then
          # nothing more fit on the current line so start a new line.
          try:
              first_line = encoded_lines.pop(0)
          except IndexError:
              # There are no encoded lines, so we're done.
              return
          if first_line is not None:
              self._append_chunk(fws, first_line)
          try:
              last_line = encoded_lines.pop()
          except IndexError:
              # There was only one line.
              return
          self.newline()
          # The last line stays open so that later text can extend it.
          self._current_line.push(self._continuation_ws, last_line)
          # Everything else are full lines in themselves.
          for line in encoded_lines:
              self._lines.append(self._continuation_ws + line)

      def _maxlengths(self):
          """Yield the room left on the current line, then on each new line."""
          # The first line's length.
          yield self._maxlen - len(self._current_line)
          while True:
              yield self._maxlen - self._continuation_ws_len

      def _ascii_split(self, fws, string, splitchars):
          """Split unencoded text at whitespace runs and append each part."""
          # The RFC 2822 header folding algorithm is simple in principle but
          # complex in practice.  Lines may be folded any place where "folding
          # white space" appears by inserting a linesep character in front of the
          # FWS.  The complication is that not all spaces or tabs qualify as FWS,
          # and we are also supposed to prefer to break at "higher level
          # syntactic breaks".  We can't do either of these without intimate
          # knowledge of the structure of structured headers, which we don't have
          # here.  So the best we can do here is prefer to break at the specified
          # splitchars, and hope that we don't choose any spaces or tabs that
          # aren't legal FWS.  (This is at least better than the old algorithm,
          # where we would sometimes *introduce* FWS after a splitchar, or the
          # algorithm before that, where we would turn all white space runs into
          # single spaces or tabs.)
          parts = re.split("(["+FWS+"]+)", fws+string)
          # Make the list start with a whitespace item so that it reads as
          # alternating (whitespace, text) pairs.
          if parts[0]:
              parts[:0] = ['']
          else:
              parts.pop(0)
          # zip() over the same iterator twice walks the list two items at a
          # time.
          for ws, part in zip(*[iter(parts)]*2):
              self._append_chunk(ws, part)

      def _append_chunk(self, fws, string):
          """Push (fws, string) and fold the line if it became too long."""
          self._current_line.push(fws, string)
          if len(self._current_line) > self._maxlen:
              # Find the best split point, working backward from the end.
              # There might be none, on a long first line.
              for ch in self._splitchars:
                  for i in range(self._current_line.part_count()-1, 0, -1):
                      if ch.isspace():
                          part_fws = self._current_line[i][0]
                          if part_fws and part_fws[0] == ch:
                              break
                      prevpart = self._current_line[i-1][1]
                      if prevpart and prevpart[-1] == ch:
                          break
                  else:
                      # No split point for this character; try the next one.
                      continue
                  # The inner loop found a split point at index i.
                  break
              else:
                  # No split character matched anywhere on the line.
                  fws, part = self._current_line.pop()
                  if self._current_line._initial_size > 0:
                      # There will be a header, so leave it on a line by itself.
                      self.newline()
                      if not fws:
                          # We don't use continuation_ws here because the whitespace
                          # after a header should always be a space.
                          fws = ' '
                  self._current_line.push(fws, part)
                  return
              # Emit everything before the split point and carry the rest over
              # to the next line.
              remainder = self._current_line.pop_from(i)
              self._lines.append(str(self._current_line))
              self._current_line.reset(remainder)
  class _Accumulator(list):
      """List of (fws, text) pairs that make up the line being built.

      len() is redefined to report how many characters the line occupies,
      counting the initial size, instead of the number of pairs.  Use
      part_count() for the number of pairs.
      """

      def __init__(self, initial_size=0):
          # Characters already used on this line before any pair is pushed.
          self._initial_size = initial_size
          super().__init__()

      def push(self, fws, string):
          """Add one (fws, string) pair to the end of the line."""
          pair = (fws, string)
          self.append(pair)

      def pop_from(self, i=0):
          """Remove every pair from index i onwards and return them as a list."""
          tail = self[i:]
          del self[i:]
          return tail

      def pop(self):
          """Remove and return the last pair, or ('', '') if there is none."""
          if self.part_count() != 0:
              return super().pop()
          return ('', '')

      def __len__(self):
          """Return the initial size plus the length of all stored text."""
          total = self._initial_size
          for fws, part in self:
              total += len(fws) + len(part)
          return total

      def __str__(self):
          """Return all pairs concatenated in order."""
          pieces = []
          for fws, part in self:
              pieces.append(EMPTYSTRING.join((fws, part)))
          return EMPTYSTRING.join(pieces)

      def reset(self, startval=None):
          """Replace the contents with startval (default: nothing).

          The initial size is cleared as well.
          """
          self[:] = [] if startval is None else startval
          self._initial_size = 0

      def is_onlyws(self):
          """True if no initial size is set and the text is empty or whitespace."""
          if self._initial_size != 0:
              return False
          return len(self) == 0 or str(self).isspace()

      def part_count(self):
          """Return the number of pairs held (the real list length)."""
          return super().__len__()