surrogateescape.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  """
  This is Victor Stinner's pure-Python implementation of PEP 383: the
  "surrogateescape" error handler of Python 3.

  Source: misc/python/surrogateescape.py in https://bitbucket.org/haypo/misc
  """
  # This code is released under the Python license and the BSD 2-clause license.
  7. import codecs
  8. import sys
  9. from future import utils
  # Name of the codec error handler. It is the name looked up and registered
  # by register_surrogateescape() and the `errors` argument passed to
  # str.encode()/bytes.decode() in encodefilename()/decodefilename().
  FS_ERRORS = 'surrogateescape'
  # # -- Python 2/3 compatibility -------------------------------------
  # Alternative handler name, kept for reference (disabled):
  # FS_ERRORS = 'my_surrogateescape'
  def u(text):
      """
      Return *text* as a unicode string on both Python 2 and Python 3.

      On Python 3 (``utils.PY3`` is true) the argument is returned unchanged.
      Otherwise it is decoded with the 'unicode_escape' codec, so escapes
      written in a native Python 2 string literal become the characters they
      name.
      """
      return text if utils.PY3 else text.decode('unicode_escape')
  def b(data):
      """
      Return *data* as a byte string on both Python 2 and Python 3.

      On Python 2 the argument is returned unchanged. On Python 3
      (``utils.PY3`` is true) it is encoded with the 'latin1' codec.
      """
      if not utils.PY3:
          return data
      return data.encode('latin1')
  # Version-specific single-character constructors:
  #   _unichr(code)   -> one-character unicode string for an integer code point
  #   bytes_chr(code) -> one-byte byte string for an integer
  if not utils.PY3:
      # Python 2: unichr() builds unicode, chr() builds a 1-byte native str.
      _unichr = unichr
      bytes_chr = chr
  else:
      # Python 3: chr() builds str; bytes() of a 1-tuple builds a single byte.
      _unichr = chr
      bytes_chr = lambda code: bytes((code,))
  def surrogateescape_handler(exc):
      """
      Pure Python implementation of the PEP 383 "surrogateescape" error
      handler of Python 3.

      Undecodable bytes are replaced by a Unicode character U+DCxx on
      decoding, and these characters are translated back on encoding.

      *exc* is the Unicode error passed in by the codec machinery; only the
      slice ``exc.object[exc.start:exc.end]`` is converted.

      Returns a ``(replacement, exc.end)`` tuple. Re-raises *exc* if it is
      neither a UnicodeDecodeError nor a UnicodeEncodeError, or if the
      conversion helper raises NotASurrogateError.
      """
      offending = exc.object[exc.start:exc.end]

      if isinstance(exc, UnicodeDecodeError):
          # `offending` is a byte string in this case.
          convert = replace_surrogate_decode
      elif isinstance(exc, UnicodeEncodeError):
          # Original author's note: for u'\udcc3'.encode('ascii', <handler>),
          # both Python 2.x and 3.x raise an exception anyway after this
          # function is called. It seems the strict encoder is then applied
          # to the unicode string returned from here.
          convert = replace_surrogate_encode
      else:
          raise exc

      try:
          replacement = convert(offending)
      except NotASurrogateError:
          # The slice cannot be escaped: fail with the original error.
          raise exc
      return (replacement, exc.end)
  class NotASurrogateError(Exception):
      """
      Raised by the replace_surrogate_* helpers for input they cannot map;
      surrogateescape_handler() catches it and re-raises the codec error.
      """
  def replace_surrogate_encode(mystring):
      """
      Map every character of *mystring* from U+DC00..U+DCFF back to the code
      point U+0000..U+00FF it stands for.

      Returns a (unicode) string, not the more logical bytes, because the
      codecs register_error functionality expects this.

      Raises NotASurrogateError if any character lies outside
      U+DC00..U+DCFF, so the caller can fail with the original codec error.
      """
      decoded = []
      for ch in mystring:
          code = ord(ch)
          # Only U+DC00..U+DCFF can be mapped back. The previous guard also
          # let U+D800..U+DBFF through, which reached _unichr() with a
          # negative argument and raised ValueError, not NotASurrogateError.
          # NOTE: U+DC00..U+DC7F is still accepted, as before; the original
          # author questioned whether that is a good idea.
          if not 0xDC00 <= code <= 0xDCFF:
              raise NotASurrogateError
          decoded.append(_unichr(code - 0xDC00))
      return str().join(decoded)
  def replace_surrogate_decode(mybytes):
      """
      Turn the byte string *mybytes* into a (unicode) string.

      Values 0x00..0x7F become the character with the same code point;
      values 0x80..0xFF become the escape character U+DC80..U+DCFF.
      Raises NotASurrogateError for any value above 0xFF.
      """
      pieces = []
      for item in mybytes:
          # Items are ints when iterating newbytes (or Py3 bytes) and
          # one-character native strings on Py2.
          value = item if isinstance(item, int) else ord(item)
          if value > 0xFF:
              raise NotASurrogateError
          if value >= 0x80:
              # Shift non-ASCII bytes into the escape range.
              value += 0xDC00
          pieces.append(_unichr(value))
      return str().join(pieces)
  def encodefilename(fn):
      """
      Encode the unicode filename *fn* to bytes using FS_ENCODING, turning
      escape characters U+DC80..U+DCFF back into the bytes 0x80..0xFF.

      'ascii' and 'utf-8' are encoded by hand, character by character:

      * ASCII encoder of Python 2 expects that the error handler returns a
        Unicode string encodable to ASCII, whereas the surrogateescape
        handler has to produce bytes in the 0x80-0xFF range.
      * UTF-8 encoder of Python 2 encodes surrogates itself, so
        U+DC80-U+DCFF never reaches the error handler.

      Any other encoding goes through ``fn.encode(FS_ENCODING, FS_ERRORS)``.

      Raises UnicodeEncodeError for a character the hand-written paths cannot
      encode (non-ASCII under 'ascii', any other surrogate under 'utf-8').
      """
      if FS_ENCODING not in ('ascii', 'utf-8'):
          return fn.encode(FS_ENCODING, FS_ERRORS)

      ascii_only = FS_ENCODING == 'ascii'
      chunks = []
      for pos, char in enumerate(fn):
          cp = ord(char)
          if 0xDC80 <= cp <= 0xDCFF:
              # Escaped byte: restore the original 0x80..0xFF value.
              chunks.append(bytes_chr(cp - 0xDC00))
          elif ascii_only:
              if cp >= 128:
                  raise UnicodeEncodeError(FS_ENCODING,
                                           fn, pos, pos+1,
                                           'ordinal not in range(128)')
              chunks.append(bytes_chr(cp))
          elif 0xD800 <= cp <= 0xDFFF:
              # Any other surrogate is rejected under UTF-8.
              raise UnicodeEncodeError(
                  FS_ENCODING,
                  fn, pos, pos+1, 'surrogates not allowed')
          else:
              chunks.append(char.encode('utf-8'))
      return bytes().join(chunks)
  def decodefilename(fn):
      """
      Decode the byte-string filename *fn* using FS_ENCODING with the
      FS_ERRORS error handler and return the result.
      """
      decoded_name = fn.decode(FS_ENCODING, FS_ERRORS)
      return decoded_name
  # Active configuration: the encoding used by encodefilename() and
  # decodefilename(), plus a sample byte-string filename (`fn`) and the
  # unicode string it is expected to decode to (`encoded`).
  FS_ENCODING = 'ascii'
  fn = b('[abc\xff]')
  encoded = u('[abc\udcff]')
  # Alternative configurations (disabled):
  # FS_ENCODING = 'cp932'; fn = b('[abc\x81\x00]'); encoded = u('[abc\udc81\x00]')
  # FS_ENCODING = 'UTF-8'; fn = b('[abc\xff]'); encoded = u('[abc\udcff]')

  # Normalize the filesystem encoding name.
  # For example, we expect "utf-8", not "UTF8".
  FS_ENCODING = codecs.lookup(FS_ENCODING).name
  def register_surrogateescape():
      """
      Registers the surrogateescape error handler on Python 2 (only).

      Does nothing on Python 3, or when a handler is already registered
      under the FS_ERRORS name.
      """
      if not utils.PY3:
          try:
              # Probe for an existing handler; LookupError means none exists.
              codecs.lookup_error(FS_ERRORS)
          except LookupError:
              codecs.register_error(FS_ERRORS, surrogateescape_handler)
  # Script entry point. Running the module directly currently does nothing;
  # the round-trip self-test below is kept only as commented-out reference.
  if __name__ == '__main__':
      pass
      # # Tests: decode the sample filename `fn`, compare it with `encoded`,
      # # then encode it again and compare with the original bytes.
      # register_surrogateescape()
      # b = decodefilename(fn)
      # assert b == encoded, "%r != %r" % (b, encoded)
      # c = encodefilename(b)
      # assert c == fn, '%r != %r' % (c, fn)
      # # print("ok")