UnicodeUtils.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. # -*- coding: windows-1252 -*-
  2. '''
  3. From BIFF8 on, strings are always stored using UTF-16LE text encoding. The
  4. character array is a sequence of 16-bit values. Additionally it is
  5. possible to use a compressed format, which omits the high bytes of all
  6. characters, if they are all zero.
  7. The following tables describe the standard format of the entire string, but
  8. in many records the strings differ from this format. This will be mentioned
  9. separately. It is possible (but not required) to store Rich-Text formatting
  10. information and Asian phonetic information inside a Unicode string. This
  11. results in four different ways to store a string. The character array
  12. is not zero-terminated.
  13. The string consists of the character count (as usual an 8-bit value or
  14. a 16-bit value), option flags, the character array and optional formatting
  15. information. If the string is empty, sometimes the option flags field will
  16. not occur. This is mentioned at the respective place.
  17. Offset Size Contents
  18. 0 1 or 2 Length of the string (character count, ln)
  19. 1 or 2 1 Option flags:
  20. Bit Mask Contents
  21. 0 01H Character compression (ccompr):
  22. 0 = Compressed (8-bit characters)
  23. 1 = Uncompressed (16-bit characters)
  24. 2 04H Asian phonetic settings (phonetic):
  25. 0 = Does not contain Asian phonetic settings
  26. 1 = Contains Asian phonetic settings
  27. 3 08H Rich-Text settings (richtext):
  28. 0 = Does not contain Rich-Text settings
  29. 1 = Contains Rich-Text settings
  30. [2 or 3] 2 (optional, only if richtext=1) Number of Rich-Text formatting runs (rt)
  31. [var.] 4 (optional, only if phonetic=1) Size of Asian phonetic settings block (in bytes, sz)
  32. var. ln or
  33. 2·ln Character array (8-bit characters or 16-bit characters, dependent on ccompr)
  34. [var.] 4·rt (optional, only if richtext=1) List of rt formatting runs
  35. [var.] sz (optional, only if phonetic=1) Asian Phonetic Settings Block
  36. '''
  37. from .compat import unicode, unicode_type
  38. from struct import pack
  39. def upack2(s, encoding='ascii'):
  40. # If not unicode, make it so.
  41. if isinstance(s, unicode_type):
  42. us = s
  43. else:
  44. us = unicode(s, encoding)
  45. # Limit is based on number of content characters
  46. # (not on number of bytes in packed result)
  47. len_us = len(us)
  48. if len_us > 32767:
  49. raise Exception('String longer than 32767 characters')
  50. try:
  51. encs = us.encode('latin1')
  52. # Success here means all chars are in U+0000 to U+00FF
  53. # inclusive, meaning that we can use "compressed format".
  54. flag = 0
  55. n_items = len_us
  56. except UnicodeEncodeError:
  57. encs = us.encode('utf_16_le')
  58. flag = 1
  59. n_items = len(encs) // 2
  60. # n_items is the number of "double byte characters" i.e. MS C wchars
  61. # Can't use len(us).
  62. # len(u"\U0001D400") -> 1 on a wide-unicode build
  63. # and 2 on a narrow-unicode build.
  64. # We need n_items == 2 in this case.
  65. return pack('<HB', n_items, flag) + encs
  66. def upack2rt(rt, encoding='ascii'):
  67. us = u''
  68. fr = b''
  69. offset = 0
  70. # convert rt strings to unicode if not already unicode
  71. # also generate the formatting run for the styles added
  72. for s, fontx in rt:
  73. if not isinstance(s, unicode_type):
  74. s = unicode(s, encoding)
  75. us += s
  76. if fontx is not None:
  77. # code in Rows.py ensures that
  78. # fontx can be None only for the first piece
  79. fr += pack('<HH', offset, fontx)
  80. # offset is the number of MS C wchar characters.
  81. # That is 1 if c <= u'\uFFFF' else 2
  82. offset += len(s.encode('utf_16_le')) // 2
  83. num_fr = len(fr) // 4 # ensure result is int
  84. if offset > 32767:
  85. raise Exception('String longer than 32767 characters')
  86. try:
  87. encs = us.encode('latin1')
  88. # Success here means all chars are in U+0000 to U+00FF
  89. # inclusive, meaning that we can use "compressed format".
  90. flag = 0 | 8
  91. n_items = len(encs)
  92. except UnicodeEncodeError:
  93. encs = us.encode('utf_16_le')
  94. flag = 1 | 8
  95. n_items = len(encs) // 2 # see comments in upack2 function above
  96. return pack('<HBH', n_items, flag, num_fr) + encs, fr
  97. def upack1(s, encoding='ascii'):
  98. # Same as upack2(), but with a one-byte length field.
  99. if isinstance(s, unicode_type):
  100. us = s
  101. else:
  102. us = unicode(s, encoding)
  103. len_us = len(us)
  104. if len_us > 255:
  105. raise Exception('String longer than 255 characters')
  106. try:
  107. encs = us.encode('latin1')
  108. flag = 0
  109. n_items = len_us
  110. except UnicodeEncodeError:
  111. encs = us.encode('utf_16_le')
  112. flag = 1
  113. n_items = len(encs) // 2
  114. return pack('<BB', n_items, flag) + encs