#Copyright ReportLab Europe Ltd. 2000-2017
#see license.txt for license details
#history https://hg.reportlab.com/hg-public/reportlab/log/tip/src/reportlab/lib/textsplit.py
"""Helpers for text wrapping, hyphenation, Asian text splitting and kinsoku shori.

How to split a 'big word' depends on the language and the writing system.  This module
works on a Unicode string.  It ought to grow by allowing more algorithms to be plugged
in based on possible knowledge of the language and desirable 'niceness' of the algorithm.
"""
__version__='3.3.0'
from unicodedata import category
from reportlab.pdfbase.pdfmetrics import stringWidth
from reportlab.rl_config import _FUZZ
from reportlab.lib.utils import isUnicode
CANNOT_START_LINE = [
    #strongly prohibited e.g. end brackets, stop, exclamation...
    u'!\',.:;?!")]\u3001\u3002\u300d\u300f\u3011\u3015\uff3d\u3011\uff09',
    #middle priority e.g. continuation small vowels - wrapped on two lines but one string...
    u'\u3005\u2015\u3041\u3043\u3045\u3047\u3049\u3063\u3083\u3085\u3087\u308e\u30a1\u30a3'
    u'\u30a5\u30a7\u30a9\u30c3\u30e3\u30e5\u30e7\u30ee\u30fc\u30f5\u30f6',
    #weakly prohibited - continuations, celsius symbol etc.
    u'\u309b\u309c\u30fb\u30fd\u30fe\u309d\u309e\u2015\u2010\xb0\u2032\u2033\u2103\uffe0\uff05\u2030'
    ]

ALL_CANNOT_START = u''.join(CANNOT_START_LINE)

CANNOT_END_LINE = [
    #strongly prohibited
    u'\u2018\u201c\uff08[{\uff08\u3014\uff3b\uff5b\u3008\u300a\u300c\u300e\u3010',
    #weaker - currency symbols, hash, postcode - prefixes
    u'$\u00a3@#\uffe5\uff04\uffe1\uff20\u3012\u00a7'
    ]

ALL_CANNOT_END = u''.join(CANNOT_END_LINE)
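
#A quick illustration of the two kinsoku classes above (an added example,
#not part of the original module): the full-width close paren may not
#begin a line, and the full-width open paren may not end one.
#>>> u'\uff09' in ALL_CANNOT_START
#True
#>>> u'\uff08' in ALL_CANNOT_END
#True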

def is_multi_byte(ch):
    "Is this an Asian character?"
    return (ord(ch) >= 0x3000)
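
#Illustrative values for the crude 0x3000 cut-off used above (an added
#example): ASCII falls below it, while CJK punctuation, kana and kanji
#fall at or above it.
#>>> is_multi_byte(u'A')
#False
#>>> is_multi_byte(u'\u6771')   #KANJI 'east', U+6771
#True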

def getCharWidths(word, fontName, fontSize):
    """Returns a list of glyph widths.

    >>> getCharWidths('Hello', 'Courier', 10)
    [6.0, 6.0, 6.0, 6.0, 6.0]
    >>> from reportlab.pdfbase.cidfonts import UnicodeCIDFont
    >>> from reportlab.pdfbase.pdfmetrics import registerFont
    >>> registerFont(UnicodeCIDFont('HeiseiMin-W3'))
    >>> getCharWidths(u'\u6771\u4EAC', 'HeiseiMin-W3', 10)   #most kanji are 100 ems
    [10.0, 10.0]
    """
    #character-level function call; the performance is going to SUCK
    return [stringWidth(uChar, fontName, fontSize) for uChar in word]
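
#If the character-at-a-time stringWidth calls above ever matter, a
#memoised variant along these lines is one option.  This is a sketch of
#an added, hypothetical helper (_getCharWidthsCached is not part of the
#library API):
def _getCharWidthsCached(word, fontName, fontSize, _cache={}):
    "Sketch: getCharWidths with a per-(char, font, size) width cache."
    widths = []
    for ch in word:
        key = (ch, fontName, fontSize)
        w = _cache.get(key)
        if w is None:
            w = _cache[key] = stringWidth(ch, fontName, fontSize)
        widths.append(w)
    return widths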

def wordSplit(word, maxWidths, fontName, fontSize, encoding='utf8'):
    """Attempts to break a word which lacks spaces into two parts, the first of which
    fits in the remaining space.  It is allowed to add hyphens or whatever it wishes.

    This is intended as a wrapper for some language- and user-choice-specific splitting
    algorithms.  It should only be called after line breaking on spaces, which covers western
    languages and is highly optimised already.  It works on the 'last unsplit word'.

    Presumably with further study one could write a Unicode splitting algorithm for text
    fragments which was much faster.

    Courier characters should be 6 points wide.

    >>> wordSplit('HelloWorld', 30, 'Courier', 10)
    [[0.0, 'Hello'], [0.0, 'World']]
    >>> wordSplit('HelloWorld', 31, 'Courier', 10)
    [[1.0, 'Hello'], [1.0, 'World']]
    """
    if not isUnicode(word):
        uword = word.decode(encoding)
    else:
        uword = word

    charWidths = getCharWidths(uword, fontName, fontSize)
    lines = dumbSplit(uword, charWidths, maxWidths)

    if not isUnicode(word):
        lines2 = []
        #convert back
        for (extraSpace, text) in lines:
            lines2.append([extraSpace, text.encode(encoding)])
        lines = lines2

    return lines
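
#Byte-string round trip (an added example): non-unicode input is decoded
#with the given encoding, split, and the pieces re-encoded, so bytes in
#means bytes out (Python 3 reprs shown):
#  wordSplit(b'HelloWorld', 30, 'Courier', 10)
#  --> [[0.0, b'Hello'], [0.0, b'World']]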

def dumbSplit(word, widths, maxWidths):
    """This function attempts to fit as many characters as possible into the available
    space, cutting "like a knife" between characters.  This would do for Chinese.
    It returns a list of [extraSpace, text] items where text is a Unicode string,
    and extraSpace is the points of unused space available on the line.  This is a
    structure which is fairly easy to display, and supports 'backtracking' approaches
    after the fact.

    Test cases assume each character is ten points wide...

    >>> dumbSplit(u'Hello', [10]*5, 60)
    [[10, u'Hello']]
    >>> dumbSplit(u'Hello', [10]*5, 50)
    [[0, u'Hello']]
    >>> dumbSplit(u'Hello', [10]*5, 40)
    [[0, u'Hell'], [30, u'o']]
    """
    _more = """
    #>>> dumbSplit(u'Hello', [10]*5, 4)   # less than one character
    #(u'', u'Hello')

    # this says 'Nihongo wa muzukashii desu ne!' (Japanese is difficult isn't it?) in 12 characters
    >>> jtext = u'\u65e5\u672c\u8a9e\u306f\u96e3\u3057\u3044\u3067\u3059\u306d\uff01'
    >>> dumbSplit(jtext, [10]*11, 30)
    (u'\u65e5\u672c\u8a9e', u'\u306f\u96e3\u3057\u3044\u3067\u3059\u306d\uff01')
    """
    if not isinstance(maxWidths,(list,tuple)): maxWidths = [maxWidths]
    assert isUnicode(word)
    lines = []
    i = widthUsed = lineStartPos = 0
    maxWidth = maxWidths[0]
    nW = len(word)
    while i < nW:
        w = widths[i]
        c = word[i]
        widthUsed += w
        i += 1
        if widthUsed > maxWidth + _FUZZ and widthUsed>0:
            extraSpace = maxWidth - widthUsed
            if ord(c)<0x3000:
                # we appear to be inside a non-Asian script section.
                # (this is a very crude test but quick to compute).
                # This is likely to be quite rare so the speed of the
                # code below is hopefully not a big issue.  The main
                # situation requiring this is that a document title
                # with an english product name in it got cut.

                # we count back and look for
                #  - a space-like character
                #  - reversion to Kanji (which would be a good split point)
                #  - in the worst case, roughly half way back along the line
                limitCheck = (lineStartPos+i)>>1    #(arbitrary taste issue)
                for j in range(i-1,limitCheck,-1):
                    cj = word[j]
                    if category(cj)=='Zs' or ord(cj)>=0x3000:
                        k = j+1
                        if k<i:
                            j = k+1
                            extraSpace += sum(widths[j:i])
                            w = widths[k]
                            c = word[k]
                            i = j
                            break
                #end of English-within-Asian special case

            #we are pushing this character back, but
            #the most important of the Japanese typography rules
            #if this character cannot start a line, wrap it up to this line so it hangs
            #in the right margin.  We won't do two or more though - that's unlikely and
            #would result in growing ugliness.
            #and increase the extra space
            #bug fix contributed by Alexander Vasilenko <alexs.vasilenko@gmail.com>
            if c not in ALL_CANNOT_START and i>lineStartPos+1:
                #otherwise we need to push the character back
                #the i>lineStart+1 condition ensures progress
                i -= 1
                extraSpace += w
            #lines.append([maxWidth-sum(widths[lineStartPos:i]), word[lineStartPos:i].strip()])
            lines.append([extraSpace, word[lineStartPos:i].strip()])
            try:
                maxWidth = maxWidths[len(lines)]
            except IndexError:
                maxWidth = maxWidths[-1]  # use the last one
            lineStartPos = i
            widthUsed = 0

    #any characters left?
    if widthUsed > 0:
        lines.append([maxWidth - widthUsed, word[lineStartPos:]])

    return lines
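
#Illustration of the hang rule above (an added example; each character
#idealised at 10 points, Python 3 reprs): the ideographic full stop
#\u3002 cannot start a line, so it stays on the full line and that
#line's extraSpace goes negative:
#  dumbSplit(u'\u65e5\u672c\u3002\u8a9e', [10]*4, 20)
#  --> [[-10, '\u65e5\u672c\u3002'], [10, '\u8a9e']]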

def kinsokuShoriSplit(word, widths, availWidth):
    #NOT USED OR FINISHED YET!
    """Split according to Japanese rules according to CJKV (Lunde).

    Essentially look for "nice splits" so that we don't end a line
    with an open bracket, or start one with a full stop, or stuff like
    that.  There is no attempt to try to split compound words into
    constituent kanji.  It currently uses wrap-down: packs as much
    on a line as possible, then backtracks if needed.

    This returns a number of words each of which should just about fit
    on a line.  If you give it a whole paragraph at once, it will
    do all the splits.

    It's possible we might slightly step over the width limit
    if we do hanging punctuation marks in future (e.g. dangle a Japanese
    full stop in the right margin rather than using a whole character
    box).
    """
    lines = []
    assert len(word) == len(widths)
    curWidth = 0.0
    curLine = []
    i = 0   #character index - we backtrack at times so cannot use a for loop
    while i < len(word):    #bounded so the unfinished code cannot loop forever
        ch = word[i]
        w = widths[i]
        if curWidth + w < availWidth:
            curLine.append(ch)
            curWidth += w
        else:
            #end of line.  check legality
            if ch in CANNOT_END_LINE[0]:
                pass
        i += 1
        #to be completed
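
#A minimal working sketch of the wrap-down-with-backtracking idea the
#docstring describes (an added illustration, not the library's finished
#algorithm; _kinsokuSplitSketch is a hypothetical helper): pack
#characters greedily, then back up while the break would start the next
#line with a prohibited character or end this one with a prohibited one.
def _kinsokuSplitSketch(word, widths, availWidth):
    lines = []
    start = 0       #index of the first character on the current line
    used = 0.0      #width consumed on the current line so far
    i = 0
    n = len(word)
    while i < n:
        if i == start or used + widths[i] <= availWidth:
            used += widths[i]
            i += 1
            continue
        #break before word[i]; back up while the break is illegal,
        #but never past start+1 so each line keeps at least one character
        j = i
        while j > start + 1 and (word[j] in ALL_CANNOT_START or
                                 word[j-1] in ALL_CANNOT_END):
            j -= 1
        lines.append(word[start:j])
        start = i = j
        used = 0.0
    if start < n:
        lines.append(word[start:])
    return lines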

# This recipe refers:
#
#  http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/148061
import re
from functools import reduce    #reduce is not a builtin under Python 3
rx=re.compile(u"([\u2e80-\uffff])", re.UNICODE)
def cjkwrap(text, width, encoding="utf8"):
    return reduce(lambda line, word, width=width: '%s%s%s' %
                  (line,
                   [' ','\n',''][(len(line)-line.rfind('\n')-1
                                  + len(word.split('\n',1)[0]) >= width) or
                                 line[-1:] == '\0' and 2],
                   word),
                  rx.sub(r'\1\0 ', str(text,encoding)).split(' ')
                  ).replace('\0', '').encode(encoding)
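
#Typical use of the recipe (an added note): feed it utf8-encoded bytes
#and a column width; every CJK character becomes a break opportunity and
#the result comes back utf8-encoded, e.g.
#  wrapped = cjkwrap(u'\u65e5\u672c\u8a9e\u306f'.encode('utf8'), 20)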

if __name__=='__main__':
    import doctest
    from reportlab.lib import textsplit
    doctest.testmod(textsplit)