pofile.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645
  1. # -*- coding: utf-8 -*-
  2. """
  3. babel.messages.pofile
  4. ~~~~~~~~~~~~~~~~~~~~~
  5. Reading and writing of files in the ``gettext`` PO (portable object)
  6. format.
  7. :copyright: (c) 2013-2021 by the Babel Team.
  8. :license: BSD, see LICENSE for more details.
  9. """
  10. from __future__ import print_function
  11. import os
  12. import re
  13. from babel.messages.catalog import Catalog, Message
  14. from babel.util import wraptext
  15. from babel._compat import text_type, cmp
  def unescape(string):
      r"""Undo `escape`: strip the surrounding quotes and expand escapes.

      >>> print(unescape('"Say:\\n  \\"hello, world!\\"\\n"'))
      Say:
        "hello, world!"
      <BLANKLINE>

      :param string: the quoted, escaped string to unescape
      """
      control_chars = {'n': '\n', 't': '\t', 'r': '\r'}

      def _expand(found):
          code = found.group(1)
          # A backslash or a double quote simply stands for itself.
          return control_chars.get(code, code)

      # The first and last characters are the enclosing double quotes.
      inner = string[1:-1]
      escape_pattern = re.compile(r'\\([\\trn"])')
      return escape_pattern.sub(_expand, inner)
  def denormalize(string):
      r"""Turn text produced by `normalize` back into the plain string.

      >>> print(denormalize(r'''""
      ... "Say:\n"
      ... "  \"hello, world!\"\n"'''))
      Say:
        "hello, world!"
      <BLANKLINE>

      >>> print(denormalize(r'''""
      ... "Say:\n"
      ... "  \"Lorem ipsum dolor sit "
      ... "amet, consectetur adipisicing"
      ... " elit, \"\n"'''))
      Say:
        "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
      <BLANKLINE>

      :param string: the string to denormalize
      """
      # Single-line input is just one quoted string.
      if '\n' not in string:
          return unescape(string)

      pieces = string.splitlines()
      # Multi-line values conventionally begin with an empty ``""`` line,
      # which carries no content and is dropped.
      if string.startswith('""'):
          pieces = pieces[1:]
      return ''.join([unescape(piece) for piece in pieces])
  class PoFileError(Exception):
      """Raised by the PO parser when it meets an invalid po file.

      The exception text is ``"<message> on <lineno>"``; the catalog being
      filled, the offending line and its line number are kept as the
      attributes ``catalog``, ``line`` and ``lineno``.
      """

      def __init__(self, message, catalog, line, lineno):
          description = '{message} on {lineno}'.format(message=message, lineno=lineno)
          super(PoFileError, self).__init__(description)
          self.lineno = lineno
          self.line = line
          self.catalog = catalog
  68. class _NormalizedString(object):
  69. def __init__(self, *args):
  70. self._strs = []
  71. for arg in args:
  72. self.append(arg)
  73. def append(self, s):
  74. self._strs.append(s.strip())
  75. def denormalize(self):
  76. return ''.join(map(unescape, self._strs))
  77. def __nonzero__(self):
  78. return bool(self._strs)
  79. __bool__ = __nonzero__
  80. def __repr__(self):
  81. return os.linesep.join(self._strs)
  82. def __cmp__(self, other):
  83. if not other:
  84. return 1
  85. return cmp(text_type(self), text_type(other))
  86. def __gt__(self, other):
  87. return self.__cmp__(other) > 0
  88. def __lt__(self, other):
  89. return self.__cmp__(other) < 0
  90. def __ge__(self, other):
  91. return self.__cmp__(other) >= 0
  92. def __le__(self, other):
  93. return self.__cmp__(other) <= 0
  94. def __eq__(self, other):
  95. return self.__cmp__(other) == 0
  96. def __ne__(self, other):
  97. return self.__cmp__(other) != 0
  class PoFileParser(object):
      """Support class to read messages from a ``gettext`` PO (portable object)
      file and add them to a `Catalog`.

      The parser is a small line-oriented state machine: comment lines fill
      the pending locations/flags/comments, keyword lines (``msgid``,
      ``msgstr``, ...) start a new string, and lines starting with ``"``
      continue whichever string is currently open.

      See `read_po` for simple cases.
      """

      _keywords = [
          'msgid',
          'msgstr',
          'msgctxt',
          'msgid_plural',
      ]

      def __init__(self, catalog, ignore_obsolete=False, abort_invalid=False):
          """
          :param catalog: the catalog that parsed messages are added to
          :param ignore_obsolete: when true, obsolete (``#~``) entries are
                                  dropped instead of stored in
                                  ``catalog.obsolete``
          :param abort_invalid: when true, invalid input raises `PoFileError`
                                instead of only printing a warning
          """
          self.catalog = catalog
          self.ignore_obsolete = ignore_obsolete
          self.counter = 0  # number of messages added so far
          self.offset = 0  # zero-based line index of the current msgid
          self.abort_invalid = abort_invalid
          self._reset_message_state()

      def _reset_message_state(self):
          """Forget everything collected for the message being parsed."""
          self.messages = []
          self.translations = []
          self.locations = []
          self.flags = []
          self.user_comments = []
          self.auto_comments = []
          self.context = None
          self.obsolete = False
          self.in_msgid = False
          self.in_msgstr = False
          self.in_msgctxt = False

      def _add_message(self):
          """
          Add a message to the catalog based on the current parser state and
          clear the state ready to process the next message.
          """
          self.translations.sort()
          # More than one collected id means msgid + msgid_plural.
          if len(self.messages) > 1:
              msgid = tuple([m.denormalize() for m in self.messages])
          else:
              msgid = self.messages[0].denormalize()
          if isinstance(msgid, (list, tuple)):
              string = ['' for _ in range(self.catalog.num_plurals)]
              for idx, translation in self.translations:
                  if idx >= self.catalog.num_plurals:
                      self._invalid_pofile(u"", self.offset, "msg has more translations than num_plurals of catalog")
                      continue
                  string[idx] = translation.denormalize()
              string = tuple(string)
          else:
              string = self.translations[0][1].denormalize()
          if self.context:
              msgctxt = self.context.denormalize()
          else:
              msgctxt = None
          message = Message(msgid, string, list(self.locations), set(self.flags),
                            self.auto_comments, self.user_comments, lineno=self.offset + 1,
                            context=msgctxt)
          if self.obsolete:
              if not self.ignore_obsolete:
                  self.catalog.obsolete[msgid] = message
          else:
              self.catalog[msgid] = message
          self.counter += 1
          self._reset_message_state()

      def _finish_current_message(self):
          """Flush the pending message, if any, into the catalog.

          A ``msgid`` that was never followed by a ``msgstr`` is reported via
          `_invalid_pofile` and, unless that aborts, stored with an empty
          translation rather than crashing with an ``IndexError``.
          """
          if self.messages:
              if not self.translations:
                  self._invalid_pofile(
                      u"", self.offset,
                      "missing msgstr for msgid '%s'" % self.messages[0].denormalize())
                  # `""` is the quoted form of the empty string.
                  self.translations.append([0, _NormalizedString(u'""')])
              self._add_message()

      def _process_message_line(self, lineno, line, obsolete=False):
          """Dispatch a non-comment line to the continuation or keyword handler."""
          if line.startswith('"'):
              self._process_string_continuation_line(line, lineno)
          else:
              self._process_keyword_line(lineno, line, obsolete)

      def _process_keyword_line(self, lineno, line, obsolete=False):
          """Handle a line that starts with one of the PO keywords.

          :param lineno: zero-based index of the line in the input
          :param line: the stripped line (without any ``#~`` prefix)
          :param obsolete: whether the line came from an obsolete entry
          """
          for keyword in self._keywords:
              try:
                  # The keyword must be followed by a space or, for plural
                  # translations, by the opening bracket of ``msgstr[N]``.
                  if line.startswith(keyword) and line[len(keyword)] in [' ', '[']:
                      arg = line[len(keyword):]
                      break
              except IndexError:
                  self._invalid_pofile(line, lineno, "Keyword must be followed by a string")
          else:
              self._invalid_pofile(line, lineno, "Start of line didn't match any expected keyword.")
              return

          # A new msgid/msgctxt begins the next entry, so flush the previous one.
          if keyword in ['msgid', 'msgctxt']:
              self._finish_current_message()

          self.obsolete = obsolete

          # The line that has the msgid is stored as the offset of the msg
          # should this be the msgctxt if it has one?
          if keyword == 'msgid':
              self.offset = lineno

          if keyword in ['msgid', 'msgid_plural']:
              self.in_msgctxt = False
              self.in_msgid = True
              self.messages.append(_NormalizedString(arg))

          elif keyword == 'msgstr':
              self.in_msgid = False
              self.in_msgstr = True
              if arg.startswith('['):
                  # Plural form: ``[N] "text"``
                  idx, msg = arg[1:].split(']', 1)
                  self.translations.append([int(idx), _NormalizedString(msg)])
              else:
                  self.translations.append([0, _NormalizedString(arg)])

          elif keyword == 'msgctxt':
              self.in_msgctxt = True
              self.context = _NormalizedString(arg)

      def _process_string_continuation_line(self, line, lineno):
          """Append a bare ``"..."`` line to the string currently being read."""
          if self.in_msgid:
              s = self.messages[-1]
          elif self.in_msgstr:
              s = self.translations[-1][1]
          elif self.in_msgctxt:
              s = self.context
          else:
              self._invalid_pofile(line, lineno, "Got line starting with \" but not in msgid, msgstr or msgctxt")
              return
          s.append(line)

      def _process_comment(self, line):
          """Record a ``#`` comment line as location, flag or comment data.

          A comment always terminates the message before it, so the pending
          message (if any) is flushed first.
          """
          self._finish_current_message()
          if line[1:].startswith(':'):
              # ``#: file:lineno file2 ...`` location references
              for location in line[2:].lstrip().split():
                  pos = location.rfind(':')
                  if pos >= 0:
                      try:
                          lineno = int(location[pos + 1:])
                      except ValueError:
                          # Not a number after the colon: skip this reference.
                          continue
                      self.locations.append((location[:pos], lineno))
                  else:
                      self.locations.append((location, None))
          elif line[1:].startswith(','):
              # ``#, flag, flag`` comma-separated flags
              for flag in line[2:].lstrip().split(','):
                  self.flags.append(flag.strip())
          elif line[1:].startswith('.'):
              # These are called auto-comments
              comment = line[2:].strip()
              if comment:  # Just check that we're not adding empty comments
                  self.auto_comments.append(comment)
          else:
              # These are called user comments
              self.user_comments.append(line[1:].strip())

      def parse(self, fileobj):
          """
          Reads from the file-like object `fileobj` and adds any po file
          units found in it to the `Catalog` supplied to the constructor.
          """
          for lineno, line in enumerate(fileobj):
              line = line.strip()
              if not isinstance(line, text_type):
                  line = line.decode(self.catalog.charset)
              if not line:
                  continue
              if line.startswith('#'):
                  if line[1:].startswith('~'):
                      # ``#~`` marks an obsolete entry; parse what follows it.
                      self._process_message_line(lineno, line[2:].lstrip(), obsolete=True)
                  else:
                      self._process_comment(line)
              else:
                  self._process_message_line(lineno, line)

          self._finish_current_message()

          # No actual messages found, but there was some info in comments, from which
          # we'll construct an empty header message
          if not self.counter and (self.flags or self.user_comments or self.auto_comments):
              self.messages.append(_NormalizedString(u'""'))
              self.translations.append([0, _NormalizedString(u'""')])
              self._add_message()

      def _invalid_pofile(self, line, lineno, msg):
          """Report invalid input: raise if `abort_invalid`, else print a warning.

          :param line: the offending line (text)
          :param lineno: zero-based index of the line
          :param msg: description of the problem
          :raises PoFileError: when the parser was created with
                               ``abort_invalid=True``
          """
          assert isinstance(line, text_type)
          if self.abort_invalid:
              raise PoFileError(msg, self.catalog, line, lineno)
          print("WARNING:", msg)
          # `line` is guaranteed to be unicode so u"{}"-interpolating would always
          # succeed, but on Python 2 if not in a TTY, `sys.stdout.encoding`
          # is `None`, unicode may not be printable so we `repr()` to ASCII.
          print(u"WARNING: Problem on line {0}: {1}".format(lineno + 1, repr(line)))
  def read_po(fileobj, locale=None, domain=None, ignore_obsolete=False, charset=None, abort_invalid=False):
      """Parse a ``gettext`` PO (portable object) file from the given file-like
      object and return the resulting `Catalog`.

      >>> from datetime import datetime
      >>> from babel._compat import StringIO
      >>> buf = StringIO('''
      ... #: main.py:1
      ... #, fuzzy, python-format
      ... msgid "foo %(name)s"
      ... msgstr "quux %(name)s"
      ...
      ... # A user comment
      ... #. An auto comment
      ... #: main.py:3
      ... msgid "bar"
      ... msgid_plural "baz"
      ... msgstr[0] "bar"
      ... msgstr[1] "baaz"
      ... ''')
      >>> catalog = read_po(buf)
      >>> catalog.revision_date = datetime(2007, 4, 1)

      >>> for message in catalog:
      ...     if message.id:
      ...         print((message.id, message.string))
      ...         print(' ', (message.locations, sorted(list(message.flags))))
      ...         print(' ', (message.user_comments, message.auto_comments))
      (u'foo %(name)s', u'quux %(name)s')
        ([(u'main.py', 1)], [u'fuzzy', u'python-format'])
        ([], [])
      ((u'bar', u'baz'), (u'bar', u'baaz'))
        ([(u'main.py', 3)], [])
        ([u'A user comment'], [u'An auto comment'])

      .. versionadded:: 1.0
         Added support for explicit charset argument.

      :param fileobj: the file-like object to read the PO file from
      :param locale: the locale identifier or `Locale` object, or `None`
                     if the catalog is not bound to a locale (which basically
                     means it's a template)
      :param domain: the message domain
      :param ignore_obsolete: whether to ignore obsolete messages in the input
      :param charset: the character set of the catalog.
      :param abort_invalid: abort read if po file is invalid
      """
      result = Catalog(locale=locale, domain=domain, charset=charset)
      # The parser fills `result` in place while it walks the file.
      PoFileParser(result, ignore_obsolete, abort_invalid=abort_invalid).parse(fileobj)
      return result
  # Splitting pattern used by `normalize` when wrapping long lines. The whole
  # pattern is one capturing group, so `WORD_SEP.split()` keeps the separators
  # in its result and the chunks can be joined back together without loss.
  WORD_SEP = re.compile('('
                        r'\s+|'                                 # any whitespace
                        r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|'  # hyphenated words
                        r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w)'   # em-dash
                        ')')
  def escape(string):
      r"""Quote the given string for use as a double-quoted ``PO`` file string.

      Backslashes, tabs, carriage returns, newlines and double quotes are
      replaced by their backslash escapes and the result is wrapped in
      double quotes.

      >>> escape('''Say:
      ...   "hello, world!"
      ... ''')
      '"Say:\\n  \\"hello, world!\\"\\n"'

      :param string: the string to escape
      """
      # The backslash must be handled first, otherwise the backslashes
      # introduced by the later replacements would be doubled as well.
      substitutions = (
          ('\\', '\\\\'),
          ('\t', '\\t'),
          ('\r', '\\r'),
          ('\n', '\\n'),
          ('\"', '\\"'),
      )
      escaped = string
      for raw, replacement in substitutions:
          escaped = escaped.replace(raw, replacement)
      return '"%s"' % escaped
  def normalize(string, prefix='', width=76):
      r"""Render a string in the quoted, possibly multi-line form used in
      .po files.

      >>> print(normalize('''Say:
      ...   "hello, world!"
      ... ''', width=None))
      ""
      "Say:\n"
      "  \"hello, world!\"\n"

      >>> print(normalize('''Say:
      ...   "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
      ... ''', width=32))
      ""
      "Say:\n"
      "  \"Lorem ipsum dolor sit "
      "amet, consectetur adipisicing"
      " elit, \"\n"

      :param string: the string to normalize
      :param prefix: a string that should be prepended to every line
      :param width: the maximum line width; use `None`, 0, or a negative number
                    to completely disable line wrapping
      """
      if not (width and width > 0):
          # Wrapping disabled: only break at the string's own newlines.
          lines = string.splitlines(True)
      else:
          prefix_width = len(prefix)
          lines = []
          for source_line in string.splitlines(True):
              if len(escape(source_line)) + prefix_width <= width:
                  lines.append(source_line)
                  continue
              # Too long: split into words/separators and refill greedily.
              # The list is reversed so chunks can be popped off the end.
              pending = WORD_SEP.split(source_line)
              pending.reverse()
              while pending:
                  taken = []
                  used = 2  # the two enclosing double quotes
                  while pending:
                      cost = len(escape(pending[-1])) - 2 + prefix_width
                      if used + cost >= width:
                          if not taken:
                              # handle long chunks by putting them on a
                              # separate line
                              taken.append(pending.pop())
                          break
                      taken.append(pending.pop())
                      used += cost
                  lines.append(u''.join(taken))

      # A value that fits on one line needs no leading empty string.
      if len(lines) <= 1:
          return escape(string)

      # Remove empty trailing line
      if lines and not lines[-1]:
          del lines[-1]
          lines[-1] += '\n'
      quoted = [(prefix + escape(piece)) for piece in lines]
      return u'""\n' + u'\n'.join(quoted)
  def write_po(fileobj, catalog, width=76, no_location=False, omit_header=False,
               sort_output=False, sort_by_file=False, ignore_obsolete=False,
               include_previous=False, include_lineno=True):
      r"""Write a ``gettext`` PO (portable object) template file for a given
      message catalog to the provided file-like object.

      >>> catalog = Catalog()
      >>> catalog.add(u'foo %(name)s', locations=[('main.py', 1)],
      ...             flags=('fuzzy',))
      <Message...>
      >>> catalog.add((u'bar', u'baz'), locations=[('main.py', 3)])
      <Message...>
      >>> from babel._compat import BytesIO
      >>> buf = BytesIO()
      >>> write_po(buf, catalog, omit_header=True)
      >>> print(buf.getvalue().decode("utf8"))
      #: main.py:1
      #, fuzzy, python-format
      msgid "foo %(name)s"
      msgstr ""
      <BLANKLINE>
      #: main.py:3
      msgid "bar"
      msgid_plural "baz"
      msgstr[0] ""
      msgstr[1] ""
      <BLANKLINE>
      <BLANKLINE>

      :param fileobj: the file-like object to write to
      :param catalog: the `Catalog` instance
      :param width: the maximum line width for the generated output; use `None`,
                    0, or a negative number to completely disable line wrapping
      :param no_location: do not emit a location comment for every message
      :param omit_header: do not include the ``msgid ""`` entry at the top of the
                          output
      :param sort_output: whether to sort the messages in the output by msgid
      :param sort_by_file: whether to sort the messages in the output by their
                           locations
      :param ignore_obsolete: whether to ignore obsolete messages and not include
                              them in the output; by default they are included as
                              comments
      :param include_previous: include the old msgid as a comment when
                               updating the catalog
      :param include_lineno: include line number in the location comment
      """
      def _normalize(key, prefix=''):
          return normalize(key, prefix=prefix, width=width)

      def _write(text):
          # The output is bytes; text is encoded with the catalog charset.
          if isinstance(text, text_type):
              text = text.encode(catalog.charset, 'backslashreplace')
          fileobj.write(text)

      def _write_comment(comment, prefix=''):
          # xgettext always wraps comments even if --no-wrap is passed;
          # provide the same behaviour
          if width and width > 0:
              _width = width
          else:
              _width = 76
          for line in wraptext(comment, _width):
              _write('#%s %s\n' % (prefix, line.strip()))

      def _write_message(message, prefix=''):
          # The optional context line is the same for singular and plural
          # entries, so it is written once up front.
          if message.context:
              _write('%smsgctxt %s\n' % (prefix,
                                         _normalize(message.context, prefix)))
          if isinstance(message.id, (list, tuple)):
              _write('%smsgid %s\n' % (prefix, _normalize(message.id[0], prefix)))
              _write('%smsgid_plural %s\n' % (
                  prefix, _normalize(message.id[1], prefix)
              ))

              for idx in range(catalog.num_plurals):
                  try:
                      string = message.string[idx]
                  except IndexError:
                      # Fewer translations than plural forms: pad with "".
                      string = ''
                  _write('%smsgstr[%d] %s\n' % (
                      prefix, idx, _normalize(string, prefix)
                  ))
          else:
              _write('%smsgid %s\n' % (prefix, _normalize(message.id, prefix)))
              _write('%smsgstr %s\n' % (
                  prefix, _normalize(message.string or '', prefix)
              ))

      sort_by = None
      if sort_output:
          sort_by = "message"
      elif sort_by_file:
          sort_by = "location"

      for message in _sort_messages(catalog, sort_by=sort_by):
          if not message.id:  # This is the header "message"
              if omit_header:
                  continue
              comment_header = catalog.header_comment
              if width and width > 0:
                  lines = []
                  for line in comment_header.splitlines():
                      lines += wraptext(line, width=width,
                                        subsequent_indent='# ')
                  comment_header = u'\n'.join(lines)
              _write(comment_header + u'\n')

          for comment in message.user_comments:
              _write_comment(comment)
          for comment in message.auto_comments:
              _write_comment(comment, prefix='.')

          if not no_location:
              locs = []

              # sort locations by filename and lineno.
              # if there's no <int> as lineno, use `-1`.
              # if no sorting possible, leave unsorted.
              # (see issue #606)
              try:
                  locations = sorted(
                      message.locations,
                      key=lambda x: (x[0], x[1] if isinstance(x[1], int) and x[1] else -1))
              except TypeError:  # e.g. "TypeError: unorderable types: NoneType() < int()"
                  locations = message.locations

              for filename, lineno in locations:
                  if lineno and include_lineno:
                      location = u'%s:%d' % (filename.replace(os.sep, '/'), lineno)
                  else:
                      location = u'%s' % filename.replace(os.sep, '/')
                  # Different locations can render to the same text (for
                  # instance once line numbers are omitted); emit each
                  # reference only once.
                  if location not in locs:
                      locs.append(location)
              _write_comment(' '.join(locs), prefix=':')
          if message.flags:
              _write('#%s\n' % ', '.join([''] + sorted(message.flags)))

          if message.previous_id and include_previous:
              _write_comment('msgid %s' % _normalize(message.previous_id[0]),
                             prefix='|')
              if len(message.previous_id) > 1:
                  _write_comment('msgid_plural %s' % _normalize(
                      message.previous_id[1]
                  ), prefix='|')

          _write_message(message)
          _write('\n')

      if not ignore_obsolete:
          # Obsolete entries are kept in the file, commented out with ``#~``.
          for message in _sort_messages(
              catalog.obsolete.values(),
              sort_by=sort_by
          ):
              for comment in message.user_comments:
                  _write_comment(comment)
              _write_message(message, prefix='#~ ')
              _write('\n')
  534. def _sort_messages(messages, sort_by):
  535. """
  536. Sort the given message iterable by the given criteria.
  537. Always returns a list.
  538. :param messages: An iterable of Messages.
  539. :param sort_by: Sort by which criteria? Options are `message` and `location`.
  540. :return: list[Message]
  541. """
  542. messages = list(messages)
  543. if sort_by == "message":
  544. messages.sort()
  545. elif sort_by == "location":
  546. messages.sort(key=lambda m: m.locations)
  547. return messages