jslexer.py

# -*- coding: utf-8 -*-
"""
    babel.messages.jslexer
    ~~~~~~~~~~~~~~~~~~~~~~

    A simple JavaScript 1.5 lexer which is used for the JavaScript
    extractor.

    :copyright: (c) 2013-2021 by the Babel Team.
    :license: BSD, see LICENSE for more details.
"""
from collections import namedtuple
import re

from babel._compat import unichr

operators = sorted([
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
], key=len, reverse=True)

escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_]', re.UNICODE)
division_re = re.compile(r'/=?')
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')

Token = namedtuple('Token', 'type value lineno')

_rules = [
    (None, re.compile(r'\s+', re.UNICODE)),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    ('dotted_name', dotted_name_re),
    ('name', name_re),
    ('number', re.compile(r'''(
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)? |
        (0x[a-fA-F0-9]+)
    )''', re.VERBOSE)),
    ('jsx_tag', re.compile(r'(?:</?[^>\s]+|/>)', re.I)),  # May be mangled in `get_rules`
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('template_string', re.compile(r'''`(?:[^`\\]*(?:\\.[^`\\]*)*)`''', re.UNICODE)),
    ('string', re.compile(r'''(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )''', re.VERBOSE | re.DOTALL))
]
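
# Illustrative note (not part of the original module): the operator pattern
# above relies on `operators` being sorted longest-first, because regex
# alternation tries branches left to right; a doctest-style sketch:
#
#   >>> op_re = re.compile('|'.join(map(re.escape, operators)))
#   >>> op_re.match('>>>=').group()
#   '>>>='
#
# Unsorted, the shorter '>' branch would win first and '>>>=' would come out
# as four separate operator tokens.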


def get_rules(jsx, dotted, template_string):
    """
    Get a tokenization rule list given the passed syntax options.

    Internal to this module.
    """
    rules = []
    for token_type, rule in _rules:
        if not jsx and token_type and 'jsx' in token_type:
            continue
        if not template_string and token_type == 'template_string':
            continue
        if token_type == 'dotted_name':
            if not dotted:
                continue
            token_type = 'name'
        rules.append((token_type, rule))
    return rules
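
# Usage sketch (illustrative, not part of the original module): disabling a
# syntax option drops its rule, and 'dotted_name' is folded into 'name' so
# consumers only ever see 'name' tokens:
#
#   >>> [t for t, _ in get_rules(jsx=False, dotted=True, template_string=False)]
#   [None, None, 'linecomment', 'multilinecomment', 'name', 'name', 'number', 'operator', 'string']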


def indicates_division(token):
    """Helper for the tokenizer: decide whether the current token may be
    followed by a division operator rather than a regular expression literal.
    """
    if token.type == 'operator':
        return token.value in (')', ']', '}', '++', '--')
    return token.type in ('name', 'number', 'string', 'regexp')
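
# Why this is needed (illustrative): '/' is ambiguous in JavaScript. After a
# value-producing token it means division; otherwise it opens a regexp
# literal. `tokenize` (below) uses `indicates_division` to pick a side:
#
#   >>> [t.type for t in tokenize('a / b')]
#   ['name', 'operator', 'name']
#   >>> [t.type for t in tokenize('a = /b/g')]
#   ['name', 'operator', 'regexp']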


def unquote_string(string):
    """Unquote a string with JavaScript rules. The string has to start and
    end with string delimiters (``'``, ``"``, or the back-tick/grave accent
    for template strings).
    """
    assert string and string[0] == string[-1] and string[0] in '"\'`', \
        'string provided is not properly delimited'
    string = line_join_re.sub('\\1', string[1:-1])
    result = []
    add = result.append
    pos = 0

    while 1:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])
        # unicode escapes. try to consume up to four hexadecimal
        # characters and interpret them as a Unicode code point. If
        # there is no such code point, put all the consumed characters
        # back into the string.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(unichr(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)
        # bogus escape. Just remove the backslash.
        else:
            add(next_char)

        pos = escape_pos + 2

    if pos < len(string):
        add(string[pos:])

    return u''.join(result)
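
# Usage sketch (illustrative, not part of the original module): the argument
# is the raw source text of the literal, delimiters included.
#
#   >>> unquote_string(r'"a\tb"') == 'a\tb'
#   True
#   >>> unquote_string(r"'caf\u00e9'") == u'caf\xe9'
#   True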


def tokenize(source, jsx=True, dotted=True, template_string=True):
    """
    Tokenize JavaScript/JSX source. Returns a generator of tokens.

    :param jsx: Enable (limited) JSX parsing.
    :param dotted: Read dotted names as single name token.
    :param template_string: Support ES6 template strings.
    """
    may_divide = False
    pos = 0
    lineno = 1
    end = len(source)
    rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)

    while pos < end:
        # handle regular rules first
        for token_type, rule in rules:
            match = rule.match(source, pos)
            if match is not None:
                break
        # if we don't have a match we don't give up yet, but check for
        # division operators or regular expression literals, based on
        # the status of `may_divide`, which is determined by the last
        # processed non-whitespace token via `indicates_division`.
        else:
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
            if match is None:
                # whoops, invalid syntax: jump one char ahead and try again.
                pos += 1
                continue

        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            may_divide = indicates_division(token)
            yield token
        lineno += len(line_re.findall(token_value))
        pos = match.end()
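
# A minimal end-to-end sketch (not part of the original module), roughly the
# pattern the JavaScript extractor follows: lex the source, then unquote the
# string argument of a translation call.
#
#   >>> [(t.type, t.value) for t in tokenize("msg = gettext('Hello')")]
#   [('name', 'msg'), ('operator', '='), ('name', 'gettext'), ('operator', '('), ('string', "'Hello'"), ('operator', ')')]
#   >>> unquote_string("'Hello'") == 'Hello'
#   True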