# -*- coding: utf-8 -*-
"""
    babel.messages.jslexer
    ~~~~~~~~~~~~~~~~~~~~~~

    A simple JavaScript 1.5 lexer which is used for the JavaScript
    extractor.

    :copyright: (c) 2013-2021 by the Babel Team.
    :license: BSD, see LICENSE for more details.
"""
import re
from collections import namedtuple

from babel._compat import unichr
# Operators recognised by the lexer, sorted longest first so that the
# alternation built from this list prefers e.g. ``>>>=`` over ``>``.
# ``/`` is not listed here: it is ambiguous with regular expression
# literals and is matched separately through ``division_re``/``regex_re``.
operators = sorted([
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.', ':'
], key=len, reverse=True)

# Single-character escape sequences and the characters they stand for.
escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

# NOTE: ``\w`` also matches digits, so digit-led tokens such as ``42`` match
# these name patterns as well, and the name rules are listed before the
# 'number' rule in ``_rules`` below.
name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)

# ``/`` or ``/=`` used as an operator.
division_re = re.compile(r'/=?')
# A regular expression literal including trailing flags.
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
# Any single line break; used for counting lines.
line_re = re.compile(r'(\r\n|\n|\r)')
# A backslash immediately followed by a line break (line continuation).
line_join_re = re.compile(r'\\' + line_re.pattern)
# Up to four hexadecimal digits following ``\u``.
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')

Token = namedtuple('Token', 'type value lineno')

# Ordered ``(token_type, pattern)`` pairs; the first pattern that matches at
# the current position wins. A ``None`` type marks text that is matched but
# produces no token.
_rules = [
    (None, re.compile(r'\s+', re.UNICODE)),
    (None, re.compile(r'<!--.*')),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    ('dotted_name', dotted_name_re),
    ('name', name_re),
    # The hexadecimal alternative must come first: alternation is tried left
    # to right, and the decimal branch would otherwise match just the
    # leading ``0`` of ``0x1F``.
    ('number', re.compile(r'''(
        (0x[a-fA-F0-9]+) |
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)?
    )''', re.VERBOSE)),
    ('jsx_tag', re.compile(r'(?:</?[^>\s]+|/>)', re.I)),  # May be mangled in `get_rules`
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    # DOTALL so that an escaped line break (line continuation) inside a
    # template literal is accepted, as it is by the 'string' rule.
    ('template_string', re.compile(r'''`(?:[^`\\]*(?:\\.[^`\\]*)*)`''', re.UNICODE | re.DOTALL)),
    ('string', re.compile(r'''(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)' |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )''', re.VERBOSE | re.DOTALL))
]
def get_rules(jsx, dotted, template_string):
    """
    Build the list of tokenization rules for the given syntax options.

    Internal to this module.

    :param jsx: keep rules whose type name contains ``jsx``
    :param dotted: keep the ``dotted_name`` rule (emitted as type ``name``)
    :param template_string: keep the ``template_string`` rule
    :return: list of ``(token_type, pattern)`` pairs in ``_rules`` order
    """
    selected = []
    for kind, pattern in _rules:
        # Untyped rules (kind is None) are never JSX rules.
        is_jsx_rule = bool(kind) and 'jsx' in kind
        if is_jsx_rule and not jsx:
            continue
        if kind == 'template_string' and not template_string:
            continue
        if kind == 'dotted_name':
            if not dotted:
                continue
            # Dotted names are reported to callers as plain names.
            kind = 'name'
        selected.append((kind, pattern))
    return selected
def indicates_division(token):
    """A helper function that helps the tokenizer to decide if the current
    token may be followed by a division operator.

    A ``/`` is treated as division when the preceding token ends an
    expression: a closing bracket, a postfix ``++``/``--``, or a complete
    operand (name, number, string, template string or regular expression).

    :param token: object with ``type`` and ``value`` attributes
    :return: ``True`` if a following ``/`` should be read as division
    """
    if token.type == 'operator':
        return token.value in (')', ']', '}', '++', '--')
    # A template string is an operand just like a quoted string, so a ``/``
    # after it is a division and not the start of a regex literal.
    return token.type in ('name', 'number', 'string', 'template_string',
                          'regexp')
def unquote_string(string):
    """Unquote a string with JavaScript rules.  The string has to start and
    end with the same string delimiter (``'``, ``"`` or the back-tick/grave
    accent used for template strings).

    Processing steps:

    * a backslash directly followed by a line break is dropped, the line
      break itself is kept;
    * escapes listed in ``escapes`` are replaced by their character;
    * ``\\u``/``\\U`` followed by exactly four hex digits becomes the
      corresponding character; with fewer digits the ``u`` and the digits
      are kept literally;
    * for any other escaped character only the backslash is removed;
    * a lone backslash at the very end of the content is kept verbatim.

    :param string: the string literal including its delimiters
    :return: the unquoted text
    :raises AssertionError: if the string is empty or not properly delimited
    """
    assert string and string[0] == string[-1] and string[0] in '"\'`', \
        'string provided is not properly delimited'
    string = line_join_re.sub('\\1', string[1:-1])
    result = []
    add = result.append
    pos = 0
    length = len(string)

    while True:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # A backslash as the last character escapes nothing; keep it as is
        # instead of indexing past the end of the string.
        if escape_pos + 1 >= length:
            add('\\')
            pos = length
            break

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes.  Try to consume up to four hexadecimal
        # characters and try to interpret them as a unicode character
        # point.  If there is no such character point, put all the
        # consumed characters into the string.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(unichr(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        # skip the backslash, the ``u`` and four digits
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)

        # bogus escape.  Just remove the backslash.
        else:
            add(next_char)
        pos = escape_pos + 2

    if pos < length:
        add(string[pos:])

    return u''.join(result)
def tokenize(source, jsx=True, dotted=True, template_string=True):
    """
    Tokenize JavaScript/JSX source.  Returns a generator of tokens.

    Text matched by an untyped rule produces no token.  Characters that
    match no rule at all are skipped one at a time.

    :param source: the source text to tokenize
    :param jsx: Enable (limited) JSX parsing.
    :param dotted: Read dotted names as single name token.
    :param template_string: Support ES6 template strings
    :return: generator yielding ``Token(type, value, lineno)`` tuples, where
             ``lineno`` is the 1-based line on which the token starts
    """
    may_divide = False
    pos = 0
    lineno = 1
    end = len(source)
    rules = get_rules(jsx=jsx, dotted=dotted, template_string=template_string)

    while pos < end:
        # handle regular rules first
        for token_type, rule in rules:
            match = rule.match(source, pos)
            if match is not None:
                break
        # if we don't have a match we don't give up yet, but check for
        # division operators or regular expression literals, based on
        # the status of `may_divide` which is determined by the last
        # processed significant token using `indicates_division`.
        else:
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
            if match is None:
                # woops. invalid syntax. jump one char ahead and try again.
                pos += 1
                continue

        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            # Comments are not part of the expression, so they must not
            # change whether a following ``/`` is a division: in
            # ``a /* c */ / b`` the slash after the comment still divides.
            if token_type not in ('linecomment', 'multilinecomment'):
                may_divide = indicates_division(token)
            yield token
        # count line breaks inside the matched text (also for skipped text)
        lineno += len(line_re.findall(token_value))
        pos = match.end()
|