ExcelFormulaLexer.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  1. from __future__ import print_function
  2. # -*- coding: windows-1252 -*-
  3. from .antlr import EOF, CommonToken as Tok, TokenStream, TokenStreamException
  4. from . import ExcelFormulaParser
  5. from re import compile as recompile, IGNORECASE, VERBOSE
# Regex source fragments for the multi-character tokens.  They are not
# compiled individually: further down they are wrapped in one capturing
# group each and joined into a single alternation compiled with
# VERBOSE and IGNORECASE.  Consequently whitespace and '#' comments inside
# a fragment are ignored, letters match in either case, and any grouping
# inside a fragment must be non-capturing ("(?:...)") so that the group
# numbering of the combined regex stays one-group-per-fragment.

# Integer literal: a run of digits that ends at a word boundary.
int_const_pattern = r"\d+\b"
# Floating point literal: digits are optional before the dot and required
# after it, or digits followed by a bare trailing dot; an exponent may follow.
flt_const_pattern = r"""
(?:
(?: \d* \. \d+ ) # .1 .12 .123 etc 9.1 etc 98.1 etc
|
(?: \d+ \. ) # 1. 12. 123. etc
)
# followed by optional exponent part
(?: [Ee] [+-]? \d+ ) ?
"""
# Double-quoted string literal; a doubled quote ("") inside the string
# stands for one literal quote character.
str_const_pattern = r'"(?:[^"]|"")*"'
# Disabled 2-D range pattern, kept for reference only (note that the
# parenthesis opened by recompile( is never closed on this line).
#range2d_pattern = recompile(r"\$?[A-I]?[A-Z]\$?\d+:\$?[A-I]?[A-Z]\$?\d+"
# R1C1-style cell reference: R<row>C<col>, each number non-zero, leading
# zeros allowed.
ref2d_r1c1_pattern = r"[Rr]0*[1-9][0-9]*[Cc]0*[1-9][0-9]*"
# A1-style cell reference: optional '$', one or two column letters (the
# first one, when present, restricted to A-I), optional '$', then a
# non-zero row number with leading zeros allowed.
ref2d_pattern = r"\$?[A-I]?[A-Z]\$?0*[1-9][0-9]*"
# Keywords; the trailing \b prevents matching a prefix of a longer word.
true_pattern = r"TRUE\b"
false_pattern = r"FALSE\b"
if_pattern = r"IF\b"
choose_pattern = r"CHOOSE\b"
# Name: one word character followed by any mix of word characters and dots.
name_pattern = r"\w[\.\w]*"
# Single-quoted name; a doubled quote ('') stands for one literal quote.
# It's essential that the bracket in this pattern be non-grouping: a
# capturing group here would shift the group numbers of the combined regex.
quotename_pattern = r"'(?:[^']|'')*'"
# Two-character comparison operators.
ne_pattern = r"<>"
ge_pattern = r">="
le_pattern = r"<="
  29. pattern_type_tuples = (
  30. (flt_const_pattern, ExcelFormulaParser.NUM_CONST),
  31. (int_const_pattern, ExcelFormulaParser.INT_CONST),
  32. (str_const_pattern, ExcelFormulaParser.STR_CONST),
  33. # (range2d_pattern , ExcelFormulaParser.RANGE2D),
  34. (ref2d_r1c1_pattern, ExcelFormulaParser.REF2D_R1C1),
  35. (ref2d_pattern , ExcelFormulaParser.REF2D),
  36. (true_pattern , ExcelFormulaParser.TRUE_CONST),
  37. (false_pattern , ExcelFormulaParser.FALSE_CONST),
  38. (if_pattern , ExcelFormulaParser.FUNC_IF),
  39. (choose_pattern , ExcelFormulaParser.FUNC_CHOOSE),
  40. (name_pattern , ExcelFormulaParser.NAME),
  41. (quotename_pattern, ExcelFormulaParser.QUOTENAME),
  42. (ne_pattern, ExcelFormulaParser.NE),
  43. (ge_pattern, ExcelFormulaParser.GE),
  44. (le_pattern, ExcelFormulaParser.LE),
  45. )
  46. _re = recompile(
  47. '(' + ')|('.join(i[0] for i in pattern_type_tuples) + ')',
  48. VERBOSE+IGNORECASE)
  49. _toktype = [None] + [i[1] for i in pattern_type_tuples]
  50. # need dummy at start because re.MatchObject.lastindex counts from 1
# Token types for the one-character operators and punctuation marks.
# The lexer consults this table only after the multi-character regex has
# failed to match at the current position, so '<' and '>' here are the
# plain comparisons; "<>", ">=" and "<=" are handled by the regex.
single_char_lookup = {
    '=': ExcelFormulaParser.EQ,
    '<': ExcelFormulaParser.LT,
    '>': ExcelFormulaParser.GT,
    '+': ExcelFormulaParser.ADD,
    '-': ExcelFormulaParser.SUB,
    '*': ExcelFormulaParser.MUL,
    '/': ExcelFormulaParser.DIV,
    ':': ExcelFormulaParser.COLON,
    ';': ExcelFormulaParser.SEMICOLON,
    ',': ExcelFormulaParser.COMMA,
    '(': ExcelFormulaParser.LP,
    ')': ExcelFormulaParser.RP,
    '&': ExcelFormulaParser.CONCAT,
    '%': ExcelFormulaParser.PERCENT,
    '^': ExcelFormulaParser.POWER,
    '!': ExcelFormulaParser.BANG,
}
  69. class Lexer(TokenStream):
  70. def __init__(self, text):
  71. self._text = text[:]
  72. self._pos = 0
  73. self._line = 0
  74. def isEOF(self):
  75. return len(self._text) <= self._pos
  76. def curr_ch(self):
  77. return self._text[self._pos]
  78. def next_ch(self, n = 1):
  79. self._pos += n
  80. def is_whitespace(self):
  81. return self.curr_ch() in " \t\n\r\f\v"
  82. def match_pattern(self):
  83. m = _re.match(self._text, self._pos)
  84. if not m:
  85. return None
  86. self._pos = m.end(0)
  87. return Tok(type = _toktype[m.lastindex], text = m.group(0), col = m.start(0) + 1)
  88. def nextToken(self):
  89. # skip whitespace
  90. while not self.isEOF() and self.is_whitespace():
  91. self.next_ch()
  92. if self.isEOF():
  93. return Tok(type = EOF)
  94. # first, try to match token with 2 or more chars
  95. t = self.match_pattern()
  96. if t:
  97. return t
  98. # second, we want 1-char tokens
  99. te = self.curr_ch()
  100. try:
  101. ty = single_char_lookup[te]
  102. except KeyError:
  103. raise TokenStreamException(
  104. "Unexpected char %r in column %u." % (self.curr_ch(), self._pos))
  105. self.next_ch()
  106. return Tok(type=ty, text=te, col=self._pos)
  107. if __name__ == '__main__':
  108. try:
  109. for t in Lexer(""" 1.23 456 "abcd" R2C2 a1 iv65536 true false if choose a_name 'qname' <> >= <= """):
  110. print(t)
  111. except TokenStreamException as e:
  112. print("error:", e)