bytecode.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tools for searching bytecode for key statements which indicate need for
  4. additional resources e.g. data file, package metadata.
  5. By *bytecode* I mean the ``code`` object given by ``compile()``, accessible
  6. from the ``__code__`` attribute of any non-builtin function or, in
  7. PyInstallerLand, the ``PyiModuleGraph.node("some.module").code`` attribute.
  8. The best guide for bytecode format I've found is the disassembler reference:
  9. https://docs.python.org/3/library/dis.html
  10. This parser implementation aims to combine the flexibility and speed of regex
  11. with the clarity of the output of ``dis.dis(code)``. It hasn't achieved the 2nd
  12. but C'est la vie...
  13. The biggest clarity killer here is the ``EXTENDED_ARG`` opcode which can appear
  14. almost anywhere and therefore needs to be tiptoed around at every step.
  15. If this code needs to expand significantly then I would recommend to upgrade to
  16. a regex-based grammar parsing library such as Reparse. This way, little steps
  17. like unpacking ``EXTENDED_ARGS`` can be defined once then simply referenced
  18. forming a nice hierarchy rather than copied everywhere its needed.
  19. """
  20. import dis
  21. import re
  22. from types import CodeType
  23. from typing import Pattern
  24. def _instruction_to_regex(x: str):
  25. """Get a regex-escaped opcode byte from its human readable name."""
  26. if x not in dis.opname: # pragma: no cover
  27. # These opcodes are available only in Python >=3.7.
  28. # For our purposes, these aliases will do.
  29. if x == "LOAD_METHOD":
  30. x = "LOAD_ATTR"
  31. elif x == "CALL_METHOD":
  32. x = "CALL_FUNCTION"
  33. return re.escape(bytes([dis.opmap[x]]))
  34. def bytecode_regex(pattern: bytes, flags=re.VERBOSE | re.DOTALL):
  35. """A regex powered Python bytecode matcher.
  36. ``bytecode_regex`` provides a very thin wrapper around :func:`re.compile`.
  37. * Any opcode names wrapped in backticks are substituted for their
  38. corresponding opcode bytes.
  39. * Patterns are compiled in VERBOSE mode by default so that whitespace and
  40. comments may be used.
  41. This aims to mirror the output of :func:`dis.dis` which is far more
  42. readable than looking at raw byte strings.
  43. """
  44. assert isinstance(pattern, bytes)
  45. # Replace anything wrapped in backticks with regex-escaped opcodes.
  46. pattern = re.sub(
  47. rb"`(\w+)`",
  48. lambda m: _instruction_to_regex(m[1].decode()),
  49. pattern,
  50. )
  51. return re.compile(pattern, flags=flags)
  52. def finditer(pattern: Pattern, string):
  53. """Call ``pattern.finditer(string)`` but remove any matches beginning on an
  54. odd byte. i.e. match.start() is not a multiple of 2.
  55. This should be used to avoid false positive matches where a bytecode pair's
  56. argument is mistaken for an opcode.
  57. """
  58. matches = pattern.finditer(string)
  59. while True:
  60. for match in matches:
  61. if match.start() % 2 == 0:
  62. # All is good. This match starts on an OPCODE.
  63. yield match
  64. else:
  65. # This match has started on an odd byte meaning that it's a
  66. # false positive and should be skipped. There is a very slim
  67. # chance that a genuine match overlaps this one and, because
  68. # re.finditer() doesn't allow overlapping matches, it would be
  69. # lost. To avoid that, restart the regex scan starting at the
  70. # next even byte.
  71. matches = pattern.finditer(string, match.start() + 1)
  72. break
  73. else:
  74. break
  75. # language=PythonVerboseRegExp
  76. _call_function_bytecode = bytecode_regex(rb"""
  77. # Matches `global_function('some', 'constant', 'arguments')`.
  78. # Load the global function.
  79. # In code with >256 of names, this may require extended name references.
  80. ((?:`EXTENDED_ARG`.)*
  81. (?:`LOAD_NAME`|`LOAD_GLOBAL`|`LOAD_FAST`).)
  82. # For foo.bar.whizz(), the above is the 'foo', below is the 'bar.whizz'.
  83. ((?:(?:`EXTENDED_ARG`.)*
  84. (?:`LOAD_METHOD`|`LOAD_ATTR`).)*)
  85. # Load however many arguments it takes. These (for now) must all be
  86. # constants. Again, code with >256 constants may need extended enumeration.
  87. ((?:(?:`EXTENDED_ARG`.)*
  88. `LOAD_CONST`.)*)
  89. # Call the function. The parameter is the argument count (which may also be
  90. # >256).
  91. ((?:`EXTENDED_ARG`.)*
  92. (?:`CALL_FUNCTION`|`CALL_METHOD`).)
  93. """)
  94. # language=PythonVerboseRegExp
  95. _extended_arg_bytecode = bytecode_regex(rb"""(
  96. # Arbitrary number of EXTENDED_ARG pairs.
  97. (?:`EXTENDED_ARG`.)*
  98. # Followed by some other instruction (usually a LOAD).
  99. [^`EXTENDED_ARG`].
  100. )""")
  101. def extended_arguments(extended_args: bytes):
  102. """Unpack the (extended) integer used to reference names or constants.
  103. The input should be a bytecode snippet of the following form::
  104. EXTENDED_ARG ? # Repeated 0-4 times.
  105. LOAD_xxx ? # Any of LOAD_NAME/LOAD_CONST/LOAD_METHOD/...
  106. Each ? byte combined together gives the number we want.
  107. """
  108. return int.from_bytes(extended_args[1::2], "big")
  109. def load(raw: bytes, code: CodeType) -> str:
  110. """Parse an (extended) LOAD_xxx instruction."""
  111. # Get the enumeration.
  112. index = extended_arguments(raw)
  113. # Work out what that enumeration was for (constant/local var/global var).
  114. # If the last instruction byte is a LOAD_FAST:
  115. if raw[-2] == dis.opmap["LOAD_FAST"]:
  116. # Then this is a local variable.
  117. return code.co_varnames[index]
  118. # Or if it's a LOAD_CONST:
  119. if raw[-2] == dis.opmap["LOAD_CONST"]:
  120. # Then this is a literal.
  121. return code.co_consts[index]
  122. # Otherwise, it's a global name.
  123. return code.co_names[index]
  124. def loads(raw: bytes, code: CodeType) -> list:
  125. """Parse multiple consecutive LOAD_xxx instructions. Or load() in a for
  126. loop.
  127. May be used to unpack a function's parameters or nested attributes
  128. ``(foo.bar.pop.whack)``.
  129. """
  130. return [load(i, code) for i in _extended_arg_bytecode.findall(raw)]
  131. def function_calls(code: CodeType) -> list:
  132. """Scan a code object for all function calls on constant arguments."""
  133. match: re.Match
  134. out = []
  135. for match in finditer(_call_function_bytecode, code.co_code):
  136. function_root, methods, args, arg_count = match.groups()
  137. # For foo():
  138. # `function_root` contains 'foo' and `methods` is empty.
  139. # For foo.bar.whizz():
  140. # `function_root` contains 'foo' and `methods` contains the rest.
  141. function_root = load(function_root, code)
  142. methods = loads(methods, code)
  143. function = ".".join([function_root] + methods)
  144. args = loads(args, code)
  145. arg_count = extended_arguments(arg_count)
  146. if arg_count != len(args):
  147. # This happens if there are variable or keyword arguments.
  148. # Bail out in either case.
  149. continue
  150. out.append((function, args))
  151. return out
  152. def search_recursively(search: callable, code: CodeType, _memo=None) -> dict:
  153. """Apply a search function to a code object, recursing into child code
  154. objects (function definitions)."""
  155. if _memo is None:
  156. _memo = {}
  157. if code not in _memo:
  158. _memo[code] = search(code)
  159. for const in code.co_consts:
  160. if isinstance(const, CodeType):
  161. search_recursively(search, const, _memo)
  162. return _memo
  163. def recursive_function_calls(code: CodeType) -> dict:
  164. """Scan a code object, recursing into function definitions and bodies of
  165. comprehension loops, for function calls on constant arguments."""
  166. return search_recursively(function_calls, code)
  167. def any_alias(full_name: str):
  168. """List possible aliases of a fully qualified Python name.
  169. >>> list(any_alias("foo.bar.wizz"))
  170. ['foo.bar.wizz', 'bar.wizz', 'wizz']
  171. This crudely allows us to capture uses of wizz() under any of
  172. ::
  173. import foo
  174. foo.bar.wizz()
  175. ::
  176. from foo import bar
  177. bar.wizz()
  178. ::
  179. from foo.bar import wizz
  180. wizz()
  181. However, it'll fail for any form of aliases and quite likely find false
  182. matches.
  183. """
  184. parts = full_name.split('.')
  185. while parts:
  186. yield ".".join(parts)
  187. parts = parts[1:]