peutils.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592
  1. # -*- coding: Latin-1 -*-
  2. """peutils, Portable Executable utilities module
  3. Copyright (c) 2005-2021 Ero Carrera <ero.carrera@gmail.com>
  4. All rights reserved.
  5. """
  6. from __future__ import division
  7. from future import standard_library
  8. standard_library.install_aliases()
  9. from builtins import range
  10. from builtins import object
  11. import os
  12. import re
  13. import string
  14. import urllib.request, urllib.parse, urllib.error
  15. import pefile
  16. __author__ = "Ero Carrera"
  17. __version__ = pefile.__version__
  18. __contact__ = "ero.carrera@gmail.com"
  19. class SignatureDatabase(object):
  20. """This class loads and keeps a parsed PEiD signature database.
  21. Usage:
  22. sig_db = SignatureDatabase('/path/to/signature/file')
  23. and/or
  24. sig_db = SignatureDatabase()
  25. sig_db.load('/path/to/signature/file')
  26. Signature databases can be combined by performing multiple loads.
  27. The filename parameter can be a URL too. In that case the
  28. signature database will be downloaded from that location.
  29. """
  30. def __init__(self, filename=None, data=None):
  31. # RegExp to match a signature block
  32. #
  33. self.parse_sig = re.compile(
  34. "\[(.*?)\]\s+?signature\s*=\s*(.*?)(\s+\?\?)*\s*ep_only\s*=\s*(\w+)(?:\s*section_start_only\s*=\s*(\w+)|)",
  35. re.S,
  36. )
  37. # Signature information
  38. #
  39. # Signatures are stored as trees using dictionaries
  40. # The keys are the byte values while the values for
  41. # each key are either:
  42. #
  43. # - Other dictionaries of the same form for further
  44. # bytes in the signature
  45. #
  46. # - A dictionary with a string as a key (packer name)
  47. # and None as value to indicate a full signature
  48. #
  49. self.signature_tree_eponly_true = dict()
  50. self.signature_count_eponly_true = 0
  51. self.signature_tree_eponly_false = dict()
  52. self.signature_count_eponly_false = 0
  53. self.signature_tree_section_start = dict()
  54. self.signature_count_section_start = 0
  55. # The depth (length) of the longest signature
  56. #
  57. self.max_depth = 0
  58. self.__load(filename=filename, data=data)
  59. def generate_section_signatures(self, pe, name, sig_length=512):
  60. """Generates signatures for all the sections in a PE file.
  61. If the section contains any data a signature will be created
  62. for it. The signature name will be a combination of the
  63. parameter 'name' and the section number and its name.
  64. """
  65. section_signatures = list()
  66. for idx, section in enumerate(pe.sections):
  67. if section.SizeOfRawData < sig_length:
  68. continue
  69. # offset = pe.get_offset_from_rva(section.VirtualAddress)
  70. offset = section.PointerToRawData
  71. sig_name = "%s Section(%d/%d,%s)" % (
  72. name,
  73. idx + 1,
  74. len(pe.sections),
  75. "".join([c for c in section.Name if c in string.printable]),
  76. )
  77. section_signatures.append(
  78. self.__generate_signature(
  79. pe,
  80. offset,
  81. sig_name,
  82. ep_only=False,
  83. section_start_only=True,
  84. sig_length=sig_length,
  85. )
  86. )
  87. return "\n".join(section_signatures) + "\n"
  88. def generate_ep_signature(self, pe, name, sig_length=512):
  89. """Generate signatures for the entry point of a PE file.
  90. Creates a signature whose name will be the parameter 'name'
  91. and the section number and its name.
  92. """
  93. offset = pe.get_offset_from_rva(pe.OPTIONAL_HEADER.AddressOfEntryPoint)
  94. return self.__generate_signature(
  95. pe, offset, name, ep_only=True, sig_length=sig_length
  96. )
  97. def __generate_signature(
  98. self, pe, offset, name, ep_only=False, section_start_only=False, sig_length=512
  99. ):
  100. data = pe.__data__[offset : offset + sig_length]
  101. signature_bytes = " ".join(["%02x" % ord(c) for c in data])
  102. if ep_only == True:
  103. ep_only = "true"
  104. else:
  105. ep_only = "false"
  106. if section_start_only == True:
  107. section_start_only = "true"
  108. else:
  109. section_start_only = "false"
  110. signature = "[%s]\nsignature = %s\nep_only = %s\nsection_start_only = %s\n" % (
  111. name,
  112. signature_bytes,
  113. ep_only,
  114. section_start_only,
  115. )
  116. return signature
  117. def match(self, pe, ep_only=True, section_start_only=False):
  118. """Matches and returns the exact match(es).
  119. If ep_only is True the result will be a string with
  120. the packer name. Otherwise it will be a list of the
  121. form (file_offset, packer_name) specifying where
  122. in the file the signature was found.
  123. """
  124. matches = self.__match(pe, ep_only, section_start_only)
  125. # The last match (the most precise) from the
  126. # list of matches (if any) is returned
  127. #
  128. if matches:
  129. if ep_only == False:
  130. # Get the most exact match for each list of matches
  131. # at a given offset
  132. #
  133. return [(match[0], match[1][-1]) for match in matches]
  134. return matches[1][-1]
  135. return None
  136. def match_all(self, pe, ep_only=True, section_start_only=False):
  137. """Matches and returns all the likely matches."""
  138. matches = self.__match(pe, ep_only, section_start_only)
  139. if matches:
  140. if ep_only == False:
  141. # Get the most exact match for each list of matches
  142. # at a given offset
  143. #
  144. return matches
  145. return matches[1]
  146. return None
  147. def __match(self, pe, ep_only, section_start_only):
  148. # Load the corresponding set of signatures
  149. # Either the one for ep_only equal to True or
  150. # to False
  151. #
  152. if section_start_only is True:
  153. # Fetch the data of the executable as it'd
  154. # look once loaded in memory
  155. #
  156. try:
  157. data = pe.__data__
  158. except Exception as excp:
  159. raise
  160. # Load the corresponding tree of signatures
  161. #
  162. signatures = self.signature_tree_section_start
  163. # Set the starting address to start scanning from
  164. #
  165. scan_addresses = [section.PointerToRawData for section in pe.sections]
  166. elif ep_only is True:
  167. # Fetch the data of the executable as it'd
  168. # look once loaded in memory
  169. #
  170. try:
  171. data = pe.get_memory_mapped_image()
  172. except Exception as excp:
  173. raise
  174. # Load the corresponding tree of signatures
  175. #
  176. signatures = self.signature_tree_eponly_true
  177. # Fetch the entry point of the PE file and the data
  178. # at the entry point
  179. #
  180. ep = pe.OPTIONAL_HEADER.AddressOfEntryPoint
  181. # Set the starting address to start scanning from
  182. #
  183. scan_addresses = [ep]
  184. else:
  185. data = pe.__data__
  186. signatures = self.signature_tree_eponly_false
  187. scan_addresses = range(len(data))
  188. # For each start address, check if any signature matches
  189. #
  190. matches = []
  191. for idx in scan_addresses:
  192. result = self.__match_signature_tree(
  193. signatures, data[idx : idx + self.max_depth]
  194. )
  195. if result:
  196. matches.append((idx, result))
  197. # Return only the matched items found at the entry point if
  198. # ep_only is True (matches will have only one element in that
  199. # case)
  200. #
  201. if ep_only is True:
  202. if matches:
  203. return matches[0]
  204. return matches
  205. def match_data(self, code_data, ep_only=True, section_start_only=False):
  206. data = code_data
  207. scan_addresses = [0]
  208. # Load the corresponding set of signatures
  209. # Either the one for ep_only equal to True or
  210. # to False
  211. #
  212. if section_start_only is True:
  213. # Load the corresponding tree of signatures
  214. #
  215. signatures = self.signature_tree_section_start
  216. # Set the starting address to start scanning from
  217. #
  218. elif ep_only is True:
  219. # Load the corresponding tree of signatures
  220. #
  221. signatures = self.signature_tree_eponly_true
  222. # For each start address, check if any signature matches
  223. #
  224. matches = []
  225. for idx in scan_addresses:
  226. result = self.__match_signature_tree(
  227. signatures, data[idx : idx + self.max_depth]
  228. )
  229. if result:
  230. matches.append((idx, result))
  231. # Return only the matched items found at the entry point if
  232. # ep_only is True (matches will have only one element in that
  233. # case)
  234. #
  235. if ep_only is True:
  236. if matches:
  237. return matches[0]
  238. return matches
  239. def __match_signature_tree(self, signature_tree, data, depth=0):
  240. """Recursive function to find matches along the signature tree.
  241. signature_tree is the part of the tree left to walk
  242. data is the data being checked against the signature tree
  243. depth keeps track of how far we have gone down the tree
  244. """
  245. matched_names = list()
  246. match = signature_tree
  247. # Walk the bytes in the data and match them
  248. # against the signature
  249. #
  250. for idx, byte in enumerate([b if isinstance(b, int) else ord(b) for b in data]):
  251. # If the tree is exhausted...
  252. #
  253. if match is None:
  254. break
  255. # Get the next byte in the tree
  256. #
  257. match_next = match.get(byte, None)
  258. # If None is among the values for the key
  259. # it means that a signature in the database
  260. # ends here and that there's an exact match.
  261. #
  262. if None in list(match.values()):
  263. # idx represent how deep we are in the tree
  264. #
  265. # names = [idx+depth]
  266. names = list()
  267. # For each of the item pairs we check
  268. # if it has an element other than None,
  269. # if not then we have an exact signature
  270. #
  271. for item in list(match.items()):
  272. if item[1] is None:
  273. names.append(item[0])
  274. matched_names.append(names)
  275. # If a wildcard is found keep scanning the signature
  276. # ignoring the byte.
  277. #
  278. if "??" in match:
  279. match_tree_alternate = match.get("??", None)
  280. data_remaining = data[idx + 1 :]
  281. if data_remaining:
  282. matched_names.extend(
  283. self.__match_signature_tree(
  284. match_tree_alternate, data_remaining, idx + depth + 1
  285. )
  286. )
  287. match = match_next
  288. # If we have any more packer name in the end of the signature tree
  289. # add them to the matches
  290. #
  291. if match is not None and None in list(match.values()):
  292. # names = [idx + depth + 1]
  293. names = list()
  294. for item in list(match.items()):
  295. if item[1] is None:
  296. names.append(item[0])
  297. matched_names.append(names)
  298. return matched_names
  299. def load(self, filename=None, data=None):
  300. """Load a PEiD signature file.
  301. Invoking this method on different files combines the signatures.
  302. """
  303. self.__load(filename=filename, data=data)
  304. def __load(self, filename=None, data=None):
  305. if filename is not None:
  306. # If the path does not exist, attempt to open a URL
  307. #
  308. if not os.path.exists(filename):
  309. try:
  310. sig_f = urllib.request.urlopen(filename)
  311. sig_data = sig_f.read()
  312. sig_f.close()
  313. except IOError:
  314. # Let this be raised back to the user...
  315. raise
  316. else:
  317. # Get the data for a file
  318. #
  319. try:
  320. sig_f = open(filename, "rt")
  321. sig_data = sig_f.read()
  322. sig_f.close()
  323. except IOError:
  324. # Let this be raised back to the user...
  325. raise
  326. else:
  327. sig_data = data
  328. # If the file/URL could not be read or no "raw" data
  329. # was provided there's nothing else to do
  330. #
  331. if not sig_data:
  332. return
  333. # Helper function to parse the signature bytes
  334. #
  335. def to_byte(value):
  336. if "?" in value:
  337. return value
  338. return int(value, 16)
  339. # Parse all the signatures in the file
  340. #
  341. matches = self.parse_sig.findall(sig_data)
  342. # For each signature, get the details and load it into the
  343. # signature tree
  344. #
  345. for (
  346. packer_name,
  347. signature,
  348. superfluous_wildcards,
  349. ep_only,
  350. section_start_only,
  351. ) in matches:
  352. ep_only = ep_only.strip().lower()
  353. signature = signature.replace("\\n", "").strip()
  354. signature_bytes = [to_byte(b) for b in signature.split()]
  355. if ep_only == "true":
  356. ep_only = True
  357. else:
  358. ep_only = False
  359. if section_start_only == "true":
  360. section_start_only = True
  361. else:
  362. section_start_only = False
  363. depth = 0
  364. if section_start_only is True:
  365. tree = self.signature_tree_section_start
  366. self.signature_count_section_start += 1
  367. else:
  368. if ep_only is True:
  369. tree = self.signature_tree_eponly_true
  370. self.signature_count_eponly_true += 1
  371. else:
  372. tree = self.signature_tree_eponly_false
  373. self.signature_count_eponly_false += 1
  374. for idx, byte in enumerate(signature_bytes):
  375. if idx + 1 == len(signature_bytes):
  376. tree[byte] = tree.get(byte, dict())
  377. tree[byte][packer_name] = None
  378. else:
  379. tree[byte] = tree.get(byte, dict())
  380. tree = tree[byte]
  381. depth += 1
  382. if depth > self.max_depth:
  383. self.max_depth = depth
  384. def is_valid(pe):
  385. """"""
  386. pass
  387. def is_suspicious(pe):
  388. """
  389. unusual locations of import tables
  390. non recognized section names
  391. presence of long ASCII strings
  392. """
  393. relocations_overlap_entry_point = False
  394. sequential_relocs = 0
  395. # If relocation data is found and the entries go over the entry point, and also are very
  396. # continuous or point outside section's boundaries => it might imply that an obfuscation
  397. # trick is being used or the relocations are corrupt (maybe intentionally)
  398. #
  399. if hasattr(pe, "DIRECTORY_ENTRY_BASERELOC"):
  400. for base_reloc in pe.DIRECTORY_ENTRY_BASERELOC:
  401. last_reloc_rva = None
  402. for reloc in base_reloc.entries:
  403. if reloc.rva <= pe.OPTIONAL_HEADER.AddressOfEntryPoint <= reloc.rva + 4:
  404. relocations_overlap_entry_point = True
  405. if (
  406. last_reloc_rva is not None
  407. and last_reloc_rva <= reloc.rva <= last_reloc_rva + 4
  408. ):
  409. sequential_relocs += 1
  410. last_reloc_rva = reloc.rva
  411. # If import tables or strings exist (are pointed to) to within the header or in the area
  412. # between the PE header and the first section that's suspicious
  413. #
  414. # IMPLEMENT
  415. warnings_while_parsing = False
  416. # If we have warnings, that's suspicious, some of those will be because of out-of-ordinary
  417. # values are found in the PE header fields
  418. # Things that are reported in warnings:
  419. # (parsing problems, special section characteristics i.e. W & X, uncommon values of fields,
  420. # unusual entrypoint, suspicious imports)
  421. #
  422. warnings = pe.get_warnings()
  423. if warnings:
  424. warnings_while_parsing
  425. # If there are few or none (should come with a standard "density" of strings/kilobytes of data) longer (>8)
  426. # ascii sequences that might indicate packed data, (this is similar to the entropy test in some ways but
  427. # might help to discard cases of legitimate installer or compressed data)
  428. # If compressed data (high entropy) and is_driver => uuuuhhh, nasty
  429. pass
  430. def is_probably_packed(pe):
  431. """Returns True is there is a high likelihood that a file is packed or contains compressed data.
  432. The sections of the PE file will be analyzed, if enough sections
  433. look like containing compressed data and the data makes
  434. up for more than 20% of the total file size, the function will
  435. return True.
  436. """
  437. # Calculate the length of the data up to the end of the last section in the
  438. # file. Overlay data won't be taken into account
  439. #
  440. total_pe_data_length = len(pe.trim())
  441. # Assume that the file is packed when no data is available
  442. if not total_pe_data_length:
  443. return True
  444. has_significant_amount_of_compressed_data = False
  445. # If some of the sections have high entropy and they make for more than 20% of the file's size
  446. # it's assumed that it could be an installer or a packed file
  447. total_compressed_data = 0
  448. for section in pe.sections:
  449. s_entropy = section.get_entropy()
  450. s_length = len(section.get_data())
  451. # The value of 7.4 is empirical, based on looking at a few files packed
  452. # by different packers
  453. if s_entropy > 7.4:
  454. total_compressed_data += s_length
  455. if ((1.0 * total_compressed_data) / total_pe_data_length) > 0.2:
  456. has_significant_amount_of_compressed_data = True
  457. return has_significant_amount_of_compressed_data