writers.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628
  1. #-----------------------------------------------------------------------------
  2. # Copyright (c) 2005-2021, PyInstaller Development Team.
  3. #
  4. # Distributed under the terms of the GNU General Public License (version 2
  5. # or later) with exception for distributing the bootloader.
  6. #
  7. # The full license is in the file COPYING.txt, distributed with this software.
  8. #
  9. # SPDX-License-Identifier: (GPL-2.0-or-later WITH Bootloader-exception)
  10. #-----------------------------------------------------------------------------
  11. """
  12. Utilities to create data structures for embedding Python modules and additional
  13. files into the executable.
  14. """
  15. # While an Archive is really an abstraction for any "filesystem
  16. # within a file", it is tuned for use with imputil.FuncImporter.
  17. # This assumes it contains python code objects, indexed by
  18. # the internal name (i.e., no '.py').
  19. #
  20. # See pyi_carchive.py for a more general archive (contains anything)
  21. # that can be understood by a C program.
  22. import os
  23. import sys
  24. import struct
  25. from types import CodeType
  26. import marshal
  27. import zlib
  28. import io
  29. from PyInstaller.building.utils import get_code_object, strip_paths_in_code,\
  30. fake_pyc_timestamp
  31. from PyInstaller.loader.pyimod02_archive import PYZ_TYPE_MODULE, PYZ_TYPE_PKG, \
  32. PYZ_TYPE_DATA, PYZ_TYPE_NSPKG
  33. from PyInstaller.compat import BYTECODE_MAGIC, is_py37, is_win
  class ArchiveWriter(object):
      """
      Base class for writers that serialize an archive of python code to disk.

      The file produced by this class consists of a header of HDRLEN bytes,
      the raw entries, and finally the marshalled table of contents (TOC).
      The header is reserved as zero bytes first and filled in at the very
      end, once the position of the TOC is known.

      Subclasses customize the format by overriding `add`, `save_trailer`
      and `update_headers`.
      """
      MAGIC = b'PYL\0'
      # Header layout: MAGIC, followed by python's bytecode magic, followed by
      # the int position of the TOC (see update_headers).
      HDRLEN = 12
      # NOTE(review): presumably the offset of the TOC position field inside
      # the header; it is not referenced by the code in this class.
      TOCPOS = 8

      def __init__(self, archive_path, logical_toc):
          """
          Create an archive file of name 'archive_path'.

          logical_toc is a 'logical TOC' - a list of (name, path, ...)
          where name is the internal name, eg 'a'
          and path is a file to get the object from, eg './a.pyc'.

          The archive is completely written and closed by the time the
          constructor returns. If anything fails on the way, the output file
          is closed before the exception propagates.
          """
          self.start = 0

          self._start_add_entries(archive_path)
          try:
              self._add_from_table_of_contents(logical_toc)
              self._finalize()
          finally:
              # Closing an already closed file is a no-op, so this only
              # matters when one of the steps above raised.
              self.lib.close()

      def _start_add_entries(self, archive_path):
          """
          Open an empty archive for addition of entries.
          """
          self.lib = open(archive_path, 'wb')
          # Reserve space for the header; it is overwritten by
          # update_headers() once the TOC position is known.
          if self.HDRLEN:
              self.lib.write(b'\0' * self.HDRLEN)
          # Create an empty table of contents.
          # Use a list to support reproducible builds
          self.toc = []

      def _add_from_table_of_contents(self, toc):
          """
          Add entries from a logical TOC (without absolute positioning info).

          Every element of `toc` is passed unchanged to `add`; what an
          entry must look like is therefore defined by the `add`
          implementation of the concrete class.
          """
          for toc_entry in toc:
              self.add(toc_entry)  # The guts of the archive.

      def _finalize(self):
          """
          Finalize an archive which has been opened using _start_add_entries(),
          writing the table of contents and the header.

          The output file is closed even if writing the trailer or the
          header raises.
          """
          try:
              toc_pos = self.lib.tell()
              self.save_trailer(toc_pos)
              if self.HDRLEN:
                  self.update_headers(toc_pos)
          finally:
              self.lib.close()

      ####### manages keeping the internal TOC and the guts in sync #######
      def add(self, entry):
          """
          Append one compiled module to the archive.

          Override this to influence the mechanics of the Archive.
          Assumes entry is a seq beginning with (nm, pth, ...) where
          nm is the key by which we'll be asked for the object.
          pth is the name of where we find the object; it must end in
          '.pyc' or '.pyo'.

          The TOC records (nm, (ispkg, position)), where ispkg is True when
          the file is named '__init__'.
          """
          nm = entry[0]
          pth = entry[1]
          pynm, ext = os.path.splitext(os.path.basename(pth))
          ispkg = pynm == '__init__'
          assert ext in ('.pyc', '.pyo')
          self.toc.append((nm, (ispkg, self.lib.tell())))
          with open(pth, 'rb') as f:
              f.seek(8)  # skip magic and timestamp
              self.lib.write(f.read())

      def save_trailer(self, tocpos):
          """
          Write the marshalled TOC to self.lib.

          `tocpos` is not used by this default implementation; it is part
          of the signature for overriding classes.

          Raises ValueError if the TOC cannot be marshalled. In that case
          the offending entries are printed first, because the exception
          raised by marshal gives no detail about which object is at fault.
          """
          try:
              self.lib.write(marshal.dumps(self.toc))
          except ValueError as exception:
              if str(exception) == 'unmarshallable object':
                  self._report_unmarshallable_entries()
              raise

      def _report_unmarshallable_entries(self):
          """
          Print every TOC name / value / tuple item whose type cannot be marshalled.
          """
          # marshal only handles the exact builtin types, hence the
          # comparison on type() instead of isinstance().
          marshallable_types = {
              bool, int, float, complex, str, bytes, bytearray, tuple,
              list, set, frozenset, dict, CodeType, type(None)
          }
          # The TOC is a list of (name, value) pairs; a mapping is accepted
          # as well for subclasses that still keep a dict.
          if isinstance(self.toc, dict):
              entries = self.toc.items()
          else:
              entries = self.toc

          for module_name, module_tuple in entries:
              if type(module_name) not in marshallable_types:
                  print('Module name "%s" (%s) unmarshallable.' % (module_name, type(module_name)))
              if type(module_tuple) not in marshallable_types:
                  print('Module "%s" tuple "%s" (%s) unmarshallable.' % (module_name, module_tuple, type(module_tuple)))
              elif type(module_tuple) is tuple:
                  for i, item in enumerate(module_tuple):
                      if type(item) not in marshallable_types:
                          print('Module "%s" tuple index %s item "%s" (%s) unmarshallable.' % (module_name, i, item, type(item)))

      def update_headers(self, tocpos):
          """
          Fill in the reserved header: MAGIC + Python's magic + tocpos.

          tocpos is stored as a signed 32-bit integer in network byte order.
          """
          self.lib.seek(self.start)
          self.lib.write(self.MAGIC)
          self.lib.write(BYTECODE_MAGIC)
          self.lib.write(struct.pack('!i', tocpos))
  class ZlibArchiveWriter(ArchiveWriter):
      """
      ZlibArchive - an archive with compressed entries. Archive is read
      from the executable created by PyInstaller.

      This archive is used for bundling python modules inside the executable.
      Every entry is compressed with zlib on its own and, when a cipher is
      given, encrypted afterwards.
      """
      MAGIC = b'PYZ\0'
      TOCPOS = 8
      # Five bytes are reserved on top of the base header; update_headers()
      # writes one of them (the "encrypted" flag).
      HDRLEN = ArchiveWriter.HDRLEN + 5
      COMPRESSION_LEVEL = 6  # Default level of the 'zlib' module from Python.

      def __init__(self, archive_path, logical_toc, code_dict=None, cipher=None):
          """
          Write the archive `archive_path` from the entries of `logical_toc`.

          code_dict   dict containing module code objects from ModuleGraph,
                      keyed by module name.
          cipher      optional object with an `encrypt(bytes)` method which
                      is applied to every compressed entry.
          """
          # The code objects are taken straight from this mapping, so no
          # .pyc/.pyo files have to be written to disk.
          self.code_dict = code_dict if code_dict else {}
          self.cipher = cipher if cipher else None

          super().__init__(archive_path, logical_toc)

      def add(self, entry):
          """
          Compress (and optionally encrypt) one entry and append it.

          entry is a (name, path, typecode) triple. For the typecode
          'PYMODULE' the marshalled code object of `name` is stored,
          anything else is treated as a data file read from `path`.
          The TOC records (name, (type, position, stored length)).
          """
          name, path, typ = entry

          if typ != 'PYMODULE':
              # Any data files, that might be required by pkg_resources.
              typ = PYZ_TYPE_DATA
              with open(path, 'rb') as data_file:
                  data = data_file.read()
          else:
              if path in ('-', None):
                  # modulegraph marks namespace packages with the filename
                  # '-' (None is accepted as well to be forward-compatible).
                  typ = PYZ_TYPE_NSPKG
              elif os.path.splitext(os.path.basename(path))[0] == '__init__':
                  typ = PYZ_TYPE_PKG
              else:
                  typ = PYZ_TYPE_MODULE
              data = marshal.dumps(self.code_dict[name])

          # First compress then encrypt.
          payload = zlib.compress(data, self.COMPRESSION_LEVEL)
          if self.cipher:
              payload = self.cipher.encrypt(payload)

          position = self.lib.tell()
          self.toc.append((name, (typ, position, len(payload))))
          self.lib.write(payload)

      def update_headers(self, tocpos):
          """
          Write the base header followed by one byte telling whether the
          entries are encrypted.
          """
          super().update_headers(tocpos)
          encrypted_flag = struct.pack('!B', self.cipher is not None)
          self.lib.write(encrypted_flag)
  class CTOC(object):
      """
      A class encapsulating the table of contents of a CArchive.

      When written to disk, it is easily read from C.
      """
      # (structlen, dpos, dlen, ulen, flag, typcd) followed by name
      ENTRYSTRUCT = '!iIIIBB'
      ENTRYLEN = struct.calcsize(ENTRYSTRUCT)

      def __init__(self):
          # List of (dpos, dlen, ulen, flag, typcd, nm) tuples, in the order
          # in which they were added.
          self.data = []

      def tobinary(self):
          """
          Return self as a binary string.

          Each entry is the fixed-size ENTRYSTRUCT part followed by the
          UTF-8 encoded, null-terminated name. The name field is filled up
          with null bytes so that the whole entry is a multiple of 16 bytes.
          """
          chunks = []
          for dpos, dlen, ulen, flag, typcd, nm in self.data:
              # Names are stored as UTF-8. Standard python modules (and
              # standard shared libraries) only use ascii characters, which
              # the C code can handle as is.
              name_bytes = nm.encode('utf-8')
              # One extra byte for the terminating '\0'.
              field_len = len(name_bytes) + 1
              # Align to a 16 byte boundary so cross-platform C can read it.
              remainder = (field_len + self.ENTRYLEN) % 16
              if remainder:
                  field_len += 16 - remainder
              # The 's' format fills the unused rest of the field with null
              # bytes, which provides both the terminator and the padding.
              entry_format = '%s%is' % (self.ENTRYSTRUCT, field_len)
              chunks.append(struct.pack(entry_format,
                                        field_len + self.ENTRYLEN,
                                        dpos, dlen, ulen, flag,
                                        ord(typcd), name_bytes))
          return b''.join(chunks)

      def add(self, dpos, dlen, ulen, flag, typcd, nm):
          """
          Add an entry to the table of contents.

          DPOS is data position.
          DLEN is data length.
          ULEN is the uncompressed data len.
          FLAG says if the data is compressed.
          TYPCD is the "type" of the entry (used by the C code)
          NM is the entry's name.

          This function is used only while creating an executable.
          """
          # On Windows the bootloader works only with back slashes, and
          # normpath converts forward slashes in paths accordingly.
          normalized = os.path.normpath(nm)
          if is_win and os.path.sep == '/':
              # When building under MSYS, the above path normalization
              # uses Unix-style separators, so replace them manually.
              normalized = normalized.replace(os.path.sep, '\\')
          self.data.append((dpos, dlen, ulen, flag, typcd, normalized))
  class CArchiveWriter(ArchiveWriter):
      """
      An Archive subclass that can hold arbitrary data.

      This class encapsulates all files that are bundled within an executable.
      It can contain ZlibArchive (Python .pyc files), dlls, Python C extensions
      and all other data files that are bundled in --onefile mode.

      Easily handled from C or from Python.
      """
      # MAGIC is useful to verify that conversion of Python data types
      # to C structure and back works properly.
      MAGIC = b'MEI\014\013\012\013\016'
      # No header is reserved at the start of the file; the archive is
      # described by the cookie written at its end (see save_trailer).
      HDRLEN = 0
      # zlib compression level used for compressed entries.
      LEVEL = 9

      # Cookie - holds some information for the bootloader. C struct format
      # definition. '!' at the beginning means network byte order.
      # C struct looks like:
      #
      #   typedef struct _cookie {
      #       char magic[8];      /* 'MEI\014\013\012\013\016' */
      #       uint32_t len;       /* len of entire package */
      #       uint32_t TOC;       /* pos (rel to start) of TableOfContents */
      #       int  TOClen;        /* length of TableOfContents */
      #       int  pyvers;        /* new in v4 */
      #       char pylibname[64]; /* Filename of Python dynamic library. */
      #   } COOKIE;
      #
      _cookie_format = '!8sIIii64s'
      _cookie_size = struct.calcsize(_cookie_format)

      def __init__(self, archive_path, logical_toc, pylib_name):
          """
          Constructor.

          archive_path  path name of the archive file to create.
          logical_toc   list of entries; see `add` for their layout.
          pylib_name    name of Python DLL which bootloader will use.
          """
          self._pylib_name = pylib_name

          # A CArchive created from scratch starts at 0, no leading bootloader.
          super(CArchiveWriter, self).__init__(archive_path, logical_toc)

      def _start_add_entries(self, path):
          """
          Open an empty archive for addition of entries.
          """
          super(CArchiveWriter, self)._start_add_entries(path)
          # Replace the list created by the parent class with a CTOC.
          self.toc = CTOC()

      def add(self, entry):
          """
          Add an ENTRY to the CArchive.

          ENTRY must have:
            entry[0] is name (under which it will be saved).
            entry[1] is fullpathname of the file.
            entry[2] is a flag for it's storage format (0==uncompressed,
            1==compressed)
            entry[3] is the entry's type code.

          Type codes handled specially:
            'o', 'd'  nothing is read or written; only a TOC record is added.
                      For 'o', entry[0] is the runtime option
                      eg: v  (meaning verbose imports)
                          u  (meaning unbuffered)
                          W arg (warning option arg)
                          s  (meaning do site.py processing.
            's'       source file; it is compiled and stored as a marshalled
                      code object.
            'm'       module; if the file starts with the bytecode magic its
                      code object is rewritten with paths stripped. The entry
                      is recorded as 'M' when the path contains
                      '.__init__.py'.
          Everything else is copied from the file as is.

          Raises IOError (after printing the entry) if the file cannot be
          opened or read.
          """
          (nm, pathnm, flag, typcd) = entry[:4]
          # FIXME Could we make the version 5 the default one?
          # Version 5 - allow type 'o' = runtime option.
          code_data = None
          fh = None
          try:
              try:
                  if typcd in ('o', 'd'):
                      ulen = 0
                      flag = 0
                  elif typcd == 's':
                      # If it's a source code file, compile it to a code
                      # object and marshall the object so it can be
                      # unmarshalled by the bootloader.
                      code = get_code_object(nm, pathnm)
                      code = strip_paths_in_code(code)

                      code_data = marshal.dumps(code)
                      ulen = len(code_data)
                  elif typcd == 'm':
                      fh = open(pathnm, 'rb')
                      ulen = os.fstat(fh.fileno()).st_size
                      # Check if it is a PYC file
                      header = fh.read(4)
                      fh.seek(0)
                      if header == BYTECODE_MAGIC:
                          # Read whole header and load code.
                          # According to PEP-552, in python versions prior
                          # to 3.7, the PYC header consists of three 32-bit
                          # words (magic, timestamp, and source file size).
                          # From python 3.7 on, the PYC header was extended
                          # to four 32-bit words (magic, flags, and,
                          # depending on the flags, either timestamp and
                          # source file size, or a 64-bit hash).
                          header = fh.read(16 if is_py37 else 12)
                          code = marshal.load(fh)
                          # Strip paths from code, marshal back into module
                          # form. The header fields (timestamp, size, hash,
                          # etc.) are all referring to the source file, so
                          # our modification of the code object does not
                          # affect them, and we can re-use the original
                          # header.
                          code = strip_paths_in_code(code)
                          data = header + marshal.dumps(code)
                          # The file on disk is not needed any more; close
                          # it before the name is rebound, otherwise the
                          # handle would stay open.
                          fh.close()
                          # Create file-like object for timestamp re-write
                          # in the subsequent steps
                          fh = io.BytesIO(data)
                          ulen = len(data)
                  else:
                      fh = open(pathnm, 'rb')
                      ulen = os.fstat(fh.fileno()).st_size
              except IOError:
                  print("Cannot find ('%s', '%s', %s, '%s')" % (nm, pathnm, flag, typcd))
                  raise

              where = self.lib.tell()
              assert flag in range(3)
              if not fh and not code_data:
                  # no need to write anything
                  pass
              elif flag == 1:
                  comprobj = zlib.compressobj(self.LEVEL)
                  if code_data is not None:
                      self.lib.write(comprobj.compress(code_data))
                  else:
                      assert fh
                      # Only the first chunk (which holds the pyc header) is
                      # passed through fake_pyc_timestamp, and only for the
                      # type codes of python code.
                      modify_header = typcd in ('M', 'm', 's')
                      while True:
                          buf = fh.read(16 * 1024)
                          if not buf:
                              break
                          if modify_header:
                              modify_header = False
                              buf = fake_pyc_timestamp(buf)
                          self.lib.write(comprobj.compress(buf))
                  self.lib.write(comprobj.flush())
              else:
                  # Any other accepted flag value: store the data as is.
                  if code_data is not None:
                      self.lib.write(code_data)
                  else:
                      assert fh
                      while True:
                          buf = fh.read(16 * 1024)
                          if not buf:
                              break
                          self.lib.write(buf)

              dlen = self.lib.tell() - where
          finally:
              # Release the input even if compiling, reading or writing
              # failed half way through.
              if fh is not None:
                  fh.close()

          if typcd == 'm' and pathnm.find('.__init__.py') > -1:
              typcd = 'M'

          # Record the entry in the CTOC
          self.toc.add(where, dlen, ulen, flag, typcd, nm)

      def save_trailer(self, tocpos):
          """
          Save the table of contents and the cookie for the bootloader to
          disk.

          CArchives can be opened from the end - the cookie points
          back to the start.
          """
          tocstr = self.toc.tobinary()
          self.lib.write(tocstr)
          toclen = len(tocstr)

          # now save the cookie
          total_len = tocpos + toclen + self._cookie_size
          # NOTE(review): major * 10 + minor is ambiguous once minor reaches
          # 10 (3.10 gives 40). Left unchanged because whatever reads the
          # cookie has to use the same encoding - confirm against the
          # bootloader.
          pyvers = sys.version_info[0] * 10 + sys.version_info[1]
          # Before saving cookie we need to convert it to corresponding
          # C representation.
          cookie = struct.pack(self._cookie_format, self.MAGIC, total_len,
                               tocpos, toclen, pyvers,
                               self._pylib_name.encode('ascii'))
          self.lib.write(cookie)
  class SplashWriter(ArchiveWriter):
      """
      This ArchiveWriter bundles the data for the splash screen resources

      Splash screen resources will be added as an entry into the CArchive
      with the typecode ARCHIVE_ITEM_SPLASH. This writer creates the bundled
      information in the archive.

      The file written is: header, requirement names, script, image.
      """
      # This struct describes the splash resources as it will be in an
      # buffer inside the bootloader. All necessary parts are bundled, the
      # *_len and *_offset fields describe the data beyond this header
      # definition.
      # Whereas script and image fields are binary data, the requirements
      # fields describe an array of strings. Each string is null-terminated
      # in order to easily iterate over this list from within C.
      #
      #   typedef struct _splash_data_header {
      #       char tcl_libname[16];  /* Name of tcl library, e.g. tcl86t.dll */
      #       char tk_libname[16];   /* Name of tk library, e.g. tk86t.dll */
      #       char tk_lib[16];       /* Tk Library generic, e.g. "tk/" */
      #       char rundir[16];       /* temp folder inside extraction path in
      #                               * which the dependencies are extracted */
      #
      #       int script_len;        /* Length of the script */
      #       int script_offset;     /* Offset (rel to start) of the script */
      #
      #       int image_len;         /* Length of the image data */
      #       int image_offset;      /* Offset (rel to start) of the image */
      #
      #       int requirements_len;
      #       int requirements_offset;
      #
      #   } SPLASH_DATA_HEADER;
      #
      _header_format = '!16s 16s 16s 16s ii ii ii'
      HDRLEN = struct.calcsize(_header_format)

      # The created resource will be compressed by the CArchive,
      # so no need to compress the data here

      def __init__(self, archive_path, name_list,
                   tcl_libname, tk_libname, tklib, rundir,
                   image, script):
          """
          Custom writer for splash screen resources which will be bundled
          into the CArchive as an entry.

          :param archive_path: The filename of the archive to create
          :param name_list: List of filenames for the requirements array
          :param str tcl_libname: Name of the tcl shared library file
          :param str tk_libname: Name of the tk shared library file
          :param str tklib: Root of tk library (e.g. tk/)
          :param str rundir: Unique path to extract requirements to
          :param Union[str, bytes] image: Image data as bytes, or the path
              of an image file
          :param str script: The tcl/tk script to execute to create the screen.
          """
          self._tcl_libname = tcl_libname
          self._tk_libname = tk_libname
          self._tklib = tklib
          self._rundir = rundir

          self._image = image
          self._image_len = 0
          self._image_offset = 0

          self._script = script
          self._script_len = 0
          self._script_offset = 0

          self._requirements_len = 0
          self._requirements_offset = 0

          super(SplashWriter, self).__init__(archive_path, name_list)

      def add(self, name):
          """
          This methods adds a name to the requirement list in the splash
          data. This list (more an array) contains the names of all files
          the bootloader needs to extract before the splash screen can be
          started. The implementation terminates every name with a null-byte,
          that keeps the list short memory wise and makes it iterable from C.
          """
          name = name.encode('utf-8')
          self.lib.write(name + b'\0')
          self._requirements_len += len(name) + 1  # zero byte at the end

      def update_headers(self, tocpos):
          """ Writes the header with the final lengths and offsets.

          This function is called after self.save_trailer(), i.e. once
          all lengths and offsets are known.

          :param tocpos: unused by this writer
          :return: None
          """
          self.lib.seek(self.start)
          # The '16s' fields have a fixed width: struct fills shorter values
          # with null bytes and cuts longer ones off.
          self.lib.write(struct.pack(self._header_format,
                                     self._tcl_libname.encode("utf-8"),
                                     self._tk_libname.encode("utf-8"),
                                     self._tklib.encode("utf-8"),
                                     self._rundir.encode("utf-8"),
                                     self._script_len,
                                     self._script_offset,
                                     self._image_len,
                                     self._image_offset,
                                     self._requirements_len,
                                     self._requirements_offset))

      def save_trailer(self, script_pos):
          """ Adds the script and the image, recording their offsets.

          :param script_pos: position in the file at which the script
              starts; the requirement names end right before it.
          """
          self._requirements_offset = script_pos - self._requirements_len

          self._script_offset = script_pos
          self.save_script()

          self._image_offset = self.lib.tell()
          self.save_image()

      def save_script(self):
          """ Add the tcl/tk script into the archive.

          The script is stored UTF-8 encoded and its recorded length is the
          number of bytes written, not the number of characters.
          """
          script_bytes = self._script.encode("utf-8")
          self._script_len = len(script_bytes)
          self.lib.write(script_bytes)

      def save_image(self):
          """Copy the image into the archive.

          If self._image are bytes the buffer will be written directly into
          the archive, otherwise it is assumed to be a path and the content
          of that file will be written into it. In both cases the image is
          written exactly once and its length is recorded.
          """
          if isinstance(self._image, bytes):
              # image was converted by PIL/Pillow
              buf = self._image
          else:
              # Read image from the given path
              with open(self._image, 'rb') as image_file:
                  buf = image_file.read()

          self._image_len = len(buf)
          self.lib.write(buf)