From 005e6e0ba2b09952399dadeb1cdcc6558a45d077 Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Mon, 4 May 2020 14:20:29 +0200 Subject: [PATCH 01/13] bpo-40495: compileall option to hardlink duplicate pyc files Hardlink deduplication enables to prevent duplicates via hardlinks in cases when bytecode cache files are the same for multiple optimization levels. --- Doc/library/compileall.rst | 21 +- Lib/compileall.py | 38 +- Lib/test/test_compileall.py | 446 ++++++++++++++++++ Misc/ACKS | 1 + .../2020-05-04-11-20-49.bpo-40495.TyTc2O.rst | 2 + 5 files changed, 496 insertions(+), 12 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2020-05-04-11-20-49.bpo-40495.TyTc2O.rst diff --git a/Doc/library/compileall.rst b/Doc/library/compileall.rst index b1ae9d60e8ae14..337f75acd3b9af 100644 --- a/Doc/library/compileall.rst +++ b/Doc/library/compileall.rst @@ -113,6 +113,11 @@ compile Python sources. Ignore symlinks pointing outside the given directory. +.. cmdoption:: --hardlink-dupes + + Use hardlinks to prevent duplicates if ``.pyc`` files for multiple + optimization levels have the same content. + .. versionchanged:: 3.2 Added the ``-i``, ``-b`` and ``-h`` options. @@ -125,7 +130,7 @@ compile Python sources. Added the ``--invalidation-mode`` option. .. versionchanged:: 3.9 - Added the ``-s``, ``-p``, ``-e`` options. + Added the ``-s``, ``-p``, ``-e`` and ``--hardlink-dupes`` options. Raised the default recursion limit from 10 to :py:func:`sys.getrecursionlimit()`. Added the possibility to specify the ``-o`` option multiple times. @@ -143,7 +148,7 @@ runtime. Public functions ---------------- -.. function:: compile_dir(dir, maxlevels=sys.getrecursionlimit(), ddir=None, force=False, rx=None, quiet=0, legacy=False, optimize=-1, workers=1, invalidation_mode=None, \*, stripdir=None, prependdir=None, limit_sl_dest=None) +.. 
function:: compile_dir(dir, maxlevels=sys.getrecursionlimit(), ddir=None, force=False, rx=None, quiet=0, legacy=False, optimize=-1, workers=1, invalidation_mode=None, \*, stripdir=None, prependdir=None, limit_sl_dest=None, hardlink_dupes=False) Recursively descend the directory tree named by *dir*, compiling all :file:`.py` files along the way. Return a true value if all the files compiled successfully, @@ -193,6 +198,9 @@ Public functions the ``-s``, ``-p`` and ``-e`` options described above. They may be specified as ``str``, ``bytes`` or :py:class:`os.PathLike`. + If *hardlink_dupes* is ``True``, hardlinks are used to prevent duplicates + if ``.pyc`` files for multiple optimization levels have the same content. + .. versionchanged:: 3.2 Added the *legacy* and *optimize* parameter. @@ -219,9 +227,9 @@ Public functions Setting *workers* to 0 now chooses the optimal number of cores. .. versionchanged:: 3.9 - Added *stripdir*, *prependdir* and *limit_sl_dest* arguments. + Added *stripdir*, *prependdir*, *limit_sl_dest* and *hardlink_dupes* arguments. -.. function:: compile_file(fullname, ddir=None, force=False, rx=None, quiet=0, legacy=False, optimize=-1, invalidation_mode=None, \*, stripdir=None, prependdir=None, limit_sl_dest=None) +.. function:: compile_file(fullname, ddir=None, force=False, rx=None, quiet=0, legacy=False, optimize=-1, invalidation_mode=None, \*, stripdir=None, prependdir=None, limit_sl_dest=None, hardlink_dupes=False) Compile the file with path *fullname*. Return a true value if the file compiled successfully, and a false value otherwise. @@ -257,6 +265,9 @@ Public functions the ``-s``, ``-p`` and ``-e`` options described above. They may be specified as ``str``, ``bytes`` or :py:class:`os.PathLike`. + If *hardlink_dupes* is ``True``, hardlinks are used to prevent duplicates + if ``.pyc`` files for multiple optimization levels have the same content. + .. versionadded:: 3.2 .. 
versionchanged:: 3.5 @@ -273,7 +284,7 @@ Public functions The *invalidation_mode* parameter's default value is updated to None. .. versionchanged:: 3.9 - Added *stripdir*, *prependdir* and *limit_sl_dest* arguments. + Added *stripdir*, *prependdir*, *limit_sl_dest* and *hardlink_dupes* arguments. .. function:: compile_path(skip_curdir=True, maxlevels=0, force=False, quiet=0, legacy=False, optimize=-1, invalidation_mode=None) diff --git a/Lib/compileall.py b/Lib/compileall.py index abe6cffce59c5f..5984058bdc9372 100644 --- a/Lib/compileall.py +++ b/Lib/compileall.py @@ -15,6 +15,7 @@ import importlib.util import py_compile import struct +import filecmp from functools import partial from pathlib import Path @@ -47,7 +48,7 @@ def _walk_dir(dir, maxlevels, quiet=0): def compile_dir(dir, maxlevels=None, ddir=None, force=False, rx=None, quiet=0, legacy=False, optimize=-1, workers=1, invalidation_mode=None, *, stripdir=None, - prependdir=None, limit_sl_dest=None): + prependdir=None, limit_sl_dest=None, hardlink_dupes=False): """Byte-compile all modules in the given directory tree. 
Arguments (only dir is required): @@ -70,6 +71,7 @@ def compile_dir(dir, maxlevels=None, ddir=None, force=False, after stripdir limit_sl_dest: ignore symlinks if they are pointing outside of the defined path + hardlink_dupes: hardlink duplicated pyc files """ ProcessPoolExecutor = None if ddir is not None and (stripdir is not None or prependdir is not None): @@ -104,7 +106,8 @@ def compile_dir(dir, maxlevels=None, ddir=None, force=False, invalidation_mode=invalidation_mode, stripdir=stripdir, prependdir=prependdir, - limit_sl_dest=limit_sl_dest), + limit_sl_dest=limit_sl_dest, + hardlink_dupes=hardlink_dupes), files) success = min(results, default=True) else: @@ -112,14 +115,15 @@ def compile_dir(dir, maxlevels=None, ddir=None, force=False, if not compile_file(file, ddir, force, rx, quiet, legacy, optimize, invalidation_mode, stripdir=stripdir, prependdir=prependdir, - limit_sl_dest=limit_sl_dest): + limit_sl_dest=limit_sl_dest, + hardlink_dupes=hardlink_dupes): success = False return success def compile_file(fullname, ddir=None, force=False, rx=None, quiet=0, legacy=False, optimize=-1, invalidation_mode=None, *, stripdir=None, prependdir=None, - limit_sl_dest=None): + limit_sl_dest=None, hardlink_dupes=False): """Byte-compile one file. Arguments (only fullname is required): @@ -140,6 +144,7 @@ def compile_file(fullname, ddir=None, force=False, rx=None, quiet=0, after stripdir limit_sl_dest: ignore symlinks if they are pointing outside of the defined path. 
+ hardlink_dupes: hardlink duplicated pyc files """ if ddir is not None and (stripdir is not None or prependdir is not None): @@ -176,6 +181,10 @@ def compile_file(fullname, ddir=None, force=False, rx=None, quiet=0, if isinstance(optimize, int): optimize = [optimize] + if hardlink_dupes: + raise ValueError(("Hardlinking of duplicated bytecode makes sense " + "only for more than one optimization level.")) + if rx is not None: mo = rx.search(fullname) if mo: @@ -220,10 +229,16 @@ def compile_file(fullname, ddir=None, force=False, rx=None, quiet=0, if not quiet: print('Compiling {!r}...'.format(fullname)) try: - for opt_level, cfile in opt_cfiles.items(): + for index, opt_level in enumerate(sorted(optimize)): + cfile = opt_cfiles[opt_level] ok = py_compile.compile(fullname, cfile, dfile, True, optimize=opt_level, invalidation_mode=invalidation_mode) + if index > 0 and hardlink_dupes: + previous_cfile = opt_cfiles[optimize[index - 1]] + if filecmp.cmp(cfile, previous_cfile, shallow=False): + os.unlink(cfile) + os.link(previous_cfile, cfile) except py_compile.PyCompileError as err: success = False if quiet >= 2: @@ -352,6 +367,9 @@ def main(): 'Python interpreter itself (specified by -O).')) parser.add_argument('-e', metavar='DIR', dest='limit_sl_dest', help='Ignore symlinks pointing outsite of the DIR') + parser.add_argument('--hardlink-dupes', action='store_true', + dest='hardlink_dupes', + help='Hardlink duplicated pyc files') args = parser.parse_args() compile_dests = args.compile_dest @@ -371,6 +389,10 @@ def main(): if args.opt_levels is None: args.opt_levels = [-1] + if len(args.opt_levels) == 1 and args.hardlink_dupes: + parser.error(("Hardlinking of duplicated bytecode makes sense " + "only for more than one optimization level.")) + if args.ddir is not None and ( args.stripdir is not None or args.prependdir is not None ): @@ -404,7 +426,8 @@ def main(): stripdir=args.stripdir, prependdir=args.prependdir, optimize=args.opt_levels, - 
limit_sl_dest=args.limit_sl_dest): + limit_sl_dest=args.limit_sl_dest, + hardlink_dupes=args.hardlink_dupes): success = False else: if not compile_dir(dest, maxlevels, args.ddir, @@ -414,7 +437,8 @@ def main(): stripdir=args.stripdir, prependdir=args.prependdir, optimize=args.opt_levels, - limit_sl_dest=args.limit_sl_dest): + limit_sl_dest=args.limit_sl_dest, + hardlink_dupes=args.hardlink_dupes): success = False return success else: diff --git a/Lib/test/test_compileall.py b/Lib/test/test_compileall.py index 72678945089f28..efc2d84f894f54 100644 --- a/Lib/test/test_compileall.py +++ b/Lib/test/test_compileall.py @@ -11,6 +11,7 @@ import time import unittest import io +import filecmp from unittest import mock, skipUnless try: @@ -360,6 +361,234 @@ def test_ignore_symlink_destination(self): self.assertTrue(os.path.isfile(allowed_bc)) self.assertFalse(os.path.isfile(prohibited_bc)) + def test_hardlink_deduplication_bad_args(self): + # Bad arguments combination, hardlink deduplication make sense + # only for more than one optimization level + with self.assertRaises(ValueError): + compileall.compile_dir(self.directory, quiet=True, optimize=0, + hardlink_dupes=True) + + def test_hardlink_deduplication_same_bytecode_all_opt(self): + # 'a = 0' produces the same bytecode for all optimization levels + path = os.path.join(self.directory, "test", "same_all") + os.makedirs(path) + + simple_script = script_helper.make_script(path, "test_same_bytecode", + "a = 0") + pyc_opt0 = importlib.util.cache_from_source(simple_script) + pyc_opt1 = importlib.util.cache_from_source(simple_script, + optimization=1) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], + hardlink_dupes=True) + + # import pdb; pdb.set_trace() + + # All three files should have the same inode (hardlinks) + self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertEqual(os.stat(pyc_opt1).st_ino, 
os.stat(pyc_opt2).st_ino) + + for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: + os.unlink(pyc_file) + + compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], + hardlink_dupes=False) + + # Deduplication disabled, all pyc files should have different inodes + self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + def test_hardlink_deduplication_same_bytecode_some_opt(self): + # 'a = 0' produces the same bytecode for all optimization levels + # only two levels of optimization [0, 1] tested + path = os.path.join(self.directory, "test", "same_some") + os.makedirs(path) + + simple_script = script_helper.make_script(path, "test_same_bytecode", + "a = 0") + pyc_opt0 = importlib.util.cache_from_source(simple_script) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + compileall.compile_dir(path, quiet=True, optimize=[0, 2], + hardlink_dupes=True) + + # Both files should have the same inode (hardlink) + self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt2).st_ino) + + for pyc_file in {pyc_opt0, pyc_opt2}: + os.unlink(pyc_file) + + compileall.compile_dir(path, quiet=True, force=True, optimize=[0, 2], + hardlink_dupes=False) + + # Deduplication disabled, both pyc files should have different inodes + self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt2).st_ino) + + def test_hardlink_deduplication_same_bytecode_some_opt_2(self): + # 'a = 0' produces the same bytecode for all optimization levels + path = os.path.join(self.directory, "test", "same_some_2") + os.makedirs(path) + + simple_script = script_helper.make_script(path, "test_same_bytecode", + "a = 0") + pyc_opt1 = importlib.util.cache_from_source(simple_script, + optimization=1) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + compileall.compile_dir(path, quiet=True, optimize=[1, 2], + hardlink_dupes=True) + + # Both files should have the same 
inode (hardlinks) + self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + for pyc_file in {pyc_opt1, pyc_opt2}: + os.unlink(pyc_file) + + compileall.compile_dir(path, quiet=True, optimize=[1, 2]) + + # Deduplication disabled, all pyc files should have different inodes + self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + def test_hardlink_deduplication_different_bytecode_all_opt(self): + # "'''string'''\nassert 1" produces a different bytecode for + # all optimization levels + path = os.path.join(self.directory, "test", "different_all") + os.makedirs(path) + + simple_script = script_helper.make_script( + path, "test_different_bytecode", "'''string'''\nassert 1" + ) + pyc_opt0 = importlib.util.cache_from_source(simple_script) + pyc_opt1 = importlib.util.cache_from_source(simple_script, + optimization=1) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], + hardlink_dupes=True) + + # No hardlinks, bytecodes are different + self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: + os.unlink(pyc_file) + + compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], + hardlink_dupes=False) + + # Disabling hardlink deduplication makes no difference + self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + def test_hardlink_deduplication_different_bytecode_one_hardlink(self): + # "'''string'''\na = 1" produces the same bytecode only + # for level 0 and 1 + path = os.path.join(self.directory, "test", "different_one") + os.makedirs(path) + + simple_script = script_helper.make_script( + path, "test_different_bytecode", "'''string'''\na = 1" + ) + pyc_opt0 = importlib.util.cache_from_source(simple_script) + 
pyc_opt1 = importlib.util.cache_from_source(simple_script, + optimization=1) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], + hardlink_dupes=True) + + # Only level 0 and 1 has the same inode, level 2 produces + # a different bytecode + self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: + os.unlink(pyc_file) + + compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], + hardlink_dupes=False) + + # Deduplication disabled, no hardlinks + self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + def test_hardlink_deduplication_recompilation(self): + path = os.path.join(self.directory, "test", "module_change") + os.makedirs(path) + + simple_script = script_helper.make_script(path, "module_change", + "a = 0") + pyc_opt0 = importlib.util.cache_from_source(simple_script) + pyc_opt1 = importlib.util.cache_from_source(simple_script, + optimization=1) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], + hardlink_dupes=True) + + # All three levels have the same inode + self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + previous_inode = os.stat(pyc_opt0).st_ino + + # Change of the module content + simple_script = script_helper.make_script(path, "module_change", + "print(0)") + + # Recompilation without -o 1 + compileall.compile_dir(path, force=True, quiet=True, optimize=[0, 2], + hardlink_dupes=True) + + # opt-1.pyc should have the same inode as before and others should not + self.assertEqual(previous_inode, os.stat(pyc_opt1).st_ino) + 
self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt2).st_ino) + self.assertNotEqual(previous_inode, os.stat(pyc_opt2).st_ino) + # opt-1.pyc and opt-2.pyc have different content + self.assertFalse(filecmp.cmp(pyc_opt1, pyc_opt2, shallow=True)) + + def test_hardlink_deduplication_import(self): + path = os.path.join(self.directory, "test", "module_import") + os.makedirs(path) + + simple_script = script_helper.make_script(path, "module", "a = 0") + pyc_opt0 = importlib.util.cache_from_source(simple_script) + pyc_opt1 = importlib.util.cache_from_source(simple_script, + optimization=1) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], + hardlink_dupes=True) + + # All three levels have the same inode + self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + previous_inode = os.stat(pyc_opt0).st_ino + + # Change of the module content + simple_script = script_helper.make_script(path, "module", "print(0)") + + # Import the module in Python + script_helper.assert_python_ok( + "-O", "-c", "import module", __isolated=False, PYTHONPATH=path + ) + + # Only opt-1.pyc is changed + self.assertEqual(previous_inode, os.stat(pyc_opt0).st_ino) + self.assertEqual(previous_inode, os.stat(pyc_opt2).st_ino) + self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + # opt-1.pyc and opt-2.pyc have different content + self.assertFalse(filecmp.cmp(pyc_opt1, pyc_opt2, shallow=True)) + class CompileallTestsWithSourceEpoch(CompileallTestsBase, unittest.TestCase, @@ -825,6 +1054,223 @@ def test_ignore_symlink_destination(self): self.assertTrue(os.path.isfile(allowed_bc)) self.assertFalse(os.path.isfile(prohibited_bc)) + def test_hardlink_deduplication_bad_args(self): + # Bad arguments combination, hardlink deduplication make sense + # only for more than one optimization level + 
self.assertRunNotOK(self.directory, "-o 1", "--hardlink_dupes") + + def test_hardlink_deduplication_same_bytecode_all_opt(self): + # 'a = 0' produces the same bytecode for all optimization levels + path = os.path.join(self.directory, "test", "same_all") + os.makedirs(path) + + simple_script = script_helper.make_script(path, "test_same_bytecode", + "a = 0") + pyc_opt0 = importlib.util.cache_from_source(simple_script) + pyc_opt1 = importlib.util.cache_from_source(simple_script, + optimization=1) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + self.assertRunOK(path, "-q", "-o 0", "-o 1", "-o 2", + "--hardlink-dupes") + + # All three files should have the same inode (hardlinks) + self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: + os.unlink(pyc_file) + + self.assertRunOK(path, "-q", "-o 0", "-o 1", "-o 2") + + # Deduplication disabled, all pyc files should have different inodes + self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + def test_hardlink_deduplication_same_bytecode_some_opt(self): + # 'a = 0' produces the same bytecode for all optimization levels + # only two levels of optimization [0, 1] tested + path = os.path.join(self.directory, "test", "same_some") + os.makedirs(path) + + simple_script = script_helper.make_script(path, "test_same_bytecode", + "a = 0") + pyc_opt0 = importlib.util.cache_from_source(simple_script) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + self.assertRunOK(path, "-q", "-o 0", "-o 2", "--hardlink-dupes") + + # Both files should have the same inode (hardlink) + self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt2).st_ino) + + for pyc_file in {pyc_opt0, pyc_opt2}: + os.unlink(pyc_file) + + self.assertRunOK(path, "-q", "-o 0", 
"-o 2") + + # Deduplication disabled, both pyc files should have different inodes + self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt2).st_ino) + + def test_hardlink_deduplication_same_bytecode_some_opt_2(self): + # 'a = 0' produces the same bytecode for all optimization levels + path = os.path.join(self.directory, "test", "same_some_2") + os.makedirs(path) + + simple_script = script_helper.make_script(path, "test_same_bytecode", + "a = 0") + pyc_opt1 = importlib.util.cache_from_source(simple_script, + optimization=1) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + self.assertRunOK(path, "-q", "-o 1", "-o 2", "--hardlink-dupes") + + # Both files should have the same inode (hardlinks) + self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + for pyc_file in {pyc_opt1, pyc_opt2}: + os.unlink(pyc_file) + + self.assertRunOK(path, "-q", "-o 1", "-o 2") + + # Deduplication disabled, all pyc files should have different inodes + self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + def test_hardlink_deduplication_different_bytecode_all_opt(self): + # "'''string'''\nassert 1" produces a different bytecode for + # all optimization levels + path = os.path.join(self.directory, "test", "different_all") + os.makedirs(path) + + simple_script = script_helper.make_script(path, + "test_different_bytecode", + "'''string'''\nassert 1") + pyc_opt0 = importlib.util.cache_from_source(simple_script) + pyc_opt1 = importlib.util.cache_from_source(simple_script, + optimization=1) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + self.assertRunOK(path, "-q", "-o 0", "-o 1", "-o 2", + "--hardlink-dupes") + + # No hardlinks, bytecodes are different + self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: + os.unlink(pyc_file) + + 
self.assertRunOK(path, "-q", "-o 0", "-o 1", "-o 2") + + # Disabling hardlink deduplication makes no difference + self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + def test_hardlink_deduplication_different_bytecode_one_hardlink(self): + # "'''string'''\na = 1" produces the same bytecode only + # for level 0 and 1 + path = os.path.join(self.directory, "test", "different_one") + os.makedirs(path) + + simple_script = script_helper.make_script( + path, "test_different_bytecode", "'''string'''\na = 1" + ) + pyc_opt0 = importlib.util.cache_from_source(simple_script) + pyc_opt1 = importlib.util.cache_from_source(simple_script, + optimization=1) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + self.assertRunOK(path, "-q", "-o 0", "-o 1", "-o 2", + "--hardlink-dupes") + + # Only level 0 and 1 has the same inode, level 2 produces + # a different bytecode + self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: + os.unlink(pyc_file) + + self.assertRunOK(path, "-q", "-o 0", "-o 1", "-o 2") + + # Deduplication disabled, no hardlinks + self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + def test_hardlink_deduplication_recompilation(self): + path = os.path.join(self.directory, "test", "module_change") + os.makedirs(path) + + simple_script = script_helper.make_script(path, "module_change", + "a = 0") + pyc_opt0 = importlib.util.cache_from_source(simple_script) + pyc_opt1 = importlib.util.cache_from_source(simple_script, + optimization=1) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + self.assertRunOK(path, "-f", "-q", "-o 0", "-o 1", "-o 2", + "--hardlink-dupes") + + # All three 
levels have the same inode + self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + previous_inode = os.stat(pyc_opt0).st_ino + + # Change of the module content + simple_script = script_helper.make_script(path, "module_change", + "print(0)") + + # Recompilation without -o 1 + self.assertRunOK(path, "-f", "-q", "-o 0", "-o 2", "--hardlink-dupes") + + # opt-1.pyc should have the same inode as before and others should not + self.assertEqual(previous_inode, os.stat(pyc_opt1).st_ino) + self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt2).st_ino) + self.assertNotEqual(previous_inode, os.stat(pyc_opt2).st_ino) + # opt-1.pyc and opt-2.pyc have different content + self.assertFalse(filecmp.cmp(pyc_opt1, pyc_opt2, shallow=True)) + + def test_hardlink_deduplication_import(self): + path = os.path.join(self.directory, "test", "module_import") + os.makedirs(path) + + simple_script = script_helper.make_script(path, "module", "a = 0") + pyc_opt0 = importlib.util.cache_from_source(simple_script) + pyc_opt1 = importlib.util.cache_from_source(simple_script, + optimization=1) + pyc_opt2 = importlib.util.cache_from_source(simple_script, + optimization=2) + + self.assertRunOK(path, "-f", "-q", "-o 0", "-o 1", "-o 2", + "--hardlink-dupes") + + # All three levels have the same inode + self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) + self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + + previous_inode = os.stat(pyc_opt0).st_ino + + # Change of the module content + simple_script = script_helper.make_script(path, "module", "print(0)") + + # Import the module in Python + script_helper.assert_python_ok( + "-O", "-c", "import module", __isolated=False, PYTHONPATH=path + ) + + # Only opt-1.pyc is changed + self.assertEqual(previous_inode, os.stat(pyc_opt0).st_ino) + self.assertEqual(previous_inode, os.stat(pyc_opt2).st_ino) + 
self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + # opt-1.pyc and opt-2.pyc have different content + self.assertFalse(filecmp.cmp(pyc_opt1, pyc_opt2, shallow=True)) + class CommandLineTestsWithSourceEpoch(CommandLineTestsBase, unittest.TestCase, diff --git a/Misc/ACKS b/Misc/ACKS index 9221f6aae439ea..5602e881538ca6 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -86,6 +86,7 @@ Marcin Bachry Alfonso Baciero Dwayne Bailey Stig Bakken +Lumír Balhar Aleksandr Balezin Greg Ball Lewis Ball diff --git a/Misc/NEWS.d/next/Library/2020-05-04-11-20-49.bpo-40495.TyTc2O.rst b/Misc/NEWS.d/next/Library/2020-05-04-11-20-49.bpo-40495.TyTc2O.rst new file mode 100644 index 00000000000000..65ee4a724b6ad1 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2020-05-04-11-20-49.bpo-40495.TyTc2O.rst @@ -0,0 +1,2 @@ +:mod:`compileall` is now able to use hardlinks to prevent duplicates in a +case when pyc files for different optimization levels have the same content. From 7f8b63fe305c969aa45bfc488e3b7f45b142c9c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lum=C3=ADr=20=27Frenzy=27=20Balhar?= Date: Wed, 6 May 2020 13:07:29 +0200 Subject: [PATCH 02/13] Update Misc/NEWS.d/next/Library/2020-05-04-11-20-49.bpo-40495.TyTc2O.rst MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Miro Hrončok --- .../next/Library/2020-05-04-11-20-49.bpo-40495.TyTc2O.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Misc/NEWS.d/next/Library/2020-05-04-11-20-49.bpo-40495.TyTc2O.rst b/Misc/NEWS.d/next/Library/2020-05-04-11-20-49.bpo-40495.TyTc2O.rst index 65ee4a724b6ad1..d3049b05a78b6c 100644 --- a/Misc/NEWS.d/next/Library/2020-05-04-11-20-49.bpo-40495.TyTc2O.rst +++ b/Misc/NEWS.d/next/Library/2020-05-04-11-20-49.bpo-40495.TyTc2O.rst @@ -1,2 +1,2 @@ :mod:`compileall` is now able to use hardlinks to prevent duplicates in a -case when pyc files for different optimization levels have the same content. 
+case when ``.pyc`` files for different optimization levels have the same content. From 6a9efa2b16f0a716fe0bec53bb0c9e9a3764852a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lum=C3=ADr=20=27Frenzy=27=20Balhar?= Date: Tue, 12 May 2020 06:50:08 +0200 Subject: [PATCH 03/13] Update Doc/library/compileall.rst Co-authored-by: Victor Stinner --- Doc/library/compileall.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/compileall.rst b/Doc/library/compileall.rst index 337f75acd3b9af..90b659357f1b1e 100644 --- a/Doc/library/compileall.rst +++ b/Doc/library/compileall.rst @@ -198,7 +198,7 @@ Public functions the ``-s``, ``-p`` and ``-e`` options described above. They may be specified as ``str``, ``bytes`` or :py:class:`os.PathLike`. - If *hardlink_dupes* is ``True``, hardlinks are used to prevent duplicates + If *hardlink_dupes* is true, hardlinks are used to prevent duplicates if ``.pyc`` files for multiple optimization levels have the same content. .. versionchanged:: 3.2 From e1ef909b356fff381a96a8910135253469641872 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lum=C3=ADr=20=27Frenzy=27=20Balhar?= Date: Tue, 12 May 2020 06:50:20 +0200 Subject: [PATCH 04/13] Update Lib/compileall.py Co-authored-by: Victor Stinner --- Lib/compileall.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/compileall.py b/Lib/compileall.py index 5984058bdc9372..f52fa786447f56 100644 --- a/Lib/compileall.py +++ b/Lib/compileall.py @@ -182,8 +182,8 @@ def compile_file(fullname, ddir=None, force=False, rx=None, quiet=0, optimize = [optimize] if hardlink_dupes: - raise ValueError(("Hardlinking of duplicated bytecode makes sense " - "only for more than one optimization level.")) + raise ValueError("Hardlinking of duplicated bytecode makes sense " + "only for more than one optimization level.") if rx is not None: mo = rx.search(fullname) From b314c5fe7a870e3748043fe22217aeb9261553ad Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Tue, 12 May 2020 
06:59:38 +0200 Subject: [PATCH 05/13] remove debug code --- Lib/test/test_compileall.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/test/test_compileall.py b/Lib/test/test_compileall.py index efc2d84f894f54..024f1312c42679 100644 --- a/Lib/test/test_compileall.py +++ b/Lib/test/test_compileall.py @@ -384,8 +384,6 @@ def test_hardlink_deduplication_same_bytecode_all_opt(self): compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], hardlink_dupes=True) - # import pdb; pdb.set_trace() - # All three files should have the same inode (hardlinks) self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) From e2f3a5080298fba9aa16408bcac61efa7133867d Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Tue, 12 May 2020 07:09:15 +0200 Subject: [PATCH 06/13] docs update --- Doc/library/compileall.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Doc/library/compileall.rst b/Doc/library/compileall.rst index 90b659357f1b1e..a511c7eda265b2 100644 --- a/Doc/library/compileall.rst +++ b/Doc/library/compileall.rst @@ -115,8 +115,8 @@ compile Python sources. .. cmdoption:: --hardlink-dupes - Use hardlinks to prevent duplicates if ``.pyc`` files for multiple - optimization levels have the same content. + If two ``.pyc`` files with different optimization level have + the same content, use hard links to consolidate duplicate files. .. versionchanged:: 3.2 Added the ``-i``, ``-b`` and ``-h`` options. @@ -198,8 +198,8 @@ Public functions the ``-s``, ``-p`` and ``-e`` options described above. They may be specified as ``str``, ``bytes`` or :py:class:`os.PathLike`. - If *hardlink_dupes* is true, hardlinks are used to prevent duplicates - if ``.pyc`` files for multiple optimization levels have the same content. + If *hardlink_dupes* is true and two ``.pyc`` files with different optimization + level have the same content, use hard links to consolidate duplicate files. 
.. versionchanged:: 3.2 Added the *legacy* and *optimize* parameter. @@ -265,8 +265,8 @@ Public functions the ``-s``, ``-p`` and ``-e`` options described above. They may be specified as ``str``, ``bytes`` or :py:class:`os.PathLike`. - If *hardlink_dupes* is ``True``, hardlinks are used to prevent duplicates - if ``.pyc`` files for multiple optimization levels have the same content. + If *hardlink_dupes* is true and two ``.pyc`` files with different optimization + level have the same content, use hard links to consolidate duplicate files. .. versionadded:: 3.2 From 4607d08cca851d914d9eac16119592a95544ae6e Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Tue, 12 May 2020 07:45:05 +0200 Subject: [PATCH 07/13] use is_hardlink to check inodes instead of repeating code --- Lib/test/test_compileall.py | 50 +++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/Lib/test/test_compileall.py b/Lib/test/test_compileall.py index 024f1312c42679..00b04f48e3acd1 100644 --- a/Lib/test/test_compileall.py +++ b/Lib/test/test_compileall.py @@ -74,6 +74,12 @@ def recreation_check(self, metadata): compileall.compile_dir(self.directory, force=False, quiet=True) self.assertTrue(*self.timestamp_metadata()) + def is_hardlink(self, filename1, filename2): + """Returns True if two files have the same inode (hardlink)""" + inode1 = os.stat(filename1).st_ino + inode2 = os.stat(filename2).st_ino + return inode1 == inode2 + def test_mtime(self): # Test a change in mtime leads to a new .pyc. 
self.recreation_check(struct.pack('<4sll', importlib.util.MAGIC_NUMBER, @@ -385,8 +391,8 @@ def test_hardlink_deduplication_same_bytecode_all_opt(self): hardlink_dupes=True) # All three files should have the same inode (hardlinks) - self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + self.assertTrue(self.is_hardlink(pyc_opt0, pyc_opt1)) + self.assertTrue(self.is_hardlink(pyc_opt1, pyc_opt2)) for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: os.unlink(pyc_file) @@ -395,8 +401,8 @@ def test_hardlink_deduplication_same_bytecode_all_opt(self): hardlink_dupes=False) # Deduplication disabled, all pyc files should have different inodes - self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + self.assertFalse(self.is_hardlink(pyc_opt0, pyc_opt1)) + self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) def test_hardlink_deduplication_same_bytecode_some_opt(self): # 'a = 0' produces the same bytecode for all optimization levels @@ -414,7 +420,7 @@ def test_hardlink_deduplication_same_bytecode_some_opt(self): hardlink_dupes=True) # Both files should have the same inode (hardlink) - self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt2).st_ino) + self.assertTrue(self.is_hardlink(pyc_opt0, pyc_opt2)) for pyc_file in {pyc_opt0, pyc_opt2}: os.unlink(pyc_file) @@ -423,7 +429,7 @@ def test_hardlink_deduplication_same_bytecode_some_opt(self): hardlink_dupes=False) # Deduplication disabled, both pyc files should have different inodes - self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt2).st_ino) + self.assertFalse(self.is_hardlink(pyc_opt0, pyc_opt2)) def test_hardlink_deduplication_same_bytecode_some_opt_2(self): # 'a = 0' produces the same bytecode for all optimization levels @@ -441,7 +447,7 @@ def test_hardlink_deduplication_same_bytecode_some_opt_2(self): hardlink_dupes=True) # Both 
files should have the same inode (hardlinks) - self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + self.assertTrue(self.is_hardlink(pyc_opt1, pyc_opt2)) for pyc_file in {pyc_opt1, pyc_opt2}: os.unlink(pyc_file) @@ -449,7 +455,7 @@ def test_hardlink_deduplication_same_bytecode_some_opt_2(self): compileall.compile_dir(path, quiet=True, optimize=[1, 2]) # Deduplication disabled, all pyc files should have different inodes - self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) def test_hardlink_deduplication_different_bytecode_all_opt(self): # "'''string'''\nassert 1" produces a different bytecode for @@ -470,8 +476,8 @@ def test_hardlink_deduplication_different_bytecode_all_opt(self): hardlink_dupes=True) # No hardlinks, bytecodes are different - self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + self.assertFalse(self.is_hardlink(pyc_opt0, pyc_opt1)) + self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: os.unlink(pyc_file) @@ -480,8 +486,8 @@ def test_hardlink_deduplication_different_bytecode_all_opt(self): hardlink_dupes=False) # Disabling hardlink deduplication makes no difference - self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + self.assertFalse(self.is_hardlink(pyc_opt0, pyc_opt1)) + self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) def test_hardlink_deduplication_different_bytecode_one_hardlink(self): # "'''string'''\na = 1" produces the same bytecode only @@ -503,8 +509,8 @@ def test_hardlink_deduplication_different_bytecode_one_hardlink(self): # Only level 0 and 1 has the same inode, level 2 produces # a different bytecode - self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - 
self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + self.assertTrue(self.is_hardlink(pyc_opt0, pyc_opt1)) + self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: os.unlink(pyc_file) @@ -513,8 +519,8 @@ def test_hardlink_deduplication_different_bytecode_one_hardlink(self): hardlink_dupes=False) # Deduplication disabled, no hardlinks - self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + self.assertFalse(self.is_hardlink(pyc_opt0, pyc_opt1)) + self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) def test_hardlink_deduplication_recompilation(self): path = os.path.join(self.directory, "test", "module_change") @@ -532,8 +538,8 @@ def test_hardlink_deduplication_recompilation(self): hardlink_dupes=True) # All three levels have the same inode - self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + self.assertTrue(self.is_hardlink(pyc_opt0, pyc_opt1)) + self.assertTrue(self.is_hardlink(pyc_opt1, pyc_opt2)) previous_inode = os.stat(pyc_opt0).st_ino @@ -547,7 +553,7 @@ def test_hardlink_deduplication_recompilation(self): # opt-1.pyc should have the same inode as before and others should not self.assertEqual(previous_inode, os.stat(pyc_opt1).st_ino) - self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt2).st_ino) + self.assertTrue(self.is_hardlink(pyc_opt0, pyc_opt2)) self.assertNotEqual(previous_inode, os.stat(pyc_opt2).st_ino) # opt-1.pyc and opt-2.pyc have different content self.assertFalse(filecmp.cmp(pyc_opt1, pyc_opt2, shallow=True)) @@ -567,8 +573,8 @@ def test_hardlink_deduplication_import(self): hardlink_dupes=True) # All three levels have the same inode - self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + 
self.assertTrue(self.is_hardlink(pyc_opt0, pyc_opt1)) + self.assertTrue(self.is_hardlink(pyc_opt1, pyc_opt2)) previous_inode = os.stat(pyc_opt0).st_ino @@ -583,7 +589,7 @@ def test_hardlink_deduplication_import(self): # Only opt-1.pyc is changed self.assertEqual(previous_inode, os.stat(pyc_opt0).st_ino) self.assertEqual(previous_inode, os.stat(pyc_opt2).st_ino) - self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) # opt-1.pyc and opt-2.pyc have different content self.assertFalse(filecmp.cmp(pyc_opt1, pyc_opt2, shallow=True)) From 4fb779a6a2cda448f22424c9421593ea92226fb0 Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Tue, 12 May 2020 08:09:12 +0200 Subject: [PATCH 08/13] use subTest to parametrize three tests with different combinations of opt levels --- Lib/test/test_compileall.py | 96 +++++++++++-------------------------- 1 file changed, 27 insertions(+), 69 deletions(-) diff --git a/Lib/test/test_compileall.py b/Lib/test/test_compileall.py index 00b04f48e3acd1..9f006048f2ae96 100644 --- a/Lib/test/test_compileall.py +++ b/Lib/test/test_compileall.py @@ -12,6 +12,7 @@ import unittest import io import filecmp +import itertools from unittest import mock, skipUnless try: @@ -374,88 +375,45 @@ def test_hardlink_deduplication_bad_args(self): compileall.compile_dir(self.directory, quiet=True, optimize=0, hardlink_dupes=True) - def test_hardlink_deduplication_same_bytecode_all_opt(self): - # 'a = 0' produces the same bytecode for all optimization levels - path = os.path.join(self.directory, "test", "same_all") - os.makedirs(path) - - simple_script = script_helper.make_script(path, "test_same_bytecode", - "a = 0") - pyc_opt0 = importlib.util.cache_from_source(simple_script) - pyc_opt1 = importlib.util.cache_from_source(simple_script, - optimization=1) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) - - compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], 
- hardlink_dupes=True) - - # All three files should have the same inode (hardlinks) - self.assertTrue(self.is_hardlink(pyc_opt0, pyc_opt1)) - self.assertTrue(self.is_hardlink(pyc_opt1, pyc_opt2)) - - for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: - os.unlink(pyc_file) - - compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], - hardlink_dupes=False) - - # Deduplication disabled, all pyc files should have different inodes - self.assertFalse(self.is_hardlink(pyc_opt0, pyc_opt1)) - self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) - - def test_hardlink_deduplication_same_bytecode_some_opt(self): + def test_hardlink_deduplication_same_bytecode(self): # 'a = 0' produces the same bytecode for all optimization levels - # only two levels of optimization [0, 1] tested - path = os.path.join(self.directory, "test", "same_some") + path = os.path.join(self.directory, "test", "same") os.makedirs(path) simple_script = script_helper.make_script(path, "test_same_bytecode", "a = 0") - pyc_opt0 = importlib.util.cache_from_source(simple_script) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) - - compileall.compile_dir(path, quiet=True, optimize=[0, 2], - hardlink_dupes=True) - - # Both files should have the same inode (hardlink) - self.assertTrue(self.is_hardlink(pyc_opt0, pyc_opt2)) - - for pyc_file in {pyc_opt0, pyc_opt2}: - os.unlink(pyc_file) - compileall.compile_dir(path, quiet=True, force=True, optimize=[0, 2], - hardlink_dupes=False) - - # Deduplication disabled, both pyc files should have different inodes - self.assertFalse(self.is_hardlink(pyc_opt0, pyc_opt2)) + opt_combinations = ((0, 1, 2), (1, 2), (0, 2)) - def test_hardlink_deduplication_same_bytecode_some_opt_2(self): - # 'a = 0' produces the same bytecode for all optimization levels - path = os.path.join(self.directory, "test", "same_some_2") - os.makedirs(path) + for opt_combination in opt_combinations: + with self.subTest(opt_combination=opt_combination): - simple_script = 
script_helper.make_script(path, "test_same_bytecode", - "a = 0") - pyc_opt1 = importlib.util.cache_from_source(simple_script, - optimization=1) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) + pycs = {} + for opt_level in opt_combination: + pycs[opt_level] = importlib.util.cache_from_source( + simple_script, optimization=opt_level + ) - compileall.compile_dir(path, quiet=True, optimize=[1, 2], - hardlink_dupes=True) + compileall.compile_dir( + path, quiet=True, optimize=opt_combination, + hardlink_dupes=True + ) - # Both files should have the same inode (hardlinks) - self.assertTrue(self.is_hardlink(pyc_opt1, pyc_opt2)) + # All three files should have the same inode (hardlinks) + for pair in itertools.combinations(opt_combination, 2): + self.assertTrue(self.is_hardlink(pycs[pair[0]], pycs[pair[1]])) - for pyc_file in {pyc_opt1, pyc_opt2}: - os.unlink(pyc_file) + for pyc_file in pycs.values(): + os.unlink(pyc_file) - compileall.compile_dir(path, quiet=True, optimize=[1, 2]) + compileall.compile_dir( + path, quiet=True, optimize=opt_combination, + hardlink_dupes=False + ) - # Deduplication disabled, all pyc files should have different inodes - self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) + # Deduplication disabled, all pyc files should have different inodes + for pair in itertools.combinations(opt_combination, 2): + self.assertFalse(self.is_hardlink(pycs[pair[0]], pycs[pair[1]])) def test_hardlink_deduplication_different_bytecode_all_opt(self): # "'''string'''\nassert 1" produces a different bytecode for From 97b057edc7165492522ba0226cd40296c51c5ec5 Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Tue, 12 May 2020 08:58:50 +0200 Subject: [PATCH 09/13] fix tests --- Lib/test/test_compileall.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_compileall.py b/Lib/test/test_compileall.py index 9f006048f2ae96..d5b615e9b80c7c 100644 --- a/Lib/test/test_compileall.py +++ 
b/Lib/test/test_compileall.py @@ -390,8 +390,12 @@ def test_hardlink_deduplication_same_bytecode(self): pycs = {} for opt_level in opt_combination: + # We need this because importlib.util.cache_from_source + # produces different results when called with + # optimization=0 and without optimization + optimization_kwarg = {"optimization": opt_level} if opt_level > 0 else {} pycs[opt_level] = importlib.util.cache_from_source( - simple_script, optimization=opt_level + simple_script, **optimization_kwarg ) compileall.compile_dir( @@ -415,6 +419,9 @@ def test_hardlink_deduplication_same_bytecode(self): for pair in itertools.combinations(opt_combination, 2): self.assertFalse(self.is_hardlink(pycs[pair[0]], pycs[pair[1]])) + for pyc_file in pycs.values(): + os.unlink(pyc_file) + def test_hardlink_deduplication_different_bytecode_all_opt(self): # "'''string'''\nassert 1" produces a different bytecode for # all optimization levels From 9ca6eae44a8e13207caff3ea657dd6d8ebc5f2de Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 13 May 2020 16:24:47 +0200 Subject: [PATCH 10/13] Refactor tests * Add HardlinkDedupTestsBase test case. * Only keep two tests on the command line interface since most tests were duplicated with HardlinkDedupTestsBase. * Add helper functions and methods to factorize the code. * Sort imports * Replace 415 lines of tests with 205 lines: 2x smaller. 
--- Lib/test/test_compileall.py | 612 ++++++++++++------------------------ 1 file changed, 199 insertions(+), 413 deletions(-) diff --git a/Lib/test/test_compileall.py b/Lib/test/test_compileall.py index d5b615e9b80c7c..a277b638c5d917 100644 --- a/Lib/test/test_compileall.py +++ b/Lib/test/test_compileall.py @@ -1,18 +1,19 @@ -import sys import compileall +import contextlib +import filecmp import importlib.util -import test.test_importlib.util +import io +import itertools import os import pathlib import py_compile import shutil import struct +import sys import tempfile +import test.test_importlib.util import time import unittest -import io -import filecmp -import itertools from unittest import mock, skipUnless try: @@ -28,6 +29,24 @@ from .test_py_compile import SourceDateEpochTestMeta +def get_pyc(script, opt): + if not opt: + # Replace None and 0 with '' + opt = '' + return importlib.util.cache_from_source(script, optimization=opt) + + +def get_pycs(script): + return [get_pyc(script, opt) for opt in (0, 1, 2)] + + +def is_hardlink(filename1, filename2): + """Returns True if two files have the same inode (hardlink)""" + inode1 = os.stat(filename1).st_ino + inode2 = os.stat(filename2).st_ino + return inode1 == inode2 + + class CompileallTestsBase: def setUp(self): @@ -75,12 +94,6 @@ def recreation_check(self, metadata): compileall.compile_dir(self.directory, force=False, quiet=True) self.assertTrue(*self.timestamp_metadata()) - def is_hardlink(self, filename1, filename2): - """Returns True if two files have the same inode (hardlink)""" - inode1 = os.stat(filename1).st_ino - inode2 = os.stat(filename2).st_ino - return inode1 == inode2 - def test_mtime(self): # Test a change in mtime leads to a new .pyc. 
self.recreation_check(struct.pack('<4sll', importlib.util.MAGIC_NUMBER, @@ -368,196 +381,6 @@ def test_ignore_symlink_destination(self): self.assertTrue(os.path.isfile(allowed_bc)) self.assertFalse(os.path.isfile(prohibited_bc)) - def test_hardlink_deduplication_bad_args(self): - # Bad arguments combination, hardlink deduplication make sense - # only for more than one optimization level - with self.assertRaises(ValueError): - compileall.compile_dir(self.directory, quiet=True, optimize=0, - hardlink_dupes=True) - - def test_hardlink_deduplication_same_bytecode(self): - # 'a = 0' produces the same bytecode for all optimization levels - path = os.path.join(self.directory, "test", "same") - os.makedirs(path) - - simple_script = script_helper.make_script(path, "test_same_bytecode", - "a = 0") - - opt_combinations = ((0, 1, 2), (1, 2), (0, 2)) - - for opt_combination in opt_combinations: - with self.subTest(opt_combination=opt_combination): - - pycs = {} - for opt_level in opt_combination: - # We need this because importlib.util.cache_from_source - # produces different results when called with - # optimization=0 and without optimization - optimization_kwarg = {"optimization": opt_level} if opt_level > 0 else {} - pycs[opt_level] = importlib.util.cache_from_source( - simple_script, **optimization_kwarg - ) - - compileall.compile_dir( - path, quiet=True, optimize=opt_combination, - hardlink_dupes=True - ) - - # All three files should have the same inode (hardlinks) - for pair in itertools.combinations(opt_combination, 2): - self.assertTrue(self.is_hardlink(pycs[pair[0]], pycs[pair[1]])) - - for pyc_file in pycs.values(): - os.unlink(pyc_file) - - compileall.compile_dir( - path, quiet=True, optimize=opt_combination, - hardlink_dupes=False - ) - - # Deduplication disabled, all pyc files should have different inodes - for pair in itertools.combinations(opt_combination, 2): - self.assertFalse(self.is_hardlink(pycs[pair[0]], pycs[pair[1]])) - - for pyc_file in pycs.values(): - 
os.unlink(pyc_file) - - def test_hardlink_deduplication_different_bytecode_all_opt(self): - # "'''string'''\nassert 1" produces a different bytecode for - # all optimization levels - path = os.path.join(self.directory, "test", "different_all") - os.makedirs(path) - - simple_script = script_helper.make_script( - path, "test_different_bytecode", "'''string'''\nassert 1" - ) - pyc_opt0 = importlib.util.cache_from_source(simple_script) - pyc_opt1 = importlib.util.cache_from_source(simple_script, - optimization=1) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) - - compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], - hardlink_dupes=True) - - # No hardlinks, bytecodes are different - self.assertFalse(self.is_hardlink(pyc_opt0, pyc_opt1)) - self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) - - for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: - os.unlink(pyc_file) - - compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], - hardlink_dupes=False) - - # Disabling hardlink deduplication makes no difference - self.assertFalse(self.is_hardlink(pyc_opt0, pyc_opt1)) - self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) - - def test_hardlink_deduplication_different_bytecode_one_hardlink(self): - # "'''string'''\na = 1" produces the same bytecode only - # for level 0 and 1 - path = os.path.join(self.directory, "test", "different_one") - os.makedirs(path) - - simple_script = script_helper.make_script( - path, "test_different_bytecode", "'''string'''\na = 1" - ) - pyc_opt0 = importlib.util.cache_from_source(simple_script) - pyc_opt1 = importlib.util.cache_from_source(simple_script, - optimization=1) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) - - compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], - hardlink_dupes=True) - - # Only level 0 and 1 has the same inode, level 2 produces - # a different bytecode - self.assertTrue(self.is_hardlink(pyc_opt0, pyc_opt1)) - 
self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) - - for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: - os.unlink(pyc_file) - - compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], - hardlink_dupes=False) - - # Deduplication disabled, no hardlinks - self.assertFalse(self.is_hardlink(pyc_opt0, pyc_opt1)) - self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) - - def test_hardlink_deduplication_recompilation(self): - path = os.path.join(self.directory, "test", "module_change") - os.makedirs(path) - - simple_script = script_helper.make_script(path, "module_change", - "a = 0") - pyc_opt0 = importlib.util.cache_from_source(simple_script) - pyc_opt1 = importlib.util.cache_from_source(simple_script, - optimization=1) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) - - compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], - hardlink_dupes=True) - - # All three levels have the same inode - self.assertTrue(self.is_hardlink(pyc_opt0, pyc_opt1)) - self.assertTrue(self.is_hardlink(pyc_opt1, pyc_opt2)) - - previous_inode = os.stat(pyc_opt0).st_ino - - # Change of the module content - simple_script = script_helper.make_script(path, "module_change", - "print(0)") - - # Recompilation without -o 1 - compileall.compile_dir(path, force=True, quiet=True, optimize=[0, 2], - hardlink_dupes=True) - - # opt-1.pyc should have the same inode as before and others should not - self.assertEqual(previous_inode, os.stat(pyc_opt1).st_ino) - self.assertTrue(self.is_hardlink(pyc_opt0, pyc_opt2)) - self.assertNotEqual(previous_inode, os.stat(pyc_opt2).st_ino) - # opt-1.pyc and opt-2.pyc have different content - self.assertFalse(filecmp.cmp(pyc_opt1, pyc_opt2, shallow=True)) - - def test_hardlink_deduplication_import(self): - path = os.path.join(self.directory, "test", "module_import") - os.makedirs(path) - - simple_script = script_helper.make_script(path, "module", "a = 0") - pyc_opt0 = importlib.util.cache_from_source(simple_script) - pyc_opt1 = 
importlib.util.cache_from_source(simple_script, - optimization=1) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) - - compileall.compile_dir(path, quiet=True, optimize=[0, 1, 2], - hardlink_dupes=True) - - # All three levels have the same inode - self.assertTrue(self.is_hardlink(pyc_opt0, pyc_opt1)) - self.assertTrue(self.is_hardlink(pyc_opt1, pyc_opt2)) - - previous_inode = os.stat(pyc_opt0).st_ino - - # Change of the module content - simple_script = script_helper.make_script(path, "module", "print(0)") - - # Import the module in Python - script_helper.assert_python_ok( - "-O", "-c", "import module", __isolated=False, PYTHONPATH=path - ) - - # Only opt-1.pyc is changed - self.assertEqual(previous_inode, os.stat(pyc_opt0).st_ino) - self.assertEqual(previous_inode, os.stat(pyc_opt2).st_ino) - self.assertFalse(self.is_hardlink(pyc_opt1, pyc_opt2)) - # opt-1.pyc and opt-2.pyc have different content - self.assertFalse(filecmp.cmp(pyc_opt1, pyc_opt2, shallow=True)) - class CompileallTestsWithSourceEpoch(CompileallTestsBase, unittest.TestCase, @@ -1023,238 +846,201 @@ def test_ignore_symlink_destination(self): self.assertTrue(os.path.isfile(allowed_bc)) self.assertFalse(os.path.isfile(prohibited_bc)) - def test_hardlink_deduplication_bad_args(self): + def test_hardlink_bad_args(self): # Bad arguments combination, hardlink deduplication make sense # only for more than one optimization level - self.assertRunNotOK(self.directory, "-o 1", "--hardlink_dupes") - - def test_hardlink_deduplication_same_bytecode_all_opt(self): - # 'a = 0' produces the same bytecode for all optimization levels - path = os.path.join(self.directory, "test", "same_all") - os.makedirs(path) + self.assertRunNotOK(self.directory, "-o 1", "--hardlink-dupes") + + def test_hardlink(self): + # 'a = 0' code produces the same bytecode for the 3 optimization + # levels. All three .pyc files must have the same inode (hardlinks). 
+ # + # If deduplication is disabled, all pyc files must have different + # inodes. + for dedup in (True, False): + with tempfile.TemporaryDirectory() as path: + with self.subTest(dedup=dedup): + script = script_helper.make_script(path, "script", "a = 0") + pycs = get_pycs(script) + + args = ["-q", "-o 0", "-o 1", "-o 2"] + if dedup: + args.append("--hardlink-dupes") + self.assertRunOK(path, *args) + + self.assertEqual(is_hardlink(pycs[0], pycs[1]), dedup) + self.assertEqual(is_hardlink(pycs[1], pycs[2]), dedup) + self.assertEqual(is_hardlink(pycs[0], pycs[2]), dedup) - simple_script = script_helper.make_script(path, "test_same_bytecode", - "a = 0") - pyc_opt0 = importlib.util.cache_from_source(simple_script) - pyc_opt1 = importlib.util.cache_from_source(simple_script, - optimization=1) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) - self.assertRunOK(path, "-q", "-o 0", "-o 1", "-o 2", - "--hardlink-dupes") - - # All three files should have the same inode (hardlinks) - self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) - - for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: - os.unlink(pyc_file) - - self.assertRunOK(path, "-q", "-o 0", "-o 1", "-o 2") - - # Deduplication disabled, all pyc files should have different inodes - self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) - - def test_hardlink_deduplication_same_bytecode_some_opt(self): - # 'a = 0' produces the same bytecode for all optimization levels - # only two levels of optimization [0, 1] tested - path = os.path.join(self.directory, "test", "same_some") - os.makedirs(path) - - simple_script = script_helper.make_script(path, "test_same_bytecode", - "a = 0") - pyc_opt0 = importlib.util.cache_from_source(simple_script) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) - - 
self.assertRunOK(path, "-q", "-o 0", "-o 2", "--hardlink-dupes") - - # Both files should have the same inode (hardlink) - self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt2).st_ino) - - for pyc_file in {pyc_opt0, pyc_opt2}: - os.unlink(pyc_file) - - self.assertRunOK(path, "-q", "-o 0", "-o 2") - - # Deduplication disabled, both pyc files should have different inodes - self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt2).st_ino) - - def test_hardlink_deduplication_same_bytecode_some_opt_2(self): - # 'a = 0' produces the same bytecode for all optimization levels - path = os.path.join(self.directory, "test", "same_some_2") - os.makedirs(path) - - simple_script = script_helper.make_script(path, "test_same_bytecode", - "a = 0") - pyc_opt1 = importlib.util.cache_from_source(simple_script, - optimization=1) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) - - self.assertRunOK(path, "-q", "-o 1", "-o 2", "--hardlink-dupes") - - # Both files should have the same inode (hardlinks) - self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) - - for pyc_file in {pyc_opt1, pyc_opt2}: - os.unlink(pyc_file) - - self.assertRunOK(path, "-q", "-o 1", "-o 2") - - # Deduplication disabled, all pyc files should have different inodes - self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) - - def test_hardlink_deduplication_different_bytecode_all_opt(self): - # "'''string'''\nassert 1" produces a different bytecode for - # all optimization levels - path = os.path.join(self.directory, "test", "different_all") - os.makedirs(path) - - simple_script = script_helper.make_script(path, - "test_different_bytecode", - "'''string'''\nassert 1") - pyc_opt0 = importlib.util.cache_from_source(simple_script) - pyc_opt1 = importlib.util.cache_from_source(simple_script, - optimization=1) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) - - self.assertRunOK(path, "-q", "-o 0", "-o 1", "-o 2", - 
"--hardlink-dupes") - - # No hardlinks, bytecodes are different - self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) +class CommandLineTestsWithSourceEpoch(CommandLineTestsBase, + unittest.TestCase, + metaclass=SourceDateEpochTestMeta, + source_date_epoch=True): + pass - for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: - os.unlink(pyc_file) - self.assertRunOK(path, "-q", "-o 0", "-o 1", "-o 2") +class CommandLineTestsNoSourceEpoch(CommandLineTestsBase, + unittest.TestCase, + metaclass=SourceDateEpochTestMeta, + source_date_epoch=False): + pass - # Disabling hardlink deduplication makes no difference - self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) - def test_hardlink_deduplication_different_bytecode_one_hardlink(self): - # "'''string'''\na = 1" produces the same bytecode only - # for level 0 and 1 - path = os.path.join(self.directory, "test", "different_one") - os.makedirs(path) - simple_script = script_helper.make_script( - path, "test_different_bytecode", "'''string'''\na = 1" - ) - pyc_opt0 = importlib.util.cache_from_source(simple_script) - pyc_opt1 = importlib.util.cache_from_source(simple_script, - optimization=1) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) +class HardlinkDedupTestsBase: + # Test hardlink_dupes parameter of compileall.compile_dir() - self.assertRunOK(path, "-q", "-o 0", "-o 1", "-o 2", - "--hardlink-dupes") + def setUp(self): + self.path = None - # Only level 0 and 1 has the same inode, level 2 produces - # a different bytecode - self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) + @contextlib.contextmanager + def temporary_directory(self): + with tempfile.TemporaryDirectory() as path: + self.path = path + yield path + self.path 
= None - for pyc_file in {pyc_opt0, pyc_opt1, pyc_opt2}: - os.unlink(pyc_file) + def make_script(self, code, name="script"): + return script_helper.make_script(self.path, name, code) - self.assertRunOK(path, "-q", "-o 0", "-o 1", "-o 2") + def compile_dir(self, *, dedup=True, optimize=(0, 1, 2), force=False): + compileall.compile_dir(self.path, quiet=True, optimize=optimize, + hardlink_dupes=dedup, force=force) + def test_bad_args(self): + # Bad arguments combination, hardlink deduplication makes sense + # only for more than one optimization level + with self.assertRaises(ValueError): + with self.temporary_directory(): + self.make_script("pass") + compileall.compile_dir(self.path, quiet=True, optimize=0, + hardlink_dupes=True) + + def create_code(self, docstring=False, assertion=False): + lines = [] + if docstring: + lines.append("'module docstring'") + lines.append('x = 1') + if assertion: + lines.append("assert x == 1") + return '\n'.join(lines) + + def iter_codes(self): + for docstring in (False, True): + for assertion in (False, True): + code = self.create_code(docstring=docstring, assertion=assertion) + yield (code, docstring, assertion) + + def test_disabled(self): # Deduplication disabled, no hardlinks - self.assertNotEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) - - def test_hardlink_deduplication_recompilation(self): - path = os.path.join(self.directory, "test", "module_change") - os.makedirs(path) - - simple_script = script_helper.make_script(path, "module_change", - "a = 0") - pyc_opt0 = importlib.util.cache_from_source(simple_script) - pyc_opt1 = importlib.util.cache_from_source(simple_script, - optimization=1) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) - - self.assertRunOK(path, "-f", "-q", "-o 0", "-o 1", "-o 2", - "--hardlink-dupes") - - # All three levels have the same inode - self.assertEqual(os.stat(pyc_opt0).st_ino,
os.stat(pyc_opt1).st_ino) - self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) - - previous_inode = os.stat(pyc_opt0).st_ino - - # Change of the module content - simple_script = script_helper.make_script(path, "module_change", - "print(0)") - - # Recompilation without -o 1 - self.assertRunOK(path, "-f", "-q", "-o 0", "-o 2", "--hardlink-dupes") - - # opt-1.pyc should have the same inode as before and others should not - self.assertEqual(previous_inode, os.stat(pyc_opt1).st_ino) - self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt2).st_ino) - self.assertNotEqual(previous_inode, os.stat(pyc_opt2).st_ino) - # opt-1.pyc and opt-2.pyc have different content - self.assertFalse(filecmp.cmp(pyc_opt1, pyc_opt2, shallow=True)) - - def test_hardlink_deduplication_import(self): - path = os.path.join(self.directory, "test", "module_import") - os.makedirs(path) - - simple_script = script_helper.make_script(path, "module", "a = 0") - pyc_opt0 = importlib.util.cache_from_source(simple_script) - pyc_opt1 = importlib.util.cache_from_source(simple_script, - optimization=1) - pyc_opt2 = importlib.util.cache_from_source(simple_script, - optimization=2) - - self.assertRunOK(path, "-f", "-q", "-o 0", "-o 1", "-o 2", - "--hardlink-dupes") - - # All three levels have the same inode - self.assertEqual(os.stat(pyc_opt0).st_ino, os.stat(pyc_opt1).st_ino) - self.assertEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) - - previous_inode = os.stat(pyc_opt0).st_ino - - # Change of the module content - simple_script = script_helper.make_script(path, "module", "print(0)") - - # Import the module in Python - script_helper.assert_python_ok( - "-O", "-c", "import module", __isolated=False, PYTHONPATH=path - ) - - # Only opt-1.pyc is changed - self.assertEqual(previous_inode, os.stat(pyc_opt0).st_ino) - self.assertEqual(previous_inode, os.stat(pyc_opt2).st_ino) - self.assertNotEqual(os.stat(pyc_opt1).st_ino, os.stat(pyc_opt2).st_ino) - # opt-1.pyc and opt-2.pyc have 
different content - self.assertFalse(filecmp.cmp(pyc_opt1, pyc_opt2, shallow=True)) - - -class CommandLineTestsWithSourceEpoch(CommandLineTestsBase, - unittest.TestCase, - metaclass=SourceDateEpochTestMeta, - source_date_epoch=True): + for code, docstring, assertion in self.iter_codes(): + with self.subTest(docstring=docstring, assertion=assertion): + with self.temporary_directory(): + script = self.make_script(code) + pycs = get_pycs(script) + self.compile_dir(dedup=False) + self.assertFalse(is_hardlink(pycs[0], pycs[1])) + self.assertFalse(is_hardlink(pycs[0], pycs[2])) + self.assertFalse(is_hardlink(pycs[1], pycs[2])) + + def check_hardlinks(self, script, docstring=False, assertion=False): + pycs = get_pycs(script) + self.assertEqual(is_hardlink(pycs[0], pycs[1]), + not assertion) + self.assertEqual(is_hardlink(pycs[0], pycs[2]), + not assertion and not docstring) + self.assertEqual(is_hardlink(pycs[1], pycs[2]), + not docstring) + + def test_hardlink(self): + # Test deduplication on all combinations + for code, docstring, assertion in self.iter_codes(): + with self.subTest(docstring=docstring, assertion=assertion): + with self.temporary_directory(): + script = self.make_script(code) + self.compile_dir() + self.check_hardlinks(script, docstring, assertion) + + def test_only_two_levels(self): + # Don't build the 3 optimization levels, but only 2 + for opts in ((0, 1), (1, 2), (0, 2)): + with self.subTest(opts=opts): + with self.temporary_directory(): + # code with no dostring and no assertion: + # same bytecode for all optimization levels + script = self.make_script(self.create_code()) + self.compile_dir(optimize=opts) + pyc1 = get_pyc(script, opts[0]) + pyc2 = get_pyc(script, opts[1]) + self.assertTrue(is_hardlink(pyc1, pyc2)) + + def test_recompilation(self): + # Test compile_dir() when pyc files already exists and the script + # content changed + with self.temporary_directory(): + script = self.make_script("a = 0") + self.compile_dir() + # All three levels 
have the same inode + self.check_hardlinks(script) + + pycs = get_pycs(script) + inode = os.stat(pycs[0]).st_ino + + # Change of the module content + script = self.make_script("print(0)") + + # Recompilation without -o 1 + self.compile_dir(optimize=[0, 2], force=True) + + # opt-1.pyc should have the same inode as before and others should not + self.assertEqual(inode, os.stat(pycs[1]).st_ino) + self.assertTrue(is_hardlink(pycs[0], pycs[2])) + self.assertNotEqual(inode, os.stat(pycs[2]).st_ino) + # opt-1.pyc and opt-2.pyc have different content + self.assertFalse(filecmp.cmp(pycs[1], pycs[2], shallow=True)) + + def test_import(self): + # Test that import updates a single pyc file when pyc files already + # exists and the script content changed + with self.temporary_directory(): + script = self.make_script(self.create_code(), name="module") + self.compile_dir() + # All three levels have the same inode + self.check_hardlinks(script) + + pycs = get_pycs(script) + inode = os.stat(pycs[0]).st_ino + + # Change of the module content + script = self.make_script("print(0)", name="module") + + # Import the module in Python with -O (optimization level 1) + script_helper.assert_python_ok( + "-O", "-c", "import module", __isolated=False, PYTHONPATH=self.path + ) + + # Only opt-1.pyc is changed + self.assertEqual(inode, os.stat(pycs[0]).st_ino) + self.assertEqual(inode, os.stat(pycs[2]).st_ino) + self.assertFalse(is_hardlink(pycs[1], pycs[2])) + # opt-1.pyc and opt-2.pyc have different content + self.assertFalse(filecmp.cmp(pycs[1], pycs[2], shallow=True)) + + +class HardlinkDedupTestsWithSourceEpoch(HardlinkDedupTestsBase, + unittest.TestCase, + metaclass=SourceDateEpochTestMeta, + source_date_epoch=True): pass -class CommandLineTestsNoSourceEpoch(CommandLineTestsBase, - unittest.TestCase, - metaclass=SourceDateEpochTestMeta, - source_date_epoch=False): +class HardlinkDedupTestsNoSourceEpoch(HardlinkDedupTestsBase, + unittest.TestCase, + metaclass=SourceDateEpochTestMeta, + 
source_date_epoch=False): pass - if __name__ == "__main__": unittest.main() From b0063618535c5c4aeade1213af84c5d7b67de307 Mon Sep 17 00:00:00 2001 From: Lumir Balhar Date: Thu, 14 May 2020 14:50:09 +0200 Subject: [PATCH 11/13] Updated Whatsnew --- Doc/whatsnew/3.9.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Doc/whatsnew/3.9.rst b/Doc/whatsnew/3.9.rst index cefaf5715d4143..abfd8f85fc47a6 100644 --- a/Doc/whatsnew/3.9.rst +++ b/Doc/whatsnew/3.9.rst @@ -245,6 +245,16 @@ that schedules a shutdown for the default executor that waits on the Added :class:`asyncio.PidfdChildWatcher`, a Linux-specific child watcher implementation that polls process file descriptors. (:issue:`38692`) +compileall +---------- + +Added new possibility to use hardlinks for duplicated ``.pyc`` files: *hardlink_dupes* parameter and --hardlink-dupes command line option. +(Contributed by Lumír 'Frenzy' Balhar in :issue:`40495`.) + +Added new options for path manipulation in resulting ``.pyc`` files: *stripdir*, *prependdir*, *limit_sl_dest* parameters and -s, -p, -e command line options. +Added the possibility to specify the option for an optimization level multiple times. +(Contributed by Lumír 'Frenzy' Balhar in :issue:`38112`.) 
+ concurrent.futures ------------------ From 45259b27f6bf66d7f31095cc10e0453090a3b52d Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 14 May 2020 15:12:38 +0200 Subject: [PATCH 12/13] Update Lib/compileall.py --- Lib/compileall.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/compileall.py b/Lib/compileall.py index f52fa786447f56..8f9ee2bcfdd52b 100644 --- a/Lib/compileall.py +++ b/Lib/compileall.py @@ -183,7 +183,7 @@ def compile_file(fullname, ddir=None, force=False, rx=None, quiet=0, if hardlink_dupes: raise ValueError("Hardlinking of duplicated bytecode makes sense " - "only for more than one optimization level.") + "only for more than one optimization level") if rx is not None: mo = rx.search(fullname) From 7e92096fe6a6c675dbacff88c3e8e44596d8ce66 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Thu, 14 May 2020 15:35:47 +0200 Subject: [PATCH 13/13] Remove duplicated optimization levels --- Lib/compileall.py | 12 ++++++++---- Lib/test/test_compileall.py | 23 ++++++++++++++++++++--- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/Lib/compileall.py b/Lib/compileall.py index 8f9ee2bcfdd52b..fe7f450c55e1c5 100644 --- a/Lib/compileall.py +++ b/Lib/compileall.py @@ -181,9 +181,13 @@ def compile_file(fullname, ddir=None, force=False, rx=None, quiet=0, if isinstance(optimize, int): optimize = [optimize] - if hardlink_dupes: - raise ValueError("Hardlinking of duplicated bytecode makes sense " - "only for more than one optimization level") + # Use set() to remove duplicates. + # Use sorted() to create pyc files in a deterministic order. 
+ optimize = sorted(set(optimize)) + + if hardlink_dupes and len(optimize) < 2: + raise ValueError("Hardlinking of duplicated bytecode makes sense " + "only for more than one optimization level") if rx is not None: mo = rx.search(fullname) @@ -229,7 +233,7 @@ def compile_file(fullname, ddir=None, force=False, rx=None, quiet=0, if not quiet: print('Compiling {!r}...'.format(fullname)) try: - for index, opt_level in enumerate(sorted(optimize)): + for index, opt_level in enumerate(optimize): cfile = opt_cfiles[opt_level] ok = py_compile.compile(fullname, cfile, dfile, True, optimize=opt_level, diff --git a/Lib/test/test_compileall.py b/Lib/test/test_compileall.py index a277b638c5d917..b4061b79357b87 100644 --- a/Lib/test/test_compileall.py +++ b/Lib/test/test_compileall.py @@ -911,11 +911,16 @@ def compile_dir(self, *, dedup=True, optimize=(0, 1, 2), force=False): def test_bad_args(self): # Bad arguments combination, hardlink deduplication make sense # only for more than one optimization level - with self.assertRaises(ValueError): - with self.temporary_directory(): - self.make_script("pass") + with self.temporary_directory(): + self.make_script("pass") + with self.assertRaises(ValueError): compileall.compile_dir(self.path, quiet=True, optimize=0, hardlink_dupes=True) + with self.assertRaises(ValueError): + # same optimization level specified twice: + # compile_dir() removes duplicates + compileall.compile_dir(self.path, quiet=True, optimize=[0, 0], + hardlink_dupes=True) def create_code(self, docstring=False, assertion=False): lines = [] @@ -975,6 +980,18 @@ def test_only_two_levels(self): pyc2 = get_pyc(script, opts[1]) self.assertTrue(is_hardlink(pyc1, pyc2)) + def test_duplicated_levels(self): + # compile_dir() must not fail if optimize contains duplicated + # optimization levels and/or if optimization levels are not sorted. 
+ with self.temporary_directory(): + # code with no docstring and no assertion: + # same bytecode for all optimization levels + script = self.make_script(self.create_code()) + self.compile_dir(optimize=[1, 0, 1, 0]) + pyc1 = get_pyc(script, 0) + pyc2 = get_pyc(script, 1) + self.assertTrue(is_hardlink(pyc1, pyc2)) + def test_recompilation(self): # Test compile_dir() when pyc files already exists and the script # content changed