From 54c5aa5a9320cbb082389bf658fcf31ff7afe645 Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 6 Feb 2024 04:03:45 +0000 Subject: [PATCH 01/10] GH-115060: Speed up `pathlib.Path.glob()` by removing redundant regex matching When expanding and filtering paths for a `**` wildcard segment, build an `re.Pattern` object from the subsequent pattern parts, rather than the entire pattern. Also skip compiling a pattern when expanding a `*` wildcard segment. --- Lib/pathlib/_abc.py | 26 +++++++++---------- ...-02-06-03-55-46.gh-issue-115060.EkWRpP.rst | 1 + 2 files changed, 13 insertions(+), 14 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-02-06-03-55-46.gh-issue-115060.EkWRpP.rst diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index e4b1201a3703c3..b9621259fcd885 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -86,19 +86,21 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match): continue except OSError: continue - if match(entry.name): + if match is None or match(entry.name): yield parent_path._make_child_entry(entry) -def _select_recursive(parent_paths, dir_only, follow_symlinks): +def _select_recursive(parent_paths, dir_only, follow_symlinks, match): """Yield given paths and all their subdirectories, recursively.""" if follow_symlinks is None: follow_symlinks = False for parent_path in parent_paths: + prefix_len = len(str(parent_path._make_child_relpath('_'))) - 1 paths = [parent_path._make_child_relpath('')] while paths: path = paths.pop() - yield path + if match is None or match(str(path), prefix_len): + yield path try: # We must close the scandir() object before proceeding to # avoid exhausting file descriptors when globbing deep trees. @@ -115,7 +117,9 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks): except OSError: pass if not dir_only: - yield path._make_child_entry(entry) + file_path = path._make_child_entry(entry) + if match is None or match(str(file_path), prefix_len): + yield file_path def _select_unique(paths): @@ -769,7 +773,6 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): stack = pattern._pattern_stack specials = ('', '.', '..') - filter_paths = False deduplicate_paths = False sep = self.pathmod.sep paths = iter([self] if self.is_dir() else []) @@ -786,11 +789,11 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): # regex filtering, provided we're treating symlinks consistently. if follow_symlinks is not None: while stack and stack[-1] not in specials: - filter_paths = True - stack.pop() + part += sep + stack.pop() dir_only = bool(stack) - paths = _select_recursive(paths, dir_only, follow_symlinks) + match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None + paths = _select_recursive(paths, dir_only, follow_symlinks, match) if deduplicate_paths: # De-duplicate if we've already seen a '**' component. paths = _select_unique(paths) @@ -799,13 +802,8 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): raise ValueError("Invalid pattern: '**' can only be an entire path component") else: dir_only = bool(stack) - match = _compile_pattern(part, sep, case_sensitive) + match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None paths = _select_children(paths, dir_only, follow_symlinks, match) - if filter_paths: - # Filter out paths that don't match pattern. - prefix_len = len(str(self._make_child_relpath('_'))) - 1 - match = _compile_pattern(pattern._pattern_str, sep, case_sensitive) - paths = (path for path in paths if match(path._pattern_str, prefix_len)) return paths def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None): diff --git a/Misc/NEWS.d/next/Library/2024-02-06-03-55-46.gh-issue-115060.EkWRpP.rst b/Misc/NEWS.d/next/Library/2024-02-06-03-55-46.gh-issue-115060.EkWRpP.rst new file mode 100644 index 00000000000000..b358eeb569626f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-02-06-03-55-46.gh-issue-115060.EkWRpP.rst @@ -0,0 +1 @@ +Speed up :meth:`pathlib.Path.glob` by removing redundant regex matching. From 6abb80da677dcec2fba0336b6c2d952e4024b6b3 Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 6 Feb 2024 04:58:07 +0000 Subject: [PATCH 02/10] Match against os.DirEntry.path in _select_recursive() --- Lib/pathlib/__init__.py | 6 +++++- Lib/pathlib/_abc.py | 9 ++++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index 65ce836765c42b..e3d01f9f2620e3 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -587,9 +587,13 @@ def iterdir(self): def _scandir(self): return os.scandir(self) + def _entry_str(self, entry): + # Transform an entry yielded from _scandir() into a path string. + return entry.name if str(self) == '.' else entry.path + def _make_child_entry(self, entry): # Transform an entry yielded from _scandir() into a path object. - path_str = entry.name if str(self) == '.' else entry.path + path_str = self._entry_str(entry) path = self.with_segments(path_str) path._str = path_str path._drv = self.drive diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index b9621259fcd885..e813e58639f5e9 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -117,9 +117,8 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks, match): except OSError: pass if not dir_only: - file_path = path._make_child_entry(entry) - if match is None or match(str(file_path), prefix_len): - yield file_path + if match is None or match(path._entry_str(entry), prefix_len): + yield path._make_child_entry(entry) def _select_unique(paths): @@ -754,6 +753,10 @@ def _scandir(self): from contextlib import nullcontext return nullcontext(self.iterdir()) + def _entry_str(self, entry): + # Transform an entry yielded from _scandir() into a path string. + return str(entry) + def _make_child_entry(self, entry): # Transform an entry yielded from _scandir() into a path object. return entry From b382e400635b4dfd78c805f4f2bcdd0eae83842e Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 6 Feb 2024 05:07:14 +0000 Subject: [PATCH 03/10] Matching against dot-prefixed path is fine (and faster!) --- Lib/pathlib/__init__.py | 7 ++++--- Lib/pathlib/_abc.py | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index e3d01f9f2620e3..f34bba309babd3 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -587,13 +587,14 @@ def iterdir(self): def _scandir(self): return os.scandir(self) - def _entry_str(self, entry): + @classmethod + def _entry_str(cls, entry): # Transform an entry yielded from _scandir() into a path string. - return entry.name if str(self) == '.' else entry.path + return entry.path def _make_child_entry(self, entry): # Transform an entry yielded from _scandir() into a path object. - path_str = self._entry_str(entry) + path_str = entry.name if str(self) == '.' else entry.path path = self.with_segments(path_str) path._str = path_str path._drv = self.drive diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index e813e58639f5e9..12176af2290473 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -753,7 +753,8 @@ def _scandir(self): from contextlib import nullcontext return nullcontext(self.iterdir()) - def _entry_str(self, entry): + @classmethod + def _entry_str(cls, entry): # Transform an entry yielded from _scandir() into a path string. return str(entry) From e1472fc23549799473466b7d97827988bbe72684 Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 6 Feb 2024 05:15:18 +0000 Subject: [PATCH 04/10] Revert "Matching against dot-prefixed path is fine (and faster!)" This reverts commit b382e400635b4dfd78c805f4f2bcdd0eae83842e. --- Lib/pathlib/__init__.py | 7 +++---- Lib/pathlib/_abc.py | 3 +-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index f34bba309babd3..e3d01f9f2620e3 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -587,14 +587,13 @@ def iterdir(self): def _scandir(self): return os.scandir(self) - @classmethod - def _entry_str(cls, entry): + def _entry_str(self, entry): # Transform an entry yielded from _scandir() into a path string. - return entry.path + return entry.name if str(self) == '.' else entry.path def _make_child_entry(self, entry): # Transform an entry yielded from _scandir() into a path object. - path_str = entry.name if str(self) == '.' else entry.path + path_str = self._entry_str(entry) path = self.with_segments(path_str) path._str = path_str path._drv = self.drive diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 12176af2290473..e813e58639f5e9 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -753,8 +753,7 @@ def _scandir(self): from contextlib import nullcontext return nullcontext(self.iterdir()) - @classmethod - def _entry_str(cls, entry): + def _entry_str(self, entry): # Transform an entry yielded from _scandir() into a path string. return str(entry) From 284c42e09ef3e69baee4f59da3ff4d471e9ddc81 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 7 Feb 2024 23:29:44 +0000 Subject: [PATCH 05/10] Skip computing prefix len when not matching --- Lib/pathlib/_abc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index e813e58639f5e9..f9a6ea7179e3ec 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -95,7 +95,8 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks, match): if follow_symlinks is None: follow_symlinks = False for parent_path in parent_paths: - prefix_len = len(str(parent_path._make_child_relpath('_'))) - 1 + if match is not None: + prefix_len = len(str(parent_path._make_child_relpath('_'))) - 1 paths = [parent_path._make_child_relpath('')] while paths: path = paths.pop() From 169b1e7e2b9c7c8bfc37d811b655070bccaa7022 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 8 Feb 2024 21:34:08 +0000 Subject: [PATCH 06/10] Rename `prefix_len` --> `parent_len` for clarity. --- Lib/pathlib/_abc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index f9a6ea7179e3ec..f3d0e22e0c45a9 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -96,11 +96,11 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks, match): follow_symlinks = False for parent_path in parent_paths: if match is not None: - prefix_len = len(str(parent_path._make_child_relpath('_'))) - 1 + parent_len = len(str(parent_path._make_child_relpath('_'))) - 1 paths = [parent_path._make_child_relpath('')] while paths: path = paths.pop() - if match is None or match(str(path), prefix_len): + if match is None or match(str(path), parent_len): yield path try: # We must close the scandir() object before proceeding to @@ -118,7 +118,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks, match): except OSError: pass if not dir_only: - if match is None or match(path._entry_str(entry), prefix_len): + if match is None or match(path._entry_str(entry), parent_len): yield path._make_child_entry(entry) From 1c4184f70dc41fa1a9475a616c71b3a7816f9676 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 8 Feb 2024 22:56:09 +0000 Subject: [PATCH 07/10] Comments, naming. --- Lib/pathlib/__init__.py | 6 ++--- Lib/pathlib/_abc.py | 60 ++++++++++++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/Lib/pathlib/__init__.py b/Lib/pathlib/__init__.py index e3d01f9f2620e3..46834b1a76a6eb 100644 --- a/Lib/pathlib/__init__.py +++ b/Lib/pathlib/__init__.py @@ -587,13 +587,13 @@ def iterdir(self): def _scandir(self): return os.scandir(self) - def _entry_str(self, entry): + def _direntry_str(self, entry): # Transform an entry yielded from _scandir() into a path string. return entry.name if str(self) == '.' else entry.path - def _make_child_entry(self, entry): + def _make_child_direntry(self, entry): # Transform an entry yielded from _scandir() into a path object. - path_str = self._entry_str(entry) + path_str = self._direntry_str(entry) path = self.with_segments(path_str) path._str = path_str path._drv = self.drive diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index f3d0e22e0c45a9..24f3ed7795aa85 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -86,21 +86,28 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match): continue except OSError: continue + # Avoid cost of making a path object for non-matching paths by + # matching against the os.DirEntry.name string. if match is None or match(entry.name): - yield parent_path._make_child_entry(entry) + yield parent_path._make_child_direntry(entry) def _select_recursive(parent_paths, dir_only, follow_symlinks, match): - """Yield given paths and all their subdirectories, recursively.""" + """Yield given paths and all their children, recursively, filtering by + string and type. + """ if follow_symlinks is None: follow_symlinks = False for parent_path in parent_paths: if match is not None: + # If we're filtering paths through a regex, record the length of + # the parent path. We'll pass it to match(path, pos=...) later. parent_len = len(str(parent_path._make_child_relpath('_'))) - 1 paths = [parent_path._make_child_relpath('')] while paths: path = paths.pop() if match is None or match(str(path), parent_len): + # Yield *directory* path that matches pattern (if any). yield path try: # We must close the scandir() object before proceeding to @@ -111,15 +118,22 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks, match): pass else: for entry in entries: + # Handle directory entry. try: if entry.is_dir(follow_symlinks=follow_symlinks): - paths.append(path._make_child_entry(entry)) + # Recurse into this directory. + paths.append(path._make_child_direntry(entry)) continue except OSError: pass + + # Handle file entry. if not dir_only: - if match is None or match(path._entry_str(entry), parent_len): - yield path._make_child_entry(entry) + # Avoid cost of making a path object for non-matching + # files by matching against the os.DirEntry object. + if match is None or match(path._direntry_str(entry), parent_len): + # Yield *file* path that matches pattern (if any). + yield path._make_child_direntry(entry) def _select_unique(paths): @@ -754,12 +768,14 @@ def _scandir(self): from contextlib import nullcontext return nullcontext(self.iterdir()) - def _entry_str(self, entry): + def _direntry_str(self, entry): # Transform an entry yielded from _scandir() into a path string. + # PathBase._scandir() yields PathBase objects, so use str(). return str(entry) - def _make_child_entry(self, entry): + def _make_child_direntry(self, entry): # Transform an entry yielded from _scandir() into a path object. + # PathBase._scandir() yields PathBase objects, so this is a no-op. return entry def _make_child_relpath(self, name): @@ -783,31 +799,43 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): while stack: part = stack.pop() if part in specials: + # Join special segment (e.g. '..') onto paths. paths = _select_special(paths, part) + elif part == '**': - # Consume adjacent '**' components. + # Consume following '**' components, which have no effect. while stack and stack[-1] == '**': stack.pop() - # Consume adjacent non-special components and enable post-walk - # regex filtering, provided we're treating symlinks consistently. + # Consume following non-special components, provided we're + # treating symlinks consistently. Each component is joined + # onto 'part', which is used to generate an re.Pattern object. if follow_symlinks is not None: while stack and stack[-1] not in specials: part += sep + stack.pop() - dir_only = bool(stack) + # If the previous loop consumed pattern segments, compile an + # re.Pattern object based on those segments. match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None - paths = _select_recursive(paths, dir_only, follow_symlinks, match) + + # Recursively walk directories, filtering by type and regex. + paths = _select_recursive(paths, bool(stack), follow_symlinks, match) + + # De-duplicate if we've already seen a '**' component. if deduplicate_paths: - # De-duplicate if we've already seen a '**' component. paths = _select_unique(paths) deduplicate_paths = True + elif '**' in part: raise ValueError("Invalid pattern: '**' can only be an entire path component") + else: - dir_only = bool(stack) + # If the pattern segment isn't '*', compile an re.Pattern + # object based on the segment. match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None - paths = _select_children(paths, dir_only, follow_symlinks, match) + + # Iterate over directories' children filtering by type and regex. + paths = _select_children(paths, bool(stack), follow_symlinks, match) return paths def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None): @@ -856,7 +884,7 @@ def walk(self, top_down=True, on_error=None, follow_symlinks=False): if is_dir: if not top_down: - paths.append(path._make_child_entry(entry)) + paths.append(path._make_child_direntry(entry)) dirnames.append(entry.name) else: filenames.append(entry.name) From 2873ed852615b601d8916c3aaadb5b4136159024 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 8 Feb 2024 23:09:32 +0000 Subject: [PATCH 08/10] segment --> component --- Lib/pathlib/_abc.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Lib/pathlib/_abc.py b/Lib/pathlib/_abc.py index 24f3ed7795aa85..27c6b4e367a050 100644 --- a/Lib/pathlib/_abc.py +++ b/Lib/pathlib/_abc.py @@ -799,7 +799,7 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): while stack: part = stack.pop() if part in specials: - # Join special segment (e.g. '..') onto paths. + # Join special component (e.g. '..') onto paths. paths = _select_special(paths, part) elif part == '**': @@ -814,8 +814,8 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): while stack and stack[-1] not in specials: part += sep + stack.pop() - # If the previous loop consumed pattern segments, compile an - # re.Pattern object based on those segments. + # If the previous loop consumed pattern components, compile an + # re.Pattern object based on those components. match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None # Recursively walk directories, filtering by type and regex. @@ -830,8 +830,8 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): raise ValueError("Invalid pattern: '**' can only be an entire path component") else: - # If the pattern segment isn't '*', compile an re.Pattern - # object based on the segment. + # If the pattern component isn't '*', compile an re.Pattern + # object based on the component. match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None # Iterate over directories' children filtering by type and regex. From 90d5a128896923a18e5d9ba8441d61c58eba29b1 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 8 Feb 2024 23:53:01 +0000 Subject: [PATCH 09/10] Test post-`**` matching when globbing `.`. --- Lib/test/test_pathlib/test_pathlib.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Lib/test/test_pathlib/test_pathlib.py b/Lib/test/test_pathlib/test_pathlib.py index 2b166451243775..ab6778f4704801 100644 --- a/Lib/test/test_pathlib/test_pathlib.py +++ b/Lib/test/test_pathlib/test_pathlib.py @@ -1250,6 +1250,12 @@ def test_glob_pathlike(self): self.assertEqual(expect, set(p.glob(P(pattern)))) self.assertEqual(expect, set(p.glob(FakePath(pattern)))) + @needs_symlinks + def test_glob_dot(self): + P = self.cls + with os_helper.change_cwd(P(self.base, "dirC")): + self.assertEqual(set(P('.').glob('**/*/*')), {P("dirD/fileD")}) + def test_rglob_pathlike(self): P = self.cls p = P(self.base, "dirC") From a40924b6945b74118355d16ed3fb34f75978c3da Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 9 Feb 2024 01:29:55 +0000 Subject: [PATCH 10/10] Couple more test cases --- Lib/test/test_pathlib/test_pathlib.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_pathlib/test_pathlib.py b/Lib/test/test_pathlib/test_pathlib.py index ab6778f4704801..c0dcf314da4bfc 100644 --- a/Lib/test/test_pathlib/test_pathlib.py +++ b/Lib/test/test_pathlib/test_pathlib.py @@ -1254,7 +1254,14 @@ def test_glob_pathlike(self): def test_glob_dot(self): P = self.cls with os_helper.change_cwd(P(self.base, "dirC")): - self.assertEqual(set(P('.').glob('**/*/*')), {P("dirD/fileD")}) + self.assertEqual( + set(P('.').glob('*')), {P("fileC"), P("novel.txt"), P("dirD")}) + self.assertEqual( + set(P('.').glob('**')), {P("fileC"), P("novel.txt"), P("dirD"), P("dirD/fileD"), P(".")}) + self.assertEqual( + set(P('.').glob('**/*')), {P("fileC"), P("novel.txt"), P("dirD"), P("dirD/fileD")}) + self.assertEqual( + set(P('.').glob('**/*/*')), {P("dirD/fileD")}) def test_rglob_pathlike(self): P = self.cls