Skip to content

GH-115060: Speed up pathlib.Path.glob() by removing redundant regex matching #115061

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
10 commits merged on Feb 10, 2024
8 changes: 6 additions & 2 deletions Lib/pathlib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,9 +587,13 @@ def iterdir(self):
def _scandir(self):
return os.scandir(self)

def _make_child_entry(self, entry):
def _direntry_str(self, entry):
# Transform an entry yielded from _scandir() into a path string.
return entry.name if str(self) == '.' else entry.path

def _make_child_direntry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
path_str = entry.name if str(self) == '.' else entry.path
path_str = self._direntry_str(entry)
path = self.with_segments(path_str)
path._str = path_str
path._drv = self.drive
Expand Down
82 changes: 56 additions & 26 deletions Lib/pathlib/_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,19 +86,29 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
continue
except OSError:
continue
if match(entry.name):
yield parent_path._make_child_entry(entry)
# Avoid cost of making a path object for non-matching paths by
# matching against the os.DirEntry.name string.
if match is None or match(entry.name):
yield parent_path._make_child_direntry(entry)


def _select_recursive(parent_paths, dir_only, follow_symlinks):
"""Yield given paths and all their subdirectories, recursively."""
def _select_recursive(parent_paths, dir_only, follow_symlinks, match):
"""Yield given paths and all their children, recursively, filtering by
string and type.
"""
if follow_symlinks is None:
follow_symlinks = False
for parent_path in parent_paths:
if match is not None:
# If we're filtering paths through a regex, record the length of
# the parent path. We'll pass it to match(path, pos=...) later.
parent_len = len(str(parent_path._make_child_relpath('_'))) - 1
paths = [parent_path._make_child_relpath('')]
while paths:
path = paths.pop()
yield path
if match is None or match(str(path), parent_len):
# Yield *directory* path that matches pattern (if any).
yield path
try:
# We must close the scandir() object before proceeding to
# avoid exhausting file descriptors when globbing deep trees.
Expand All @@ -108,14 +118,22 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
pass
else:
for entry in entries:
# Handle directory entry.
try:
if entry.is_dir(follow_symlinks=follow_symlinks):
paths.append(path._make_child_entry(entry))
# Recurse into this directory.
paths.append(path._make_child_direntry(entry))
continue
except OSError:
pass

# Handle file entry.
if not dir_only:
yield path._make_child_entry(entry)
# Avoid cost of making a path object for non-matching
# files by matching against the os.DirEntry object.
if match is None or match(path._direntry_str(entry), parent_len):
# Yield *file* path that matches pattern (if any).
yield path._make_child_direntry(entry)


def _select_unique(paths):
Expand Down Expand Up @@ -750,8 +768,14 @@ def _scandir(self):
from contextlib import nullcontext
return nullcontext(self.iterdir())

def _make_child_entry(self, entry):
def _direntry_str(self, entry):
# Transform an entry yielded from _scandir() into a path string.
# PathBase._scandir() yields PathBase objects, so use str().
return str(entry)

def _make_child_direntry(self, entry):
# Transform an entry yielded from _scandir() into a path object.
# PathBase._scandir() yields PathBase objects, so this is a no-op.
return entry

def _make_child_relpath(self, name):
Expand All @@ -769,43 +793,49 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):

stack = pattern._pattern_stack
specials = ('', '.', '..')
filter_paths = False
deduplicate_paths = False
sep = self.pathmod.sep
paths = iter([self] if self.is_dir() else [])
while stack:
part = stack.pop()
if part in specials:
# Join special component (e.g. '..') onto paths.
paths = _select_special(paths, part)

elif part == '**':
# Consume adjacent '**' components.
# Consume following '**' components, which have no effect.
while stack and stack[-1] == '**':
stack.pop()

# Consume adjacent non-special components and enable post-walk
# regex filtering, provided we're treating symlinks consistently.
# Consume following non-special components, provided we're
# treating symlinks consistently. Each component is joined
# onto 'part', which is used to generate an re.Pattern object.
if follow_symlinks is not None:
while stack and stack[-1] not in specials:
filter_paths = True
stack.pop()
part += sep + stack.pop()

dir_only = bool(stack)
paths = _select_recursive(paths, dir_only, follow_symlinks)
# If the previous loop consumed pattern components, compile an
# re.Pattern object based on those components.
match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None

# Recursively walk directories, filtering by type and regex.
paths = _select_recursive(paths, bool(stack), follow_symlinks, match)

# De-duplicate if we've already seen a '**' component.
if deduplicate_paths:
# De-duplicate if we've already seen a '**' component.
paths = _select_unique(paths)
deduplicate_paths = True

elif '**' in part:
raise ValueError("Invalid pattern: '**' can only be an entire path component")

else:
dir_only = bool(stack)
match = _compile_pattern(part, sep, case_sensitive)
paths = _select_children(paths, dir_only, follow_symlinks, match)
if filter_paths:
# Filter out paths that don't match pattern.
prefix_len = len(str(self._make_child_relpath('_'))) - 1
match = _compile_pattern(pattern._pattern_str, sep, case_sensitive)
paths = (path for path in paths if match(path._pattern_str, prefix_len))
# If the pattern component isn't '*', compile an re.Pattern
# object based on the component.
match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None

# Iterate over directories' children filtering by type and regex.
paths = _select_children(paths, bool(stack), follow_symlinks, match)
return paths

def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
Expand Down Expand Up @@ -854,7 +884,7 @@ def walk(self, top_down=True, on_error=None, follow_symlinks=False):

if is_dir:
if not top_down:
paths.append(path._make_child_entry(entry))
paths.append(path._make_child_direntry(entry))
dirnames.append(entry.name)
else:
filenames.append(entry.name)
Expand Down
13 changes: 13 additions & 0 deletions Lib/test/test_pathlib/test_pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -1250,6 +1250,19 @@ def test_glob_pathlike(self):
self.assertEqual(expect, set(p.glob(P(pattern))))
self.assertEqual(expect, set(p.glob(FakePath(pattern))))

@needs_symlinks
def test_glob_dot(self):
P = self.cls
with os_helper.change_cwd(P(self.base, "dirC")):
self.assertEqual(
set(P('.').glob('*')), {P("fileC"), P("novel.txt"), P("dirD")})
self.assertEqual(
set(P('.').glob('**')), {P("fileC"), P("novel.txt"), P("dirD"), P("dirD/fileD"), P(".")})
self.assertEqual(
set(P('.').glob('**/*')), {P("fileC"), P("novel.txt"), P("dirD"), P("dirD/fileD")})
self.assertEqual(
set(P('.').glob('**/*/*')), {P("dirD/fileD")})

def test_rglob_pathlike(self):
P = self.cls
p = P(self.base, "dirC")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Speed up :meth:`pathlib.Path.glob` by removing redundant regex matching.