Skip to content

Commit 4cfdab1

Browse files
committed
FindModuleCache: optionally leverage BuildSourceSet
Gated behind a command-line flag (`--fast-module-lookup`) to assuage concerns that this fast path might introduce subtle issues in module lookup.
1 parent 0cec4f7 commit 4cfdab1

File tree

4 files changed

+121
-32
lines changed

4 files changed

+121
-32
lines changed

mypy/build.py

+4-30
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
from mypy.report import Reports # Avoid unconditional slow import
4343
from mypy.fixup import fixup_module
4444
from mypy.modulefinder import (
45-
BuildSource, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult,
46-
ModuleNotFoundReason
45+
BuildSource, BuildSourceSet, compute_search_paths, FindModuleCache, SearchPaths,
46+
ModuleSearchResult, ModuleNotFoundReason
4747
)
4848
from mypy.nodes import Expression
4949
from mypy.options import Options
@@ -106,33 +106,6 @@ def __init__(self, manager: 'BuildManager', graph: Graph) -> None:
106106
self.errors: List[str] = [] # Filled in by build if desired
107107

108108

109-
class BuildSourceSet:
    """Fast membership test for files against the collection of build sources.

    Each source is classified exactly once, in priority order: a source that
    carries inline text only sets ``source_text_present``; otherwise a source
    with a path is recorded in ``source_paths``; otherwise its module name is
    recorded in ``source_modules``.
    """

    def __init__(self, sources: List[BuildSource]) -> None:
        self.source_text_present = False
        self.source_modules: Set[str] = set()
        self.source_paths: Set[str] = set()

        for src in sources:
            if src.text is not None:
                # Inline program text: no path or module is recorded for it.
                self.source_text_present = True
                continue
            if src.path:
                self.source_paths.add(src.path)
                continue
            self.source_modules.add(src.module)

    def is_source(self, file: MypyFile) -> bool:
        """Return True if ``file`` should be treated as one of the build sources.

        A file matches by path, then by fully qualified module name; when any
        source was given as inline text, every file is considered a source.
        """
        if file.path and file.path in self.source_paths:
            return True
        if file._fullname in self.source_modules:
            return True
        return self.source_text_present
134-
135-
136109
def build(sources: List[BuildSource],
137110
options: Options,
138111
alt_lib_path: Optional[str] = None,
@@ -627,7 +600,8 @@ def __init__(self, data_dir: str,
627600
or options.use_fine_grained_cache)
628601
and not has_reporters)
629602
self.fscache = fscache
630-
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options)
603+
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options,
604+
source_set=self.source_set)
631605
self.metastore = create_metastore(options)
632606

633607
# a mapping from source files to their corresponding shadow files

mypy/main.py

+4
Original file line numberDiff line numberDiff line change
@@ -870,6 +870,10 @@ def add_invertible_flag(flag: str,
870870
'--explicit-package-bases', default=False,
871871
help="Use current directory and MYPYPATH to determine module names of files passed",
872872
group=code_group)
873+
add_invertible_flag(
874+
'--fast-module-lookup', default=False,
875+
help="Enable fast path for finding modules within input sources",
876+
group=code_group)
873877
code_group.add_argument(
874878
"--exclude",
875879
action="append",

mypy/modulefinder.py

+111-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from typing_extensions import Final, TypeAlias as _TypeAlias
1717

1818
from mypy.fscache import FileSystemCache
19+
from mypy.nodes import MypyFile
1920
from mypy.options import Options
2021
from mypy.stubinfo import is_legacy_bundled_package
2122
from mypy import pyinfo
@@ -115,6 +116,33 @@ def __repr__(self) -> str:
115116
self.base_dir)
116117

117118

119+
class BuildSourceSet:
    """Index over the build sources for cheap membership and path lookups.

    Attributes:
        source_text_present: True if at least one source carries inline text.
        source_modules: maps each source's module name to its path, or to
            '' when the source has no path.
        source_paths: every non-empty path among the sources.
    """

    def __init__(self, sources: List[BuildSource]) -> None:
        self.source_text_present = False
        self.source_modules: Dict[str, str] = {}
        self.source_paths: Set[str] = set()

        for entry in sources:
            # The three properties are recorded independently: a single
            # source may contribute text, a path and a module name at once.
            if entry.text is not None:
                self.source_text_present = True
            if entry.path:
                self.source_paths.add(entry.path)
            if entry.module:
                self.source_modules[entry.module] = entry.path if entry.path else ''

    def is_source(self, file: MypyFile) -> bool:
        """Return True if ``file`` should be treated as one of the build sources.

        A file matches by path or by fully qualified module name; when any
        source was given as inline text, every file is considered a source.
        """
        return bool(
            (file.path and file.path in self.source_paths)
            or file._fullname in self.source_modules
            or self.source_text_present
        )
144+
145+
118146
class FindModuleCache:
119147
"""Module finder with integrated cache.
120148
@@ -130,8 +158,10 @@ def __init__(self,
130158
search_paths: SearchPaths,
131159
fscache: Optional[FileSystemCache],
132160
options: Optional[Options],
133-
stdlib_py_versions: Optional[StdlibVersions] = None) -> None:
161+
stdlib_py_versions: Optional[StdlibVersions] = None,
162+
source_set: Optional[BuildSourceSet] = None) -> None:
134163
self.search_paths = search_paths
164+
self.source_set = source_set
135165
self.fscache = fscache or FileSystemCache()
136166
# Cache for get_toplevel_possibilities:
137167
# search_paths -> (toplevel_id -> list(package_dirs))
@@ -153,6 +183,52 @@ def clear(self) -> None:
153183
self.initial_components.clear()
154184
self.ns_ancestors.clear()
155185

186+
def find_module_via_source_set(self, id: str) -> Optional[ModuleSearchResult]:
    """Try to resolve module ``id`` using only the build source set.

    Args:
        id: fully qualified module name, e.g. ``'foo.bar.baz'``.

    Returns:
        * the source path (a ``str``) when ``id`` is one of the build sources,
          its file exists, and every ancestor package directory contains an
          ``__init__`` file with one of the ``PYTHON_EXTENSIONS``;
        * ``ModuleNotFoundReason.NOT_FOUND`` when the parent of ``id`` resolves
          to a plain module (not a package), so ``id`` cannot be a submodule;
        * ``None`` when this fast path cannot decide; the caller is expected
          to fall back to the regular search.
    """
    if self.source_set is None:
        return None

    p = self.source_set.source_modules.get(id)
    if p and self.fscache.isfile(p):
        # We need to make sure we still have __init__.py all the way up,
        # otherwise we might have false positives compared to the slow path
        # in case of deletion of init files, which is covered by some tests.
        # TODO: is there some combination of flags in which this check should be skipped?
        d = os.path.dirname(p)
        if os.path.basename(p) in {'__init__' + x for x in PYTHON_EXTENSIONS}:
            # p is the package's own __init__ file, so its directory is the
            # package itself. Start one level up; otherwise we would re-check
            # the file we just found and never look at the top-most ancestor.
            d = os.path.dirname(d)
        for _ in range(id.count('.')):
            if not any(self.fscache.isfile(os.path.join(d, '__init__' + x))
                       for x in PYTHON_EXTENSIONS):
                return None
            d = os.path.dirname(d)
        return p

    idx = id.rfind('.')
    if idx == -1:
        return None

    # When we're looking for foo.bar.baz and can't find a matching module
    # in the source set, look up for a foo.bar module.
    parent = self.find_module_via_source_set(id[:idx])
    if not isinstance(parent, str):
        # Parent is unknown (None) or itself NOT_FOUND: nothing to conclude here.
        return None

    basename, ext = os.path.splitext(parent)
    parent_is_init = any(parent.endswith('__init__' + x) for x in PYTHON_EXTENSIONS)
    if not parent_is_init and ext in PYTHON_EXTENSIONS and not self.fscache.isdir(basename):
        # If we do find such a *module* (and crucially, we don't want a package,
        # hence the filtering out of __init__ files, and checking for the presence
        # of a folder with a matching name), then we can be pretty confident that
        # 'baz' will either be a top-level variable in foo.bar, or will not exist.
        #
        # Either way, spelunking in other search paths for another 'foo.bar.baz'
        # module should be avoided because:
        #  1. in the unlikely event that one were found, it's highly likely that
        #     it would be unrelated to the source being typechecked and therefore
        #     more likely to lead to erroneous results
        #  2. as described in _find_module, in some cases the search itself could
        #     potentially waste significant amounts of time
        return ModuleNotFoundReason.NOT_FOUND
    return None
231+
156232
def find_lib_path_dirs(self, id: str, lib_path: Tuple[str, ...]) -> PackageDirs:
157233
"""Find which elements of a lib_path have the directory a module needs to exist.
158234
@@ -218,7 +294,7 @@ def find_module(self, id: str, *, fast_path: bool = False) -> ModuleSearchResult
218294
elif top_level in self.stdlib_py_versions:
219295
use_typeshed = self._typeshed_has_version(top_level)
220296
self.results[id] = self._find_module(id, use_typeshed)
221-
if (not fast_path
297+
if (not (fast_path or (self.options is not None and self.options.fast_module_lookup))
222298
and self.results[id] is ModuleNotFoundReason.NOT_FOUND
223299
and self._can_find_module_in_parent_dir(id)):
224300
self.results[id] = ModuleNotFoundReason.WRONG_WORKING_DIRECTORY
@@ -284,6 +360,39 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
284360
def _find_module(self, id: str, use_typeshed: bool) -> ModuleSearchResult:
285361
fscache = self.fscache
286362

363+
# Fast path for any modules in the current source set.
364+
# This is particularly important when there are a large number of search
365+
# paths which share the first (few) component(s) due to the use of namespace
366+
# packages, for instance:
367+
# foo/
368+
# company/
369+
# __init__.py
370+
# foo/
371+
# bar/
372+
# company/
373+
# __init__.py
374+
# bar/
375+
# baz/
376+
# company/
377+
# __init__.py
378+
# baz/
379+
#
380+
# mypy gets [foo/company/foo, bar/company/bar, baz/company/baz, ...] as input
381+
# and computes [foo, bar, baz, ...] as the module search path.
382+
#
383+
# This would result in O(n) search for every import of company.*, leading to
384+
# O(n**2) behavior in load_graph as such imports are unsurprisingly present
385+
# at least once, and usually many more times than that, in each and every file
386+
# being parsed.
387+
#
388+
# Thankfully, such cases are efficiently handled by looking up the module path
389+
# via BuildSourceSet.
390+
p = (self.find_module_via_source_set(id)
391+
if (self.options is not None and self.options.fast_module_lookup)
392+
else None)
393+
if p:
394+
return p
395+
287396
# If we're looking for a module like 'foo.bar.baz', it's likely that most of the
288397
# many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover
289398
# that only once and cache it for when we look for modules like 'foo.bar.blah'

mypy/options.py

+2
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,8 @@ def __init__(self) -> None:
287287
self.cache_map: Dict[str, Tuple[str, str]] = {}
288288
# Don't properly free objects on exit, just kill the current process.
289289
self.fast_exit = True
290+
# Use a fast path for finding modules that are part of the source set (--fast-module-lookup).
291+
self.fast_module_lookup = False
290292
# Used to transform source code before parsing if not None
291293
# TODO: Make the type precise (AnyStr -> AnyStr)
292294
self.transform_source: Optional[Callable[[Any], Any]] = None

0 commit comments

Comments
 (0)