Skip to content

Commit 8263868

Browse files
committed
FindModuleCache: optionally leverage BuildSourceSet
Gated behind a command line flag to assuage concerns about subtle issues in module lookup being introduced by this fast path.
1 parent 6c2690e commit 8263868

File tree

4 files changed

+119
-32
lines changed

4 files changed

+119
-32
lines changed

mypy/build.py

Lines changed: 4 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@
4343
from mypy.report import Reports # Avoid unconditional slow import
4444
from mypy.fixup import fixup_module
4545
from mypy.modulefinder import (
46-
BuildSource, compute_search_paths, FindModuleCache, SearchPaths, ModuleSearchResult,
47-
ModuleNotFoundReason
46+
BuildSource, BuildSourceSet, compute_search_paths, FindModuleCache, SearchPaths,
47+
ModuleSearchResult, ModuleNotFoundReason
4848
)
4949
from mypy.nodes import Expression
5050
from mypy.options import Options
@@ -107,33 +107,6 @@ def __init__(self, manager: 'BuildManager', graph: Graph) -> None:
107107
self.errors: List[str] = [] # Filled in by build if desired
108108

109109

110-
class BuildSourceSet:
111-
"""Efficiently test a file's membership in the set of build sources."""
112-
113-
def __init__(self, sources: List[BuildSource]) -> None:
114-
self.source_text_present = False
115-
self.source_modules: Set[str] = set()
116-
self.source_paths: Set[str] = set()
117-
118-
for source in sources:
119-
if source.text is not None:
120-
self.source_text_present = True
121-
elif source.path:
122-
self.source_paths.add(source.path)
123-
else:
124-
self.source_modules.add(source.module)
125-
126-
def is_source(self, file: MypyFile) -> bool:
127-
if file.path and file.path in self.source_paths:
128-
return True
129-
elif file._fullname in self.source_modules:
130-
return True
131-
elif self.source_text_present:
132-
return True
133-
else:
134-
return False
135-
136-
137110
def build(sources: List[BuildSource],
138111
options: Options,
139112
alt_lib_path: Optional[str] = None,
@@ -630,7 +603,8 @@ def __init__(self, data_dir: str,
630603
or options.use_fine_grained_cache)
631604
and not has_reporters)
632605
self.fscache = fscache
633-
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options)
606+
self.find_module_cache = FindModuleCache(self.search_paths, self.fscache, self.options,
607+
source_set=self.source_set)
634608
self.metastore = create_metastore(options)
635609

636610
# a mapping from source files to their corresponding shadow files

mypy/main.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -881,6 +881,10 @@ def add_invertible_flag(flag: str,
881881
'--explicit-package-bases', default=False,
882882
help="Use current directory and MYPYPATH to determine module names of files passed",
883883
group=code_group)
884+
add_invertible_flag(
885+
'--fast-module-lookup', default=False,
886+
help="Enable fast path for finding modules within input sources",
887+
group=code_group)
884888
code_group.add_argument(
885889
"--exclude",
886890
action="append",

mypy/modulefinder.py

Lines changed: 109 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from typing_extensions import Final, TypeAlias as _TypeAlias
2424

2525
from mypy.fscache import FileSystemCache
26+
from mypy.nodes import MypyFile
2627
from mypy.options import Options
2728
from mypy.stubinfo import is_legacy_bundled_package
2829
from mypy import pyinfo
@@ -126,6 +127,33 @@ def __repr__(self) -> str:
126127
self.base_dir)
127128

128129

130+
class BuildSourceSet:
    """Helper to efficiently test a file's membership in a set of build sources.

    The constructor walks the sources once and records:
      * source_text_present: True if at least one source has non-None text.
      * source_modules: module name -> path of that source ('' when the
        source has no path).
      * source_paths: the path of every source that has one.
    """

    def __init__(self, sources: List[BuildSource]) -> None:
        self.source_text_present = False
        self.source_modules: Dict[str, str] = {}
        self.source_paths: Set[str] = set()

        for source in sources:
            # The three checks are independent: a single source may carry
            # text, a path and a module name all at once.
            if source.text is not None:
                self.source_text_present = True
            if source.path:
                self.source_paths.add(source.path)
            if source.module:
                self.source_modules[source.module] = source.path or ''

    def is_source(self, file: MypyFile) -> bool:
        """Return True if `file` should be considered one of the build sources.

        A file matches when its path is a known source path, when its full
        module name is a known source module, or -- as a catch-all -- when
        any build source was given as text.
        """
        return bool(
            (file.path and file.path in self.source_paths)
            or file._fullname in self.source_modules
            or self.source_text_present
        )
156+
129157
class FindModuleCache:
130158
"""Module finder with integrated cache.
131159
@@ -141,8 +169,10 @@ def __init__(self,
141169
search_paths: SearchPaths,
142170
fscache: Optional[FileSystemCache],
143171
options: Optional[Options],
144-
stdlib_py_versions: Optional[StdlibVersions] = None) -> None:
172+
stdlib_py_versions: Optional[StdlibVersions] = None,
173+
source_set: Optional[BuildSourceSet] = None) -> None:
145174
self.search_paths = search_paths
175+
self.source_set = source_set
146176
self.fscache = fscache or FileSystemCache()
147177
# Cache for get_toplevel_possibilities:
148178
# search_paths -> (toplevel_id -> list(package_dirs))
@@ -164,6 +194,50 @@ def clear(self) -> None:
164194
self.initial_components.clear()
165195
self.ns_ancestors.clear()
166196

197+
def find_module_via_source_set(self, id: str) -> Optional[ModuleSearchResult]:
    """Try to resolve module `id` using only the build source set.

    Returns one of:
      * the source path, when `id` is a module of the source set, its file
        exists, and an `__init__` file is present in each directory checked
        while walking up one level per dot in `id`;
      * ModuleNotFoundReason.NOT_FOUND, when `id` is not itself resolved but
        its parent resolves to a plain module (not a package), so no
        submodule of that name should be searched for elsewhere;
      * None, when there is no source set or nothing can be concluded here.
    """
    if self.source_set is None:
        return None

    # File names that mark a directory as a package: '__init__' + extension.
    init_files = tuple('__init__' + ext for ext in PYTHON_EXTENSIONS)

    p = self.source_set.source_modules.get(id)
    if p and self.fscache.isfile(p):
        # We need to make sure we still have __init__.py all the way up
        # otherwise we might have false positives compared to slow path
        # in case of deletion of init files, which is covered by some tests.
        # TODO: is there some combination of flags in which this check should be skipped?
        d = os.path.dirname(p)
        for _ in range(id.count('.')):
            if not any(self.fscache.isfile(os.path.join(d, name)) for name in init_files):
                return None
            d = os.path.dirname(d)
        return p

    parent_id, dot, _ = id.rpartition('.')
    if not dot:
        # Top-level name that is not in the source set: nothing to conclude.
        return None

    # When we're looking for foo.bar.baz and can't find a matching module
    # in the source set, look up for a foo.bar module.
    parent = self.find_module_via_source_set(parent_id)
    if not isinstance(parent, str):
        # Covers both None and a ModuleNotFoundReason from the recursive call.
        return None

    basename, ext = os.path.splitext(parent)
    if (not parent.endswith(init_files)
            and ext in PYTHON_EXTENSIONS
            and not self.fscache.isdir(basename)):
        # If we do find such a *module* (and crucially, we don't want a package,
        # hence the filtering out of __init__ files, and checking for the presence
        # of a folder with a matching name), then we can be pretty confident that
        # 'baz' will either be a top-level variable in foo.bar, or will not exist.
        #
        # Either way, spelunking in other search paths for another 'foo.bar.baz'
        # module should be avoided because:
        #  1. in the unlikely event that one were found, it's highly likely that
        #     it would be unrelated to the source being typechecked and therefore
        #     more likely to lead to erroneous results
        #  2. as described in _find_module, in some cases the search itself could
        #     potentially waste significant amounts of time
        return ModuleNotFoundReason.NOT_FOUND
    return None
240+
167241
def find_lib_path_dirs(self, id: str, lib_path: Tuple[str, ...]) -> PackageDirs:
168242
"""Find which elements of a lib_path have the directory a module needs to exist.
169243
@@ -229,7 +303,7 @@ def find_module(self, id: str, *, fast_path: bool = False) -> ModuleSearchResult
229303
elif top_level in self.stdlib_py_versions:
230304
use_typeshed = self._typeshed_has_version(top_level)
231305
self.results[id] = self._find_module(id, use_typeshed)
232-
if (not fast_path
306+
if (not (fast_path or (self.options is not None and self.options.fast_module_lookup))
233307
and self.results[id] is ModuleNotFoundReason.NOT_FOUND
234308
and self._can_find_module_in_parent_dir(id)):
235309
self.results[id] = ModuleNotFoundReason.WRONG_WORKING_DIRECTORY
@@ -295,6 +369,39 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
295369
def _find_module(self, id: str, use_typeshed: bool) -> ModuleSearchResult:
296370
fscache = self.fscache
297371

372+
# Fast path for any modules in the current source set.
373+
# This is particularly important when there are a large number of search
374+
# paths which share the first (few) component(s) due to the use of namespace
375+
# packages, for instance:
376+
# foo/
377+
# company/
378+
# __init__.py
379+
# foo/
380+
# bar/
381+
# company/
382+
# __init__.py
383+
# bar/
384+
# baz/
385+
# company/
386+
# __init__.py
387+
# baz/
388+
#
389+
# mypy gets [foo/company/foo, bar/company/bar, baz/company/baz, ...] as input
390+
# and computes [foo, bar, baz, ...] as the module search path.
391+
#
392+
# This would result in O(n) search for every import of company.*, leading to
393+
# O(n**2) behavior in load_graph as such imports are unsurprisingly present
394+
# at least once, and usually many more times than that, in each and every file
395+
# being parsed.
396+
#
397+
# Thankfully, such cases are efficiently handled by looking up the module path
398+
# via BuildSourceSet.
399+
p = (self.find_module_via_source_set(id)
400+
if (self.options is not None and self.options.fast_module_lookup)
401+
else None)
402+
if p:
403+
return p
404+
298405
# If we're looking for a module like 'foo.bar.baz', it's likely that most of the
299406
# many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover
300407
# that only once and cache it for when we look for modules like 'foo.bar.blah'

mypy/options.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,8 @@ def __init__(self) -> None:
293293
self.cache_map: Dict[str, Tuple[str, str]] = {}
294294
# Don't properly free objects on exit, just kill the current process.
295295
self.fast_exit = True
296+
# Enable fast path for finding modules within the input source set (--fast-module-lookup)
297+
self.fast_module_lookup = False
296298
# Used to transform source code before parsing if not None
297299
# TODO: Make the type precise (AnyStr -> AnyStr)
298300
self.transform_source: Optional[Callable[[Any], Any]] = None

0 commit comments

Comments
 (0)