23
23
from typing_extensions import Final , TypeAlias as _TypeAlias
24
24
25
25
from mypy .fscache import FileSystemCache
26
+ from mypy .nodes import MypyFile
26
27
from mypy .options import Options
27
28
from mypy .stubinfo import is_legacy_bundled_package
28
29
from mypy import pyinfo
@@ -126,6 +127,33 @@ def __repr__(self) -> str:
126
127
self .base_dir )
127
128
128
129
130
class BuildSourceSet:
    """Index of the build sources, used for fast "is this file a source?" checks.

    Attributes:
        source_text_present: True when at least one source has a non-None
            ``text`` attribute.
        source_modules: maps each source's module name to its path, or to
            ``''`` when that source has no (truthy) path.
        source_paths: the truthy paths of all sources.
    """

    def __init__(self, sources: List[BuildSource]) -> None:
        """Record the text/path/module information of every entry in *sources*."""
        self.source_text_present = False
        self.source_modules = {}  # type: Dict[str, str]
        self.source_paths = set()  # type: Set[str]

        # The three checks are independent: one source may contribute to all
        # three pieces of state.
        for src in sources:
            if src.text is not None:
                self.source_text_present = True
            path = src.path
            if path:
                self.source_paths.add(path)
            if src.module:
                self.source_modules[src.module] = path or ''

    def is_source(self, file: MypyFile) -> bool:
        """Return True if *file* is considered one of the build sources.

        A file matches when its path is a recorded source path or its full
        name is a recorded source module. When any source was supplied with
        text, every file is reported as a source.
        """
        if file.path and file.path in self.source_paths:
            return True
        if file._fullname in self.source_modules:
            return True
        return self.source_text_present
155
+
156
+
129
157
class FindModuleCache :
130
158
"""Module finder with integrated cache.
131
159
@@ -141,8 +169,10 @@ def __init__(self,
141
169
search_paths : SearchPaths ,
142
170
fscache : Optional [FileSystemCache ],
143
171
options : Optional [Options ],
144
- stdlib_py_versions : Optional [StdlibVersions ] = None ) -> None :
172
+ stdlib_py_versions : Optional [StdlibVersions ] = None ,
173
+ source_set : Optional [BuildSourceSet ] = None ) -> None :
145
174
self .search_paths = search_paths
175
+ self .source_set = source_set
146
176
self .fscache = fscache or FileSystemCache ()
147
177
# Cache for get_toplevel_possibilities:
148
178
# search_paths -> (toplevel_id -> list(package_dirs))
@@ -164,6 +194,50 @@ def clear(self) -> None:
164
194
self .initial_components .clear ()
165
195
self .ns_ancestors .clear ()
166
196
197
def find_module_via_source_set(self, id: str) -> Optional[ModuleSearchResult]:
    """Try to resolve module *id* using only the build source set.

    Returns:
        * the source path, when *id* is a source module whose file exists and
          an ``__init__`` file is found at each of the ``id.count('.')``
          directory levels walked upward from the file's directory;
        * ``ModuleNotFoundReason.NOT_FOUND``, when *id* itself is not resolved
          but its parent resolves to a plain module file (not a package);
        * ``None`` in every other case, including when no source set is
          configured.
    """
    source_set = self.source_set
    if not source_set:
        return None

    path = source_set.source_modules.get(id)
    if path and self.fscache.isfile(path):
        # We need to make sure we still have __init__.py all the way up,
        # otherwise we might have false positives compared to the slow path
        # when init files get deleted, which is covered by some tests.
        # TODO: is there some combination of flags in which this check should be skipped?
        # NOTE(review): when `path` is itself a package __init__ file, the first
        # directory inspected is that package's own directory -- confirm this is intended.
        package_dir = os.path.dirname(path)
        for _ in range(id.count('.')):
            has_init = False
            for ext in PYTHON_EXTENSIONS:
                if self.fscache.isfile(os.path.join(package_dir, '__init__' + ext)):
                    has_init = True
                    break
            if not has_init:
                return None
            package_dir = os.path.dirname(package_dir)
        return path

    parent_id, dot, _ = id.rpartition('.')
    if not dot:
        # Top-level name that is not a usable source module: nothing to infer.
        return None

    # When we're looking for foo.bar.baz and can't find a matching module
    # in the source set, look for a foo.bar module instead.
    parent = self.find_module_via_source_set(parent_id)
    if not isinstance(parent, str):
        return None

    # We only want a *module* here, never a package: filter out __init__ files,
    # require a Python extension, and require that no folder with a matching
    # name exists next to the file.
    stem, suffix = os.path.splitext(parent)
    if any(parent.endswith('__init__' + x) for x in PYTHON_EXTENSIONS):
        return None
    if suffix not in PYTHON_EXTENSIONS:
        return None
    if self.fscache.isdir(stem):
        return None

    # At this point we can be pretty confident that 'baz' will either be a
    # top-level variable in foo.bar, or will not exist.
    #
    # Either way, spelunking in other search paths for another 'foo.bar.baz'
    # module should be avoided because:
    #  1. in the unlikely event that one were found, it's highly likely that
    #     it would be unrelated to the source being typechecked and therefore
    #     more likely to lead to erroneous results
    #  2. as described in _find_module, in some cases the search itself could
    #     potentially waste significant amounts of time
    return ModuleNotFoundReason.NOT_FOUND
240
+
167
241
def find_lib_path_dirs (self , id : str , lib_path : Tuple [str , ...]) -> PackageDirs :
168
242
"""Find which elements of a lib_path have the directory a module needs to exist.
169
243
@@ -229,7 +303,7 @@ def find_module(self, id: str, *, fast_path: bool = False) -> ModuleSearchResult
229
303
elif top_level in self .stdlib_py_versions :
230
304
use_typeshed = self ._typeshed_has_version (top_level )
231
305
self .results [id ] = self ._find_module (id , use_typeshed )
232
- if (not fast_path
306
+ if (not ( fast_path or ( self . options is not None and self . options . fast_module_lookup ))
233
307
and self .results [id ] is ModuleNotFoundReason .NOT_FOUND
234
308
and self ._can_find_module_in_parent_dir (id )):
235
309
self .results [id ] = ModuleNotFoundReason .WRONG_WORKING_DIRECTORY
@@ -295,6 +369,39 @@ def _can_find_module_in_parent_dir(self, id: str) -> bool:
295
369
def _find_module (self , id : str , use_typeshed : bool ) -> ModuleSearchResult :
296
370
fscache = self .fscache
297
371
372
+ # Fast path for any modules in the current source set.
373
+ # This is particularly important when there are a large number of search
374
+ # paths which share the first (few) component(s) due to the use of namespace
375
+ # packages, for instance:
376
+ # foo/
377
+ # company/
378
+ # __init__.py
379
+ # foo/
380
+ # bar/
381
+ # company/
382
+ # __init__.py
383
+ # bar/
384
+ # baz/
385
+ # company/
386
+ # __init__.py
387
+ # baz/
388
+ #
389
+ # mypy gets [foo/company/foo, bar/company/bar, baz/company/baz, ...] as input
390
+ # and computes [foo, bar, baz, ...] as the module search path.
391
+ #
392
+ # This would result in O(n) search for every import of company.*, leading to
393
+ # O(n**2) behavior in load_graph as such imports are unsurprisingly present
394
+ # at least once, and usually many more times than that, in each and every file
395
+ # being parsed.
396
+ #
397
+ # Thankfully, such cases are efficiently handled by looking up the module path
398
+ # via BuildSourceSet.
399
+ p = (self .find_module_via_source_set (id )
400
+ if (self .options is not None and self .options .fast_module_lookup )
401
+ else None )
402
+ if p :
403
+ return p
404
+
298
405
# If we're looking for a module like 'foo.bar.baz', it's likely that most of the
299
406
# many elements of lib_path don't even have a subdirectory 'foo/bar'. Discover
300
407
# that only once and cache it for when we look for modules like 'foo.bar.blah'
0 commit comments