Skip to content

Commit 17365c8

Browse files
committed
Add plugin to infer more precise regex match types
This pull request adds a plugin that makes mypy infer more precise types when grabbing regex groups: when possible, the plugin analyzes the original regex to deduce whether a given group is required or not.

```
from typing_extensions import Final, Literal
import re

pattern: Final = re.compile("(a)(b)*")
match: Final = pattern.match("")
if match:
    reveal_type(match.groups())  # Revealed type is Tuple[str, Optional[str]]
    reveal_type(match.group(0))  # Revealed type is str
    reveal_type(match.group(1))  # Revealed type is str
    reveal_type(match.group(2))  # Revealed type is Optional[str]

    index: int
    reveal_type(match.group(index))  # Revealed type is Optional[str]

    # Error: Regex has 3 total groups, given group number 5 is too big
    match.group(5)
```

To track this information, I added an optional 'metadata' dict field to the Instance class, similar to the metadata dict for plugins in TypeInfos. We skip serializing this dict if it does not contain any data.

A limitation of this plugin is that both the pattern and the match variables must be declared to be final. Otherwise, we just default to using whatever types are defined in typeshed. This is because we set and erase the metadata field in exactly the same way we set and erase the `last_known_value` field in Instances: both kinds of info are "transient" and are unsafe to keep around if the variable reference is mutable.

This limitation *does* reduce the usefulness of this plugin to some degree: it won't support common patterns like the one below, since variables aren't allowed to be declared final inside loops:

```
for line in file:
    match = pattern.match(line)
    if match:
        ...
```

Possibly we can remove this limitation by making mypy less aggressive about removing this transient info, for example by tracking the "lifetime" of this sort of data in some way?
This pull request should mostly address python#7363, though it's unclear whether it fully resolves it: we might want to do something about the limitation described above and re-tune typeshed first.

The other, mostly unrelated, change this PR makes is to move some of the helper functions from checker.py into typeops.py so that I could use them more cleanly in the plugin.
1 parent db55aa3 commit 17365c8

File tree

10 files changed

+624
-173
lines changed

10 files changed

+624
-173
lines changed

mypy/binder.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from mypy.subtypes import is_subtype
1111
from mypy.join import join_simple
1212
from mypy.sametypes import is_same_type
13-
from mypy.erasetype import remove_instance_last_known_values
13+
from mypy.erasetype import remove_instance_transient_info
1414
from mypy.nodes import Expression, Var, RefExpr
1515
from mypy.literals import Key, literal, literal_hash, subkeys
1616
from mypy.nodes import IndexExpr, MemberExpr, NameExpr
@@ -251,7 +251,7 @@ def assign_type(self, expr: Expression,
251251
restrict_any: bool = False) -> None:
252252
# We should erase last known value in binder, because if we are using it,
253253
# it means that the target is not final, and therefore can't hold a literal.
254-
type = remove_instance_last_known_values(type)
254+
type = remove_instance_transient_info(type)
255255

256256
type = get_proper_type(type)
257257
declared_type = get_proper_type(declared_type)

mypy/checker.py

Lines changed: 5 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
import itertools
44
import fnmatch
5-
import sys
65
from contextlib import contextmanager
76

87
from typing import (
@@ -50,7 +49,8 @@
5049
from mypy.typeops import (
5150
map_type_from_supertype, bind_self, erase_to_bound, make_simplified_union,
5251
erase_def_to_union_or_bound, erase_to_union_or_bound,
53-
true_only, false_only, function_type,
52+
true_only, false_only, function_type, is_singleton_type,
53+
try_expanding_enum_to_union, coerce_to_literal,
5454
)
5555
from mypy import message_registry
5656
from mypy.subtypes import (
@@ -63,7 +63,7 @@
6363
from mypy.typevars import fill_typevars, has_no_typevars, fill_typevars_with_any
6464
from mypy.semanal import set_callable_name, refers_to_fullname
6565
from mypy.mro import calculate_mro
66-
from mypy.erasetype import erase_typevars, remove_instance_last_known_values
66+
from mypy.erasetype import erase_typevars, remove_instance_transient_info
6767
from mypy.expandtype import expand_type, expand_type_by_instance
6868
from mypy.visitor import NodeVisitor
6969
from mypy.join import join_types
@@ -2069,7 +2069,7 @@ def check_assignment(self, lvalue: Lvalue, rvalue: Expression, infer_lvalue_type
20692069
if partial_types is not None:
20702070
if not self.current_node_deferred:
20712071
# Partial type can't be final, so strip any literal values.
2072-
rvalue_type = remove_instance_last_known_values(rvalue_type)
2072+
rvalue_type = remove_instance_transient_info(rvalue_type)
20732073
inferred_type = make_simplified_union(
20742074
[rvalue_type, NoneType()])
20752075
self.set_inferred_type(var, lvalue, inferred_type)
@@ -2126,7 +2126,7 @@ def check_assignment(self, lvalue: Lvalue, rvalue: Expression, infer_lvalue_type
21262126
if inferred:
21272127
rvalue_type = self.expr_checker.accept(rvalue)
21282128
if not inferred.is_final:
2129-
rvalue_type = remove_instance_last_known_values(rvalue_type)
2129+
rvalue_type = remove_instance_transient_info(rvalue_type)
21302130
self.infer_variable_type(inferred, lvalue, rvalue_type, rvalue)
21312131

21322132
def check_compatibility_all_supers(self, lvalue: RefExpr, lvalue_type: Optional[Type],
@@ -4753,97 +4753,6 @@ def is_private(node_name: str) -> bool:
47534753
return node_name.startswith('__') and not node_name.endswith('__')
47544754

47554755

4756-
def get_enum_values(typ: Instance) -> List[str]:
4757-
"""Return the list of values for an Enum."""
4758-
return [name for name, sym in typ.type.names.items() if isinstance(sym.node, Var)]
4759-
4760-
4761-
def is_singleton_type(typ: Type) -> bool:
4762-
"""Returns 'true' if this type is a "singleton type" -- if there exists
4763-
exactly only one runtime value associated with this type.
4764-
4765-
That is, given two values 'a' and 'b' that have the same type 't',
4766-
'is_singleton_type(t)' returns True if and only if the expression 'a is b' is
4767-
always true.
4768-
4769-
Currently, this returns True when given NoneTypes, enum LiteralTypes and
4770-
enum types with a single value.
4771-
4772-
Note that other kinds of LiteralTypes cannot count as singleton types. For
4773-
example, suppose we do 'a = 100000 + 1' and 'b = 100001'. It is not guaranteed
4774-
that 'a is b' will always be true -- some implementations of Python will end up
4775-
constructing two distinct instances of 100001.
4776-
"""
4777-
typ = get_proper_type(typ)
4778-
# TODO: Also make this return True if the type is a bool LiteralType.
4779-
# Also make this return True if the type corresponds to ... (ellipsis) or NotImplemented?
4780-
return (
4781-
isinstance(typ, NoneType) or (isinstance(typ, LiteralType) and typ.is_enum_literal())
4782-
or (isinstance(typ, Instance) and typ.type.is_enum and len(get_enum_values(typ)) == 1)
4783-
)
4784-
4785-
4786-
def try_expanding_enum_to_union(typ: Type, target_fullname: str) -> ProperType:
4787-
"""Attempts to recursively expand any enum Instances with the given target_fullname
4788-
into a Union of all of its component LiteralTypes.
4789-
4790-
For example, if we have:
4791-
4792-
class Color(Enum):
4793-
RED = 1
4794-
BLUE = 2
4795-
YELLOW = 3
4796-
4797-
class Status(Enum):
4798-
SUCCESS = 1
4799-
FAILURE = 2
4800-
UNKNOWN = 3
4801-
4802-
...and if we call `try_expanding_enum_to_union(Union[Color, Status], 'module.Color')`,
4803-
this function will return Literal[Color.RED, Color.BLUE, Color.YELLOW, Status].
4804-
"""
4805-
typ = get_proper_type(typ)
4806-
4807-
if isinstance(typ, UnionType):
4808-
items = [try_expanding_enum_to_union(item, target_fullname) for item in typ.items]
4809-
return make_simplified_union(items)
4810-
elif isinstance(typ, Instance) and typ.type.is_enum and typ.type.fullname() == target_fullname:
4811-
new_items = []
4812-
for name, symbol in typ.type.names.items():
4813-
if not isinstance(symbol.node, Var):
4814-
continue
4815-
new_items.append(LiteralType(name, typ))
4816-
# SymbolTables are really just dicts, and dicts are guaranteed to preserve
4817-
# insertion order only starting with Python 3.7. So, we sort these for older
4818-
# versions of Python to help make tests deterministic.
4819-
#
4820-
# We could probably skip the sort for Python 3.6 since people probably run mypy
4821-
# only using CPython, but we might as well for the sake of full correctness.
4822-
if sys.version_info < (3, 7):
4823-
new_items.sort(key=lambda lit: lit.value)
4824-
return make_simplified_union(new_items)
4825-
else:
4826-
return typ
4827-
4828-
4829-
def coerce_to_literal(typ: Type) -> ProperType:
4830-
"""Recursively converts any Instances that have a last_known_value or are
4831-
instances of enum types with a single value into the corresponding LiteralType.
4832-
"""
4833-
typ = get_proper_type(typ)
4834-
if isinstance(typ, UnionType):
4835-
new_items = [coerce_to_literal(item) for item in typ.items]
4836-
return make_simplified_union(new_items)
4837-
elif isinstance(typ, Instance):
4838-
if typ.last_known_value:
4839-
return typ.last_known_value
4840-
elif typ.type.is_enum:
4841-
enum_values = get_enum_values(typ)
4842-
if len(enum_values) == 1:
4843-
return LiteralType(value=enum_values[0], fallback=typ)
4844-
return typ
4845-
4846-
48474756
def has_bool_item(typ: ProperType) -> bool:
48484757
"""Return True if type is 'bool' or a union with a 'bool' item."""
48494758
if is_named_instance(typ, 'builtins.bool'):

mypy/erasetype.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -123,15 +123,16 @@ def visit_type_var(self, t: TypeVarType) -> Type:
123123
return t
124124

125125

126-
def remove_instance_last_known_values(t: Type) -> Type:
127-
return t.accept(LastKnownValueEraser())
128-
126+
def remove_instance_transient_info(t: Type) -> Type:
127+
"""Recursively removes any info from Instances that exist
128+
on a per-instance basis. Currently, this means erasing the
129+
last-known literal type and any plugin metadata.
130+
"""
131+
return t.accept(TransientInstanceInfoEraser())
129132

130-
class LastKnownValueEraser(TypeTranslator):
131-
"""Removes the Literal[...] type that may be associated with any
132-
Instance types."""
133133

134+
class TransientInstanceInfoEraser(TypeTranslator):
134135
def visit_instance(self, t: Instance) -> Type:
135-
if t.last_known_value:
136-
return t.copy_modified(last_known_value=None)
136+
if t.last_known_value or t.metadata:
137+
return t.copy_modified(last_known_value=None, metadata={})
137138
return t

mypy/plugins/default.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ def get_function_hook(self, fullname: str
3232
return ctypes.array_constructor_callback
3333
elif fullname == 're.compile':
3434
return regex.re_compile_callback
35+
elif fullname in regex.FUNCTIONS_PRODUCING_MATCH_OBJECT:
36+
return regex.re_direct_match_callback
3537
return None
3638

3739
def get_method_signature_hook(self, fullname: str
@@ -55,6 +57,7 @@ def get_method_signature_hook(self, fullname: str
5557
def get_method_hook(self, fullname: str
5658
) -> Optional[Callable[[MethodContext], Type]]:
5759
from mypy.plugins import ctypes
60+
from mypy.plugins import regex
5861

5962
if fullname == 'typing.Mapping.get':
6063
return typed_dict_get_callback
@@ -72,6 +75,12 @@ def get_method_hook(self, fullname: str
7275
return ctypes.array_iter_callback
7376
elif fullname == 'pathlib.Path.open':
7477
return path_open_callback
78+
elif fullname in regex.METHODS_PRODUCING_MATCH_OBJECT:
79+
return regex.re_get_match_callback
80+
elif fullname == 'typing.Match.groups':
81+
return regex.re_match_groups_callback
82+
elif fullname in regex.METHODS_PRODUCING_GROUP:
83+
return regex.re_match_group_callback
7584
return None
7685

7786
def get_attribute_hook(self, fullname: str

0 commit comments

Comments
 (0)