-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
PERF/REF: Check use of numexpr earlier in the DataFrame operation #41122
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
1bb21fa
9496e19
ef0202a
3fce895
f0f6bb6
54a84d4
b252e7c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -72,6 +72,50 @@ def _evaluate_standard(op, op_str, a, b): | |
return op(a, b) | ||
|
||
|
||
def can_use_numexpr(op, size=None, dtypes=None, scalar=None) -> bool:
    """
    Initial check whether numexpr can be used with the given size and
    involved data types and/or scalar operand.

    Returns False if it definitely cannot use numexpr, otherwise returns
    True (which doesn't mean we always end up using numexpr).

    Parameters
    ----------
    op : operator
        The binary operator; it is looked up in ``_op_str_mapping``.
    size : int or None, default None
        Number of elements involved in the operation. If None, the
        minimum-size check is skipped.
    dtypes : list or None, default None
        List of dtypes involved in the operation. If None, the dtype
        check is skipped.
    scalar : object or None, default None
        Optionally a scalar, eg if the right operand is a scalar.

    Returns
    -------
    bool
    """
    if not USE_NUMEXPR:
        return False

    # too few elements: not worth considering numexpr
    if size is not None and size < _MIN_ELEMENTS:
        return False

    # operators that are missing from the mapping, or mapped to None,
    # cannot be evaluated with numexpr
    if _op_str_mapping.get(op) is None:
        return False

    # string scalars are rejected outright (None is not a str, so the
    # "no scalar given" case falls through)
    if isinstance(scalar, str):
        return False

    if dtypes is not None:
        # the allowed dtypes must be a superset of the involved dtypes
        # NOTE(review): confirm that the entries of `dtypes` use the same
        # representation as the members of _ALLOWED_DTYPES["evaluate"]
        return _ALLOWED_DTYPES["evaluate"] >= set(dtypes)

    # safe fallback if dtypes were not specified
    return True
|
||
|
||
def _can_use_numexpr(op, op_str, a, b, dtype_check): | ||
""" return a boolean if we WILL be using numexpr """ | ||
if op_str is not None: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6827,11 +6827,23 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): | |
------- | ||
DataFrame | ||
""" | ||
import pandas.core.computation.expressions as expressions | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# Get the appropriate array-op to apply to each column/block's values. | ||
array_op = ops.get_array_op(func) | ||
|
||
right = lib.item_from_zerodim(right) | ||
if not is_list_like(right): | ||
|
||
if isinstance(self._mgr, ArrayManager): | ||
use_numexpr = expressions.can_use_numexpr( | ||
func, self.shape[0], None, right | ||
) | ||
else: | ||
use_numexpr = expressions.can_use_numexpr(func, None, None, right) | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
array_op = ops.get_array_op(func, use_numexpr=use_numexpr) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. is the get_array_op on L6832 still needed? |
||
|
||
# i.e. scalar, faster than checking np.ndim(right) == 0 | ||
with np.errstate(all="ignore"): | ||
bm = self._mgr.apply(array_op, right=right) | ||
|
@@ -6844,6 +6856,16 @@ def _dispatch_frame_op(self, right, func: Callable, axis: int | None = None): | |
# fails in cases with empty columns reached via | ||
# _frame_arith_method_with_reindex | ||
|
||
if isinstance(self._mgr, ArrayManager): | ||
use_numexpr = expressions.can_use_numexpr( | ||
func, self.shape[0], None, None | ||
) | ||
else: | ||
use_numexpr = expressions.USE_NUMEXPR | ||
|
||
# breakpoint() | ||
array_op = ops.get_array_op(func, use_numexpr=use_numexpr) | ||
|
||
# TODO operate_blockwise expects a manager of the same type | ||
with np.errstate(all="ignore"): | ||
bm = self._mgr.operate_blockwise( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -133,7 +133,7 @@ def _masked_arith_op(x: np.ndarray, y, op): | |
return result | ||
|
||
|
||
def _na_arithmetic_op(left, right, op, is_cmp: bool = False): | ||
def _na_arithmetic_op(left, right, op, is_cmp: bool = False, use_numexpr=True): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. annotate, docstring There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. can use_numexpr be keyword-only? |
||
""" | ||
Return the result of evaluating op on the passed in values. | ||
|
||
|
@@ -155,7 +155,7 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False): | |
TypeError : invalid operation | ||
""" | ||
try: | ||
result = expressions.evaluate(op, left, right) | ||
result = expressions.evaluate(op, left, right, use_numexpr=use_numexpr) | ||
except TypeError: | ||
if is_object_dtype(left) or is_object_dtype(right) and not is_cmp: | ||
# For object dtype, fallback to a masked operation (only operating | ||
|
@@ -174,7 +174,7 @@ def _na_arithmetic_op(left, right, op, is_cmp: bool = False): | |
return missing.dispatch_fill_zeros(op, left, right, result) | ||
|
||
|
||
def arithmetic_op(left: ArrayLike, right: Any, op): | ||
def arithmetic_op(left: ArrayLike, right: Any, op, use_numexpr=True): | ||
""" | ||
Evaluate an arithmetic operation `+`, `-`, `*`, `/`, `//`, `%`, `**`, ... | ||
|
||
|
@@ -205,12 +205,12 @@ def arithmetic_op(left: ArrayLike, right: Any, op): | |
# Timedelta is included because numexpr will fail on it, see GH#31457 | ||
res_values = op(left, right) | ||
else: | ||
res_values = _na_arithmetic_op(left, right, op) | ||
res_values = _na_arithmetic_op(left, right, op, use_numexpr=use_numexpr) | ||
|
||
return res_values | ||
|
||
|
||
def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: | ||
def comparison_op(left: ArrayLike, right: Any, op, use_numexpr=True) -> ArrayLike: | ||
""" | ||
Evaluate a comparison operation `=`, `!=`, `>=`, `>`, `<=`, or `<`. | ||
|
||
|
@@ -265,7 +265,9 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: | |
res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) | ||
|
||
else: | ||
res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True) | ||
res_values = _na_arithmetic_op( | ||
lvalues, rvalues, op, is_cmp=True, use_numexpr=use_numexpr | ||
) | ||
|
||
return res_values | ||
|
||
|
@@ -311,7 +313,7 @@ def na_logical_op(x: np.ndarray, y, op): | |
return result.reshape(x.shape) | ||
|
||
|
||
def logical_op(left: ArrayLike, right: Any, op) -> ArrayLike: | ||
def logical_op(left: ArrayLike, right: Any, op, use_numexpr=True) -> ArrayLike: | ||
""" | ||
Evaluate a logical operation `|`, `&`, or `^`. | ||
|
||
|
@@ -377,7 +379,7 @@ def fill_bool(x, left=None): | |
return res_values | ||
|
||
|
||
def get_array_op(op): | ||
def get_array_op(op, use_numexpr=True): | ||
""" | ||
Return a binary array operation corresponding to the given operator op. | ||
|
||
|
@@ -401,9 +403,9 @@ def get_array_op(op): | |
return op | ||
|
||
if op_name in {"eq", "ne", "lt", "le", "gt", "ge"}: | ||
return partial(comparison_op, op=op) | ||
return partial(comparison_op, op=op, use_numexpr=use_numexpr) | ||
elif op_name in {"and", "or", "xor", "rand", "ror", "rxor"}: | ||
return partial(logical_op, op=op) | ||
return partial(logical_op, op=op, use_numexpr=use_numexpr) | ||
elif op_name in { | ||
"add", | ||
"sub", | ||
|
@@ -414,7 +416,7 @@ def get_array_op(op): | |
"divmod", | ||
"pow", | ||
}: | ||
return partial(arithmetic_op, op=op) | ||
return partial(arithmetic_op, op=op, use_numexpr=use_numexpr) | ||
else: | ||
raise NotImplementedError(op_name) | ||
|
||
|
Uh oh!
There was an error while loading. Please reload this page.