-
-
Notifications
You must be signed in to change notification settings - Fork 18.7k
BUG: Slice Arrow buffer before passing it to numpy (#40896) #41046
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
f5cc8a8
2a8042c
fd94972
1586d50
5652869
bd70705
ff85a80
df87e14
c195089
f6555df
03648eb
9aa5df6
b86f9fc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,3 +1,4 @@ | ||||||
import numpy as np | ||||||
import pytest | ||||||
|
||||||
import pandas.util._test_decorators as td | ||||||
|
@@ -64,3 +65,102 @@ def test_arrow_sliced(): | |||||
result = table.slice(2, None).to_pandas() | ||||||
expected = df.iloc[2:].reset_index(drop=True) | ||||||
tm.assert_frame_equal(result, expected) | ||||||
|
||||||
|
||||||
@pytest.fixture | ||||||
def np_dtype_to_arrays(request): | ||||||
import pyarrow as pa | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we just add an import_optional_dependency at the top of this file, rather than having skips on every test. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I used pyarrow 0.16.0 as the minimal version so |
||||||
|
||||||
np_dtype = request.param | ||||||
pa_type = pa.from_numpy_dtype(np_dtype) | ||||||
|
||||||
pa_array = pa.array([0, 1, 2], type=pa_type) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Maybe good to include a missing value, to ensure there is a bitmask buffer? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, good point! |
||||||
np_expected = np.array([0, 1, 2], dtype=np_dtype) | ||||||
mask_expected = np.array([True, True, True]) | ||||||
return np_dtype, pa_array, np_expected, mask_expected | ||||||
|
||||||
|
||||||
@td.skip_if_no("pyarrow") | ||||||
@pytest.mark.parametrize( | ||||||
"np_dtype_to_arrays", | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you use the fixture: any_real_dtype or any_numpy_dtype instead |
||||||
( | ||||||
[ | ||||||
np.int8(), | ||||||
np.int16(), | ||||||
np.int32(), | ||||||
np.int64(), | ||||||
np.uint8(), | ||||||
np.uint16(), | ||||||
np.uint32(), | ||||||
np.uint64(), | ||||||
np.float32(), | ||||||
np.float64(), | ||||||
] | ||||||
), | ||||||
indirect=True, | ||||||
) | ||||||
def test_pyarrow_array_to_numpy_and_mask(np_dtype_to_arrays): | ||||||
""" | ||||||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
Test conversion from pyarrow array to numpy array. | ||||||
|
||||||
Modifies the pyarrow buffer to contain padding and offset, which are | ||||||
considered valid buffers by pyarrow. | ||||||
|
||||||
Also tests empty pyarrow arrays with non empty buffers. | ||||||
See https://github.com/pandas-dev/pandas/issues/40896 | ||||||
""" | ||||||
import pyarrow as pa | ||||||
|
||||||
from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you import at the top (after the pyarrow check is fine) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should I move the rest of the pandas imports below the pyarrow check as well? Looks a bit weird having them seperated by the check. |
||||||
|
||||||
np_dtype, pa_array, np_expected, mask_expected = np_dtype_to_arrays | ||||||
data, mask = pyarrow_array_to_numpy_and_mask(pa_array, np_dtype) | ||||||
tm.assert_numpy_array_equal(data, np_expected) | ||||||
tm.assert_numpy_array_equal(mask, mask_expected) | ||||||
|
||||||
mask_buffer = pa_array.buffers()[0] | ||||||
data_buffer = pa_array.buffers()[1] | ||||||
data_buffer_bytes = pa_array.buffers()[1].to_pybytes() | ||||||
|
||||||
# Add trailing padding to the buffer. | ||||||
data_buffer_trail = pa.py_buffer(data_buffer_bytes + b"\x00") | ||||||
pa_array_trail = pa.Array.from_buffers( | ||||||
type=pa_array.type, | ||||||
length=len(pa_array), | ||||||
buffers=[mask_buffer, data_buffer_trail], | ||||||
offset=pa_array.offset, | ||||||
) | ||||||
pa_array_trail.validate() | ||||||
data, mask = pyarrow_array_to_numpy_and_mask(pa_array_trail, np_dtype) | ||||||
tm.assert_numpy_array_equal(data, np_expected) | ||||||
tm.assert_numpy_array_equal(mask, mask_expected) | ||||||
|
||||||
# Add offset to the buffer. | ||||||
offset = b"\x00" * (pa_array.type.bit_width // 8) | ||||||
data_buffer_offset = pa.py_buffer(offset + data_buffer_bytes) | ||||||
mask_buffer_offset = pa.py_buffer(b"\x0F") | ||||||
pa_array_offset = pa.Array.from_buffers( | ||||||
type=pa_array.type, | ||||||
length=len(pa_array), | ||||||
buffers=[mask_buffer_offset, data_buffer_offset], | ||||||
offset=pa_array.offset + 1, | ||||||
) | ||||||
pa_array_offset.validate() | ||||||
data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype) | ||||||
tm.assert_numpy_array_equal(data, np_expected) | ||||||
tm.assert_numpy_array_equal(mask, mask_expected) | ||||||
|
||||||
# Empty array | ||||||
np_expected_empty = np.array([], dtype=np_dtype) | ||||||
mask_expected_empty = np.array([], dtype=np.bool_) | ||||||
|
||||||
pa_array_offset = pa.Array.from_buffers( | ||||||
type=pa_array.type, | ||||||
length=0, | ||||||
buffers=[mask_buffer, data_buffer], | ||||||
offset=pa_array.offset, | ||||||
) | ||||||
pa_array_offset.validate() | ||||||
data, mask = pyarrow_array_to_numpy_and_mask(pa_array_offset, np_dtype) | ||||||
tm.assert_numpy_array_equal(data, np_expected_empty) | ||||||
tm.assert_numpy_array_equal(mask, mask_expected_empty) |
Uh oh!
There was an error while loading. Please reload this page.