Skip to content

Commit dcee5bf

Browse files
jorisvandenbossche authored and vibhatha committed
apacheGH-35081: [Python] construct pandas.DataFrame with public API in to_pandas (apache#40897)
### Rationale for this change

Avoid using pandas internals to create Block objects ourselves; for pandas>=3, use the new public API instead.

* GitHub Issue: apache#35081

Authored-by: Joris Van den Bossche <[email protected]>
Signed-off-by: Joris Van den Bossche <[email protected]>
1 parent c1cdc38 commit dcee5bf

File tree

2 files changed

+48
-34
lines changed

2 files changed

+48
-34
lines changed

python/pyarrow/pandas-shim.pxi

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ cdef class _PandasAPIShim(object):
3838
object _array_like_types, _is_extension_array_dtype, _lock
3939
bint has_sparse
4040
bint _pd024
41-
bint _is_v1, _is_ge_v21
41+
bint _is_v1, _is_ge_v21, _is_ge_v3
4242

4343
def __init__(self):
4444
self._lock = Lock()
@@ -79,6 +79,7 @@ cdef class _PandasAPIShim(object):
7979

8080
self._is_v1 = self._loose_version < Version('2.0.0')
8181
self._is_ge_v21 = self._loose_version >= Version('2.1.0')
82+
self._is_ge_v3 = self._loose_version >= Version('3.0.0.dev0')
8283

8384
self._compat_module = pdcompat
8485
self._data_frame = pd.DataFrame
@@ -169,6 +170,10 @@ cdef class _PandasAPIShim(object):
169170
self._check_import()
170171
return self._is_ge_v21
171172

173+
def is_ge_v3(self):
174+
self._check_import()
175+
return self._is_ge_v3
176+
172177
@property
173178
def categorical_type(self):
174179
self._check_import()

python/pyarrow/pandas_compat.py

Lines changed: 42 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -676,7 +676,7 @@ def get_datetimetz_type(values, dtype, type_):
676676
# Converting pyarrow.Table efficiently to pandas.DataFrame
677677

678678

679-
def _reconstruct_block(item, columns=None, extension_columns=None):
679+
def _reconstruct_block(item, columns=None, extension_columns=None, return_block=True):
680680
"""
681681
Construct a pandas Block from the `item` dictionary coming from pyarrow's
682682
serialization or returned by arrow::python::ConvertTableToPandas.
@@ -709,22 +709,23 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
709709
block_arr = item.get('block', None)
710710
placement = item['placement']
711711
if 'dictionary' in item:
712-
cat = _pandas_api.categorical_type.from_codes(
712+
arr = _pandas_api.categorical_type.from_codes(
713713
block_arr, categories=item['dictionary'],
714714
ordered=item['ordered'])
715-
block = _int.make_block(cat, placement=placement)
716715
elif 'timezone' in item:
717716
unit, _ = np.datetime_data(block_arr.dtype)
718717
dtype = make_datetimetz(unit, item['timezone'])
719718
if _pandas_api.is_ge_v21():
720-
pd_arr = _pandas_api.pd.array(
719+
arr = _pandas_api.pd.array(
721720
block_arr.view("int64"), dtype=dtype, copy=False
722721
)
723-
block = _int.make_block(pd_arr, placement=placement)
724722
else:
725-
block = _int.make_block(block_arr, placement=placement,
726-
klass=_int.DatetimeTZBlock,
727-
dtype=dtype)
723+
arr = block_arr
724+
if return_block:
725+
block = _int.make_block(block_arr, placement=placement,
726+
klass=_int.DatetimeTZBlock,
727+
dtype=dtype)
728+
return block
728729
elif 'py_array' in item:
729730
# create ExtensionBlock
730731
arr = item['py_array']
@@ -734,12 +735,14 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
734735
if not hasattr(pandas_dtype, '__from_arrow__'):
735736
raise ValueError("This column does not support to be converted "
736737
"to a pandas ExtensionArray")
737-
pd_ext_arr = pandas_dtype.__from_arrow__(arr)
738-
block = _int.make_block(pd_ext_arr, placement=placement)
738+
arr = pandas_dtype.__from_arrow__(arr)
739739
else:
740-
block = _int.make_block(block_arr, placement=placement)
740+
arr = block_arr
741741

742-
return block
742+
if return_block:
743+
return _int.make_block(arr, placement=placement)
744+
else:
745+
return arr, placement
743746

744747

745748
def make_datetimetz(unit, tz):
@@ -752,9 +755,6 @@ def make_datetimetz(unit, tz):
752755
def table_to_dataframe(
753756
options, table, categories=None, ignore_metadata=False, types_mapper=None
754757
):
755-
from pandas.core.internals import BlockManager
756-
from pandas import DataFrame
757-
758758
all_columns = []
759759
column_indexes = []
760760
pandas_metadata = table.schema.pandas_metadata
@@ -774,15 +774,35 @@ def table_to_dataframe(
774774

775775
_check_data_column_metadata_consistency(all_columns)
776776
columns = _deserialize_column_index(table, all_columns, column_indexes)
777-
blocks = _table_to_blocks(options, table, categories, ext_columns_dtypes)
778777

779-
axes = [columns, index]
780-
mgr = BlockManager(blocks, axes)
781-
if _pandas_api.is_ge_v21():
782-
df = DataFrame._from_mgr(mgr, mgr.axes)
778+
column_names = table.column_names
779+
result = pa.lib.table_to_blocks(options, table, categories,
780+
list(ext_columns_dtypes.keys()))
781+
if _pandas_api.is_ge_v3():
782+
from pandas.api.internals import create_dataframe_from_blocks
783+
784+
blocks = [
785+
_reconstruct_block(
786+
item, column_names, ext_columns_dtypes, return_block=False)
787+
for item in result
788+
]
789+
df = create_dataframe_from_blocks(blocks, index=index, columns=columns)
790+
return df
783791
else:
784-
df = DataFrame(mgr)
785-
return df
792+
from pandas.core.internals import BlockManager
793+
from pandas import DataFrame
794+
795+
blocks = [
796+
_reconstruct_block(item, column_names, ext_columns_dtypes)
797+
for item in result
798+
]
799+
axes = [columns, index]
800+
mgr = BlockManager(blocks, axes)
801+
if _pandas_api.is_ge_v21():
802+
df = DataFrame._from_mgr(mgr, mgr.axes)
803+
else:
804+
df = DataFrame(mgr)
805+
return df
786806

787807

788808
# Set of the string repr of all numpy dtypes that can be stored in a pandas
@@ -1099,17 +1119,6 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
10991119
return pd.Index(new_levels[0], dtype=new_levels[0].dtype, name=columns.name)
11001120

11011121

1102-
def _table_to_blocks(options, block_table, categories, extension_columns):
1103-
# Part of table_to_blockmanager
1104-
1105-
# Convert an arrow table to Block from the internal pandas API
1106-
columns = block_table.column_names
1107-
result = pa.lib.table_to_blocks(options, block_table, categories,
1108-
list(extension_columns.keys()))
1109-
return [_reconstruct_block(item, columns, extension_columns)
1110-
for item in result]
1111-
1112-
11131122
def _add_any_metadata(table, pandas_metadata):
11141123
modified_columns = {}
11151124
modified_fields = {}

0 commit comments

Comments
 (0)