@@ -676,7 +676,7 @@ def get_datetimetz_type(values, dtype, type_):
676
676
# Converting pyarrow.Table efficiently to pandas.DataFrame
677
677
678
678
679
- def _reconstruct_block (item , columns = None , extension_columns = None ):
679
+ def _reconstruct_block (item , columns = None , extension_columns = None , return_block = True ):
680
680
"""
681
681
Construct a pandas Block from the `item` dictionary coming from pyarrow's
682
682
serialization or returned by arrow::python::ConvertTableToPandas.
@@ -709,22 +709,23 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
709
709
block_arr = item .get ('block' , None )
710
710
placement = item ['placement' ]
711
711
if 'dictionary' in item :
712
- cat = _pandas_api .categorical_type .from_codes (
712
+ arr = _pandas_api .categorical_type .from_codes (
713
713
block_arr , categories = item ['dictionary' ],
714
714
ordered = item ['ordered' ])
715
- block = _int .make_block (cat , placement = placement )
716
715
elif 'timezone' in item :
717
716
unit , _ = np .datetime_data (block_arr .dtype )
718
717
dtype = make_datetimetz (unit , item ['timezone' ])
719
718
if _pandas_api .is_ge_v21 ():
720
- pd_arr = _pandas_api .pd .array (
719
+ arr = _pandas_api .pd .array (
721
720
block_arr .view ("int64" ), dtype = dtype , copy = False
722
721
)
723
- block = _int .make_block (pd_arr , placement = placement )
724
722
else :
725
- block = _int .make_block (block_arr , placement = placement ,
726
- klass = _int .DatetimeTZBlock ,
727
- dtype = dtype )
723
+ arr = block_arr
724
+ if return_block :
725
+ block = _int .make_block (block_arr , placement = placement ,
726
+ klass = _int .DatetimeTZBlock ,
727
+ dtype = dtype )
728
+ return block
728
729
elif 'py_array' in item :
729
730
# create ExtensionBlock
730
731
arr = item ['py_array' ]
@@ -734,12 +735,14 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
734
735
if not hasattr (pandas_dtype , '__from_arrow__' ):
735
736
raise ValueError ("This column does not support to be converted "
736
737
"to a pandas ExtensionArray" )
737
- pd_ext_arr = pandas_dtype .__from_arrow__ (arr )
738
- block = _int .make_block (pd_ext_arr , placement = placement )
738
+ arr = pandas_dtype .__from_arrow__ (arr )
739
739
else :
740
- block = _int . make_block ( block_arr , placement = placement )
740
+ arr = block_arr
741
741
742
- return block
742
+ if return_block :
743
+ return _int .make_block (arr , placement = placement )
744
+ else :
745
+ return arr , placement
743
746
744
747
745
748
def make_datetimetz (unit , tz ):
@@ -752,9 +755,6 @@ def make_datetimetz(unit, tz):
752
755
def table_to_dataframe (
753
756
options , table , categories = None , ignore_metadata = False , types_mapper = None
754
757
):
755
- from pandas .core .internals import BlockManager
756
- from pandas import DataFrame
757
-
758
758
all_columns = []
759
759
column_indexes = []
760
760
pandas_metadata = table .schema .pandas_metadata
@@ -774,15 +774,35 @@ def table_to_dataframe(
774
774
775
775
_check_data_column_metadata_consistency (all_columns )
776
776
columns = _deserialize_column_index (table , all_columns , column_indexes )
777
- blocks = _table_to_blocks (options , table , categories , ext_columns_dtypes )
778
777
779
- axes = [columns , index ]
780
- mgr = BlockManager (blocks , axes )
781
- if _pandas_api .is_ge_v21 ():
782
- df = DataFrame ._from_mgr (mgr , mgr .axes )
778
+ column_names = table .column_names
779
+ result = pa .lib .table_to_blocks (options , table , categories ,
780
+ list (ext_columns_dtypes .keys ()))
781
+ if _pandas_api .is_ge_v3 ():
782
+ from pandas .api .internals import create_dataframe_from_blocks
783
+
784
+ blocks = [
785
+ _reconstruct_block (
786
+ item , column_names , ext_columns_dtypes , return_block = False )
787
+ for item in result
788
+ ]
789
+ df = create_dataframe_from_blocks (blocks , index = index , columns = columns )
790
+ return df
783
791
else :
784
- df = DataFrame (mgr )
785
- return df
792
+ from pandas .core .internals import BlockManager
793
+ from pandas import DataFrame
794
+
795
+ blocks = [
796
+ _reconstruct_block (item , column_names , ext_columns_dtypes )
797
+ for item in result
798
+ ]
799
+ axes = [columns , index ]
800
+ mgr = BlockManager (blocks , axes )
801
+ if _pandas_api .is_ge_v21 ():
802
+ df = DataFrame ._from_mgr (mgr , mgr .axes )
803
+ else :
804
+ df = DataFrame (mgr )
805
+ return df
786
806
787
807
788
808
# Set of the string repr of all numpy dtypes that can be stored in a pandas
@@ -1099,17 +1119,6 @@ def _reconstruct_columns_from_metadata(columns, column_indexes):
1099
1119
return pd .Index (new_levels [0 ], dtype = new_levels [0 ].dtype , name = columns .name )
1100
1120
1101
1121
1102
- def _table_to_blocks (options , block_table , categories , extension_columns ):
1103
- # Part of table_to_blockmanager
1104
-
1105
- # Convert an arrow table to Block from the internal pandas API
1106
- columns = block_table .column_names
1107
- result = pa .lib .table_to_blocks (options , block_table , categories ,
1108
- list (extension_columns .keys ()))
1109
- return [_reconstruct_block (item , columns , extension_columns )
1110
- for item in result ]
1111
-
1112
-
1113
1122
def _add_any_metadata (table , pandas_metadata ):
1114
1123
modified_columns = {}
1115
1124
modified_fields = {}
0 commit comments