Skip to content

Commit e5923dd

Browse files
committed
Arrow: Allow missing field-ids from Schema
1 parent 553695e commit e5923dd

File tree

2 files changed

+66
-18
lines changed

2 files changed

+66
-18
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 41 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -28,12 +28,13 @@
2828
import logging
2929
import os
3030
import re
31+
import warnings
3132
from abc import ABC, abstractmethod
3233
from concurrent.futures import Future
3334
from dataclasses import dataclass
3435
from enum import Enum
3536
from functools import lru_cache, singledispatch
36-
from itertools import chain
37+
from itertools import chain, count
3738
from typing import (
3839
TYPE_CHECKING,
3940
Any,
@@ -713,28 +714,50 @@ def primitive(self, primitive: pa.DataType) -> Optional[T]:
713714
"""Visit a primitive type."""
714715

715716

716-
def _get_field_id(field: pa.Field) -> Optional[int]:
717-
for pyarrow_field_id_key in PYARROW_FIELD_ID_KEYS:
718-
if field_id_str := field.metadata.get(pyarrow_field_id_key):
719-
return int(field_id_str.decode())
720-
return None
717+
class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]):
718+
counter: count[int]
719+
missing_is_metadata: Optional[bool]
721720

721+
def __init__(self) -> None:
722+
self.counter = count()
723+
self.missing_is_metadata = None
722724

723-
def _get_field_doc(field: pa.Field) -> Optional[str]:
724-
for pyarrow_doc_key in PYARROW_FIELD_DOC_KEYS:
725-
if doc_str := field.metadata.get(pyarrow_doc_key):
726-
return doc_str.decode()
727-
return None
725+
def _get_field_id(self, field: pa.Field) -> int:
    """Return the Iceberg field-id for an Arrow field, generating one if absent.

    The id is read from the field metadata using the keys listed in
    ``PYARROW_FIELD_ID_KEYS``, in order; the first key that carries a value
    wins. When no key is present a fresh id is drawn from ``self.counter``.

    A schema must be consistent: either every field carries an id or none
    does. The first field visited fixes that mode in
    ``self.missing_is_metadata`` and every later field is checked against it.

    Args:
        field: The Arrow field to inspect.

    Returns:
        The field-id found in the metadata, or a newly generated one.

    Raises:
        ValueError: If some fields carry a field-id and others do not.
    """
    field_id: Optional[int] = None

    # The metadata may be empty/None for fields created without any key-values.
    metadata = field.metadata or {}
    for pyarrow_field_id_key in PYARROW_FIELD_ID_KEYS:
        if field_id_str := metadata.get(pyarrow_field_id_key):
            field_id = int(field_id_str.decode())
            # Stop at the first hit so that key priority follows the order of
            # PYARROW_FIELD_ID_KEYS instead of letting a later key override it.
            break

    missing_is_metadata = field_id is None

    if self.missing_is_metadata is None:
        # First field seen: it decides whether ids are generated or read.
        if missing_is_metadata:
            warnings.warn("Field-ids are missing, generating new IDs")
        self.missing_is_metadata = missing_is_metadata
    elif self.missing_is_metadata != missing_is_metadata:
        raise ValueError("Parquet file contains partial field-ids")

    if field_id is None:
        field_id = next(self.counter)

    return field_id
747+
748+
def _get_field_doc(self, field: pa.Field) -> Optional[str]:
    """Return the doc string stored in an Arrow field's metadata, if any.

    The keys in ``PYARROW_FIELD_DOC_KEYS`` are tried in order and the first
    one holding a non-empty value is decoded and returned.

    Args:
        field: The Arrow field to inspect.

    Returns:
        The decoded doc string, or ``None`` when the field has no metadata
        or none of the known keys carries a value.
    """
    metadata = field.metadata
    if not metadata:
        # Nothing to look up on a field without metadata.
        return None

    for doc_key in PYARROW_FIELD_DOC_KEYS:
        raw_doc = metadata.get(doc_key)
        if raw_doc:
            return raw_doc.decode()

    return None
729753

730-
class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]):
731754
def _convert_fields(self, arrow_fields: Iterable[pa.Field], field_results: List[Optional[IcebergType]]) -> List[NestedField]:
732755
fields = []
733756
for i, field in enumerate(arrow_fields):
734-
field_id = _get_field_id(field)
735-
field_doc = _get_field_doc(field)
757+
field_id = self._get_field_id(field)
758+
field_doc = self._get_field_doc(field)
736759
field_type = field_results[i]
737-
if field_type is not None and field_id is not None:
760+
if field_type is not None:
738761
fields.append(NestedField(field_id, field.name, field_type, required=not field.nullable, doc=field_doc))
739762
return fields
740763

@@ -746,7 +769,7 @@ def struct(self, struct: pa.StructType, field_results: List[Optional[IcebergType
746769

747770
def list(self, list_type: pa.ListType, element_result: Optional[IcebergType]) -> Optional[IcebergType]:
748771
element_field = list_type.value_field
749-
element_id = _get_field_id(element_field)
772+
element_id = self._get_field_id(element_field)
750773
if element_result is not None and element_id is not None:
751774
return ListType(element_id, element_result, element_required=not element_field.nullable)
752775
return None
@@ -755,9 +778,9 @@ def map(
755778
self, map_type: pa.MapType, key_result: Optional[IcebergType], value_result: Optional[IcebergType]
756779
) -> Optional[IcebergType]:
757780
key_field = map_type.key_field
758-
key_id = _get_field_id(key_field)
781+
key_id = self._get_field_id(key_field)
759782
value_field = map_type.item_field
760-
value_id = _get_field_id(value_field)
783+
value_id = self._get_field_id(value_field)
761784
if key_result is not None and value_result is not None and key_id is not None and value_id is not None:
762785
return MapType(key_id, key_result, value_id, value_result, value_required=not value_field.nullable)
763786
return None

tests/io/test_pyarrow_visitor.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -269,3 +269,28 @@ def test_round_schema_conversion_nested(table_schema_nested: Schema) -> None:
269269
15: person: optional struct<16: name: optional string, 17: age: required int>
270270
}"""
271271
assert actual == expected
272+
273+
274+
def test_schema_to_pyarrow_schema_missing_ids() -> None:
    """An Arrow schema without any field-ids converts with generated ids 0, 1, ..."""
    arrow_schema = pa.schema(
        [
            pa.field('some_int', pa.int32(), nullable=True),
            pa.field('some_string', pa.string(), nullable=False),
        ]
    )

    expected_schema = Schema(
        NestedField(field_id=0, name="some_int", field_type=IntegerType(), required=False),
        NestedField(field_id=1, name="some_string", field_type=StringType(), required=True),
    )

    assert pyarrow_to_schema(arrow_schema) == expected_schema
284+
285+
286+
def test_schema_to_pyarrow_schema_missing_id() -> None:
    """An Arrow schema where only some fields carry a field-id is rejected."""
    field_without_id = pa.field('some_int', pa.int32(), nullable=True)
    field_with_id = pa.field('some_string', pa.string(), nullable=False, metadata={b"field_id": "22"})
    partial_schema = pa.schema([field_without_id, field_with_id])

    # The pattern has no regex metacharacters, so `match` is a plain substring search.
    with pytest.raises(ValueError, match="Parquet file contains partial field-ids"):
        pyarrow_to_schema(partial_schema)

0 commit comments

Comments
 (0)