Skip to content

Commit 27017cf

Browse files
committed
Thanks Honah!
1 parent 405d36c commit 27017cf

File tree

2 files changed

+19
-13
lines changed

2 files changed

+19
-13
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 17 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525
from __future__ import annotations
2626

2727
import concurrent.futures
28+
import itertools
2829
import logging
2930
import os
3031
import re
@@ -34,7 +35,7 @@
3435
from dataclasses import dataclass
3536
from enum import Enum
3637
from functools import lru_cache, singledispatch
37-
from itertools import chain, count
38+
from itertools import chain
3839
from typing import (
3940
TYPE_CHECKING,
4041
Any,
@@ -111,6 +112,7 @@
111112
Schema,
112113
SchemaVisitorPerPrimitiveType,
113114
SchemaWithPartnerVisitor,
115+
assign_fresh_schema_ids,
114116
pre_order_visit,
115117
promote,
116118
prune_columns,
@@ -617,7 +619,12 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], rows:
617619

618620
def pyarrow_to_schema(schema: pa.Schema) -> Schema:
619621
visitor = _ConvertToIceberg()
620-
return visit_pyarrow(schema, visitor)
622+
schema = visit_pyarrow(schema, visitor)
623+
624+
if visitor.missing_id_metadata:
625+
return assign_fresh_schema_ids(schema)
626+
else:
627+
return schema
621628

622629

623630
@singledispatch
@@ -715,12 +722,12 @@ def primitive(self, primitive: pa.DataType) -> Optional[T]:
715722

716723

717724
class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]):
718-
counter: count[int]
719-
missing_is_metadata: Optional[bool]
725+
counter: itertools.count[int]
726+
missing_id_metadata: Optional[bool]
720727

721728
def __init__(self) -> None:
722-
self.counter = count()
723-
self.missing_is_metadata = None
729+
self.counter = itertools.count(1)
730+
self.missing_id_metadata = None
724731

725732
def _get_field_id(self, field: pa.Field) -> int:
726733
field_id: Optional[int] = None
@@ -730,18 +737,17 @@ def _get_field_id(self, field: pa.Field) -> int:
730737
field_id = int(field_id_str.decode())
731738

732739
if field_id is None:
733-
if self.missing_is_metadata is None:
734-
warnings.warn("Field-ids are missing, generating new IDs")
735-
740+
if self.missing_id_metadata is None:
741+
warnings.warn("Field-ids are missing, new IDs will be set")
736742
field_id = next(self.counter)
737743
missing_is_metadata = True
738744
else:
739745
missing_is_metadata = False
740746

741-
if self.missing_is_metadata is not None and self.missing_is_metadata != missing_is_metadata:
747+
if self.missing_id_metadata is not None and self.missing_id_metadata != missing_is_metadata:
742748
raise ValueError("Parquet file contains partial field-ids")
743749
else:
744-
self.missing_is_metadata = missing_is_metadata
750+
self.missing_id_metadata = missing_is_metadata
745751

746752
return field_id
747753

tests/io/test_pyarrow_visitor.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -278,8 +278,8 @@ def test_schema_to_pyarrow_schema_missing_ids(warn: Mock) -> None:
278278
actual = pyarrow_to_schema(schema)
279279

280280
expected = Schema(
281-
NestedField(field_id=0, name="some_int", field_type=IntegerType(), required=False),
282-
NestedField(field_id=1, name="some_string", field_type=StringType(), required=True),
281+
NestedField(field_id=1, name="some_int", field_type=IntegerType(), required=False),
282+
NestedField(field_id=2, name="some_string", field_type=StringType(), required=True),
283283
)
284284

285285
assert actual == expected

0 commit comments

Comments
 (0)