Additional fields and type inferring for union (#1834)

AlbertvanHouten · web-flow · commit 6417998a1c9a · 2025-08-22T14:25:17.000+02:00
### Summary This pull request introduces robust support for Python `Union` types in the experimental Datumaro type registry and dataset schema inference. It enables seamless conversion between multiple candidate types (including both `typing.Union` and modern `A | B` syntax), with fallback logic and comprehensive test coverage. The changes also improve image type conversion and schema inference for datasets, making the system more flexible and reliable. ### Type registry and conversion improvements * Added full support for `Union` types in the type registry: both `typing.Union` and Python 3.10+ `A | B` syntax are now handled, with fallback to subsequent types if the first conversion fails. This includes updated logic in `from_polars_data` and new tests for ordering, error handling, and fallback behavior. [[1]](diffhunk://#diff-e324261812079d99ca2989612441e5df1dd15dabde37fb2e5e8c0c1b639dac0dR122-R154) [[2]](diffhunk://#diff-e324261812079d99ca2989612441e5df1dd15dabde37fb2e5e8c0c1b639dac0dR170-R269) [[3]](diffhunk://#diff-30f23b2869128577a39c918ed25c78229a30cb96578c33728d45e5ebce740ac2R1-R162) * Added comprehensive tests for type registry conversions, including basic types, union types, error cases, ordering, and converter functionality for numpy and torch tensors. ### Dataset and schema inference enhancements * Improved schema inference in `Dataset` to resolve string annotations to actual type objects, supporting cases where `from __future__ import annotations` is used, and added correct handling for `Union` types to preserve the original annotation. [[1]](diffhunk://#diff-4ac196ddc4dc8e6d33daf684ded18886ff8774fadb8b6cbd4bfa88ca424bb34fR65-R80) [[2]](diffhunk://#diff-4ac196ddc4dc8e6d33daf684ded18886ff8774fadb8b6cbd4bfa88ca424bb34fR94-R110) * Updated type variable definitions and method signatures in `dataset.py` for clarity and correctness, and removed unnecessary imports. 
[[1]](diffhunk://#diff-4ac196ddc4dc8e6d33daf684ded18886ff8774fadb8b6cbd4bfa88ca424bb34fR19-R25) [[2]](diffhunk://#diff-4ac196ddc4dc8e6d33daf684ded18886ff8774fadb8b6cbd4bfa88ca424bb34fL105-R128) [[3]](diffhunk://#diff-4ac196ddc4dc8e6d33daf684ded18886ff8774fadb8b6cbd4bfa88ca424bb34fL134-R157) ### API and import improvements * Updated the experimental module’s public API to expose new converters, dataset classes, fields, schema types, and registry functions. ### Test coverage * Added targeted tests for union type handling in dataset samples, ensuring both modern and legacy union syntax are supported. These changes significantly improve the flexibility and reliability of type conversion and schema inference in Datumaro’s experimental pipeline.  ### How to test  ### Checklist  - [ ] I have added unit tests to cover my changes.​ - [ ] I have added integration tests to cover my changes.​ - [ ] I have added the description of my changes into [CHANGELOG](https://github.com/open-edge-platform/datumaro/blob/develop/CHANGELOG.md).​ - [ ] I have updated the [documentation](https://github.com/open-edge-platform/datumaro/tree/develop/docs) accordingly ### License - [ ] I submit _my code changes_ under the same [MIT License](https://github.com/open-edge-platform/datumaro/blob/develop/LICENSE) that covers the project. Feel free to contact the maintainers if that's a concern. - [ ] I have updated the license header for each file (see an example below). ```python # Copyright (C) 2025 Intel Corporation # # SPDX-License-Identifier: MIT ```
diff --git a/src/datumaro/experimental/__init__.py b/src/datumaro/experimental/__init__.py
@@ -1,3 +1,22 @@
 # Copyright (C) 2025 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
+
+from .converter_registry import ConverterRegistry, converter, find_conversion_path
+from .dataset import Dataset, Sample
+from .fields import (
+    BBoxField,
+    ImageField,
+    ImageInfoField,
+    ImagePathField,
+    LabelField,
+    TensorField,
+    bbox_field,
+    image_field,
+    image_info_field,
+    image_path_field,
+    label_field,
+    tensor_field,
+)
+from .schema import AttributeInfo, Field, Schema, Semantic
+from .type_registry import register_from_polars_converter, register_numpy_converter
diff --git a/src/datumaro/experimental/dataset.py b/src/datumaro/experimental/dataset.py
@@ -4,10 +4,12 @@
 
 from __future__ import annotations
 
-import copy
+import sys
+import types
 from functools import cache
 from typing import (
     TYPE_CHECKING,
+    Annotated,
     Any,
     Dict,
     Generic,
@@ -16,12 +18,13 @@
     Type,
     Union,
     cast,
+    dataclass_transform,
     get_args,
     get_origin,
 )
 
 import polars as pl
-from typing_extensions import Annotated, TypeGuard, TypeVar, dataclass_transform
+from typing_extensions import TypeGuard, TypeVar
 
 from .converter_registry import Converter, find_conversion_path
 from .schema import AttributeInfo, Field, Schema
@@ -61,8 +64,21 @@ def infer_schema(cls) -> Schema:
         Raises:
             TypeError: If attributes don't have proper Field annotations
         """
+
         attributes: dict[str, AttributeInfo] = {}
         for name, annotation in cls.__annotations__.items():
+            # Resolve string annotations to actual type objects
+            # This handles cases where `from __future__ import annotations` is used
+            if isinstance(annotation, str):
+                try:
+                    # Get the module where the class is defined to resolve annotations
+                    module = sys.modules[cls.__module__]
+                    annotation = eval(annotation, module.__dict__)
+                except Exception as e:
+                    raise TypeError(
+                        f"Failed to resolve type annotation '{annotation}' for attribute '{name}': {e}"
+                    )
+
             origin = get_origin(annotation)
             if origin is Annotated:
                 # Handle Annotated[Type, Field] approach
@@ -78,13 +94,18 @@ def infer_schema(cls) -> Schema:
             # Extract base class from generic types like MyClass[A, B, C] -> MyClass
             type_origin = get_origin(annotation)
 
-            final_type = type_origin if type_origin is not None else annotation
+            # For Union types, keep the original annotation (the Union instance)
+            # instead of the origin (which is just the UnionType class)
+            if isinstance(annotation, types.UnionType) or type_origin is Union:
+                final_type = annotation
+            else:
+                final_type = type_origin if type_origin is not None else annotation
             attributes[name] = AttributeInfo(type=final_type, annotation=field_annotation)
         return Schema(attributes=attributes)
 
 
-DType = TypeVar("DType", bound=Sample, default=Sample)
-DTargetType = TypeVar("DTargetType", bound=Sample, default=Sample)
+DType = TypeVar("DType", bound=Sample)
+DTargetType = TypeVar("DTargetType", bound=Sample)
 
 
 class Dataset(Generic[DType]):
@@ -102,7 +123,7 @@ class Dataset(Generic[DType]):
     def __init__(
         self,
         dtype_or_schema: Union[Schema, Type[DType]],
-        categories: Dict[str, "Categories"] = None,
+        categories: Categories = None,
     ):
         """
         Initialize dataset with either a schema or sample type.
@@ -131,7 +152,7 @@ def from_dataframe(
         df: pl.DataFrame,
         dtype_or_schema: Union[Schema, Type[DTargetType]],
         lazy_converters: List[Converter] | None = None,
-        categories: Dict[str, "Categories"] = None,
+        categories: Dict[str, Categories] = None,
     ) -> "Dataset[DTargetType]":
         """
         Create a Dataset from an existing DataFrame and lazy converters.
@@ -282,8 +303,7 @@ def convert_to_schema(
             A new Dataset instance with the converted schema
         """
         # Import the converter implementations to register them
-        # ruff: noqa: F401
-        import datumaro.experimental.converters  # pyright: ignore [reportUnusedImport, reportMissingImports]
+        import datumaro.experimental.converters  # type: ignore[import]  # noqa: F401
 
         # Determine target schema
         if isinstance(target_dtype_or_schema, Schema):
diff --git a/src/datumaro/experimental/fields.py b/src/datumaro/experimental/fields.py
@@ -246,3 +246,57 @@ def image_path_field(semantic: Semantic = Semantic.Default) -> Any:
         ImagePathField instance configured with the given semantic tags
     """
     return ImagePathField(semantic=semantic)
+
+
+@dataclass(frozen=True)
+class LabelField(Field):
+    """
+    Label annotation field covering both single-label and multi-label columns.
+
+    The storage layout is selected by the ``multi_label`` flag (it is not detected
+    from the value being written):
+    - Single labels: stored as ``dtype``
+    - Multi-labels: stored as ``pl.List(dtype)``
+    """
+
+    semantic: Semantic
+    dtype: Any  # Polars dtype of one label value
+    multi_label: bool = False  # True -> each row stores a list of labels
+
+    def _column_dtype(self) -> Any:
+        """Return the column dtype shared by the schema and every written series."""
+        return pl.List(self.dtype) if self.multi_label else self.dtype
+
+    def to_polars_schema(self, name: str) -> dict[str, pl.DataType]:
+        """Return the one-column schema ``{name: column dtype}`` for this field."""
+        return {name: self._column_dtype()}
+
+    def to_polars(self, name: str, value: Any) -> dict[str, pl.Series]:
+        """Wrap ``value`` (label, label collection, or None) in a one-row series."""
+        # A None value keeps the schema dtype as well: a scalar-typed null in a
+        # multi-label column would not match to_polars_schema().
+        if value is not None and self.multi_label:
+            value = to_numpy(value)
+        return {name: pl.Series(name, [value], dtype=self._column_dtype())}
+
+    def from_polars(self, name: str, row_index: int, df: pl.DataFrame, target_type: type[T]) -> T:
+        """Read row ``row_index`` of column ``name`` and convert it to ``target_type``."""
+        cell = df[name][row_index]
+        return from_polars_data(cell, target_type)
+
+
+def label_field(
+    dtype: Any = pl.Int32(), semantic: Semantic = Semantic.Default, multi_label: bool = False
+) -> Any:
+    """
+    Build a LabelField from a dtype, a semantic tag and a multi-label flag.
+
+    Args:
+        dtype: Polars dtype of a single label value (defaults to pl.Int32())
+        semantic: Semantic value stored on the field (defaults to Semantic.Default)
+        multi_label: Stored as the field's multi_label flag (defaults to False)
+
+    Returns:
+        The newly created LabelField (the signature annotates it as Any)
+    """
+    return LabelField(dtype=dtype, multi_label=multi_label, semantic=semantic)
diff --git a/src/datumaro/experimental/legacy.py b/src/datumaro/experimental/legacy.py
@@ -194,7 +194,7 @@ def analyze_legacy_dataset(legacy_dataset: LegacyDataset) -> AnalysisResult:
         attributes.update(media_converter.get_schema_attributes())
     except ValueError:
         # No converter for this media type - skip
-        media_converter = None
+        pass
 
     # Get annotation attributes from converters
     for ann_type in ann_types:
diff --git a/src/datumaro/experimental/type_registry.py b/src/datumaro/experimental/type_registry.py
@@ -9,7 +9,8 @@
 DataFrames. New types can be registered at runtime without modifying core code.
 """
 
-from typing import Any, Callable
+import types
+from typing import Any, Callable, Union
 
 import numpy as np
 import polars as pl
@@ -118,9 +119,39 @@ def from_polars_data(polars_data: Any, target_type: type) -> Any:
         >>> isinstance(tensor, torch.Tensor)
         True
     """
+    # Handle direct type matches first
     if target_type in _from_polars_converters:
         return _from_polars_converters[target_type](polars_data)
 
+    # Handle Union types (e.g., torch.Tensor | np.ndarray)
+    # Check if target_type is a Union type (Python 3.10+ style or typing.Union)
+    is_union = False
+    union_args = None
+
+    # Check for types.UnionType (Python 3.10+ syntax: A | B)
+    if isinstance(target_type, types.UnionType):
+        is_union = True
+        union_args = target_type.__args__
+
+    # Check for typing.Union (older syntax: Union[A, B])
+    try:
+        from typing import get_args, get_origin
+
+        if get_origin(target_type) is Union:
+            is_union = True
+            union_args = get_args(target_type)
+    except Exception:
+        pass
+
+    if is_union and union_args:
+        # Try each type in the union until one succeeds
+        for union_type in union_args:
+            if union_type in _from_polars_converters:
+                try:
+                    return _from_polars_converters[union_type](polars_data)
+                except KeyError:
+                    # If conversion fails, try the next type in the union
+                    continue
     raise TypeError(f"No converter registered for type {target_type}")
 
 
@@ -136,3 +167,101 @@ def from_polars_data(polars_data: Any, target_type: type) -> Any:
     )  # pyright: ignore[reportUnknownMemberType, reportUnknownLambdaType, reportUnknownArgumentType]
 except ImportError:
     pass
+
+
+# Register PIL Image converters if available
+try:
+    from PIL import Image
+
+    register_numpy_converter(Image.Image, lambda x: np.array(x))
+    register_from_polars_converter(Image.Image, lambda x: Image.fromarray(np.array(x)))
+except ImportError:
+    pass
+
+
+def convert_image_type(image: Any, target_type: type) -> Any:
+    """
+    Convert an image between different types (numpy, PIL, torch).
+
+    NumPy is the intermediate format: the source goes through ``to_numpy`` and
+    the result through the converter registered for ``target_type``.
+
+    Args:
+        image: Source image (numpy.ndarray, PIL.Image.Image, or torch.Tensor)
+        target_type: Target type to convert to; must be one of the types
+            returned by ``get_supported_image_types()``
+
+    Returns:
+        ``image`` itself when it already is an instance of ``target_type``,
+        otherwise the image converted to ``target_type``
+
+    Raises:
+        TypeError: If ``target_type`` is not supported, or if the conversion
+            fails (the underlying error is chained as ``__cause__``)
+
+    Example:
+        >>> import numpy as np
+        >>> from PIL import Image
+        >>> import torch
+        >>>
+        >>> # Convert numpy array to PIL Image
+        >>> np_image = np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)
+        >>> pil_image = convert_image_type(np_image, Image.Image)
+        >>>
+        >>> # Convert PIL Image to torch tensor
+        >>> torch_image = convert_image_type(pil_image, torch.Tensor)
+    """
+    current_type = type(image)
+    supported_image_types = get_supported_image_types()
+
+    # Reject unsupported targets up front, listing the valid choices
+    if target_type not in supported_image_types:
+        supported_names = [t.__name__ for t in supported_image_types]
+        raise TypeError(
+            f"Target type {target_type.__name__} not supported. Supported image types: {supported_names}"
+        )
+
+    # isinstance (not exact type equality) so that instances of subclasses of
+    # the target type are returned unchanged as well
+    if isinstance(image, target_type):
+        return image
+
+    try:
+        numpy_image = image if isinstance(image, np.ndarray) else to_numpy(image)
+        if target_type is np.ndarray:
+            return numpy_image
+        # The from-polars converter registered for the target type is reused
+        # for the numpy -> target step
+        return _from_polars_converters[target_type](numpy_image)
+    except Exception as e:
+        # Chain the cause so the original traceback is not lost
+        raise TypeError(f"Cannot convert from {current_type} to {target_type}: {e}") from e
+
+
+def get_supported_image_types() -> list[type]:
+    """
+    List the image types usable as conversion targets.
+    Returns:
+        np.ndarray, followed by PIL's Image.Image and torch.Tensor whenever the
+        package imports and a from-polars converter is registered for the type
+    """
+    available = [np.ndarray]  # numpy needs no registry lookup
+    # PIL: the package must import and its converter must be registered
+    try:
+        from PIL import Image
+    except ImportError:
+        pass
+    else:
+        if Image.Image in _from_polars_converters:
+            available.append(Image.Image)
+
+    # torch: same two conditions as PIL
+    try:
+        import torch
+    except ImportError:
+        pass
+    else:
+        if torch.Tensor in _from_polars_converters:
+            available.append(torch.Tensor)
+
+    return available
diff --git a/tests/unit/experimental/test_dataset.py b/tests/unit/experimental/test_dataset.py
@@ -599,3 +599,28 @@ class TestSample(Sample):
 
     assert len(schema3.attributes["image"].categories) == 2  # car, truck
     assert len(schema1.attributes["bbox"].categories) == 1  # person
+
+
+def test_union_type_handling():
+    """from_polars_data accepts both ``A | B`` and ``typing.Union[A, B]`` targets."""
+    try:
+        import torch
+    except ImportError:
+        pytest.skip("PyTorch not available")
+
+    from typing import Union
+
+    from datumaro.experimental.type_registry import from_polars_data
+
+    values = [1.0, 2.0, 3.0]
+    # torch.Tensor is listed first in both spellings, so it is the expected result type
+    candidates = (
+        torch.Tensor | np.ndarray,  # modern syntax
+        Union[torch.Tensor, np.ndarray],  # typing.Union syntax
+    )
+
+    # Both spellings must produce the same conversion
+    for union_type in candidates:
+        converted = from_polars_data(values, union_type)
+        assert isinstance(converted, torch.Tensor)
+        assert converted.tolist() == values
diff --git a/tests/unit/experimental/test_type_registry.py b/tests/unit/experimental/test_type_registry.py