Fix sample validation for complex types (#1973)

JortBergfeld · web-flow · commit 6497f58f20b8 · 2025-12-07T10:57:18.000Z
This pull request fixes a problem with type validation in the experimental dataset and sample modules. When we use the `is_list` boolean option of `score_field`, a complex ndarray type is generated that includes an `Any` type, which is not accepted by `isinstance`. To circumvent problems with complex types, whenever `isinstance` fails with a `TypeError`, we only validate against the origin type (`ndarray` instead of `ndarray[float32]`, for example). ### Type validation improvements * Updated the `_validate_attribute_type` method in `src/datumaro/experimental/dataset.py` to correctly handle type validation for generic types by using `origin` when available, improving support for complex type annotations. ### Test enhancements * Added a new test, `test_sample_with_is_list`, in `tests/unit/experimental/test_sample.py` to verify that samples with list-type fields (using `is_list=True` in `score_field`) are created without validation errors. Resolves #1971 --------- Signed-off-by: Jort Bergfeld <jort.bergfeld@intel.com>
diff --git a/src/datumaro/experimental/dataset.py b/src/datumaro/experimental/dataset.py
@@ -78,15 +78,22 @@ def _validate_attribute_type(self, expected_type: Any, value: Any) -> bool:
         # Union and Callable types have to be handled separately,
         # because isinstance() does not work with Callable types.
         origin = get_origin(expected_type)
-        if origin is Union:
+        if origin in {Union, types.UnionType}:
             # Check each type in the Union
-            return any(self._validate_attribute_type(typ, value) for typ in get_args(expected_type))
-        if origin in {typing.Callable, collections.abc.Callable} or expected_type in {
+            result = any(self._validate_attribute_type(typ, value) for typ in get_args(expected_type))
+        elif origin in {typing.Callable, collections.abc.Callable} or expected_type in {
             typing.Callable,
             collections.abc.Callable,
         }:
-            return callable(value)
-        return isinstance(value, expected_type)
+            result = callable(value)
+        else:
+            try:
+                result = isinstance(value, expected_type)
+            except TypeError:
+                # Some complex types cannot be validated, for example, sometimes when a numpy dtype is turned
+                # into a list using Polars List, the resulting complex dtype will contain a generic Any.
+                result = isinstance(value, origin)
+        return result
 
     @classmethod
     @cache
diff --git a/tests/unit/experimental/test_sample.py b/tests/unit/experimental/test_sample.py
@@ -5,6 +5,7 @@
 from typing import Any
 
 import numpy as np
+import numpy.typing as npt
 import polars as pl
 import pytest
 
@@ -17,6 +18,7 @@
     bbox_field,
     image_field,
     image_info_field,
+    numeric_field,
 )
 from datumaro.experimental.fields.images import image_path_field
 from datumaro.experimental.schema import Schema
@@ -185,3 +187,11 @@ class ExtendedSample(BaseSample):
     assert len(extended_schema.attributes) == 3
     assert "image_info" in extended_schema.attributes
     assert "image_info" not in base_schema.attributes
+
+
+def test_sample_with_is_list():
+    class MySample(Sample):
+        confidence: npt.NDArray[np.float32] | None = numeric_field(dtype=pl.Float32(), is_list=True)
+
+    # Assert that sample can be created without validation errors
+    MySample(confidence=np.array([0.8]))