Add Hive syntax support, type aliases, parse fallback, and index-based hints

laughingman7743 · claude · laughingman7743 · commit ab33667ca19c · 2026-02-28T20:52:19.000+09:00
- Normalize Hive-style DDL syntax (array<struct<a:int>>) to Trino-style
  so users can paste DESCRIBE TABLE output directly as type hints
- Resolve type alias "int" to "integer" in the parser
- Fall back to untyped conversion when typed converter returns None,
  preventing silent data loss on parse failures
- Support integer keys in result_set_type_hints for index-based column
  resolution, enabling hints for duplicate column names
- Update type annotations across all cursor/result_set files

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/docs/usage.md b/docs/usage.md
@@ -441,6 +441,38 @@ positions = row[0]
 # positions["x"] == 4.736 (float, not "4.736")
 ```
 
+### Hive-style syntax
+
+You can paste type signatures from Hive DDL or ``DESCRIBE TABLE`` output directly.
+Hive-style angle brackets and colons are automatically converted to Trino-style syntax:
+
+```python
+# Both are equivalent:
+result_set_type_hints={"col": "array(struct(a integer, b varchar))"}   # Trino
+result_set_type_hints={"col": "array<struct<a:int,b:varchar>>"}        # Hive
+```
+
+The ``int`` alias is also supported and resolves to ``integer``.
+
+### Index-based hints for duplicate column names
+
+When a query produces columns with the same alias (e.g. ``SELECT a AS x, b AS x``),
+name-based hints cannot distinguish between them. Use integer keys to specify hints
+by zero-based column position:
+
+```python
+cursor.execute(
+    "SELECT a AS x, b AS x FROM my_table",
+    result_set_type_hints={
+        0: "array(integer)",   # first "x" column
+        1: "map(varchar, integer)",  # second "x" column
+    },
+)
+```
+
+Integer (index-based) hints take priority over string (name-based) hints for the same
+column. You can mix both styles in the same dictionary.
+
 ### Constraints
 
 * **Nested arrays in native format** — Athena's native (non-JSON) string representation
diff --git a/pyathena/aio/cursor.py b/pyathena/aio/cursor.py
@@ -79,7 +79,7 @@ async def execute(  # type: ignore[override]
         result_reuse_enable: bool | None = None,
         result_reuse_minutes: int | None = None,
         paramstyle: str | None = None,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         **kwargs,
     ) -> AioCursor:
         """Execute a SQL query asynchronously.
diff --git a/pyathena/aio/result_set.py b/pyathena/aio/result_set.py
@@ -35,7 +35,7 @@ def __init__(
         query_execution: AthenaQueryExecution,
         arraysize: int,
         retry_config: RetryConfig,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
     ) -> None:
         super().__init__(
             connection=connection,
@@ -55,7 +55,7 @@ async def create(
         query_execution: AthenaQueryExecution,
         arraysize: int,
         retry_config: RetryConfig,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
     ) -> AthenaAioResultSet:
         """Async factory method.
 
diff --git a/pyathena/arrow/async_cursor.py b/pyathena/arrow/async_cursor.py
@@ -149,7 +149,7 @@ def arraysize(self, value: int) -> None:
     def _collect_result_set(
         self,
         query_id: str,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         unload_location: str | None = None,
         kwargs: dict[str, Any] | None = None,
     ) -> AthenaArrowResultSet:
@@ -181,7 +181,7 @@ def execute(
         result_reuse_enable: bool | None = None,
         result_reuse_minutes: int | None = None,
         paramstyle: str | None = None,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         **kwargs,
     ) -> tuple[str, Future[AthenaArrowResultSet | Any]]:
         operation, unload_location = self._prepare_unload(operation, s3_staging_dir)
diff --git a/pyathena/arrow/cursor.py b/pyathena/arrow/cursor.py
@@ -137,7 +137,7 @@ def execute(
         result_reuse_minutes: int | None = None,
         paramstyle: str | None = None,
         on_start_query_execution: Callable[[str], None] | None = None,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         **kwargs,
     ) -> ArrowCursor:
         """Execute a SQL query and return results as Apache Arrow Tables.
diff --git a/pyathena/arrow/result_set.py b/pyathena/arrow/result_set.py
@@ -91,7 +91,7 @@ def __init__(
         unload_location: str | None = None,
         connect_timeout: float | None = None,
         request_timeout: float | None = None,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         **kwargs,
     ) -> None:
         super().__init__(
diff --git a/pyathena/async_cursor.py b/pyathena/async_cursor.py
@@ -147,7 +147,7 @@ def poll(self, query_id: str) -> Future[AthenaQueryExecution]:
     def _collect_result_set(
         self,
         query_id: str,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
     ) -> AthenaResultSet:
         query_execution = cast(AthenaQueryExecution, self._poll(query_id))
         return self._result_set_class(
@@ -170,7 +170,7 @@ def execute(
         result_reuse_enable: bool | None = None,
         result_reuse_minutes: int | None = None,
         paramstyle: str | None = None,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         **kwargs,
     ) -> tuple[str, Future[AthenaResultSet | Any]]:
         """Execute a SQL query asynchronously.
diff --git a/pyathena/converter.py b/pyathena/converter.py
@@ -12,7 +12,13 @@
 
 from dateutil.tz import gettz
 
-from pyathena.parser import TypedValueConverter, TypeNode, TypeSignatureParser, _split_array_items
+from pyathena.parser import (
+    TypedValueConverter,
+    TypeNode,
+    TypeSignatureParser,
+    _normalize_hive_syntax,
+    _split_array_items,
+)
 from pyathena.util import strtobool
 
 _logger = logging.getLogger(__name__)
@@ -559,8 +565,9 @@ def convert(self, type_: str, value: str | None, type_hint: str | None = None) -
         """Convert a string value to the appropriate Python type.
 
         When ``type_hint`` is provided, uses the typed converter for precise
-        conversion of complex types. Otherwise, uses the standard converter
-        for the given Athena type.
+        conversion of complex types. If the typed converter returns ``None``
+        (indicating a parse failure), falls back to the standard untyped
+        converter so that data is never silently lost.
 
         Args:
             type_: The Athena data type name (e.g., "integer", "varchar", "array").
@@ -575,19 +582,30 @@ def convert(self, type_: str, value: str | None, type_hint: str | None = None) -
             return None
         if type_hint:
             type_node = self._parse_type_hint(type_hint)
-            return self._typed_converter.convert(value, type_node)
+            result = self._typed_converter.convert(value, type_node)
+            if result is not None:
+                return result
+            # Typed conversion returned None — this means a parse failure
+            # (actual SQL NULLs are caught by the `value is None` check above).
+            # Fall back to untyped conversion to avoid silent data loss.
+            return self.get(type_)(value)
         converter = self.get(type_)
         return converter(value)
 
     def _parse_type_hint(self, type_hint: str) -> TypeNode:
         """Parse a type hint string into a TypeNode, with caching.
 
+        Normalizes Hive-style syntax (``array<int>``) to Trino-style
+        (``array(int)``) before parsing, so both spellings of a hint share
+        the same cache entry; the ``int`` alias is resolved by the parser.
+
         Args:
             type_hint: Athena DDL type signature string.
 
         Returns:
             Parsed TypeNode.
         """
-        if type_hint not in self._parsed_hints:
-            self._parsed_hints[type_hint] = self._parser.parse(type_hint)
-        return self._parsed_hints[type_hint]
+        normalized = _normalize_hive_syntax(type_hint)
+        if normalized not in self._parsed_hints:
+            self._parsed_hints[normalized] = self._parser.parse(normalized)
+        return self._parsed_hints[normalized]
diff --git a/pyathena/cursor.py b/pyathena/cursor.py
@@ -95,7 +95,7 @@ def execute(
         result_reuse_minutes: int | None = None,
         paramstyle: str | None = None,
         on_start_query_execution: Callable[[str], None] | None = None,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         **kwargs,
     ) -> Cursor:
         """Execute a SQL query.
diff --git a/pyathena/pandas/async_cursor.py b/pyathena/pandas/async_cursor.py
@@ -118,7 +118,7 @@ def arraysize(self, value: int) -> None:
     def _collect_result_set(
         self,
         query_id: str,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         keep_default_na: bool = False,
         na_values: Iterable[str] | None = ("",),
         quoting: int = 1,
@@ -156,7 +156,7 @@ def execute(
         result_reuse_enable: bool | None = None,
         result_reuse_minutes: int | None = None,
         paramstyle: str | None = None,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         keep_default_na: bool = False,
         na_values: Iterable[str] | None = ("",),
         quoting: int = 1,
diff --git a/pyathena/pandas/cursor.py b/pyathena/pandas/cursor.py
@@ -153,7 +153,7 @@ def execute(
         na_values: Iterable[str] | None = ("",),
         quoting: int = 1,
         on_start_query_execution: Callable[[str], None] | None = None,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         **kwargs,
     ) -> PandasCursor:
         """Execute a SQL query and return results as pandas DataFrames.
diff --git a/pyathena/pandas/result_set.py b/pyathena/pandas/result_set.py
@@ -229,7 +229,7 @@ def __init__(
         cache_type: str | None = None,
         max_workers: int = (cpu_count() or 1) * 5,
         auto_optimize_chunksize: bool = False,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         **kwargs,
     ) -> None:
         """Initialize AthenaPandasResultSet with pandas-specific configurations.
diff --git a/pyathena/parser.py b/pyathena/parser.py
@@ -1,10 +1,38 @@
 from __future__ import annotations
 
 import json
+import re
 from collections.abc import Callable
 from dataclasses import dataclass, field
 from typing import Any
 
+# Aliases for Athena type names that differ between Hive DDL and Trino DDL.
+_TYPE_ALIASES: dict[str, str] = {
+    "int": "integer",
+}
+
+# Pattern for normalizing Hive-style type signatures to Trino-style.
+# Matches angle brackets and colons used in Hive DDL (e.g., array<struct<a:int>>).
+_HIVE_SYNTAX_RE: re.Pattern[str] = re.compile(r"[<>:]")
+_HIVE_REPLACEMENTS: dict[str, str] = {"<": "(", ">": ")", ":": " "}
+
+
+def _normalize_hive_syntax(type_str: str) -> str:
+    """Normalize Hive-style DDL syntax to Trino-style.
+
+    Converts angle-bracket notation (``array<struct<a:int>>``) to
+    parenthesized notation (``array(struct(a int))``).
+
+    Args:
+        type_str: Type signature string, possibly using Hive syntax.
+
+    Returns:
+        Normalized type signature using Trino-style parenthesized notation.
+    """
+    if "<" not in type_str:
+        return type_str
+    return _HIVE_SYNTAX_RE.sub(lambda m: _HIVE_REPLACEMENTS[m.group()], type_str)
+
 
 def _split_array_items(inner: str) -> list[str]:
     """Split array items by comma, respecting brace and bracket groupings.
@@ -96,9 +124,11 @@ def parse(self, type_str: str) -> TypeNode:
 
         paren_idx = type_str.find("(")
         if paren_idx == -1:
-            return TypeNode(type_name=type_str.lower())
+            name = type_str.lower()
+            return TypeNode(type_name=_TYPE_ALIASES.get(name, name))
 
         type_name = type_str[:paren_idx].strip().lower()
+        type_name = _TYPE_ALIASES.get(type_name, type_name)
 
         inner = type_str[paren_idx + 1 : -1].strip()
 
diff --git a/pyathena/polars/async_cursor.py b/pyathena/polars/async_cursor.py
@@ -161,7 +161,7 @@ def arraysize(self, value: int) -> None:
     def _collect_result_set(
         self,
         query_id: str,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         unload_location: str | None = None,
         kwargs: dict[str, Any] | None = None,
     ) -> AthenaPolarsResultSet:
@@ -195,7 +195,7 @@ def execute(
         result_reuse_enable: bool | None = None,
         result_reuse_minutes: int | None = None,
         paramstyle: str | None = None,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         **kwargs,
     ) -> tuple[str, Future[AthenaPolarsResultSet | Any]]:
         """Execute a SQL query asynchronously and return results as Polars DataFrames.
diff --git a/pyathena/polars/cursor.py b/pyathena/polars/cursor.py
@@ -157,7 +157,7 @@ def execute(
         result_reuse_minutes: int | None = None,
         paramstyle: str | None = None,
         on_start_query_execution: Callable[[str], None] | None = None,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         **kwargs,
     ) -> PolarsCursor:
         """Execute a SQL query and return results as Polars DataFrames.
diff --git a/pyathena/polars/result_set.py b/pyathena/polars/result_set.py
@@ -202,7 +202,7 @@ def __init__(
         cache_type: str | None = None,
         max_workers: int = (cpu_count() or 1) * 5,
         chunksize: int | None = None,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
         **kwargs,
     ) -> None:
         """Initialize the Polars result set.
diff --git a/pyathena/result_set.py b/pyathena/result_set.py
@@ -65,7 +65,7 @@ def __init__(
         arraysize: int,
         retry_config: RetryConfig,
         _pre_fetch: bool = True,
-        result_set_type_hints: dict[str, str] | None = None,
+        result_set_type_hints: dict[str | int, str] | None = None,
     ) -> None:
         super().__init__(arraysize=arraysize)
         self._connection: Connection[Any] | None = connection
@@ -74,11 +74,14 @@ def __init__(
         if not self._query_execution:
             raise ProgrammingError("Required argument `query_execution` not found.")
         self._retry_config = retry_config
-        self._result_set_type_hints = (
-            {k.lower(): v for k, v in result_set_type_hints.items()}
-            if result_set_type_hints
-            else None
-        )
+        self._hints_by_name: dict[str, str] = {}
+        self._hints_by_index: dict[int, str] = {}
+        if result_set_type_hints:
+            for k, v in result_set_type_hints.items():
+                if isinstance(k, int):
+                    self._hints_by_index[k] = v
+                else:
+                    self._hints_by_name[k.lower()] = v
         self._client = connection.session.client(
             "s3",
             region_name=connection.region_name,
@@ -433,18 +436,40 @@ def _process_metadata(self, response: dict[str, Any]) -> None:
         self._metadata = tuple(column_info)
         self._column_types = tuple(m.get("Type", "") for m in self._metadata)
         self._column_names = tuple(m.get("Name", "") for m in self._metadata)
-        if self._result_set_type_hints and any(
+        if (self._hints_by_name or self._hints_by_index) and any(
             t.lower() in self._COMPLEX_TYPES for t in self._column_types
         ):
             hints = tuple(
-                self._result_set_type_hints.get(m.get("Name", "").lower())
-                if t.lower() in self._COMPLEX_TYPES
-                else None
-                for m, t in zip(self._metadata, self._column_types, strict=True)
+                self._resolve_type_hint(i, m.get("Name", "").lower(), t.lower())
+                for i, (m, t) in enumerate(zip(self._metadata, self._column_types, strict=True))
             )
             if any(hints):
                 self._column_type_hints = hints
 
+    def _resolve_type_hint(
+        self, index: int, col_name_lower: str, col_type_lower: str
+    ) -> str | None:
+        """Look up the type hint for a column by index then by name.
+
+        Index-based hints take priority over name-based hints, allowing
+        callers to disambiguate duplicate column names.
+
+        Args:
+            index: Zero-based column position.
+            col_name_lower: Lowercased column name from metadata.
+            col_type_lower: Lowercased column type from metadata.
+
+        Returns:
+            The type hint string, or None if the column has no hint or
+            is not a complex type.
+        """
+        if col_type_lower not in self._COMPLEX_TYPES:
+            return None
+        hint = self._hints_by_index.get(index)
+        if hint is not None:
+            return hint
+        return self._hints_by_name.get(col_name_lower)
+
     def _process_update_count(self, response: dict[str, Any]) -> None:
         update_count = response.get("UpdateCount")
         if (
diff --git a/pyathena/s3fs/async_cursor.py b/pyathena/s3fs/async_cursor.py
diff --git a/pyathena/s3fs/cursor.py b/pyathena/s3fs/cursor.py
diff --git a/pyathena/s3fs/result_set.py b/pyathena/s3fs/result_set.py
diff --git a/tests/pyathena/test_converter.py b/tests/pyathena/test_converter.py
diff --git a/tests/pyathena/test_parser.py b/tests/pyathena/test_parser.py