Skip to content

Commit ab33667

Browse files
Add Hive syntax support, type aliases, parse fallback, and index-based hints
- Normalize Hive-style DDL syntax (array<struct<a:int>>) to Trino-style so users can paste DESCRIBE TABLE output directly as type hints
- Resolve type alias "int" to "integer" in the parser
- Fall back to untyped conversion when typed converter returns None, preventing silent data loss on parse failures
- Support integer keys in result_set_type_hints for index-based column resolution, enabling hints for duplicate column names
- Update type annotations across all cursor/result_set files

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 989aab2 commit ab33667

22 files changed

+268
-42
lines changed

docs/usage.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,38 @@ positions = row[0]
441441
# positions["x"] == 4.736 (float, not "4.736")
442442
```
443443

444+
### Hive-style syntax
445+
446+
You can paste type signatures from Hive DDL or ``DESCRIBE TABLE`` output directly.
447+
Hive-style angle brackets and colons are automatically converted to Trino-style syntax:
448+
449+
```python
450+
# Both are equivalent:
451+
result_set_type_hints={"col": "array(struct(a integer, b varchar))"} # Trino
452+
result_set_type_hints={"col": "array<struct<a:int,b:varchar>>"} # Hive
453+
```
454+
455+
The ``int`` alias is also supported and resolves to ``integer``.
456+
457+
### Index-based hints for duplicate column names
458+
459+
When a query produces columns with the same alias (e.g. ``SELECT a AS x, b AS x``),
460+
name-based hints cannot distinguish between them. Use integer keys to specify hints
461+
by zero-based column position:
462+
463+
```python
464+
cursor.execute(
465+
"SELECT a AS x, b AS x FROM my_table",
466+
result_set_type_hints={
467+
0: "array(integer)", # first "x" column
468+
1: "map(varchar, integer)", # second "x" column
469+
},
470+
)
471+
```
472+
473+
Integer (index-based) hints take priority over string (name-based) hints for the same
474+
column. You can mix both styles in the same dictionary.
475+
444476
### Constraints
445477

446478
* **Nested arrays in native format** — Athena's native (non-JSON) string representation

pyathena/aio/cursor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ async def execute( # type: ignore[override]
7979
result_reuse_enable: bool | None = None,
8080
result_reuse_minutes: int | None = None,
8181
paramstyle: str | None = None,
82-
result_set_type_hints: dict[str, str] | None = None,
82+
result_set_type_hints: dict[str | int, str] | None = None,
8383
**kwargs,
8484
) -> AioCursor:
8585
"""Execute a SQL query asynchronously.

pyathena/aio/result_set.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def __init__(
3535
query_execution: AthenaQueryExecution,
3636
arraysize: int,
3737
retry_config: RetryConfig,
38-
result_set_type_hints: dict[str, str] | None = None,
38+
result_set_type_hints: dict[str | int, str] | None = None,
3939
) -> None:
4040
super().__init__(
4141
connection=connection,
@@ -55,7 +55,7 @@ async def create(
5555
query_execution: AthenaQueryExecution,
5656
arraysize: int,
5757
retry_config: RetryConfig,
58-
result_set_type_hints: dict[str, str] | None = None,
58+
result_set_type_hints: dict[str | int, str] | None = None,
5959
) -> AthenaAioResultSet:
6060
"""Async factory method.
6161

pyathena/arrow/async_cursor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ def arraysize(self, value: int) -> None:
149149
def _collect_result_set(
150150
self,
151151
query_id: str,
152-
result_set_type_hints: dict[str, str] | None = None,
152+
result_set_type_hints: dict[str | int, str] | None = None,
153153
unload_location: str | None = None,
154154
kwargs: dict[str, Any] | None = None,
155155
) -> AthenaArrowResultSet:
@@ -181,7 +181,7 @@ def execute(
181181
result_reuse_enable: bool | None = None,
182182
result_reuse_minutes: int | None = None,
183183
paramstyle: str | None = None,
184-
result_set_type_hints: dict[str, str] | None = None,
184+
result_set_type_hints: dict[str | int, str] | None = None,
185185
**kwargs,
186186
) -> tuple[str, Future[AthenaArrowResultSet | Any]]:
187187
operation, unload_location = self._prepare_unload(operation, s3_staging_dir)

pyathena/arrow/cursor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ def execute(
137137
result_reuse_minutes: int | None = None,
138138
paramstyle: str | None = None,
139139
on_start_query_execution: Callable[[str], None] | None = None,
140-
result_set_type_hints: dict[str, str] | None = None,
140+
result_set_type_hints: dict[str | int, str] | None = None,
141141
**kwargs,
142142
) -> ArrowCursor:
143143
"""Execute a SQL query and return results as Apache Arrow Tables.

pyathena/arrow/result_set.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def __init__(
9191
unload_location: str | None = None,
9292
connect_timeout: float | None = None,
9393
request_timeout: float | None = None,
94-
result_set_type_hints: dict[str, str] | None = None,
94+
result_set_type_hints: dict[str | int, str] | None = None,
9595
**kwargs,
9696
) -> None:
9797
super().__init__(

pyathena/async_cursor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def poll(self, query_id: str) -> Future[AthenaQueryExecution]:
147147
def _collect_result_set(
148148
self,
149149
query_id: str,
150-
result_set_type_hints: dict[str, str] | None = None,
150+
result_set_type_hints: dict[str | int, str] | None = None,
151151
) -> AthenaResultSet:
152152
query_execution = cast(AthenaQueryExecution, self._poll(query_id))
153153
return self._result_set_class(
@@ -170,7 +170,7 @@ def execute(
170170
result_reuse_enable: bool | None = None,
171171
result_reuse_minutes: int | None = None,
172172
paramstyle: str | None = None,
173-
result_set_type_hints: dict[str, str] | None = None,
173+
result_set_type_hints: dict[str | int, str] | None = None,
174174
**kwargs,
175175
) -> tuple[str, Future[AthenaResultSet | Any]]:
176176
"""Execute a SQL query asynchronously.

pyathena/converter.py

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,13 @@
1212

1313
from dateutil.tz import gettz
1414

15-
from pyathena.parser import TypedValueConverter, TypeNode, TypeSignatureParser, _split_array_items
15+
from pyathena.parser import (
16+
TypedValueConverter,
17+
TypeNode,
18+
TypeSignatureParser,
19+
_normalize_hive_syntax,
20+
_split_array_items,
21+
)
1622
from pyathena.util import strtobool
1723

1824
_logger = logging.getLogger(__name__)
@@ -559,8 +565,9 @@ def convert(self, type_: str, value: str | None, type_hint: str | None = None) -
559565
"""Convert a string value to the appropriate Python type.
560566
561567
When ``type_hint`` is provided, uses the typed converter for precise
562-
conversion of complex types. Otherwise, uses the standard converter
563-
for the given Athena type.
568+
conversion of complex types. If the typed converter returns ``None``
569+
(indicating a parse failure), falls back to the standard untyped
570+
converter so that data is never silently lost.
564571
565572
Args:
566573
type_: The Athena data type name (e.g., "integer", "varchar", "array").
@@ -575,19 +582,30 @@ def convert(self, type_: str, value: str | None, type_hint: str | None = None) -
575582
return None
576583
if type_hint:
577584
type_node = self._parse_type_hint(type_hint)
578-
return self._typed_converter.convert(value, type_node)
585+
result = self._typed_converter.convert(value, type_node)
586+
if result is not None:
587+
return result
588+
# Typed conversion returned None — this means a parse failure
589+
# (actual SQL NULLs are caught by the `value is None` check above).
590+
# Fall back to untyped conversion to avoid silent data loss.
591+
return self.get(type_)(value)
579592
converter = self.get(type_)
580593
return converter(value)
581594

582595
def _parse_type_hint(self, type_hint: str) -> TypeNode:
583596
"""Parse a type hint string into a TypeNode, with caching.
584597
598+
Normalizes Hive-style syntax (``array<int>``) to Trino-style
599+
(``array(integer)``) before parsing, so both syntaxes share the
600+
same cache entry.
601+
585602
Args:
586603
type_hint: Athena DDL type signature string.
587604
588605
Returns:
589606
Parsed TypeNode.
590607
"""
591-
if type_hint not in self._parsed_hints:
592-
self._parsed_hints[type_hint] = self._parser.parse(type_hint)
593-
return self._parsed_hints[type_hint]
608+
normalized = _normalize_hive_syntax(type_hint)
609+
if normalized not in self._parsed_hints:
610+
self._parsed_hints[normalized] = self._parser.parse(normalized)
611+
return self._parsed_hints[normalized]

pyathena/cursor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def execute(
9595
result_reuse_minutes: int | None = None,
9696
paramstyle: str | None = None,
9797
on_start_query_execution: Callable[[str], None] | None = None,
98-
result_set_type_hints: dict[str, str] | None = None,
98+
result_set_type_hints: dict[str | int, str] | None = None,
9999
**kwargs,
100100
) -> Cursor:
101101
"""Execute a SQL query.

pyathena/pandas/async_cursor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def arraysize(self, value: int) -> None:
118118
def _collect_result_set(
119119
self,
120120
query_id: str,
121-
result_set_type_hints: dict[str, str] | None = None,
121+
result_set_type_hints: dict[str | int, str] | None = None,
122122
keep_default_na: bool = False,
123123
na_values: Iterable[str] | None = ("",),
124124
quoting: int = 1,
@@ -156,7 +156,7 @@ def execute(
156156
result_reuse_enable: bool | None = None,
157157
result_reuse_minutes: int | None = None,
158158
paramstyle: str | None = None,
159-
result_set_type_hints: dict[str, str] | None = None,
159+
result_set_type_hints: dict[str | int, str] | None = None,
160160
keep_default_na: bool = False,
161161
na_values: Iterable[str] | None = ("",),
162162
quoting: int = 1,

0 commit comments

Comments
 (0)