Skip to content

Commit 15e4e8d

Browse files
fix(utils): Suppress pandas date parsing warnings in normalize_dttm_col (#35042)
1 parent c5f220a commit 15e4e8d

File tree

3 files changed

+385
-32
lines changed

3 files changed

+385
-32
lines changed

superset/utils/core.py

Lines changed: 60 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import threading
3535
import traceback
3636
import uuid
37+
import warnings
3738
import zlib
3839
from collections.abc import Iterable, Iterator, Sequence
3940
from contextlib import closing, contextmanager
@@ -110,6 +111,7 @@
110111
from superset.utils.database import get_example_database
111112
from superset.utils.date_parser import parse_human_timedelta
112113
from superset.utils.hashing import md5_sha_from_dict, md5_sha_from_str
114+
from superset.utils.pandas import detect_datetime_format
113115

114116
if TYPE_CHECKING:
115117
from superset.connectors.sqla.models import BaseDatasource, TableColumn
@@ -1858,6 +1860,62 @@ def get_legacy_time_column(
18581860
)
18591861

18601862

1863+
def _process_datetime_column(
    df: pd.DataFrame,
    col: DateColumn,
) -> None:
    """
    Convert one column of ``df`` to datetime values, in place.

    The conversion strategy depends on ``col.timestamp_format``:

    * ``"epoch_s"`` / ``"epoch_ms"`` with a numeric column: the values are
      interpreted as offsets from the unix origin in the matching unit.
    * ``"epoch_s"`` / ``"epoch_ms"`` with a non-numeric column: each non-null
      value is passed through ``pd.Timestamp``; a ``ValueError`` is logged and
      the column is left untouched.
    * anything else: the explicit format is used when set, otherwise
      ``detect_datetime_format`` is asked for one. If neither yields a format,
      pandas parses without a format and its "Could not infer format" warning
      is silenced.

    :param df: DataFrame holding the column; modified in place
    :param col: Column descriptor providing ``col_label`` and
        ``timestamp_format``
    """
    label = col.col_label
    declared_format = col.timestamp_format

    if declared_format in ("epoch_s", "epoch_ms"):
        raw_values = df[label]

        if not is_numeric_dtype(raw_values):
            # Non-numeric epoch column: the values are treated as already
            # being timestamp-like, so normalize them one by one.
            try:
                df[label] = raw_values.apply(
                    lambda value: pd.Timestamp(value) if pd.notna(value) else pd.NaT
                )
            except ValueError:
                logger.warning(
                    "Unable to convert column %s to datetime, ignoring",
                    label,
                )
            return

        # Numeric epoch column: "epoch_s" -> "s", "epoch_ms" -> "ms".
        epoch_unit = declared_format.replace("epoch_", "")
        df[label] = pd.to_datetime(
            raw_values,
            utc=False,
            unit=epoch_unit,
            origin="unix",
            errors="coerce",
            exact=False,
        )
        return

    # No epoch format: fall back to detection when no format was declared.
    resolved_format = declared_format or detect_datetime_format(df[label])

    if resolved_format:
        df[label] = pd.to_datetime(
            df[label],
            utc=False,
            format=resolved_format,
            errors="coerce",
            exact=False,
        )
        return

    # No usable format at all: let pandas parse without one, and keep its
    # "Could not infer format" warning out of the logs.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message=".*Could not infer format.*")
        df[label] = pd.to_datetime(
            df[label],
            utc=False,
            format=None,
            errors="coerce",
            exact=False,
        )
1917+
1918+
18611919
def normalize_dttm_col(
18621920
df: pd.DataFrame,
18631921
dttm_cols: tuple[DateColumn, ...] = tuple(), # noqa: C408
@@ -1866,38 +1924,8 @@ def normalize_dttm_col(
18661924
if _col.col_label not in df.columns:
18671925
continue
18681926

1869-
if _col.timestamp_format in ("epoch_s", "epoch_ms"):
1870-
dttm_series = df[_col.col_label]
1871-
if is_numeric_dtype(dttm_series):
1872-
# Column is formatted as a numeric value
1873-
unit = _col.timestamp_format.replace("epoch_", "")
1874-
df[_col.col_label] = pd.to_datetime(
1875-
dttm_series,
1876-
utc=False,
1877-
unit=unit,
1878-
origin="unix",
1879-
errors="coerce",
1880-
exact=False,
1881-
)
1882-
else:
1883-
# Column has already been formatted as a timestamp.
1884-
try:
1885-
df[_col.col_label] = dttm_series.apply(
1886-
lambda x: pd.Timestamp(x) if pd.notna(x) else pd.NaT
1887-
)
1888-
except ValueError:
1889-
logger.warning(
1890-
"Unable to convert column %s to datetime, ignoring",
1891-
_col.col_label,
1892-
)
1893-
else:
1894-
df[_col.col_label] = pd.to_datetime(
1895-
df[_col.col_label],
1896-
utc=False,
1897-
format=_col.timestamp_format,
1898-
errors="coerce",
1899-
exact=False,
1900-
)
1927+
_process_datetime_column(df, _col)
1928+
19011929
if _col.offset:
19021930
df[_col.col_label] += timedelta(hours=_col.offset)
19031931
if _col.time_shift is not None:

superset/utils/pandas.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
"""Pandas utilities for data processing."""
18+
19+
import pandas as pd
20+
21+
22+
def detect_datetime_format(series: pd.Series, sample_size: int = 100) -> str | None:
23+
"""
24+
Detect the datetime format from a sample of the series.
25+
26+
:param series: The pandas Series to analyze
27+
:param sample_size: Number of rows to sample for format detection
28+
:return: Detected format string or None if no consistent format found
29+
"""
30+
# Most common formats first for performance
31+
common_formats = [
32+
"%Y-%m-%d %H:%M:%S",
33+
"%Y-%m-%d",
34+
"%Y-%m-%dT%H:%M:%S",
35+
"%Y-%m-%dT%H:%M:%SZ",
36+
"%Y-%m-%dT%H:%M:%S.%f",
37+
"%Y-%m-%dT%H:%M:%S.%fZ",
38+
"%m/%d/%Y",
39+
"%d/%m/%Y",
40+
"%Y/%m/%d",
41+
"%m/%d/%Y %H:%M:%S",
42+
"%d/%m/%Y %H:%M:%S",
43+
"%m-%d-%Y",
44+
"%d-%m-%Y",
45+
"%Y%m%d",
46+
]
47+
48+
# Get non-null sample
49+
sample = series.dropna().head(sample_size)
50+
if sample.empty:
51+
return None
52+
53+
# Convert to string if not already
54+
if not pd.api.types.is_string_dtype(sample):
55+
sample = sample.astype(str)
56+
57+
# Try each format
58+
for fmt in common_formats:
59+
try:
60+
# Test on small sample first
61+
test_sample = sample.head(10)
62+
pd.to_datetime(test_sample, format=fmt, errors="raise")
63+
# If successful, verify on larger sample
64+
pd.to_datetime(sample, format=fmt, errors="raise")
65+
return fmt
66+
except (ValueError, TypeError):
67+
continue
68+
69+
return None

0 commit comments

Comments
 (0)