Skip to content

Commit b9be01f

Browse files
Handle int64 columns with missing data in SQL Lab (#8226)
* Handle int64 columns with missing data in SQL Lab
* Fix docstring
* Add unit test
* Small fix
* Small fixes
* Fix cursor description update
* Better fix
* Fix unit test, black
* Fix nan comparison in unit test
1 parent 1ad1793 commit b9be01f

File tree

4 files changed

+84
-6
lines changed

4 files changed

+84
-6
lines changed

superset/dataframe.py

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,12 @@ def dedup(l, suffix="__", case_sensitive=True):
6262
return new_l
6363

6464

65+
def is_numeric(dtype):
66+
if hasattr(dtype, "_is_numeric"):
67+
return dtype._is_numeric
68+
return np.issubdtype(dtype, np.number)
69+
70+
6571
class SupersetDataFrame(object):
6672
# Mapping numpy dtype.char to generic database types
6773
type_map = {
@@ -80,21 +86,45 @@ class SupersetDataFrame(object):
8086
}
8187

8288
def __init__(self, data, cursor_description, db_engine_spec):
89+
data = data or []
90+
8391
column_names = []
92+
dtype = None
8493
if cursor_description:
85-
column_names = [col[0] for col in cursor_description]
94+
# get deduped list of column names
95+
column_names = dedup([col[0] for col in cursor_description])
8696

87-
self.column_names = dedup(column_names)
97+
# fix cursor descriptor with the deduped names
98+
cursor_description = [
99+
tuple([column_name, *list(description)[1:]])
100+
for column_name, description in zip(column_names, cursor_description)
101+
]
88102

89-
data = data or []
90-
self.df = pd.DataFrame(list(data), columns=self.column_names).infer_objects()
103+
# get type for better type casting, if possible
104+
dtype = db_engine_spec.get_pandas_dtype(cursor_description)
105+
106+
self.column_names = column_names
107+
108+
if dtype:
109+
# convert each column in data into a Series of the proper dtype; we
110+
# need to do this because we can not specify a mixed dtype when
111+
# instantiating the DataFrame, and this allows us to have different
112+
# dtypes for each column.
113+
array = np.array(data)
114+
data = {
115+
column: pd.Series(array[:, i], dtype=dtype[column])
116+
for i, column in enumerate(column_names)
117+
}
118+
self.df = pd.DataFrame(data, columns=column_names)
119+
else:
120+
self.df = pd.DataFrame(list(data), columns=column_names).infer_objects()
91121

92122
self._type_dict = {}
93123
try:
94124
# The driver may not be passing a cursor.description
95125
self._type_dict = {
96126
col: db_engine_spec.get_datatype(cursor_description[i][1])
97-
for i, col in enumerate(self.column_names)
127+
for i, col in enumerate(column_names)
98128
if cursor_description
99129
}
100130
except Exception as e:
@@ -183,7 +213,7 @@ def agg_func(cls, dtype, column_name):
183213
if (
184214
hasattr(dtype, "type")
185215
and issubclass(dtype.type, np.generic)
186-
and np.issubdtype(dtype, np.number)
216+
and is_numeric(dtype)
187217
):
188218
return "sum"
189219
return None

superset/db_engine_specs/base.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,12 @@ def get_datatype(cls, type_code: Any) -> Optional[str]:
275275
return type_code.upper()
276276
return None
277277

278+
@classmethod
279+
def get_pandas_dtype(
280+
cls, cursor_description: List[tuple]
281+
) -> Optional[Dict[str, str]]:
282+
return None
283+
278284
@classmethod
279285
def extra_table_metadata(
280286
cls, database, table_name: str, schema_name: str

superset/db_engine_specs/presto.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,21 @@
3939

4040
QueryStatus = utils.QueryStatus
4141

42+
# map between Presto types and Pandas
43+
pandas_dtype_map = {
44+
"boolean": "bool",
45+
"tinyint": "Int64", # note: capital "I" means nullable int
46+
"smallint": "Int64",
47+
"integer": "Int64",
48+
"bigint": "Int64",
49+
"real": "float64",
50+
"double": "float64",
51+
"varchar": "object",
52+
"timestamp": "datetime64",
53+
"date": "datetime64",
54+
"varbinary": "object",
55+
}
56+
4257

4358
class PrestoEngineSpec(BaseEngineSpec):
4459
engine = "presto"
@@ -1052,3 +1067,9 @@ def latest_sub_partition(cls, table_name, schema, database, **kwargs):
10521067
if df.empty:
10531068
return ""
10541069
return df.to_dict()[field_to_return][0]
1070+
1071+
@classmethod
1072+
def get_pandas_dtype(cls, cursor_description: List[tuple]) -> Dict[str, str]:
1073+
return {
1074+
col[0]: pandas_dtype_map.get(col[1], "object") for col in cursor_description
1075+
}

tests/dataframe_test.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from superset.dataframe import dedup, SupersetDataFrame
2020
from superset.db_engine_specs import BaseEngineSpec
21+
from superset.db_engine_specs.presto import PrestoEngineSpec
2122
from .base_tests import SupersetTestCase
2223

2324

@@ -108,3 +109,23 @@ def test_dedup_with_data(self):
108109
cursor_descr = (("a", "string"), ("a", "string"))
109110
cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
110111
self.assertListEqual(cdf.column_names, ["a", "a__1"])
112+
113+
def test_int64_with_missing_data(self):
114+
data = [(None,), (1239162456494753670,), (None,), (None,), (None,), (None,)]
115+
cursor_descr = [("user_id", "bigint", None, None, None, None, True)]
116+
117+
# the base engine spec does not provide a dtype based on the cursor
118+
# description, so the column is inferred as float64 because of the
119+
# missing data
120+
cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
121+
np.testing.assert_array_equal(
122+
cdf.raw_df.values.tolist(),
123+
[[np.nan], [1.2391624564947538e18], [np.nan], [np.nan], [np.nan], [np.nan]],
124+
)
125+
126+
# currently only Presto provides a dtype based on the cursor description
127+
cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
128+
np.testing.assert_array_equal(
129+
cdf.raw_df.values.tolist(),
130+
[[np.nan], [1239162456494753670], [np.nan], [np.nan], [np.nan], [np.nan]],
131+
)

0 commit comments

Comments
 (0)