
Handle int64 columns with missing data in SQL Lab #8226

Merged: 9 commits, Sep 17, 2019
42 changes: 36 additions & 6 deletions superset/dataframe.py
@@ -62,6 +62,12 @@ def dedup(l, suffix="__", case_sensitive=True):
     return new_l
 
 
+def is_numeric(dtype):
+    if hasattr(dtype, "_is_numeric"):
+        return dtype._is_numeric
+    return np.issubdtype(dtype, np.number)
+
+
 class SupersetDataFrame(object):
     # Mapping numpy dtype.char to generic database types
     type_map = {
@@ -80,21 +86,45 @@ class SupersetDataFrame(object):
     }
 
     def __init__(self, data, cursor_description, db_engine_spec):
-        data = data or []
-
         column_names = []
+        dtype = None
         if cursor_description:
-            column_names = [col[0] for col in cursor_description]
+            # get deduped list of column names
+            column_names = dedup([col[0] for col in cursor_description])
 
-        self.column_names = dedup(column_names)
+            # fix cursor descriptor with the deduped names
+            cursor_description = [
+                tuple([column_name, *list(description)[1:]])
+                for column_name, description in zip(column_names, cursor_description)
+            ]
 
-        self.df = pd.DataFrame(list(data), columns=self.column_names).infer_objects()
+            # get type for better type casting, if possible
+            dtype = db_engine_spec.get_pandas_dtype(cursor_description)
+
+        self.column_names = column_names
 
+        data = data or []
+        if dtype:
+            # convert each column in data into a Series of the proper dtype; we
+            # need to do this because we can not specify a mixed dtype when
+            # instantiating the DataFrame, and this allows us to have different
+            # dtypes for each column.
+            array = np.array(data)
+            data = {
+                column: pd.Series(array[:, i], dtype=dtype[column])
+                for i, column in enumerate(column_names)
+            }
+            self.df = pd.DataFrame(data, columns=column_names)
+        else:
+            self.df = pd.DataFrame(list(data), columns=column_names).infer_objects()
 
         self._type_dict = {}
         try:
             # The driver may not be passing a cursor.description
             self._type_dict = {
                 col: db_engine_spec.get_datatype(cursor_description[i][1])
-                for i, col in enumerate(self.column_names)
+                for i, col in enumerate(column_names)
                 if cursor_description
             }
         except Exception as e:
@@ -183,7 +213,7 @@ def agg_func(cls, dtype, column_name):
         if (
             hasattr(dtype, "type")
             and issubclass(dtype.type, np.generic)
-            and np.issubdtype(dtype, np.number)
+            and is_numeric(dtype)
         ):
             return "sum"
         return None
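
For context (not part of the diff): the two pandas behaviours the change above works around can be sketched roughly as follows, assuming pandas >= 0.24 for the nullable "Int64" extension dtype. The column names and values here are made up for illustration.

import numpy as np
import pandas as pd

# 1. np.issubdtype does not understand pandas extension dtypes, which is why
#    the new is_numeric() helper checks the dtype's _is_numeric attribute first
assert pd.Int64Dtype()._is_numeric
try:
    np.issubdtype(pd.Int64Dtype(), np.number)
except TypeError:
    pass  # extension dtypes generally cannot be interpreted as numpy dtypes

# 2. pd.DataFrame(..., dtype=...) accepts only a single dtype, so a result set
#    with per-column dtypes is built column by column as Series and assembled
#    into a DataFrame, mirroring what the new __init__ branch does
rows = [(None, "alice"), (10, "bob")]          # hypothetical query result
columns = ["user_id", "username"]
dtype = {"user_id": "Int64", "username": "object"}

array = np.array(rows)  # object array; None keeps numpy from stringifying ints
data = {
    column: pd.Series(array[:, i], dtype=dtype[column])
    for i, column in enumerate(columns)
}
df = pd.DataFrame(data, columns=columns)
assert str(df.dtypes["user_id"]) == "Int64"  # NULL preserved, ints stay exact
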
6 changes: 6 additions & 0 deletions superset/db_engine_specs/base.py
@@ -275,6 +275,12 @@ def get_datatype(cls, type_code: Any) -> Optional[str]:
             return type_code.upper()
         return None
 
+    @classmethod
+    def get_pandas_dtype(
+        cls, cursor_description: List[tuple]
+    ) -> Optional[Dict[str, str]]:
+        return None
+
     @classmethod
     def extra_table_metadata(
         cls, database, table_name: str, schema_name: str
21 changes: 21 additions & 0 deletions superset/db_engine_specs/presto.py
@@ -39,6 +39,21 @@

 QueryStatus = utils.QueryStatus
 
+# map between Presto types and Pandas
+pandas_dtype_map = {
+    "boolean": "bool",
+    "tinyint": "Int64",  # note: capital "I" means nullable int
+    "smallint": "Int64",
+    "integer": "Int64",
+    "bigint": "Int64",
+    "real": "float64",
+    "double": "float64",
+    "varchar": "object",
+    "timestamp": "datetime64",
+    "date": "datetime64",
+    "varbinary": "object",
+}
+
 
 class PrestoEngineSpec(BaseEngineSpec):
     engine = "presto"
@@ -1052,3 +1067,9 @@ def latest_sub_partition(cls, table_name, schema, database, **kwargs):
         if df.empty:
             return ""
         return df.to_dict()[field_to_return][0]
+
+    @classmethod
+    def get_pandas_dtype(cls, cursor_description: List[tuple]) -> Dict[str, str]:
+        return {
+            col[0]: pandas_dtype_map.get(col[1], "object") for col in cursor_description
+        }
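
A rough usage sketch of the new hook; the cursor description rows below are invented, but follow the (name, type, ...) tuple shape that DB-API cursors return.

from superset.db_engine_specs.presto import PrestoEngineSpec

cursor_description = [
    ("user_id", "bigint", None, None, None, None, True),
    ("email", "varchar", None, None, None, None, True),
    ("score", "double", None, None, None, None, True),
    ("payload", "row", None, None, None, None, True),  # not in the map
]
dtype = PrestoEngineSpec.get_pandas_dtype(cursor_description)
# {'user_id': 'Int64', 'email': 'object', 'score': 'float64', 'payload': 'object'}
# "Int64" (capital I) is pandas' nullable integer type, so a bigint column with
# NULLs is no longer coerced to float64; unmapped Presto types fall back to
# "object".
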
21 changes: 21 additions & 0 deletions tests/dataframe_test.py
@@ -18,6 +18,7 @@

 from superset.dataframe import dedup, SupersetDataFrame
 from superset.db_engine_specs import BaseEngineSpec
+from superset.db_engine_specs.presto import PrestoEngineSpec
 from .base_tests import SupersetTestCase
 
 
@@ -108,3 +109,23 @@ def test_dedup_with_data(self):
         cursor_descr = (("a", "string"), ("a", "string"))
         cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
         self.assertListEqual(cdf.column_names, ["a", "a__1"])
+
+    def test_int64_with_missing_data(self):
+        data = [(None,), (1239162456494753670,), (None,), (None,), (None,), (None,)]
+        cursor_descr = [("user_id", "bigint", None, None, None, None, True)]
+
+        # the base engine spec does not provide a dtype based on the cursor
+        # description, so the column is inferred as float64 because of the
+        # missing data
+        cdf = SupersetDataFrame(data, cursor_descr, BaseEngineSpec)
+        np.testing.assert_array_equal(
+            cdf.raw_df.values.tolist(),
+            [[np.nan], [1.2391624564947538e18], [np.nan], [np.nan], [np.nan], [np.nan]],
+        )
+
+        # currently only Presto provides a dtype based on the cursor description
+        cdf = SupersetDataFrame(data, cursor_descr, PrestoEngineSpec)
+        np.testing.assert_array_equal(
+            cdf.raw_df.values.tolist(),
+            [[np.nan], [1239162456494753670], [np.nan], [np.nan], [np.nan], [np.nan]],
+        )
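
For reference, the precision loss the first assertion encodes can be reproduced with plain pandas; this is a sketch outside the Superset test suite.

import pandas as pd

value = 1239162456494753670

# float64 has a 53-bit mantissa, so values this large are rounded to the
# nearest representable double and the low-order digits are lost
as_float = pd.Series([None, value]).astype("float64")
assert int(as_float[1]) != value  # becomes 1239162456494753792

# the nullable "Int64" dtype keeps the integer exact and the NULL as a NULL
as_int64 = pd.Series([None, value], dtype="Int64")
assert as_int64[1] == value
assert pd.isna(as_int64[0])
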