Skip to content
73 changes: 41 additions & 32 deletions python/perspective/perspective/core/data/np.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#
import six
import numpy as np
import pandas as pd
from datetime import datetime

DATE_DTYPES = [np.dtype("datetime64[D]"), np.dtype("datetime64[W]"), np.dtype("datetime64[M]"), np.dtype("datetime64[Y]")]
Expand All @@ -22,45 +21,55 @@ def deconstruct_numpy(array):
Returns:
dict : `array` is the original array, and `mask` is an array of booleans where `True` represents a nan/None value.
'''
is_object_or_string = pd.api.types.is_object_dtype(array.dtype) or pd.api.types.is_string_dtype(array.dtype)
mask = []

# use `isnull` or `isnan` depending on dtype
if is_object_or_string or six.PY2:
# python3 masked_invalid compares datetimes, but not in python2
data = array
mask = np.argwhere(pd.isnull(array)).flatten()
else:
masked = np.ma.masked_invalid(array)
data = masked.data
mask = np.argwhere(masked.mask).flatten()
is_object_or_string_dtype = np.issubdtype(array.dtype, np.str_) or\
np.issubdtype(array.dtype, np.object_)

if data.dtype == bool or data.dtype == "?":
# bool => byte
data = data.astype("b", copy=False)
elif np.issubdtype(data.dtype, np.datetime64):
if six.PY2:
is_object_or_string_dtype = is_object_or_string_dtype or np.issubdtype(array.dtype, np.unicode_)

is_datetime_dtype = np.issubdtype(array.dtype, np.datetime64) or\
np.issubdtype(array.dtype, np.timedelta64)

for i, item in enumerate(array):
invalid = item is None

if not is_object_or_string_dtype:
if is_datetime_dtype:
invalid = invalid or np.isnat(item)
else:
invalid = invalid or np.isnan(item)

if invalid:
mask.append(i)

if array.dtype == bool or array.dtype == "?":
# bool => byte
array = array.astype("b", copy=False)
elif np.issubdtype(array.dtype, np.datetime64):
# treat days/weeks/months/years as datetime objects - avoids idiosyncrasies with days of the month, etc.
if data.dtype in DATE_DTYPES:
data = data.astype(datetime)
if array.dtype in DATE_DTYPES:
array = array.astype(datetime)

# cast datetimes to millisecond timestamps
# because datetime64("nat") is a double, cast to float64 here - C++ handles the rest
if data.dtype == np.dtype("datetime64[us]"):
data = data.astype(np.float64, copy=False) / 1000
elif data.dtype == np.dtype("datetime64[ns]"):
data = data.astype(np.float64, copy=False) / 1000000
elif data.dtype == np.dtype("datetime64[ms]"):
data = data.astype(np.float64, copy=False)
elif data.dtype == np.dtype("datetime64[s]"):
data = data.astype(np.float64, copy=False) * 1000
elif data.dtype == np.dtype("datetime64[m]"):
data = data.astype(np.float64, copy=False) * 60000
elif data.dtype == np.dtype("datetime64[h]"):
data = data.astype(np.float64, copy=False) * 3600000
elif np.issubdtype(data.dtype, np.timedelta64):
data = data.astype(np.float64, copy=False)
if array.dtype == np.dtype("datetime64[us]"):
array = array.astype(np.float64, copy=False) / 1000
elif array.dtype == np.dtype("datetime64[ns]"):
array = array.astype(np.float64, copy=False) / 1000000
elif array.dtype == np.dtype("datetime64[ms]"):
array = array.astype(np.float64, copy=False)
elif array.dtype == np.dtype("datetime64[s]"):
array = array.astype(np.float64, copy=False) * 1000
elif array.dtype == np.dtype("datetime64[m]"):
array = array.astype(np.float64, copy=False) * 60000
elif array.dtype == np.dtype("datetime64[h]"):
array = array.astype(np.float64, copy=False) * 3600000
elif np.issubdtype(array.dtype, np.timedelta64):
array = array.astype(np.float64, copy=False)

return {
"array": data,
"array": array,
"mask": mask
}
45 changes: 42 additions & 3 deletions python/perspective/perspective/core/data/pd.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,49 @@
# This file is part of the Perspective library, distributed under the terms of
# the Apache License 2.0. The full license can be found in the LICENSE file.
#
import numpy as np
import pandas as pd


def _parse_datetime_index(index):
'''Given an instance of `pandas.DatetimeIndex`, parse its `freq` and return a `numpy.dtype`
that corresponds to the unit it should be parsed in.

Because Pandas DataFrames cannot store datetimes in anything other than `datetime64[ns]`, we need
to examine the `DatetimeIndex` itself to understand what unit it needs to be parsed as.

Args:
index (pandas.DatetimeIndex)

Returns:
numpy.dtype : a datetime64 dtype with the correct units depending on `index.freq`.
'''
if index.freq is None:
return np.dtype("datetime64[ns]")

freq = str(index.freq).lower()
new_type = None

if any(s in freq for s in ["businessday", "day"]) or freq == "sm" or freq == "sms":
# days
new_type = "D"
elif freq == "w" or "week" in freq:
# weeks
new_type = "W"
elif any(s in freq for s in ["month", "quarter"]):
# months
new_type = "M"
elif "year" in freq or freq == "a":
new_type = "Y"
else:
# default to datetime
new_type = "ns"

return np.dtype("datetime64[{0}]".format(new_type))


def deconstruct_pandas(data):
'''Remove pivots from the passed-in dataframe.'''
import pandas as pd
kwargs = {}

# level unstacking
Expand Down Expand Up @@ -53,8 +91,9 @@ def deconstruct_pandas(data):
# preserve name from series
flattened.name = data.name

# use explicit index column as primary key
kwargs["index"] = "index"
# make sure all columns are strings
flattened.columns = [str(c) for c in flattened.columns]

data = flattened

return data, kwargs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,11 @@ namespace numpy {
std::vector<std::string> names() const;
std::vector<t_dtype> types() const;
std::uint32_t row_count() const;

/**
* Keep a list of numpy datetime64 units that we should treat as dates and not datetimes.
*/
static const std::vector<std::string> DATE_UNITS;
private:
/**
* When memory cannot be copied (for dtype=object arrays, for example), fill the column through iteration.
Expand Down
2 changes: 1 addition & 1 deletion python/perspective/perspective/src/accessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ infer_type(t_val x, t_val date_validator) {
} else {
t = type_string_to_t_dtype(type_string);
}

return t;
}

Expand Down Expand Up @@ -148,7 +149,6 @@ get_data_types(t_val data, std::int32_t format, std::vector<std::string> names,
if (format == 2) {
py::dict data_dict = data.cast<py::dict>();


for (auto tup : data_dict) {
auto name = tup.first.cast<std::string>();
auto data_type = tup.second.get_type().attr("__name__").cast<std::string>();
Expand Down
30 changes: 21 additions & 9 deletions python/perspective/perspective/src/numpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ using namespace perspective;
namespace perspective {
namespace numpy {

const std::vector<std::string> NumpyLoader::DATE_UNITS = {"[D]", "[W]", "[M]", "[Y]"};

NumpyLoader::NumpyLoader(py::object accessor)
: m_init(false)
, m_has_numeric_dtype(false)
Expand All @@ -41,18 +43,29 @@ namespace numpy {
std::vector<t_dtype> reconciled_types;
std::uint32_t num_columns = m_names.size();

// Get numpy dtypes as string so we can tell the difference between dates and datetimes
std::vector<std::string> str_dtypes = m_accessor.attr("types")().cast<std::vector<std::string>>();

for (auto i = 0; i < num_columns; ++i) {
std::string numpy_type_as_string = str_dtypes[i];
t_dtype numpy_type = m_types[i];
t_dtype inferred_type = inferred_types[i];
switch (numpy_type) {
case DTYPE_OBJECT: {
// inferred type has the correct underlying type for the array
reconciled_types.push_back(inferred_type);
} break;
default: {
reconciled_types.push_back(numpy_type);

// Check whether column is a date or a datetime
if(numpy_type_as_string.find("datetime64") != std::string::npos) {
for (const std::string& unit : DATE_UNITS) {
if (numpy_type_as_string.find(unit) != std::string::npos) {
inferred_type = DTYPE_DATE;
}
}
}

// Otherwise, numpy type takes precedence unless date/object - need specificity of inferred type
if (inferred_type == DTYPE_DATE || numpy_type == DTYPE_OBJECT) {
reconciled_types.push_back(inferred_type);
} else {
reconciled_types.push_back(numpy_type);
}
}

return reconciled_types;
Expand Down Expand Up @@ -149,7 +162,7 @@ namespace numpy {
}

// Datetimes are not trivially copyable - they are float64 values that need to be read as int64
if (type == DTYPE_TIME) {
if (type == DTYPE_TIME || type == DTYPE_DATE) {
fill_column_iter(array, tbl, col, name, np_dtype, type, cidx, is_update);
fill_validity_map(col, mask_ptr, mask_size, is_update);
return;
Expand Down Expand Up @@ -377,7 +390,6 @@ namespace numpy {
continue;
}


auto date_components = item.cast<std::map<std::string, std::int32_t>>();
t_date dt = t_date(date_components["year"], date_components["month"], date_components["day"]);
col->set_nth(i, dt);
Expand Down
4 changes: 2 additions & 2 deletions python/perspective/perspective/src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ t_dtype type_string_to_t_dtype(std::string value, std::string name){
// Python date
// TODO inheritance
type = t_dtype::DTYPE_DATE;
} else if (value == "timedelta64") {
// cast timedelta to string to preserve units
} else if (value == "timedelta64" || value == "time") {
// cast time/timedelta to string to preserve units
type = t_dtype::DTYPE_STR;
} else {
CRITICAL("Unknown type '%s' for key '%s'", value, name);
Expand Down
37 changes: 23 additions & 14 deletions python/perspective/perspective/table/_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
import numpy
from math import isnan
from ._date_validator import _PerspectiveDateValidator
from ..core.data import deconstruct_numpy
from ..core.data import deconstruct_numpy, deconstruct_pandas
from ..core.data.pd import _parse_datetime_index
from ..core.exception import PerspectiveError

try:
Expand All @@ -21,11 +22,8 @@

def _flatten_structure(array):
'''Flatten numpy.recarray or structured arrays into a dict.'''
if six.PY2:
# recarrays/structured arrays have weird bit offsets in py2 - make a copy of the array to fix
columns = [numpy.copy(array[col]) for col in array.dtype.names]
else:
columns = [array[col] for col in array.dtype.names]
# recarrays/structured arrays do not have guaranteed bit offsets - make a copy of the array to fix
columns = [numpy.copy(array[col]) for col in array.dtype.names]
return dict(zip(array.dtype.names, columns))


Expand Down Expand Up @@ -75,14 +73,8 @@ def _type_to_format(data_or_schema):
# if pandas not installed or is not a dataframe or series
raise NotImplementedError("Data must be dataframe, dict, list, numpy.recarray, or a numpy structured array.")
else:
from ..core.data import deconstruct_pandas

# flatten column/index multiindex
df, _ = deconstruct_pandas(data_or_schema)

# try to squash object dtype as much as possible
df.fillna(value=numpy.nan, inplace=True)

return True, 1, {c: df[c].values for c in df.columns}


Expand All @@ -104,6 +96,25 @@ def __init__(self, data_or_schema):

self._types = []

# Verify that column names are strings, and that numpy arrays are of type `ndarray`
for name in self._names:
if not isinstance(name, six.string_types):
raise PerspectiveError(
"Column names should be strings, not type `{0}`".format(type(name).__name__))
if self._is_numpy:
array = self._data_or_schema[name]

if not isinstance(array, numpy.ndarray):
raise PerspectiveError("Mixed datasets of numpy.ndarray and lists are not supported.")

dtype = array.dtype
if name == "index" and isinstance(data_or_schema.index, pandas.DatetimeIndex):
# use the index of the original, unflattened dataframe
dtype = _parse_datetime_index(data_or_schema.index)

# keep a string representation of the dtype, as PyBind only has access to the char dtype code
self._types.append(str(dtype))

def data(self):
return self._data_or_schema

Expand Down Expand Up @@ -216,8 +227,6 @@ def _get_numpy_column(self, name):
data = self._data_or_schema.get(name, None)
if data is None:
raise PerspectiveError("Column `{0}` does not exist.".format(name))
if not isinstance(data, numpy.ndarray):
raise PerspectiveError("Mixed datasets of numpy.ndarray and lists are not supported.")
return deconstruct_numpy(data)

def _has_column(self, ridx, name):
Expand Down
33 changes: 23 additions & 10 deletions python/perspective/perspective/table/_date_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,20 @@
from past.builtins import long


def _normalize_timestamp(obj):
'''Convert a timestamp in seconds to milliseconds.'''
try:
# if it overflows, it's milliseconds - otherwise convert from seconds to milliseconds.
datetime.fromtimestamp(obj)
return int(obj * 1000)
except (ValueError, OverflowError):
# milliseconds
return int(obj)


class _PerspectiveDateValidator(object):
'''Validate and parse dates using the `dateutil` package.'''

EPOCH = datetime(1970, 1, 1)

def parse(self, str):
'''Return a datetime.datetime object containing the parsed date, or None if the date is invalid.

Expand All @@ -53,6 +62,13 @@ def to_date_components(self, obj):
if obj is None:
return obj

if isinstance(obj, (int, float)):
obj = datetime.fromtimestamp(_normalize_timestamp(obj) / 1000)

if six.PY2:
if isinstance(obj, (long)):
obj = datetime.fromtimestamp(long(obj))

if isinstance(obj, numpy.datetime64):
if str(obj) == "NaT":
return None
Expand All @@ -75,6 +91,10 @@ def to_timestamp(self, obj):
if obj is None:
return obj

if obj.__class__.__name__ == "date":
# handle updating datetime with date object
obj = datetime(obj.year, obj.month, obj.day)

if isinstance(obj, Period):
# extract the start of the Period
obj = obj.to_timestamp()
Expand Down Expand Up @@ -104,14 +124,7 @@ def to_timestamp(self, obj):
return round(obj / 1000000)

if isinstance(obj, (int, float)):
# figure out whether the timestamp is in seconds or milliseconds
try:
# if it overflows, it's milliseconds - otherwise convert from seconds to milliseconds.
datetime.fromtimestamp(obj)
return int(obj * 1000)
except (ValueError, OverflowError):
# milliseconds
return int(obj)
return _normalize_timestamp(obj)

# Convert `datetime.datetime` and `pandas.Timestamp` to millisecond timestamps
return int((time.mktime(obj.timetuple()) + obj.microsecond / 1000000.0) * 1000)
Expand Down
Loading