perspective-dev · texodus · Nov 10, 2019 · Nov 7, 2019 · Nov 7, 2019 · Nov 7, 2019
diff --git a/python/perspective/perspective/core/data/np.py b/python/perspective/perspective/core/data/np.py
@@ -7,7 +7,6 @@
 #
 import six
 import numpy as np
-import pandas as pd
 from datetime import datetime
 
 DATE_DTYPES = [np.dtype("datetime64[D]"), np.dtype("datetime64[W]"), np.dtype("datetime64[M]"), np.dtype("datetime64[Y]")]
@@ -22,45 +21,55 @@ def deconstruct_numpy(array):
     Returns:
         dict : `array` is the original array, and `mask` is an array of booleans where `True` represents a nan/None value.
     '''
-    is_object_or_string = pd.api.types.is_object_dtype(array.dtype) or pd.api.types.is_string_dtype(array.dtype)
+    mask = []
 
-    # use `isnull` or `isnan` depending on dtype
-    if is_object_or_string or six.PY2:
-        # python3 masked_invalid compares datetimes, but not in python2
-        data = array
-        mask = np.argwhere(pd.isnull(array)).flatten()
-    else:
-        masked = np.ma.masked_invalid(array)
-        data = masked.data
-        mask = np.argwhere(masked.mask).flatten()
+    is_object_or_string_dtype = np.issubdtype(array.dtype, np.str_) or\
+        np.issubdtype(array.dtype, np.object_)
 
-    if data.dtype == bool or data.dtype == "?":
-        # bool => byte
-        data = data.astype("b", copy=False)
-    elif np.issubdtype(data.dtype, np.datetime64):
+    if six.PY2:
+        is_object_or_string_dtype = is_object_or_string_dtype or np.issubdtype(array.dtype, np.unicode_)
+
+    is_datetime_dtype = np.issubdtype(array.dtype, np.datetime64) or\
+        np.issubdtype(array.dtype, np.timedelta64)
+
+    for i, item in enumerate(array):
+        invalid = item is None
 
+        if not is_object_or_string_dtype:
+            if is_datetime_dtype:
+                invalid = invalid or np.isnat(item)
+            else:
+                invalid = invalid or np.isnan(item)
+
+        if invalid:
+            mask.append(i)
+
+    if array.dtype == bool or array.dtype == "?":
+        # bool => byte
+        array = array.astype("b", copy=False)
+    elif np.issubdtype(array.dtype, np.datetime64):
         # treat days/weeks/months/years as datetime objects - avoid idiosyncracy with days of month, etc.
-        if data.dtype in DATE_DTYPES:
-            data = data.astype(datetime)
+        if array.dtype in DATE_DTYPES:
+            array = array.astype(datetime)
 
         # cast datetimes to millisecond timestamps
         # because datetime64("nat") is a double, cast to float64 here - C++ handles the rest
-        if data.dtype == np.dtype("datetime64[us]"):
-            data = data.astype(np.float64, copy=False) / 1000
-        elif data.dtype == np.dtype("datetime64[ns]"):
-            data = data.astype(np.float64, copy=False) / 1000000
-        elif data.dtype == np.dtype("datetime64[ms]"):
-            data = data.astype(np.float64, copy=False)
-        elif data.dtype == np.dtype("datetime64[s]"):
-            data = data.astype(np.float64, copy=False) * 1000
-        elif data.dtype == np.dtype("datetime64[m]"):
-            data = data.astype(np.float64, copy=False) * 60000
-        elif data.dtype == np.dtype("datetime64[h]"):
-            data = data.astype(np.float64, copy=False) * 3600000
-    elif np.issubdtype(data.dtype, np.timedelta64):
-        data = data.astype(np.float64, copy=False)
+        if array.dtype == np.dtype("datetime64[us]"):
+            array = array.astype(np.float64, copy=False) / 1000
+        elif array.dtype == np.dtype("datetime64[ns]"):
+            array = array.astype(np.float64, copy=False) / 1000000
+        elif array.dtype == np.dtype("datetime64[ms]"):
+            array = array.astype(np.float64, copy=False)
+        elif array.dtype == np.dtype("datetime64[s]"):
+            array = array.astype(np.float64, copy=False) * 1000
+        elif array.dtype == np.dtype("datetime64[m]"):
+            array = array.astype(np.float64, copy=False) * 60000
+        elif array.dtype == np.dtype("datetime64[h]"):
+            array = array.astype(np.float64, copy=False) * 3600000
+    elif np.issubdtype(array.dtype, np.timedelta64):
+        array = array.astype(np.float64, copy=False)
 
     return {
-            "array": data,
+            "array": array,
             "mask": mask
         }
diff --git a/python/perspective/perspective/core/data/pd.py b/python/perspective/perspective/core/data/pd.py
@@ -5,11 +5,49 @@
 # This file is part of the Perspective library, distributed under the terms of
 # the Apache License 2.0.  The full license can be found in the LICENSE file.
 #
+import numpy as np
+import pandas as pd
+
+
+def _parse_datetime_index(index):
+    '''Given an instance of `pandas.DatetimeIndex`, parse its `freq` and return a `numpy.dtype`
+    that corresponds to the unit it should be parsed in.
+
+    Because Pandas DataFrames cannot store datetimes in anything other than `datetime64[ns]`, we need
+    to examine the `DatetimeIndex` itself to understand what unit it needs to be parsed as.
+
+    Args:
+        index (pandas.DatetimeIndex)
+
+    Returns:
+        numpy.dtype : a datetime64 dtype with the correct units depending on `index.freq`.
+    '''
+    if index.freq is None:
+        return np.dtype("datetime64[ns]")
+
+    freq = str(index.freq).lower()
+    # Offsets stringify with their attributes, e.g. `<Week: weekday=6>` or `<YearEnd: month=12>`.
+    # Match weeks/years on the offset name alone (the text before `:`) and test them first, so
+    # `weekday=` and `month=` are not mistaken for a daily or monthly frequency.
+    offset_name = freq.split(":")[0]
+
+    if freq == "w" or "week" in offset_name:
+        new_type = "W"
+    elif freq == "a" or "year" in offset_name:
+        new_type = "Y"
+    elif any(s in freq for s in ["businessday", "day"]) or freq == "sm" or freq == "sms":
+        new_type = "D"
+    elif any(s in freq for s in ["month", "quarter"]):
+        new_type = "M"
+    else:
+        # default to datetime
+        new_type = "ns"
+
+    return np.dtype("datetime64[{0}]".format(new_type))
 
 
 def deconstruct_pandas(data):
     '''Remove pivots from the passed-in dataframe.'''
-    import pandas as pd
     kwargs = {}
 
     # level unstacking
@@ -53,8 +91,9 @@ def deconstruct_pandas(data):
             # preserve name from series
             flattened.name = data.name
 
-        # use explicit index column as primary key
-        kwargs["index"] = "index"
+            # make sure all columns are strings
+            flattened.columns = [str(c) for c in flattened.columns]
+
         data = flattened
 
     return data, kwargs
diff --git a/python/perspective/perspective/include/perspective/python/numpy.h b/python/perspective/perspective/include/perspective/python/numpy.h
@@ -75,6 +75,11 @@ namespace numpy {
             std::vector<std::string> names() const;
             std::vector<t_dtype> types() const;
             std::uint32_t row_count() const;
+
+            /**
+             * Keep a list of numpy datetime64 units that we should treat as dates and not datetimes.
+             */
+            static const std::vector<std::string> DATE_UNITS;
         private:
             /**
              * When memory cannot be copied for dtype=object arrays, for example), fill the column through iteration.

diff --git a/python/perspective/perspective/src/accessor.cpp b/python/perspective/perspective/src/accessor.cpp
@@ -93,6 +93,7 @@ infer_type(t_val x, t_val date_validator) {
     } else {
         t = type_string_to_t_dtype(type_string);
     }
+
     return t;
 }
 
@@ -148,7 +149,6 @@ get_data_types(t_val data, std::int32_t format, std::vector<std::string> names,
     if (format == 2) {
         py::dict data_dict = data.cast<py::dict>();
 
-
         for (auto tup : data_dict) {
             auto name = tup.first.cast<std::string>();
             auto data_type = tup.second.get_type().attr("__name__").cast<std::string>();

diff --git a/python/perspective/perspective/src/numpy.cpp b/python/perspective/perspective/src/numpy.cpp
@@ -15,6 +15,8 @@ using namespace perspective;
 namespace perspective {
 namespace numpy {
 
+    const std::vector<std::string> NumpyLoader::DATE_UNITS = {"[D]", "[W]", "[M]", "[Y]"};
+
     NumpyLoader::NumpyLoader(py::object accessor)
         : m_init(false)
         , m_has_numeric_dtype(false)
@@ -41,18 +43,29 @@ namespace numpy {
         std::vector<t_dtype> reconciled_types;
         std::uint32_t num_columns = m_names.size();
 
+        // Get numpy dtypes as string so we can tell the difference between dates and datetimes
+        std::vector<std::string> str_dtypes = m_accessor.attr("types")().cast<std::vector<std::string>>();
+
         for (auto i = 0; i < num_columns; ++i) {
+            std::string numpy_type_as_string = str_dtypes[i];
             t_dtype numpy_type = m_types[i];
             t_dtype inferred_type = inferred_types[i];
-            switch (numpy_type) {
-                case DTYPE_OBJECT: {
-                    // inferred type has the correct underlying type for the array
-                    reconciled_types.push_back(inferred_type);
-                } break;
-                default: {
-                    reconciled_types.push_back(numpy_type);
+
+            // Check whether column is a date or a datetime
+            if(numpy_type_as_string.find("datetime64") != std::string::npos) {
+                for (const std::string& unit : DATE_UNITS) {
+                    if (numpy_type_as_string.find(unit) != std::string::npos) {
+                        inferred_type = DTYPE_DATE;
+                    }
                 }
             }
+
+            // Otherwise, numpy type takes precedence unless date/object - need specificity of inferred type
+            if (inferred_type == DTYPE_DATE || numpy_type == DTYPE_OBJECT) {
+                reconciled_types.push_back(inferred_type);
+            } else {
+                reconciled_types.push_back(numpy_type);
+            }
         }
 
         return reconciled_types;
@@ -149,7 +162,7 @@ namespace numpy {
         }
 
         // Datetimes are not trivially copyable - they are float64 values that need to be read as int64
-        if (type == DTYPE_TIME) {
+        if (type == DTYPE_TIME || type == DTYPE_DATE) {
             fill_column_iter(array, tbl, col, name, np_dtype, type, cidx, is_update);
             fill_validity_map(col, mask_ptr, mask_size, is_update);
             return;
@@ -377,7 +390,6 @@ namespace numpy {
                 continue;
             }
 
-
             auto date_components = item.cast<std::map<std::string, std::int32_t>>();
             t_date dt = t_date(date_components["year"], date_components["month"], date_components["day"]);
             col->set_nth(i, dt);

diff --git a/python/perspective/perspective/src/utils.cpp b/python/perspective/perspective/src/utils.cpp
@@ -82,8 +82,8 @@ t_dtype type_string_to_t_dtype(std::string value, std::string name){
         // Python date
         // TODO inheritance
         type = t_dtype::DTYPE_DATE;
-    } else if (value == "timedelta64") {
-        // cast timedelta to string to preserve units
+    } else if (value == "timedelta64" || value == "time") {
+        // cast time/timedelta to string to preserve units
         type = t_dtype::DTYPE_STR;
     } else {
         CRITICAL("Unknown type '%s' for key '%s'", value, name);

diff --git a/python/perspective/perspective/table/_accessor.py b/python/perspective/perspective/table/_accessor.py
@@ -10,7 +10,8 @@
 import numpy
 from math import isnan
 from ._date_validator import _PerspectiveDateValidator
-from ..core.data import deconstruct_numpy
+from ..core.data import deconstruct_numpy, deconstruct_pandas
+from ..core.data.pd import _parse_datetime_index
 from ..core.exception import PerspectiveError
 
 try:
@@ -21,11 +22,8 @@
 
 def _flatten_structure(array):
     '''Flatten numpy.recarray or structured arrays into a dict.'''
-    if six.PY2:
-        # recarrays/structured arrays have weird bit offsets in py2 - make a copy of the array to fix
-        columns = [numpy.copy(array[col]) for col in array.dtype.names]
-    else:
-        columns = [array[col] for col in array.dtype.names]
+    # fields of recarrays/structured arrays do not have guaranteed bit offsets, so copy every column (presumably yielding packed, standalone arrays - TODO confirm)
+    columns = [numpy.copy(array[col]) for col in array.dtype.names]
     return dict(zip(array.dtype.names, columns))
 
 
@@ -75,14 +73,8 @@ def _type_to_format(data_or_schema):
             # if pandas not installed or is not a dataframe or series
             raise NotImplementedError("Data must be dataframe, dict, list, numpy.recarray, or a numpy structured array.")
         else:
-            from ..core.data import deconstruct_pandas
-
             # flatten column/index multiindex
             df, _ = deconstruct_pandas(data_or_schema)
-
-            # try to squash object dtype as much as possible
-            df.fillna(value=numpy.nan, inplace=True)
-
             return True, 1, {c: df[c].values for c in df.columns}
 
 
@@ -104,6 +96,25 @@ def __init__(self, data_or_schema):
 
         self._types = []
 
+        # Verify that column names are strings, and that numpy arrays are of type `ndarray`
+        for name in self._names:
+            if not isinstance(name, six.string_types):
+                raise PerspectiveError(
+                    "Column names should be strings, not type `{0}`".format(type(name).__name__))
+            if self._is_numpy:
+                array = self._data_or_schema[name]
+
+                if not isinstance(array, numpy.ndarray):
+                    raise PerspectiveError("Mixed datasets of numpy.ndarray and lists are not supported.")
+
+                dtype = array.dtype
+                if name == "index" and isinstance(data_or_schema.index, pandas.DatetimeIndex):
+                    # use the index of the original, unflattened dataframe
+                    dtype = _parse_datetime_index(data_or_schema.index)
+
+                # keep a string representation of the dtype, as PyBind only has access to the char dtype code
+                self._types.append(str(dtype))
+
     def data(self):
         return self._data_or_schema
 
@@ -216,8 +227,6 @@ def _get_numpy_column(self, name):
         data = self._data_or_schema.get(name, None)
         if data is None:
             raise PerspectiveError("Column `{0}` does not exist.".format(name))
-        if not isinstance(data, numpy.ndarray):
-            raise PerspectiveError("Mixed datasets of numpy.ndarray and lists are not supported.")
         return deconstruct_numpy(data)
 
     def _has_column(self, ridx, name):

diff --git a/python/perspective/perspective/table/_date_validator.py b/python/perspective/perspective/table/_date_validator.py
@@ -22,11 +22,20 @@
     from past.builtins import long
 
 
+def _normalize_timestamp(obj):
+    '''Return `obj`, a numeric timestamp in either seconds or milliseconds, as integer milliseconds.'''
+    try:
+        # a value that `datetime.fromtimestamp` can read as seconds is treated as seconds
+        datetime.fromtimestamp(obj)
+        return int(obj * 1000)
+    except (ValueError, OverflowError, OSError):
+        # out of range for a seconds timestamp (reported as OSError on some platforms) - milliseconds
+        return int(obj)
+
+
 class _PerspectiveDateValidator(object):
     '''Validate and parse dates using the `dateutil` package.'''
 
-    EPOCH = datetime(1970, 1, 1)
-
     def parse(self, str):
         '''Return a datetime.datetime object containing the parsed date, or None if the date is invalid.
 
@@ -53,6 +62,13 @@ def to_date_components(self, obj):
         if obj is None:
             return obj
 
+        if isinstance(obj, (int, float)):
+            obj = datetime.fromtimestamp(_normalize_timestamp(obj) / 1000)
+
+        if six.PY2:
+            if isinstance(obj, (long)):
+                obj = datetime.fromtimestamp(long(obj))
+
         if isinstance(obj, numpy.datetime64):
             if str(obj) == "NaT":
                 return None
@@ -75,6 +91,10 @@ def to_timestamp(self, obj):
         if obj is None:
             return obj
 
+        if obj.__class__.__name__ == "date":
+            # handle updating datetime with date object
+            obj = datetime(obj.year, obj.month, obj.day)
+
         if isinstance(obj, Period):
             # extract the start of the Period
             obj = obj.to_timestamp()
@@ -104,14 +124,7 @@ def to_timestamp(self, obj):
                 return round(obj / 1000000)
 
         if isinstance(obj, (int, float)):
-            # figure out whether the timestamp is in seconds or milliseconds
-            try:
-                # if it overflows, it's milliseconds - otherwise convert from seconds to milliseconds.
-                datetime.fromtimestamp(obj)
-                return int(obj * 1000)
-            except (ValueError, OverflowError):
-                # milliseconds
-                return int(obj)
+            return _normalize_timestamp(obj)
 
         # Convert `datetime.datetime` and `pandas.Timestamp` to millisecond timestamps
         return int((time.mktime(obj.timetuple()) + obj.microsecond / 1000000.0) * 1000)