Skip to content

Commit 83bf961

Browse files
authored
Merge pull request #907 from finos/python-fixes
Fix column ordering in Python, null handling for computed columns
2 parents 880a63c + f2a5080 commit 83bf961

File tree

11 files changed

+1153
-188
lines changed

11 files changed

+1153
-188
lines changed

cpp/perspective/src/cpp/computed_function.cpp

Lines changed: 89 additions & 28 deletions
Large diffs are not rendered by default.

packages/perspective-viewer-hypergrid/src/js/perspective-plugin.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ var Borders = cellRenderersRegistry.BaseClass.extend("Borders", {
2323
var color;
2424

2525
gc.save();
26-
gc.translate(-0.5, 0.5); // paint "sharp" lines on pixels instead of "blury" lines between pixels
26+
gc.translate(-0.5, 0.5); // paint "sharp" lines on pixels instead of "blurry" lines between pixels
2727
gc.cache.lineWidth = 1;
2828

2929
color = config.borderTop;

packages/perspective/test/js/computed.js

Lines changed: 905 additions & 83 deletions
Large diffs are not rendered by default.

python/perspective/perspective/src/numpy.cpp

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -600,17 +600,32 @@ namespace numpy {
600600
*/
601601
std::vector<std::string>
602602
NumpyLoader::make_names() {
603-
auto names = py::list(m_accessor.attr("data")().attr("keys")());
604-
return names.cast<std::vector<std::string>>();
603+
auto data = m_accessor.attr("data")();
604+
auto py_names = m_accessor.attr("names")().cast<std::vector<std::string>>();
605+
606+
// Match names to dataset - only keep names that are present in dataset.
607+
// The `m_names` variable is used internally to access the numpy arrays
608+
// containing each column. On first-time load, `m_names` contains
609+
// every name in the dataset. On update, `m_names` is recalculated to
610+
// only include columns that are present in the update dataset.
611+
std::vector<std::string> names;
612+
for (const auto& name : py_names) {
613+
if (data.contains(py::str(name))) {
614+
names.push_back(name);
615+
}
616+
}
617+
618+
return names;
605619
}
606620

607621
std::vector<t_dtype>
608622
NumpyLoader::make_types() {
609623
std::vector<t_dtype> rval;
610624

611-
py::list arrays = m_accessor.attr("data")().attr("values")();
612-
for (const auto& a : arrays) {
613-
py::array array = py::array::ensure(a);
625+
auto data = m_accessor.attr("data")();
626+
for (const auto& name : m_names) {
627+
// Access each array by name to guarantee ordered access.
628+
py::array array = py::array::ensure(data[py::str(name)]);
614629

615630
if (!array) {
616631
PSP_COMPLAIN_AND_ABORT("Perspective does not support the mixing of ndarrays and lists.");

python/perspective/perspective/src/table.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,14 @@ std::shared_ptr<Table> make_table_py(t_val table, t_data_accessor accessor, t_va
113113
* not created from a DataFrame, the "index" column would not exist.
114114
*/
115115
if (is_numpy) {
116+
// `numpy_loader`s `m_names` and `m_types` variable contains only
117+
// the column names and data types present in the update dataset,
118+
// not the names/types of the entire `Table`.
116119
numpy_loader.init();
117120
}
121+
122+
// `column_names` and `data_types` contain every single column in the
123+
// dataset, as well as `__INDEX__` if it exists.
118124
column_names = accessor.attr("names")().cast<std::vector<std::string>>();
119125
data_types = accessor.attr("types")().cast<std::vector<t_dtype>>();
120126
} else if (is_numpy) {
@@ -123,9 +129,15 @@ std::shared_ptr<Table> make_table_py(t_val table, t_data_accessor accessor, t_va
123129
* Perspective. Using `get_data_types` allows us to know the type of an array with `dtype=object`.
124130
*/
125131
numpy_loader.init();
132+
133+
// This will contain every single column in the dataset, as the
134+
// first-time data load path does not mutate the `names` property of
135+
// `accessor`.
126136
column_names = numpy_loader.names();
127137

128-
// composite array and inferred `data_types` for the Table
138+
// Infer data type for each column, and then use a composite of numpy
139+
// dtype, inferred `t_dtype`, and stringified numpy dtype to get the
140+
// final, canonical data type mapping.
129141
std::vector<t_dtype> inferred_types = get_data_types(accessor.attr("data")(), 1, column_names, accessor.attr("date_validator")().cast<t_val>());
130142
data_types = numpy_loader.reconcile_dtypes(inferred_types);
131143
} else {

python/perspective/perspective/src/view.cpp

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -120,16 +120,18 @@ make_view_config(const t_schema& schema, t_val date_parser, t_val config) {
120120

121121
// to preserve order, do not cast to std::map - use keys and python 3.7's guarantee that dicts respect insertion order
122122
auto p_aggregates = py::dict(config.attr("get_aggregates")());
123-
auto aggregate_keys = py::list(p_aggregates.attr("keys")());
124123
tsl::ordered_map<std::string, std::vector<std::string>> aggregates;
125124

126-
for (auto& key : aggregate_keys) {
127-
const std::string key_str = key.cast<std::string>();
128-
if (py::isinstance<py::str>(p_aggregates[key])) {
129-
std::vector<std::string> agg{p_aggregates[key].cast<std::string>()};
130-
aggregates[key_str] = agg;
131-
} else {
132-
aggregates[key_str] = p_aggregates[key].cast<std::vector<std::string>>();
125+
for (auto& column : columns) {
126+
py::str py_column_name = py::str(column);
127+
if (p_aggregates.contains(py_column_name)) {
128+
if (py::isinstance<py::str>(p_aggregates[py_column_name])) {
129+
std::vector<std::string> agg{
130+
p_aggregates[py_column_name].cast<std::string>()};
131+
aggregates[column] = agg;
132+
} else {
133+
aggregates[column] = p_aggregates[py_column_name].cast<std::vector<std::string>>();
134+
}
133135
}
134136
};
135137

python/perspective/perspective/table/_accessor.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -43,41 +43,44 @@ def _type_to_format(data_or_schema):
4343
- 0: records (:obj:`list` of :obj:`dict`)
4444
- 1: columns (:obj:`dict` of :obj:`str` to :obj:`list`)
4545
- 2: schema (dict[str]/dict[type])
46+
:obj:`list`: column names
4647
():obj:`list`/:obj:`dict`): processed data
4748
'''
4849
if isinstance(data_or_schema, list):
4950
# records
50-
return False, 0, data_or_schema
51+
names = list(data_or_schema[0].keys()) if len(data_or_schema) > 0 else []
52+
return False, 0, names, data_or_schema
5153
elif isinstance(data_or_schema, dict):
5254
# schema or columns
5355
for v in data_or_schema.values():
5456
if isinstance(v, type) or isinstance(v, str):
5557
# schema maps name -> type
56-
return False, 2, data_or_schema
58+
return False, 2, list(data_or_schema.keys()), data_or_schema
5759
elif isinstance(v, list):
5860
# a dict of iterables = type 1
59-
return False, 1, data_or_schema
61+
return False, 1, list(data_or_schema.keys()), data_or_schema
6062
else:
6163
# See if iterable
6264
try:
6365
iter(v)
6466
except TypeError:
6567
raise NotImplementedError("Cannot load dataset of non-iterable type: Data passed in through a dict must be of type `list` or `numpy.ndarray`.")
6668
else:
67-
return isinstance(v, numpy.ndarray), 1, data_or_schema
69+
return isinstance(v, numpy.ndarray), 1, list(data_or_schema.keys()), data_or_schema
6870
elif isinstance(data_or_schema, numpy.ndarray):
6971
# structured or record array
7072
if not isinstance(data_or_schema.dtype.names, tuple):
7173
raise NotImplementedError("Data should be dict of numpy.ndarray or a structured array.")
72-
return True, 1, _flatten_structure(data_or_schema)
74+
flattened = _flatten_structure(data_or_schema)
75+
return True, 1, list(flattened.keys()), flattened
7376
else:
7477
if not (isinstance(data_or_schema, pandas.DataFrame) or isinstance(data_or_schema, pandas.Series)):
7578
# if pandas not installed or is not a dataframe or series
7679
raise NotImplementedError("Data must be dataframe, dict, list, numpy.recarray, or a numpy structured array.")
7780
else:
7881
# flatten column/index multiindex
7982
df, _ = deconstruct_pandas(data_or_schema)
80-
return True, 1, {c: df[c].values for c in df.columns}
83+
return True, 1, df.columns.tolist(), {c: df[c].values for c in df.columns}
8184

8285

8386
class _PerspectiveAccessor(object):
@@ -88,18 +91,13 @@ class _PerspectiveAccessor(object):
8891
INTEGER_TYPES = six.integer_types + (numpy.integer,)
8992

9093
def __init__(self, data_or_schema):
91-
self._is_numpy, self._format, self._data_or_schema = _type_to_format(data_or_schema)
94+
self._is_numpy, self._format, self._names, self._data_or_schema = _type_to_format(data_or_schema)
9295
self._date_validator = _PerspectiveDateValidator()
9396
self._row_count = \
9497
len(self._data_or_schema) if self._format == 0 else \
9598
len(max(self._data_or_schema.values(), key=len)) if self._format == 1 else \
9699
0
97100

98-
if isinstance(self._data_or_schema, list):
99-
self._names = list(self._data_or_schema[0].keys()) if len(self._data_or_schema) > 0 else []
100-
elif isinstance(self._data_or_schema, dict):
101-
self._names = list(self._data_or_schema.keys())
102-
103101
self._types = []
104102

105103
# Verify that column names are strings, and that numpy arrays are of
@@ -115,6 +113,7 @@ def __init__(self, data_or_schema):
115113
raise PerspectiveError("Mixed datasets of numpy.ndarray and lists are not supported.")
116114

117115
dtype = array.dtype
116+
118117
if name == "index" and isinstance(data_or_schema.index, pandas.DatetimeIndex):
119118
# use the index of the original, unflattened dataframe
120119
dtype = _parse_datetime_index(data_or_schema.index)

python/perspective/perspective/tests/table/test_table_numpy.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,26 @@ def test_table_int(self):
2828
"b": [4, 5, 6]
2929
}
3030

31+
def test_table_int_lots_of_columns(self):
32+
data = {
33+
"a": np.array([1, 2, 3]),
34+
"b": np.array([4, 5, 6]),
35+
"c": np.array([4, 5, 6]),
36+
"d": np.array([4, 5, 6]),
37+
"e": np.array([4, 5, 6]),
38+
"f": np.array([4, 5, 6]),
39+
}
40+
tbl = Table(data)
41+
assert tbl.size() == 3
42+
assert tbl.view().to_dict() == {
43+
"a": [1, 2, 3],
44+
"b": [4, 5, 6],
45+
"c": [4, 5, 6],
46+
"d": [4, 5, 6],
47+
"e": [4, 5, 6],
48+
"f": [4, 5, 6]
49+
}
50+
3151
def test_table_int_with_None(self):
3252
data = {"a": np.array([1, 2, 3, None, None]), "b": np.array([4, 5, 6, None, None])}
3353
tbl = Table(data)
@@ -738,6 +758,36 @@ def test_table_numpy_from_schema_str(self):
738758
table.update(df)
739759
assert table.view().to_dict()["a"] == data
740760

761+
# partial update
762+
763+
def test_table_numpy_partial_update(self):
764+
data = ["a", None, "b", None, "c"]
765+
df = {"a": np.array([1, 2, 3, 4, 5]), "b": np.array(data), "c": np.array(data)}
766+
table = Table(df, index="a")
767+
table.update({
768+
"a": np.array([2, 4, 5]),
769+
"b": np.array(["x", "y", "z"])
770+
})
771+
assert table.view().to_dict() == {
772+
"a": [1, 2, 3, 4, 5],
773+
"b": ["a", "x", "b", "y", "z"],
774+
"c": ["a", None, "b", None, "c"]
775+
}
776+
777+
def test_table_numpy_partial_update_implicit(self):
778+
data = ["a", None, "b", None, "c"]
779+
df = {"a": np.array([1, 2, 3, 4, 5]), "b": np.array(data), "c": np.array(data)}
780+
table = Table(df)
781+
table.update({
782+
"__INDEX__": np.array([1, 3, 4]),
783+
"b": np.array(["x", "y", "z"])
784+
})
785+
assert table.view().to_dict() == {
786+
"a": [1, 2, 3, 4, 5],
787+
"b": ["a", "x", "b", "y", "z"],
788+
"c": ["a", None, "b", None, "c"]
789+
}
790+
741791
# structured array
742792

743793
def test_table_structured_array(self):

python/perspective/perspective/tests/table/test_table_pandas.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,20 @@ def test_table_dataframe(self):
3535
{"a": 3, "b": 4, "index": 1}
3636
]
3737

38+
def test_table_dataframe_column_order(self):
39+
d = [{"a": 1, "b": 2, "c": 3, "d": 4}, {"a": 3, "b": 4, "c": 5, "d": 6}]
40+
data = pd.DataFrame(d, columns=["b", "c", "a", "d"])
41+
tbl = Table(data)
42+
assert tbl.size() == 2
43+
assert tbl.columns() == ["index", "b", "c", "a", "d"]
44+
45+
def test_table_dataframe_selective_column_order(self):
46+
d = [{"a": 1, "b": 2, "c": 3, "d": 4}, {"a": 3, "b": 4, "c": 5, "d": 6}]
47+
data = pd.DataFrame(d, columns=["b", "c", "a"])
48+
tbl = Table(data)
49+
assert tbl.size() == 2
50+
assert tbl.columns() == ["index", "b", "c", "a"]
51+
3852
def test_table_dataframe_does_not_mutate(self):
3953
# make sure we don't mutate the dataframe that a user passes in
4054
data = pd.DataFrame({

0 commit comments

Comments
 (0)