Skip to content

Commit 52a0763

Browse files
authored
Merge pull request #1476 from finos/stddev
Add standard deviation and variance aggregates
2 parents d1cf2a5 + dddbc56 commit 52a0763

File tree

19 files changed

+1136
-53
lines changed

19 files changed

+1136
-53
lines changed

cpp/perspective/src/cpp/aggspec.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,12 @@ t_aggspec::agg_str() const {
193193
case AGGTYPE_PCT_SUM_GRAND_TOTAL: {
194194
return "pct_sum_grand_total";
195195
}
196+
case AGGTYPE_VARIANCE: {
197+
return "variance";
198+
}
199+
case AGGTYPE_STANDARD_DEVIATION: {
200+
return "stddev";
201+
}
196202
default: {
197203
PSP_COMPLAIN_AND_ABORT("Unknown agg type");
198204
return "unknown";
@@ -315,10 +321,8 @@ t_aggspec::get_output_specs(const t_schema& schema) const {
315321
case AGGTYPE_COUNT: {
316322
return mk_col_name_type_vec(name(), DTYPE_INT64);
317323
}
324+
case AGGTYPE_MEAN:
318325
case AGGTYPE_MEAN_BY_COUNT:
319-
case AGGTYPE_MEAN: {
320-
return mk_col_name_type_vec(name(), DTYPE_F64PAIR);
321-
}
322326
case AGGTYPE_WEIGHTED_MEAN: {
323327
return mk_col_name_type_vec(name(), DTYPE_F64PAIR);
324328
}
@@ -327,7 +331,9 @@ t_aggspec::get_output_specs(const t_schema& schema) const {
327331
}
328332
case AGGTYPE_SCALED_DIV:
329333
case AGGTYPE_SCALED_ADD:
330-
case AGGTYPE_SCALED_MUL: {
334+
case AGGTYPE_SCALED_MUL:
335+
case AGGTYPE_VARIANCE:
336+
case AGGTYPE_STANDARD_DEVIATION: {
331337
return mk_col_name_type_vec(name(), DTYPE_FLOAT64);
332338
}
333339
case AGGTYPE_UDF_COMBINER:

cpp/perspective/src/cpp/base.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,8 +481,14 @@ str_to_aggtype(const std::string& str) {
481481
return t_aggtype::AGGTYPE_UDF_COMBINER;
482482
} else if (str.find("udf_reducer_") != std::string::npos) {
483483
return t_aggtype::AGGTYPE_UDF_REDUCER;
484+
} else if (str =="var" || str == "variance") {
485+
return t_aggtype::AGGTYPE_VARIANCE;
486+
} else if (str == "stddev" || str == "standard deviation") {
487+
return t_aggtype::AGGTYPE_STANDARD_DEVIATION;
484488
} else {
485-
PSP_COMPLAIN_AND_ABORT("Encountered unknown aggregate operation.");
489+
std::stringstream ss;
490+
ss << "Encountered unknown aggregate operation: '" << str << "'" << std::endl;
491+
PSP_COMPLAIN_AND_ABORT(ss.str());
486492
// use any as default
487493
return t_aggtype::AGGTYPE_ANY;
488494
}

cpp/perspective/src/cpp/config.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,8 @@ t_config::setup(const std::vector<std::string>& detail_columns,
195195
case AGGTYPE_MUL:
196196
case AGGTYPE_DISTINCT_COUNT:
197197
case AGGTYPE_DISTINCT_LEAF:
198+
case AGGTYPE_VARIANCE:
199+
case AGGTYPE_STANDARD_DEVIATION:
198200
m_has_pkey_agg = true;
199201
break;
200202
default:

cpp/perspective/src/cpp/extract_aggregate.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,9 @@ extract_aggregate(
6868
case AGGTYPE_JOIN:
6969
case AGGTYPE_IDENTITY:
7070
case AGGTYPE_DISTINCT_COUNT:
71-
case AGGTYPE_DISTINCT_LEAF: {
71+
case AGGTYPE_DISTINCT_LEAF:
72+
case AGGTYPE_VARIANCE:
73+
case AGGTYPE_STANDARD_DEVIATION: {
7274
t_tscalar rval = aggcol->get_scalar(ridx);
7375
return rval;
7476
} break;

cpp/perspective/src/cpp/sparse_tree.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1397,6 +1397,43 @@ t_stree::update_agg_table(
13971397
if (!skip)
13981398
dst->set_scalar(dst_ridx, new_value);
13991399
} break;
1400+
case AGGTYPE_VARIANCE:
1401+
case AGGTYPE_STANDARD_DEVIATION: {
1402+
old_value.set(dst->get_scalar(dst_ridx));
1403+
1404+
auto pkeys = get_pkeys(nidx);
1405+
std::vector<double> values;
1406+
1407+
read_column_from_gstate(
1408+
gstate, expression_master_table, spec.get_dependencies()[0].name(), pkeys, values, false);
1409+
1410+
// Calculate the count, rolling mean, and sum of squares of
1411+
// differences from the current mean at each iteration.
1412+
double count = 0, mean = 0, m2 = 0;
1413+
1414+
for (double num : values) {
1415+
count++;
1416+
double next_mean = mean + (num - mean) / count;
1417+
m2 += (num - mean) * (num - next_mean);
1418+
mean = next_mean;
1419+
}
1420+
1421+
// Only calculate stddev for more than 1 element in the group.
1422+
if (count >= 2) {
1423+
double value = m2 / count;
1424+
1425+
if (spec.agg() == AGGTYPE_STANDARD_DEVIATION) {
1426+
value = std::sqrt(value);
1427+
}
1428+
1429+
new_value.set(value);
1430+
dst->set_scalar(dst_ridx, new_value);
1431+
dst->set_valid(dst_ridx, true);
1432+
} else {
1433+
dst->set_valid(dst_ridx, false);
1434+
}
1435+
1436+
} break;
14001437
default: { PSP_COMPLAIN_AND_ABORT("Not implemented"); }
14011438
} // end switch
14021439

cpp/perspective/src/cpp/view.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,16 @@ View<CTX_T>::data_slice_to_arrow(
554554
for (auto cidx = start_col; cidx < end_col; ++cidx) {
555555
std::vector<t_tscalar> col_path = names.at(cidx);
556556
t_dtype dtype = get_column_dtype(cidx);
557+
558+
// mean and weighted mean uses DTYPE_F64PAIR on the aggtable, which is
559+
// the dtype returned by get_column_dtype. However, in the output data
560+
// slice they are DTYPE_FLOAT64 and the f64 pair is not exposed outside
561+
// of the sparse tree. Thus, treat f64 pair as DTYPE_FLOAT64 for arrow
562+
// serialization.
563+
if (dtype == DTYPE_F64PAIR) {
564+
dtype = DTYPE_FLOAT64;
565+
}
566+
557567
std::string name;
558568

559569
if (sides() > 1) {
@@ -918,7 +928,9 @@ View<CTX_T>::_map_aggregate_types(
918928
case AGGTYPE_MEAN_BY_COUNT:
919929
case AGGTYPE_WEIGHTED_MEAN:
920930
case AGGTYPE_PCT_SUM_PARENT:
921-
case AGGTYPE_PCT_SUM_GRAND_TOTAL: {
931+
case AGGTYPE_PCT_SUM_GRAND_TOTAL:
932+
case AGGTYPE_VARIANCE:
933+
case AGGTYPE_STANDARD_DEVIATION: {
922934
return "float";
923935
} break;
924936
default: { return typestring; } break;

cpp/perspective/src/include/perspective/base.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,9 @@ enum t_aggtype {
256256
AGGTYPE_DISTINCT_COUNT,
257257
AGGTYPE_DISTINCT_LEAF,
258258
AGGTYPE_PCT_SUM_PARENT,
259-
AGGTYPE_PCT_SUM_GRAND_TOTAL
259+
AGGTYPE_PCT_SUM_GRAND_TOTAL,
260+
AGGTYPE_VARIANCE,
261+
AGGTYPE_STANDARD_DEVIATION
260262
};
261263

262264
PERSPECTIVE_EXPORT t_aggtype str_to_aggtype(const std::string& str);

examples/workspace-editing-python/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"start": "yarn webpack && yarn start:server",
88
"start:client": "webpack-dev-server --open",
99
"start:server": "PYTHONPATH=../../python/perspective python3 src/server.py",
10-
"webpack": "webpack --colour"
10+
"webpack": "webpack --color"
1111
},
1212
"keywords": [],
1313
"license": "Apache-2.0",

examples/workspace-editing-python/src/index.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ const worker = perspective.shared_worker();
3333
* tables just as you would on a locally created table.
3434
*/
3535
const server_table = websocket.open_table("data_source_one");
36-
const server_view = await server_table.view();
36+
let server_view;
3737

3838
// All viewers are based on the same table, which then feed edits back to a
3939
// table on the server with a schema.
@@ -55,6 +55,7 @@ const PORTS = [];
5555
*/
5656
const datasource = async function() {
5757
const load_start = performance.now();
58+
server_view = await server_table.view();
5859

5960
// The API of the remote table/view are symmetric.
6061
const arrow = await server_view.to_arrow();

packages/perspective/index.d.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,12 @@ declare module "@finos/perspective" {
4343
MEDIAN = "median",
4444
PCT_SUM_PARENT = "pct sum parent",
4545
PCT_SUM_TOTAL = "pct sum grand total",
46+
STANDARD_DEVIATION = "stddev",
4647
SUM = "sum",
4748
SUM_ABS = "sum abs",
4849
SUM_NOT_NULL = "sum not null",
49-
UNIQUE = "unique"
50+
UNIQUE = "unique",
51+
VARIANCE = "var"
5052
}
5153

5254
enum STRING_AGGREGATES {

0 commit comments

Comments
 (0)