diff --git a/python/perspective/bench/bench.py b/python/perspective/bench/bench.py index 9178fe8d89..3f20492127 100644 --- a/python/perspective/bench/bench.py +++ b/python/perspective/bench/bench.py @@ -12,6 +12,7 @@ import subprocess import venv import tornado +from datetime import datetime from timeit import timeit sys.path.insert(1, os.path.join(os.path.dirname(__file__), '..')) from perspective import Table, PerspectiveManager, PerspectiveTornadoHandler # noqa: E402 @@ -207,8 +208,9 @@ def host_results(self): def write_results(self): if self._table is None: return - logging.info("Writing results to `benchmark.arrow`") - arrow_path = os.path.join(os.path.dirname(__file__), "benchmark.arrow") + name = "benchmark_{}_.arrow".format(datetime.now().isoformat()) + logging.info("Writing results to `{}`".format(name)) + arrow_path = os.path.join(os.path.dirname(__file__), name) with open(arrow_path, "wb") as file: arrow = self._table.view().to_arrow() file.write(arrow) diff --git a/python/perspective/bench/perspective_benchmark.py b/python/perspective/bench/perspective_benchmark.py index 2b1d853b73..9db02db7f6 100644 --- a/python/perspective/bench/perspective_benchmark.py +++ b/python/perspective/bench/perspective_benchmark.py @@ -9,8 +9,10 @@ import os import sys import subprocess +import random +import pandas as pd from functools import partial -from bench import Benchmark, Suite, Runner, VirtualEnvHandler +from bench import Benchmark, Suite, Runner sys.path.insert(1, os.path.join(os.path.dirname(__file__), '..')) from perspective import Table # noqa: E402 from perspective.tests.common import superstore # noqa: E402 @@ -33,6 +35,9 @@ def make_meta(group, name): "name": name } +def empty_callback(): + pass + class PerspectiveBenchmark(Suite): @@ -47,6 +52,13 @@ class PerspectiveBenchmark(Suite): def __init__(self): """Create a benchmark suite for `perspective-python`.""" tbl = Table(SUPERSTORE) + self._schema = tbl.schema() + self._df_schema = tbl.schema() + # mutate 
schema to have some integer columns, so as to force numpy + float-to-int demotion + self._df_schema["Sales"] = int + self._df_schema["Profit"] = int + self._df_schema["Quantity"] = int self._view = tbl.view() self.dict = self._view.to_dict() self.records = self._view.to_records() @@ -55,6 +67,10 @@ def __init__(self): self.arrow = self._view.to_arrow() self._table = tbl + def _get_update_data(self, n=30): + """Retrieve n rows from self.records to be used as update data.""" + return random.sample(self.records, n) + def register_benchmarks(self): """Register all the benchmark methods - each method creates a number of lambdas, and then calls `setattr` on the Suite itself so that the @@ -65,6 +81,14 @@ def register_benchmarks(self): self.benchmark_view_one() self.benchmark_view_two() self.benchmark_view_two_column_only() + self.benchmark_view_zero_updates() + self.benchmark_view_one_updates() + self.benchmark_view_two_updates() + self.benchmark_view_two_column_only_updates() + self.benchmark_view_zero_df_updates() + self.benchmark_view_one_df_updates() + self.benchmark_view_two_df_updates() + self.benchmark_view_two_column_only_df_updates() self.benchmark_to_format_zero() self.benchmark_to_format_one() self.benchmark_to_format_two() @@ -92,6 +116,38 @@ def benchmark_view_zero(self): func = Benchmark(lambda: self._table.view(), meta=make_meta("view", "zero")) setattr(self, "view_zero", func) + def benchmark_view_zero_updates(self): + """Benchmark how long it takes for each update to resolve fully, using + the on update callback that forces resolution of updates across + 25 views.""" + table = Table(self._schema) + views = [table.view() for i in range(25)] + for v in views: + v.on_update(empty_callback) + update_data = self._get_update_data(1000) + def resolve_update(): + table.update(update_data) + table.size() + func = Benchmark(resolve_update, meta=make_meta("update", "zero")) + setattr(self, "update_zero", func) + + def benchmark_view_zero_df_updates(self): + 
"""Benchmark how long it takes for each update to resolve fully, using + the on update callback that forces resolution of updates across + 10 views. This version updates using dataframes, and is designed to + compare the overhead of dataframe loading vs. regular data structure + loading.""" + table = Table(self._df_schema) + views = [table.view() for i in range(25)] + for v in views: + v.on_update(empty_callback) + update_data = pd.DataFrame(self._get_update_data(1000)) + def resolve_update(): + table.update(update_data) + table.size() + func = Benchmark(resolve_update, meta=make_meta("update", "zero_df")) + setattr(self, "update_zero_df", func) + def benchmark_view_one(self): """Benchmark view creation with different pivots.""" for pivot in PerspectiveBenchmark.ROW_PIVOT_OPTIONS: @@ -102,6 +158,34 @@ def benchmark_view_one(self): func = Benchmark(lambda: view_constructor(), meta=test_meta) setattr(self, "view_{0}".format(test_meta["name"]), func) + def benchmark_view_one_updates(self): + """Benchmark how long it takes for each update to resolve fully, using + the on update callback that forces resolution of updates across + 25 views.""" + table = Table(self._schema) + views = [table.view(row_pivots=["State", "City"]) for i in range(25)] + for v in views: + v.on_update(empty_callback) + update_data = self._get_update_data(1000) + def resolve_update(): + table.update(update_data) + table.size() + func = Benchmark(resolve_update, meta=make_meta("update", "one")) + setattr(self, "update_one", func) + + def benchmark_view_one_df_updates(self): + """Benchmark dataframe updates for one-sided views.""" + table = Table(self._df_schema) + views = [table.view(row_pivots=["State", "City"]) for i in range(25)] + for v in views: + v.on_update(empty_callback) + update_data = pd.DataFrame(self._get_update_data(1000)) + def resolve_update(): + table.update(update_data) + table.size() + func = Benchmark(resolve_update, meta=make_meta("update", "one_df")) + setattr(self, 
"update_one_df", func) + def benchmark_view_two(self): """Benchmark view creation with row and column pivots.""" for i in range(len(PerspectiveBenchmark.ROW_PIVOT_OPTIONS)): @@ -116,6 +200,35 @@ def benchmark_view_two(self): func = Benchmark(lambda: view_constructor(), meta=test_meta) setattr(self, "view_{0}".format(test_meta["name"]), func) + def benchmark_view_two_updates(self): + """Benchmark how long it takes for each update to resolve fully, using + the on update callback that forces resolution of updates across + 25 views.""" + table = Table(self._schema) + views = [table.view(row_pivots=["State", "City"], column_pivots=["Category", "Sub-Category"]) for i in range(25)] + for v in views: + v.on_update(empty_callback) + update_data = self._get_update_data(1000) + def resolve_update(): + table.update(update_data) + table.size() + func = Benchmark(resolve_update, meta=make_meta("update", "two")) + setattr(self, "update_two", func) + + + def benchmark_view_two_df_updates(self): + """Benchmark dataframe updates for two-sided views.""" + table = Table(self._df_schema) + views = [table.view(row_pivots=["State", "City"], column_pivots=["Category", "Sub-Category"]) for i in range(25)] + for v in views: + v.on_update(empty_callback) + update_data = pd.DataFrame(self._get_update_data(1000)) + def resolve_update(): + table.update(update_data) + table.size() + func = Benchmark(resolve_update, meta=make_meta("update", "two_df")) + setattr(self, "update_two_df", func) + def benchmark_view_two_column_only(self): """Benchmark column-only view creation.""" for pivot in PerspectiveBenchmark.COLUMN_PIVOT_OPTIONS: @@ -127,6 +240,34 @@ def benchmark_view_two_column_only(self): func = Benchmark(lambda: view_constructor(), meta=test_meta) setattr(self, "view_{0}".format(test_meta["name"]), func) + def benchmark_view_two_column_only_updates(self): + """Benchmark how long it takes for each update to resolve fully, using + the on update callback that forces resolution of updates across 
+ 25 views.""" + table = Table(self._schema) + views = [table.view(column_pivots=["Category", "Sub-Category"]) for i in range(25)] + for v in views: + v.on_update(empty_callback) + update_data = self._get_update_data(1000) + def resolve_update(): + table.update(update_data) + table.size() + func = Benchmark(resolve_update, meta=make_meta("update", "two_column_only")) + setattr(self, "update_two_column_only", func) + + def benchmark_view_two_column_only_df_updates(self): + """Benchmark dataframe updates for two-sided column only views.""" + table = Table(self._df_schema) + views = [table.view(column_pivots=["Category", "Sub-Category"]) for i in range(25)] + for v in views: + v.on_update(empty_callback) + update_data = pd.DataFrame(self._get_update_data(1000)) + def resolve_update(): + table.update(update_data) + table.size() + func = Benchmark(resolve_update, meta=make_meta("update", "two_column_only_df")) + setattr(self, "update_two_column_only_df", func) + def benchmark_to_format_zero(self): """Benchmark each `to_format` method.""" for name in ("numpy", "dict", "records", "df", "arrow"): @@ -179,7 +320,7 @@ def benchmark_to_format_two_column_only(self): if __name__ == "__main__": - VERSION = sys.argv[1] + VERSION = "master" # Initialize a suite and runner, then call `.run()` suite = PerspectiveBenchmark() diff --git a/python/perspective/bench/run_perspective_benchmark.py b/python/perspective/bench/run_perspective_benchmark.py index 2930a94135..2801c32af8 100644 --- a/python/perspective/bench/run_perspective_benchmark.py +++ b/python/perspective/bench/run_perspective_benchmark.py @@ -41,8 +41,7 @@ for version in VERSIONS[1:]: print("Installing perspective-python=={}".format(version)) subprocess.check_output( - "yes | python3 -m pip uninstall perspective-python".format(version), - shell=True) + "yes | python3 -m pip uninstall perspective-python", shell=True) subprocess.check_output( "yes | python3 -m pip install perspective-python=={}".format(version), shell=True)