Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions python/perspective/bench/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import subprocess
import venv
import tornado
from datetime import datetime
from timeit import timeit
sys.path.insert(1, os.path.join(os.path.dirname(__file__), '..'))
from perspective import Table, PerspectiveManager, PerspectiveTornadoHandler # noqa: E402
Expand Down Expand Up @@ -207,8 +208,9 @@ def host_results(self):
def write_results(self):
    """Write the benchmark results table to a timestamped Arrow file.

    No-op when no results table exists. Side effects: creates
    ``benchmark_<ISO timestamp>_.arrow`` in this module's directory and
    logs the chosen file name.
    """
    if self._table is None:
        # Nothing was benchmarked, so there is nothing to persist.
        return
    # NOTE(review): `datetime.isoformat()` output contains ':' characters,
    # which are invalid in Windows file names — confirm target platforms.
    name = "benchmark_{}_.arrow".format(datetime.now().isoformat())
    logging.info("Writing results to `{}`".format(name))
    arrow_path = os.path.join(os.path.dirname(__file__), name)
    with open(arrow_path, "wb") as file:
        arrow = self._table.view().to_arrow()
        file.write(arrow)
Expand Down
145 changes: 143 additions & 2 deletions python/perspective/bench/perspective_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@
import os
import sys
import subprocess
import random
import pandas as pd
from functools import partial
from bench import Benchmark, Suite, Runner, VirtualEnvHandler
from bench import Benchmark, Suite, Runner
sys.path.insert(1, os.path.join(os.path.dirname(__file__), '..'))
from perspective import Table # noqa: E402
from perspective.tests.common import superstore # noqa: E402
Expand All @@ -33,6 +35,9 @@ def make_meta(group, name):
"name": name
}

def empty_callback():
    """Do nothing.

    Registered via ``on_update`` on benchmark views so the engine must
    deliver every update notification without adding any per-callback
    work of its own.
    """
    return None


class PerspectiveBenchmark(Suite):

Expand All @@ -47,6 +52,13 @@ class PerspectiveBenchmark(Suite):
def __init__(self):
"""Create a benchmark suite for `perspective-python`."""
tbl = Table(SUPERSTORE)
self._schema = tbl.schema()
self._df_schema = tbl.schema()
# mutate schema to have some integer columns, so as to force numpy
# float-to-int demotion
self._df_schema["Sales"] = int
self._df_schema["Profit"] = int
self._df_schema["Quantity"] = int
self._view = tbl.view()
self.dict = self._view.to_dict()
self.records = self._view.to_records()
Expand All @@ -55,6 +67,10 @@ def __init__(self):
self.arrow = self._view.to_arrow()
self._table = tbl

def _get_update_data(self, n=30):
"""Retrieve n rows from self.records to be used as update data."""
return random.sample(self.records, n)

def register_benchmarks(self):
"""Register all the benchmark methods - each method creates a number of
lambdas, and then calls `setattr` on the Suite itself so that the
Expand All @@ -65,6 +81,14 @@ def register_benchmarks(self):
self.benchmark_view_one()
self.benchmark_view_two()
self.benchmark_view_two_column_only()
self.benchmark_view_zero_updates()
self.benchmark_view_one_updates()
self.benchmark_view_two_updates()
self.benchmark_view_two_column_only_updates()
self.benchmark_view_zero_df_updates()
self.benchmark_view_one_df_updates()
self.benchmark_view_two_df_updates()
self.benchmark_view_two_column_only_df_updates()
self.benchmark_to_format_zero()
self.benchmark_to_format_one()
self.benchmark_to_format_two()
Expand Down Expand Up @@ -92,6 +116,38 @@ def benchmark_view_zero(self):
func = Benchmark(lambda: self._table.view(), meta=make_meta("view", "zero"))
setattr(self, "view_zero", func)

def benchmark_view_zero_updates(self):
    """Benchmark how long it takes for each update to resolve fully, using
    the on update callback that forces resolution of updates across
    25 zero-sided views."""
    table = Table(self._schema)
    # Register a no-op callback on every view so the engine must notify
    # each of them for every update.
    views = [table.view() for _ in range(25)]
    for v in views:
        v.on_update(empty_callback)
    update_data = self._get_update_data(1000)

    def resolve_update():
        table.update(update_data)
        # NOTE(review): size() appears to force the pending update to be
        # processed before returning — confirm with engine semantics.
        table.size()

    func = Benchmark(resolve_update, meta=make_meta("update", "zero"))
    setattr(self, "update_zero", func)

def benchmark_view_zero_df_updates(self):
    """Benchmark how long it takes for each update to resolve fully, using
    the on update callback that forces resolution of updates across
    25 zero-sided views. This version updates using dataframes, and is
    designed to compare the overhead of dataframe loading vs. regular
    data structure loading."""
    table = Table(self._df_schema)
    # Register a no-op callback on every view so the engine must notify
    # each of them for every update.
    views = [table.view() for _ in range(25)]
    for v in views:
        v.on_update(empty_callback)
    update_data = pd.DataFrame(self._get_update_data(1000))

    def resolve_update():
        table.update(update_data)
        # NOTE(review): size() appears to force the pending update to be
        # processed before returning — confirm with engine semantics.
        table.size()

    func = Benchmark(resolve_update, meta=make_meta("update", "zero_df"))
    setattr(self, "update_zero_df", func)

def benchmark_view_one(self):
"""Benchmark view creation with different pivots."""
for pivot in PerspectiveBenchmark.ROW_PIVOT_OPTIONS:
Expand All @@ -102,6 +158,34 @@ def benchmark_view_one(self):
func = Benchmark(lambda: view_constructor(), meta=test_meta)
setattr(self, "view_{0}".format(test_meta["name"]), func)

def benchmark_view_one_updates(self):
    """Benchmark how long it takes for each update to resolve fully, using
    the on update callback that forces resolution of updates across
    25 one-sided (row-pivoted) views."""
    tbl = Table(self._schema)
    pivoted_views = []
    for _ in range(25):
        pivoted_views.append(tbl.view(row_pivots=["State", "City"]))
    for view in pivoted_views:
        view.on_update(empty_callback)
    data = self._get_update_data(1000)

    def apply_and_resolve():
        tbl.update(data)
        tbl.size()

    bench = Benchmark(apply_and_resolve, meta=make_meta("update", "one"))
    setattr(self, "update_one", bench)

def benchmark_view_one_df_updates(self):
    """Benchmark dataframe updates for one-sided views."""
    tbl = Table(self._df_schema)
    pivoted_views = []
    for _ in range(25):
        pivoted_views.append(tbl.view(row_pivots=["State", "City"]))
    for view in pivoted_views:
        view.on_update(empty_callback)
    frame = pd.DataFrame(self._get_update_data(1000))

    def apply_and_resolve():
        tbl.update(frame)
        tbl.size()

    bench = Benchmark(apply_and_resolve, meta=make_meta("update", "one_df"))
    setattr(self, "update_one_df", bench)

def benchmark_view_two(self):
"""Benchmark view creation with row and column pivots."""
for i in range(len(PerspectiveBenchmark.ROW_PIVOT_OPTIONS)):
Expand All @@ -116,6 +200,35 @@ def benchmark_view_two(self):
func = Benchmark(lambda: view_constructor(), meta=test_meta)
setattr(self, "view_{0}".format(test_meta["name"]), func)

def benchmark_view_two_updates(self):
    """Benchmark how long it takes for each update to resolve fully, using
    the on update callback that forces resolution of updates across
    25 two-sided (row- and column-pivoted) views."""
    tbl = Table(self._schema)
    pivoted_views = []
    for _ in range(25):
        pivoted_views.append(
            tbl.view(
                row_pivots=["State", "City"],
                column_pivots=["Category", "Sub-Category"],
            )
        )
    for view in pivoted_views:
        view.on_update(empty_callback)
    data = self._get_update_data(1000)

    def apply_and_resolve():
        tbl.update(data)
        tbl.size()

    bench = Benchmark(apply_and_resolve, meta=make_meta("update", "two"))
    setattr(self, "update_two", bench)


def benchmark_view_two_df_updates(self):
    """Benchmark dataframe updates for two-sided views."""
    tbl = Table(self._df_schema)
    pivoted_views = []
    for _ in range(25):
        pivoted_views.append(
            tbl.view(
                row_pivots=["State", "City"],
                column_pivots=["Category", "Sub-Category"],
            )
        )
    for view in pivoted_views:
        view.on_update(empty_callback)
    frame = pd.DataFrame(self._get_update_data(1000))

    def apply_and_resolve():
        tbl.update(frame)
        tbl.size()

    bench = Benchmark(apply_and_resolve, meta=make_meta("update", "two_df"))
    setattr(self, "update_two_df", bench)

def benchmark_view_two_column_only(self):
"""Benchmark column-only view creation."""
for pivot in PerspectiveBenchmark.COLUMN_PIVOT_OPTIONS:
Expand All @@ -127,6 +240,34 @@ def benchmark_view_two_column_only(self):
func = Benchmark(lambda: view_constructor(), meta=test_meta)
setattr(self, "view_{0}".format(test_meta["name"]), func)

def benchmark_view_two_column_only_updates(self):
    """Benchmark how long it takes for each update to resolve fully, using
    the on update callback that forces resolution of updates across
    25 column-only views."""
    tbl = Table(self._schema)
    pivoted_views = []
    for _ in range(25):
        pivoted_views.append(
            tbl.view(column_pivots=["Category", "Sub-Category"])
        )
    for view in pivoted_views:
        view.on_update(empty_callback)
    data = self._get_update_data(1000)

    def apply_and_resolve():
        tbl.update(data)
        tbl.size()

    bench = Benchmark(
        apply_and_resolve, meta=make_meta("update", "two_column_only"))
    setattr(self, "update_two_column_only", bench)

def benchmark_view_two_column_only_df_updates(self):
    """Benchmark dataframe updates for two-sided column only views."""
    tbl = Table(self._df_schema)
    pivoted_views = []
    for _ in range(25):
        pivoted_views.append(
            tbl.view(column_pivots=["Category", "Sub-Category"])
        )
    for view in pivoted_views:
        view.on_update(empty_callback)
    frame = pd.DataFrame(self._get_update_data(1000))

    def apply_and_resolve():
        tbl.update(frame)
        tbl.size()

    bench = Benchmark(
        apply_and_resolve, meta=make_meta("update", "two_column_only_df"))
    setattr(self, "update_two_column_only_df", bench)

def benchmark_to_format_zero(self):
"""Benchmark each `to_format` method."""
for name in ("numpy", "dict", "records", "df", "arrow"):
Expand Down Expand Up @@ -179,7 +320,7 @@ def benchmark_to_format_two_column_only(self):


if __name__ == "__main__":
VERSION = sys.argv[1]
VERSION = "master"

# Initialize a suite and runner, then call `.run()`
suite = PerspectiveBenchmark()
Expand Down
3 changes: 1 addition & 2 deletions python/perspective/bench/run_perspective_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,7 @@
for version in VERSIONS[1:]:
print("Installing perspective-python=={}".format(version))
subprocess.check_output(
"yes | python3 -m pip uninstall perspective-python".format(version),
shell=True)
"yes | python3 -m pip uninstall perspective-python", shell=True)
subprocess.check_output(
"yes | python3 -m pip install perspective-python=={}".format(version),
shell=True)
Expand Down