Skip to content

Commit 94a26d1

Browse files
authored
Add 'benchcomp visualize' & error on regression (rust-lang#2348)
This commit adds an implementation for the `benchcomp visualize` command. Currently, there is one visualization, "error_on_regression", which causes `benchcomp` or `benchcomp visualize` to terminate with a return code of 1 if there was a regression in any of the metrics. Users can specify the following in their config file:

    visualize:
    - type: error_on_regression
      variant_pairs:
      - [variant_1, variant_2]
      - [variant_1, variant_3]
      checks:
      - metric: runtime
        test: "lambda old, new: new / old > 1.1"
      - metric: passed
        test: "lambda old, new: False if not old else not new"

This says to check whether any benchmark regressed when run under variant_2 compared to variant_1. A benchmark is considered to have regressed if the value of the 'runtime' metric under variant_2 is 10% higher than the value under variant_1. Furthermore, the benchmark is also considered to have regressed if it was previously passing, but is now failing. These same checks are performed on all benchmarks run under variant_3 compared to variant_1. If any of those lambda functions returns True, then benchcomp will terminate with a return code of 1. This commit fixes rust-lang#2338.
1 parent 5230d62 commit 94a26d1

File tree

6 files changed

+461
-2
lines changed

6 files changed

+461
-2
lines changed

tools/benchcomp/benchcomp/entry/benchcomp.py

+2
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,5 @@ def main(args):
1515

1616
args.suites_dir = run_result.out_prefix / run_result.out_symlink
1717
results = benchcomp.entry.collate.main(args)
18+
19+
benchcomp.entry.visualize.main(args)

tools/benchcomp/benchcomp/entry/visualize.py

+13-2
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,16 @@
44
# Entrypoint for `benchcomp visualize`
55

66

7-
import sys

import yaml

import benchcomp.visualizers.utils


def main(args):
    """Entry point for `benchcomp visualize`.

    Loads the collated results file named by ``args.result_file`` and runs
    every visualization listed in ``args.config``, then exits with the
    process-wide exit code that the visualizers may have set.
    """
    with open(args.result_file, encoding="utf-8") as result_fh:
        collated_results = yaml.safe_load(result_fh)

    # Generator reads the "visualize" section of the config and dispatches
    # each entry to the matching visualizer class.
    run_visualizations = benchcomp.visualizers.utils.Generator(args.config)
    run_visualizations(collated_results)

    sys.exit(benchcomp.visualizers.utils.EXIT_CODE)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Copyright Kani Contributors
2+
# SPDX-License-Identifier: Apache-2.0 OR MIT
3+
#
4+
# This parser is used by the test suite. It reads files in a directory of
5+
# directories. Each directory represents a benchmark name; inside that
6+
# directory, every file name is the name of a metric and the contents of the
7+
# file are the metric value. This is to allow writing ad-hoc regression tests
8+
# without actually running a real benchmark suite.
9+
10+
11+
import json
12+
import pathlib
13+
14+
15+
def main(root_dir):
    """Parse a directory-of-directories benchmark layout into a result dict.

    Each subdirectory of *root_dir* is a benchmark name; every file inside a
    benchmark directory is a metric name whose contents are a JSON literal
    giving the metric value. Returns a dict with two keys: "metrics" (the set
    of metric names seen, each mapped to an empty dict) and "benchmarks"
    (benchmark name -> {"metrics": {metric name: value}}).
    """
    ret = {
        "metrics": {},
        "benchmarks": {},
    }
    for benchmark in pathlib.Path(root_dir).iterdir():
        ret["benchmarks"][benchmark.name] = {"metrics": {}}
        # benchmark is already a pathlib.Path; no need to re-wrap it
        for metric in benchmark.iterdir():
            ret["metrics"][metric.name] = {}
            # read_text closes the file for us and fixes the text encoding;
            # values are JSON literals such as 42, true, "a string"
            value = json.loads(metric.read_text(encoding="utf-8").strip())
            ret["benchmarks"][benchmark.name]["metrics"][metric.name] = value
    return ret
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Copyright Kani Contributors
2+
# SPDX-License-Identifier: Apache-2.0 OR MIT
3+
4+
5+
import dataclasses
6+
7+
import benchcomp.visualizers.utils as viz_utils
8+
9+
10+
# TODO The doc comment should appear in the help output, which should list all
11+
# available checks.
12+
13+
@dataclasses.dataclass
class error_on_regression:
    """Terminate benchcomp with a return code of 1 if any benchmark regressed.

    This visualization checks whether any benchmark regressed from one variant
    to another. Sample configuration:

    visualize:
    - type: error_on_regression
      variant_pairs:
      - [variant_1, variant_2]
      - [variant_1, variant_3]
      checks:
      - metric: runtime
        test: "lambda old, new: new / old > 1.1"
      - metric: passed
        test: "lambda old, new: False if not old else not new"

    This says to check whether any benchmark regressed when run under variant_2
    compared to variant_1. A benchmark is considered to have regressed if the
    value of the 'runtime' metric under variant_2 is 10% higher than the value
    under variant_1. Furthermore, the benchmark is also considered to have
    regressed if it was previously passing, but is now failing. These same
    checks are performed on all benchmarks run under variant_3 compared to
    variant_1. If any of those lambda functions returns True, then benchcomp
    will terminate with a return code of 1.
    """

    # One dict per check: {"metric": ..., "test": ...}
    checks: list
    # Pairs of [old_variant, new_variant] names to compare
    variant_pairs: list

    def __call__(self, results):
        # Run every configured check over every variant pair; a single
        # regression anywhere flips the process-wide exit code to 1.
        for single_check in self.checks:
            regressed = viz_utils.AnyBenchmarkRegressedChecker(
                self.variant_pairs, **single_check)
            if regressed(results):
                viz_utils.EXIT_CODE = 1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# Copyright Kani Contributors
2+
# SPDX-License-Identifier: Apache-2.0 OR MIT
3+
4+
5+
import dataclasses
import logging
import sys
import typing

import benchcomp.visualizers
9+
10+
11+
EXIT_CODE = 0
12+
13+
14+
class SingleRegressionCheck:
    """Check whether a single benchmark has regressed on a single metric.

    Instances of this class are constructed with the name of a metric to check,
    and a test function that figures out whether that metric has
    regressed. Instances of this class can then be called on pairs of
    benchmark values. The instance returns true if the second benchmark
    regressed compared to the first.
    """

    metric: str
    test: typing.Callable

    def __init__(self, metric, test_program):
        self.metric = metric
        try:
            # SECURITY NOTE: test_program is eval'd as arbitrary Python. It
            # comes from the user's own benchcomp config file, so this is
            # equivalent to running the user's code -- but never feed an
            # untrusted config through here.
            self.test = eval(test_program)
        except SyntaxError:
            # Requires `logging` and `sys` to be imported at module level;
            # the original module omitted both, so this path raised NameError.
            logging.error(
                "This test program is not valid Python: '%s'", test_program)
            logging.error(
                "Regression test programs should be Python lambda functions that "
                "take two arguments (the value of a metric when run under two "
                "variants) and returns true if the second value regressed with "
                "respect to the first.")
            sys.exit(1)

    def __call__(self, old_value, new_value):
        # Delegate to the user-supplied comparison: True means "regressed"
        return self.test(old_value, new_value)
45+
46+
47+
48+
class AnyBenchmarkRegressedChecker:
    """Check whether any benchmark has regressed on a particular metric.

    Instances of this class are constructed with the name of a metric to check,
    and the name of a comparison function that figures out whether one variant
    of a benchmark has regressed compared to another variant.

    When called, instances of this class return True iff any of the benchmarks
    regressed.
    """

    def __init__(self, variant_pairs, metric, test, **test_args):
        self.variant_pairs = variant_pairs
        self.metric = metric
        self.test = test
        # NOTE(review): these extra args are forwarded to
        # SingleRegressionCheck, whose __init__ takes no arguments beyond
        # (metric, test_program); any non-empty test_args would raise
        # TypeError there -- confirm the intended API.
        self.test_args = test_args

    def __call__(self, results):
        ret = False
        has_regressed = SingleRegressionCheck(
            self.metric, self.test, **self.test_args)

        for bench_name, bench in results["benchmarks"].items():
            for old_variant, new_variant in self.variant_pairs:
                # Skip the whole pair when either side lacks this metric.
                # (Previously `continue` only skipped the inner variant loop,
                # so the lookups below still raised KeyError.)
                missing = [
                    v for v in (old_variant, new_variant)
                    if v not in bench["variants"]
                    or self.metric not in bench["variants"][v]["metrics"]
                ]
                if missing:
                    for variant in missing:
                        logging.warning(
                            "benchmark '%s' did not have a value for metric '%s' "
                            "when run under variant '%s'",
                            bench_name, self.metric, variant)
                    continue

                old = bench["variants"][old_variant]["metrics"][self.metric]
                new = bench["variants"][new_variant]["metrics"][self.metric]

                if has_regressed(old, new):
                    # Fixed typo: was `logging.warining`, which raised
                    # AttributeError exactly when a regression was found.
                    logging.warning(
                        "Benchmark '%s' regressed on metric '%s' (%s -> %s)",
                        bench_name, self.metric, old, new)
                    ret = True
        return ret
90+
91+
92+
93+
@dataclasses.dataclass
class Generator:
    """Generate all visualizations in a config file given a dict of results."""

    config: benchcomp.ConfigFile

    def __call__(self, results):
        for viz in self.config["visualize"]:
            # Pop "type" from a copy: mutating the config entry in place
            # would make a second invocation fail with KeyError("type").
            params = dict(viz)
            viz_type = params.pop("type")
            # The "type" value names a class in benchcomp.visualizers; the
            # remaining keys are that class's constructor arguments.
            klass = getattr(benchcomp.visualizers, viz_type)
            visualize = klass(**params)
            visualize(results)

0 commit comments

Comments
 (0)