diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 710e1d9577..7f3d89104a 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -4,7 +4,6 @@ on:
   push:
     branches:
       - main
-      - checks
   pull_request:
     branches:
       - main
@@ -115,7 +114,7 @@ jobs:
       - name: Install dependencies
         shell: bash -l {0}
         run: |
-          mamba install --yes -q "python~=${PYTHON_VERSION}=*_cpython" mkl numpy scipy pip mkl-service graphviz cython pytest coverage pytest-cov sympy
+          mamba install --yes -q "python~=${PYTHON_VERSION}=*_cpython" mkl numpy scipy pip mkl-service graphviz cython pytest coverage pytest-cov pytest-benchmark sympy
           if [[ $INSTALL_NUMBA == "1" ]]; then mamba install --yes -q -c conda-forge "python~=${PYTHON_VERSION}=*_cpython" "numba>=0.55" numba-scipy; fi
           mamba install --yes -q -c conda-forge "python~=${PYTHON_VERSION}=*_cpython" jax jaxlib
           pip install -e ./
@@ -132,7 +131,7 @@ jobs:
          if [[ $FAST_COMPILE == "1" ]]; then export PYTENSOR_FLAGS=$PYTENSOR_FLAGS,mode=FAST_COMPILE; fi
          if [[ $FLOAT32 == "1" ]]; then export PYTENSOR_FLAGS=$PYTENSOR_FLAGS,floatX=float32; fi
          export PYTENSOR_FLAGS=$PYTENSOR_FLAGS,warn__ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise,gcc__cxxflags=-pipe
-          python -m pytest -x -r A --verbose --runslow --cov=pytensor/ --cov-report=xml:coverage/coverage-${MATRIX_ID}.xml --no-cov-on-fail $PART
+          python -m pytest -x -r A --verbose --runslow --cov=pytensor/ --cov-report=xml:coverage/coverage-${MATRIX_ID}.xml --no-cov-on-fail $PART --benchmark-skip
        env:
          MATRIX_ID: ${{ steps.matrix-id.outputs.id }}
          MKL_THREADING_LAYER: GNU
@@ -148,6 +147,60 @@ jobs:
           name: coverage
           path: coverage/coverage-${{ steps.matrix-id.outputs.id }}.xml
 
+  benchmarks:
+    name: "Benchmarks"
+    needs:
+      - changes
+      - style
+    runs-on: ubuntu-latest
+    if: ${{ needs.changes.outputs.changes == 'true' && needs.style.result == 'success' }}
+    strategy:
+      fail-fast: true
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+      - name: Set up Python 3.9
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          mamba-version: "*"
+          channels: conda-forge,defaults
+          channel-priority: true
+          python-version: 3.9
+          auto-update-conda: true
+      - name: Install dependencies
+        shell: bash -l {0}
+        run: |
+          mamba install --yes -q -c conda-forge "python~=${PYTHON_VERSION}=*_cpython" mkl numpy scipy pip mkl-service cython pytest "numba>=0.55" numba-scipy jax jaxlib pytest-benchmark
+          pip install -e ./
+          mamba list && pip freeze
+          python -c 'import pytensor; print(pytensor.config.__str__(print_doc=False))'
+          python -c 'import pytensor; assert(pytensor.config.blas__ldflags != "")'
+        env:
+          PYTHON_VERSION: 3.9
+      - name: Download previous benchmark data
+        uses: actions/cache@v1
+        with:
+          path: ./cache
+          key: ${{ runner.os }}-benchmark
+      - name: Run benchmarks
+        shell: bash -l {0}
+        run: |
+          export PYTENSOR_FLAGS=mode=FAST_COMPILE,warn__ignore_bug_before=all,on_opt_error=raise,on_shape_error=raise,gcc__cxxflags=-pipe
+          python -m pytest --runslow --benchmark-only --benchmark-json output.json
+      - name: Store benchmark result
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: Python Benchmark with pytest-benchmark
+          tool: 'pytest'
+          output-file-path: output.json
+          external-data-json-path: ./cache/benchmark-data.json
+          alert-threshold: '200%'
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          comment-on-alert: ${{ github.event_name == 'push' }}
+          fail-on-alert: true
+          auto-push: false
+
   all-checks:
     if: ${{ always() }}
     runs-on: ubuntu-latest
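Note on the job above: pytest-benchmark writes its timings to output.json, and benchmark-action/github-action-benchmark compares them against the baseline cached under ./cache, alerting once a test slows past the 200% threshold; the regular test jobs pass --benchmark-skip so they do not duplicate that work. Every test converted in this patch relies on pytest-benchmark's `benchmark` fixture. A minimal sketch of how that fixture behaves (the test below is illustrative only, not part of this patch):

    # Minimal pytest-benchmark sketch; `test_sum_benchmark` is hypothetical.
    import numpy as np

    def test_sum_benchmark(benchmark):
        x = np.random.normal(size=(1000, 1000))

        # The fixture calls the target repeatedly to collect timing stats
        # and returns one call's result, so correctness can still be checked.
        res = benchmark(np.sum, x)

        np.testing.assert_allclose(res, x.sum())
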
diff --git a/environment.yml b/environment.yml
index ddcc6f41a2..d9f704640c 100644
--- a/environment.yml
+++ b/environment.yml
@@ -29,6 +29,7 @@ dependencies:
   - pytest
   - pytest-cov
   - pytest-xdist
+  - pytest-benchmark
   # For building docs
   - sphinx>=1.3
   - sphinx_rtd_theme
diff --git a/pyproject.toml b/pyproject.toml
index 1fa27d77ec..5588b8eb5c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -86,6 +86,7 @@ tests = [
     "pre-commit",
     "pytest-cov>=2.6.1",
     "coverage>=5.1",
+    "pytest-benchmark",
 ]
 rtd = [
     "sphinx>=1.3.0",
diff --git a/tests/link/jax/test_elemwise.py b/tests/link/jax/test_elemwise.py
index 36b3446fe6..947bea6a5b 100644
--- a/tests/link/jax/test_elemwise.py
+++ b/tests/link/jax/test_elemwise.py
@@ -1,6 +1,9 @@
 import numpy as np
 import pytest
+import scipy.special
 
+import pytensor
+import pytensor.tensor as at
 from pytensor.configdefaults import config
 from pytensor.graph.fg import FunctionGraph
 from pytensor.graph.op import get_test_value
@@ -98,3 +101,24 @@ def test_softmax_grad(axis):
     out = SoftmaxGrad(axis=axis)(dy, sm)
     fgraph = FunctionGraph([dy, sm], [out])
     compare_jax_and_py(fgraph, [get_test_value(i) for i in fgraph.inputs])
+
+
+@pytest.mark.parametrize("size", [(10, 10), (1000, 1000), (10000, 10000)])
+@pytest.mark.parametrize("axis", [0, 1])
+def test_logsumexp_benchmark(size, axis, benchmark):
+    X = at.matrix("X")
+    X_max = at.max(X, axis=axis, keepdims=True)
+    X_max = at.switch(at.isinf(X_max), 0, X_max)
+    X_lse = at.log(at.sum(at.exp(X - X_max), axis=axis, keepdims=True)) + X_max
+
+    X_val = np.random.normal(size=size)
+
+    X_lse_fn = pytensor.function([X], X_lse, mode="JAX")
+
+    # JIT compile first
+    _ = X_lse_fn(X_val)
+
+    res = benchmark(X_lse_fn, X_val)
+
+    exp_res = scipy.special.logsumexp(X_val, axis=axis, keepdims=True)
+    np.testing.assert_array_almost_equal(res, exp_res)
diff --git a/tests/link/numba/test_basic.py b/tests/link/numba/test_basic.py
index 887ec63d9b..6ce260566f 100644
--- a/tests/link/numba/test_basic.py
+++ b/tests/link/numba/test_basic.py
@@ -1,6 +1,6 @@
 import contextlib
 import inspect
-from typing import TYPE_CHECKING, Callable, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Optional, Sequence, Tuple, Union
 from unittest import mock
 
 import numba
@@ -190,7 +190,7 @@ def compare_numba_and_py(
     numba_mode=numba_mode,
     py_mode=py_mode,
     updates=None,
-):
+) -> Tuple[Callable, Any]:
     """Function to compare python graph output and Numba compiled output for testing equality
 
     In the tests below computational graphs are defined in PyTensor. These graphs are then passed to
@@ -209,6 +209,10 @@ def compare_numba_and_py(
     updates
         Updates to be passed to `pytensor.function`.
 
+    Returns
+    -------
+    The compiled PyTensor function and its last computed result.
+
     """
 
     if assert_fn is None:
@@ -248,7 +252,7 @@ def assert_fn(x, y):
     else:
         assert_fn(numba_res, py_res)
 
-    return numba_res
+    return pytensor_numba_fn, numba_res
 
 
 @pytest.mark.parametrize(
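The `compare_numba_and_py` change above alters its return contract: instead of only the Numba results, the helper now also hands back the compiled PyTensor function, so a test can assert correctness once and then time the already-compiled function, keeping compilation out of the measurement. A sketch of the intended caller pattern (graph setup elided; `fgraph` and `test_input_vals` stand in for whatever the surrounding test builds):

    # Sketch of the new calling convention.
    numba_fn, numba_res = compare_numba_and_py(fgraph, test_input_vals)

    # Equality with the Python-mode result was asserted inside the helper;
    # only the compiled function's runtime is measured here.
    benchmark(numba_fn, *test_input_vals)
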
diff --git a/tests/link/numba/test_scan.py b/tests/link/numba/test_scan.py
index 04bb3aefd8..0380ae0d92 100644
--- a/tests/link/numba/test_scan.py
+++ b/tests/link/numba/test_scan.py
@@ -159,7 +159,7 @@ def test_xit_xot_types(
     assert np.allclose(res_val, output_vals)
 
 
-def test_scan_multiple_output():
+def test_scan_multiple_output(benchmark):
     """Test a scan implementation of a SEIR model.
 
     SEIR model definition:
@@ -244,7 +244,9 @@ def seir_one_step(ct0, dt0, st0, et0, it0, logp_c, logp_d, beta, gamma, delta):
         gamma_val,
         delta_val,
     ]
-    compare_numba_and_py(out_fg, test_input_vals)
+    scan_fn, _ = compare_numba_and_py(out_fg, test_input_vals)
+
+    benchmark(scan_fn, *test_input_vals)
 
 
 @config.change_flags(compute_test_value="raise")
diff --git a/tests/link/numba/test_tensor_basic.py b/tests/link/numba/test_tensor_basic.py
index 56dc7ebaaf..783661578b 100644
--- a/tests/link/numba/test_tensor_basic.py
+++ b/tests/link/numba/test_tensor_basic.py
@@ -32,7 +32,7 @@ def test_Alloc(v, shape):
     g = at.alloc(v, *shape)
     g_fg = FunctionGraph(outputs=[g])
 
-    (numba_res,) = compare_numba_and_py(
+    _, (numba_res,) = compare_numba_and_py(
         g_fg,
         [
             i.tag.test_value
diff --git a/tests/scan/test_basic.py b/tests/scan/test_basic.py
index a4aa9e7020..d826639f0f 100644
--- a/tests/scan/test_basic.py
+++ b/tests/scan/test_basic.py
@@ -13,7 +13,6 @@
 import pickle
 import shutil
 import sys
-import timeit
 from collections import OrderedDict
 from tempfile import mkdtemp
 
@@ -2179,15 +2178,13 @@ def scan_fn():
 @pytest.mark.skipif(
     not config.cxx, reason="G++ not available, so we need to skip this test."
 )
-def test_cython_performance():
+def test_cython_performance(benchmark):
 
     # This implicitly confirms that the Cython version is being used
     from pytensor.scan import scan_perform_ext  # noqa: F401
 
     # Python usually out-performs PyTensor below 100 iterations
     N = 200
-    n_timeit = 50
-
     M = -1 / np.arange(1, 11).astype(config.floatX)
     r = np.arange(N * 10).astype(config.floatX).reshape(N, 10)
@@ -2216,17 +2213,11 @@ def f_py():
     # Make sure we're actually computing a `Scan`
     assert any(isinstance(node.op, Scan) for node in f_cvm.maker.fgraph.apply_nodes)
 
-    cvm_res = f_cvm()
+    cvm_res = benchmark(f_cvm)
 
     # Make sure the results are the same between the two implementations
     assert np.allclose(cvm_res, py_res)
 
-    python_duration = timeit.timeit(lambda: f_py(), number=n_timeit)
-    cvm_duration = timeit.timeit(lambda: f_cvm(), number=n_timeit)
-    print(f"python={python_duration}, cvm={cvm_duration}")
-
-    assert cvm_duration <= python_duration
-
 
 @config.change_flags(mode="FAST_COMPILE", compute_test_value="raise")
 def test_compute_test_values():
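Dropping the hand-rolled `timeit` block means `test_cython_performance` no longer asserts that the Cython VM beats the pure-Python loop on a single noisy CI run; regressions are instead caught by comparing benchmark results across runs. If fixed-round timing were ever needed again, pytest-benchmark's pedantic mode would be the closest stand-in for the old loop (a sketch reusing the deleted `n_timeit = 50` round count, not part of this patch):

    # Hypothetical replacement for the removed timeit comparison: time f_cvm
    # for exactly 50 rounds instead of letting the fixture calibrate itself.
    cvm_res = benchmark.pedantic(f_cvm, rounds=50, iterations=1)
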
@@ -2662,7 +2653,7 @@ def numpy_implementation(vsample):
         n_result = numpy_implementation(v_vsample)
         utt.assert_allclose(t_result, n_result)
 
-    def test_reordering(self):
+    def test_reordering(self, benchmark):
         """Test re-ordering of inputs.
 
         some rnn with multiple outputs and multiple inputs; other
@@ -2722,14 +2713,14 @@ def f_rnn_cmpl(u1_t, u2_t, x_tm1, y_tm1, y_tm3, W_in1):
             v_x[i] = np.dot(v_u1[i], vW_in1) + v_u2[i] * vW_in2 + np.dot(v_x[i - 1], vW)
             v_y[i] = np.dot(v_x[i - 1], vWout) + v_y[i - 1]
 
-        (pytensor_dump1, pytensor_dump2, pytensor_x, pytensor_y) = f4(
-            v_u1, v_u2, v_x0, v_y0, vW_in1
+        (pytensor_dump1, pytensor_dump2, pytensor_x, pytensor_y) = benchmark(
+            f4, v_u1, v_u2, v_x0, v_y0, vW_in1
         )
 
         utt.assert_allclose(pytensor_x, v_x)
         utt.assert_allclose(pytensor_y, v_y)
 
-    def test_scan_as_tensor_on_gradients(self):
+    def test_scan_as_tensor_on_gradients(self, benchmark):
         to_scan = dvector("to_scan")
         seq = dmatrix("seq")
         f1 = dscalar("f1")
@@ -2743,7 +2734,12 @@ def scanStep(prev, seq, f1):
         function(inputs=[to_scan, seq, f1], outputs=scanned, allow_input_downcast=True)
 
         t_grad = grad(scanned.sum(), wrt=[to_scan, f1], consider_constant=[seq])
-        function(inputs=[to_scan, seq, f1], outputs=t_grad, allow_input_downcast=True)
+        benchmark(
+            function,
+            inputs=[to_scan, seq, f1],
+            outputs=t_grad,
+            allow_input_downcast=True,
+        )
 
     def caching_nsteps_by_scan_op(self):
         W = matrix("weights")
@@ -3060,7 +3056,7 @@ def inner_fn(tap_m3, tap_m2, tap_m1):
         utt.assert_allclose(outputs, expected_outputs)
 
     @pytest.mark.slow
-    def test_hessian_bug_grad_grad_two_scans(self):
+    def test_hessian_bug_grad_grad_two_scans(self, benchmark):
         # Bug reported by Bitton Tenessi
         # NOTE : The test to reproduce the bug reported by Bitton Tenessi
         # was modified from its original version to be faster to run.
@@ -3094,7 +3090,7 @@ def loss_inner(sum_inner, W):
         H = hessian(cost, W)
         print(".", file=sys.stderr)
         f = function([W, n_steps], H)
-        f(np.ones((8,), dtype="float32"), 1)
+        benchmark(f, np.ones((8,), dtype="float32"), 1)
 
     def test_grad_connectivity_matrix(self):
         def inner_fn(x_tm1, y_tm1, z_tm1):
@@ -3710,7 +3706,7 @@ def f_rnn_cmpl(u1_t, u2_t, x_tm1, y_tm1, W_in1):
         utt.assert_allclose(pytensor_x, v_x)
         utt.assert_allclose(pytensor_y, v_y)
 
-    def test_multiple_outs_taps(self):
+    def test_multiple_outs_taps(self, benchmark):
         l = 5
         rng = np.random.default_rng(utt.fetch_seed())
 
@@ -3753,8 +3749,6 @@ def f_rnn_cmpl(u1_t, u2_tm1, u2_t, u2_tp1, x_tm1, y_tm1, y_tm3, W_in1):
             [u1, u2, x0, y0, W_in1], outputs, updates=updates, allow_input_downcast=True
         )
 
-        f(v_u1, v_u2, v_x0, v_y0, vW_in1)
-
         ny0 = np.zeros((5, 2))
         ny1 = np.zeros((5,))
         ny2 = np.zeros((5, 2))
@@ -3802,7 +3796,12 @@ def f_rnn_cmpl(u1_t, u2_tm1, u2_t, u2_tp1, x_tm1, y_tm1, y_tm3, W_in1):
         ny1[4] = (ny1[3] + ny1[1]) * np.dot(ny0[3], vWout)
         ny2[4] = np.dot(v_u1[4], vW_in1)
 
-        # TODO FIXME: What is this testing? At least assert something.
+        res = f(v_u1, v_u2, v_x0, v_y0, vW_in1)
+        np.testing.assert_almost_equal(res[0], ny0)
+        np.testing.assert_almost_equal(res[1], ny1)
+        np.testing.assert_almost_equal(res[2], ny2)
+
+        benchmark(f, v_u1, v_u2, v_x0, v_y0, vW_in1)
 
     def _grad_mout_helper(self, n_iters, mode):
         rng = np.random.default_rng(utt.fetch_seed())
diff --git a/tests/scan/test_rewriting.py b/tests/scan/test_rewriting.py
index 0f0b13a75a..eb9f5b2618 100644
--- a/tests/scan/test_rewriting.py
+++ b/tests/scan/test_rewriting.py
@@ -620,7 +620,7 @@ def test_sum_dot(self):
         vB = rng.uniform(size=(5, 5)).astype(config.floatX)
         utt.assert_allclose(f(vA, vB), np.dot(vA.T, vB))
 
-    def test_pregreedy_optimizer(self):
+    def test_pregreedy_optimizer(self, benchmark):
         W = at.zeros((5, 4))
         bv = at.zeros((5,))
         bh = at.zeros((4,))
@@ -634,7 +634,9 @@ def test_pregreedy_optimizer(self):
             n_steps=2,
         )
         # TODO FIXME: Make this a real test and assert something.
-        function([v], chain)(np.zeros((3, 5), dtype=config.floatX))
+        chain_fn = function([v], chain)
+
+        benchmark(chain_fn, np.zeros((3, 5), dtype=config.floatX))
 
     def test_machine_translation(self):
         """
@@ -1291,7 +1293,7 @@ def test_savemem_does_not_duplicate_number_of_scan_nodes(self):
         ]
         assert len(scan_nodes) == 1
 
-    def test_savemem_opt(self):
+    def test_savemem_opt(self, benchmark):
         y0 = shared(np.ones((2, 10)))
         [y1, y2], updates = scan(
             lambda y: [y, y],
@@ -1299,7 +1301,8 @@ def test_savemem_opt(self):
             n_steps=5,
         )
         # TODO FIXME: Make this a real test and assert something.
-        function([], y2.sum(), mode=self.mode)()
+        fn = function([], y2.sum(), mode=self.mode)
+        benchmark(fn)
 
     def test_savemem_opt_0_step(self):
         """
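For local runs, the suite added here mirrors the CI invocation: `python -m pytest --runslow --benchmark-only --benchmark-json output.json` executes only the benchmarks, while the ordinary test jobs now pass `--benchmark-skip` so the fixtures run nothing. Standard pytest-benchmark options such as `--benchmark-autosave` and `--benchmark-compare` (not used in this patch) can additionally persist and compare timings between local runs.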