From b8634be04d6fb70e8707d7d2ed0abd58a7511b10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Edgar=20Andr=C3=A9s=20Margffoy=20Tuay?= <andfoy@gmail.com>
Date: Thu, 27 Mar 2025 12:04:39 -0500
Subject: [PATCH 1/2] Add a benchmark/example for numexpr usage under
 free-threading conditions

---
 bench/free_threading.py | 154 ++++++++++++++++++++++++++++++++++++++++
 numexpr/necompiler.py   |  10 +--
 2 files changed, 157 insertions(+), 7 deletions(-)
 create mode 100644 bench/free_threading.py

diff --git a/bench/free_threading.py b/bench/free_threading.py
new file mode 100644
index 0000000..f070c09
--- /dev/null
+++ b/bench/free_threading.py
@@ -0,0 +1,154 @@
+#################################################################################
+# To mimic the scenario that computation is i/o bound and constrained by memory
+#
+# It's a much simplified version that the chunk is computed in a loop,
+# and expression is evaluated in a sequence, which is not true in reality.
+# Neverthless, numexpr outperforms numpy.
+#################################################################################
+"""
+Benchmarking Expression 1:
+NumPy time (threaded over 32 chunks with 2 threads): 4.612313 seconds
+numexpr time (threaded with re_evaluate over 32 chunks with 2 threads): 0.951172 seconds
+numexpr speedup: 4.85x
+----------------------------------------
+Benchmarking Expression 2:
+NumPy time (threaded over 32 chunks with 2 threads): 23.862752 seconds
+numexpr time (threaded with re_evaluate over 32 chunks with 2 threads): 2.182058 seconds
+numexpr speedup: 10.94x
+----------------------------------------
+Benchmarking Expression 3:
+NumPy time (threaded over 32 chunks with 2 threads): 20.594895 seconds
+numexpr time (threaded with re_evaluate over 32 chunks with 2 threads): 2.927881 seconds
+numexpr speedup: 7.03x
+----------------------------------------
+Benchmarking Expression 4:
+NumPy time (threaded over 32 chunks with 2 threads): 12.834101 seconds
+numexpr time (threaded with re_evaluate over 32 chunks with 2 threads): 5.392480 seconds
+numexpr speedup: 2.38x
+----------------------------------------
+"""
+
+import os
+
+os.environ["NUMEXPR_NUM_THREADS"] = "1"
+import threading
+import timeit
+
+import numpy as np
+
+import numexpr as ne
+
+array_size = 10**8
+num_runs = 10
+num_chunks = 32  # Number of chunks
+num_threads = 16  # Number of threads constrained by how many chunks memory can hold
+
+a = np.random.rand(array_size).reshape(10**4, -1)
+b = np.random.rand(array_size).reshape(10**4, -1)
+c = np.random.rand(array_size).reshape(10**4, -1)
+
+chunk_size = array_size // num_chunks
+
+expressions_numpy = [
+    lambda a, b, c: a + b * c,
+    lambda a, b, c: a**2 + b**2 - 2 * a * b * np.cos(c),
+    lambda a, b, c: np.sin(a) + np.log(b) * np.sqrt(c),
+    lambda a, b, c: np.exp(a) + np.tan(b) - np.sinh(c),
+]
+
+expressions_numexpr = [
+    "a + b * c",
+    "a**2 + b**2 - 2 * a * b * cos(c)",
+    "sin(a) + log(b) * sqrt(c)",
+    "exp(a) + tan(b) - sinh(c)",
+]
+
+
+def benchmark_numpy_chunk(func, a, b, c, results, indices):
+    for index in indices:
+        start = index * chunk_size
+        end = (index + 1) * chunk_size
+        time_taken = timeit.timeit(
+            lambda: func(a[start:end], b[start:end], c[start:end]), number=num_runs
+        )
+        results.append(time_taken)
+
+
+def benchmark_numexpr_re_evaluate(expr, a, b, c, results, indices):
+    for index in indices:
+        start = index * chunk_size
+        end = (index + 1) * chunk_size
+        # if index == 0:
+        # Evaluate the first chunk with evaluate
+        time_taken = timeit.timeit(
+            lambda: ne.evaluate(
+                expr,
+                local_dict={
+                    "a": a[start:end],
+                    "b": b[start:end],
+                    "c": c[start:end],
+                },
+            ),
+            number=num_runs,
+        )
+        # else:
+        # Re-evaluate subsequent chunks with re_evaluate
+        # time_taken = timeit.timeit(
+        #     lambda: ne.re_evaluate(
+        #         local_dict={"a": a[start:end], "b": b[start:end], "c": c[start:end]}
+        #     ),
+        #     number=num_runs,
+        # )
+        results.append(time_taken)
+
+
+def run_benchmark_threaded():
+    chunk_indices = list(range(num_chunks))
+
+    for i in range(len(expressions_numpy)):
+        print(f"Benchmarking Expression {i+1}:")
+
+        results_numpy = []
+        results_numexpr = []
+
+        threads_numpy = []
+        for j in range(num_threads):
+            indices = chunk_indices[j::num_threads]  # Distribute chunks across threads
+            thread = threading.Thread(
+                target=benchmark_numpy_chunk,
+                args=(expressions_numpy[i], a, b, c, results_numpy, indices),
+            )
+            threads_numpy.append(thread)
+            thread.start()
+
+        for thread in threads_numpy:
+            thread.join()
+
+        numpy_time = sum(results_numpy)
+        print(
+            f"NumPy time (threaded over {num_chunks} chunks with {num_threads} threads): {numpy_time:.6f} seconds"
+        )
+
+        threads_numexpr = []
+        for j in range(num_threads):
+            indices = chunk_indices[j::num_threads]  # Distribute chunks across threads
+            thread = threading.Thread(
+                target=benchmark_numexpr_re_evaluate,
+                args=(expressions_numexpr[i], a, b, c, results_numexpr, indices),
+            )
+            threads_numexpr.append(thread)
+            thread.start()
+
+        for thread in threads_numexpr:
+            thread.join()
+
+        numexpr_time = sum(results_numexpr)
+        print(
+            f"numexpr time (threaded with re_evaluate over {num_chunks} chunks with {num_threads} threads): {numexpr_time:.6f} seconds"
+        )
+        print(f"numexpr speedup: {numpy_time / numexpr_time:.2f}x")
+        print("-" * 40)
+
+
+if __name__ == "__main__":
+    run_benchmark_threaded()
diff --git a/numexpr/necompiler.py b/numexpr/necompiler.py
index 4ada878..537f816 100644
--- a/numexpr/necompiler.py
+++ b/numexpr/necompiler.py
@@ -775,14 +775,12 @@ def getArguments(names, local_dict=None, global_dict=None, _frame_depth: int=2):
 
 
 # Dictionaries for caching variable names and compiled expressions
-# _names_cache = CacheDict(256)
 _names_cache = threading.local()
-# _numexpr_cache = CacheDict(256)
 _numexpr_cache = threading.local()
-# _numexpr_last = ContextDict()
 _numexpr_last = threading.local()
 evaluate_lock = threading.Lock()
 
+
 def validate(ex: str,
              local_dict: Optional[Dict] = None,
              global_dict: Optional[Dict] = None,
@@ -856,7 +854,6 @@ def validate(ex: str,
     ----
 
     """
-    global _numexpr_last
     if not hasattr(_numexpr_last, 'l'):
         _numexpr_last.l = ContextDict()
 
@@ -998,7 +995,6 @@ def re_evaluate(local_dict: Optional[Dict] = None,
         The calling frame depth. Unless you are a NumExpr developer you should
         not set this value.
     """
-    global _numexpr_last
     if not hasattr(_numexpr_last, 'l'):
         _numexpr_last.l = ContextDict()
 
@@ -1009,5 +1005,5 @@ def re_evaluate(local_dict: Optional[Dict] = None,
     argnames = _numexpr_last.l['argnames']
     args = getArguments(argnames, local_dict, global_dict, _frame_depth=_frame_depth)
     kwargs = _numexpr_last.l['kwargs']
-    with evaluate_lock:
-        return compiled_ex(*args, **kwargs)
+    # with evaluate_lock:
+    return compiled_ex(*args, **kwargs)

From 462dd17b54cf4122941120d26f1f3d76f4db9b42 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Edgar=20Andr=C3=A9s=20Margffoy=20Tuay?= <andfoy@gmail.com>
Date: Mon, 7 Apr 2025 12:56:23 -0500
Subject: [PATCH 2/2] Add benchmark results

---
 bench/free_threading.py | 67 ++++++++++++++++++++++++++---------------
 1 file changed, 42 insertions(+), 25 deletions(-)

diff --git a/bench/free_threading.py b/bench/free_threading.py
index f070c09..cd00e78 100644
--- a/bench/free_threading.py
+++ b/bench/free_threading.py
@@ -1,36 +1,61 @@
 #################################################################################
-# To mimic the scenario that computation is i/o bound and constrained by memory
+# To compare the performance of numexpr when free-threading CPython is used.
 #
-# It's a much simplified version that the chunk is computed in a loop,
-# and expression is evaluated in a sequence, which is not true in reality.
-# Neverthless, numexpr outperforms numpy.
+# This example makes use of Python threads, as opposed to C native ones
+# in order to highlight the improvement introduced by free-threading CPython,
+# which now disables the GIL altogether.
 #################################################################################
 """
+Results with GIL-enabled CPython:
+
+Benchmarking Expression 1:
+NumPy time (threaded over 32 chunks with 16 threads): 1.173090 seconds
+numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 0.951071 seconds
+numexpr speedup: 1.23x
+----------------------------------------
+Benchmarking Expression 2:
+NumPy time (threaded over 32 chunks with 16 threads): 10.410874 seconds
+numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 8.248753 seconds
+numexpr speedup: 1.26x
+----------------------------------------
+Benchmarking Expression 3:
+NumPy time (threaded over 32 chunks with 16 threads): 9.605909 seconds
+numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 11.087108 seconds
+numexpr speedup: 0.87x
+----------------------------------------
+Benchmarking Expression 4:
+NumPy time (threaded over 32 chunks with 16 threads): 3.836962 seconds
+numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 18.054531 seconds
+numexpr speedup: 0.21x
+----------------------------------------
+
+Results with free-threading CPython:
+
 Benchmarking Expression 1:
-NumPy time (threaded over 32 chunks with 2 threads): 4.612313 seconds
-numexpr time (threaded with re_evaluate over 32 chunks with 2 threads): 0.951172 seconds
-numexpr speedup: 4.85x
+NumPy time (threaded over 32 chunks with 16 threads): 3.415349 seconds
+numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 2.618876 seconds
+numexpr speedup: 1.30x
 ----------------------------------------
 Benchmarking Expression 2:
-NumPy time (threaded over 32 chunks with 2 threads): 23.862752 seconds
-numexpr time (threaded with re_evaluate over 32 chunks with 2 threads): 2.182058 seconds
-numexpr speedup: 10.94x
+NumPy time (threaded over 32 chunks with 16 threads): 19.005238 seconds
+numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 12.611407 seconds
+numexpr speedup: 1.51x
 ----------------------------------------
 Benchmarking Expression 3:
-NumPy time (threaded over 32 chunks with 2 threads): 20.594895 seconds
-numexpr time (threaded with re_evaluate over 32 chunks with 2 threads): 2.927881 seconds
-numexpr speedup: 7.03x
+NumPy time (threaded over 32 chunks with 16 threads): 20.555149 seconds
+numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 17.690749 seconds
+numexpr speedup: 1.16x
 ----------------------------------------
 Benchmarking Expression 4:
-NumPy time (threaded over 32 chunks with 2 threads): 12.834101 seconds
-numexpr time (threaded with re_evaluate over 32 chunks with 2 threads): 5.392480 seconds
-numexpr speedup: 2.38x
+NumPy time (threaded over 32 chunks with 16 threads): 38.338372 seconds
+numexpr time (threaded with re_evaluate over 32 chunks with 16 threads): 35.074684 seconds
+numexpr speedup: 1.09x
 ----------------------------------------
 """
 
 import os
 
-os.environ["NUMEXPR_NUM_THREADS"] = "1"
+os.environ["NUMEXPR_NUM_THREADS"] = "2"
 import threading
 import timeit
 
@@ -91,14 +116,6 @@ def benchmark_numexpr_re_evaluate(expr, a, b, c, results, indices):
             ),
             number=num_runs,
         )
-        # else:
-        # Re-evaluate subsequent chunks with re_evaluate
-        # time_taken = timeit.timeit(
-        #     lambda: ne.re_evaluate(
-        #         local_dict={"a": a[start:end], "b": b[start:end], "c": c[start:end]}
-        #     ),
-        #     number=num_runs,
-        # )
         results.append(time_taken)