Skip to content

Commit ce3cb38

Browse files
committed
Refined bench and new results for linspace and different codecs
1 parent 4abfb19 commit ce3cb38

File tree

5 files changed

+47
-27
lines changed

5 files changed

+47
-27
lines changed

bench/ndarray/concatenate.py

Lines changed: 47 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
from matplotlib.ticker import ScalarFormatter
1515

1616

17-
def run_benchmark(num_arrays=10, size=500, aligned_chunks=False, axis=0, codec=blosc2.Codec.ZSTD):
17+
def run_benchmark(num_arrays=10, size=500, aligned_chunks=False, axis=0,
18+
dtype=np.float64, datadist="linspace", codec=blosc2.Codec.ZSTD):
1819
"""
1920
Benchmark blosc2.concatenate performance with different chunk alignments.
2021
@@ -23,6 +24,9 @@ def run_benchmark(num_arrays=10, size=500, aligned_chunks=False, axis=0, codec=b
2324
- size: Base size for array dimensions
2425
- aligned_chunks: Whether to use aligned chunk shapes
2526
- axis: Axis along which to concatenate (0 or 1)
27+
- dtype: Data type for the arrays (default is np.float64)
28+
- datadist: Distribution of data in arrays (default is "linspace")
29+
- codec: Codec to use for compression (default is blosc2.Codec.ZSTD)
2630
2731
Returns:
2832
- duration: Time taken in seconds
@@ -39,20 +43,28 @@ def run_benchmark(num_arrays=10, size=500, aligned_chunks=False, axis=0, codec=b
3943
raise ValueError("Only axis 0 and 1 are supported")
4044

4145
# Create appropriate chunk shapes
46+
chunks, blocks = blosc2.compute_chunks_blocks(shapes[0], dtype=dtype, cparams=blosc2.CParams(codec=codec))
4247
if aligned_chunks:
4348
# Aligned chunks: divisors of the shape dimensions
44-
chunk_shapes = [(shape[0] // 4, shape[1] // 4) for shape in shapes]
49+
chunk_shapes = [(chunks[0], chunks[1]) for shape in shapes]
4550
else:
4651
# Unaligned chunks: not divisors of shape dimensions
47-
chunk_shapes = [(shape[0] // 4 + 1, shape[1] // 4 - 1) for shape in shapes]
52+
chunk_shapes = [(chunks[0] + 1, chunks[1] - 1) for shape in shapes]
4853

4954
# Create arrays
5055
arrays = []
5156
for i, (shape, chunk_shape) in enumerate(zip(shapes, chunk_shapes)):
52-
arr = blosc2.arange(
53-
i * np.prod(shape), (i + 1) * np.prod(shape), 1, dtype="i4", shape=shape, chunks=chunk_shape,
54-
cparams=blosc2.CParams(codec=codec)
55-
)
57+
if datadist == "linspace":
58+
# Create arrays with linearly spaced values
59+
arr = blosc2.linspace(i, i + 1, num=np.prod(shape),
60+
dtype=dtype, shape=shape, chunks=chunk_shape,
61+
cparams=blosc2.CParams(codec=codec))
62+
else:
63+
# Default to arange for simplicity
64+
arr = blosc2.arange(
65+
i * np.prod(shape), (i + 1) * np.prod(shape), 1, dtype=dtype, shape=shape, chunks=chunk_shape,
66+
cparams=blosc2.CParams(codec=codec)
67+
)
5668
arrays.append(arr)
5769

5870
# Calculate total data size in GB (itemsize follows dtype; 8 bytes for the default np.float64)
@@ -67,14 +79,16 @@ def run_benchmark(num_arrays=10, size=500, aligned_chunks=False, axis=0, codec=b
6779
return duration, result.shape, data_size_gb
6880

6981

70-
def run_numpy_benchmark(num_arrays=10, size=500, axis=0):
82+
def run_numpy_benchmark(num_arrays=10, size=500, axis=0, dtype=np.float64, datadist="linspace"):
7183
"""
7284
Benchmark numpy.concatenate performance for comparison.
7385
7486
Parameters:
7587
- num_arrays: Number of arrays to concatenate
7688
- size: Base size for array dimensions
7789
- axis: Axis along which to concatenate (0 or 1)
90+
- dtype: Data type for the arrays (default is np.float64)
91+
- datadist: Distribution of data in arrays (default is "linspace")
7892
7993
Returns:
8094
- duration: Time taken in seconds
@@ -93,12 +107,11 @@ def run_numpy_benchmark(num_arrays=10, size=500, axis=0):
93107
# Create arrays
94108
numpy_arrays = []
95109
for i, shape in enumerate(shapes):
96-
arr = np.arange(
97-
i * np.prod(shape),
98-
(i + 1) * np.prod(shape),
99-
1,
100-
dtype="i4"
101-
).reshape(shape)
110+
if datadist == "linspace":
111+
# Create arrays with linearly spaced values
112+
arr = np.linspace(i, i + 1, num=np.prod(shape), dtype=dtype).reshape(shape)
113+
else:
114+
arr = np.arange(i * np.prod(shape), (i + 1) * np.prod(shape), 1, dtype=dtype).reshape(shape)
102115
numpy_arrays.append(arr)
103116

104117
# Calculate total data size in GB (itemsize follows dtype; 8 bytes for the default np.float64)
@@ -114,7 +127,8 @@ def run_numpy_benchmark(num_arrays=10, size=500, axis=0):
114127

115128

116129
def create_combined_plot(num_arrays, sizes, numpy_speeds_axis0, unaligned_speeds_axis0, aligned_speeds_axis0,
117-
numpy_speeds_axis1, unaligned_speeds_axis1, aligned_speeds_axis1, output_dir="plots"):
130+
numpy_speeds_axis1, unaligned_speeds_axis1, aligned_speeds_axis1, output_dir="plots",
131+
datadist="linspace", codec_str="LZ4"):
118132
"""
119133
Create a figure with two side-by-side bar plots comparing the performance for both axes.
120134
@@ -148,7 +162,7 @@ def create_combined_plot(num_arrays, sizes, numpy_speeds_axis0, unaligned_speeds
148162
# Add labels and titles
149163
for ax, axis in [(ax0, 0), (ax1, 1)]:
150164
ax.set_xlabel('Array Size (N for NxN array)', fontsize=12)
151-
ax.set_title(f'Concatenation Performance for {num_arrays} arrays (axis={axis})', fontsize=14)
165+
ax.set_title(f'Concatenation Performance for {num_arrays} arrays (axis={axis}) [{datadist}, {codec_str}]', fontsize=14)
152166
ax.set_xticks(x)
153167
ax.set_xticklabels(x_labels)
154168
ax.grid(True, axis='y', linestyle='--', alpha=0.7)
@@ -186,22 +200,25 @@ def autolabel(rects, ax):
186200

187201
# Save the plot
188202
plt.tight_layout()
189-
plt.savefig(os.path.join(output_dir, 'concatenate_benchmark_combined.png'), dpi=300)
203+
plt.savefig(os.path.join(output_dir, 'concatenate_benchmark_combined.png'), dpi=100)
190204
plt.show()
191205
plt.close()
192206

193207
print(f"Combined plot saved to {os.path.join(output_dir, 'concatenate_benchmark_combined.png')}")
194208

195209

196210
def main():
197-
codec = blosc2.Codec.BLOSCLZ
211+
# Parameters
212+
sizes = [500, 1000, 2000, 4000, 10000] #, 20000] # Sizes of arrays to test
213+
num_arrays = 10
214+
dtype = np.float64 # Data type for arrays
215+
datadist = "linspace" # Distribution of data in arrays
216+
codec = blosc2.Codec.LZ4
217+
codec_str = str(codec).split('.')[-1]
198218
print(f"{'=' * 70}")
199-
print(f"Blosc2 vs NumPy concatenation benchmark {codec=}")
219+
print(f"Blosc2 vs NumPy concatenation benchmark with {codec_str} codec")
200220
print(f"{'=' * 70}")
201221

202-
# Parameters
203-
sizes = [500, 1000, 2000, 4000] #, 10000] # must be divisible by 4 for aligned chunks
204-
num_arrays = 10
205222

206223
# Lists to store results for both axes
207224
numpy_speeds_axis0 = []
@@ -212,16 +229,18 @@ def main():
212229
aligned_speeds_axis1 = []
213230

214231
for axis in [0, 1]:
215-
print(f"\nConcatenating {num_arrays} arrays along axis {axis}")
232+
print(f"\nConcatenating {num_arrays} arrays along axis {axis} with data distribution '{datadist}' ")
216233
print(f"{'Size':<8} {'NumPy (GB/s)':<14} {'Unaligned (GB/s)':<18} "
217234
f"{'Aligned (GB/s)':<16} {'Alig vs Unalig':<16} {'Alig vs NumPy':<16}")
218235
print(f"{'-' * 90}")
219236

220237
for size in sizes:
221238
# Run the benchmarks
222-
numpy_time, numpy_shape, data_size_gb = run_numpy_benchmark(num_arrays, size, axis=axis)
223-
unaligned_time, shape1, _ = run_benchmark(num_arrays, size, aligned_chunks=False, axis=axis, codec=codec)
224-
aligned_time, shape2, _ = run_benchmark(num_arrays, size, aligned_chunks=True, axis=axis, codec=codec)
239+
numpy_time, numpy_shape, data_size_gb = run_numpy_benchmark(num_arrays, size, axis=axis, dtype=dtype)
240+
unaligned_time, shape1, _ = run_benchmark(num_arrays, size, aligned_chunks=False, axis=axis,
241+
dtype=dtype, datadist=datadist, codec=codec)
242+
aligned_time, shape2, _ = run_benchmark(num_arrays, size, aligned_chunks=True, axis=axis,
243+
dtype=dtype, datadist=datadist, codec=codec)
225244

226245
# Calculate throughputs in GB/s
227246
numpy_speed = data_size_gb / numpy_time if numpy_time > 0 else float("inf")
@@ -266,7 +285,8 @@ def main():
266285
num_arrays,
267286
sizes,
268287
numpy_speeds_axis0, unaligned_speeds_axis0, aligned_speeds_axis0,
269-
numpy_speeds_axis1, unaligned_speeds_axis1, aligned_speeds_axis1
288+
numpy_speeds_axis1, unaligned_speeds_axis1, aligned_speeds_axis1,
289+
datadist=datadist, output_dir="plots", codec_str=codec_str,
270290
)
271291

272292

-243 KB
Binary file not shown.
64.3 KB
Loading
63.4 KB
Loading
63.6 KB
Loading

0 commit comments

Comments
 (0)