1414from matplotlib .ticker import ScalarFormatter
1515
1616
17- def run_benchmark (num_arrays = 10 , size = 500 , aligned_chunks = False , axis = 0 , codec = blosc2 .Codec .ZSTD ):
17+ def run_benchmark (num_arrays = 10 , size = 500 , aligned_chunks = False , axis = 0 ,
18+ dtype = np .float64 , datadist = "linspace" , codec = blosc2 .Codec .ZSTD ):
1819 """
1920 Benchmark blosc2.concatenate performance with different chunk alignments.
2021
@@ -23,6 +24,9 @@ def run_benchmark(num_arrays=10, size=500, aligned_chunks=False, axis=0, codec=b
2324 - size: Base size for array dimensions
2425 - aligned_chunks: Whether to use aligned chunk shapes
2526 - axis: Axis along which to concatenate (0 or 1)
27+ - dtype: Data type for the arrays (default is np.float64)
28+ - datadist: Distribution of data in arrays (default is "linspace")
29+ - codec: Codec to use for compression (default is blosc2.Codec.ZSTD)
2630
2731 Returns:
2832 - duration: Time taken in seconds
@@ -39,20 +43,28 @@ def run_benchmark(num_arrays=10, size=500, aligned_chunks=False, axis=0, codec=b
3943 raise ValueError ("Only axis 0 and 1 are supported" )
4044
4145 # Create appropriate chunk shapes
46+ chunks , blocks = blosc2 .compute_chunks_blocks (shapes [0 ], dtype = dtype , cparams = blosc2 .CParams (codec = codec ))
4247 if aligned_chunks :
4348 # Aligned chunks: use the chunk shape computed by blosc2.compute_chunks_blocks as-is
44- chunk_shapes = [(shape [0 ] // 4 , shape [1 ] // 4 ) for shape in shapes ]
49+ chunk_shapes = [(chunks [0 ], chunks [1 ]) for shape in shapes ]
4550 else :
4651 # Unaligned chunks: offset the computed chunk shape by +1/-1 so it differs from the aligned one
47- chunk_shapes = [(shape [0 ] // 4 + 1 , shape [1 ] // 4 - 1 ) for shape in shapes ]
52+ chunk_shapes = [(chunks [0 ] + 1 , chunks [1 ] - 1 ) for shape in shapes ]
4853
4954 # Create arrays
5055 arrays = []
5156 for i , (shape , chunk_shape ) in enumerate (zip (shapes , chunk_shapes )):
52- arr = blosc2 .arange (
53- i * np .prod (shape ), (i + 1 ) * np .prod (shape ), 1 , dtype = "i4" , shape = shape , chunks = chunk_shape ,
54- cparams = blosc2 .CParams (codec = codec )
55- )
57+ if datadist == "linspace" :
58+ # Create arrays with linearly spaced values
59+ arr = blosc2 .linspace (i , i + 1 , num = np .prod (shape ),
60+ dtype = dtype , shape = shape , chunks = chunk_shape ,
61+ cparams = blosc2 .CParams (codec = codec ))
62+ else :
63+ # Default to arange for simplicity
64+ arr = blosc2 .arange (
65+ i * np .prod (shape ), (i + 1 ) * np .prod (shape ), 1 , dtype = dtype , shape = shape , chunks = chunk_shape ,
66+ cparams = blosc2 .CParams (codec = codec )
67+ )
5668 arrays .append (arr )
5769
5870 # Calculate total data size in GB (NOTE(review): arrays now use `dtype`, not int32 -- confirm the bytes-per-element used below matches its itemsize)
@@ -67,14 +79,16 @@ def run_benchmark(num_arrays=10, size=500, aligned_chunks=False, axis=0, codec=b
6779 return duration , result .shape , data_size_gb
6880
6981
70- def run_numpy_benchmark (num_arrays = 10 , size = 500 , axis = 0 ):
82+ def run_numpy_benchmark (num_arrays = 10 , size = 500 , axis = 0 , dtype = np . float64 , datadist = "linspace" ):
7183 """
7284 Benchmark numpy.concatenate performance for comparison.
7385
7486 Parameters:
7587 - num_arrays: Number of arrays to concatenate
7688 - size: Base size for array dimensions
7789 - axis: Axis along which to concatenate (0 or 1)
90+ - dtype: Data type for the arrays (default is np.float64)
91+ - datadist: Distribution of data in arrays (default is "linspace")
7892
7993 Returns:
8094 - duration: Time taken in seconds
@@ -93,12 +107,11 @@ def run_numpy_benchmark(num_arrays=10, size=500, axis=0):
93107 # Create arrays
94108 numpy_arrays = []
95109 for i , shape in enumerate (shapes ):
96- arr = np .arange (
97- i * np .prod (shape ),
98- (i + 1 ) * np .prod (shape ),
99- 1 ,
100- dtype = "i4"
101- ).reshape (shape )
110+ if datadist == "linspace" :
111+ # Create arrays with linearly spaced values
112+ arr = np .linspace (i , i + 1 , num = np .prod (shape ), dtype = dtype ).reshape (shape )
113+ else :
114+ arr = np .arange (i * np .prod (shape ), (i + 1 ) * np .prod (shape ), 1 , dtype = dtype ).reshape (shape )
102115 numpy_arrays .append (arr )
103116
104117 # Calculate total data size in GB (NOTE(review): arrays now use `dtype`, not int32 -- confirm the bytes-per-element used below matches its itemsize)
@@ -114,7 +127,8 @@ def run_numpy_benchmark(num_arrays=10, size=500, axis=0):
114127
115128
116129def create_combined_plot (num_arrays , sizes , numpy_speeds_axis0 , unaligned_speeds_axis0 , aligned_speeds_axis0 ,
117- numpy_speeds_axis1 , unaligned_speeds_axis1 , aligned_speeds_axis1 , output_dir = "plots" ):
130+ numpy_speeds_axis1 , unaligned_speeds_axis1 , aligned_speeds_axis1 , output_dir = "plots" ,
131+ datadist = "linspace" , codec_str = "LZ4" ):
118132 """
119133 Create a figure with two side-by-side bar plots comparing the performance for both axes.
120134
@@ -148,7 +162,7 @@ def create_combined_plot(num_arrays, sizes, numpy_speeds_axis0, unaligned_speeds
148162 # Add labels and titles
149163 for ax , axis in [(ax0 , 0 ), (ax1 , 1 )]:
150164 ax .set_xlabel ('Array Size (N for NxN array)' , fontsize = 12 )
151- ax .set_title (f'Concatenation Performance for { num_arrays } arrays (axis={ axis } )' , fontsize = 14 )
165+ ax .set_title (f'Concatenation Performance for { num_arrays } arrays (axis={ axis } ) [ { datadist } , { codec_str } ] ' , fontsize = 14 )
152166 ax .set_xticks (x )
153167 ax .set_xticklabels (x_labels )
154168 ax .grid (True , axis = 'y' , linestyle = '--' , alpha = 0.7 )
@@ -186,22 +200,25 @@ def autolabel(rects, ax):
186200
187201 # Save the plot
188202 plt .tight_layout ()
189- plt .savefig (os .path .join (output_dir , 'concatenate_benchmark_combined.png' ), dpi = 300 )
203+ plt .savefig (os .path .join (output_dir , 'concatenate_benchmark_combined.png' ), dpi = 100 )
190204 plt .show ()
191205 plt .close ()
192206
193207 print (f"Combined plot saved to { os .path .join (output_dir , 'concatenate_benchmark_combined.png' )} " )
194208
195209
196210def main ():
197- codec = blosc2 .Codec .BLOSCLZ
211+ # Parameters
212+ sizes = [500 , 1000 , 2000 , 4000 , 10000 ] #, 20000] # Sizes of arrays to test
213+ num_arrays = 10
214+ dtype = np .float64 # Data type for arrays
215+ datadist = "linspace" # Distribution of data in arrays
216+ codec = blosc2 .Codec .LZ4
217+ codec_str = str (codec ).split ('.' )[- 1 ]
198218 print (f"{ '=' * 70 } " )
199- print (f"Blosc2 vs NumPy concatenation benchmark { codec = } " )
219+ print (f"Blosc2 vs NumPy concatenation benchmark with { codec_str } codec " )
200220 print (f"{ '=' * 70 } " )
201221
202- # Parameters
203- sizes = [500 , 1000 , 2000 , 4000 ] #, 10000] # must be divisible by 4 for aligned chunks
204- num_arrays = 10
205222
206223 # Lists to store results for both axes
207224 numpy_speeds_axis0 = []
@@ -212,16 +229,18 @@ def main():
212229 aligned_speeds_axis1 = []
213230
214231 for axis in [0 , 1 ]:
215- print (f"\n Concatenating { num_arrays } arrays along axis { axis } " )
232+ print (f"\n Concatenating { num_arrays } arrays along axis { axis } with data distribution ' { datadist } ' " )
216233 print (f"{ 'Size' :<8} { 'NumPy (GB/s)' :<14} { 'Unaligned (GB/s)' :<18} "
217234 f"{ 'Aligned (GB/s)' :<16} { 'Alig vs Unalig' :<16} { 'Alig vs NumPy' :<16} " )
218235 print (f"{ '-' * 90 } " )
219236
220237 for size in sizes :
221238 # Run the benchmarks
222- numpy_time , numpy_shape , data_size_gb = run_numpy_benchmark (num_arrays , size , axis = axis )
223- unaligned_time , shape1 , _ = run_benchmark (num_arrays , size , aligned_chunks = False , axis = axis , codec = codec )
224- aligned_time , shape2 , _ = run_benchmark (num_arrays , size , aligned_chunks = True , axis = axis , codec = codec )
239+ numpy_time , numpy_shape , data_size_gb = run_numpy_benchmark (num_arrays , size , axis = axis , dtype = dtype )
240+ unaligned_time , shape1 , _ = run_benchmark (num_arrays , size , aligned_chunks = False , axis = axis ,
241+ dtype = dtype , datadist = datadist , codec = codec )
242+ aligned_time , shape2 , _ = run_benchmark (num_arrays , size , aligned_chunks = True , axis = axis ,
243+ dtype = dtype , datadist = datadist , codec = codec )
225244
226245 # Calculate throughputs in GB/s
227246 numpy_speed = data_size_gb / numpy_time if numpy_time > 0 else float ("inf" )
@@ -266,7 +285,8 @@ def main():
266285 num_arrays ,
267286 sizes ,
268287 numpy_speeds_axis0 , unaligned_speeds_axis0 , aligned_speeds_axis0 ,
269- numpy_speeds_axis1 , unaligned_speeds_axis1 , aligned_speeds_axis1
288+ numpy_speeds_axis1 , unaligned_speeds_axis1 , aligned_speeds_axis1 ,
289+ datadist = datadist , output_dir = "plots" , codec_str = codec_str ,
270290 )
271291
272292
0 commit comments