Description
I recently found both zarr and PyTables (finally, a stable replacement for CSVs...) and was wondering if I'm doing something wrong in my choice of chunk shapes here. My data is roughly a 100000 x 20000 int64 array, fairly compressible, and I need to access it from multiple processes (I'm using PyTorch, which spawns multiple workers). I only really need to access a full row at a time, so I've been setting the chunk size to None on the second dimension.
However, my reads seem to be about 30x slower in zarr than in PyTables, despite using the same compressor/filter (blosc-blosclz). I can't quite reproduce this magnitude of difference using synthetic data, but in the example below zarr is about 8x slower than PyTables.
Am I doing something wrong, or is this expected?
import os
import sys
import numcodecs
from numcodecs import Blosc
import numpy as np
import tables
import zarr
def access(z, n=100000):
    """Read one random full row from the zarr array stored at ``z.data``.

    Parameters
    ----------
    z : object exposing a ``data`` array attribute (e.g. a zarr group).
    n : int, number of rows to sample from; the row index is drawn
        uniformly from ``[0, n)``.

    Returns
    -------
    The selected row (decompressed into memory).
    """
    # Bug fix: the upper bound was hard-coded to 100000, silently
    # ignoring the ``n`` parameter.
    i = np.random.randint(0, n)
    return z.data[i]
def access_tables(t, n=100000):
    """Read one random full row from the PyTables EArray at ``t.root.data``.

    Parameters
    ----------
    t : open ``tables`` file handle (or any object with ``root.data``).
    n : int, number of rows to sample from; the row index is drawn
        uniformly from ``[0, n)``.

    Returns
    -------
    The selected row (decompressed into memory).
    """
    # Bug fix: the upper bound was hard-coded to 100000, silently
    # ignoring the ``n`` parameter.
    i = np.random.randint(0, n)
    return t.root.data[i]
def create_zarr(path, n=100000, shape=(0, 20000), chunks=(100, None)):
    """Open the zarr store at *path*, creating and filling it if absent.

    When *path* does not exist yet, a 'data' array is created with the
    given *shape* and *chunks*, Blosc/blosclz level-7 compression, and
    *n* rows of random integers in [0, 10) are appended one at a time.
    Returns the (possibly pre-existing) zarr group.
    """
    # Reuse an existing store so repeated runs skip the slow fill step.
    if os.path.exists(path):
        return zarr.open(path)
    store = zarr.open(path, 'w')
    data = store.create(
        'data',
        shape=shape,
        compressor=Blosc(cname='blosclz', clevel=7),
        chunks=chunks,
    )
    ncols = shape[1]
    for _ in range(n):
        data.append(np.random.randint(0, 10, (1, ncols)))
    return store
def create_table(path, n=100000, shape=(0, 20000)):
    """Open the PyTables file at *path*, creating and filling it if absent.

    When *path* does not exist yet, an extendable 'data' EArray of
    float64 is created with Blosc level-7 compression and *n* rows of
    random integers in [0, 10) are appended one at a time.
    Returns the open ``tables`` file handle (caller is responsible for
    closing it).
    """
    # Reuse an existing file so repeated runs skip the slow fill step.
    if os.path.exists(path):
        return tables.open_file(path)
    t = tables.open_file(path, 'w')
    # Renamed from 'filter', which shadowed the builtin filter().
    filters = tables.Filters(7, 'blosc')
    atom = tables.Float64Atom()
    arr = t.create_earray(
        t.root, 'data', atom, shape, expectedrows=n, filters=filters,
    )
    for _ in range(n):
        arr.append(np.random.randint(0, 10, (1, shape[1])))
    return t
# Build (or reopen) the two benchmark datasets on disk; both builders
# skip regeneration when their file already exists.
path = 'bench.{}'
z = create_zarr(path.format('zarr'))
t = create_table(path.format('h5'))
# Dump dataset metadata so chunk shape and compression settings are visible.
print('zarr info:')
print(z.data.info)
print('tables info:')
print(t.root.data)
print(t.root.data.filters)
# IPython %timeit magics: time a single random-row read from each store.
# (This script must be run under IPython/Jupyter, not plain Python.)
print('zarr timings:')
%timeit access(z)
print('tables timings:')
%timeit access_tables(t)
# Library/interpreter versions for the bug report.
print(f'zarr: {zarr.version.version}')
print(f'numcodecs: {numcodecs.version.version}')
print(f'tables: {tables.__version__}')
print(f'python: {sys.version_info}')
print(f'platform: {sys.platform}')
Output:
zarr info:
Name : /data
Type : zarr.core.Array
Data type : float64
Shape : (100000, 20000)
Chunk shape : (100, 20000)
Order : C
Read-only : False
Compressor : Blosc(cname='blosclz', clevel=7, shuffle=SHUFFLE,
: blocksize=0)
Store type : zarr.storage.DirectoryStore
No. bytes : 16000000000 (14.9G)
No. bytes stored : 2219391813 (2.1G)
Storage ratio : 7.2
Chunks initialized : 1000/1000
tables info:
/data (EArray(100000, 20000), shuffle, blosc(7)) ''
Filters(complevel=7, complib='blosc', shuffle=True, bitshuffle=False, fletcher32=False, least_significant_digit=None)
zarr timings:
4.11 ms ± 343 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
tables timings:
560 µs ± 36.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
zarr: 2.2.0
numcodecs: 0.5.5
tables: 3.4.4
python: sys.version_info(major=3, minor=6, micro=6, releaselevel='final', serial=0)
platform: linux
zarr/numcodecs/tables installed using conda.