Performance compared to PyTables #330

Closed · sd2k opened this issue Nov 14, 2018 · 5 comments

sd2k commented Nov 14, 2018

I recently found both zarr and PyTables (finally, a stable replacement for CSVs...) and was wondering if I'm doing something wrong in my choice of chunk shapes here. My data is roughly a 100000 x 20000 int64 array, fairly compressible, and I need to access it from multiple processes (I'm using PyTorch, which spawns multiple workers). I only really need to access a full row at a time, so I've been setting the chunk size to None on the second dimension.

However, my reads seem to be about 30x slower in zarr than in PyTables, despite using the same compressor/filter (blosc-blosclz). I can't quite reproduce that magnitude of difference with synthetic data, but in the example below zarr is still about 8x slower than PyTables.

Am I doing something wrong, or is this expected?

import os
import sys

import numcodecs
from numcodecs import Blosc
import numpy as np
import tables
import zarr


def access(z, n=100000):
    # Read one random row from the zarr array.
    i = np.random.randint(0, n)
    return z.data[i]


def access_tables(t, n=100000):
    # Read one random row from the PyTables EArray.
    i = np.random.randint(0, n)
    return t.root.data[i]


def create_zarr(path, n=100000, shape=(0, 20000), chunks=(100, None)):
    # Reuse an existing store if present, otherwise build it from scratch.
    if os.path.exists(path):
        return zarr.open(path)
    z = zarr.open(path, 'w')
    compressor = Blosc(cname='blosclz', clevel=7)
    # chunks=(100, None): 100 rows per chunk, full width on the second axis.
    arr = z.create('data', shape=shape, compressor=compressor, chunks=chunks)
    for _ in range(n):
        arr.append(np.random.randint(0, 10, (1, shape[1])))
    return z


def create_table(path, n=100000, shape=(0, 20000)):
    # Reuse an existing file if present, otherwise build it from scratch.
    if os.path.exists(path):
        return tables.open_file(path)
    t = tables.open_file(path, 'w')
    filters = tables.Filters(7, 'blosc')  # avoid shadowing the builtin `filter`
    a = tables.Float64Atom()
    arr = t.create_earray(
        t.root, 'data', a, shape, expectedrows=n, filters=filters,
    )
    for _ in range(n):
        arr.append(np.random.randint(0, 10, (1, shape[1])))
    return t


path = 'bench.{}'
z = create_zarr(path.format('zarr'))
t = create_table(path.format('h5'))

print('zarr info:')
print(z.data.info)
print('tables info:')
print(t.root.data)
print(t.root.data.filters)

print('zarr timings:')
%timeit access(z)
print('tables timings:')
%timeit access_tables(t)

print(f'zarr: {zarr.version.version}')
print(f'numcodecs: {numcodecs.version.version}')
print(f'tables: {tables.__version__}')
print(f'python: {sys.version_info}')
print(f'platform: {sys.platform}')

Output:

zarr info:
Name               : /data
Type               : zarr.core.Array
Data type          : float64
Shape              : (100000, 20000)
Chunk shape        : (100, 20000)
Order              : C
Read-only          : False
Compressor         : Blosc(cname='blosclz', clevel=7, shuffle=SHUFFLE,
                   : blocksize=0)
Store type         : zarr.storage.DirectoryStore
No. bytes          : 16000000000 (14.9G)
No. bytes stored   : 2219391813 (2.1G)
Storage ratio      : 7.2
Chunks initialized : 1000/1000

tables info:
/data (EArray(100000, 20000), shuffle, blosc(7)) ''
Filters(complevel=7, complib='blosc', shuffle=True, bitshuffle=False, fletcher32=False, least_significant_digit=None)
zarr timings:
4.11 ms ± 343 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
tables timings:
560 µs ± 36.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
zarr: 2.2.0
numcodecs: 0.5.5
tables: 3.4.4
python: sys.version_info(major=3, minor=6, micro=6, releaselevel='final', serial=0)
platform: linux

zarr/numcodecs/tables installed using conda.

@alimanfoo
Member

alimanfoo commented Nov 14, 2018

Hi @sd2k, I think it's probably because of the different sizes of the chunks along the first dimension.

You did not specify the chunk shape in PyTables, which means it will guess one based on expectedrows. In this case it guesses a chunk length of 6 along the first dimension:

In [1]: import tables

In [2]: t = tables.open_file('test.h5', mode='w')

In [3]: a = tables.Float64Atom()

In [5]: arr = t.create_earray(
   ...:     t.root, 'data', a, shape=(0, 20000), expectedrows=100000,
   ...: )

In [7]: arr
Out[7]: 
/data (EArray(0, 20000)) ''
  atom := Float64Atom(shape=(), dflt=0.0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := (6, 20000)

If you create the zarr array with chunks=(6, None) you should get comparable performance.
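For example, a minimal change applied to the create_zarr() call above (the store path here is just illustrative):

import zarr
from numcodecs import Blosc

# Match PyTables' guessed chunk shape: 6 rows per chunk, full width on the
# second axis (None expands to the array's extent in that dimension).
z = zarr.open('bench_small_chunks.zarr', mode='w')
arr = z.create('data', shape=(100000, 20000), dtype='f8',
               compressor=Blosc(cname='blosclz', clevel=7),
               chunks=(6, None))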

@alimanfoo
Member

Btw this is a use case where having a cache for decompressed chunks should accelerate things further, xref #278, PR in progress #306. It would allow more coarse-grained chunking without compromising the ability to access one row at a time.
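A partial stopgap that exists today, assuming zarr >= 2.2: zarr.LRUStoreCache can wrap the underlying store so recently used (still compressed) chunks are kept in memory, avoiding repeated disk reads, though each access still pays the decompression cost that a decompressed-chunk cache would remove. A minimal sketch:

import zarr

# Sketch assuming zarr >= 2.2: wrap the directory store in an in-memory LRU
# cache so repeated reads of the same chunk skip the disk. Chunks are cached
# in compressed form, so decompression still happens on every access.
store = zarr.DirectoryStore('bench.zarr')
cached = zarr.LRUStoreCache(store, max_size=2**30)  # cache up to ~1 GiB
z = zarr.open(cached, mode='r')
row = z['data'][42]  # repeated reads within one chunk avoid re-reading the disk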

@sd2k
Author

sd2k commented Nov 14, 2018

Yep, specifying a smaller chunk length resulted in pretty similar performance for the synthetic dataset. I'll try a similar thing on my real data this evening. Thanks @alimanfoo!

@alimanfoo
Member

Cool, no worries. FWIW, if you are reading one row at a time then chunks=(1, None) will be fastest. If you can adapt your logic to read one chunk at a time into memory and then iterate over the rows within each chunk, you would have the flexibility to use larger chunks, and larger chunks usually give (much) better read speed and compression ratios. Once #306 is in, that will be handled transparently for you, but it's not there yet.
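A rough sketch of that chunk-at-a-time pattern (the array path and loop body are illustrative, not from the thread):

import zarr

# Read one chunk of rows into memory, then iterate over the rows of the
# in-memory block, so each chunk is read and decompressed only once.
arr = zarr.open('bench.zarr', mode='r')['data']
rows_per_chunk = arr.chunks[0]

for start in range(0, arr.shape[0], rows_per_chunk):
    block = arr[start:start + rows_per_chunk]  # one chunk -> numpy array
    for row in block:
        ...  # process a single row here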

@sd2k
Author

sd2k commented Nov 14, 2018

Ace, thanks for the recommendation. I've settled on something low like 4 to strike a bit of a balance, and it's looking good!

And thanks for writing zarr :)

@sd2k sd2k closed this as completed Nov 14, 2018