Skip to content

Adds crc32c codec #613

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Oct 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ jobs:
conda activate env
export DISABLE_NUMCODECS_AVX2=""
# TODO: put back zfpy import when it supports numpy 2.0
python -m pip install -v -e .[test,test_extras,msgpack]
python -m pip install -v -e .[test,test_extras,msgpack,crc32c]

- name: Install pcodec
if: matrix.python-version != '3.13.0'
Expand Down
11 changes: 11 additions & 0 deletions docs/checksum32.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@ CRC32
.. automethod:: from_config


CRC32C
------
.. autoclass:: CRC32C

.. autoattribute:: codec_id
.. automethod:: encode
.. automethod:: decode
.. automethod:: get_config
.. automethod:: from_config


Adler32
-------
.. autoclass:: Adler32
Expand Down
4 changes: 4 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ Fix
* Fix in-place mutation of input array in `BitRound`.
By :user:`Sam Levang <slevang>`, :issue:`608`

Enhancements
~~~~~~~~~~~~
* Add CRC32C checksum codec.
By :user:`Norman Rzepka <normanrz>`, :issue:`613`.

.. _release_0.13.1:

Expand Down
Binary file added fixture/adler32/array.05.npy
Binary file not shown.
Binary file added fixture/adler32/array.06.npy
Binary file not shown.
Binary file added fixture/adler32/array.07.npy
Binary file not shown.
Binary file added fixture/adler32/array.08.npy
Binary file not shown.
Binary file added fixture/adler32/array.09.npy
Binary file not shown.
Binary file added fixture/adler32/array.10.npy
Binary file not shown.
Binary file added fixture/adler32/array.11.npy
Binary file not shown.
Binary file added fixture/adler32/array.12.npy
Binary file not shown.
Binary file added fixture/adler32/codec.00/encoded.05.dat
Binary file not shown.
Binary file added fixture/adler32/codec.00/encoded.06.dat
Binary file not shown.
Binary file added fixture/adler32/codec.00/encoded.07.dat
Binary file not shown.
Binary file added fixture/adler32/codec.00/encoded.08.dat
Binary file not shown.
Binary file added fixture/adler32/codec.00/encoded.09.dat
Binary file not shown.
Binary file added fixture/adler32/codec.00/encoded.10.dat
Binary file not shown.
Binary file added fixture/adler32/codec.00/encoded.11.dat
Binary file not shown.
Binary file added fixture/adler32/codec.00/encoded.12.dat
Binary file not shown.
Binary file added fixture/crc32/array.05.npy
Binary file not shown.
Binary file added fixture/crc32/array.06.npy
Binary file not shown.
Binary file added fixture/crc32/array.07.npy
Binary file not shown.
Binary file added fixture/crc32/array.08.npy
Binary file not shown.
Binary file added fixture/crc32/array.09.npy
Binary file not shown.
Binary file added fixture/crc32/array.10.npy
Binary file not shown.
Binary file added fixture/crc32/array.11.npy
Binary file not shown.
Binary file added fixture/crc32/array.12.npy
Binary file not shown.
Binary file added fixture/crc32/codec.00/encoded.05.dat
Binary file not shown.
Binary file added fixture/crc32/codec.00/encoded.06.dat
Binary file not shown.
Binary file added fixture/crc32/codec.00/encoded.07.dat
Binary file not shown.
Binary file added fixture/crc32/codec.00/encoded.08.dat
Binary file not shown.
Binary file added fixture/crc32/codec.00/encoded.09.dat
Binary file not shown.
Binary file added fixture/crc32/codec.00/encoded.10.dat
Binary file not shown.
Binary file added fixture/crc32/codec.00/encoded.11.dat
Binary file not shown.
Binary file added fixture/crc32/codec.00/encoded.12.dat
Binary file not shown.
Binary file added fixture/crc32c/array.00.npy
Binary file not shown.
Binary file added fixture/crc32c/array.01.npy
Binary file not shown.
Binary file added fixture/crc32c/array.02.npy
Binary file not shown.
Binary file added fixture/crc32c/array.03.npy
Binary file not shown.
Binary file added fixture/crc32c/array.04.npy
Binary file not shown.
Binary file added fixture/crc32c/array.05.npy
Binary file not shown.
Binary file added fixture/crc32c/array.06.npy
Binary file not shown.
Binary file added fixture/crc32c/array.07.npy
Binary file not shown.
Binary file added fixture/crc32c/array.08.npy
Binary file not shown.
Binary file added fixture/crc32c/array.09.npy
Binary file not shown.
Binary file added fixture/crc32c/array.10.npy
Binary file not shown.
Binary file added fixture/crc32c/array.11.npy
Binary file not shown.
Binary file added fixture/crc32c/array.12.npy
Binary file not shown.
3 changes: 3 additions & 0 deletions fixture/crc32c/codec.00/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"id": "crc32c"
}
Binary file added fixture/crc32c/codec.00/encoded.00.dat
Binary file not shown.
Binary file added fixture/crc32c/codec.00/encoded.01.dat
Binary file not shown.
Binary file added fixture/crc32c/codec.00/encoded.02.dat
Binary file not shown.
Binary file added fixture/crc32c/codec.00/encoded.03.dat
Binary file not shown.
Binary file added fixture/crc32c/codec.00/encoded.04.dat
Binary file not shown.
Binary file added fixture/crc32c/codec.00/encoded.05.dat
Binary file not shown.
Binary file added fixture/crc32c/codec.00/encoded.06.dat
Binary file not shown.
Binary file added fixture/crc32c/codec.00/encoded.07.dat
Binary file not shown.
Binary file added fixture/crc32c/codec.00/encoded.08.dat
Binary file not shown.
Binary file added fixture/crc32c/codec.00/encoded.09.dat
Binary file not shown.
Binary file added fixture/crc32c/codec.00/encoded.10.dat
Binary file not shown.
Binary file added fixture/crc32c/codec.00/encoded.11.dat
Binary file not shown.
Binary file added fixture/crc32c/codec.00/encoded.12.dat
Binary file not shown.
Binary file added fixture/delta/bool/array.00.npy
Binary file not shown.
5 changes: 5 additions & 0 deletions fixture/delta/bool/codec.00/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"astype": "|b1",
"dtype": "|b1",
"id": "delta"
}
Binary file added fixture/delta/bool/codec.00/encoded.00.dat
Binary file not shown.
3 changes: 2 additions & 1 deletion numcodecs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,10 @@

register_codec(MsgPack)

from numcodecs.checksum32 import CRC32, Adler32, JenkinsLookup3
from numcodecs.checksum32 import CRC32, CRC32C, Adler32, JenkinsLookup3

register_codec(CRC32)
register_codec(CRC32C)
register_codec(Adler32)
register_codec(JenkinsLookup3)

Expand Down
90 changes: 81 additions & 9 deletions numcodecs/checksum32.py
Original file line number Diff line number Diff line change
@@ -1,42 +1,111 @@
import struct
import zlib
from typing import Literal

import numpy as np

from .abc import Codec
from .compat import ensure_contiguous_ndarray, ndarray_copy
from .jenkins import jenkins_lookup3

CHECKSUM_LOCATION = Literal['start', 'end']


class Checksum32(Codec):
    """Base class for codecs that attach a 32-bit checksum to a buffer.

    Sub-classes set ``codec_id`` and provide ``checksum``, a callable that
    takes the payload bytes and returns an integer. Only its low 32 bits are
    stored, as a little-endian unsigned integer.

    Parameters
    ----------
    location : 'start' or 'end', optional
        Where to place the checksum in the encoded buffer. When omitted, the
        class-level ``location`` of the concrete codec is used.

    Raises
    ------
    ValueError
        If the resulting location is neither 'start' nor 'end'.
    """

    # override in sub-class
    checksum = None
    location: CHECKSUM_LOCATION = 'start'

    def __init__(self, location: CHECKSUM_LOCATION | None = None):
        # Only shadow the class-level default when a value was given.
        if location is not None:
            self.location = location
        if self.location not in ['start', 'end']:
            raise ValueError(f"Invalid checksum location: {self.location}")

    def encode(self, buf):
        """Return a new uint8 array holding the payload plus its checksum.

        The result is four bytes longer than the input; the checksum sits
        before or after the payload depending on ``self.location``.
        """
        arr = ensure_contiguous_ndarray(buf).view('u1')
        checksum = self.checksum(arr) & 0xFFFFFFFF
        enc = np.empty(arr.nbytes + 4, dtype='u1')
        if self.location == 'start':
            checksum_view = enc[:4]
            payload_view = enc[4:]
        else:
            checksum_view = enc[-4:]
            payload_view = enc[:-4]
        # Both views alias ``enc``, so writing through them fills the output.
        checksum_view.view('<u4')[0] = checksum
        ndarray_copy(arr, payload_view)
        return enc

    def decode(self, buf, out=None):
        """Verify the stored checksum and return the payload.

        Raises
        ------
        ValueError
            If ``buf`` has fewer than four elements.
        RuntimeError
            If the stored checksum differs from the one computed over the
            payload.
        """
        if len(buf) < 4:
            raise ValueError("Input buffer is too short to contain a 32-bit checksum.")
        if out is not None:
            ensure_contiguous_ndarray(out)  # check that out is a valid ndarray

        arr = ensure_contiguous_ndarray(buf).view('u1')
        if self.location == 'start':
            checksum_view = arr[:4]
            payload_view = arr[4:]
        else:
            checksum_view = arr[-4:]
            payload_view = arr[:-4]
        expect = checksum_view.view('<u4')[0]
        checksum = self.checksum(payload_view) & 0xFFFFFFFF
        if expect != checksum:
            raise RuntimeError(
                f"Stored and computed {self.codec_id} checksum do not match. "
                f"Stored: {expect}. Computed: {checksum}."
            )
        return ndarray_copy(payload_view, out)


class CRC32(Checksum32):
    """Codec that adds a CRC32 checksum to the buffer.

    The checksum is computed with :func:`zlib.crc32`.

    Parameters
    ----------
    location : 'start' or 'end'
        Where to place the checksum in the buffer. Defaults to 'start'.
    """

    codec_id = 'crc32'
    checksum = zlib.crc32
    location = 'start'


class CRC32C(Checksum32):
    """Codec that adds a CRC32C checksum to the buffer.

    The checksum comes from the optional ``crc32c`` package, which is
    imported lazily the first time a checksum is computed.

    Parameters
    ----------
    location : 'start' or 'end'
        Where to place the checksum in the buffer. Defaults to 'end'.
    """

    codec_id = 'crc32c'
    location = 'end'

    def checksum(self, buf):
        """Return the CRC32C checksum of ``buf``.

        Raises
        ------
        ImportError
            If the ``crc32c`` package is not installed.
        """
        # Keep the try body to the import alone so an error raised while
        # computing the checksum is never mistaken for a missing dependency.
        try:
            from crc32c import crc32c as crc32c_
        except ImportError as err:  # pragma: no cover
            raise ImportError(
                "crc32c must be installed to use the CRC32C checksum codec."
            ) from err
        return crc32c_(buf)


class Adler32(Checksum32):
    """Codec that adds an Adler-32 checksum to the buffer.

    The checksum is computed with :func:`zlib.adler32`.

    Parameters
    ----------
    location : 'start' or 'end'
        Where to place the checksum in the buffer. Defaults to 'start'.
    """

    codec_id = 'adler32'
    checksum = zlib.adler32
    location = 'start'


class JenkinsLookup3(Checksum32):
Expand All @@ -50,9 +119,12 @@ class JenkinsLookup3(Checksum32):
the data portion and compared with the four-byte checksum, raising
RuntimeError if inconsistent.

Attributes:
initval: initial seed passed to the hash algorithm, default: 0
prefix: bytes prepended to the buffer before evaluating the hash, default: None
Parameters
----------
initval : int
initial seed passed to the hash algorithm, default: 0
prefix : int
bytes prepended to the buffer before evaluating the hash, default: None
"""

checksum = jenkins_lookup3
Expand Down
116 changes: 98 additions & 18 deletions numcodecs/tests/test_checksum32.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
import numpy as np
import pytest

from numcodecs.checksum32 import CRC32, Adler32
from numcodecs.checksum32 import CRC32, CRC32C, Adler32
from numcodecs.tests.common import (
check_backwards_compatibility,
check_config,
check_encode_decode,
check_err_decode_object_buffer,
check_err_encode_object_buffer,
check_repr,
)
Expand All @@ -21,38 +22,117 @@
np.random.normal(loc=1000, scale=1, size=(100, 10)),
np.random.randint(0, 2, size=1000, dtype=bool).reshape(100, 10, order='F'),
np.random.choice([b'a', b'bb', b'ccc'], size=1000).reshape(10, 10, 10),
np.random.randint(0, 2**60, size=1000, dtype='u8').view('M8[ns]'),
np.random.randint(0, 2**60, size=1000, dtype='u8').view('m8[ns]'),
np.random.randint(0, 2**25, size=1000, dtype='u8').view('M8[m]'),
np.random.randint(0, 2**25, size=1000, dtype='u8').view('m8[m]'),
np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('M8[ns]'),
np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('m8[ns]'),
np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('M8[m]'),
np.random.randint(-(2**63), -(2**63) + 20, size=1000, dtype='i8').view('m8[m]'),
]

# Each codec appears once with its default checksum location and once with
# the opposite one, so both placements are covered.
codecs = [
    CRC32(),
    CRC32(location="end"),
    CRC32C(location="start"),
    CRC32C(),
    Adler32(),
    Adler32(location="end"),
]


@pytest.mark.parametrize(("codec", "arr"), itertools.product(codecs, arrays))
def test_encode_decode(codec, arr):
    """Round-trip every sample array through every codec configuration."""
    check_encode_decode(arr, codec)


@pytest.mark.parametrize(("codec", "arr"), itertools.product(codecs, arrays))
def test_errors(codec, arr):
    """Decoding an encoded buffer with its last byte removed must fail."""
    truncated = codec.encode(arr)[:-1]
    with pytest.raises(RuntimeError):
        codec.decode(truncated)


@pytest.mark.parametrize("codec", codecs)
def test_config(codec):
    """Run the shared configuration check on each codec."""
    check_config(codec)


@pytest.mark.parametrize("codec", codecs)
def test_err_input_too_small(codec):
    """A buffer shorter than the checksum itself is rejected."""
    # Three bytes cannot hold a 32-bit (four-byte) checksum.
    too_short = b'000'
    with pytest.raises(ValueError):
        codec.decode(too_short)


@pytest.mark.parametrize("codec", codecs)
def test_err_encode_non_contiguous(codec):
    """Encoding a strided (non-contiguous) array raises ValueError."""
    # Taking every second element yields non-contiguous memory.
    arr = np.arange(1000, dtype='i4')[::2]
    with pytest.raises(ValueError):
        codec.encode(arr)


@pytest.mark.parametrize("codec", codecs)
def test_err_encode_list(codec):
    """Encoding a plain list of strings raises TypeError."""
    data = ['foo', 'bar', 'baz']
    with pytest.raises(TypeError):
        codec.encode(data)


def test_err_location():
    """Every checksum codec rejects an unknown checksum location."""
    for codec_cls in (CRC32, CRC32C, Adler32):
        with pytest.raises(ValueError):
            codec_cls(location="foo")


def test_repr():
    """Check the repr of each codec for both checksum locations."""
    check_repr("CRC32(location='start')")
    check_repr("CRC32C(location='start')")
    check_repr("Adler32(location='start')")
    check_repr("CRC32(location='end')")
    check_repr("CRC32C(location='end')")
    check_repr("Adler32(location='end')")


def test_backwards_compatibility():
    """Check each default-configured codec against its stored fixtures."""
    for codec_cls in (CRC32, Adler32, CRC32C):
        check_backwards_compatibility(codec_cls.codec_id, arrays, [codec_cls()])


@pytest.mark.parametrize("codec", codecs)
def test_err_encode_object_buffer(codec):
    """Run the shared object-buffer encode check on each codec."""
    check_err_encode_object_buffer(codec)


@pytest.mark.parametrize("codec", codecs)
def test_err_decode_object_buffer(codec):
    """Run the shared object-buffer decode check on each codec."""
    check_err_decode_object_buffer(codec)


@pytest.mark.parametrize("codec", codecs)
def test_err_out_too_small(codec):
    """Decoding into an output one element too short raises ValueError."""
    arr = np.arange(10, dtype='i4')
    encoded = codec.encode(arr)
    # Drop the last element so the destination cannot hold the payload.
    out = np.empty_like(arr)[:-1]
    with pytest.raises(ValueError):
        codec.decode(encoded, out)


def test_crc32c_checksum():
    """The trailing four bytes hold the expected CRC32C of bytes 0..63."""
    payload = np.arange(0, 64, dtype="uint8")
    encoded = CRC32C(location="end").encode(payload)
    # Read the little-endian checksum stored after the payload.
    stored = np.frombuffer(encoded, dtype="<u4", offset=(len(encoded) - 4))[0]
    assert stored == np.uint32(4218238699)


@pytest.mark.parametrize("codec", codecs)
def test_err_checksum(codec):
    """A corrupted encoded buffer fails checksum verification on decode."""
    arr = np.arange(0, 64, dtype="uint8")
    buf = bytearray(codec.encode(arr))
    # Flip every bit of the last byte so the buffer always changes, whatever
    # that byte was. For location='end' this corrupts the stored checksum;
    # for location='start' it corrupts the payload. Either must be detected.
    buf[-1] ^= 0xFF
    with pytest.raises(RuntimeError):
        codec.decode(buf)
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ zfpy = [
pcodec = [
"pcodec>=0.2.0",
]
crc32c = [
"crc32c>=2.7",
]

[tool.setuptools]
license-files = ["LICENSE.txt"]
Expand Down
Loading