Skip to content

AsType Filter #96

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 4, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api/codecs.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ code of this module for details.
.. autoclass:: BZ2
.. autoclass:: LZMA
.. autoclass:: Delta
.. autoclass:: AsType
.. autoclass:: FixedScaleOffset
.. autoclass:: Quantize
.. autoclass:: PackBits
Expand Down
1 change: 1 addition & 0 deletions docs/api/core.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ The Array class (``zarr.core``)
.. automethod:: resize
.. automethod:: append
.. automethod:: view
.. automethod:: astype
83 changes: 83 additions & 0 deletions zarr/codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,89 @@ def __repr__(self):
codec_registry[Delta.codec_id] = Delta


class AsType(Codec):
"""Filter to convert data between different types.

Parameters
----------
encode_dtype : dtype
Data type to use for encoded data.
decode_dtype : dtype, optional
Data type to use for decoded data.

Notes
-----
If `encode_dtype` is of lower precision than `decode_dtype`, please be
aware that data loss can occur by writing data to disk using this filter.
No checks are made to ensure the casting will work in that direction and
data corruption will occur.

Examples
--------
>>> import zarr
>>> import numpy as np
>>> x = np.arange(100, 120, 2, dtype=np.int8)
>>> x
array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8)
>>> f = zarr.AsType(encode_dtype=x.dtype, decode_dtype=np.int64)
>>> y = f.decode(x)
>>> y
array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118])
>>> z = f.encode(y)
>>> z
array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this example, may I suggest using 'i4' instead of 'i8'? It's just that 'i8' and 'int8' look similar, so there is potential for confusion for users less familiar with NumPy.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alternatively, we could just use np.int8 instead of 'i1' (and so on) to avoid any confusion.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Went ahead and switched to np.int8 and the like. Please let me know if this is acceptable.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, that's great.

""" # flake8: noqa

codec_id = 'astype'

def __init__(self, encode_dtype, decode_dtype):
    """Record the encoded and decoded data types.

    Both arguments are required and are passed through ``np.dtype``, so
    the stored attributes are always dtype objects.
    """
    as_dtype = np.dtype
    self.encode_dtype = as_dtype(encode_dtype)
    self.decode_dtype = as_dtype(decode_dtype)

def encode(self, buf):
    """Convert `buf` from the decode dtype to the encode dtype.

    The input is interpreted via ``_ndarray_from_buffer`` using
    ``self.decode_dtype``; ``astype`` then returns a converted copy in
    ``self.encode_dtype``.
    """
    # Interpret the input as an array of the decoded type, then cast.
    # ``astype`` produces a new array, so the input is not modified here.
    return _ndarray_from_buffer(buf, self.decode_dtype).astype(
        self.encode_dtype
    )

def decode(self, buf, out=None):
    """Convert `buf` from the encode dtype back to the decode dtype.

    The encoded input is interpreted via ``_ndarray_from_buffer`` using
    ``self.encode_dtype`` and cast to ``self.decode_dtype``. The result is
    handed to ``_buffer_copy`` together with `out`, and whatever that
    helper returns is returned to the caller.
    """
    # Interpret the encoded bytes, then cast to the decoded type (a copy).
    decoded = _ndarray_from_buffer(buf, self.encode_dtype).astype(
        self.decode_dtype
    )

    # Output handling (presumably copying into `out` when one is given)
    # is delegated to the shared helper.
    return _buffer_copy(decoded, out)

def get_config(self):
    """Return the codec configuration as a plain dict.

    The dict holds the codec id plus both dtypes in their string form
    (``dtype.str``), e.g. ``'<i4'``.
    """
    return {
        'id': self.codec_id,
        'encode_dtype': self.encode_dtype.str,
        'decode_dtype': self.decode_dtype.str,
    }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you did add a casting property, you would also need to include it in the config.


def __repr__(self):
    """Return ``ClassName(encode_dtype=..., decode_dtype=...)``."""
    # Use the runtime class name so subclasses report their own name.
    fields = (type(self).__name__, self.encode_dtype, self.decode_dtype)
    return '%s(encode_dtype=%s, decode_dtype=%s)' % fields


codec_registry[AsType.codec_id] = AsType


class FixedScaleOffset(Codec):
"""Simplified version of the scale-offset filter available in HDF5.
Applies the transformation `(x - offset) * scale` to all chunks. Results
Expand Down
63 changes: 62 additions & 1 deletion zarr/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from zarr.attrs import Attributes
from zarr.errors import PermissionError, err_read_only, err_array_not_found
from zarr.compat import reduce
from zarr.codecs import get_codec
from zarr.codecs import AsType, get_codec


class Array(object):
Expand Down Expand Up @@ -73,6 +73,7 @@ class Array(object):
resize
append
view
astype

""" # flake8: noqa

Expand Down Expand Up @@ -1176,3 +1177,63 @@ def view(self, shape=None, chunks=None, dtype=None,
a._filters = filters

return a

def astype(self, dtype):
"""Does on the fly type conversion of the underlying data.

Parameters
----------
dtype : string or dtype
NumPy dtype.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So one could technically write to disk when using astype. I don't know if we should add a note or warning about this. Maybe something similar to what view has. Please let me know if you have any thoughts on this.

Notes
-----
This method returns a new Array object which is a view on the same
underlying chunk data. Modifying any data via the view is currently
not permitted and will result in an error. This is an experimental
feature and its behavior is subject to change in the future.

See Also
--------
Array.view

Examples
--------

>>> import zarr
>>> import numpy as np
>>> data = np.arange(100, dtype=np.uint8)
>>> a = zarr.array(data, chunks=10)
>>> a[:]
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99], dtype=uint8)
>>> v = a.astype(np.float32)
>>> v.is_view
True
>>> v[:]
array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9.,
10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
20., 21., 22., 23., 24., 25., 26., 27., 28., 29.,
30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
40., 41., 42., 43., 44., 45., 46., 47., 48., 49.,
50., 51., 52., 53., 54., 55., 56., 57., 58., 59.,
60., 61., 62., 63., 64., 65., 66., 67., 68., 69.,
70., 71., 72., 73., 74., 75., 76., 77., 78., 79.,
80., 81., 82., 83., 84., 85., 86., 87., 88., 89.,
90., 91., 92., 93., 94., 95., 96., 97., 98., 99.],
dtype=float32)
""" # flake8: noqa

dtype = np.dtype(dtype)

filters = []
if self._filters:
filters.extend(self._filters)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I had just realized that _filters could be None, so I had to update this like so. Please let me know if this is ok.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks ok to me.

filters.insert(0, AsType(encode_dtype=self._dtype, decode_dtype=dtype))

return self.view(filters=filters, dtype=dtype, read_only=True)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that I'm thinking about it, it honestly seems pretty reasonable to me to just set read_only to True for now on the view. I am having trouble seeing a good use case for using this to write to disk, especially as there are easier and clearer ways to do that. If someone asks, it is pretty easy to relax this constraint later. What do you think?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I should add that this avoids some of the concerns raised about using casting, as this allows it to have the typical meaning.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the hiatus. I'm very happy to set read_only=True by default.

58 changes: 58 additions & 0 deletions zarr/tests/test_codecs.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,64 @@ def test_repr(self):
eq(expect, actual)


class TestAsType(CodecTests, unittest.TestCase):
    """Tests for the 'astype' codec, driven by inherited helper methods."""

    codec_id = 'astype'

    def test_encode(self):
        # Same dtype on both sides: run the encode check on every
        # float / signed / unsigned test array and skip the rest.
        numeric_kinds = ('f', 'i', 'u')
        for arr in test_arrays:
            if arr.dtype.kind not in numeric_kinds:
                continue
            self._test_encode(
                arr, encode_dtype=arr.dtype, decode_dtype=arr.dtype
            )

    def test_decode(self):
        # Floats go through the lossy comparison (10 decimals), integers
        # through the lossless one; other kinds are not exercised.
        for arr in test_arrays:
            kind = arr.dtype.kind
            dtypes = dict(encode_dtype=arr.dtype, decode_dtype=arr.dtype)
            if kind == 'f':
                self._test_decode_lossy(arr, decimal=10, **dtypes)
            elif kind in ('i', 'u'):
                self._test_decode_lossless(arr, **dtypes)

    def test_encode_output(self):
        # Encoding data held in the decode dtype must produce equal values
        # in the encode dtype.
        enc_dt, dec_dt = 'i4', 'i8'
        codec = self.init_codec(encode_dtype=enc_dt, decode_dtype=dec_dt)
        source = np.arange(10, 20, 1, dtype=dec_dt)
        wanted = source.astype(enc_dt)
        result = codec.encode(source)
        assert_array_equal(wanted, result)
        eq(np.dtype(enc_dt), result.dtype)

    def test_decode_input(self):
        # Decoding data held in the encode dtype must produce equal values
        # in the decode dtype.
        enc_dt, dec_dt = 'i4', 'i8'
        codec = self.init_codec(encode_dtype=enc_dt, decode_dtype=dec_dt)
        source = np.arange(10, 20, 1, dtype=enc_dt)
        wanted = source.astype(dec_dt)
        result = codec.decode(source)
        assert_array_equal(wanted, result)
        eq(np.dtype(dec_dt), result.dtype)

    def test_repr(self):
        # The repr shows the canonical dtype names, not the short codes.
        codec = self.init_codec(encode_dtype='i4', decode_dtype='i8')
        eq('AsType(encode_dtype=int32, decode_dtype=int64)', repr(codec))


class TestFixedScaleOffset(CodecTests, unittest.TestCase):

codec_id = 'fixedscaleoffset'
Expand Down
34 changes: 34 additions & 0 deletions zarr/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,6 +766,40 @@ def test_repr(self):
for l1, l2 in zip(expect.split('\n'), actual.split('\n')):
eq(l1, l2)

def test_astype_no_filters(self):
shape = (100,)
dtype = np.dtype(np.int8)
astype = np.dtype(np.float32)

store = dict()
init_array(store, shape=shape, chunks=10, dtype=dtype)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added this test to handle the case where self._filters is None.


data = np.arange(np.prod(shape), dtype=dtype).reshape(shape)

z1 = Array(store)
z1[...] = data
z2 = z1.astype(astype)

expected = data.astype(astype)
assert_array_equal(expected, z2)
eq(z2.read_only, True)

def test_astype(self):
shape = (100,)
chunks = (10,)

dtype = np.dtype(np.int8)
astype = np.dtype(np.float32)

data = np.arange(np.prod(shape), dtype=dtype).reshape(shape)

z1 = self.create_array(shape=shape, chunks=chunks, dtype=dtype)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added this case for when there are other filters.

z1[...] = data
z2 = z1.astype(astype)

expected = data.astype(astype)
assert_array_equal(expected, z2)


# custom store, does not support getsize()
class CustomMapping(object):
Expand Down
34 changes: 33 additions & 1 deletion zarr/tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from nose.tools import eq_ as eq


from zarr.codecs import Delta, FixedScaleOffset, \
from zarr.codecs import AsType, Delta, FixedScaleOffset, \
Quantize, PackBits, Categorize, \
Zlib, Blosc, BZ2
from zarr.creation import array
Expand Down Expand Up @@ -55,6 +55,38 @@ def test_array_with_delta_filter():
assert_array_equal(expect, actual)


def test_array_with_astype_filter():
    """Check round-trip and stored chunk contents with an AsType filter.

    For every entry in ``compressors``, an array is created with an
    int8-encoding / int64-decoding filter. The test checks that reading
    back gives the original data and that each stored chunk holds the
    int8-encoded slice.
    """

    # setup
    encode_dtype = 'i1'
    decode_dtype = 'i8'
    filters = [AsType(encode_dtype=encode_dtype, decode_dtype=decode_dtype)]
    # Chunk length and chunk count are both 10; they are named separately
    # so the two roles are not confused.
    chunk_len = 10
    n_chunks = 10
    data = np.arange(n_chunks * chunk_len, dtype=decode_dtype)
    encoded = data.astype(encode_dtype)

    for compressor in compressors:
        print(repr(compressor))

        a = array(
            data, chunks=chunk_len, compressor=compressor, filters=filters
        )

        # check round-trip
        assert data.dtype == a.dtype
        assert_array_equal(data, a[:])

        # check chunks: undo only the compressor (if any), then compare
        # the raw bytes with the matching slice of the encoded data
        for idx in range(n_chunks):
            raw = a.store[str(idx)]
            payload = compressor.decode(raw) if compressor else raw
            stored = np.frombuffer(payload, dtype=encode_dtype)
            start = idx * chunk_len
            assert_array_equal(encoded[start:start + chunk_len], stored)


def test_array_with_scaleoffset_filter():

# setup
Expand Down