-
-
Notifications
You must be signed in to change notification settings - Fork 330
AsType Filter #96
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AsType Filter #96
Changes from all commits
0a462ba
dfd28ef
4d2cf7e
da459e4
610daa4
328d04c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -485,6 +485,89 @@ def __repr__(self): | |
codec_registry[Delta.codec_id] = Delta | ||
|
||
|
||
class AsType(Codec): | ||
"""Filter to convert data between different types. | ||
|
||
Parameters | ||
---------- | ||
encode_dtype : dtype | ||
Data type to use for encoded data. | ||
decode_dtype : dtype, optional | ||
Data type to use for decoded data. | ||
|
||
Notes | ||
----- | ||
If `encode_dtype` is of lower precision than `decode_dtype`, please be | ||
aware that data loss can occur by writing data to disk using this filter. | ||
No checks are made to ensure the casting will work in that direction and | ||
data corruption will occur. | ||
|
||
Examples | ||
-------- | ||
>>> import zarr | ||
>>> import numpy as np | ||
>>> x = np.arange(100, 120, 2, dtype=np.int8) | ||
>>> x | ||
array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8) | ||
>>> f = zarr.AsType(encode_dtype=x.dtype, decode_dtype=np.int64) | ||
>>> y = f.decode(x) | ||
>>> y | ||
array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118]) | ||
>>> z = f.encode(y) | ||
>>> z | ||
array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8) | ||
|
||
""" # flake8: noqa | ||
|
||
codec_id = 'astype' | ||
|
||
def __init__(self, encode_dtype, decode_dtype): | ||
self.encode_dtype = np.dtype(encode_dtype) | ||
self.decode_dtype = np.dtype(decode_dtype) | ||
|
||
def encode(self, buf): | ||
|
||
# view input data as 1D array | ||
arr = _ndarray_from_buffer(buf, self.decode_dtype) | ||
|
||
# convert and copy | ||
enc = arr.astype(self.encode_dtype) | ||
|
||
return enc | ||
|
||
def decode(self, buf, out=None): | ||
|
||
# view encoded data as 1D array | ||
enc = _ndarray_from_buffer(buf, self.encode_dtype) | ||
|
||
# convert and copy | ||
dec = enc.astype(self.decode_dtype) | ||
|
||
# handle output | ||
out = _buffer_copy(dec, out) | ||
|
||
return out | ||
|
||
def get_config(self): | ||
config = dict() | ||
config['id'] = self.codec_id | ||
config['encode_dtype'] = self.encode_dtype.str | ||
config['decode_dtype'] = self.decode_dtype.str | ||
return config | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you did add a |
||
|
||
def __repr__(self): | ||
return ( | ||
'%s(encode_dtype=%s, decode_dtype=%s)' % ( | ||
type(self).__name__, | ||
self.encode_dtype, | ||
self.decode_dtype | ||
) | ||
) | ||
|
||
|
||
codec_registry[AsType.codec_id] = AsType | ||
|
||
|
||
class FixedScaleOffset(Codec): | ||
"""Simplified version of the scale-offset filter available in HDF5. | ||
Applies the transformation `(x - offset) * scale` to all chunks. Results | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,7 +15,7 @@ | |
from zarr.attrs import Attributes | ||
from zarr.errors import PermissionError, err_read_only, err_array_not_found | ||
from zarr.compat import reduce | ||
from zarr.codecs import get_codec | ||
from zarr.codecs import AsType, get_codec | ||
|
||
|
||
class Array(object): | ||
|
@@ -73,6 +73,7 @@ class Array(object): | |
resize | ||
append | ||
view | ||
astype | ||
|
||
""" # flake8: noqa | ||
|
||
|
@@ -1176,3 +1177,63 @@ def view(self, shape=None, chunks=None, dtype=None, | |
a._filters = filters | ||
|
||
return a | ||
|
||
def astype(self, dtype): | ||
"""Does on the fly type conversion of the underlying data. | ||
|
||
Parameters | ||
---------- | ||
dtype : string or dtype | ||
NumPy dtype. | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. So one could technically write to disk when using |
||
Notes | ||
----- | ||
This method returns a new Array object which is a view on the same | ||
underlying chunk data. Modifying any data via the view is currently | ||
not permitted and will result in an error. This is an experimental | ||
feature and its behavior is subject to change in the future. | ||
|
||
See Also | ||
-------- | ||
Array.view | ||
|
||
Examples | ||
-------- | ||
|
||
>>> import zarr | ||
>>> import numpy as np | ||
>>> data = np.arange(100, dtype=np.uint8) | ||
>>> a = zarr.array(data, chunks=10) | ||
>>> a[:] | ||
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, | ||
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, | ||
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, | ||
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, | ||
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, | ||
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, | ||
96, 97, 98, 99], dtype=uint8) | ||
>>> v = a.astype(np.float32) | ||
>>> v.is_view | ||
True | ||
>>> v[:] | ||
array([ 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., | ||
10., 11., 12., 13., 14., 15., 16., 17., 18., 19., | ||
20., 21., 22., 23., 24., 25., 26., 27., 28., 29., | ||
30., 31., 32., 33., 34., 35., 36., 37., 38., 39., | ||
40., 41., 42., 43., 44., 45., 46., 47., 48., 49., | ||
50., 51., 52., 53., 54., 55., 56., 57., 58., 59., | ||
60., 61., 62., 63., 64., 65., 66., 67., 68., 69., | ||
70., 71., 72., 73., 74., 75., 76., 77., 78., 79., | ||
80., 81., 82., 83., 84., 85., 86., 87., 88., 89., | ||
90., 91., 92., 93., 94., 95., 96., 97., 98., 99.], | ||
dtype=float32) | ||
""" # flake8: noqa | ||
|
||
dtype = np.dtype(dtype) | ||
|
||
filters = [] | ||
if self._filters: | ||
filters.extend(self._filters) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry had just realized There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Looks ok to me. |
||
filters.insert(0, AsType(encode_dtype=self._dtype, decode_dtype=dtype)) | ||
|
||
return self.view(filters=filters, dtype=dtype, read_only=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Now that I'm thinking about it. Honestly it seems pretty reasonable to me to just set There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should add this avoids some of the concerns raised about using There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry for the hiatus. I'm very happy to set read_only=True by default. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -766,6 +766,40 @@ def test_repr(self): | |
for l1, l2 in zip(expect.split('\n'), actual.split('\n')): | ||
eq(l1, l2) | ||
|
||
def test_astype_no_filters(self): | ||
shape = (100,) | ||
dtype = np.dtype(np.int8) | ||
astype = np.dtype(np.float32) | ||
|
||
store = dict() | ||
init_array(store, shape=shape, chunks=10, dtype=dtype) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added this test to handle the case where |
||
|
||
data = np.arange(np.prod(shape), dtype=dtype).reshape(shape) | ||
|
||
z1 = Array(store) | ||
z1[...] = data | ||
z2 = z1.astype(astype) | ||
|
||
expected = data.astype(astype) | ||
assert_array_equal(expected, z2) | ||
eq(z2.read_only, True) | ||
|
||
def test_astype(self): | ||
shape = (100,) | ||
chunks = (10,) | ||
|
||
dtype = np.dtype(np.int8) | ||
astype = np.dtype(np.float32) | ||
|
||
data = np.arange(np.prod(shape), dtype=dtype).reshape(shape) | ||
|
||
z1 = self.create_array(shape=shape, chunks=chunks, dtype=dtype) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added this case for when there are other filters. |
||
z1[...] = data | ||
z2 = z1.astype(astype) | ||
|
||
expected = data.astype(astype) | ||
assert_array_equal(expected, z2) | ||
|
||
|
||
# custom store, does not support getsize() | ||
class CustomMapping(object): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For this example, may I suggest using 'i4' instead of 'i8'? It's just that 'i8' and 'int8' look similar, so there is potential for confusion for users less familiar with NumPy.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Alternatively, we could just use `np.int8` instead of `i1` and such to avoid any confusion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Went ahead and switched to `np.int8` and the like. Please let me know if this is acceptable.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, that's great.