Skip to content

ENH: add Pickle/MsgPack codec with support for object ndarrays #5

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 17, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ __pycache__/
# C extensions
*.so

# editor
*~

# Distribution / packaging
.Python
env/
Expand Down Expand Up @@ -44,6 +47,7 @@ nosetests.xml
coverage.xml
*,cover
.hypothesis/
cover/

# Translations
*.mo
Expand Down
8 changes: 8 additions & 0 deletions numcodecs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,14 @@
from numcodecs.categorize import Categorize
register_codec(Categorize)

# The Pickle codec is registered unconditionally.
from numcodecs.pickles import Pickle
register_codec(Pickle)

# numcodecs.msgpacks imports the msgpack package at module level, so this
# import raises ImportError when msgpack is not installed; in that case the
# MsgPack codec is simply left unregistered.
try:
from numcodecs.msgpacks import MsgPack
register_codec(MsgPack)
except ImportError: # pragma: no cover
pass

from numcodecs.checksum32 import CRC32, Adler32
register_codec(CRC32)
Expand Down
52 changes: 52 additions & 0 deletions numcodecs/msgpacks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


import numpy as np


from numcodecs.abc import Codec
from numcodecs.compat import ndarray_from_buffer, buffer_copy
import msgpack


class MsgPack(Codec):
    """Codec to encode data as msgpacked bytes. Useful for encoding arrays
    of Python strings.

    Raises
    ------
    ValueError
        If `encode` is given an ndarray whose dtype is not object.

    Notes
    -----
    Data are packed with ``use_bin_type=True`` and unpacked with
    ``raw=False``, so text strings round-trip as text. The older
    ``encoding='utf-8'`` keyword is not used because it was deprecated and
    later removed from msgpack.

    Examples
    --------
    >>> import numcodecs as codecs
    >>> import numpy as np
    >>> x = np.array(['foo', 'bar', 'baz'], dtype='object')
    >>> f = codecs.MsgPack()
    >>> f.decode(f.encode(x))
    array(['foo', 'bar', 'baz'], dtype=object)

    """  # flake8: noqa

    codec_id = 'msgpack'

    def encode(self, buf):
        """Pack the items of `buf` (via ``buf.tolist()``) into msgpack bytes.

        Raises ValueError if `buf` has a ``dtype`` attribute that is not
        object.
        """
        if hasattr(buf, 'dtype') and buf.dtype != 'object':
            raise ValueError("cannot encode non-object ndarrays, %s "
                             "dtype was passed" % buf.dtype)
        # use_bin_type=True keeps text and binary items distinguishable in
        # the packed stream; passing encoding= here raises TypeError on
        # msgpack >= 1.0.
        return msgpack.packb(buf.tolist(), use_bin_type=True)

    def decode(self, buf, out=None):
        """Unpack msgpack bytes into an object ndarray.

        If `out` is given, the decoded values are copied into it and `out`
        is returned; otherwise a new object array is returned.
        """
        # raw=False decodes msgpack strings to text, matching what the
        # removed encoding='utf-8' option used to do.
        dec = np.array(msgpack.unpackb(buf, raw=False), dtype='object')
        if out is not None:
            np.copyto(out, dec)
            return out
        else:
            return dec

    def get_config(self):
        """Return the codec configuration as a dict (only the codec id)."""
        return dict(id=self.codec_id)

    def __repr__(self):
        return 'MsgPack()'
64 changes: 64 additions & 0 deletions numcodecs/pickles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


import numpy as np


from numcodecs.abc import Codec
from numcodecs.compat import ndarray_from_buffer, buffer_copy
try:
import cPickle as pickle
except ImportError:
import pickle


class Pickle(Codec):
    """Codec that serializes data to pickled bytes, e.g. for arrays of
    Python strings.

    Parameters
    ----------
    protocol : int, defaults to pickle.HIGHEST_PROTOCOL
        The protocol used to pickle data.

    Raises
    ------
    ValueError
        If `encode` is given an ndarray whose dtype is not object.

    Examples
    --------
    >>> import numcodecs as codecs
    >>> import numpy as np
    >>> x = np.array(['foo', 'bar', 'baz'], dtype='object')
    >>> f = codecs.Pickle()
    >>> f.decode(f.encode(x))
    array(['foo', 'bar', 'baz'], dtype=object)

    """  # flake8: noqa

    codec_id = 'pickle'

    def __init__(self, protocol=pickle.HIGHEST_PROTOCOL):
        self.protocol = protocol

    def encode(self, buf):
        """Pickle `buf` with the configured protocol and return the bytes."""
        # Only inputs exposing a dtype are validated; anything else is
        # handed to pickle unchanged.
        if hasattr(buf, 'dtype'):
            dtype = buf.dtype
            if dtype != 'object':
                raise ValueError("cannot encode non-object ndarrays, %s "
                                 "dtype was passed" % dtype)
        return pickle.dumps(buf, protocol=self.protocol)

    def decode(self, buf, out=None):
        """Unpickle `buf`; copy the result into `out` when one is given."""
        # NOTE(review): pickle.loads can execute arbitrary code, so this
        # codec should only be used on data from a trusted source.
        unpickled = pickle.loads(buf)
        if out is None:
            return unpickled
        np.copyto(out, unpickled)
        return out

    def get_config(self):
        """Return the codec configuration (codec id and pickle protocol)."""
        return {'id': self.codec_id, 'protocol': self.protocol}

    def __repr__(self):
        return 'Pickle(protocol=%s)' % self.protocol
33 changes: 32 additions & 1 deletion numcodecs/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@


import numpy as np
from nose.tools import eq_ as eq
from nose.tools import eq_ as eq, assert_true
from numpy.testing import assert_array_almost_equal


Expand Down Expand Up @@ -91,6 +91,37 @@ def compare(res):
compare(out)


def check_encode_decode_objects(arr, codec):
    """Round-trip the object array `arr` through `codec` and check the result.

    This is a more specific test than check_encode_decode, as these codecs
    require actual objects (and not bytes only). Decoding is exercised both
    with and without a caller-supplied `out` array.
    """

    def compare(res, arr=arr):
        # the decoded result must be an object ndarray of the same shape
        assert_true(isinstance(res, np.ndarray))
        assert_true(res.shape == arr.shape)
        assert_true(res.dtype == 'object')

        # numpy asserts don't compare object arrays properly, so walk the
        # flattened values pairwise: a value that is unequal to itself (a
        # nan) must be matched by another such value, anything else must
        # compare equal
        expected = arr.ravel().tolist()
        actual = res.ravel().tolist()
        for want, got in zip(expected, actual):
            matches = (got != got) if want != want else (want == got)
            assert_true(matches)

    encoded = codec.encode(arr)

    # decode into a newly created result
    compare(codec.decode(encoded))

    # decode into a pre-allocated output array
    target = np.empty_like(arr)
    codec.decode(encoded, out=target)
    compare(target)


def check_config(codec):
config = codec.get_config()
# round-trip through JSON to check serialization
Expand Down
52 changes: 52 additions & 0 deletions numcodecs/tests/test_msgpacks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


import nose
import numpy as np
from numpy.testing import assert_raises

try:
from numcodecs.msgpacks import MsgPack
except ImportError:
raise nose.SkipTest("no msgpack installed")

from numcodecs.tests.common import (check_config, check_repr,
check_encode_decode_objects)


# Object-dtype inputs used for the round-trip test:
#   - strings only
#   - strings mixed with nans (two-dimensional)
#   - strings mixed with floats and ints
arrays = [
    np.tile(np.array(['foo', 'bar', 'baz'], dtype=object), 300),
    np.tile(np.array([['foo', 'bar', np.nan]], dtype=object), (300, 1)),
    np.tile(np.array(['foo', 1.0, 2], dtype=object), 300),
]

# Non-object ndarrays, used to check that encoding raises ValueError
arrays_incompat = [
    np.arange(1000, dtype=np.int32),
    np.tile(np.array(['foo', 'bar', 'baz']), 300),
]


def test_encode_errors():
    """Encoding any of the non-object arrays must raise ValueError."""
    for incompatible in arrays_incompat:
        # a fresh codec is built for every array
        assert_raises(ValueError, MsgPack().encode, incompatible)


def test_encode_decode():
    """Run the object round-trip check on every array in `arrays`."""
    for fixture in arrays:
        check_encode_decode_objects(fixture, MsgPack())


def test_config():
    """Run the shared configuration check on a MsgPack codec."""
    check_config(MsgPack())


def test_repr():
    """Run the shared repr check on the MsgPack codec's repr string."""
    expected = "MsgPack()"
    check_repr(expected)
48 changes: 48 additions & 0 deletions numcodecs/tests/test_pickle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, print_function, division


import numpy as np
from numpy.testing import assert_raises


from numcodecs.pickles import Pickle
from numcodecs.tests.common import (check_config, check_repr,
check_encode_decode_objects)


# Object arrays the codec should round-trip:
#   1. strings only
#   2. a mix of strings and nans (two-dimensional)
#   3. a mix of string, float and int
arrays = [
    np.tile(np.array(['foo', 'bar', 'baz'], dtype=object), 300),
    np.tile(np.array([['foo', 'bar', np.nan]], dtype=object), (300, 1)),
    np.tile(np.array(['foo', 1.0, 2], dtype=object), 300),
]

# ndarrays with a non-object dtype, for which encoding should fail
arrays_incompat = [
    np.arange(1000, dtype=np.int32),
    np.tile(np.array(['foo', 'bar', 'baz']), 300),
]


def test_encode_errors():
    """Encoding any of the non-object arrays must raise ValueError."""
    for incompatible in arrays_incompat:
        # a fresh codec is built for every array
        assert_raises(ValueError, Pickle().encode, incompatible)


def test_encode_decode():
    """Run the object round-trip check on every array in `arrays`."""
    for fixture in arrays:
        check_encode_decode_objects(fixture, Pickle())


def test_config():
    """Run the shared configuration check on a Pickle codec (protocol -1)."""
    check_config(Pickle(protocol=-1))


def test_repr():
    """Run the shared repr check on a Pickle codec's repr string."""
    expected = "Pickle(protocol=-1)"
    check_repr(expected)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
numpy
msgpack-python