Skip to content

Add the ToNumber preprocess layer. #1845

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
55 changes: 55 additions & 0 deletions elasticdl_preprocessing/layers/to_number.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import tensorflow as tf

# Dtypes accepted as `out_type` by ToNumber; its constructor raises
# ValueError for any dtype that is not listed here.
_NUMBER_DTYPES = [
    tf.int8,
    tf.uint8,
    tf.int16,
    tf.uint16,
    tf.int32,
    tf.uint32,
    tf.int64,
    tf.uint64,
    tf.float16,
    tf.float32,
    tf.float64,
    tf.bfloat16,
    # NOTE(review): TensorFlow documents tf.double as an alias of tf.float64,
    # so this entry looks redundant -- confirm before removing.
    tf.double,
]


class ToNumber(tf.keras.layers.Layer):
    """Convert the inputs to a number dtype (int, float, double).

    String inputs are parsed with `tf.strings.to_number` after every empty
    string has been replaced by `str(default_value)`. Inputs of any other
    dtype are converted with `tf.cast`.

    Args:
        out_type: Target numeric dtype. Must be one of `_NUMBER_DTYPES`;
            anything `tf.dtypes.as_dtype` understands (a `tf.DType` or a
            dtype name such as "int32") is accepted.
        default_value: Value substituted for empty strings in string inputs.
        **kwargs: Standard `tf.keras.layers.Layer` keyword arguments
            (for example `name`).

    Raises:
        ValueError: If `out_type` is not a supported number dtype.

    Input Shape: Tensor or SparseTensor of any shape
    Output Shape: Tensor or SparseTensor of the same shape as the input
    """

    def __init__(self, out_type, default_value, **kwargs):
        super(ToNumber, self).__init__(**kwargs)
        try:
            # Normalizing lets `from_config` rebuild the layer from the dtype
            # name that `get_config` stores.
            normalized_type = tf.dtypes.as_dtype(out_type)
        except TypeError:
            raise ValueError("{} is not a number type.".format(out_type))
        if normalized_type not in _NUMBER_DTYPES:
            raise ValueError("{} is not a number type.".format(out_type))
        self.out_type = normalized_type
        self.default_value = default_value

    def call(self, inputs):
        """Convert a dense or sparse tensor to `out_type`.

        For a SparseTensor only the values are converted; indices and
        dense_shape are passed through unchanged.
        """
        if isinstance(inputs, tf.SparseTensor):
            number_value = self._cast_dense_to_number(inputs.values)
            return tf.SparseTensor(
                indices=inputs.indices,
                values=number_value,
                dense_shape=inputs.dense_shape,
            )
        return self._cast_dense_to_number(inputs)

    def get_config(self):
        """Return the constructor arguments so Keras can serialize the layer."""
        config = super(ToNumber, self).get_config()
        config.update(
            {
                # Store the dtype by name so the config stays serializable.
                "out_type": self.out_type.name,
                "default_value": self.default_value,
            }
        )
        return config

    def _cast_dense_to_number(self, dense_inputs):
        """Convert a dense tensor to `out_type`, parsing it if it is a string."""
        # Compare dtypes by equality; identity is an implementation detail.
        if dense_inputs.dtype == tf.string:
            default_value = str(self.default_value)
            # Empty strings cannot be parsed, so swap in the default first.
            outputs = tf.where(
                tf.equal(dense_inputs, ""), x=default_value, y=dense_inputs
            )
            # NOTE(review): tf.strings.to_number documents a narrower set of
            # `out_type` values than `_NUMBER_DTYPES` -- confirm that string
            # inputs work for the small integer and half-precision dtypes.
            outputs = tf.strings.to_number(outputs, out_type=self.out_type)
        else:
            outputs = tf.cast(dense_inputs, self.out_type)

        return outputs
Empty file.
17 changes: 17 additions & 0 deletions elasticdl_preprocessing/tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import numpy as np


def sparse_tensor_equal(sp_a, sp_b):
    """Return True when two sparse tensors match exactly.

    The dense shapes, the indices, the dtype of the values and the values
    themselves are compared in that order; the first mismatch yields False.
    """
    # Structural fields first: they are compared as plain numpy arrays.
    for field in ("dense_shape", "indices"):
        left = getattr(sp_a, field).numpy()
        right = getattr(sp_b, field).numpy()
        if not np.array_equal(left, right):
            return False

    values_a = sp_a.values
    values_b = sp_b.values
    # Equal numbers stored with different dtypes do not count as equal.
    if values_a.dtype != values_b.dtype:
        return False

    return bool(np.array_equal(values_a.numpy(), values_b.numpy()))
63 changes: 63 additions & 0 deletions elasticdl_preprocessing/tests/to_number_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import unittest

import numpy as np
import tensorflow as tf

from elasticdl_preprocessing.layers.to_number import ToNumber
from elasticdl_preprocessing.tests.test_utils import sparse_tensor_equal


class ToNumberTest(unittest.TestCase):
    """Check ToNumber on dense and sparse string inputs."""

    # Layout shared by every sparse fixture in this test case.
    _INDICES = [[0, 2], [2, 1], [2, 3], [5, 4]]
    _DENSE_SHAPE = [6, 5]

    def _sparse(self, values, dtype):
        """Build a sparse tensor holding `values` at the shared indices."""
        return tf.SparseTensor(
            indices=self._INDICES,
            values=tf.constant(values, dtype),
            dense_shape=self._DENSE_SHAPE,
        )

    def _assert_dense(self, to_number, raw, expected, dtype):
        """Run the layer on dense strings and compare dtype and values."""
        strings = tf.constant(raw, tf.string)
        result = to_number.call(strings)
        wanted = tf.constant(expected, dtype)
        self.assertEqual(result.dtype, dtype)
        self.assertTrue(np.array_equal(result.numpy(), wanted.numpy()))

    def _assert_sparse(self, to_number, raw, expected, dtype):
        """Run the layer on sparse strings and compare the whole tensor."""
        strings = self._sparse(raw, tf.string)
        result = to_number.call(strings)
        wanted = self._sparse(expected, dtype)
        self.assertTrue(sparse_tensor_equal(result, wanted))

    def test_call_dense(self):
        # Integer target: the empty string becomes the default -1.
        self._assert_dense(
            ToNumber(out_type=tf.int32, default_value=-1),
            [["123", ""], ["456", "-789"]],
            [[123, -1], [456, -789]],
            tf.int32,
        )

        # Float target: the empty string becomes the default 0.0.
        self._assert_dense(
            ToNumber(out_type=tf.float32, default_value=0.0),
            [["123.1", ""], ["456", "-789.987"]],
            [[123.1, 0.0], [456.0, -789.987]],
            tf.float32,
        )

    def test_call_sparse(self):
        # Integer target: only the values change, the layout is preserved.
        self._assert_sparse(
            ToNumber(out_type=tf.int32, default_value=-1),
            ["123", "", "456", "-789"],
            [123, -1, 456, -789],
            tf.int32,
        )

        # Float target over the same sparse layout.
        self._assert_sparse(
            ToNumber(out_type=tf.float32, default_value=0.0),
            ["123.1", "", "456", "-789.987"],
            [123.1, 0.0, 456.0, -789.987],
            tf.float32,
        )


# Run the test cases in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()