# Package markers added alongside this module (both are empty files):
#   elasticdl_preprocessing/layers/__init__.py
#   elasticdl_preprocessing/tests/__init__.py
#
# Module: elasticdl_preprocessing/layers/to_number.py
import tensorflow as tf

# Numeric dtypes accepted as the `out_type` of `ToNumber`.
_NUMBER_DTYPES = [
    tf.int8,
    tf.uint8,
    tf.int16,
    tf.uint16,
    tf.int32,
    tf.uint32,
    tf.int64,
    tf.uint64,
    tf.float16,
    tf.float32,
    tf.float64,
    tf.bfloat16,
    tf.double,  # alias of tf.float64, kept for readability
]

# The only output dtypes `tf.strings.to_number` can produce directly. Any
# other numeric `out_type` is reached by parsing to one of these and casting.
_TO_NUMBER_DTYPES = (tf.float32, tf.float64, tf.int32, tf.int64)


class ToNumber(tf.keras.layers.Layer):
    """Convert the inputs to a number dtype (int, float, double).

    String inputs are parsed as numbers, with empty strings replaced by
    `default_value` before parsing. Non-string inputs are simply cast to
    `out_type`; `default_value` is not used for them.

    Input Shape: Tensor or SparseTensor of any shape
    Output Shape: Tensor or SparseTensor of the same shape as the input

    Args:
        out_type: The numeric dtype of the output. Must be one of the
            dtypes listed in `_NUMBER_DTYPES`.
        default_value: The value substituted for empty strings. It is
            converted with `str()` and then parsed like any other element,
            so it must be parseable as a number of the target type.
        **kwargs: Extra keyword arguments (such as `name`) forwarded to
            `tf.keras.layers.Layer`.

    Raises:
        ValueError: If `out_type` is not a number dtype.
    """

    def __init__(self, out_type, default_value, **kwargs):
        super(ToNumber, self).__init__(**kwargs)
        if out_type not in _NUMBER_DTYPES:
            raise ValueError("{} is not a number type.".format(out_type))
        # Normalise to a tf.DType so later code can rely on DType properties
        # even when the caller passed an equivalent spelling of the dtype.
        self.out_type = tf.as_dtype(out_type)
        self.default_value = default_value

    def call(self, inputs):
        """Convert `inputs` to `out_type`, preserving sparsity.

        Args:
            inputs: A Tensor or SparseTensor.

        Returns:
            A Tensor of dtype `out_type`, or, for a SparseTensor input, a
            SparseTensor with the same indices and dense shape whose values
            have been converted.
        """
        if not isinstance(inputs, tf.SparseTensor):
            return self._cast_dense_to_number(inputs)

        # Only the stored values need converting; the sparsity pattern
        # (indices and dense shape) is carried over unchanged.
        return tf.SparseTensor(
            indices=inputs.indices,
            values=self._cast_dense_to_number(inputs.values),
            dense_shape=inputs.dense_shape,
        )

    def _cast_dense_to_number(self, dense_inputs):
        """Convert a dense tensor to `out_type`.

        Args:
            dense_inputs: A dense Tensor of string or non-string dtype.

        Returns:
            A dense Tensor of dtype `out_type` with the same shape.
        """
        if dense_inputs.dtype != tf.string:
            return tf.cast(dense_inputs, self.out_type)

        # Empty strings cannot be parsed, so swap in the default first.
        filled = tf.where(
            tf.equal(dense_inputs, ""),
            x=str(self.default_value),
            y=dense_inputs,
        )

        if self.out_type in _TO_NUMBER_DTYPES:
            return tf.strings.to_number(filled, out_type=self.out_type)

        # tf.strings.to_number cannot emit this dtype directly: parse with
        # the widest supported integer type (or float32 for the narrow float
        # types) and cast afterwards. The cast does not range-check, and
        # values outside the int64 range cannot be parsed for uint64.
        parse_type = tf.int64 if self.out_type.is_integer else tf.float32
        parsed = tf.strings.to_number(filled, out_type=parse_type)
        return tf.cast(parsed, self.out_type)
# Module: elasticdl_preprocessing/tests/test_utils.py
import numpy as np


def sparse_tensor_equal(sp_a, sp_b):
    """Return whether two sparse tensors are exactly equal.

    The tensors are equal when their dense shapes, indices, value dtypes
    and values all match. Both arguments must expose `dense_shape`,
    `indices` and `values` attributes that provide `.numpy()`, so this
    helper is meant for eager tensors.

    Args:
        sp_a: The first sparse tensor.
        sp_b: The second sparse tensor.

    Returns:
        True if every component matches, otherwise False.
    """
    if not np.array_equal(sp_a.dense_shape.numpy(), sp_b.dense_shape.numpy()):
        return False

    if not np.array_equal(sp_a.indices.numpy(), sp_b.indices.numpy()):
        return False

    # Compare dtypes explicitly: np.array_equal alone would treat e.g.
    # int32 and int64 values as equal.
    if sp_a.values.dtype != sp_b.values.dtype:
        return False

    return bool(np.array_equal(sp_a.values.numpy(), sp_b.values.numpy()))


# Module: elasticdl_preprocessing/tests/to_number_test.py
import unittest

import numpy as np
import tensorflow as tf

from elasticdl_preprocessing.layers.to_number import ToNumber
from elasticdl_preprocessing.tests.test_utils import sparse_tensor_equal


class ToNumberTest(unittest.TestCase):
    """Tests for converting string tensors with the ToNumber layer."""

    def test_call_dense(self):
        """Dense string inputs are parsed and empty strings defaulted."""
        # Each case: (out_type, default_value, raw strings, expected numbers)
        cases = [
            (
                tf.int32,
                -1,
                [["123", ""], ["456", "-789"]],
                [[123, -1], [456, -789]],
            ),
            (
                tf.float32,
                0.0,
                [["123.1", ""], ["456", "-789.987"]],
                [[123.1, 0.0], [456.0, -789.987]],
            ),
        ]
        for out_type, default_value, raw, expected in cases:
            with self.subTest(out_type=out_type):
                layer = ToNumber(
                    out_type=out_type, default_value=default_value
                )
                inputs = tf.constant(raw, tf.string)
                output = layer.call(inputs)
                expected_output = tf.constant(expected, out_type)
                self.assertEqual(output.dtype, out_type)
                # Reports the mismatching elements on failure.
                np.testing.assert_array_equal(
                    output.numpy(), expected_output.numpy()
                )

    def test_call_sparse(self):
        """Sparse string inputs keep their indices and dense shape."""
        indices = [[0, 2], [2, 1], [2, 3], [5, 4]]
        dense_shape = [6, 5]
        # Each case: (out_type, default_value, raw strings, expected numbers)
        cases = [
            (
                tf.int32,
                -1,
                ["123", "", "456", "-789"],
                [123, -1, 456, -789],
            ),
            (
                tf.float32,
                0.0,
                ["123.1", "", "456", "-789.987"],
                [123.1, 0.0, 456.0, -789.987],
            ),
        ]
        for out_type, default_value, raw, expected in cases:
            with self.subTest(out_type=out_type):
                layer = ToNumber(
                    out_type=out_type, default_value=default_value
                )
                inputs = tf.SparseTensor(
                    indices=indices,
                    values=tf.constant(raw, tf.string),
                    dense_shape=dense_shape,
                )
                output = layer.call(inputs)
                expected_output = tf.SparseTensor(
                    indices=indices,
                    values=tf.constant(expected, out_type),
                    dense_shape=dense_shape,
                )
                self.assertTrue(sparse_tensor_equal(output, expected_output))


if __name__ == "__main__":
    unittest.main()