|
| 1 | +from __future__ import absolute_import, division, print_function |
| 2 | + |
| 3 | +import tensorflow as tf |
| 4 | + |
| 5 | + |
class Hashing(tf.keras.layers.Layer):
    """Map categorical feature values onto a fixed number of hash buckets.

    The layer turns a sequence of ints or strings into a sequence of ints:
    `output_id = Hash(input_feature_string) % num_bins` for string input.
    Integer input is first rendered as a string and then goes through the
    same formula.

    TensorFlow 2.2 has developed `tf.keras.layers.preprocessing.Hashing`
    but has not released it yet, so this class is a simple, temporary
    stand-in for it:
    https://github.com/tensorflow/tensorflow/blob/r2.2/tensorflow/python/keras/layers/preprocessing/hashing.py

    Note that the TensorFlow version used with this layer must be greater
    than 2.0.0.

    Example:
    ```python
    layer = Hashing(num_bins=3)
    inp = np.asarray([['A'], ['B'], ['C'], ['D'], ['E']])
    layer(inp)
    ```
    The output will be `[[1], [0], [1], [1], [2]]`

    Arguments:
        num_bins: Number of hash bins.
        **kwargs: Keyword arguments to construct a layer.

    Input: A string, int32 or int64 `tf.Tensor`,
        `tf.SparseTensor` or `tf.RaggedTensor`

    Output: An int64 tensor with the same shape as input.

    Raises:
        ValueError: If `num_bins` is `None` or not positive.
    """

    def __init__(self, num_bins, **kwargs):
        # Validate before touching the base class so that an invalid
        # layer is never partially constructed.
        invalid_bin_count = num_bins is None or num_bins <= 0
        if invalid_bin_count:
            raise ValueError(
                "`num_bins` cannot be `None` or non-positive values."
            )
        super(Hashing, self).__init__(**kwargs)
        self.num_bins = num_bins
        # Private Keras attribute; presumably opts this layer into
        # accepting `tf.RaggedTensor` inputs -- confirm against the
        # TensorFlow version in use.
        self._supports_ragged_inputs = True

    def call(self, inputs):
        """Hash every element of `inputs` into `[0, num_bins)`.

        Integer values are converted to strings before hashing. Sparse
        and ragged inputs keep their structure; only their values are
        replaced by bucket ids.
        """
        needs_string_cast = inputs.dtype.is_integer

        # Sparse input: only the `values` component is cast and hashed;
        # indices and dense shape are carried over unchanged.
        if isinstance(inputs, tf.SparseTensor):
            raw_values = inputs.values
            if needs_string_cast:
                raw_values = tf.as_string(raw_values)
            bucket_ids = tf.strings.to_hash_bucket_fast(
                raw_values, self.num_bins, name="hash"
            )
            return tf.SparseTensor(
                indices=inputs.indices,
                values=bucket_ids,
                dense_shape=inputs.dense_shape,
            )

        # Dense and ragged inputs share the same integer -> string cast.
        if needs_string_cast:
            inputs = tf.as_string(inputs)

        # Ragged input: hash the flat values and keep the row partitions.
        if isinstance(inputs, tf.RaggedTensor):
            return tf.ragged.map_flat_values(
                tf.strings.to_hash_bucket_fast,
                inputs,
                num_buckets=self.num_bins,
                name="hash",
            )

        # Plain dense tensor.
        return tf.strings.to_hash_bucket_fast(
            inputs, self.num_bins, name="hash"
        )

    def compute_output_shape(self, input_shape):
        """Return `input_shape` unchanged; hashing is element-wise."""
        return input_shape

    def get_config(self):
        """Return the base layer config extended with `num_bins`."""
        merged = dict(super(Hashing, self).get_config())
        merged["num_bins"] = self.num_bins
        return merged
0 commit comments