From 28795f14ba734d0d97cf292e601d7ab00039f401 Mon Sep 17 00:00:00 2001 From: andhus Date: Sun, 24 Sep 2017 23:26:52 +0200 Subject: [PATCH 01/13] Added support for passing external constants to RNN, which will pass them on to the cell --- keras/layers/recurrent.py | 164 +++++++++++++++++++++------ tests/keras/layers/recurrent_test.py | 72 ++++++++++++ 2 files changed, 203 insertions(+), 33 deletions(-) diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py index 4fd4edc50f80..bfcaa0c27114 100644 --- a/keras/layers/recurrent.py +++ b/keras/layers/recurrent.py @@ -185,7 +185,9 @@ class RNN(Layer): # Arguments cell: A RNN cell instance. A RNN cell is a class that has: - a `call(input_at_t, states_at_t)` method, returning - `(output_at_t, states_at_t_plus_1)`. + `(output_at_t, states_at_t_plus_1)`. The call method of the + cell can also take the optional argument `constants`, see + section "Note on passing external constants" below. - a `state_size` attribute. This can be a single integer (single state) in which case it is the size of the recurrent state @@ -276,6 +278,14 @@ class RNN(Layer): `states` should be a numpy array or list of numpy arrays representing the initial state of the RNN layer. + # Note on passing external constants to RNNs + You can pass "external" constants to the cell using the `constants` + keyword argument of RNN.__call__ (as well as RNN.call) method. This + requires that the `cell.call` method accepts the same keyword argument + `constants`. Such constants can be used to condition the cell + transformation on additional static inputs (not changing over time) + (a.k.a. an attention mechanism). + # Examples ```python @@ -354,6 +364,8 @@ def __init__(self, cell, self.state_spec = InputSpec(shape=(None, self.cell.state_size)) self._states = None + self.external_constants_spec = None + @property def states(self): if self._states is None: @@ -399,6 +411,14 @@ def compute_mask(self, inputs, mask): return output_mask def build(self, input_shape): + # Note input_shape will be list of shapes of initial states and + # constants if these are passed in __call__. + if self.external_constants_spec is not None: + # input_shape must be list + constants_shape = input_shape[-len(self.external_constants_spec):] + else: + constants_shape = None + if isinstance(input_shape, list): input_shape = input_shape[0] @@ -411,7 +431,10 @@ def build(self, input_shape): if isinstance(self.cell, Layer): step_input_shape = (input_shape[0],) + input_shape[2:] - self.cell.build(step_input_shape) + if constants_shape is not None: + self.cell.build([step_input_shape] + constants_shape) + else: + self.cell.build(step_input_shape) def get_initial_state(self, inputs): # build an all-zero tensor of shape (samples, output_dim) @@ -424,43 +447,58 @@ def get_initial_state(self, inputs): else: return [K.tile(initial_state, [1, self.cell.state_size])] - def __call__(self, inputs, initial_state=None, **kwargs): - # If there are multiple inputs, then - # they should be the main input and `initial_state` - # e.g. when loading model from file - if isinstance(inputs, (list, tuple)) and len(inputs) > 1 and initial_state is None: - initial_state = inputs[1:] - inputs = inputs[0] - - # If `initial_state` is specified, - # and if it a Keras tensor, - # then add it to the inputs and temporarily - # modify the input spec to include the state. 
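To make the contract described in the docstring addition above concrete: `RNN.__call__` accepts a `constants` keyword and forwards those tensors to `cell.call`, which must accept the same keyword; the constants are not sliced over time. A minimal usage sketch against the patched API (the cell class `MyCellWithConstants` is illustrative only and not part of this patch):

```python
from keras.layers import Input, recurrent
from keras.models import Model

# A cell that supports constants implements
#     def call(self, inputs, states, constants): ...
# and, when constants are passed, its build() receives
# [step_input_shape] + constants_shapes.
cell = MyCellWithConstants(units=8)       # hypothetical cell defined elsewhere

x = Input((None, 5))                      # (time, features) sequence input
c = Input((3,))                           # static input: note, no time dimension
y = recurrent.RNN(cell)(x, constants=c)   # forwarded to cell.call(..., constants=[c])

# The constant behaves like any other model input; it is simply not unrolled over time.
model = Model([x, c], y)
```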
- if initial_state is None: + def __call__(self, inputs, initial_state=None, constants=None, **kwargs): + # If there are multiple inputs, then they should be the main input, + # `initial_state` and (optionally) `constants` e.g. when loading model + # from file # TODO ask for clarification + inputs, initial_state, constants = self._normalize_args( + inputs, initial_state, constants) + + # we need to know length of constants in build + if constants: + self.external_constants_spec = [ + InputSpec(shape=K.int_shape(constant)) + for constant in constants + ] + + if initial_state is None and constants is None: return super(RNN, self).__call__(inputs, **kwargs) - if not isinstance(initial_state, (list, tuple)): - initial_state = [initial_state] - - is_keras_tensor = hasattr(initial_state[0], '_keras_history') - for tensor in initial_state: + # If any of `initial_state` or `constants` are specified and are Keras + # tensors, then add them to the inputs and temporarily modify the + # input_spec to include them. + + check_list = [] + if initial_state: + check_list += initial_state + if constants: + check_list += constants + # at this point check_list cannot be empty + is_keras_tensor = hasattr(check_list[0], '_keras_history') + for tensor in check_list: if hasattr(tensor, '_keras_history') != is_keras_tensor: - raise ValueError('The initial state of an RNN layer cannot be' - ' specified with a mix of Keras tensors and' - ' non-Keras tensors') + raise ValueError('The initial state and constants of an RNN' + ' layer cannot be specified with a mix of' + ' Keras tensors and non-Keras tensors') if is_keras_tensor: - # Compute the full input spec, including state + # Compute the full input spec, including state and constants input_spec = self.input_spec state_spec = self.state_spec if not isinstance(input_spec, list): input_spec = [input_spec] if not isinstance(state_spec, list): state_spec = [state_spec] - self.input_spec = input_spec + state_spec - - # Compute the full inputs, including state - inputs = [inputs] + list(initial_state) + self.input_spec = input_spec + inputs = [inputs] + if initial_state: + self.input_spec += state_spec + inputs += initial_state + kwargs['initial_state'] = initial_state + if constants: + self.input_spec += self.external_constants_spec + inputs += constants + kwargs['constants'] = constants # Perform the call output = super(RNN, self).__call__(inputs, **kwargs) @@ -470,16 +508,22 @@ def __call__(self, inputs, initial_state=None, **kwargs): return output else: kwargs['initial_state'] = initial_state + if constants is not None: + kwargs['constants'] = constants return super(RNN, self).__call__(inputs, **kwargs) - def call(self, inputs, mask=None, training=None, initial_state=None): + def call(self, + inputs, + mask=None, + training=None, + initial_state=None, + constants=None): # input shape: `(samples, time (padded with zeros), input_dim)` # note that the .build() method of subclasses MUST define # self.input_spec and self.state_spec with complete input shapes. 
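The rewritten `__call__` above also normalizes the case where everything arrives packed into a single list (as happens when a model is reloaded from file). A sketch of the two equivalent calling conventions, assuming `layer` wraps a single-state cell and `x`, `h0` and `c` are already-defined Keras tensors:

```python
# Explicit keyword form:
y1 = layer(x, initial_state=h0, constants=c)

# Packed-list form: the first entry is the sequence input, the next
# len(state_spec) entries are initial states, and any remaining tensors
# are treated as constants.
y2 = layer([x, h0, c])
```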
if isinstance(inputs, list): - initial_state = inputs[1:] inputs = inputs[0] - elif initial_state is not None: + if initial_state is not None: pass elif self.stateful: initial_state = self.states @@ -508,9 +552,17 @@ def call(self, inputs, mask=None, training=None, initial_state=None): '- If using the functional API, specify ' 'the time dimension by passing a `shape` ' 'or `batch_shape` argument to your Input layer.') - + cell_kwargs = {} if has_arg(self.cell.call, 'training'): - step = functools.partial(self.cell.call, training=training) + cell_kwargs['training'] = training + + if constants is not None: + if not has_arg(self.cell.call, 'constants'): + raise TypeError('cell does not take keyword argument constants') + cell_kwargs['constants'] = constants + + if cell_kwargs: + step = functools.partial(self.cell.call, **cell_kwargs) else: step = self.cell.call last_output, outputs, states = K.rnn(step, @@ -544,6 +596,52 @@ def call(self, inputs, mask=None, training=None, initial_state=None): else: return output + def _normalize_args(self, inputs, initial_state=None, constants=None): + """The inputs `initial_state` and `constants` can be passed to + RNN.__call__ either by separate arguments or as part of `inputs`. In + this case `inputs` is a list of tensors of which the first one is the + actual (sequence) input followed by initial states, followed by + constants. + + This method separates and noramlizes the different groups of inputs. + + # Arguments + inputs: tensor of list/tuple of tensors + initial_state: tensor or list of tensors or None + constants: tensor or list of tensors or None + + # Returns + inputs: tensor + initial_state: list of tensors or None + constants: list of tensors or None + """ + if isinstance(inputs, (list, tuple)): + remaining_inputs = inputs[1:] + inputs = inputs[0] + if remaining_inputs and initial_state is None: + if isinstance(self.state_spec, list): + n_states = len(self.state_spec) + else: + n_states = 1 + initial_state = remaining_inputs[:n_states] + remaining_inputs = remaining_inputs[n_states:] + if remaining_inputs and constants is None: + constants = remaining_inputs + if len(remaining_inputs) > 0: + raise ValueError('too many inputs were passed') + + def to_list_or_none(x): # TODO break out? 
+ if x is None or isinstance(x, list): + return x + if isinstance(x, tuple): + return list(x) + return [x] + + initial_state = to_list_or_none(initial_state) + constants = to_list_or_none(constants) + + return inputs, initial_state, constants + def reset_states(self, states=None): if not self.stateful: raise AttributeError('Layer must be stateful.') diff --git a/tests/keras/layers/recurrent_test.py b/tests/keras/layers/recurrent_test.py index fc328caf57d0..24122e1dce99 100644 --- a/tests/keras/layers/recurrent_test.py +++ b/tests/keras/layers/recurrent_test.py @@ -564,5 +564,77 @@ def test_batch_size_equal_one(layer_class): model.train_on_batch(x, y) +def test_rnn_cell_with_constants_layer(): + + class RNNCellWithConstants(keras.layers.Layer): + + def __init__(self, units, **kwargs): + self.units = units + self.state_size = units + super(RNNCellWithConstants, self).__init__(**kwargs) + + def build(self, input_shape): + if not isinstance(input_shape, list): + raise TypeError('expects constants shape') + [input_shape, constant_shape] = input_shape + # will (and should) raise if more than one constant passed + + self.input_kernel = self.add_weight( + shape=(input_shape[-1], self.units), + initializer='uniform', + name='kernel') + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units), + initializer='uniform', + name='recurrent_kernel') + self.constant_kernel = self.add_weight( + shape=(constant_shape[-1], self.units), + initializer='uniform', + name='constant_kernel') + self.built = True + + def call(self, inputs, states, constants): + [prev_output] = states + [constant] = constants + h_input = keras.backend.dot(inputs, self.input_kernel) + h_state = keras.backend.dot(prev_output, self.recurrent_kernel) + h_const = keras.backend.dot(constant, self.constant_kernel) + output = h_input + h_state + h_const + return output, [output] + + def get_config(self): + config = {'units': self.units} + base_config = super(RNNCellWithConstants, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + # Test basic case. + x = keras.Input((None, 5)) + c = keras.Input((3,)) + cell = RNNCellWithConstants(32) + layer = recurrent.RNN(cell) + y = layer(x, constants=c) + model = keras.models.Model([x, c], y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch( + [np.zeros((6, 5, 5)), np.zeros((6, 3))], + np.zeros((6, 32)) + ) + + # Test basic case serialization. 
+ x_np = np.random.random((6, 5, 5)) + c_np = np.random.random((6, 3)) + y_np = model.predict([x_np, c_np]) + weights = model.get_weights() + config = layer.get_config() + with keras.utils.CustomObjectScope( + {'RNNCellWithConstants': RNNCellWithConstants}): + layer = recurrent.RNN.from_config(config) + y = layer(x, constants=c) + model = keras.models.Model([x, c], y) + model.set_weights(weights) + y_np_2 = model.predict([x_np, c_np]) + assert_allclose(y_np, y_np_2, atol=1e-4) + + if __name__ == '__main__': pytest.main([__file__]) From 03b8fdad429e5451a05064a839dec179450d1dec Mon Sep 17 00:00:00 2001 From: andhus Date: Mon, 25 Sep 2017 01:49:10 +0200 Subject: [PATCH 02/13] Added class for allowing functional composition of RNN Cells, supporting constants --- keras/layers/recurrent.py | 132 ++- tests/keras/layers/recurrent_test.py | 1262 ++++++++++++++------------ 2 files changed, 781 insertions(+), 613 deletions(-) diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py index bfcaa0c27114..54e59b7c2ead 100644 --- a/keras/layers/recurrent.py +++ b/keras/layers/recurrent.py @@ -4,6 +4,8 @@ import functools import warnings +from keras.engine import Model +from keras.layers.wrappers import Wrapper from .. import backend as K from .. import activations from .. import initializers @@ -179,6 +181,117 @@ def get_losses_for(self, inputs=None): return losses +class FunctionalRNNCell(Wrapper): + """Wrapper for allowing composition of RNN Cells using functional API. + + # Arguments: + inputs: input tensor at a single time step + outputs: output tensor at a single timestep + input_states: state tensor(s) from previous time step + output_states: state tensor(s) after cell transformation + constants: tensor(s) or None, represents inputs that should be static + (the same) for each time step. + + # Examples + + ```python + # Use functional API to define RNN Cell transformation (in this case + # simple vanilla RNN) for a single time step: + + units = 32 + input_size = 5 + x = Input((input_size,)) + h_tm1 = Input((units,)) + h_ = add([Dense(units)(x), Dense(units, use_bias=False)(h_tm1)]) + h = Activation('tanh')(h_) + + # Create the cell: + + cell = FunctionalRNNCell( + inputs=x, outputs=h, input_states=h_tm1, output_states=h) + + x_sequence = Input((None, input_size)) + rnn = RNN(cell) + y = rnn(x_sequence) + + # We can also define cells that make use of "external" constants, to + # implement attention mechanisms: + + constant_shape = (10,) + c = Input(constant_shape) + density = Dense(constant_shape[0], activation='softmax')( + concatenate([x, h_tm1])) + attention = multiply([density, c]) + h2_ = add([h_, Dense(units)(attention)]) + h2 = Activation('tanh')(h2_) + + attention_cell = FunctionalRNNCell( + inputs=x, outputs=h2, input_states=h_tm1, output_states=h2, + constants=c) + + attention_rnn = RNN(attention_cell) + y2 = attention_rnn(x_sequence, constants=c) + + # Remember to pass the constant to the RNN layer (which will pass it on to + # the cell). 
Also note that shape of c is same as in cell (no time + # dimension added) + + attention_model = Model([x_sequence, c], y2) + ``` + """ + def __init__( + self, + inputs, + outputs, + input_states, + output_states, + constants=None, + **kwargs + ): + input_states = _to_list_or_none(input_states) + output_states = _to_list_or_none(output_states) + constants = _to_list_or_none(constants) + model = Model( + inputs=self._get_model_inputs(inputs, input_states, constants), + outputs=[outputs] + output_states + ) + super(FunctionalRNNCell, self).__init__(layer=model, **kwargs) + + in_states_shape = [K.int_shape(state) for state in input_states] + out_states_shape = [K.int_shape(state) for state in output_states] + if not in_states_shape == out_states_shape: + raise ValueError( + 'shape of input_states: {} are not same as shape of ' + 'output_states: {}'.format(in_states_shape, out_states_shape)) + self._state_size = [state_shape[-1] for state_shape in in_states_shape] + + @property + def state_size(self): + return self._state_size + + def call(self, inputs, states, constants=None): + """Defines the cell transformation for a single time step. + + # Arguments + inputs: Tensor representing input at current time step. + states: Tensor or list/tuple of tensors representing states from + previous time step. + constants: Tensor or list of tensors or None representing inputs + that should be the same at each time step. + """ + outputs = self.layer(self._get_model_inputs(inputs, states, constants)) + output, states = outputs[0], outputs[1:] + + return output, states + + def _get_model_inputs(self, inputs, input_states, constants): + inputs = [inputs] + list(input_states) + if constants is not None: + inputs += constants + + return inputs + + class RNN(Layer): """Base class for recurrent layers. @@ -630,15 +743,8 @@ def _normalize_args(self, inputs, initial_state=None, constants=None): if len(remaining_inputs) > 0: raise ValueError('too many inputs were passed') - def to_list_or_none(x): # TODO break out? - if x is None or isinstance(x, list): - return x - if isinstance(x, tuple): - return list(x) - return [x] - - initial_state = to_list_or_none(initial_state) - constants = to_list_or_none(constants) + initial_state = _to_list_or_none(initial_state) + constants = _to_list_or_none(constants) return inputs, initial_state, constants @@ -2001,3 +2107,11 @@ def from_config(cls, config): if 'implementation' in config and config['implementation'] == 0: config['implementation'] = 1 return cls(**config) + + +def _to_list_or_none(x): # TODO move? Very similar to topology._to_list + if x is None or isinstance(x, list): + return x + if isinstance(x, tuple): + return list(x) + return [x] diff --git a/tests/keras/layers/recurrent_test.py b/tests/keras/layers/recurrent_test.py index 24122e1dce99..47992eca5af9 100644 --- a/tests/keras/layers/recurrent_test.py +++ b/tests/keras/layers/recurrent_test.py @@ -16,625 +16,679 @@ num_samples, timesteps, embedding_dim, units = 2, 5, 4, 3 embedding_num = 12 - - -@keras_test -def rnn_test(f): - """ - All the recurrent layers share the same interface, - so we can run through them with a single function. 
- """ - f = keras_test(f) - return pytest.mark.parametrize('layer_class', [ - recurrent.SimpleRNN, - recurrent.GRU, - recurrent.LSTM - ])(f) - - -@rnn_test -def test_return_sequences(layer_class): - layer_test(layer_class, - kwargs={'units': units, - 'return_sequences': True}, - input_shape=(num_samples, timesteps, embedding_dim)) - - -@rnn_test -def test_dynamic_behavior(layer_class): - layer = layer_class(units, input_shape=(None, embedding_dim)) - model = Sequential() - model.add(layer) - model.compile('sgd', 'mse') - x = np.random.random((num_samples, timesteps, embedding_dim)) - y = np.random.random((num_samples, units)) - model.train_on_batch(x, y) - - -@rnn_test -def test_stateful_invalid_use(layer_class): - layer = layer_class(units, - stateful=True, - batch_input_shape=(num_samples, - timesteps, - embedding_dim)) - model = Sequential() - model.add(layer) - model.compile('sgd', 'mse') - x = np.random.random((num_samples * 2, timesteps, embedding_dim)) - y = np.random.random((num_samples * 2, units)) - with pytest.raises(ValueError): - model.fit(x, y) - with pytest.raises(ValueError): - model.predict(x, batch_size=num_samples + 1) - - -@rnn_test -@pytest.mark.skipif((K.backend() == 'cntk'), - reason='Not yet supported.') -def test_dropout(layer_class): - for unroll in [True, False]: - layer_test(layer_class, - kwargs={'units': units, - 'dropout': 0.1, - 'recurrent_dropout': 0.1, - 'unroll': unroll}, - input_shape=(num_samples, timesteps, embedding_dim)) - - # Test that dropout is applied during training - x = K.ones((num_samples, timesteps, embedding_dim)) - layer = layer_class(units, dropout=0.5, recurrent_dropout=0.5, - input_shape=(timesteps, embedding_dim)) - y = layer(x) - assert y._uses_learning_phase - - y = layer(x, training=True) - assert not getattr(y, '_uses_learning_phase') - - # Test that dropout is not applied during testing - x = np.random.random((num_samples, timesteps, embedding_dim)) - layer = layer_class(units, dropout=0.5, recurrent_dropout=0.5, - unroll=unroll, - input_shape=(timesteps, embedding_dim)) - model = Sequential([layer]) - assert model.uses_learning_phase - y1 = model.predict(x) - y2 = model.predict(x) - assert_allclose(y1, y2) - - -@rnn_test -def test_statefulness(layer_class): - model = Sequential() - model.add(embeddings.Embedding(embedding_num, embedding_dim, - mask_zero=True, - input_length=timesteps, - batch_input_shape=(num_samples, timesteps))) - layer = layer_class(units, return_sequences=False, - stateful=True, - weights=None) - model.add(layer) - model.compile(optimizer='sgd', loss='mse') - out1 = model.predict(np.ones((num_samples, timesteps))) - assert(out1.shape == (num_samples, units)) - - # train once so that the states change - model.train_on_batch(np.ones((num_samples, timesteps)), - np.ones((num_samples, units))) - out2 = model.predict(np.ones((num_samples, timesteps))) - - # if the state is not reset, output should be different - assert(out1.max() != out2.max()) - - # check that output changes after states are reset - # (even though the model itself didn't change) - layer.reset_states() - out3 = model.predict(np.ones((num_samples, timesteps))) - assert(out2.max() != out3.max()) - - # check that container-level reset_states() works - model.reset_states() - out4 = model.predict(np.ones((num_samples, timesteps))) - assert_allclose(out3, out4, atol=1e-5) - - # check that the call to `predict` updated the states - out5 = model.predict(np.ones((num_samples, timesteps))) - assert(out4.max() != out5.max()) - - -@rnn_test -def 
test_masking_correctness(layer_class): - # Check masking: output with left padding and right padding - # should be the same. - model = Sequential() - model.add(embeddings.Embedding(embedding_num, embedding_dim, - mask_zero=True, - input_length=timesteps, - batch_input_shape=(num_samples, timesteps))) - layer = layer_class(units, return_sequences=False) - model.add(layer) - model.compile(optimizer='sgd', loss='mse') - - left_padded_input = np.ones((num_samples, timesteps)) - left_padded_input[0, :1] = 0 - left_padded_input[1, :2] = 0 - out6 = model.predict(left_padded_input) - - right_padded_input = np.ones((num_samples, timesteps)) - right_padded_input[0, -1:] = 0 - right_padded_input[1, -2:] = 0 - out7 = model.predict(right_padded_input) - - assert_allclose(out7, out6, atol=1e-5) - - -@rnn_test -def test_implementation_mode(layer_class): - for mode in [1, 2]: - # Without dropout - layer_test(layer_class, - kwargs={'units': units, - 'implementation': mode}, - input_shape=(num_samples, timesteps, embedding_dim)) - # With dropout - layer_test(layer_class, - kwargs={'units': units, - 'implementation': mode, - 'dropout': 0.1, - 'recurrent_dropout': 0.1}, - input_shape=(num_samples, timesteps, embedding_dim)) - - -@rnn_test -def test_regularizer(layer_class): - layer = layer_class(units, return_sequences=False, weights=None, - input_shape=(timesteps, embedding_dim), - kernel_regularizer=regularizers.l1(0.01), - recurrent_regularizer=regularizers.l1(0.01), - bias_regularizer='l2') - layer.build((None, None, embedding_dim)) - assert len(layer.losses) == 3 - assert len(layer.cell.losses) == 3 - - layer = layer_class(units, return_sequences=False, weights=None, - input_shape=(timesteps, embedding_dim), - activity_regularizer='l2') - assert layer.activity_regularizer - x = K.variable(np.ones((num_samples, timesteps, embedding_dim))) - layer(x) - assert len(layer.cell.get_losses_for(x)) == 0 - assert len(layer.get_losses_for(x)) == 1 - - -@keras_test -def test_masking_layer(): - ''' This test based on a previously failing issue here: - https://github.com/fchollet/keras/issues/1567 - ''' - inputs = np.random.random((6, 3, 4)) - targets = np.abs(np.random.random((6, 3, 5))) - targets /= targets.sum(axis=-1, keepdims=True) - - model = Sequential() - model.add(Masking(input_shape=(3, 4))) - model.add(recurrent.SimpleRNN(units=5, return_sequences=True, unroll=False)) - model.compile(loss='categorical_crossentropy', optimizer='adam') - model.fit(inputs, targets, epochs=1, batch_size=100, verbose=1) - - model = Sequential() - model.add(Masking(input_shape=(3, 4))) - model.add(recurrent.SimpleRNN(units=5, return_sequences=True, unroll=True)) - model.compile(loss='categorical_crossentropy', optimizer='adam') - model.fit(inputs, targets, epochs=1, batch_size=100, verbose=1) - - -@rnn_test -def test_from_config(layer_class): - stateful_flags = (False, True) - for stateful in stateful_flags: - l1 = layer_class(units=1, stateful=stateful) - l2 = layer_class.from_config(l1.get_config()) - assert l1.get_config() == l2.get_config() - - -@rnn_test -def test_specify_initial_state_keras_tensor(layer_class): - num_states = 2 if layer_class is recurrent.LSTM else 1 - - # Test with Keras tensor - inputs = Input((timesteps, embedding_dim)) - initial_state = [Input((units,)) for _ in range(num_states)] - layer = layer_class(units) - if len(initial_state) == 1: - output = layer(inputs, initial_state=initial_state[0]) - else: - output = layer(inputs, initial_state=initial_state) - assert initial_state[0] in 
layer.inbound_nodes[0].input_tensors - - model = Model([inputs] + initial_state, output) - model.compile(loss='categorical_crossentropy', optimizer='adam') - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - initial_state = [np.random.random((num_samples, units)) - for _ in range(num_states)] - targets = np.random.random((num_samples, units)) - model.fit([inputs] + initial_state, targets) - - -@rnn_test -def test_specify_initial_state_non_keras_tensor(layer_class): - num_states = 2 if layer_class is recurrent.LSTM else 1 - - # Test with non-Keras tensor - inputs = Input((timesteps, embedding_dim)) - initial_state = [K.random_normal_variable((num_samples, units), 0, 1) - for _ in range(num_states)] - layer = layer_class(units) - output = layer(inputs, initial_state=initial_state) - - model = Model(inputs, output) - model.compile(loss='categorical_crossentropy', optimizer='adam') - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - targets = np.random.random((num_samples, units)) - model.fit(inputs, targets) - - -@rnn_test -def test_reset_states_with_values(layer_class): - num_states = 2 if layer_class is recurrent.LSTM else 1 - - layer = layer_class(units, stateful=True) - layer.build((num_samples, timesteps, embedding_dim)) - layer.reset_states() - assert len(layer.states) == num_states - assert layer.states[0] is not None - np.testing.assert_allclose(K.eval(layer.states[0]), - np.zeros(K.int_shape(layer.states[0])), - atol=1e-4) - state_shapes = [K.int_shape(state) for state in layer.states] - values = [np.ones(shape) for shape in state_shapes] - if len(values) == 1: - values = values[0] - layer.reset_states(values) - np.testing.assert_allclose(K.eval(layer.states[0]), - np.ones(K.int_shape(layer.states[0])), - atol=1e-4) - - # Test fit with invalid data - with pytest.raises(ValueError): - layer.reset_states([1] * (len(layer.states) + 1)) - - -@rnn_test -def test_initial_states_as_other_inputs(layer_class): - num_states = 2 if layer_class is recurrent.LSTM else 1 - - # Test with Keras tensor - main_inputs = Input((timesteps, embedding_dim)) - initial_state = [Input((units,)) for _ in range(num_states)] - inputs = [main_inputs] + initial_state - - layer = layer_class(units) - output = layer(inputs) - assert initial_state[0] in layer.inbound_nodes[0].input_tensors - - model = Model(inputs, output) - model.compile(loss='categorical_crossentropy', optimizer='adam') - - main_inputs = np.random.random((num_samples, timesteps, embedding_dim)) - initial_state = [np.random.random((num_samples, units)) - for _ in range(num_states)] - targets = np.random.random((num_samples, units)) - model.train_on_batch([main_inputs] + initial_state, targets) - - -@rnn_test -def test_specify_state_with_masking(layer_class): - ''' This test based on a previously failing issue here: - https://github.com/fchollet/keras/issues/1567 - ''' - num_states = 2 if layer_class is recurrent.LSTM else 1 - - inputs = Input((timesteps, embedding_dim)) - _ = Masking()(inputs) - initial_state = [Input((units,)) for _ in range(num_states)] - output = layer_class(units)(inputs, initial_state=initial_state) - - model = Model([inputs] + initial_state, output) - model.compile(loss='categorical_crossentropy', optimizer='adam') - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - initial_state = [np.random.random((num_samples, units)) - for _ in range(num_states)] - targets = np.random.random((num_samples, units)) - model.fit([inputs] + initial_state, targets) 
- - -@rnn_test -def test_return_state(layer_class): - num_states = 2 if layer_class is recurrent.LSTM else 1 - - inputs = Input(batch_shape=(num_samples, timesteps, embedding_dim)) - layer = layer_class(units, return_state=True, stateful=True) - outputs = layer(inputs) - output, state = outputs[0], outputs[1:] - assert len(state) == num_states - model = Model(inputs, state[0]) - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - state = model.predict(inputs) - np.testing.assert_allclose(K.eval(layer.states[0]), state, atol=1e-4) - - -@rnn_test -def test_state_reuse(layer_class): - inputs = Input(batch_shape=(num_samples, timesteps, embedding_dim)) - layer = layer_class(units, return_state=True, return_sequences=True) - outputs = layer(inputs) - output, state = outputs[0], outputs[1:] - output = layer_class(units)(output, initial_state=state) - model = Model(inputs, output) - - inputs = np.random.random((num_samples, timesteps, embedding_dim)) - outputs = model.predict(inputs) - - -def test_minimal_rnn_cell_non_layer(): - - class MinimalRNNCell(object): - - def __init__(self, units, input_dim): - self.units = units - self.state_size = units - self.kernel = keras.backend.variable( - np.random.random((input_dim, units))) - - def call(self, inputs, states): - prev_output = states[0] - output = keras.backend.dot(inputs, self.kernel) + prev_output - return output, [output] - - # Basic test case. - cell = MinimalRNNCell(32, 5) - x = keras.Input((None, 5)) - layer = recurrent.RNN(cell) - y = layer(x) - model = keras.models.Model(x, y) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - # Test stacking. - cells = [MinimalRNNCell(8, 5), - MinimalRNNCell(32, 8), - MinimalRNNCell(32, 32)] - layer = recurrent.RNN(cells) - y = layer(x) - model = keras.models.Model(x, y) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - -def test_minimal_rnn_cell_non_layer_multiple_states(): - - class MinimalRNNCell(object): - - def __init__(self, units, input_dim): - self.units = units - self.state_size = (units, units) - self.kernel = keras.backend.variable( - np.random.random((input_dim, units))) - - def call(self, inputs, states): - prev_output_1 = states[0] - prev_output_2 = states[1] - output = keras.backend.dot(inputs, self.kernel) - output += prev_output_1 - output -= prev_output_2 - return output, [output * 2, output * 3] - - # Basic test case. - cell = MinimalRNNCell(32, 5) - x = keras.Input((None, 5)) - layer = recurrent.RNN(cell) - y = layer(x) - model = keras.models.Model(x, y) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - # Test stacking. 
- cells = [MinimalRNNCell(8, 5), - MinimalRNNCell(16, 8), - MinimalRNNCell(32, 16)] - layer = recurrent.RNN(cells) - assert layer.cell.state_size == (32, 32, 16, 16, 8, 8) - y = layer(x) - model = keras.models.Model(x, y) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - -def test_minimal_rnn_cell_layer(): - - class MinimalRNNCell(keras.layers.Layer): - - def __init__(self, units, **kwargs): - self.units = units - self.state_size = units - super(MinimalRNNCell, self).__init__(**kwargs) - - def build(self, input_shape): - self.kernel = self.add_weight(shape=(input_shape[-1], self.units), - initializer='uniform', - name='kernel') - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units), - initializer='uniform', - name='recurrent_kernel') - self.built = True - - def call(self, inputs, states): - prev_output = states[0] - h = keras.backend.dot(inputs, self.kernel) - output = h + keras.backend.dot(prev_output, self.recurrent_kernel) - return output, [output] - - def get_config(self): - config = {'units': self.units} - base_config = super(MinimalRNNCell, self).get_config() - return dict(list(base_config.items()) + list(config.items())) +# +# +# @keras_test +# def rnn_test(f): +# """ +# All the recurrent layers share the same interface, +# so we can run through them with a single function. +# """ +# f = keras_test(f) +# return pytest.mark.parametrize('layer_class', [ +# recurrent.SimpleRNN, +# recurrent.GRU, +# recurrent.LSTM +# ])(f) +# +# +# @rnn_test +# def test_return_sequences(layer_class): +# layer_test(layer_class, +# kwargs={'units': units, +# 'return_sequences': True}, +# input_shape=(num_samples, timesteps, embedding_dim)) +# +# +# @rnn_test +# def test_dynamic_behavior(layer_class): +# layer = layer_class(units, input_shape=(None, embedding_dim)) +# model = Sequential() +# model.add(layer) +# model.compile('sgd', 'mse') +# x = np.random.random((num_samples, timesteps, embedding_dim)) +# y = np.random.random((num_samples, units)) +# model.train_on_batch(x, y) +# +# +# @rnn_test +# def test_stateful_invalid_use(layer_class): +# layer = layer_class(units, +# stateful=True, +# batch_input_shape=(num_samples, +# timesteps, +# embedding_dim)) +# model = Sequential() +# model.add(layer) +# model.compile('sgd', 'mse') +# x = np.random.random((num_samples * 2, timesteps, embedding_dim)) +# y = np.random.random((num_samples * 2, units)) +# with pytest.raises(ValueError): +# model.fit(x, y) +# with pytest.raises(ValueError): +# model.predict(x, batch_size=num_samples + 1) +# +# +# @rnn_test +# @pytest.mark.skipif((K.backend() == 'cntk'), +# reason='Not yet supported.') +# def test_dropout(layer_class): +# for unroll in [True, False]: +# layer_test(layer_class, +# kwargs={'units': units, +# 'dropout': 0.1, +# 'recurrent_dropout': 0.1, +# 'unroll': unroll}, +# input_shape=(num_samples, timesteps, embedding_dim)) +# +# # Test that dropout is applied during training +# x = K.ones((num_samples, timesteps, embedding_dim)) +# layer = layer_class(units, dropout=0.5, recurrent_dropout=0.5, +# input_shape=(timesteps, embedding_dim)) +# y = layer(x) +# assert y._uses_learning_phase +# +# y = layer(x, training=True) +# assert not getattr(y, '_uses_learning_phase') +# +# # Test that dropout is not applied during testing +# x = np.random.random((num_samples, timesteps, embedding_dim)) +# layer = layer_class(units, dropout=0.5, recurrent_dropout=0.5, +# unroll=unroll, +# input_shape=(timesteps, embedding_dim)) +# model = 
Sequential([layer]) +# assert model.uses_learning_phase +# y1 = model.predict(x) +# y2 = model.predict(x) +# assert_allclose(y1, y2) +# +# +# @rnn_test +# def test_statefulness(layer_class): +# model = Sequential() +# model.add(embeddings.Embedding(embedding_num, embedding_dim, +# mask_zero=True, +# input_length=timesteps, +# batch_input_shape=(num_samples, timesteps))) +# layer = layer_class(units, return_sequences=False, +# stateful=True, +# weights=None) +# model.add(layer) +# model.compile(optimizer='sgd', loss='mse') +# out1 = model.predict(np.ones((num_samples, timesteps))) +# assert(out1.shape == (num_samples, units)) +# +# # train once so that the states change +# model.train_on_batch(np.ones((num_samples, timesteps)), +# np.ones((num_samples, units))) +# out2 = model.predict(np.ones((num_samples, timesteps))) +# +# # if the state is not reset, output should be different +# assert(out1.max() != out2.max()) +# +# # check that output changes after states are reset +# # (even though the model itself didn't change) +# layer.reset_states() +# out3 = model.predict(np.ones((num_samples, timesteps))) +# assert(out2.max() != out3.max()) +# +# # check that container-level reset_states() works +# model.reset_states() +# out4 = model.predict(np.ones((num_samples, timesteps))) +# assert_allclose(out3, out4, atol=1e-5) +# +# # check that the call to `predict` updated the states +# out5 = model.predict(np.ones((num_samples, timesteps))) +# assert(out4.max() != out5.max()) +# +# +# @rnn_test +# def test_masking_correctness(layer_class): +# # Check masking: output with left padding and right padding +# # should be the same. +# model = Sequential() +# model.add(embeddings.Embedding(embedding_num, embedding_dim, +# mask_zero=True, +# input_length=timesteps, +# batch_input_shape=(num_samples, timesteps))) +# layer = layer_class(units, return_sequences=False) +# model.add(layer) +# model.compile(optimizer='sgd', loss='mse') +# +# left_padded_input = np.ones((num_samples, timesteps)) +# left_padded_input[0, :1] = 0 +# left_padded_input[1, :2] = 0 +# out6 = model.predict(left_padded_input) +# +# right_padded_input = np.ones((num_samples, timesteps)) +# right_padded_input[0, -1:] = 0 +# right_padded_input[1, -2:] = 0 +# out7 = model.predict(right_padded_input) +# +# assert_allclose(out7, out6, atol=1e-5) +# +# +# @rnn_test +# def test_implementation_mode(layer_class): +# for mode in [1, 2]: +# # Without dropout +# layer_test(layer_class, +# kwargs={'units': units, +# 'implementation': mode}, +# input_shape=(num_samples, timesteps, embedding_dim)) +# # With dropout +# layer_test(layer_class, +# kwargs={'units': units, +# 'implementation': mode, +# 'dropout': 0.1, +# 'recurrent_dropout': 0.1}, +# input_shape=(num_samples, timesteps, embedding_dim)) +# +# +# @rnn_test +# def test_regularizer(layer_class): +# layer = layer_class(units, return_sequences=False, weights=None, +# input_shape=(timesteps, embedding_dim), +# kernel_regularizer=regularizers.l1(0.01), +# recurrent_regularizer=regularizers.l1(0.01), +# bias_regularizer='l2') +# layer.build((None, None, embedding_dim)) +# assert len(layer.losses) == 3 +# assert len(layer.cell.losses) == 3 +# +# layer = layer_class(units, return_sequences=False, weights=None, +# input_shape=(timesteps, embedding_dim), +# activity_regularizer='l2') +# assert layer.activity_regularizer +# x = K.variable(np.ones((num_samples, timesteps, embedding_dim))) +# layer(x) +# assert len(layer.cell.get_losses_for(x)) == 0 +# assert len(layer.get_losses_for(x)) == 1 +# +# +# 
@keras_test +# def test_masking_layer(): +# ''' This test based on a previously failing issue here: +# https://github.com/fchollet/keras/issues/1567 +# ''' +# inputs = np.random.random((6, 3, 4)) +# targets = np.abs(np.random.random((6, 3, 5))) +# targets /= targets.sum(axis=-1, keepdims=True) +# +# model = Sequential() +# model.add(Masking(input_shape=(3, 4))) +# model.add(recurrent.SimpleRNN(units=5, return_sequences=True, unroll=False)) +# model.compile(loss='categorical_crossentropy', optimizer='adam') +# model.fit(inputs, targets, epochs=1, batch_size=100, verbose=1) +# +# model = Sequential() +# model.add(Masking(input_shape=(3, 4))) +# model.add(recurrent.SimpleRNN(units=5, return_sequences=True, unroll=True)) +# model.compile(loss='categorical_crossentropy', optimizer='adam') +# model.fit(inputs, targets, epochs=1, batch_size=100, verbose=1) +# +# +# @rnn_test +# def test_from_config(layer_class): +# stateful_flags = (False, True) +# for stateful in stateful_flags: +# l1 = layer_class(units=1, stateful=stateful) +# l2 = layer_class.from_config(l1.get_config()) +# assert l1.get_config() == l2.get_config() +# +# +# @rnn_test +# def test_specify_initial_state_keras_tensor(layer_class): +# num_states = 2 if layer_class is recurrent.LSTM else 1 +# +# # Test with Keras tensor +# inputs = Input((timesteps, embedding_dim)) +# initial_state = [Input((units,)) for _ in range(num_states)] +# layer = layer_class(units) +# if len(initial_state) == 1: +# output = layer(inputs, initial_state=initial_state[0]) +# else: +# output = layer(inputs, initial_state=initial_state) +# assert initial_state[0] in layer.inbound_nodes[0].input_tensors +# +# model = Model([inputs] + initial_state, output) +# model.compile(loss='categorical_crossentropy', optimizer='adam') +# +# inputs = np.random.random((num_samples, timesteps, embedding_dim)) +# initial_state = [np.random.random((num_samples, units)) +# for _ in range(num_states)] +# targets = np.random.random((num_samples, units)) +# model.fit([inputs] + initial_state, targets) +# +# +# @rnn_test +# def test_specify_initial_state_non_keras_tensor(layer_class): +# num_states = 2 if layer_class is recurrent.LSTM else 1 +# +# # Test with non-Keras tensor +# inputs = Input((timesteps, embedding_dim)) +# initial_state = [K.random_normal_variable((num_samples, units), 0, 1) +# for _ in range(num_states)] +# layer = layer_class(units) +# output = layer(inputs, initial_state=initial_state) +# +# model = Model(inputs, output) +# model.compile(loss='categorical_crossentropy', optimizer='adam') +# +# inputs = np.random.random((num_samples, timesteps, embedding_dim)) +# targets = np.random.random((num_samples, units)) +# model.fit(inputs, targets) +# +# +# @rnn_test +# def test_reset_states_with_values(layer_class): +# num_states = 2 if layer_class is recurrent.LSTM else 1 +# +# layer = layer_class(units, stateful=True) +# layer.build((num_samples, timesteps, embedding_dim)) +# layer.reset_states() +# assert len(layer.states) == num_states +# assert layer.states[0] is not None +# np.testing.assert_allclose(K.eval(layer.states[0]), +# np.zeros(K.int_shape(layer.states[0])), +# atol=1e-4) +# state_shapes = [K.int_shape(state) for state in layer.states] +# values = [np.ones(shape) for shape in state_shapes] +# if len(values) == 1: +# values = values[0] +# layer.reset_states(values) +# np.testing.assert_allclose(K.eval(layer.states[0]), +# np.ones(K.int_shape(layer.states[0])), +# atol=1e-4) +# +# # Test fit with invalid data +# with 
pytest.raises(ValueError): +# layer.reset_states([1] * (len(layer.states) + 1)) +# +# +# @rnn_test +# def test_initial_states_as_other_inputs(layer_class): +# num_states = 2 if layer_class is recurrent.LSTM else 1 +# +# # Test with Keras tensor +# main_inputs = Input((timesteps, embedding_dim)) +# initial_state = [Input((units,)) for _ in range(num_states)] +# inputs = [main_inputs] + initial_state +# +# layer = layer_class(units) +# output = layer(inputs) +# assert initial_state[0] in layer.inbound_nodes[0].input_tensors +# +# model = Model(inputs, output) +# model.compile(loss='categorical_crossentropy', optimizer='adam') +# +# main_inputs = np.random.random((num_samples, timesteps, embedding_dim)) +# initial_state = [np.random.random((num_samples, units)) +# for _ in range(num_states)] +# targets = np.random.random((num_samples, units)) +# model.train_on_batch([main_inputs] + initial_state, targets) +# +# +# @rnn_test +# def test_specify_state_with_masking(layer_class): +# ''' This test based on a previously failing issue here: +# https://github.com/fchollet/keras/issues/1567 +# ''' +# num_states = 2 if layer_class is recurrent.LSTM else 1 +# +# inputs = Input((timesteps, embedding_dim)) +# _ = Masking()(inputs) +# initial_state = [Input((units,)) for _ in range(num_states)] +# output = layer_class(units)(inputs, initial_state=initial_state) +# +# model = Model([inputs] + initial_state, output) +# model.compile(loss='categorical_crossentropy', optimizer='adam') +# +# inputs = np.random.random((num_samples, timesteps, embedding_dim)) +# initial_state = [np.random.random((num_samples, units)) +# for _ in range(num_states)] +# targets = np.random.random((num_samples, units)) +# model.fit([inputs] + initial_state, targets) +# +# +# @rnn_test +# def test_return_state(layer_class): +# num_states = 2 if layer_class is recurrent.LSTM else 1 +# +# inputs = Input(batch_shape=(num_samples, timesteps, embedding_dim)) +# layer = layer_class(units, return_state=True, stateful=True) +# outputs = layer(inputs) +# output, state = outputs[0], outputs[1:] +# assert len(state) == num_states +# model = Model(inputs, state[0]) +# +# inputs = np.random.random((num_samples, timesteps, embedding_dim)) +# state = model.predict(inputs) +# np.testing.assert_allclose(K.eval(layer.states[0]), state, atol=1e-4) +# +# +# @rnn_test +# def test_state_reuse(layer_class): +# inputs = Input(batch_shape=(num_samples, timesteps, embedding_dim)) +# layer = layer_class(units, return_state=True, return_sequences=True) +# outputs = layer(inputs) +# output, state = outputs[0], outputs[1:] +# output = layer_class(units)(output, initial_state=state) +# model = Model(inputs, output) +# +# inputs = np.random.random((num_samples, timesteps, embedding_dim)) +# outputs = model.predict(inputs) +# +# +# def test_minimal_rnn_cell_non_layer(): +# +# class MinimalRNNCell(object): +# +# def __init__(self, units, input_dim): +# self.units = units +# self.state_size = units +# self.kernel = keras.backend.variable( +# np.random.random((input_dim, units))) +# +# def call(self, inputs, states): +# prev_output = states[0] +# output = keras.backend.dot(inputs, self.kernel) + prev_output +# return output, [output] +# +# # Basic test case. +# cell = MinimalRNNCell(32, 5) +# x = keras.Input((None, 5)) +# layer = recurrent.RNN(cell) +# y = layer(x) +# model = keras.models.Model(x, y) +# model.compile(optimizer='rmsprop', loss='mse') +# model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) +# +# # Test stacking. 
+# cells = [MinimalRNNCell(8, 5), +# MinimalRNNCell(32, 8), +# MinimalRNNCell(32, 32)] +# layer = recurrent.RNN(cells) +# y = layer(x) +# model = keras.models.Model(x, y) +# model.compile(optimizer='rmsprop', loss='mse') +# model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) +# +# +# def test_minimal_rnn_cell_non_layer_multiple_states(): +# +# class MinimalRNNCell(object): +# +# def __init__(self, units, input_dim): +# self.units = units +# self.state_size = (units, units) +# self.kernel = keras.backend.variable( +# np.random.random((input_dim, units))) +# +# def call(self, inputs, states): +# prev_output_1 = states[0] +# prev_output_2 = states[1] +# output = keras.backend.dot(inputs, self.kernel) +# output += prev_output_1 +# output -= prev_output_2 +# return output, [output * 2, output * 3] +# +# # Basic test case. +# cell = MinimalRNNCell(32, 5) +# x = keras.Input((None, 5)) +# layer = recurrent.RNN(cell) +# y = layer(x) +# model = keras.models.Model(x, y) +# model.compile(optimizer='rmsprop', loss='mse') +# model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) +# +# # Test stacking. +# cells = [MinimalRNNCell(8, 5), +# MinimalRNNCell(16, 8), +# MinimalRNNCell(32, 16)] +# layer = recurrent.RNN(cells) +# assert layer.cell.state_size == (32, 32, 16, 16, 8, 8) +# y = layer(x) +# model = keras.models.Model(x, y) +# model.compile(optimizer='rmsprop', loss='mse') +# model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) +# +# +# def test_minimal_rnn_cell_layer(): +# +# class MinimalRNNCell(keras.layers.Layer): +# +# def __init__(self, units, **kwargs): +# self.units = units +# self.state_size = units +# super(MinimalRNNCell, self).__init__(**kwargs) +# +# def build(self, input_shape): +# self.kernel = self.add_weight(shape=(input_shape[-1], self.units), +# initializer='uniform', +# name='kernel') +# self.recurrent_kernel = self.add_weight( +# shape=(self.units, self.units), +# initializer='uniform', +# name='recurrent_kernel') +# self.built = True +# +# def call(self, inputs, states): +# prev_output = states[0] +# h = keras.backend.dot(inputs, self.kernel) +# output = h + keras.backend.dot(prev_output, self.recurrent_kernel) +# return output, [output] +# +# def get_config(self): +# config = {'units': self.units} +# base_config = super(MinimalRNNCell, self).get_config() +# return dict(list(base_config.items()) + list(config.items())) +# +# # Test basic case. +# x = keras.Input((None, 5)) +# cell = MinimalRNNCell(32) +# layer = recurrent.RNN(cell) +# y = layer(x) +# model = keras.models.Model(x, y) +# model.compile(optimizer='rmsprop', loss='mse') +# model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) +# +# # Test basic case serialization. +# x_np = np.random.random((6, 5, 5)) +# y_np = model.predict(x_np) +# weights = model.get_weights() +# config = layer.get_config() +# with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}): +# layer = recurrent.RNN.from_config(config) +# y = layer(x) +# model = keras.models.Model(x, y) +# model.set_weights(weights) +# y_np_2 = model.predict(x_np) +# assert_allclose(y_np, y_np_2, atol=1e-4) +# +# # Test stacking. +# cells = [MinimalRNNCell(8), +# MinimalRNNCell(12), +# MinimalRNNCell(32)] +# layer = recurrent.RNN(cells) +# y = layer(x) +# model = keras.models.Model(x, y) +# model.compile(optimizer='rmsprop', loss='mse') +# model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) +# +# # Test stacked RNN serialization. 
+# x_np = np.random.random((6, 5, 5)) +# y_np = model.predict(x_np) +# weights = model.get_weights() +# config = layer.get_config() +# with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}): +# layer = recurrent.RNN.from_config(config) +# y = layer(x) +# model = keras.models.Model(x, y) +# model.set_weights(weights) +# y_np_2 = model.predict(x_np) +# assert_allclose(y_np, y_np_2, atol=1e-4) +# +# +# def test_stacked_rnn_attributes(): +# cells = [recurrent.LSTMCell(3), +# recurrent.LSTMCell(3, kernel_regularizer='l2')] +# layer = recurrent.RNN(cells) +# layer.build((None, None, 5)) +# +# # Test regularization losses +# assert len(layer.losses) == 1 +# +# # Test weights +# assert len(layer.trainable_weights) == 6 +# cells[0].trainable = False +# assert len(layer.trainable_weights) == 3 +# assert len(layer.non_trainable_weights) == 3 +# +# # Test `get_losses_for` +# x = keras.Input((None, 5)) +# y = K.sum(x) +# cells[0].add_loss(y, inputs=x) +# assert layer.get_losses_for(x) == [y] +# +# +# @rnn_test +# def test_batch_size_equal_one(layer_class): +# inputs = Input(batch_shape=(1, timesteps, embedding_dim)) +# layer = layer_class(units) +# outputs = layer(inputs) +# model = Model(inputs, outputs) +# model.compile('sgd', 'mse') +# x = np.random.random((1, timesteps, embedding_dim)) +# y = np.random.random((1, units)) +# model.train_on_batch(x, y) +# +# +# def test_rnn_cell_with_constants_layer(): +# +# class RNNCellWithConstants(keras.layers.Layer): +# +# def __init__(self, units, **kwargs): +# self.units = units +# self.state_size = units +# super(RNNCellWithConstants, self).__init__(**kwargs) +# +# def build(self, input_shape): +# if not isinstance(input_shape, list): +# raise TypeError('expects constants shape') +# [input_shape, constant_shape] = input_shape +# # will (and should) raise if more than one constant passed +# +# self.input_kernel = self.add_weight( +# shape=(input_shape[-1], self.units), +# initializer='uniform', +# name='kernel') +# self.recurrent_kernel = self.add_weight( +# shape=(self.units, self.units), +# initializer='uniform', +# name='recurrent_kernel') +# self.constant_kernel = self.add_weight( +# shape=(constant_shape[-1], self.units), +# initializer='uniform', +# name='constant_kernel') +# self.built = True +# +# def call(self, inputs, states, constants): +# [prev_output] = states +# [constant] = constants +# h_input = keras.backend.dot(inputs, self.input_kernel) +# h_state = keras.backend.dot(prev_output, self.recurrent_kernel) +# h_const = keras.backend.dot(constant, self.constant_kernel) +# output = h_input + h_state + h_const +# return output, [output] +# +# def get_config(self): +# config = {'units': self.units} +# base_config = super(RNNCellWithConstants, self).get_config() +# return dict(list(base_config.items()) + list(config.items())) +# +# # Test basic case. +# x = keras.Input((None, 5)) +# c = keras.Input((3,)) +# cell = RNNCellWithConstants(32) +# layer = recurrent.RNN(cell) +# y = layer(x, constants=c) +# model = keras.models.Model([x, c], y) +# model.compile(optimizer='rmsprop', loss='mse') +# model.train_on_batch( +# [np.zeros((6, 5, 5)), np.zeros((6, 3))], +# np.zeros((6, 32)) +# ) +# +# # Test basic case serialization. 
+# x_np = np.random.random((6, 5, 5)) +# c_np = np.random.random((6, 3)) +# y_np = model.predict([x_np, c_np]) +# weights = model.get_weights() +# config = layer.get_config() +# with keras.utils.CustomObjectScope( +# {'RNNCellWithConstants': RNNCellWithConstants}): +# layer = recurrent.RNN.from_config(config) +# y = layer(x, constants=c) +# model = keras.models.Model([x, c], y) +# model.set_weights(weights) +# y_np_2 = model.predict([x_np, c_np]) +# assert_allclose(y_np, y_np_2, atol=1e-4) + + +def test_functional_rnn_cell(): + layers = keras.layers + + # Create the cell: + units = 8 + input_size = 5 + x = Input((input_size,)) + h_tm1 = Input((units,)) + h_ = layers.add([layers.Dense(units)(x), layers.Dense(units)(h_tm1)]) + h = layers.Activation('tanh')(h_) + cell = recurrent.FunctionalRNNCell( + inputs=x, outputs=h, input_states=h_tm1, output_states=h) # Test basic case. - x = keras.Input((None, 5)) - cell = MinimalRNNCell(32) + x_seq = Input((None, input_size)) layer = recurrent.RNN(cell) - y = layer(x) - model = keras.models.Model(x, y) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) - - # Test basic case serialization. - x_np = np.random.random((6, 5, 5)) - y_np = model.predict(x_np) - weights = model.get_weights() - config = layer.get_config() - with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}): - layer = recurrent.RNN.from_config(config) - y = layer(x) - model = keras.models.Model(x, y) - model.set_weights(weights) - y_np_2 = model.predict(x_np) - assert_allclose(y_np, y_np_2, atol=1e-4) - - # Test stacking. - cells = [MinimalRNNCell(8), - MinimalRNNCell(12), - MinimalRNNCell(32)] - layer = recurrent.RNN(cells) - y = layer(x) - model = keras.models.Model(x, y) + y = layer(x_seq) + model = keras.models.Model(x_seq, y) model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + model.train_on_batch(np.zeros((6, 5, input_size)), np.zeros((6, units))) - # Test stacked RNN serialization. 
- x_np = np.random.random((6, 5, 5)) - y_np = model.predict(x_np) - weights = model.get_weights() - config = layer.get_config() - with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}): - layer = recurrent.RNN.from_config(config) - y = layer(x) - model = keras.models.Model(x, y) - model.set_weights(weights) - y_np_2 = model.predict(x_np) - assert_allclose(y_np, y_np_2, atol=1e-4) +def test_functional_rnn_cell_with_constants(): + layers = keras.layers -def test_stacked_rnn_attributes(): - cells = [recurrent.LSTMCell(3), - recurrent.LSTMCell(3, kernel_regularizer='l2')] - layer = recurrent.RNN(cells) - layer.build((None, None, 5)) + # Create the cell: + units = 8 + input_size = 5 + constant_shape = (10,) + x = Input((input_size,)) + h_tm1 = Input((units,)) + c = Input(constant_shape) + h_ = layers.add([ + layers.Dense(units)(x), + layers.Dense(units)(h_tm1), + layers.Dense(units)(c) + ]) + h = layers.Activation('tanh')(h_) - # Test regularization losses - assert len(layer.losses) == 1 - - # Test weights - assert len(layer.trainable_weights) == 6 - cells[0].trainable = False - assert len(layer.trainable_weights) == 3 - assert len(layer.non_trainable_weights) == 3 - - # Test `get_losses_for` - x = keras.Input((None, 5)) - y = K.sum(x) - cells[0].add_loss(y, inputs=x) - assert layer.get_losses_for(x) == [y] - - -@rnn_test -def test_batch_size_equal_one(layer_class): - inputs = Input(batch_shape=(1, timesteps, embedding_dim)) - layer = layer_class(units) - outputs = layer(inputs) - model = Model(inputs, outputs) - model.compile('sgd', 'mse') - x = np.random.random((1, timesteps, embedding_dim)) - y = np.random.random((1, units)) - model.train_on_batch(x, y) - - -def test_rnn_cell_with_constants_layer(): - - class RNNCellWithConstants(keras.layers.Layer): - - def __init__(self, units, **kwargs): - self.units = units - self.state_size = units - super(RNNCellWithConstants, self).__init__(**kwargs) - - def build(self, input_shape): - if not isinstance(input_shape, list): - raise TypeError('expects constants shape') - [input_shape, constant_shape] = input_shape - # will (and should) raise if more than one constant passed - - self.input_kernel = self.add_weight( - shape=(input_shape[-1], self.units), - initializer='uniform', - name='kernel') - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units), - initializer='uniform', - name='recurrent_kernel') - self.constant_kernel = self.add_weight( - shape=(constant_shape[-1], self.units), - initializer='uniform', - name='constant_kernel') - self.built = True - - def call(self, inputs, states, constants): - [prev_output] = states - [constant] = constants - h_input = keras.backend.dot(inputs, self.input_kernel) - h_state = keras.backend.dot(prev_output, self.recurrent_kernel) - h_const = keras.backend.dot(constant, self.constant_kernel) - output = h_input + h_state + h_const - return output, [output] - - def get_config(self): - config = {'units': self.units} - base_config = super(RNNCellWithConstants, self).get_config() - return dict(list(base_config.items()) + list(config.items())) + cell = recurrent.FunctionalRNNCell( + inputs=x, outputs=h, input_states=h_tm1, output_states=h, constants=c) # Test basic case. 
- x = keras.Input((None, 5)) - c = keras.Input((3,)) - cell = RNNCellWithConstants(32) + x_seq = Input((None, input_size)) layer = recurrent.RNN(cell) - y = layer(x, constants=c) - model = keras.models.Model([x, c], y) + y = layer(x_seq, constants=c) + model = keras.models.Model([x_seq, c], y) model.compile(optimizer='rmsprop', loss='mse') model.train_on_batch( - [np.zeros((6, 5, 5)), np.zeros((6, 3))], - np.zeros((6, 32)) + [np.zeros((6, 5, input_size)), np.zeros((6, constant_shape[0]))], + np.zeros((6, units)) ) - # Test basic case serialization. - x_np = np.random.random((6, 5, 5)) - c_np = np.random.random((6, 3)) - y_np = model.predict([x_np, c_np]) - weights = model.get_weights() - config = layer.get_config() - with keras.utils.CustomObjectScope( - {'RNNCellWithConstants': RNNCellWithConstants}): - layer = recurrent.RNN.from_config(config) - y = layer(x, constants=c) - model = keras.models.Model([x, c], y) - model.set_weights(weights) - y_np_2 = model.predict([x_np, c_np]) - assert_allclose(y_np, y_np_2, atol=1e-4) - if __name__ == '__main__': pytest.main([__file__]) From 53deca2c9be2e8ca8d723780b978e99c68611132 Mon Sep 17 00:00:00 2001 From: andhus Date: Mon, 25 Sep 2017 01:58:49 +0200 Subject: [PATCH 03/13] put back accidentally commented out recurrent tests --- tests/keras/layers/recurrent_test.py | 1236 +++++++++++++------------- 1 file changed, 618 insertions(+), 618 deletions(-) diff --git a/tests/keras/layers/recurrent_test.py b/tests/keras/layers/recurrent_test.py index 47992eca5af9..ddb00abcb0bc 100644 --- a/tests/keras/layers/recurrent_test.py +++ b/tests/keras/layers/recurrent_test.py @@ -16,624 +16,624 @@ num_samples, timesteps, embedding_dim, units = 2, 5, 4, 3 embedding_num = 12 -# -# -# @keras_test -# def rnn_test(f): -# """ -# All the recurrent layers share the same interface, -# so we can run through them with a single function. 
-# """ -# f = keras_test(f) -# return pytest.mark.parametrize('layer_class', [ -# recurrent.SimpleRNN, -# recurrent.GRU, -# recurrent.LSTM -# ])(f) -# -# -# @rnn_test -# def test_return_sequences(layer_class): -# layer_test(layer_class, -# kwargs={'units': units, -# 'return_sequences': True}, -# input_shape=(num_samples, timesteps, embedding_dim)) -# -# -# @rnn_test -# def test_dynamic_behavior(layer_class): -# layer = layer_class(units, input_shape=(None, embedding_dim)) -# model = Sequential() -# model.add(layer) -# model.compile('sgd', 'mse') -# x = np.random.random((num_samples, timesteps, embedding_dim)) -# y = np.random.random((num_samples, units)) -# model.train_on_batch(x, y) -# -# -# @rnn_test -# def test_stateful_invalid_use(layer_class): -# layer = layer_class(units, -# stateful=True, -# batch_input_shape=(num_samples, -# timesteps, -# embedding_dim)) -# model = Sequential() -# model.add(layer) -# model.compile('sgd', 'mse') -# x = np.random.random((num_samples * 2, timesteps, embedding_dim)) -# y = np.random.random((num_samples * 2, units)) -# with pytest.raises(ValueError): -# model.fit(x, y) -# with pytest.raises(ValueError): -# model.predict(x, batch_size=num_samples + 1) -# -# -# @rnn_test -# @pytest.mark.skipif((K.backend() == 'cntk'), -# reason='Not yet supported.') -# def test_dropout(layer_class): -# for unroll in [True, False]: -# layer_test(layer_class, -# kwargs={'units': units, -# 'dropout': 0.1, -# 'recurrent_dropout': 0.1, -# 'unroll': unroll}, -# input_shape=(num_samples, timesteps, embedding_dim)) -# -# # Test that dropout is applied during training -# x = K.ones((num_samples, timesteps, embedding_dim)) -# layer = layer_class(units, dropout=0.5, recurrent_dropout=0.5, -# input_shape=(timesteps, embedding_dim)) -# y = layer(x) -# assert y._uses_learning_phase -# -# y = layer(x, training=True) -# assert not getattr(y, '_uses_learning_phase') -# -# # Test that dropout is not applied during testing -# x = np.random.random((num_samples, timesteps, embedding_dim)) -# layer = layer_class(units, dropout=0.5, recurrent_dropout=0.5, -# unroll=unroll, -# input_shape=(timesteps, embedding_dim)) -# model = Sequential([layer]) -# assert model.uses_learning_phase -# y1 = model.predict(x) -# y2 = model.predict(x) -# assert_allclose(y1, y2) -# -# -# @rnn_test -# def test_statefulness(layer_class): -# model = Sequential() -# model.add(embeddings.Embedding(embedding_num, embedding_dim, -# mask_zero=True, -# input_length=timesteps, -# batch_input_shape=(num_samples, timesteps))) -# layer = layer_class(units, return_sequences=False, -# stateful=True, -# weights=None) -# model.add(layer) -# model.compile(optimizer='sgd', loss='mse') -# out1 = model.predict(np.ones((num_samples, timesteps))) -# assert(out1.shape == (num_samples, units)) -# -# # train once so that the states change -# model.train_on_batch(np.ones((num_samples, timesteps)), -# np.ones((num_samples, units))) -# out2 = model.predict(np.ones((num_samples, timesteps))) -# -# # if the state is not reset, output should be different -# assert(out1.max() != out2.max()) -# -# # check that output changes after states are reset -# # (even though the model itself didn't change) -# layer.reset_states() -# out3 = model.predict(np.ones((num_samples, timesteps))) -# assert(out2.max() != out3.max()) -# -# # check that container-level reset_states() works -# model.reset_states() -# out4 = model.predict(np.ones((num_samples, timesteps))) -# assert_allclose(out3, out4, atol=1e-5) -# -# # check that the call to `predict` updated the states 
-# out5 = model.predict(np.ones((num_samples, timesteps))) -# assert(out4.max() != out5.max()) -# -# -# @rnn_test -# def test_masking_correctness(layer_class): -# # Check masking: output with left padding and right padding -# # should be the same. -# model = Sequential() -# model.add(embeddings.Embedding(embedding_num, embedding_dim, -# mask_zero=True, -# input_length=timesteps, -# batch_input_shape=(num_samples, timesteps))) -# layer = layer_class(units, return_sequences=False) -# model.add(layer) -# model.compile(optimizer='sgd', loss='mse') -# -# left_padded_input = np.ones((num_samples, timesteps)) -# left_padded_input[0, :1] = 0 -# left_padded_input[1, :2] = 0 -# out6 = model.predict(left_padded_input) -# -# right_padded_input = np.ones((num_samples, timesteps)) -# right_padded_input[0, -1:] = 0 -# right_padded_input[1, -2:] = 0 -# out7 = model.predict(right_padded_input) -# -# assert_allclose(out7, out6, atol=1e-5) -# -# -# @rnn_test -# def test_implementation_mode(layer_class): -# for mode in [1, 2]: -# # Without dropout -# layer_test(layer_class, -# kwargs={'units': units, -# 'implementation': mode}, -# input_shape=(num_samples, timesteps, embedding_dim)) -# # With dropout -# layer_test(layer_class, -# kwargs={'units': units, -# 'implementation': mode, -# 'dropout': 0.1, -# 'recurrent_dropout': 0.1}, -# input_shape=(num_samples, timesteps, embedding_dim)) -# -# -# @rnn_test -# def test_regularizer(layer_class): -# layer = layer_class(units, return_sequences=False, weights=None, -# input_shape=(timesteps, embedding_dim), -# kernel_regularizer=regularizers.l1(0.01), -# recurrent_regularizer=regularizers.l1(0.01), -# bias_regularizer='l2') -# layer.build((None, None, embedding_dim)) -# assert len(layer.losses) == 3 -# assert len(layer.cell.losses) == 3 -# -# layer = layer_class(units, return_sequences=False, weights=None, -# input_shape=(timesteps, embedding_dim), -# activity_regularizer='l2') -# assert layer.activity_regularizer -# x = K.variable(np.ones((num_samples, timesteps, embedding_dim))) -# layer(x) -# assert len(layer.cell.get_losses_for(x)) == 0 -# assert len(layer.get_losses_for(x)) == 1 -# -# -# @keras_test -# def test_masking_layer(): -# ''' This test based on a previously failing issue here: -# https://github.com/fchollet/keras/issues/1567 -# ''' -# inputs = np.random.random((6, 3, 4)) -# targets = np.abs(np.random.random((6, 3, 5))) -# targets /= targets.sum(axis=-1, keepdims=True) -# -# model = Sequential() -# model.add(Masking(input_shape=(3, 4))) -# model.add(recurrent.SimpleRNN(units=5, return_sequences=True, unroll=False)) -# model.compile(loss='categorical_crossentropy', optimizer='adam') -# model.fit(inputs, targets, epochs=1, batch_size=100, verbose=1) -# -# model = Sequential() -# model.add(Masking(input_shape=(3, 4))) -# model.add(recurrent.SimpleRNN(units=5, return_sequences=True, unroll=True)) -# model.compile(loss='categorical_crossentropy', optimizer='adam') -# model.fit(inputs, targets, epochs=1, batch_size=100, verbose=1) -# -# -# @rnn_test -# def test_from_config(layer_class): -# stateful_flags = (False, True) -# for stateful in stateful_flags: -# l1 = layer_class(units=1, stateful=stateful) -# l2 = layer_class.from_config(l1.get_config()) -# assert l1.get_config() == l2.get_config() -# -# -# @rnn_test -# def test_specify_initial_state_keras_tensor(layer_class): -# num_states = 2 if layer_class is recurrent.LSTM else 1 -# -# # Test with Keras tensor -# inputs = Input((timesteps, embedding_dim)) -# initial_state = [Input((units,)) for 
_ in range(num_states)] -# layer = layer_class(units) -# if len(initial_state) == 1: -# output = layer(inputs, initial_state=initial_state[0]) -# else: -# output = layer(inputs, initial_state=initial_state) -# assert initial_state[0] in layer.inbound_nodes[0].input_tensors -# -# model = Model([inputs] + initial_state, output) -# model.compile(loss='categorical_crossentropy', optimizer='adam') -# -# inputs = np.random.random((num_samples, timesteps, embedding_dim)) -# initial_state = [np.random.random((num_samples, units)) -# for _ in range(num_states)] -# targets = np.random.random((num_samples, units)) -# model.fit([inputs] + initial_state, targets) -# -# -# @rnn_test -# def test_specify_initial_state_non_keras_tensor(layer_class): -# num_states = 2 if layer_class is recurrent.LSTM else 1 -# -# # Test with non-Keras tensor -# inputs = Input((timesteps, embedding_dim)) -# initial_state = [K.random_normal_variable((num_samples, units), 0, 1) -# for _ in range(num_states)] -# layer = layer_class(units) -# output = layer(inputs, initial_state=initial_state) -# -# model = Model(inputs, output) -# model.compile(loss='categorical_crossentropy', optimizer='adam') -# -# inputs = np.random.random((num_samples, timesteps, embedding_dim)) -# targets = np.random.random((num_samples, units)) -# model.fit(inputs, targets) -# -# -# @rnn_test -# def test_reset_states_with_values(layer_class): -# num_states = 2 if layer_class is recurrent.LSTM else 1 -# -# layer = layer_class(units, stateful=True) -# layer.build((num_samples, timesteps, embedding_dim)) -# layer.reset_states() -# assert len(layer.states) == num_states -# assert layer.states[0] is not None -# np.testing.assert_allclose(K.eval(layer.states[0]), -# np.zeros(K.int_shape(layer.states[0])), -# atol=1e-4) -# state_shapes = [K.int_shape(state) for state in layer.states] -# values = [np.ones(shape) for shape in state_shapes] -# if len(values) == 1: -# values = values[0] -# layer.reset_states(values) -# np.testing.assert_allclose(K.eval(layer.states[0]), -# np.ones(K.int_shape(layer.states[0])), -# atol=1e-4) -# -# # Test fit with invalid data -# with pytest.raises(ValueError): -# layer.reset_states([1] * (len(layer.states) + 1)) -# -# -# @rnn_test -# def test_initial_states_as_other_inputs(layer_class): -# num_states = 2 if layer_class is recurrent.LSTM else 1 -# -# # Test with Keras tensor -# main_inputs = Input((timesteps, embedding_dim)) -# initial_state = [Input((units,)) for _ in range(num_states)] -# inputs = [main_inputs] + initial_state -# -# layer = layer_class(units) -# output = layer(inputs) -# assert initial_state[0] in layer.inbound_nodes[0].input_tensors -# -# model = Model(inputs, output) -# model.compile(loss='categorical_crossentropy', optimizer='adam') -# -# main_inputs = np.random.random((num_samples, timesteps, embedding_dim)) -# initial_state = [np.random.random((num_samples, units)) -# for _ in range(num_states)] -# targets = np.random.random((num_samples, units)) -# model.train_on_batch([main_inputs] + initial_state, targets) -# -# -# @rnn_test -# def test_specify_state_with_masking(layer_class): -# ''' This test based on a previously failing issue here: -# https://github.com/fchollet/keras/issues/1567 -# ''' -# num_states = 2 if layer_class is recurrent.LSTM else 1 -# -# inputs = Input((timesteps, embedding_dim)) -# _ = Masking()(inputs) -# initial_state = [Input((units,)) for _ in range(num_states)] -# output = layer_class(units)(inputs, initial_state=initial_state) -# -# model = Model([inputs] + 
initial_state, output) -# model.compile(loss='categorical_crossentropy', optimizer='adam') -# -# inputs = np.random.random((num_samples, timesteps, embedding_dim)) -# initial_state = [np.random.random((num_samples, units)) -# for _ in range(num_states)] -# targets = np.random.random((num_samples, units)) -# model.fit([inputs] + initial_state, targets) -# -# -# @rnn_test -# def test_return_state(layer_class): -# num_states = 2 if layer_class is recurrent.LSTM else 1 -# -# inputs = Input(batch_shape=(num_samples, timesteps, embedding_dim)) -# layer = layer_class(units, return_state=True, stateful=True) -# outputs = layer(inputs) -# output, state = outputs[0], outputs[1:] -# assert len(state) == num_states -# model = Model(inputs, state[0]) -# -# inputs = np.random.random((num_samples, timesteps, embedding_dim)) -# state = model.predict(inputs) -# np.testing.assert_allclose(K.eval(layer.states[0]), state, atol=1e-4) -# -# -# @rnn_test -# def test_state_reuse(layer_class): -# inputs = Input(batch_shape=(num_samples, timesteps, embedding_dim)) -# layer = layer_class(units, return_state=True, return_sequences=True) -# outputs = layer(inputs) -# output, state = outputs[0], outputs[1:] -# output = layer_class(units)(output, initial_state=state) -# model = Model(inputs, output) -# -# inputs = np.random.random((num_samples, timesteps, embedding_dim)) -# outputs = model.predict(inputs) -# -# -# def test_minimal_rnn_cell_non_layer(): -# -# class MinimalRNNCell(object): -# -# def __init__(self, units, input_dim): -# self.units = units -# self.state_size = units -# self.kernel = keras.backend.variable( -# np.random.random((input_dim, units))) -# -# def call(self, inputs, states): -# prev_output = states[0] -# output = keras.backend.dot(inputs, self.kernel) + prev_output -# return output, [output] -# -# # Basic test case. -# cell = MinimalRNNCell(32, 5) -# x = keras.Input((None, 5)) -# layer = recurrent.RNN(cell) -# y = layer(x) -# model = keras.models.Model(x, y) -# model.compile(optimizer='rmsprop', loss='mse') -# model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) -# -# # Test stacking. -# cells = [MinimalRNNCell(8, 5), -# MinimalRNNCell(32, 8), -# MinimalRNNCell(32, 32)] -# layer = recurrent.RNN(cells) -# y = layer(x) -# model = keras.models.Model(x, y) -# model.compile(optimizer='rmsprop', loss='mse') -# model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) -# -# -# def test_minimal_rnn_cell_non_layer_multiple_states(): -# -# class MinimalRNNCell(object): -# -# def __init__(self, units, input_dim): -# self.units = units -# self.state_size = (units, units) -# self.kernel = keras.backend.variable( -# np.random.random((input_dim, units))) -# -# def call(self, inputs, states): -# prev_output_1 = states[0] -# prev_output_2 = states[1] -# output = keras.backend.dot(inputs, self.kernel) -# output += prev_output_1 -# output -= prev_output_2 -# return output, [output * 2, output * 3] -# -# # Basic test case. -# cell = MinimalRNNCell(32, 5) -# x = keras.Input((None, 5)) -# layer = recurrent.RNN(cell) -# y = layer(x) -# model = keras.models.Model(x, y) -# model.compile(optimizer='rmsprop', loss='mse') -# model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) -# -# # Test stacking. 
-# cells = [MinimalRNNCell(8, 5), -# MinimalRNNCell(16, 8), -# MinimalRNNCell(32, 16)] -# layer = recurrent.RNN(cells) -# assert layer.cell.state_size == (32, 32, 16, 16, 8, 8) -# y = layer(x) -# model = keras.models.Model(x, y) -# model.compile(optimizer='rmsprop', loss='mse') -# model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) -# -# -# def test_minimal_rnn_cell_layer(): -# -# class MinimalRNNCell(keras.layers.Layer): -# -# def __init__(self, units, **kwargs): -# self.units = units -# self.state_size = units -# super(MinimalRNNCell, self).__init__(**kwargs) -# -# def build(self, input_shape): -# self.kernel = self.add_weight(shape=(input_shape[-1], self.units), -# initializer='uniform', -# name='kernel') -# self.recurrent_kernel = self.add_weight( -# shape=(self.units, self.units), -# initializer='uniform', -# name='recurrent_kernel') -# self.built = True -# -# def call(self, inputs, states): -# prev_output = states[0] -# h = keras.backend.dot(inputs, self.kernel) -# output = h + keras.backend.dot(prev_output, self.recurrent_kernel) -# return output, [output] -# -# def get_config(self): -# config = {'units': self.units} -# base_config = super(MinimalRNNCell, self).get_config() -# return dict(list(base_config.items()) + list(config.items())) -# -# # Test basic case. -# x = keras.Input((None, 5)) -# cell = MinimalRNNCell(32) -# layer = recurrent.RNN(cell) -# y = layer(x) -# model = keras.models.Model(x, y) -# model.compile(optimizer='rmsprop', loss='mse') -# model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) -# -# # Test basic case serialization. -# x_np = np.random.random((6, 5, 5)) -# y_np = model.predict(x_np) -# weights = model.get_weights() -# config = layer.get_config() -# with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}): -# layer = recurrent.RNN.from_config(config) -# y = layer(x) -# model = keras.models.Model(x, y) -# model.set_weights(weights) -# y_np_2 = model.predict(x_np) -# assert_allclose(y_np, y_np_2, atol=1e-4) -# -# # Test stacking. -# cells = [MinimalRNNCell(8), -# MinimalRNNCell(12), -# MinimalRNNCell(32)] -# layer = recurrent.RNN(cells) -# y = layer(x) -# model = keras.models.Model(x, y) -# model.compile(optimizer='rmsprop', loss='mse') -# model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) -# -# # Test stacked RNN serialization. 
-# x_np = np.random.random((6, 5, 5)) -# y_np = model.predict(x_np) -# weights = model.get_weights() -# config = layer.get_config() -# with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}): -# layer = recurrent.RNN.from_config(config) -# y = layer(x) -# model = keras.models.Model(x, y) -# model.set_weights(weights) -# y_np_2 = model.predict(x_np) -# assert_allclose(y_np, y_np_2, atol=1e-4) -# -# -# def test_stacked_rnn_attributes(): -# cells = [recurrent.LSTMCell(3), -# recurrent.LSTMCell(3, kernel_regularizer='l2')] -# layer = recurrent.RNN(cells) -# layer.build((None, None, 5)) -# -# # Test regularization losses -# assert len(layer.losses) == 1 -# -# # Test weights -# assert len(layer.trainable_weights) == 6 -# cells[0].trainable = False -# assert len(layer.trainable_weights) == 3 -# assert len(layer.non_trainable_weights) == 3 -# -# # Test `get_losses_for` -# x = keras.Input((None, 5)) -# y = K.sum(x) -# cells[0].add_loss(y, inputs=x) -# assert layer.get_losses_for(x) == [y] -# -# -# @rnn_test -# def test_batch_size_equal_one(layer_class): -# inputs = Input(batch_shape=(1, timesteps, embedding_dim)) -# layer = layer_class(units) -# outputs = layer(inputs) -# model = Model(inputs, outputs) -# model.compile('sgd', 'mse') -# x = np.random.random((1, timesteps, embedding_dim)) -# y = np.random.random((1, units)) -# model.train_on_batch(x, y) -# -# -# def test_rnn_cell_with_constants_layer(): -# -# class RNNCellWithConstants(keras.layers.Layer): -# -# def __init__(self, units, **kwargs): -# self.units = units -# self.state_size = units -# super(RNNCellWithConstants, self).__init__(**kwargs) -# -# def build(self, input_shape): -# if not isinstance(input_shape, list): -# raise TypeError('expects constants shape') -# [input_shape, constant_shape] = input_shape -# # will (and should) raise if more than one constant passed -# -# self.input_kernel = self.add_weight( -# shape=(input_shape[-1], self.units), -# initializer='uniform', -# name='kernel') -# self.recurrent_kernel = self.add_weight( -# shape=(self.units, self.units), -# initializer='uniform', -# name='recurrent_kernel') -# self.constant_kernel = self.add_weight( -# shape=(constant_shape[-1], self.units), -# initializer='uniform', -# name='constant_kernel') -# self.built = True -# -# def call(self, inputs, states, constants): -# [prev_output] = states -# [constant] = constants -# h_input = keras.backend.dot(inputs, self.input_kernel) -# h_state = keras.backend.dot(prev_output, self.recurrent_kernel) -# h_const = keras.backend.dot(constant, self.constant_kernel) -# output = h_input + h_state + h_const -# return output, [output] -# -# def get_config(self): -# config = {'units': self.units} -# base_config = super(RNNCellWithConstants, self).get_config() -# return dict(list(base_config.items()) + list(config.items())) -# -# # Test basic case. -# x = keras.Input((None, 5)) -# c = keras.Input((3,)) -# cell = RNNCellWithConstants(32) -# layer = recurrent.RNN(cell) -# y = layer(x, constants=c) -# model = keras.models.Model([x, c], y) -# model.compile(optimizer='rmsprop', loss='mse') -# model.train_on_batch( -# [np.zeros((6, 5, 5)), np.zeros((6, 3))], -# np.zeros((6, 32)) -# ) -# -# # Test basic case serialization. 
-# x_np = np.random.random((6, 5, 5)) -# c_np = np.random.random((6, 3)) -# y_np = model.predict([x_np, c_np]) -# weights = model.get_weights() -# config = layer.get_config() -# with keras.utils.CustomObjectScope( -# {'RNNCellWithConstants': RNNCellWithConstants}): -# layer = recurrent.RNN.from_config(config) -# y = layer(x, constants=c) -# model = keras.models.Model([x, c], y) -# model.set_weights(weights) -# y_np_2 = model.predict([x_np, c_np]) -# assert_allclose(y_np, y_np_2, atol=1e-4) + + +@keras_test +def rnn_test(f): + """ + All the recurrent layers share the same interface, + so we can run through them with a single function. + """ + f = keras_test(f) + return pytest.mark.parametrize('layer_class', [ + recurrent.SimpleRNN, + recurrent.GRU, + recurrent.LSTM + ])(f) + + +@rnn_test +def test_return_sequences(layer_class): + layer_test(layer_class, + kwargs={'units': units, + 'return_sequences': True}, + input_shape=(num_samples, timesteps, embedding_dim)) + + +@rnn_test +def test_dynamic_behavior(layer_class): + layer = layer_class(units, input_shape=(None, embedding_dim)) + model = Sequential() + model.add(layer) + model.compile('sgd', 'mse') + x = np.random.random((num_samples, timesteps, embedding_dim)) + y = np.random.random((num_samples, units)) + model.train_on_batch(x, y) + + +@rnn_test +def test_stateful_invalid_use(layer_class): + layer = layer_class(units, + stateful=True, + batch_input_shape=(num_samples, + timesteps, + embedding_dim)) + model = Sequential() + model.add(layer) + model.compile('sgd', 'mse') + x = np.random.random((num_samples * 2, timesteps, embedding_dim)) + y = np.random.random((num_samples * 2, units)) + with pytest.raises(ValueError): + model.fit(x, y) + with pytest.raises(ValueError): + model.predict(x, batch_size=num_samples + 1) + + +@rnn_test +@pytest.mark.skipif((K.backend() == 'cntk'), + reason='Not yet supported.') +def test_dropout(layer_class): + for unroll in [True, False]: + layer_test(layer_class, + kwargs={'units': units, + 'dropout': 0.1, + 'recurrent_dropout': 0.1, + 'unroll': unroll}, + input_shape=(num_samples, timesteps, embedding_dim)) + + # Test that dropout is applied during training + x = K.ones((num_samples, timesteps, embedding_dim)) + layer = layer_class(units, dropout=0.5, recurrent_dropout=0.5, + input_shape=(timesteps, embedding_dim)) + y = layer(x) + assert y._uses_learning_phase + + y = layer(x, training=True) + assert not getattr(y, '_uses_learning_phase') + + # Test that dropout is not applied during testing + x = np.random.random((num_samples, timesteps, embedding_dim)) + layer = layer_class(units, dropout=0.5, recurrent_dropout=0.5, + unroll=unroll, + input_shape=(timesteps, embedding_dim)) + model = Sequential([layer]) + assert model.uses_learning_phase + y1 = model.predict(x) + y2 = model.predict(x) + assert_allclose(y1, y2) + + +@rnn_test +def test_statefulness(layer_class): + model = Sequential() + model.add(embeddings.Embedding(embedding_num, embedding_dim, + mask_zero=True, + input_length=timesteps, + batch_input_shape=(num_samples, timesteps))) + layer = layer_class(units, return_sequences=False, + stateful=True, + weights=None) + model.add(layer) + model.compile(optimizer='sgd', loss='mse') + out1 = model.predict(np.ones((num_samples, timesteps))) + assert(out1.shape == (num_samples, units)) + + # train once so that the states change + model.train_on_batch(np.ones((num_samples, timesteps)), + np.ones((num_samples, units))) + out2 = model.predict(np.ones((num_samples, timesteps))) + + # if the state is not reset, 
output should be different + assert(out1.max() != out2.max()) + + # check that output changes after states are reset + # (even though the model itself didn't change) + layer.reset_states() + out3 = model.predict(np.ones((num_samples, timesteps))) + assert(out2.max() != out3.max()) + + # check that container-level reset_states() works + model.reset_states() + out4 = model.predict(np.ones((num_samples, timesteps))) + assert_allclose(out3, out4, atol=1e-5) + + # check that the call to `predict` updated the states + out5 = model.predict(np.ones((num_samples, timesteps))) + assert(out4.max() != out5.max()) + + +@rnn_test +def test_masking_correctness(layer_class): + # Check masking: output with left padding and right padding + # should be the same. + model = Sequential() + model.add(embeddings.Embedding(embedding_num, embedding_dim, + mask_zero=True, + input_length=timesteps, + batch_input_shape=(num_samples, timesteps))) + layer = layer_class(units, return_sequences=False) + model.add(layer) + model.compile(optimizer='sgd', loss='mse') + + left_padded_input = np.ones((num_samples, timesteps)) + left_padded_input[0, :1] = 0 + left_padded_input[1, :2] = 0 + out6 = model.predict(left_padded_input) + + right_padded_input = np.ones((num_samples, timesteps)) + right_padded_input[0, -1:] = 0 + right_padded_input[1, -2:] = 0 + out7 = model.predict(right_padded_input) + + assert_allclose(out7, out6, atol=1e-5) + + +@rnn_test +def test_implementation_mode(layer_class): + for mode in [1, 2]: + # Without dropout + layer_test(layer_class, + kwargs={'units': units, + 'implementation': mode}, + input_shape=(num_samples, timesteps, embedding_dim)) + # With dropout + layer_test(layer_class, + kwargs={'units': units, + 'implementation': mode, + 'dropout': 0.1, + 'recurrent_dropout': 0.1}, + input_shape=(num_samples, timesteps, embedding_dim)) + + +@rnn_test +def test_regularizer(layer_class): + layer = layer_class(units, return_sequences=False, weights=None, + input_shape=(timesteps, embedding_dim), + kernel_regularizer=regularizers.l1(0.01), + recurrent_regularizer=regularizers.l1(0.01), + bias_regularizer='l2') + layer.build((None, None, embedding_dim)) + assert len(layer.losses) == 3 + assert len(layer.cell.losses) == 3 + + layer = layer_class(units, return_sequences=False, weights=None, + input_shape=(timesteps, embedding_dim), + activity_regularizer='l2') + assert layer.activity_regularizer + x = K.variable(np.ones((num_samples, timesteps, embedding_dim))) + layer(x) + assert len(layer.cell.get_losses_for(x)) == 0 + assert len(layer.get_losses_for(x)) == 1 + + +@keras_test +def test_masking_layer(): + ''' This test based on a previously failing issue here: + https://github.com/fchollet/keras/issues/1567 + ''' + inputs = np.random.random((6, 3, 4)) + targets = np.abs(np.random.random((6, 3, 5))) + targets /= targets.sum(axis=-1, keepdims=True) + + model = Sequential() + model.add(Masking(input_shape=(3, 4))) + model.add(recurrent.SimpleRNN(units=5, return_sequences=True, unroll=False)) + model.compile(loss='categorical_crossentropy', optimizer='adam') + model.fit(inputs, targets, epochs=1, batch_size=100, verbose=1) + + model = Sequential() + model.add(Masking(input_shape=(3, 4))) + model.add(recurrent.SimpleRNN(units=5, return_sequences=True, unroll=True)) + model.compile(loss='categorical_crossentropy', optimizer='adam') + model.fit(inputs, targets, epochs=1, batch_size=100, verbose=1) + + +@rnn_test +def test_from_config(layer_class): + stateful_flags = (False, True) + for stateful in 
stateful_flags: + l1 = layer_class(units=1, stateful=stateful) + l2 = layer_class.from_config(l1.get_config()) + assert l1.get_config() == l2.get_config() + + +@rnn_test +def test_specify_initial_state_keras_tensor(layer_class): + num_states = 2 if layer_class is recurrent.LSTM else 1 + + # Test with Keras tensor + inputs = Input((timesteps, embedding_dim)) + initial_state = [Input((units,)) for _ in range(num_states)] + layer = layer_class(units) + if len(initial_state) == 1: + output = layer(inputs, initial_state=initial_state[0]) + else: + output = layer(inputs, initial_state=initial_state) + assert initial_state[0] in layer.inbound_nodes[0].input_tensors + + model = Model([inputs] + initial_state, output) + model.compile(loss='categorical_crossentropy', optimizer='adam') + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + initial_state = [np.random.random((num_samples, units)) + for _ in range(num_states)] + targets = np.random.random((num_samples, units)) + model.fit([inputs] + initial_state, targets) + + +@rnn_test +def test_specify_initial_state_non_keras_tensor(layer_class): + num_states = 2 if layer_class is recurrent.LSTM else 1 + + # Test with non-Keras tensor + inputs = Input((timesteps, embedding_dim)) + initial_state = [K.random_normal_variable((num_samples, units), 0, 1) + for _ in range(num_states)] + layer = layer_class(units) + output = layer(inputs, initial_state=initial_state) + + model = Model(inputs, output) + model.compile(loss='categorical_crossentropy', optimizer='adam') + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + targets = np.random.random((num_samples, units)) + model.fit(inputs, targets) + + +@rnn_test +def test_reset_states_with_values(layer_class): + num_states = 2 if layer_class is recurrent.LSTM else 1 + + layer = layer_class(units, stateful=True) + layer.build((num_samples, timesteps, embedding_dim)) + layer.reset_states() + assert len(layer.states) == num_states + assert layer.states[0] is not None + np.testing.assert_allclose(K.eval(layer.states[0]), + np.zeros(K.int_shape(layer.states[0])), + atol=1e-4) + state_shapes = [K.int_shape(state) for state in layer.states] + values = [np.ones(shape) for shape in state_shapes] + if len(values) == 1: + values = values[0] + layer.reset_states(values) + np.testing.assert_allclose(K.eval(layer.states[0]), + np.ones(K.int_shape(layer.states[0])), + atol=1e-4) + + # Test fit with invalid data + with pytest.raises(ValueError): + layer.reset_states([1] * (len(layer.states) + 1)) + + +@rnn_test +def test_initial_states_as_other_inputs(layer_class): + num_states = 2 if layer_class is recurrent.LSTM else 1 + + # Test with Keras tensor + main_inputs = Input((timesteps, embedding_dim)) + initial_state = [Input((units,)) for _ in range(num_states)] + inputs = [main_inputs] + initial_state + + layer = layer_class(units) + output = layer(inputs) + assert initial_state[0] in layer.inbound_nodes[0].input_tensors + + model = Model(inputs, output) + model.compile(loss='categorical_crossentropy', optimizer='adam') + + main_inputs = np.random.random((num_samples, timesteps, embedding_dim)) + initial_state = [np.random.random((num_samples, units)) + for _ in range(num_states)] + targets = np.random.random((num_samples, units)) + model.train_on_batch([main_inputs] + initial_state, targets) + + +@rnn_test +def test_specify_state_with_masking(layer_class): + ''' This test based on a previously failing issue here: + https://github.com/fchollet/keras/issues/1567 + ''' + 
num_states = 2 if layer_class is recurrent.LSTM else 1 + + inputs = Input((timesteps, embedding_dim)) + _ = Masking()(inputs) + initial_state = [Input((units,)) for _ in range(num_states)] + output = layer_class(units)(inputs, initial_state=initial_state) + + model = Model([inputs] + initial_state, output) + model.compile(loss='categorical_crossentropy', optimizer='adam') + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + initial_state = [np.random.random((num_samples, units)) + for _ in range(num_states)] + targets = np.random.random((num_samples, units)) + model.fit([inputs] + initial_state, targets) + + +@rnn_test +def test_return_state(layer_class): + num_states = 2 if layer_class is recurrent.LSTM else 1 + + inputs = Input(batch_shape=(num_samples, timesteps, embedding_dim)) + layer = layer_class(units, return_state=True, stateful=True) + outputs = layer(inputs) + output, state = outputs[0], outputs[1:] + assert len(state) == num_states + model = Model(inputs, state[0]) + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + state = model.predict(inputs) + np.testing.assert_allclose(K.eval(layer.states[0]), state, atol=1e-4) + + +@rnn_test +def test_state_reuse(layer_class): + inputs = Input(batch_shape=(num_samples, timesteps, embedding_dim)) + layer = layer_class(units, return_state=True, return_sequences=True) + outputs = layer(inputs) + output, state = outputs[0], outputs[1:] + output = layer_class(units)(output, initial_state=state) + model = Model(inputs, output) + + inputs = np.random.random((num_samples, timesteps, embedding_dim)) + outputs = model.predict(inputs) + + +def test_minimal_rnn_cell_non_layer(): + + class MinimalRNNCell(object): + + def __init__(self, units, input_dim): + self.units = units + self.state_size = units + self.kernel = keras.backend.variable( + np.random.random((input_dim, units))) + + def call(self, inputs, states): + prev_output = states[0] + output = keras.backend.dot(inputs, self.kernel) + prev_output + return output, [output] + + # Basic test case. + cell = MinimalRNNCell(32, 5) + x = keras.Input((None, 5)) + layer = recurrent.RNN(cell) + y = layer(x) + model = keras.models.Model(x, y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + # Test stacking. + cells = [MinimalRNNCell(8, 5), + MinimalRNNCell(32, 8), + MinimalRNNCell(32, 32)] + layer = recurrent.RNN(cells) + y = layer(x) + model = keras.models.Model(x, y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + +def test_minimal_rnn_cell_non_layer_multiple_states(): + + class MinimalRNNCell(object): + + def __init__(self, units, input_dim): + self.units = units + self.state_size = (units, units) + self.kernel = keras.backend.variable( + np.random.random((input_dim, units))) + + def call(self, inputs, states): + prev_output_1 = states[0] + prev_output_2 = states[1] + output = keras.backend.dot(inputs, self.kernel) + output += prev_output_1 + output -= prev_output_2 + return output, [output * 2, output * 3] + + # Basic test case. + cell = MinimalRNNCell(32, 5) + x = keras.Input((None, 5)) + layer = recurrent.RNN(cell) + y = layer(x) + model = keras.models.Model(x, y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + # Test stacking. 
+ cells = [MinimalRNNCell(8, 5), + MinimalRNNCell(16, 8), + MinimalRNNCell(32, 16)] + layer = recurrent.RNN(cells) + assert layer.cell.state_size == (32, 32, 16, 16, 8, 8) + y = layer(x) + model = keras.models.Model(x, y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + +def test_minimal_rnn_cell_layer(): + + class MinimalRNNCell(keras.layers.Layer): + + def __init__(self, units, **kwargs): + self.units = units + self.state_size = units + super(MinimalRNNCell, self).__init__(**kwargs) + + def build(self, input_shape): + self.kernel = self.add_weight(shape=(input_shape[-1], self.units), + initializer='uniform', + name='kernel') + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units), + initializer='uniform', + name='recurrent_kernel') + self.built = True + + def call(self, inputs, states): + prev_output = states[0] + h = keras.backend.dot(inputs, self.kernel) + output = h + keras.backend.dot(prev_output, self.recurrent_kernel) + return output, [output] + + def get_config(self): + config = {'units': self.units} + base_config = super(MinimalRNNCell, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + # Test basic case. + x = keras.Input((None, 5)) + cell = MinimalRNNCell(32) + layer = recurrent.RNN(cell) + y = layer(x) + model = keras.models.Model(x, y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + # Test basic case serialization. + x_np = np.random.random((6, 5, 5)) + y_np = model.predict(x_np) + weights = model.get_weights() + config = layer.get_config() + with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}): + layer = recurrent.RNN.from_config(config) + y = layer(x) + model = keras.models.Model(x, y) + model.set_weights(weights) + y_np_2 = model.predict(x_np) + assert_allclose(y_np, y_np_2, atol=1e-4) + + # Test stacking. + cells = [MinimalRNNCell(8), + MinimalRNNCell(12), + MinimalRNNCell(32)] + layer = recurrent.RNN(cells) + y = layer(x) + model = keras.models.Model(x, y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch(np.zeros((6, 5, 5)), np.zeros((6, 32))) + + # Test stacked RNN serialization. 
+ x_np = np.random.random((6, 5, 5)) + y_np = model.predict(x_np) + weights = model.get_weights() + config = layer.get_config() + with keras.utils.CustomObjectScope({'MinimalRNNCell': MinimalRNNCell}): + layer = recurrent.RNN.from_config(config) + y = layer(x) + model = keras.models.Model(x, y) + model.set_weights(weights) + y_np_2 = model.predict(x_np) + assert_allclose(y_np, y_np_2, atol=1e-4) + + +def test_stacked_rnn_attributes(): + cells = [recurrent.LSTMCell(3), + recurrent.LSTMCell(3, kernel_regularizer='l2')] + layer = recurrent.RNN(cells) + layer.build((None, None, 5)) + + # Test regularization losses + assert len(layer.losses) == 1 + + # Test weights + assert len(layer.trainable_weights) == 6 + cells[0].trainable = False + assert len(layer.trainable_weights) == 3 + assert len(layer.non_trainable_weights) == 3 + + # Test `get_losses_for` + x = keras.Input((None, 5)) + y = K.sum(x) + cells[0].add_loss(y, inputs=x) + assert layer.get_losses_for(x) == [y] + + +@rnn_test +def test_batch_size_equal_one(layer_class): + inputs = Input(batch_shape=(1, timesteps, embedding_dim)) + layer = layer_class(units) + outputs = layer(inputs) + model = Model(inputs, outputs) + model.compile('sgd', 'mse') + x = np.random.random((1, timesteps, embedding_dim)) + y = np.random.random((1, units)) + model.train_on_batch(x, y) + + +def test_rnn_cell_with_constants_layer(): + + class RNNCellWithConstants(keras.layers.Layer): + + def __init__(self, units, **kwargs): + self.units = units + self.state_size = units + super(RNNCellWithConstants, self).__init__(**kwargs) + + def build(self, input_shape): + if not isinstance(input_shape, list): + raise TypeError('expects constants shape') + [input_shape, constant_shape] = input_shape + # will (and should) raise if more than one constant passed + + self.input_kernel = self.add_weight( + shape=(input_shape[-1], self.units), + initializer='uniform', + name='kernel') + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units), + initializer='uniform', + name='recurrent_kernel') + self.constant_kernel = self.add_weight( + shape=(constant_shape[-1], self.units), + initializer='uniform', + name='constant_kernel') + self.built = True + + def call(self, inputs, states, constants): + [prev_output] = states + [constant] = constants + h_input = keras.backend.dot(inputs, self.input_kernel) + h_state = keras.backend.dot(prev_output, self.recurrent_kernel) + h_const = keras.backend.dot(constant, self.constant_kernel) + output = h_input + h_state + h_const + return output, [output] + + def get_config(self): + config = {'units': self.units} + base_config = super(RNNCellWithConstants, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + # Test basic case. + x = keras.Input((None, 5)) + c = keras.Input((3,)) + cell = RNNCellWithConstants(32) + layer = recurrent.RNN(cell) + y = layer(x, constants=c) + model = keras.models.Model([x, c], y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch( + [np.zeros((6, 5, 5)), np.zeros((6, 3))], + np.zeros((6, 32)) + ) + + # Test basic case serialization. 
+ x_np = np.random.random((6, 5, 5)) + c_np = np.random.random((6, 3)) + y_np = model.predict([x_np, c_np]) + weights = model.get_weights() + config = layer.get_config() + with keras.utils.CustomObjectScope( + {'RNNCellWithConstants': RNNCellWithConstants}): + layer = recurrent.RNN.from_config(config) + y = layer(x, constants=c) + model = keras.models.Model([x, c], y) + model.set_weights(weights) + y_np_2 = model.predict([x_np, c_np]) + assert_allclose(y_np, y_np_2, atol=1e-4) def test_functional_rnn_cell(): From 568fd2e9f8521d0f755bf1b4b6ef3b06d1675d23 Mon Sep 17 00:00:00 2001 From: andhus Date: Sat, 7 Oct 2017 07:42:31 +0200 Subject: [PATCH 04/13] added basic example of functional cell --- examples/functional_rnn_cell.py | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 examples/functional_rnn_cell.py diff --git a/examples/functional_rnn_cell.py b/examples/functional_rnn_cell.py new file mode 100644 index 000000000000..f3e67b2d104b --- /dev/null +++ b/examples/functional_rnn_cell.py @@ -0,0 +1,45 @@ +from __future__ import division, print_function + +import numpy as np + +from keras import Input +from keras.layers import add, Dense, Activation, FunctionalRNNCell, RNN, \ + concatenate, multiply, Model + +units = 32 +input_size = 5 +x = Input((input_size,)) +h_tm1 = Input((units,)) +h_ = add([Dense(units)(x), Dense(units, use_bias=False)(h_tm1)]) +h = Activation('tanh')(h_) + +# Create the cell: + +cell = FunctionalRNNCell( + inputs=x, outputs=h, input_states=h_tm1, output_states=h) + +x_sequence = Input((None, input_size)) +rnn = RNN(cell) +y = rnn(x_sequence) + +# Now we can modify the cell to make use of "external" constants: +constant_shape = (10,) +c = Input(constant_shape) +density = Dense(constant_shape[0], activation='softmax')( + concatenate([x, h_tm1])) +attention = multiply([density, c]) +h2_ = add([h, Dense(units)(attention)]) +h2 = Activation('tanh')(h2_) + +attention_cell = FunctionalRNNCell( + inputs=x, outputs=h2, input_states=h_tm1, output_states=h2, constants=c) + +attention_rnn = RNN(attention_cell) +y2 = attention_rnn(x_sequence, constants=c) +# Note that shape of c is same as in cell (no time dimension added) + +attention_model = Model([x_sequence, c], y2) + +x_sequence_arr = np.random.randn(3, 5, input_size) +c_arr = np.random.randn(3, constant_shape[0]) +y2_arr = attention_model.predict([x_sequence_arr, c_arr]) From 2f9f6f07d2652f773ae79d832205d6793accf111 Mon Sep 17 00:00:00 2001 From: andhus Date: Sun, 8 Oct 2017 18:43:58 +0200 Subject: [PATCH 05/13] new class AttentionRNN --- keras/layers/recurrent.py | 260 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 260 insertions(+) diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py index 54e59b7c2ead..73384df16554 100644 --- a/keras/layers/recurrent.py +++ b/keras/layers/recurrent.py @@ -844,6 +844,266 @@ def get_losses_for(self, inputs=None): return super(RNN, self).get_losses_for(inputs) +class AttentionRNN(RNN): + """Base class for attentive recurrent layers. + + # Arguments + cell: A RNN cell instance supporting attention. It should implement: + - a `call(input_at_t, states_at_t, attended)` method, returning + `(output_at_t, states_at_t_plus_1)`. It must accept the keyword + argument `attended` which refers to the input(s) (tensor or + list of tensors) that is attended to an will be presented as a + whole at each timestep. + - a `state_size` attribute. 
This can be a single integer
+            (single state) in which case it is the size of the recurrent
+            state (which should be the same as the size of the cell
+            output). This can also be a list/tuple of integers
+            (one size per state). In this case, the first entry
+            (`state_size[0]`) should be the same as the size of the cell
+            output.
+            If the RNN cell is a Keras layer, the input_shape passed to its
+            `build` method will be a list of the input shape of the regular
+            sequence input followed by the shape(s) of the attended.
+        **kwargs: See the docs of the base class `RNN`.
+
+    # Input shapes
+        3D tensor with shape `(batch_size, timesteps, input_dim)`,
+        (Optional) 2D tensors with shape `(batch_size, output_dim)` holding
+        the initial states.
+
+    # Attended shapes
+        ND tensor of the shape expected by the attentive cell.
+
+    # Examples
+
+    ```python
+
+        TODO: minimal example (using functional API?)
+    ```
+    """
+
+    def __init__(self, cell, **kwargs):
+        if isinstance(cell, (list, tuple)):
+            # Note: it is not obvious how one would want to propagate the
+            # attended for stacked cells; the user should stack them manually
+            # into a single cell
+            raise ValueError('AttentionRNN only supports a single cell')
+        super(AttentionRNN, self).__init__(cell=cell, **kwargs)
+        # we let the base class check that the cell has a `call` method
+        # before checking for the additional argument
+        if not has_arg(cell.call, 'attended'):
+            raise ValueError('`cell.call` does not take the keyword argument'
+                             ' attended')
+
+        self._n_attended = None  # set in __call__, needed in build to split
+                                 # input_shape
+        self.attended_spec = None
+
+    def build(self, input_shape):
+        attended_shapes = input_shape[-self._n_attended:]
+        input_shape = input_shape[0]
+        batch_size = input_shape[0] if self.stateful else None
+        input_dim = input_shape[-1]
+        self.input_spec[0] = InputSpec(shape=(batch_size, None, input_dim))
+
+        attended_specs = [InputSpec(shape=(batch_size,) + attended_shape[1:])
+                          for attended_shape in attended_shapes]
+        if len(attended_specs) > 1:
+            self.attended_spec = attended_specs
+        else:
+            self.attended_spec = attended_specs[0]
+
+        if self.stateful:
+            self.reset_states()
+
+        if isinstance(self.cell, Layer):
+            step_input_shape = (input_shape[0],) + input_shape[2:]
+            self.cell.build([step_input_shape] + attended_shapes)
+
+    def __call__(self, inputs, initial_state=None, attended=None, **kwargs):
+        # If there are multiple inputs, then they should be the main input,
+        # `initial_state` and `attended`
+        # TODO: what is meant by "e.g. when loading model from file" in the
+        # comment in the base class RNN? Can there be a problem if initial
+        # states are not passed to the AttentionRNN with respect to this?
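The `# Examples` section of the docstring above is still marked TODO. Purely as an illustration, and not part of this patch, the sketch below shows one way the interface described above could be exercised. The cell class `MinimalAttentionCell` is hypothetical; it mirrors the `RNNCellWithConstants` cell from the test suite, with the `constants` keyword replaced by the `attended` keyword that `AttentionRNN` requires.

```python
import numpy as np
import keras
from keras.layers import recurrent


class MinimalAttentionCell(keras.layers.Layer):
    # Hypothetical cell whose `call` accepts the `attended` keyword
    # required by AttentionRNN.

    def __init__(self, units, **kwargs):
        self.units = units
        self.state_size = units
        super(MinimalAttentionCell, self).__init__(**kwargs)

    def build(self, input_shape):
        # AttentionRNN.build passes [step_input_shape] + attended_shapes.
        [step_input_shape, attended_shape] = input_shape
        self.input_kernel = self.add_weight(
            shape=(step_input_shape[-1], self.units),
            initializer='uniform', name='input_kernel')
        self.recurrent_kernel = self.add_weight(
            shape=(self.units, self.units),
            initializer='uniform', name='recurrent_kernel')
        self.attended_kernel = self.add_weight(
            shape=(attended_shape[-1], self.units),
            initializer='uniform', name='attended_kernel')
        self.built = True

    def call(self, inputs, states, attended):
        # `attended` is a list of static (not time-varying) inputs that is
        # presented as a whole at every timestep.
        [prev_output] = states
        [attended_input] = attended
        output = (keras.backend.dot(inputs, self.input_kernel) +
                  keras.backend.dot(prev_output, self.recurrent_kernel) +
                  keras.backend.dot(attended_input, self.attended_kernel))
        return output, [output]


x = keras.Input((None, 5))   # input sequence
a = keras.Input((3,))        # attended input, no time dimension
layer = recurrent.AttentionRNN(MinimalAttentionCell(32))
y = layer(x, attended=a)

model = keras.models.Model([x, a], y)
model.compile(optimizer='rmsprop', loss='mse')
model.train_on_batch([np.zeros((6, 5, 5)), np.zeros((6, 3))],
                     np.zeros((6, 32)))
```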
+        inputs, initial_state, attended = self._normalize_args(
+            inputs, initial_state, attended)
+
+        if attended is None:
+            raise ValueError('the `attended` input must be passed to an'
+                             ' AttentionRNN')
+        # we need to know the number of attended inputs in build
+        self._n_attended = len(attended)
+
+        check_list = []
+        if initial_state:
+            check_list += initial_state
+        if attended:
+            check_list += attended
+        # at this point check_list cannot be empty
+        is_keras_tensor = hasattr(check_list[0], '_keras_history')
+        for tensor in check_list:
+            if hasattr(tensor, '_keras_history') != is_keras_tensor:
+                raise ValueError('The initial state and attended inputs of an'
+                                 ' RNN layer cannot be specified with a mix'
+                                 ' of Keras tensors and non-Keras tensors')
+
+        if is_keras_tensor:
+            # Compute the full input spec, including state and attended
+            input_spec = self.input_spec
+            state_spec = self.state_spec
+            if not isinstance(input_spec, list):
+                input_spec = [input_spec]
+            if not isinstance(state_spec, list):
+                state_spec = [state_spec]
+            self.input_spec = input_spec
+            inputs = [inputs]
+            if initial_state:
+                self.input_spec += state_spec
+                inputs += initial_state
+                kwargs['initial_state'] = initial_state
+            if attended:
+                # `self.attended_spec` is only set in `build`, so the specs
+                # are computed from the attended tensors directly here.
+                self.input_spec += [InputSpec(shape=K.int_shape(att))
+                                    for att in attended]
+                inputs += attended
+                kwargs['attended'] = attended
+
+            # Perform the call
+            output = Layer.__call__(self, inputs, **kwargs)
+
+            # Restore original input spec
+            self.input_spec = input_spec
+            return output
+        else:
+            kwargs['initial_state'] = initial_state
+            if attended is not None:
+                kwargs['attended'] = attended
+            return Layer.__call__(self, inputs, **kwargs)
+
+    def call(self,
+             inputs,
+             mask=None,
+             training=None,
+             initial_state=None,
+             attended=None):
+        # TODO: this method duplicates almost everything in RNN.call,
+        # is there a better solution?
+
+        # input shape: `(samples, time (padded with zeros), input_dim)`
+        # note that the .build() method of subclasses MUST define
+        # self.input_spec and self.state_spec with complete input shapes.
+        if isinstance(inputs, list):
+            inputs = inputs[0]
+        if initial_state is not None:
+            pass
+        elif self.stateful:
+            initial_state = self.states
+        else:
+            initial_state = self.get_initial_state(inputs)
+
+        if isinstance(mask, list):
+            mask = mask[0]
+
+        if len(initial_state) != len(self.states):
+            raise ValueError('Layer has ' + str(len(self.states)) +
+                             ' states but was passed ' +
+                             str(len(initial_state)) +
+                             ' initial states.')
+        input_shape = K.int_shape(inputs)
+        timesteps = input_shape[1]
+        if self.unroll and timesteps in [None, 1]:
+            raise ValueError('Cannot unroll a RNN if the '
+                             'time dimension is undefined or equal to 1. \n'
+                             '- If using a Sequential model, '
+                             'specify the time dimension by passing '
+                             'an `input_shape` or `batch_input_shape` '
+                             'argument to your first layer. If your '
+                             'first layer is an Embedding, you can '
+                             'also use the `input_length` argument.\n'
+                             '- If using the functional API, specify '
+                             'the time dimension by passing a `shape` '
+                             'or `batch_shape` argument to your Input layer.')
+
+        cell_kwargs = {'attended': attended}
+        if has_arg(self.cell.call, 'training'):
+            cell_kwargs['training'] = training
+
+        # NOTE: because the attended are passed into K.rnn implicitly
+        # (captured by the step function), the Theano backend cannot optimise
+        # the scan op, see the section
+        # "Explicitly passing inputs of the inner function to scan" in:
+        # http://deeplearning.net/software/theano/library/scan.html#lib-scan-shared-variables
+        # but on the other hand the weights (shared variables) of the cell
+        # transformation are not passed explicitly either.
+ step = functools.partial(self.cell.call, **cell_kwargs) + + last_output, outputs, states = K.rnn(step, + inputs, + initial_state, + go_backwards=self.go_backwards, + mask=mask, + unroll=self.unroll, + input_length=timesteps) + if self.stateful: + updates = [] + for i in range(len(states)): + updates.append((self.states[i], states[i])) + self.add_update(updates, inputs) + + if self.return_sequences: + output = outputs + else: + output = last_output + + # Properly set learning phase + if getattr(last_output, '_uses_learning_phase', False): + output._uses_learning_phase = True + + if self.return_state: + if not isinstance(states, (list, tuple)): + states = [states] + else: + states = list(states) + return [output] + states + else: + return output + + def _normalize_args(self, inputs, initial_state, attended): + """The inputs `initial_state` and `attended` can be passed to + AttentionRNN.__call__ either by separate arguments or as part of + `inputs`. In this case `inputs` is a list of tensors of which the first + one is the actual (sequence) input followed by initial states followed + by the attended. + + This method separates and normalizes the different groups of inputs. + + # Arguments + inputs: tensor of list/tuple of tensors + initial_state: tensor or list of tensors or None + attended: tensor or list of tensors or None + + # Returns + inputs: tensor + initial_state: list of tensors or None + attended: list of tensors or None + """ + if isinstance(inputs, (list, tuple)): + remaining_inputs = inputs[1:] + inputs = inputs[0] + if remaining_inputs and initial_state is None: + if isinstance(self.state_spec, list): + n_states = len(self.state_spec) + else: + n_states = 1 + initial_state = remaining_inputs[:n_states] + remaining_inputs = remaining_inputs[n_states:] + if remaining_inputs and attended is None: + attended = remaining_inputs + if len(remaining_inputs) > 0: + raise ValueError('too many inputs were passed') + + initial_state = _to_list_or_none(initial_state) + attended = _to_list_or_none(attended) + + return inputs, initial_state, attended + + class SimpleRNNCell(Layer): """Cell class for SimpleRNN. From 1b90731dd0b1bbab0fb6025d8664c447e358fded Mon Sep 17 00:00:00 2001 From: andhus Date: Sun, 8 Oct 2017 18:57:25 +0200 Subject: [PATCH 06/13] restored RNN layer --- keras/layers/recurrent.py | 1126 +++++++++++++++++-------------------- 1 file changed, 518 insertions(+), 608 deletions(-) diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py index 73384df16554..165bebaaeb3d 100644 --- a/keras/layers/recurrent.py +++ b/keras/layers/recurrent.py @@ -298,9 +298,7 @@ class RNN(Layer): # Arguments cell: A RNN cell instance. A RNN cell is a class that has: - a `call(input_at_t, states_at_t)` method, returning - `(output_at_t, states_at_t_plus_1)`. The call method of the - cell can also take the optional argument `constants`, see - section "Note on passing external constants" below. + `(output_at_t, states_at_t_plus_1)`. - a `state_size` attribute. This can be a single integer (single state) in which case it is the size of the recurrent state @@ -329,7 +327,8 @@ class RNN(Layer): although it tends to be more memory-intensive. Unrolling is only suitable for short sequences. input_dim: dimensionality of the input (integer). - This argument (or alternatively, the keyword argument `input_shape`) + This argument (or alternatively, + the keyword argument `input_shape`) is required when using this layer as the first layer in a model. 
input_length: Length of input sequences, to be specified when it is constant. @@ -391,55 +390,47 @@ class RNN(Layer): `states` should be a numpy array or list of numpy arrays representing the initial state of the RNN layer. - # Note on passing external constants to RNNs - You can pass "external" constants to the cell using the `constants` - keyword argument of RNN.__call__ (as well as RNN.call) method. This - requires that the `cell.call` method accepts the same keyword argument - `constants`. Such constants can be used to condition the cell - transformation on additional static inputs (not changing over time) - (a.k.a. an attention mechanism). - # Examples ```python - # First, let's define a RNN Cell, as a layer subclass. - - class MinimalRNNCell(keras.layers.Layer): - - def __init__(self, units, **kwargs): - self.units = units - self.state_size = units - super(MinimalRNNCell, self).__init__(**kwargs) - - def build(self, input_shape): - self.kernel = self.add_weight(shape=(input_shape[-1], self.units), - initializer='uniform', - name='kernel') - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units), - initializer='uniform', - name='recurrent_kernel') - self.built = True - - def call(self, inputs, states): - prev_output = states[0] - h = K.dot(inputs, self.kernel) - output = h + K.dot(prev_output, self.recurrent_kernel) - return output, [output] - - # Let's use this cell in a RNN layer: - - cell = MinimalRNNCell(32) - x = keras.Input((None, 5)) - layer = RNN(cell) - y = layer(x) - - # Here's how to use the cell to build a stacked RNN: - - cells = [MinimalRNNCell(32), MinimalRNNCell(64)] - x = keras.Input((None, 5)) - layer = RNN(cells) - y = layer(x) + # First, let's define a RNN Cell, as a layer subclass. + + class MinimalRNNCell(keras.layers.Layer): + + def __init__(self, units, **kwargs): + self.units = units + self.state_size = units + super(MinimalRNNCell, self).__init__(**kwargs) + + def build(self, input_shape): + self.kernel = self.add_weight(shape=(input_shape[-1], self.units), + initializer='uniform', + name='kernel') + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units), + initializer='uniform', + name='recurrent_kernel') + self.built = True + + def call(self, inputs, states): + prev_output = states[0] + h = K.dot(inputs, self.kernel) + output = h + K.dot(prev_output, self.recurrent_kernel) + return output, [output] + + # Let's use this cell in a RNN layer: + + cell = MinimalRNNCell(32) + x = keras.Input((None, 5)) + layer = RNN(cell) + y = layer(x) + + # Here's how to use the cell to build a stacked RNN: + + cells = [MinimalRNNCell(32), MinimalRNNCell(64)] + x = keras.Input((None, 5)) + layer = RNN(cells) + y = layer(x) ``` """ @@ -477,8 +468,6 @@ def __init__(self, cell, self.state_spec = InputSpec(shape=(None, self.cell.state_size)) self._states = None - self.external_constants_spec = None - @property def states(self): if self._states is None: @@ -524,14 +513,6 @@ def compute_mask(self, inputs, mask): return output_mask def build(self, input_shape): - # Note input_shape will be list of shapes of initial states and - # constants if these are passed in __call__. 
- if self.external_constants_spec is not None: - # input_shape must be list - constants_shape = input_shape[-len(self.external_constants_spec):] - else: - constants_shape = None - if isinstance(input_shape, list): input_shape = input_shape[0] @@ -544,10 +525,7 @@ def build(self, input_shape): if isinstance(self.cell, Layer): step_input_shape = (input_shape[0],) + input_shape[2:] - if constants_shape is not None: - self.cell.build([step_input_shape] + constants_shape) - else: - self.cell.build(step_input_shape) + self.cell.build(step_input_shape) def get_initial_state(self, inputs): # build an all-zero tensor of shape (samples, output_dim) @@ -560,58 +538,43 @@ def get_initial_state(self, inputs): else: return [K.tile(initial_state, [1, self.cell.state_size])] - def __call__(self, inputs, initial_state=None, constants=None, **kwargs): - # If there are multiple inputs, then they should be the main input, - # `initial_state` and (optionally) `constants` e.g. when loading model - # from file # TODO ask for clarification - inputs, initial_state, constants = self._normalize_args( - inputs, initial_state, constants) - - # we need to know length of constants in build - if constants: - self.external_constants_spec = [ - InputSpec(shape=K.int_shape(constant)) - for constant in constants - ] - - if initial_state is None and constants is None: + def __call__(self, inputs, initial_state=None, **kwargs): + # If there are multiple inputs, then + # they should be the main input and `initial_state` + # e.g. when loading model from file + if isinstance(inputs, (list, tuple)) and len(inputs) > 1 and initial_state is None: + initial_state = inputs[1:] + inputs = inputs[0] + + # If `initial_state` is specified, + # and if it a Keras tensor, + # then add it to the inputs and temporarily + # modify the input spec to include the state. + if initial_state is None: return super(RNN, self).__call__(inputs, **kwargs) - # If any of `initial_state` or `constants` are specified and are Keras - # tensors, then add them to the inputs and temporarily modify the - # input_spec to include them. 
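As a shape walk-through of `get_initial_state` above, here is an illustrative numpy version with arbitrary sizes (the actual layer uses backend ops such as `K.zeros_like`, `K.expand_dims` and `K.tile`):

```python
import numpy as np

inputs = np.random.randn(4, 7, 5)                  # (samples, timesteps, input_dim)
zeros = np.zeros_like(inputs)                      # all-zero, same shape as inputs
summed = zeros.sum(axis=(1, 2))                    # (samples,)
column = np.expand_dims(summed, axis=-1)           # (samples, 1)
state_size = 32
initial_state = np.tile(column, (1, state_size))   # (samples, state_size), all zeros
```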
+ if not isinstance(initial_state, (list, tuple)): + initial_state = [initial_state] - check_list = [] - if initial_state: - check_list += initial_state - if constants: - check_list += constants - # at this point check_list cannot be empty - is_keras_tensor = hasattr(check_list[0], '_keras_history') - for tensor in check_list: + is_keras_tensor = hasattr(initial_state[0], '_keras_history') + for tensor in initial_state: if hasattr(tensor, '_keras_history') != is_keras_tensor: - raise ValueError('The initial state and constants of an RNN' - ' layer cannot be specified with a mix of' - ' Keras tensors and non-Keras tensors') + raise ValueError('The initial state of an RNN layer cannot be' + ' specified with a mix of Keras tensors and' + ' non-Keras tensors') if is_keras_tensor: - # Compute the full input spec, including state and constants + # Compute the full input spec, including state input_spec = self.input_spec state_spec = self.state_spec if not isinstance(input_spec, list): input_spec = [input_spec] if not isinstance(state_spec, list): state_spec = [state_spec] - self.input_spec = input_spec - inputs = [inputs] - if initial_state: - self.input_spec += state_spec - inputs += initial_state - kwargs['initial_state'] = initial_state - if constants: - self.input_spec += self.external_constants_spec - inputs += constants - kwargs['constants'] = constants + self.input_spec = input_spec + state_spec + + # Compute the full inputs, including state + inputs = [inputs] + list(initial_state) # Perform the call output = super(RNN, self).__call__(inputs, **kwargs) @@ -621,22 +584,16 @@ def __call__(self, inputs, initial_state=None, constants=None, **kwargs): return output else: kwargs['initial_state'] = initial_state - if constants is not None: - kwargs['constants'] = constants return super(RNN, self).__call__(inputs, **kwargs) - def call(self, - inputs, - mask=None, - training=None, - initial_state=None, - constants=None): + def call(self, inputs, mask=None, training=None, initial_state=None): # input shape: `(samples, time (padded with zeros), input_dim)` # note that the .build() method of subclasses MUST define # self.input_spec and self.state_spec with complete input shapes. if isinstance(inputs, list): + initial_state = inputs[1:] inputs = inputs[0] - if initial_state is not None: + elif initial_state is not None: pass elif self.stateful: initial_state = self.states @@ -665,17 +622,9 @@ def call(self, '- If using the functional API, specify ' 'the time dimension by passing a `shape` ' 'or `batch_shape` argument to your Input layer.') - cell_kwargs = {} - if has_arg(self.cell.call, 'training'): - cell_kwargs['training'] = training - - if constants is not None: - if not has_arg(self.cell.call, 'constants'): - raise TypeError('cell does not take keyword argument constants') - cell_kwargs['constants'] = constants - if cell_kwargs: - step = functools.partial(self.cell.call, **cell_kwargs) + if has_arg(self.cell.call, 'training'): + step = functools.partial(self.cell.call, training=training) else: step = self.cell.call last_output, outputs, states = K.rnn(step, @@ -709,45 +658,6 @@ def call(self, else: return output - def _normalize_args(self, inputs, initial_state=None, constants=None): - """The inputs `initial_state` and `constants` can be passed to - RNN.__call__ either by separate arguments or as part of `inputs`. In - this case `inputs` is a list of tensors of which the first one is the - actual (sequence) input followed by initial states, followed by - constants. 
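For reference, the calling convention restored above is the usual functional-API one; a small usage sketch, assuming `RNN` and `SimpleRNNCell` are importable from `keras.layers.recurrent` as defined in this file (sizes are illustrative):

```python
import numpy as np
from keras import Input
from keras.models import Model
from keras.layers.recurrent import RNN, SimpleRNNCell

x = Input((None, 5))      # (batch, timesteps, input_dim)
h0 = Input((32,))         # one initial state, matching the cell's state_size
y = RNN(SimpleRNNCell(32))(x, initial_state=h0)  # h0 becomes an extra model input

model = Model([x, h0], y)
model.compile(optimizer='rmsprop', loss='mse')
model.train_on_batch([np.zeros((2, 4, 5)), np.zeros((2, 32))],
                     np.zeros((2, 32)))
```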
- - This method separates and noramlizes the different groups of inputs. - - # Arguments - inputs: tensor of list/tuple of tensors - initial_state: tensor or list of tensors or None - constants: tensor or list of tensors or None - - # Returns - inputs: tensor - initial_state: list of tensors or None - constants: list of tensors or None - """ - if isinstance(inputs, (list, tuple)): - remaining_inputs = inputs[1:] - inputs = inputs[0] - if remaining_inputs and initial_state is None: - if isinstance(self.state_spec, list): - n_states = len(self.state_spec) - else: - n_states = 1 - initial_state = remaining_inputs[:n_states] - remaining_inputs = remaining_inputs[n_states:] - if remaining_inputs and constants is None: - constants = remaining_inputs - if len(remaining_inputs) > 0: - raise ValueError('too many inputs were passed') - - initial_state = _to_list_or_none(initial_state) - constants = _to_list_or_none(constants) - - return inputs, initial_state, constants - def reset_states(self, states=None): if not self.stateful: raise AttributeError('Layer must be stateful.') @@ -844,268 +754,168 @@ def get_losses_for(self, inputs=None): return super(RNN, self).get_losses_for(inputs) -class AttentionRNN(RNN): - """Base class for attentive recurrent layers. +class SimpleRNNCell(Layer): + """Cell class for SimpleRNN. # Arguments - cell: A RNN cell instance supporting attention. It should implement: - - a `call(input_at_t, states_at_t, attended)` method, returning - `(output_at_t, states_at_t_plus_1)`. It must accept the keyword - argument `attended` which refers to the input(s) (tensor or - list of tensors) that is attended to an will be presented as a - whole at each timestep. - - a `state_size` attribute. This can be a single integer - (single state) in which case it is the size of the recurrent - state (which should be the same as the size of the cell - output). This can also be a list/tuple of integers - (one size per state). In this case, the first entry - (`state_size[0]`) should be the same as the size of the cell - output. - If the RNN cell is a keras layer, the input_shape passed to its - `build` method will be a list of the input shape of the regular - sequence input followed by the shape(s) of the attended. - **kwargs: See docs of super class RNN. + units: Positive integer, dimensionality of the output space. + activation: Activation function to use + (see [activations](../activations.md)). + If you pass None, no activation is applied + (ie. "linear" activation: `a(x) = x`). + use_bias: Boolean, whether the layer uses a bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix, + used for the linear transformation of the inputs. + (see [initializers](../initializers.md)). + recurrent_initializer: Initializer for the `recurrent_kernel` + weights matrix, + used for the linear transformation of the recurrent state. + (see [initializers](../initializers.md)). + bias_initializer: Initializer for the bias vector + (see [initializers](../initializers.md)). + kernel_regularizer: Regularizer function applied to + the `kernel` weights matrix + (see [regularizer](../regularizers.md)). + recurrent_regularizer: Regularizer function applied to + the `recurrent_kernel` weights matrix + (see [regularizer](../regularizers.md)). + bias_regularizer: Regularizer function applied to the bias vector + (see [regularizer](../regularizers.md)). + activity_regularizer: Regularizer function applied to + the output of the layer (its "activation"). + (see [regularizer](../regularizers.md)). 
+ kernel_constraint: Constraint function applied to + the `kernel` weights matrix + (see [constraints](../constraints.md)). + recurrent_constraint: Constraint function applied to + the `recurrent_kernel` weights matrix + (see [constraints](../constraints.md)). + bias_constraint: Constraint function applied to the bias vector + (see [constraints](../constraints.md)). + dropout: Float between 0 and 1. + Fraction of the units to drop for + the linear transformation of the inputs. + recurrent_dropout: Float between 0 and 1. + Fraction of the units to drop for + the linear transformation of the recurrent state. + """ - # Input shapes - 3D tensor with shape `(batch_size, timesteps, input_dim)`, - (Optional) 2D tensors with shape `(batch_size, output_dim)`. + def __init__(self, units, + activation='tanh', + use_bias=True, + kernel_initializer='glorot_uniform', + recurrent_initializer='orthogonal', + bias_initializer='zeros', + kernel_regularizer=None, + recurrent_regularizer=None, + bias_regularizer=None, + kernel_constraint=None, + recurrent_constraint=None, + bias_constraint=None, + dropout=0., + recurrent_dropout=0., + **kwargs): + super(SimpleRNNCell, self).__init__(**kwargs) + self.units = units + self.activation = activations.get(activation) + self.use_bias = use_bias - # Attended shapes - ND tensor of the shape expected by the attentive cell. + self.kernel_initializer = initializers.get(kernel_initializer) + self.recurrent_initializer = initializers.get(recurrent_initializer) + self.bias_initializer = initializers.get(bias_initializer) - # Examples + self.kernel_regularizer = regularizers.get(kernel_regularizer) + self.recurrent_regularizer = regularizers.get(recurrent_regularizer) + self.bias_regularizer = regularizers.get(bias_regularizer) - ```python + self.kernel_constraint = constraints.get(kernel_constraint) + self.recurrent_constraint = constraints.get(recurrent_constraint) + self.bias_constraint = constraints.get(bias_constraint) - TODO: minimal example (using functional API?) 
- ``` - """ + self.dropout = min(1., max(0., dropout)) + self.recurrent_dropout = min(1., max(0., recurrent_dropout)) + self.state_size = self.units + self._dropout_mask = None + self._recurrent_dropout_mask = None - def __init__(self, cell, **kwargs): - if isinstance(cell, (list, tuple)): - # Note: not obviously how one would want to propagate the attended - # for stacked cells, user should stack them manually into a single - # cell - raise ValueError('AttentionRNN only supports a single cell') - super(AttentionRNN, self).__init__(cell=cell, **kwargs) - # we let base class check that cel has call function before checking - # for the additional argument - if not has_arg(cell.call, 'attended'): - raise ValueError('`cell.call` does not take the keyword argument' - ' attended') + def build(self, input_shape): + self.kernel = self.add_weight(shape=(input_shape[-1], self.units), + name='kernel', + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint) + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units), + name='recurrent_kernel', + initializer=self.recurrent_initializer, + regularizer=self.recurrent_regularizer, + constraint=self.recurrent_constraint) + if self.use_bias: + self.bias = self.add_weight(shape=(self.units,), + name='bias', + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint) + else: + self.bias = None + self.built = True - self._n_attended = None # set in __call__, needed in build to split - # input_shape - self.attended_spec = None + def _generate_dropout_mask(self, inputs, training=None): + if 0 < self.dropout < 1: + ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1)) - def build(self, input_shape): - attended_shapes = input_shape[-self._n_attended:] - input_shape = input_shape[0] - batch_size = input_shape[0] if self.stateful else None - input_dim = input_shape[-1] - self.input_spec[0] = InputSpec(shape=(batch_size, None, input_dim)) + def dropped_inputs(): + return K.dropout(ones, self.dropout) - attended_specs = [InputSpec(shape=(batch_size,) + attended_shape[1:]) - for attended_shape in attended_shapes] - if len(attended_specs) > 1: - self.attended_spec = attended_specs + self._dropout_mask = K.in_train_phase( + dropped_inputs, + ones, + training=training) else: - self.attended_spec = attended_specs[0] + self._dropout_mask = None - if self.stateful: - self.reset_states() + def _generate_recurrent_dropout_mask(self, inputs, training=None): + if 0 < self.recurrent_dropout < 1: + ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1))) + ones = K.tile(ones, (1, self.units)) - if isinstance(self.cell, Layer): - step_input_shape = (input_shape[0],) + input_shape[2:] - self.cell.build([step_input_shape] + attended_shapes) + def dropped_inputs(): + return K.dropout(ones, self.dropout) - def __call__(self, inputs, initial_state=None, attended=None, **kwargs): - # If there are multiple inputs, then they should be the main input, - # `initial_state` and `attended` - # TODO what is meant by "e.g. when loading model from file" in comment - # in base class RNN, can there be a problem if initial states are not - # passed in the Attentive RNN with respect ot this!? 
- inputs, initial_state, attended = self._normalize_args( - inputs, initial_state, attended) + self._recurrent_dropout_mask = K.in_train_phase( + dropped_inputs, + ones, + training=training) + else: + self._recurrent_dropout_mask = None - if attended is None: - raise ValueError('attended input must be passed') - # we need to know length of attended in build - self._n_attended = len(attended) - - check_list = [] - if initial_state: - check_list += initial_state - if attended: - check_list += attended - # at this point check_list cannot be empty - is_keras_tensor = hasattr(check_list[0], '_keras_history') - for tensor in check_list: - if hasattr(tensor, '_keras_history') != is_keras_tensor: - raise ValueError('The initial state and attended of an RNN' - ' layer cannot be specified with a mix of' - ' Keras tensors and non-Keras tensors') - - if is_keras_tensor: - # Compute the full input spec, including state and attended - input_spec = self.input_spec - state_spec = self.state_spec - if not isinstance(input_spec, list): - input_spec = [input_spec] - if not isinstance(state_spec, list): - state_spec = [state_spec] - self.input_spec = input_spec - inputs = [inputs] - if initial_state: - self.input_spec += state_spec - inputs += initial_state - kwargs['initial_state'] = initial_state - if attended: - self.input_spec += self.external_constants_spec - inputs += attended - kwargs['attended'] = attended - - # Perform the call - output = Layer.__call__(self, inputs, **kwargs) - - # Restore original input spec - self.input_spec = input_spec - return output - else: - kwargs['initial_state'] = initial_state - if attended is not None: - kwargs['attended'] = attended - return Layer.__call__(self, inputs, **kwargs) - - def call(self, - inputs, - mask=None, - training=None, - initial_state=None, - attended=None): - # TODO this method duplicates almost everything in RNN.call, - # better solution? - - # input shape: `(samples, time (padded with zeros), input_dim)` - # note that the .build() method of subclasses MUST define - # self.input_spec and self.state_spec with complete input shapes. - if isinstance(inputs, list): - inputs = inputs[0] - if initial_state is not None: - pass - elif self.stateful: - initial_state = self.states - else: - initial_state = self.get_initial_state(inputs) - - if isinstance(mask, list): - mask = mask[0] - - if len(initial_state) != len(self.states): - raise ValueError('Layer has ' + str(len(self.states)) + - ' states but was passed ' + - str(len(initial_state)) + - ' initial states.') - input_shape = K.int_shape(inputs) - timesteps = input_shape[1] - if self.unroll and timesteps in [None, 1]: - raise ValueError('Cannot unroll a RNN if the ' - 'time dimension is undefined or equal to 1. \n' - '- If using a Sequential model, ' - 'specify the time dimension by passing ' - 'an `input_shape` or `batch_input_shape` ' - 'argument to your first layer. 
If your ' - 'first layer is an Embedding, you can ' - 'also use the `input_length` argument.\n' - '- If using the functional API, specify ' - 'the time dimension by passing a `shape` ' - 'or `batch_shape` argument to your Input layer.') - - cell_kwargs = {'attended': attended} - if has_arg(self.cell.call, 'training'): - cell_kwargs['training'] = training - - # NOTE: by passing the attended implicitly into the K.rnn it is not - # possible for theano backend to optimise the scan op, see section: - # "Explicitly passing inputs of the inner function to scan" in: - # http://deeplearning.net/software/theano/library/scan.html#lib-scan-shared-variables - # but on the other hand we are not passed weights (shared variables) - # of the cell transformation anyway. - step = functools.partial(self.cell.call, **cell_kwargs) - - last_output, outputs, states = K.rnn(step, - inputs, - initial_state, - go_backwards=self.go_backwards, - mask=mask, - unroll=self.unroll, - input_length=timesteps) - if self.stateful: - updates = [] - for i in range(len(states)): - updates.append((self.states[i], states[i])) - self.add_update(updates, inputs) - - if self.return_sequences: - output = outputs - else: - output = last_output - - # Properly set learning phase - if getattr(last_output, '_uses_learning_phase', False): - output._uses_learning_phase = True + def call(self, inputs, states, training=None): + prev_output = states[0] + dp_mask = self._dropout_mask + rec_dp_mask = self._recurrent_dropout_mask - if self.return_state: - if not isinstance(states, (list, tuple)): - states = [states] - else: - states = list(states) - return [output] + states + if dp_mask is not None: + h = K.dot(inputs * dp_mask, self.kernel) else: - return output - - def _normalize_args(self, inputs, initial_state, attended): - """The inputs `initial_state` and `attended` can be passed to - AttentionRNN.__call__ either by separate arguments or as part of - `inputs`. In this case `inputs` is a list of tensors of which the first - one is the actual (sequence) input followed by initial states followed - by the attended. - - This method separates and normalizes the different groups of inputs. - - # Arguments - inputs: tensor of list/tuple of tensors - initial_state: tensor or list of tensors or None - attended: tensor or list of tensors or None - - # Returns - inputs: tensor - initial_state: list of tensors or None - attended: list of tensors or None - """ - if isinstance(inputs, (list, tuple)): - remaining_inputs = inputs[1:] - inputs = inputs[0] - if remaining_inputs and initial_state is None: - if isinstance(self.state_spec, list): - n_states = len(self.state_spec) - else: - n_states = 1 - initial_state = remaining_inputs[:n_states] - remaining_inputs = remaining_inputs[n_states:] - if remaining_inputs and attended is None: - attended = remaining_inputs - if len(remaining_inputs) > 0: - raise ValueError('too many inputs were passed') + h = K.dot(inputs, self.kernel) + if self.bias is not None: + h = K.bias_add(h, self.bias) - initial_state = _to_list_or_none(initial_state) - attended = _to_list_or_none(attended) + if rec_dp_mask is not None: + prev_output *= rec_dp_mask + output = h + K.dot(prev_output, self.recurrent_kernel) + if self.activation is not None: + output = self.activation(output) - return inputs, initial_state, attended + # Properly set learning phase on output tensor. 
+ if 0 < self.dropout + self.recurrent_dropout: + if training is None: + output._uses_learning_phase = True + return output, [output] -class SimpleRNNCell(Layer): - """Cell class for SimpleRNN. +class SimpleRNN(RNN): + """Fully-connected RNN where the output is to be fed back to input. # Arguments units: Positive integer, dimensionality of the output space. @@ -1150,6 +960,7 @@ class SimpleRNNCell(Layer): the linear transformation of the recurrent state. """ + @interfaces.legacy_recurrent_support def __init__(self, units, activation='tanh', use_bias=True, @@ -1159,222 +970,61 @@ def __init__(self, units, kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, + activity_regularizer=None, kernel_constraint=None, recurrent_constraint=None, bias_constraint=None, dropout=0., recurrent_dropout=0., **kwargs): - super(SimpleRNNCell, self).__init__(**kwargs) - self.units = units - self.activation = activations.get(activation) - self.use_bias = use_bias + if 'implementation' in kwargs: + kwargs.pop('implementation') + warnings.warn('The `implementation` argument ' + 'in `SimpleRNN` has been deprecated. ' + 'Please remove it from your layer call.') + if K.backend() == 'cntk': + if not kwargs.get('unroll') and (dropout > 0 or recurrent_dropout > 0): + warnings.warn( + 'RNN dropout is not supported with the CNTK backend ' + 'when using dynamic RNNs (i.e. non-unrolled). ' + 'You can either set `unroll=True`, ' + 'set `dropout` and `recurrent_dropout` to 0, ' + 'or use a different backend.') + dropout = 0. + recurrent_dropout = 0. - self.kernel_initializer = initializers.get(kernel_initializer) - self.recurrent_initializer = initializers.get(recurrent_initializer) - self.bias_initializer = initializers.get(bias_initializer) + cell = SimpleRNNCell(units, + activation=activation, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + recurrent_initializer=recurrent_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + recurrent_regularizer=recurrent_regularizer, + bias_regularizer=bias_regularizer, + kernel_constraint=kernel_constraint, + recurrent_constraint=recurrent_constraint, + bias_constraint=bias_constraint, + dropout=dropout, + recurrent_dropout=recurrent_dropout) + super(SimpleRNN, self).__init__(cell, **kwargs) + self.activity_regularizer = regularizers.get(activity_regularizer) - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.recurrent_regularizer = regularizers.get(recurrent_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) + def call(self, inputs, mask=None, training=None, initial_state=None): + self.cell._generate_dropout_mask(inputs, training=training) + self.cell._generate_recurrent_dropout_mask(inputs, training=training) + return super(SimpleRNN, self).call(inputs, + mask=mask, + training=training, + initial_state=initial_state) - self.kernel_constraint = constraints.get(kernel_constraint) - self.recurrent_constraint = constraints.get(recurrent_constraint) - self.bias_constraint = constraints.get(bias_constraint) + @property + def units(self): + return self.cell.units - self.dropout = min(1., max(0., dropout)) - self.recurrent_dropout = min(1., max(0., recurrent_dropout)) - self.state_size = self.units - self._dropout_mask = None - self._recurrent_dropout_mask = None - - def build(self, input_shape): - self.kernel = self.add_weight(shape=(input_shape[-1], self.units), - name='kernel', - initializer=self.kernel_initializer, - regularizer=self.kernel_regularizer, - 
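The `_generate_dropout_mask`/`_generate_recurrent_dropout_mask` helpers shown above sample one mask per sequence and reuse it at every timestep, so the same units stay dropped across time; a small numpy illustration of that idea follows (rates and shapes are arbitrary). Incidentally, `_generate_recurrent_dropout_mask` as written builds its mask with `K.dropout(ones, self.dropout)` rather than `self.recurrent_dropout`, which looks unintended.

```python
import numpy as np

rate, batch, units = 0.3, 4, 32
ones = np.ones((batch, units))            # like K.ones_like(inputs[:, 0, :])
keep = np.random.rand(batch, units) >= rate
mask = keep / (1. - rate)                 # inverted dropout, as K.dropout does
# the same mask is then reused at every step of the recurrence,
# e.g. h_t = np.dot(x_t * mask, kernel) + ...
```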
constraint=self.kernel_constraint) - self.recurrent_kernel = self.add_weight( - shape=(self.units, self.units), - name='recurrent_kernel', - initializer=self.recurrent_initializer, - regularizer=self.recurrent_regularizer, - constraint=self.recurrent_constraint) - if self.use_bias: - self.bias = self.add_weight(shape=(self.units,), - name='bias', - initializer=self.bias_initializer, - regularizer=self.bias_regularizer, - constraint=self.bias_constraint) - else: - self.bias = None - self.built = True - - def _generate_dropout_mask(self, inputs, training=None): - if 0 < self.dropout < 1: - ones = K.ones_like(K.squeeze(inputs[:, 0:1, :], axis=1)) - - def dropped_inputs(): - return K.dropout(ones, self.dropout) - - self._dropout_mask = K.in_train_phase( - dropped_inputs, - ones, - training=training) - else: - self._dropout_mask = None - - def _generate_recurrent_dropout_mask(self, inputs, training=None): - if 0 < self.recurrent_dropout < 1: - ones = K.ones_like(K.reshape(inputs[:, 0, 0], (-1, 1))) - ones = K.tile(ones, (1, self.units)) - - def dropped_inputs(): - return K.dropout(ones, self.dropout) - - self._recurrent_dropout_mask = K.in_train_phase( - dropped_inputs, - ones, - training=training) - else: - self._recurrent_dropout_mask = None - - def call(self, inputs, states, training=None): - prev_output = states[0] - dp_mask = self._dropout_mask - rec_dp_mask = self._recurrent_dropout_mask - - if dp_mask is not None: - h = K.dot(inputs * dp_mask, self.kernel) - else: - h = K.dot(inputs, self.kernel) - if self.bias is not None: - h = K.bias_add(h, self.bias) - - if rec_dp_mask is not None: - prev_output *= rec_dp_mask - output = h + K.dot(prev_output, self.recurrent_kernel) - if self.activation is not None: - output = self.activation(output) - - # Properly set learning phase on output tensor. - if 0 < self.dropout + self.recurrent_dropout: - if training is None: - output._uses_learning_phase = True - return output, [output] - - -class SimpleRNN(RNN): - """Fully-connected RNN where the output is to be fed back to input. - - # Arguments - units: Positive integer, dimensionality of the output space. - activation: Activation function to use - (see [activations](../activations.md)). - If you pass None, no activation is applied - (ie. "linear" activation: `a(x) = x`). - use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix, - used for the linear transformation of the inputs. - (see [initializers](../initializers.md)). - recurrent_initializer: Initializer for the `recurrent_kernel` - weights matrix, - used for the linear transformation of the recurrent state. - (see [initializers](../initializers.md)). - bias_initializer: Initializer for the bias vector - (see [initializers](../initializers.md)). - kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix - (see [regularizer](../regularizers.md)). - recurrent_regularizer: Regularizer function applied to - the `recurrent_kernel` weights matrix - (see [regularizer](../regularizers.md)). - bias_regularizer: Regularizer function applied to the bias vector - (see [regularizer](../regularizers.md)). - activity_regularizer: Regularizer function applied to - the output of the layer (its "activation"). - (see [regularizer](../regularizers.md)). - kernel_constraint: Constraint function applied to - the `kernel` weights matrix - (see [constraints](../constraints.md)). 
- recurrent_constraint: Constraint function applied to - the `recurrent_kernel` weights matrix - (see [constraints](../constraints.md)). - bias_constraint: Constraint function applied to the bias vector - (see [constraints](../constraints.md)). - dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the inputs. - recurrent_dropout: Float between 0 and 1. - Fraction of the units to drop for - the linear transformation of the recurrent state. - """ - - @interfaces.legacy_recurrent_support - def __init__(self, units, - activation='tanh', - use_bias=True, - kernel_initializer='glorot_uniform', - recurrent_initializer='orthogonal', - bias_initializer='zeros', - kernel_regularizer=None, - recurrent_regularizer=None, - bias_regularizer=None, - activity_regularizer=None, - kernel_constraint=None, - recurrent_constraint=None, - bias_constraint=None, - dropout=0., - recurrent_dropout=0., - **kwargs): - if 'implementation' in kwargs: - kwargs.pop('implementation') - warnings.warn('The `implementation` argument ' - 'in `SimpleRNN` has been deprecated. ' - 'Please remove it from your layer call.') - if K.backend() == 'cntk': - if not kwargs.get('unroll') and (dropout > 0 or recurrent_dropout > 0): - warnings.warn( - 'RNN dropout is not supported with the CNTK backend ' - 'when using dynamic RNNs (i.e. non-unrolled). ' - 'You can either set `unroll=True`, ' - 'set `dropout` and `recurrent_dropout` to 0, ' - 'or use a different backend.') - dropout = 0. - recurrent_dropout = 0. - - cell = SimpleRNNCell(units, - activation=activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - recurrent_initializer=recurrent_initializer, - bias_initializer=bias_initializer, - kernel_regularizer=kernel_regularizer, - recurrent_regularizer=recurrent_regularizer, - bias_regularizer=bias_regularizer, - kernel_constraint=kernel_constraint, - recurrent_constraint=recurrent_constraint, - bias_constraint=bias_constraint, - dropout=dropout, - recurrent_dropout=recurrent_dropout) - super(SimpleRNN, self).__init__(cell, **kwargs) - self.activity_regularizer = regularizers.get(activity_regularizer) - - def call(self, inputs, mask=None, training=None, initial_state=None): - self.cell._generate_dropout_mask(inputs, training=training) - self.cell._generate_recurrent_dropout_mask(inputs, training=training) - return super(SimpleRNN, self).call(inputs, - mask=mask, - training=training, - initial_state=initial_state) - - @property - def units(self): - return self.cell.units - - @property - def activation(self): - return self.cell.activation + @property + def activation(self): + return self.cell.activation @property def use_bias(self): @@ -2369,6 +2019,266 @@ def from_config(cls, config): return cls(**config) +class AttentionRNN(RNN): + """Base class for attentive recurrent layers. + + # Arguments + cell: A RNN cell instance supporting attention. It should implement: + - a `call(input_at_t, states_at_t, attended)` method, returning + `(output_at_t, states_at_t_plus_1)`. It must accept the keyword + argument `attended` which refers to the input(s) (tensor or + list of tensors) that is attended to an will be presented as a + whole at each timestep. + - a `state_size` attribute. This can be a single integer + (single state) in which case it is the size of the recurrent + state (which should be the same as the size of the cell + output). This can also be a list/tuple of integers + (one size per state). 
In this case, the first entry + (`state_size[0]`) should be the same as the size of the cell + output. + If the RNN cell is a keras layer, the input_shape passed to its + `build` method will be a list of the input shape of the regular + sequence input followed by the shape(s) of the attended. + **kwargs: See docs of super class RNN. + + # Input shapes + 3D tensor with shape `(batch_size, timesteps, input_dim)`, + (Optional) 2D tensors with shape `(batch_size, output_dim)`. + + # Attended shapes + ND tensor of the shape expected by the attentive cell. + + # Examples + + ```python + + TODO: minimal example (using functional API?) + ``` + """ + + def __init__(self, cell, **kwargs): + if isinstance(cell, (list, tuple)): + # Note: not obviously how one would want to propagate the attended + # for stacked cells, user should stack them manually into a single + # cell + raise ValueError('AttentionRNN only supports a single cell') + super(AttentionRNN, self).__init__(cell=cell, **kwargs) + # we let base class check that cel has call function before checking + # for the additional argument + if not has_arg(cell.call, 'attended'): + raise ValueError('`cell.call` does not take the keyword argument' + ' attended') + + self._n_attended = None # set in __call__, needed in build to split + # input_shape + self.attended_spec = None + + def build(self, input_shape): + attended_shapes = input_shape[-self._n_attended:] + input_shape = input_shape[0] + batch_size = input_shape[0] if self.stateful else None + input_dim = input_shape[-1] + self.input_spec[0] = InputSpec(shape=(batch_size, None, input_dim)) + + attended_specs = [InputSpec(shape=(batch_size,) + attended_shape[1:]) + for attended_shape in attended_shapes] + if len(attended_specs) > 1: + self.attended_spec = attended_specs + else: + self.attended_spec = attended_specs[0] + + if self.stateful: + self.reset_states() + + if isinstance(self.cell, Layer): + step_input_shape = (input_shape[0],) + input_shape[2:] + self.cell.build([step_input_shape] + attended_shapes) + + def __call__(self, inputs, initial_state=None, attended=None, **kwargs): + # If there are multiple inputs, then they should be the main input, + # `initial_state` and `attended` + # TODO what is meant by "e.g. when loading model from file" in comment + # in base class RNN, can there be a problem if initial states are not + # passed in the Attentive RNN with respect ot this!? 
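Since the docstring above still carries a TODO for a minimal example, here is a hedged sketch of a cell that satisfies the stated interface (hypothetical class, modeled on the `RNNCellWithConstants` test cell later in this series): `call` accepts the extra `attended` keyword, and `build` receives the step input shape followed by the attended shape.

```python
import keras
from keras import backend as K

class MinimalAttentiveCell(keras.layers.Layer):
    # illustration only; not part of this patch
    def __init__(self, units, **kwargs):
        self.units = units
        self.state_size = units
        super(MinimalAttentiveCell, self).__init__(**kwargs)

    def build(self, input_shape):
        step_shape, attended_shape = input_shape   # [step input, attended]
        self.kernel = self.add_weight(shape=(step_shape[-1], self.units),
                                      initializer='uniform', name='kernel')
        self.attended_kernel = self.add_weight(shape=(attended_shape[-1], self.units),
                                               initializer='uniform',
                                               name='attended_kernel')
        self.recurrent_kernel = self.add_weight(shape=(self.units, self.units),
                                                initializer='uniform',
                                                name='recurrent_kernel')
        self.built = True

    def call(self, inputs, states, attended):
        [att] = attended                           # a single attended tensor
        h = (K.dot(inputs, self.kernel)
             + K.dot(att, self.attended_kernel)
             + K.dot(states[0], self.recurrent_kernel))
        return h, [h]
```

A layer built as `AttentionRNN(MinimalAttentiveCell(32))` could then be called as `layer(x_sequence, attended=attended)`, mirroring the test added further down.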
+ inputs, initial_state, attended = self._normalize_args( + inputs, initial_state, attended) + + if attended is None: + raise ValueError('attended input must be passed') + # we need to know length of attended in build + self._n_attended = len(attended) + + check_list = [] + if initial_state: + check_list += initial_state + if attended: + check_list += attended + # at this point check_list cannot be empty + is_keras_tensor = hasattr(check_list[0], '_keras_history') + for tensor in check_list: + if hasattr(tensor, '_keras_history') != is_keras_tensor: + raise ValueError('The initial state and attended of an RNN' + ' layer cannot be specified with a mix of' + ' Keras tensors and non-Keras tensors') + + if is_keras_tensor: + # Compute the full input spec, including state and attended + input_spec = self.input_spec + state_spec = self.state_spec + if not isinstance(input_spec, list): + input_spec = [input_spec] + if not isinstance(state_spec, list): + state_spec = [state_spec] + self.input_spec = input_spec + inputs = [inputs] + if initial_state: + self.input_spec += state_spec + inputs += initial_state + kwargs['initial_state'] = initial_state + if attended: + self.input_spec += self.external_constants_spec + inputs += attended + kwargs['attended'] = attended + + # Perform the call + output = Layer.__call__(self, inputs, **kwargs) + + # Restore original input spec + self.input_spec = input_spec + return output + else: + kwargs['initial_state'] = initial_state + if attended is not None: + kwargs['attended'] = attended + return Layer.__call__(self, inputs, **kwargs) + + def call(self, + inputs, + mask=None, + training=None, + initial_state=None, + attended=None): + # TODO this method duplicates almost everything in RNN.call, + # better solution? + + # input shape: `(samples, time (padded with zeros), input_dim)` + # note that the .build() method of subclasses MUST define + # self.input_spec and self.state_spec with complete input shapes. + if isinstance(inputs, list): + inputs = inputs[0] + if initial_state is not None: + pass + elif self.stateful: + initial_state = self.states + else: + initial_state = self.get_initial_state(inputs) + + if isinstance(mask, list): + mask = mask[0] + + if len(initial_state) != len(self.states): + raise ValueError('Layer has ' + str(len(self.states)) + + ' states but was passed ' + + str(len(initial_state)) + + ' initial states.') + input_shape = K.int_shape(inputs) + timesteps = input_shape[1] + if self.unroll and timesteps in [None, 1]: + raise ValueError('Cannot unroll a RNN if the ' + 'time dimension is undefined or equal to 1. \n' + '- If using a Sequential model, ' + 'specify the time dimension by passing ' + 'an `input_shape` or `batch_input_shape` ' + 'argument to your first layer. If your ' + 'first layer is an Embedding, you can ' + 'also use the `input_length` argument.\n' + '- If using the functional API, specify ' + 'the time dimension by passing a `shape` ' + 'or `batch_shape` argument to your Input layer.') + + cell_kwargs = {'attended': attended} + if has_arg(self.cell.call, 'training'): + cell_kwargs['training'] = training + + # NOTE: by passing the attended implicitly into the K.rnn it is not + # possible for theano backend to optimise the scan op, see section: + # "Explicitly passing inputs of the inner function to scan" in: + # http://deeplearning.net/software/theano/library/scan.html#lib-scan-shared-variables + # but on the other hand we are not passed weights (shared variables) + # of the cell transformation anyway. 
+ step = functools.partial(self.cell.call, **cell_kwargs) + + last_output, outputs, states = K.rnn(step, + inputs, + initial_state, + go_backwards=self.go_backwards, + mask=mask, + unroll=self.unroll, + input_length=timesteps) + if self.stateful: + updates = [] + for i in range(len(states)): + updates.append((self.states[i], states[i])) + self.add_update(updates, inputs) + + if self.return_sequences: + output = outputs + else: + output = last_output + + # Properly set learning phase + if getattr(last_output, '_uses_learning_phase', False): + output._uses_learning_phase = True + + if self.return_state: + if not isinstance(states, (list, tuple)): + states = [states] + else: + states = list(states) + return [output] + states + else: + return output + + def _normalize_args(self, inputs, initial_state, attended): + """The inputs `initial_state` and `attended` can be passed to + AttentionRNN.__call__ either by separate arguments or as part of + `inputs`. In this case `inputs` is a list of tensors of which the first + one is the actual (sequence) input followed by initial states followed + by the attended. + + This method separates and normalizes the different groups of inputs. + + # Arguments + inputs: tensor of list/tuple of tensors + initial_state: tensor or list of tensors or None + attended: tensor or list of tensors or None + + # Returns + inputs: tensor + initial_state: list of tensors or None + attended: list of tensors or None + """ + if isinstance(inputs, (list, tuple)): + remaining_inputs = inputs[1:] + inputs = inputs[0] + if remaining_inputs and initial_state is None: + if isinstance(self.state_spec, list): + n_states = len(self.state_spec) + else: + n_states = 1 + initial_state = remaining_inputs[:n_states] + remaining_inputs = remaining_inputs[n_states:] + if remaining_inputs and attended is None: + attended = remaining_inputs + if len(remaining_inputs) > 0: + raise ValueError('too many inputs were passed') + + initial_state = _to_list_or_none(initial_state) + attended = _to_list_or_none(attended) + + return inputs, initial_state, attended + + def _to_list_or_none(x): # TODO move? 
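A small worked illustration of the argument normalization above (plain Python, hypothetical helper name): the combined-list form and the keyword form should reduce to the same triple of main input, initial states and attended.

```python
def split_inputs(inputs, n_states=1):
    # simplified stand-in for _normalize_args: main input first, then
    # n_states initial states, then whatever remains is the attended
    if not isinstance(inputs, (list, tuple)):
        return inputs, None, None
    rest = list(inputs[1:])
    return inputs[0], rest[:n_states] or None, rest[n_states:] or None

assert split_inputs(['x', 'h0', 'att']) == ('x', ['h0'], ['att'])
assert split_inputs('x') == ('x', None, None)
```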
Very similar to topology._to_list if x is None or isinstance(x, list): return x From e74b125b9b06b00912184bf049589719a9441758 Mon Sep 17 00:00:00 2001 From: andhus Date: Sun, 8 Oct 2017 22:34:22 +0200 Subject: [PATCH 07/13] renamed constants to attended in FunctionRNNCell, avoided duplicating outputs in wrapped model --- examples/functional_rnn_cell.py | 46 +++++---- keras/layers/recurrent.py | 138 +++++++++++++++++---------- tests/keras/layers/recurrent_test.py | 44 +++++---- 3 files changed, 138 insertions(+), 90 deletions(-) diff --git a/examples/functional_rnn_cell.py b/examples/functional_rnn_cell.py index f3e67b2d104b..14287209173b 100644 --- a/examples/functional_rnn_cell.py +++ b/examples/functional_rnn_cell.py @@ -4,42 +4,46 @@ from keras import Input from keras.layers import add, Dense, Activation, FunctionalRNNCell, RNN, \ - concatenate, multiply, Model + concatenate, multiply, Model, AttentionRNN units = 32 input_size = 5 x = Input((input_size,)) -h_tm1 = Input((units,)) -h_ = add([Dense(units)(x), Dense(units, use_bias=False)(h_tm1)]) -h = Activation('tanh')(h_) +h_in = Input((units,)) +h_ = add([Dense(units)(x), Dense(units, use_bias=False)(h_in)]) +h_out = Activation('tanh')(h_) # Create the cell: - cell = FunctionalRNNCell( - inputs=x, outputs=h, input_states=h_tm1, output_states=h) + inputs=x, outputs=h_out, input_states=h_in, output_states=h_out) x_sequence = Input((None, input_size)) rnn = RNN(cell) y = rnn(x_sequence) -# Now we can modify the cell to make use of "external" constants: -constant_shape = (10,) -c = Input(constant_shape) -density = Dense(constant_shape[0], activation='softmax')( - concatenate([x, h_tm1])) -attention = multiply([density, c]) -h2_ = add([h, Dense(units)(attention)]) -h2 = Activation('tanh')(h2_) +# Modify the cell to make use of attention to "external" constants: +attended_shape = (10,) +attended = Input(attended_shape) +density = Dense(attended_shape[0], activation='softmax')( + concatenate([x, h_in])) +attention = multiply([density, attended]) +h2_ = add([h_out, Dense(units)(attention)]) +h_out_2 = Activation('tanh')(h2_) attention_cell = FunctionalRNNCell( - inputs=x, outputs=h2, input_states=h_tm1, output_states=h2, constants=c) - -attention_rnn = RNN(attention_cell) -y2 = attention_rnn(x_sequence, constants=c) + inputs=x, + outputs=h_out_2, + input_states=h_in, + output_states=h_out_2, + attended=attended +) + +attention_rnn = AttentionRNN(attention_cell) +y2 = attention_rnn(x_sequence, attended=attended) # Note that shape of c is same as in cell (no time dimension added) -attention_model = Model([x_sequence, c], y2) +attention_model = Model([x_sequence, attended], y2) x_sequence_arr = np.random.randn(3, 5, input_size) -c_arr = np.random.randn(3, constant_shape[0]) -y2_arr = attention_model.predict([x_sequence_arr, c_arr]) +attended_arr = np.random.randn(3, attended_shape[0]) +y2_arr = attention_model.predict([x_sequence_arr, attended_arr]) diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py index c35ceb19fc50..abb00ece6c1b 100644 --- a/keras/layers/recurrent.py +++ b/keras/layers/recurrent.py @@ -204,8 +204,9 @@ class FunctionalRNNCell(Wrapper): outputs: output tensor at a single timestep input_states: state tensor(s) from previous time step output_states: state tensor(s) after cell transformation - constants: tensor(s) or None, represents inputs that should be static - (the same) for each time step. + attended: tensor(s) or None, represents inputs that should be static + (the same) for each time step. 
Used for implementing attention + mechanisms. # Examples @@ -232,26 +233,26 @@ class FunctionalRNNCell(Wrapper): # We can also define cells that make use of "external" constants, to # implement attention mechanisms: - constant_shape = (10,) - c = Input(constant_shape) - density = Dense(constant_shape[0], activation='softmax')( + attended_shape = (10,) + attended = Input(attended_shape) + density = Dense(attended_shape[0], activation='softmax')( concatenate([x, h_tm1])) - attention = multiply([density, c]) + attention = multiply([density, attended]) h2_ = add([h_, Dense(units)(attention)]) h2 = Activation('tanh')(h2_) attention_cell = FunctionalRNNCell( inputs=x, outputs=h2, input_states=h_tm1, output_states=h2, - constants=c) + attended=attended) - attention_rnn = RNN(attention_cell) - y2 = attention_rnn(x_sequence, constants=c) + attention_rnn = AttentionRNN(attention_cell) + y2 = attention_rnn(x_sequence, attended=attended) # Remember to pass the constant to the RNN layer (which will pass it on to # the cell). Also note that shape of c is same as in cell (no time # dimension added) - attention_model = Model([x_sequence, c], y2) + attention_model = Model([x_sequence, attended], y2) ``` """ def __init__( @@ -260,15 +261,24 @@ def __init__( outputs, input_states, output_states, - constants=None, + attended=None, **kwargs ): input_states = _to_list_or_none(input_states) output_states = _to_list_or_none(output_states) - constants = _to_list_or_none(constants) + attended = _to_list_or_none(attended) + if outputs == output_states[0]: + self.first_state_is_output = True + model_outputs = output_states + else: + warnings.warn('it is expected by RNN that output tensor is same as' + ' first state') + self.first_state_is_output = False + model_outputs = [outputs] + output_states + model = Model( - inputs=self._get_model_inputs(inputs, input_states, constants), - outputs=[outputs] + output_states + inputs=self._get_model_inputs(inputs, input_states, attended), + outputs=model_outputs ) super(FunctionalRNNCell, self).__init__(layer=model, **kwargs) @@ -284,25 +294,30 @@ def __init__( def state_size(self): return self._state_size - def call(self, inputs, states, constants=None): + def call(self, inputs, states, attended=None): """Defines the cell transformation for a single time step. # Arguments inputs: Tensor representing input at current time step. states: Tensor or list/tuple of tensors representing states from previous time step. - constants: Tensor or list of tensors or None representing inputs + attended: Tensor or list of tensors or None representing inputs that should be the same at each time step. 
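The `first_state_is_output` branch above exists to avoid registering the same tensor twice as a model output; a plain-Python sketch of that decision (hypothetical helper name, object identity standing in for tensor identity):

```python
def wrapped_model_outputs(outputs, output_states):
    # when the cell output is literally the first output state, only the
    # states are used as outputs of the wrapped Model; the output is then
    # recovered as outputs[0] in call()
    if outputs is output_states[0]:
        return output_states
    return [outputs] + output_states

h, c = object(), object()
assert wrapped_model_outputs(h, [h, c]) == [h, c]   # no duplication
assert wrapped_model_outputs(h, [c]) == [h, c]      # output prepended
```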
""" - outputs = self.layer(self._get_model_inputs(inputs, states, constants)) - output, states = outputs[0], outputs[1:] - - return output, states - - def _get_model_inputs(self, inputs, input_states, constants): + outputs = self.layer(self._get_model_inputs(inputs, states, attended)) + if not isinstance(outputs, list): + # if a list of a single output is passed to Model it still + # just returns a tensor + outputs = [outputs] + output = outputs[0] + new_states = outputs if self.first_state_is_output else outputs[1:] + return output, new_states + + @staticmethod + def _get_model_inputs(inputs, input_states, attended): inputs = [inputs] + list(input_states) - if constants is not None: - inputs += constants + if attended is not None: + inputs += attended return inputs @@ -2147,8 +2162,41 @@ class AttentionRNN(RNN): # Examples ```python + units = 32 + input_size = 5 + attended_shape = (10,) + + x = Input((input_size,)) + h_in = Input((units,)) + attended = Input(attended_shape) + + # predict "attention density" based on input and previous state + density = Dense(attended_shape[0], activation='softmax')( + concatenate([x, h_in])) + attention = multiply([density, attended]) + + h_ = add([ + Dense(units)(x), + Dense(units)(attention), + Dense(units, use_bias=False)(h_in) + ]) + h_out = Activation('tanh')(h_) + + # create cell + attention_cell = FunctionalRNNCell( + inputs=x, + outputs=h_out, + input_states=[h_in], + output_states=[h_out], + attended=attended + ) - TODO: minimal example (using functional API?) + # apply on input sequence + x_sequence = Input((None, input_size)) + attention_rnn = AttentionRNN(attention_cell) + y = attention_rnn(x_sequence, attended=attended) + + attention_model = Model([x_sequence, attended], y) ``` """ @@ -2165,24 +2213,19 @@ def __init__(self, cell, **kwargs): raise ValueError('`cell.call` does not take the keyword argument' ' attended') - self._n_attended = None # set in __call__, needed in build to split - # input_shape self.attended_spec = None def build(self, input_shape): - attended_shapes = input_shape[-self._n_attended:] + if isinstance(self.attended_spec, list): + attended_shapes = input_shape[-len(self.attended_spec):] + else: + attended_shapes = input_shape[-1:] + input_shape = input_shape[0] batch_size = input_shape[0] if self.stateful else None input_dim = input_shape[-1] self.input_spec[0] = InputSpec(shape=(batch_size, None, input_dim)) - attended_specs = [InputSpec(shape=(batch_size,) + attended_shape[1:]) - for attended_shape in attended_shapes] - if len(attended_specs) > 1: - self.attended_spec = attended_specs - else: - self.attended_spec = attended_specs[0] - if self.stateful: self.reset_states() @@ -2201,14 +2244,14 @@ def __call__(self, inputs, initial_state=None, attended=None, **kwargs): if attended is None: raise ValueError('attended input must be passed') - # we need to know length of attended in build - self._n_attended = len(attended) + # we need to append attended spec to input spec below + self.attended_spec = [InputSpec(shape=K.int_shape(attended_)) + for attended_ in attended] - check_list = [] if initial_state: - check_list += initial_state - if attended: - check_list += attended + check_list = initial_state + attended + else: + check_list = attended # at this point check_list cannot be empty is_keras_tensor = hasattr(check_list[0], '_keras_history') for tensor in check_list: @@ -2231,10 +2274,9 @@ def __call__(self, inputs, initial_state=None, attended=None, **kwargs): self.input_spec += state_spec inputs += initial_state 
kwargs['initial_state'] = initial_state - if attended: - self.input_spec += self.external_constants_spec - inputs += attended - kwargs['attended'] = attended + self.input_spec += self.attended_spec + inputs += attended + kwargs['attended'] = attended # Perform the call output = Layer.__call__(self, inputs, **kwargs) @@ -2243,9 +2285,9 @@ def __call__(self, inputs, initial_state=None, attended=None, **kwargs): self.input_spec = input_spec return output else: - kwargs['initial_state'] = initial_state - if attended is not None: - kwargs['attended'] = attended + if initial_state: + kwargs['initial_state'] = initial_state + kwargs['attended'] = attended return Layer.__call__(self, inputs, **kwargs) def call(self, diff --git a/tests/keras/layers/recurrent_test.py b/tests/keras/layers/recurrent_test.py index 3526e5aaa9ca..ca66ef42bde5 100644 --- a/tests/keras/layers/recurrent_test.py +++ b/tests/keras/layers/recurrent_test.py @@ -568,7 +568,7 @@ def test_batch_size_equal_one(layer_class): model.train_on_batch(x, y) -def test_rnn_cell_with_constants_layer(): +def test_attention_rnn(): class RNNCellWithConstants(keras.layers.Layer): @@ -597,12 +597,12 @@ def build(self, input_shape): name='constant_kernel') self.built = True - def call(self, inputs, states, constants): + def call(self, inputs, states, attended): [prev_output] = states - [constant] = constants + [attended] = attended h_input = keras.backend.dot(inputs, self.input_kernel) h_state = keras.backend.dot(prev_output, self.recurrent_kernel) - h_const = keras.backend.dot(constant, self.constant_kernel) + h_const = keras.backend.dot(attended, self.constant_kernel) output = h_input + h_state + h_const return output, [output] @@ -613,11 +613,11 @@ def get_config(self): # Test basic case. x = keras.Input((None, 5)) - c = keras.Input((3,)) + attended = keras.Input((3,)) cell = RNNCellWithConstants(32) - layer = recurrent.RNN(cell) - y = layer(x, constants=c) - model = keras.models.Model([x, c], y) + layer = recurrent.AttentionRNN(cell) + y = layer(x, attended=attended) + model = keras.models.Model([x, attended], y) model.compile(optimizer='rmsprop', loss='mse') model.train_on_batch( [np.zeros((6, 5, 5)), np.zeros((6, 3))], @@ -626,17 +626,17 @@ def get_config(self): # Test basic case serialization. 
x_np = np.random.random((6, 5, 5)) - c_np = np.random.random((6, 3)) - y_np = model.predict([x_np, c_np]) + attended_np = np.random.random((6, 3)) + y_np = model.predict([x_np, attended_np]) weights = model.get_weights() config = layer.get_config() with keras.utils.CustomObjectScope( {'RNNCellWithConstants': RNNCellWithConstants}): - layer = recurrent.RNN.from_config(config) - y = layer(x, constants=c) - model = keras.models.Model([x, c], y) + layer = recurrent.AttentionRNN.from_config(config) + y = layer(x, attended=attended) + model = keras.models.Model([x, attended], y) model.set_weights(weights) - y_np_2 = model.predict([x_np, c_np]) + y_np_2 = model.predict([x_np, attended_np]) assert_allclose(y_np, y_np_2, atol=1e-4) @@ -662,7 +662,7 @@ def test_functional_rnn_cell(): model.train_on_batch(np.zeros((6, 5, input_size)), np.zeros((6, units))) -def test_functional_rnn_cell_with_constants(): +def test_functional_rnn_cell_with_attended(): layers = keras.layers # Create the cell: @@ -671,22 +671,24 @@ def test_functional_rnn_cell_with_constants(): constant_shape = (10,) x = Input((input_size,)) h_tm1 = Input((units,)) - c = Input(constant_shape) + attended = Input(constant_shape) h_ = layers.add([ layers.Dense(units)(x), layers.Dense(units)(h_tm1), - layers.Dense(units)(c) + layers.Dense(units)(attended) ]) h = layers.Activation('tanh')(h_) cell = recurrent.FunctionalRNNCell( - inputs=x, outputs=h, input_states=h_tm1, output_states=h, constants=c) + inputs=x, outputs=h, input_states=h_tm1, output_states=h, + attended=attended + ) # Test basic case. x_seq = Input((None, input_size)) - layer = recurrent.RNN(cell) - y = layer(x_seq, constants=c) - model = keras.models.Model([x_seq, c], y) + layer = recurrent.AttentionRNN(cell) + y = layer(x_seq, attended=attended) + model = keras.models.Model([x_seq, attended], y) model.compile(optimizer='rmsprop', loss='mse') model.train_on_batch( [np.zeros((6, 5, input_size)), np.zeros((6, constant_shape[0]))], From fb91e4e4eec842d24bb1d7079f1bd3f6298e5da4 Mon Sep 17 00:00:00 2001 From: andhus Date: Sun, 8 Oct 2017 22:54:55 +0200 Subject: [PATCH 08/13] minor clean-up of docs --- keras/layers/recurrent.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py index abb00ece6c1b..0caa07ce9ba8 100644 --- a/keras/layers/recurrent.py +++ b/keras/layers/recurrent.py @@ -230,9 +230,7 @@ class FunctionalRNNCell(Wrapper): rnn = RNN(cell) y = rnn(x_sequence) - # We can also define cells that make use of "external" constants, to - # implement attention mechanisms: - + # We can also define cells that attend to "external" constants attended_shape = (10,) attended = Input(attended_shape) density = Dense(attended_shape[0], activation='softmax')( @@ -248,9 +246,9 @@ class FunctionalRNNCell(Wrapper): attention_rnn = AttentionRNN(attention_cell) y2 = attention_rnn(x_sequence, attended=attended) - # Remember to pass the constant to the RNN layer (which will pass it on to - # the cell). Also note that shape of c is same as in cell (no time - # dimension added) + # Remember to pass the attended to the AttentionRNN layer (which will pass + # it on to the cell). Also note that shape of the attended is same as in + # cell (no time dimension added) attention_model = Model([x_sequence, attended], y2) ``` @@ -2138,7 +2136,7 @@ class AttentionRNN(RNN): - a `call(input_at_t, states_at_t, attended)` method, returning `(output_at_t, states_at_t_plus_1)`. 
It must accept the keyword argument `attended` which refers to the input(s) (tensor or - list of tensors) that is attended to an will be presented as a + list of tensors) that is attended to and will be presented as a whole at each timestep. - a `state_size` attribute. This can be a single integer (single state) in which case it is the size of the recurrent @@ -2149,7 +2147,7 @@ class AttentionRNN(RNN): output. If the RNN cell is a keras layer, the input_shape passed to its `build` method will be a list of the input shape of the regular - sequence input followed by the shape(s) of the attended. + (sequence) input followed by the shape(s) of the attended. **kwargs: See docs of super class RNN. # Input shapes @@ -2202,7 +2200,7 @@ class AttentionRNN(RNN): def __init__(self, cell, **kwargs): if isinstance(cell, (list, tuple)): - # Note: not obviously how one would want to propagate the attended + # Note: not obvious how one would want to propagate the attended # for stacked cells, user should stack them manually into a single # cell raise ValueError('AttentionRNN only supports a single cell') @@ -2210,8 +2208,8 @@ def __init__(self, cell, **kwargs): # we let base class check that cel has call function before checking # for the additional argument if not has_arg(cell.call, 'attended'): - raise ValueError('`cell.call` does not take the keyword argument' - ' attended') + raise ValueError('cell.call does not take the required keyword ' + 'argument attended') self.attended_spec = None @@ -2238,7 +2236,7 @@ def __call__(self, inputs, initial_state=None, attended=None, **kwargs): # `initial_state` and `attended` # TODO what is meant by "e.g. when loading model from file" in comment # in base class RNN, can there be a problem if initial states are not - # passed in the Attentive RNN with respect ot this!? + # passed in the Attentive RNN with respect to this!? inputs, initial_state, attended = self._normalize_args( inputs, initial_state, attended) @@ -2342,7 +2340,7 @@ def call(self, # possible for theano backend to optimise the scan op, see section: # "Explicitly passing inputs of the inner function to scan" in: # http://deeplearning.net/software/theano/library/scan.html#lib-scan-shared-variables - # but on the other hand we are not passed weights (shared variables) + # but on the other hand we are not passing weights (shared variables) # of the cell transformation anyway. 
step = functools.partial(self.cell.call, **cell_kwargs)

From fcc854cdf59a9644bf6ced3e82a6cc66ff9cf0d5 Mon Sep 17 00:00:00 2001
From: andhus
Date: Mon, 9 Oct 2017 09:00:12 +0200
Subject: [PATCH 09/13] Minor cleanup & improvements in docs, fixed PEP
 breaking formatting in attention test

---
 examples/functional_rnn_cell.py      |  48 ++++++------
 keras/layers/recurrent.py            | 113 ++++++++++++++-------------
 tests/keras/layers/recurrent_test.py |  52 ++++++------
 3 files changed, 108 insertions(+), 105 deletions(-)

diff --git a/examples/functional_rnn_cell.py b/examples/functional_rnn_cell.py
index 14287209173b..2c50212809ed 100644
--- a/examples/functional_rnn_cell.py
+++ b/examples/functional_rnn_cell.py
@@ -3,47 +3,51 @@
 import numpy as np
 
 from keras import Input
-from keras.layers import add, Dense, Activation, FunctionalRNNCell, RNN, \
-    concatenate, multiply, Model, AttentionRNN
+from keras.models import Model
+from keras.layers import add, concatenate, multiply, Dense, Activation
+from keras.layers.recurrent import FunctionalRNNCell, RNN, AttentionRNN
 
 units = 32
 input_size = 5
+
+# Use functional API to define RNN Cell transformation (in this case
+# simple vanilla RNN) for a single time step:
 x = Input((input_size,))
 h_in = Input((units,))
 h_ = add([Dense(units)(x), Dense(units, use_bias=False)(h_in)])
 h_out = Activation('tanh')(h_)
+cell = FunctionalRNNCell(inputs=x,
+                         outputs=h_out,
+                         input_states=h_in,
+                         output_states=h_out)
 
-# Create the cell:
-cell = FunctionalRNNCell(
-    inputs=x, outputs=h_out, input_states=h_in, output_states=h_out)
-
+# Inject cell in RNN and apply to input sequence
 x_sequence = Input((None, input_size))
 rnn = RNN(cell)
 y = rnn(x_sequence)
 
-# Modify the cell to make use of attention to "external" constants:
+# Modify the cell to make use of attention (condition transformation on
+# "external" constants such as an image or another sequence):
 attended_shape = (10,)
 attended = Input(attended_shape)
-density = Dense(attended_shape[0], activation='softmax')(
+attention_density = Dense(attended_shape[0], activation='softmax')(
     concatenate([x, h_in]))
-attention = multiply([density, attended])
-h2_ = add([h_out, Dense(units)(attention)])
-h_out_2 = Activation('tanh')(h2_)
-
-attention_cell = FunctionalRNNCell(
-    inputs=x,
-    outputs=h_out_2,
-    input_states=h_in,
-    output_states=h_out_2,
-    attended=attended
-)
-
+attention = multiply([attention_density, attended])
+h2_ = add([h_, Dense(units)(attention)])
+h2_out = Activation('tanh')(h2_)
+attention_cell = FunctionalRNNCell(inputs=x,
+                                   outputs=h2_out,
+                                   input_states=h_in,
+                                   output_states=h2_out,
+                                   attended=attended)
+
+# Pass the attentive cell to the AttentionRNN. 
Note that shape of attended is +# same as in cell (no time dimension added) attention_rnn = AttentionRNN(attention_cell) y2 = attention_rnn(x_sequence, attended=attended) -# Note that shape of c is same as in cell (no time dimension added) +# Apply it on some (mock) data attention_model = Model([x_sequence, attended], y2) - x_sequence_arr = np.random.randn(3, 5, input_size) attended_arr = np.random.randn(3, attended_shape[0]) y2_arr = attention_model.predict([x_sequence_arr, attended_arr]) diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py index 0caa07ce9ba8..8512be91926b 100644 --- a/keras/layers/recurrent.py +++ b/keras/layers/recurrent.py @@ -211,46 +211,45 @@ class FunctionalRNNCell(Wrapper): # Examples ```python - # Use functional API to define RNN Cell transformation (in this case - # simple vanilla RNN) for a single time step: - - units = 32 - input_size = 5 - x = Input((input_size,)) - h_tm1 = Input((units,)) - h_ = add([Dense(units)(x), Dense(units, use_bias=False)(h_tm1)]) - h = Activation('tanh')(h_) - - # Create the cell: - - cell = FunctionalRNNCell( - inputs=x, outputs=h, input_states=h_tm1, output_states=h) - - x_sequence = Input((None, input_size)) - rnn = RNN(cell) - y = rnn(x_sequence) - - # We can also define cells that attend to "external" constants - attended_shape = (10,) - attended = Input(attended_shape) - density = Dense(attended_shape[0], activation='softmax')( - concatenate([x, h_tm1])) - attention = multiply([density, attended]) - h2_ = add([h_, Dense(units)(attention)]) - h2 = Activation('tanh')(h2_) - - attention_cell = FunctionalRNNCell( - inputs=x, outputs=h2, input_states=h_tm1, output_states=h2, - attended=attended) - - attention_rnn = AttentionRNN(attention_cell) - y2 = attention_rnn(x_sequence, attended=attended) - - # Remember to pass the attended to the AttentionRNN layer (which will pass - # it on to the cell). Also note that shape of the attended is same as in - # cell (no time dimension added) - - attention_model = Model([x_sequence, attended], y2) + # Use functional API to define RNN Cell transformation (in this case + # simple vanilla RNN) for a single time step: + units = 32 + input_size = 5 + x = Input((input_size,)) + h_in = Input((units,)) + h_ = add([Dense(units)(x), Dense(units, use_bias=False)(h_in)]) + h_out = Activation('tanh')(h_) + cell = FunctionalRNNCell(inputs=x, + outputs=h_out, + input_states=h_in, + output_states=h_out) + + # Inject cell in RNN and apply to input sequence + x_sequence = Input((None, input_size)) + rnn = RNN(cell) + y = rnn(x_sequence) + + # Modify the cell to make use of attention (condition transformation on + # "external" constants such as an image or another sequence): + attended_shape = (10,) + attended = Input(attended_shape) + attention_density = Dense(attended_shape[0], activation='softmax')( + concatenate([x, h_in])) + attention = multiply([attention_density, attended]) + h2_ = add([h_, Dense(units)(attention)]) + h2_out = Activation('tanh')(h2_) + attention_cell = FunctionalRNNCell(inputs=x, + outputs=h2_out, + input_states=h_in, + output_states=h2_out, + attended=attended) + + # Pass the attentive cell to the AttentionRNN. 
Note that shape of + # attended is same as in cell (no time dimension added) + attention_rnn = AttentionRNN(attention_cell) + y2 = attention_rnn(x_sequence, attended=attended) + + attention_model = Model([x_sequence, attended], y2) ``` """ def __init__( @@ -265,6 +264,9 @@ def __init__( input_states = _to_list_or_none(input_states) output_states = _to_list_or_none(output_states) attended = _to_list_or_none(attended) + + # the same tensor should not be present multiple times in output of + # wrapped Model if outputs == output_states[0]: self.first_state_is_output = True model_outputs = output_states @@ -273,7 +275,6 @@ def __init__( ' first state') self.first_state_is_output = False model_outputs = [outputs] + output_states - model = Model( inputs=self._get_model_inputs(inputs, input_states, attended), outputs=model_outputs @@ -301,6 +302,10 @@ def call(self, inputs, states, attended=None): previous time step. attended: Tensor or list of tensors or None representing inputs that should be the same at each time step. + + # Returns + output: output of cell transformation + new_states: the updated cell states """ outputs = self.layer(self._get_model_inputs(inputs, states, attended)) if not isinstance(outputs, list): @@ -2169,27 +2174,23 @@ class AttentionRNN(RNN): attended = Input(attended_shape) # predict "attention density" based on input and previous state - density = Dense(attended_shape[0], activation='softmax')( + attention_density = Dense(attended_shape[0], activation='softmax')( concatenate([x, h_in])) - attention = multiply([density, attended]) + attention = multiply([attention_density, attended]) - h_ = add([ - Dense(units)(x), - Dense(units)(attention), - Dense(units, use_bias=False)(h_in) - ]) + h_ = add([Dense(units)(x), + Dense(units)(attention), + Dense(units, use_bias=False)(h_in)]) h_out = Activation('tanh')(h_) # create cell - attention_cell = FunctionalRNNCell( - inputs=x, - outputs=h_out, - input_states=[h_in], - output_states=[h_out], - attended=attended - ) + attention_cell = FunctionalRNNCell(inputs=x, + outputs=h_out, + input_states=[h_in], + output_states=[h_out], + attended=attended) - # apply on input sequence + # apply to input sequence x_sequence = Input((None, input_size)) attention_rnn = AttentionRNN(attention_cell) y = attention_rnn(x_sequence, attended=attended) diff --git a/tests/keras/layers/recurrent_test.py b/tests/keras/layers/recurrent_test.py index ca66ef42bde5..eab72ba44163 100644 --- a/tests/keras/layers/recurrent_test.py +++ b/tests/keras/layers/recurrent_test.py @@ -570,18 +570,18 @@ def test_batch_size_equal_one(layer_class): def test_attention_rnn(): - class RNNCellWithConstants(keras.layers.Layer): + class AttentionRNNCell(keras.layers.Layer): def __init__(self, units, **kwargs): self.units = units self.state_size = units - super(RNNCellWithConstants, self).__init__(**kwargs) + super(AttentionRNNCell, self).__init__(**kwargs) def build(self, input_shape): if not isinstance(input_shape, list): - raise TypeError('expects constants shape') - [input_shape, constant_shape] = input_shape - # will (and should) raise if more than one constant passed + raise TypeError('expects shape of attended') + [input_shape, attended_shape] = input_shape + # will (and should) raise if more than one attended tensor passed self.input_kernel = self.add_weight( shape=(input_shape[-1], self.units), @@ -591,10 +591,10 @@ def build(self, input_shape): shape=(self.units, self.units), initializer='uniform', name='recurrent_kernel') - self.constant_kernel = self.add_weight( - 
shape=(constant_shape[-1], self.units), + self.attended_kernel = self.add_weight( + shape=(attended_shape[-1], self.units), initializer='uniform', - name='constant_kernel') + name='attended_kernel') self.built = True def call(self, inputs, states, attended): @@ -602,19 +602,19 @@ def call(self, inputs, states, attended): [attended] = attended h_input = keras.backend.dot(inputs, self.input_kernel) h_state = keras.backend.dot(prev_output, self.recurrent_kernel) - h_const = keras.backend.dot(attended, self.constant_kernel) + h_const = keras.backend.dot(attended, self.attended_kernel) output = h_input + h_state + h_const return output, [output] def get_config(self): config = {'units': self.units} - base_config = super(RNNCellWithConstants, self).get_config() + base_config = super(AttentionRNNCell, self).get_config() return dict(list(base_config.items()) + list(config.items())) # Test basic case. x = keras.Input((None, 5)) attended = keras.Input((3,)) - cell = RNNCellWithConstants(32) + cell = AttentionRNNCell(32) layer = recurrent.AttentionRNN(cell) y = layer(x, attended=attended) model = keras.models.Model([x, attended], y) @@ -630,9 +630,8 @@ def get_config(self): y_np = model.predict([x_np, attended_np]) weights = model.get_weights() config = layer.get_config() - with keras.utils.CustomObjectScope( - {'RNNCellWithConstants': RNNCellWithConstants}): - layer = recurrent.AttentionRNN.from_config(config) + with keras.utils.CustomObjectScope({'AttentionRNNCell': AttentionRNNCell}): + layer = recurrent.AttentionRNN.from_config(config) y = layer(x, attended=attended) model = keras.models.Model([x, attended], y) model.set_weights(weights) @@ -650,9 +649,10 @@ def test_functional_rnn_cell(): h_tm1 = Input((units,)) h_ = layers.add([layers.Dense(units)(x), layers.Dense(units)(h_tm1)]) h = layers.Activation('tanh')(h_) - cell = recurrent.FunctionalRNNCell( - inputs=x, outputs=h, input_states=h_tm1, output_states=h) - + cell = recurrent.FunctionalRNNCell(inputs=x, + outputs=h, + input_states=h_tm1, + output_states=h) # Test basic case. x_seq = Input((None, input_size)) layer = recurrent.RNN(cell) @@ -672,18 +672,16 @@ def test_functional_rnn_cell_with_attended(): x = Input((input_size,)) h_tm1 = Input((units,)) attended = Input(constant_shape) - h_ = layers.add([ - layers.Dense(units)(x), - layers.Dense(units)(h_tm1), - layers.Dense(units)(attended) - ]) + h_ = layers.add([layers.Dense(units)(x), + layers.Dense(units)(h_tm1), + layers.Dense(units)(attended)]) h = layers.Activation('tanh')(h_) - cell = recurrent.FunctionalRNNCell( - inputs=x, outputs=h, input_states=h_tm1, output_states=h, - attended=attended - ) - + cell = recurrent.FunctionalRNNCell(inputs=x, + outputs=h, + input_states=h_tm1, + output_states=h, + attended=attended) # Test basic case. 
x_seq = Input((None, input_size)) layer = recurrent.AttentionRNN(cell) From ab89c6ae1983b643e12145af0c902b05a9c9d2db Mon Sep 17 00:00:00 2001 From: andhus Date: Sat, 21 Oct 2017 02:31:32 +0200 Subject: [PATCH 10/13] removed FunctionalRNNCell and AttentionRNN, added back support for constants in RNN --- examples/functional_rnn_cell.py | 53 --- keras/layers/recurrent.py | 634 +++++++-------------------- tests/keras/layers/recurrent_test.py | 106 ++--- 3 files changed, 183 insertions(+), 610 deletions(-) delete mode 100644 examples/functional_rnn_cell.py diff --git a/examples/functional_rnn_cell.py b/examples/functional_rnn_cell.py deleted file mode 100644 index 2c50212809ed..000000000000 --- a/examples/functional_rnn_cell.py +++ /dev/null @@ -1,53 +0,0 @@ -from __future__ import division, print_function - -import numpy as np - -from keras import Input -from keras.models import Model -from keras.layers import add, concatenate, multiply, Dense, Activation -from keras.layers.recurrent import FunctionalRNNCell, RNN, AttentionRNN - -units = 32 -input_size = 5 - -# Use functional API to define RNN Cell transformation (in this case -# simple vanilla RNN) for a single time step: -x = Input((input_size,)) -h_in = Input((units,)) -h_ = add([Dense(units)(x), Dense(units, use_bias=False)(h_in)]) -h_out = Activation('tanh')(h_) -cell = FunctionalRNNCell(inputs=x, - outputs=h_out, - input_states=h_in, - output_states=h_out) - -# Inject cell in RNN and apply to input sequence -x_sequence = Input((None, input_size)) -rnn = RNN(cell) -y = rnn(x_sequence) - -# Modify the cell to make use of attention (condition transformation on -# "external" constants such as an image or another sequence): -attended_shape = (10,) -attended = Input(attended_shape) -attention_density = Dense(attended_shape[0], activation='softmax')( - concatenate([x, h_in])) -attention = multiply([attention_density, attended]) -h2_ = add([h_, Dense(units)(attention)]) -h2_out = Activation('tanh')(h2_) -attention_cell = FunctionalRNNCell(inputs=x, - outputs=h2_out, - input_states=h_in, - output_states=h2_out, - attended=attended) - -# Pass the attentive cell to the AttentionRNN. Note that shape of attended is -# same as in cell (no time dimension added) -attention_rnn = AttentionRNN(attention_cell) -y2 = attention_rnn(x_sequence, attended=attended) - -# Apply it on some (mock) data -attention_model = Model([x_sequence, attended], y2) -x_sequence_arr = np.random.randn(3, 5, input_size) -attended_arr = np.random.randn(3, attended_shape[0]) -y2_arr = attention_model.predict([x_sequence_arr, attended_arr]) diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py index 8512be91926b..867563cd94a2 100644 --- a/keras/layers/recurrent.py +++ b/keras/layers/recurrent.py @@ -1,11 +1,8 @@ # -*- coding: utf-8 -*- from __future__ import absolute_import import numpy as np -import functools import warnings -from keras.engine import Model -from keras.layers.wrappers import Wrapper from .. import backend as K from .. import activations from .. import initializers @@ -196,142 +193,15 @@ def get_losses_for(self, inputs=None): return losses -class FunctionalRNNCell(Wrapper): - """Wrapper for allowing composition of RNN Cells using functional API. 
- - # Arguments: - inputs: input tensor at a single time step - outputs: output tensor at a single timestep - input_states: state tensor(s) from previous time step - output_states: state tensor(s) after cell transformation - attended: tensor(s) or None, represents inputs that should be static - (the same) for each time step. Used for implementing attention - mechanisms. - - # Examples - - ```python - # Use functional API to define RNN Cell transformation (in this case - # simple vanilla RNN) for a single time step: - units = 32 - input_size = 5 - x = Input((input_size,)) - h_in = Input((units,)) - h_ = add([Dense(units)(x), Dense(units, use_bias=False)(h_in)]) - h_out = Activation('tanh')(h_) - cell = FunctionalRNNCell(inputs=x, - outputs=h_out, - input_states=h_in, - output_states=h_out) - - # Inject cell in RNN and apply to input sequence - x_sequence = Input((None, input_size)) - rnn = RNN(cell) - y = rnn(x_sequence) - - # Modify the cell to make use of attention (condition transformation on - # "external" constants such as an image or another sequence): - attended_shape = (10,) - attended = Input(attended_shape) - attention_density = Dense(attended_shape[0], activation='softmax')( - concatenate([x, h_in])) - attention = multiply([attention_density, attended]) - h2_ = add([h_, Dense(units)(attention)]) - h2_out = Activation('tanh')(h2_) - attention_cell = FunctionalRNNCell(inputs=x, - outputs=h2_out, - input_states=h_in, - output_states=h2_out, - attended=attended) - - # Pass the attentive cell to the AttentionRNN. Note that shape of - # attended is same as in cell (no time dimension added) - attention_rnn = AttentionRNN(attention_cell) - y2 = attention_rnn(x_sequence, attended=attended) - - attention_model = Model([x_sequence, attended], y2) - ``` - """ - def __init__( - self, - inputs, - outputs, - input_states, - output_states, - attended=None, - **kwargs - ): - input_states = _to_list_or_none(input_states) - output_states = _to_list_or_none(output_states) - attended = _to_list_or_none(attended) - - # the same tensor should not be present multiple times in output of - # wrapped Model - if outputs == output_states[0]: - self.first_state_is_output = True - model_outputs = output_states - else: - warnings.warn('it is expected by RNN that output tensor is same as' - ' first state') - self.first_state_is_output = False - model_outputs = [outputs] + output_states - model = Model( - inputs=self._get_model_inputs(inputs, input_states, attended), - outputs=model_outputs - ) - super(FunctionalRNNCell, self).__init__(layer=model, **kwargs) - - in_states_shape = [K.int_shape(state) for state in input_states] - out_states_shape = [K.int_shape(state) for state in output_states] - if not in_states_shape == out_states_shape: - raise ValueError( - 'shape of input_states: {} are not same as shape of ' - 'output_states: {}'.format(in_states_shape, out_states_shape)) - self._state_size = [state_shape[-1] for state_shape in in_states_shape] - - @property - def state_size(self): - return self._state_size - - def call(self, inputs, states, attended=None): - """Defines the cell transformation for a single time step. - - # Arguments - inputs: Tensor representing input at current time step. - states: Tensor or list/tuple of tensors representing states from - previous time step. - attended: Tensor or list of tensors or None representing inputs - that should be the same at each time step. 
- - # Returns - output: output of cell transformation - new_states: the updated cell states - """ - outputs = self.layer(self._get_model_inputs(inputs, states, attended)) - if not isinstance(outputs, list): - # if a list of a single output is passed to Model it still - # just returns a tensor - outputs = [outputs] - output = outputs[0] - new_states = outputs if self.first_state_is_output else outputs[1:] - return output, new_states - - @staticmethod - def _get_model_inputs(inputs, input_states, attended): - inputs = [inputs] + list(input_states) - if attended is not None: - inputs += attended - - return inputs - - class RNN(Layer): """Base class for recurrent layers. # Arguments cell: A RNN cell instance. A RNN cell is a class that has: - a `call(input_at_t, states_at_t)` method, returning - `(output_at_t, states_at_t_plus_1)`. + `(output_at_t, states_at_t_plus_1)`. The call method of the + cell can also take the optional argument `constants`, see + section "Note on passing external constants" below. - a `state_size` attribute. This can be a single integer (single state) in which case it is the size of the recurrent state @@ -423,6 +293,14 @@ class RNN(Layer): `states` should be a numpy array or list of numpy arrays representing the initial state of the RNN layer. + # Note on passing external constants to RNNs + You can pass "external" constants to the cell using the `constants` + keyword argument of RNN.__call__ (as well as RNN.call) method. This + requires that the `cell.call` method accepts the same keyword argument + `constants`. Such constants can be used to condition the cell + transformation on additional static inputs (not changing over time) + (a.k.a. as attention mechanism). + # Examples ```python @@ -494,13 +372,11 @@ def __init__(self, cell, self.supports_masking = True self.input_spec = [InputSpec(ndim=3)] - if hasattr(self.cell.state_size, '__len__'): - self.state_spec = [InputSpec(shape=(None, dim)) - for dim in self.cell.state_size] - else: - self.state_spec = InputSpec(shape=(None, self.cell.state_size)) + self.state_spec = None self._states = None - + self.constants_spec = None + self._n_constants = None # used for splitting inputs after + # serialization of layer @property def states(self): if self._states is None: @@ -546,6 +422,13 @@ def compute_mask(self, inputs, mask): return output_mask def build(self, input_shape): + # Note input_shape will be list of shapes of initial states and + # constants if these are passed in __call__. 
+ if self._n_constants is not None: + constants_shape = input_shape[-self._n_constants:] + else: + constants_shape = None + if isinstance(input_shape, list): input_shape = input_shape[0] @@ -553,12 +436,32 @@ def build(self, input_shape): input_dim = input_shape[-1] self.input_spec[0] = InputSpec(shape=(batch_size, None, input_dim)) - if self.stateful: - self.reset_states() - + # allow cell (if layer) to build before we set or validate state_spec if isinstance(self.cell, Layer): step_input_shape = (input_shape[0],) + input_shape[2:] - self.cell.build(step_input_shape) + if constants_shape is not None: + self.cell.build([step_input_shape] + constants_shape) + else: + self.cell.build(step_input_shape) + + # set or validate state_spec + if hasattr(self.cell.state_size, '__len__'): + state_size = list(self.cell.state_size) + else: + state_size = [self.cell.state_size] + + if self.state_spec is not None: + # initial_state was passed in call, check compatibility + if not [spec.shape[-1] for spec in self.state_spec] == state_size: + raise ValueError( + 'an initial_state was passed that is not compatible with' + ' cell.state_size, state_spec: {}, cell.state_size:' + ' {}'.format(self.state_spec, self.cell.state_size)) + else: + self.state_spec = [InputSpec(shape=(None, dim)) + for dim in state_size] + if self.stateful: + self.reset_states() def get_initial_state(self, inputs): # build an all-zero tensor of shape (samples, output_dim) @@ -571,62 +474,68 @@ def get_initial_state(self, inputs): else: return [K.tile(initial_state, [1, self.cell.state_size])] - def __call__(self, inputs, initial_state=None, **kwargs): - # If there are multiple inputs, then - # they should be the main input and `initial_state` - # e.g. when loading model from file - if isinstance(inputs, (list, tuple)) and len(inputs) > 1 and initial_state is None: - initial_state = inputs[1:] - inputs = inputs[0] + def __call__(self, inputs, initial_state=None, constants=None, **kwargs): + inputs, initial_state, constants = self._normalize_args( + inputs, initial_state, constants) - # If `initial_state` is specified, - # and if it a Keras tensor, - # then add it to the inputs and temporarily - # modify the input spec to include the state. - if initial_state is None: + if initial_state is None and constants is None: return super(RNN, self).__call__(inputs, **kwargs) - if not isinstance(initial_state, (list, tuple)): - initial_state = [initial_state] + # If any of `initial_state` or `constants` are specified and are Keras + # tensors, then add them to the inputs and temporarily modify the + # input_spec to include them. 
- is_keras_tensor = hasattr(initial_state[0], '_keras_history') - for tensor in initial_state: + check_list = [] + if initial_state is not None: + kwargs['initial_state'] = initial_state + check_list += initial_state + self.state_spec = [InputSpec(shape=K.int_shape(state)) + for state in initial_state] + if constants is not None: + kwargs['constants'] = constants + check_list += constants + self.constants_spec = [InputSpec(shape=K.int_shape(constant)) + for constant in constants] + self._n_constants = len(constants) + # at this point check_list cannot be empty + is_keras_tensor = hasattr(check_list[0], '_keras_history') + for tensor in check_list: if hasattr(tensor, '_keras_history') != is_keras_tensor: - raise ValueError('The initial state of an RNN layer cannot be' - ' specified with a mix of Keras tensors and' - ' non-Keras tensors') + raise ValueError('The initial state and constants of an RNN' + ' layer cannot be specified with a mix of' + ' Keras tensors and non-Keras tensors') if is_keras_tensor: - # Compute the full input spec, including state - input_spec = self.input_spec - state_spec = self.state_spec - if not isinstance(input_spec, list): - input_spec = [input_spec] - if not isinstance(state_spec, list): - state_spec = [state_spec] - self.input_spec = input_spec + state_spec - - # Compute the full inputs, including state - inputs = [inputs] + list(initial_state) - - # Perform the call - output = super(RNN, self).__call__(inputs, **kwargs) - - # Restore original input spec - self.input_spec = input_spec + # Compute the full input spec, including state and constants + full_input = [inputs] + full_input_spec = self.input_spec + if initial_state: + full_input += initial_state + full_input_spec += self.state_spec + if constants: + full_input += constants + full_input_spec += self.constants_spec + # Perform the call with temporarily replaced input_spec + original_input_spec = self.input_spec + self.input_spec = full_input_spec + output = super(RNN, self).__call__(full_input, **kwargs) + self.input_spec = original_input_spec return output else: - kwargs['initial_state'] = initial_state return super(RNN, self).__call__(inputs, **kwargs) - def call(self, inputs, mask=None, training=None, initial_state=None): + def call(self, + inputs, + mask=None, + training=None, + initial_state=None, + constants=None): # input shape: `(samples, time (padded with zeros), input_dim)` # note that the .build() method of subclasses MUST define # self.input_spec and self.state_spec with complete input shapes. 
if isinstance(inputs, list): - initial_state = inputs[1:] inputs = inputs[0] - elif initial_state is not None: + if initial_state is not None: pass elif self.stateful: initial_state = self.states @@ -656,13 +565,27 @@ def call(self, inputs, mask=None, training=None, initial_state=None): 'the time dimension by passing a `shape` ' 'or `batch_shape` argument to your Input layer.') + kwargs = {} if has_arg(self.cell.call, 'training'): - step = functools.partial(self.cell.call, training=training) + kwargs['training'] = training + + if constants: + if not has_arg(self.cell.call, 'constants'): + raise ValueError('RNN cell does not support constants') + + def step(inputs, states): + constants = states[-self._n_constants:] + states = states[:-self._n_constants] + return self.cell.call(inputs, states, constants=constants, + **kwargs) else: - step = self.cell.call + def step(inputs, states): + return self.cell.call(inputs, states, **kwargs) + last_output, outputs, states = K.rnn(step, inputs, initial_state, + constants=constants, go_backwards=self.go_backwards, mask=mask, unroll=self.unroll, @@ -691,6 +614,48 @@ def call(self, inputs, mask=None, training=None, initial_state=None): else: return output + def _normalize_args(self, inputs, initial_state, constants): + """When running a model loaded from file, the input tensors + `initial_state` and `constants` can be passed to RNN.__call__ as part + of `inputs` in stead of by the dedicated keyword argumetes. In this + case `inputs` is a list of tensors of which the first one is the + actual (sequence) input followed by initial states, followed by + constants. + + This method makes sure initial_states and constants are separated from + inputs and that the are lists of tensors (or None). + + # Arguments + inputs: tensor of list/tuple of tensors + initial_state: tensor or list of tensors or None + constants: tensor or list of tensors or None + + # Returns + inputs: tensor + initial_state: list of tensors or None + constants: list of tensors or None + """ + if isinstance(inputs, list): + assert initial_state is None and constants is None + if self._n_constants is not None: + constants = inputs[-self._n_constants:] + inputs = inputs[:-self._n_constants] + if len(inputs) > 1: + initial_state = inputs[1:] + inputs = inputs[0] + + def to_list_or_none(x): # TODO break out? + if x is None or isinstance(x, list): + return x + if isinstance(x, tuple): + return list(x) + return [x] + + initial_state = to_list_or_none(initial_state) + constants = to_list_or_none(constants) + + return inputs, initial_state, constants + def reset_states(self, states=None): if not self.stateful: raise AttributeError('Layer must be stateful.') @@ -749,6 +714,9 @@ def get_config(self): 'go_backwards': self.go_backwards, 'stateful': self.stateful, 'unroll': self.unroll} + if self._n_constants is not None: + config['_n_constants'] = self._n_constants + cell_config = self.cell.get_config() config['cell'] = {'class_name': self.cell.__class__.__name__, 'config': cell_config} @@ -760,7 +728,10 @@ def from_config(cls, config, custom_objects=None): from . 
import deserialize as deserialize_layer cell = deserialize_layer(config.pop('cell'), custom_objects=custom_objects) - return cls(cell, **config) + n_constants = config.pop('_n_constants', None) + layer = cls(cell, **config) + layer._n_constants = n_constants + return layer @property def trainable_weights(self): @@ -2131,294 +2102,3 @@ def from_config(cls, config): if 'implementation' in config and config['implementation'] == 0: config['implementation'] = 1 return cls(**config) - - -class AttentionRNN(RNN): - """Base class for attentive recurrent layers. - - # Arguments - cell: A RNN cell instance supporting attention. It should implement: - - a `call(input_at_t, states_at_t, attended)` method, returning - `(output_at_t, states_at_t_plus_1)`. It must accept the keyword - argument `attended` which refers to the input(s) (tensor or - list of tensors) that is attended to and will be presented as a - whole at each timestep. - - a `state_size` attribute. This can be a single integer - (single state) in which case it is the size of the recurrent - state (which should be the same as the size of the cell - output). This can also be a list/tuple of integers - (one size per state). In this case, the first entry - (`state_size[0]`) should be the same as the size of the cell - output. - If the RNN cell is a keras layer, the input_shape passed to its - `build` method will be a list of the input shape of the regular - (sequence) input followed by the shape(s) of the attended. - **kwargs: See docs of super class RNN. - - # Input shapes - 3D tensor with shape `(batch_size, timesteps, input_dim)`, - (Optional) 2D tensors with shape `(batch_size, output_dim)`. - - # Attended shapes - ND tensor of the shape expected by the attentive cell. - - # Examples - - ```python - units = 32 - input_size = 5 - attended_shape = (10,) - - x = Input((input_size,)) - h_in = Input((units,)) - attended = Input(attended_shape) - - # predict "attention density" based on input and previous state - attention_density = Dense(attended_shape[0], activation='softmax')( - concatenate([x, h_in])) - attention = multiply([attention_density, attended]) - - h_ = add([Dense(units)(x), - Dense(units)(attention), - Dense(units, use_bias=False)(h_in)]) - h_out = Activation('tanh')(h_) - - # create cell - attention_cell = FunctionalRNNCell(inputs=x, - outputs=h_out, - input_states=[h_in], - output_states=[h_out], - attended=attended) - - # apply to input sequence - x_sequence = Input((None, input_size)) - attention_rnn = AttentionRNN(attention_cell) - y = attention_rnn(x_sequence, attended=attended) - - attention_model = Model([x_sequence, attended], y) - ``` - """ - - def __init__(self, cell, **kwargs): - if isinstance(cell, (list, tuple)): - # Note: not obvious how one would want to propagate the attended - # for stacked cells, user should stack them manually into a single - # cell - raise ValueError('AttentionRNN only supports a single cell') - super(AttentionRNN, self).__init__(cell=cell, **kwargs) - # we let base class check that cel has call function before checking - # for the additional argument - if not has_arg(cell.call, 'attended'): - raise ValueError('cell.call does not take the required keyword ' - 'argument attended') - - self.attended_spec = None - - def build(self, input_shape): - if isinstance(self.attended_spec, list): - attended_shapes = input_shape[-len(self.attended_spec):] - else: - attended_shapes = input_shape[-1:] - - input_shape = input_shape[0] - batch_size = input_shape[0] if self.stateful else None - input_dim = 
input_shape[-1] - self.input_spec[0] = InputSpec(shape=(batch_size, None, input_dim)) - - if self.stateful: - self.reset_states() - - if isinstance(self.cell, Layer): - step_input_shape = (input_shape[0],) + input_shape[2:] - self.cell.build([step_input_shape] + attended_shapes) - - def __call__(self, inputs, initial_state=None, attended=None, **kwargs): - # If there are multiple inputs, then they should be the main input, - # `initial_state` and `attended` - # TODO what is meant by "e.g. when loading model from file" in comment - # in base class RNN, can there be a problem if initial states are not - # passed in the Attentive RNN with respect to this!? - inputs, initial_state, attended = self._normalize_args( - inputs, initial_state, attended) - - if attended is None: - raise ValueError('attended input must be passed') - # we need to append attended spec to input spec below - self.attended_spec = [InputSpec(shape=K.int_shape(attended_)) - for attended_ in attended] - - if initial_state: - check_list = initial_state + attended - else: - check_list = attended - # at this point check_list cannot be empty - is_keras_tensor = hasattr(check_list[0], '_keras_history') - for tensor in check_list: - if hasattr(tensor, '_keras_history') != is_keras_tensor: - raise ValueError('The initial state and attended of an RNN' - ' layer cannot be specified with a mix of' - ' Keras tensors and non-Keras tensors') - - if is_keras_tensor: - # Compute the full input spec, including state and attended - input_spec = self.input_spec - state_spec = self.state_spec - if not isinstance(input_spec, list): - input_spec = [input_spec] - if not isinstance(state_spec, list): - state_spec = [state_spec] - self.input_spec = input_spec - inputs = [inputs] - if initial_state: - self.input_spec += state_spec - inputs += initial_state - kwargs['initial_state'] = initial_state - self.input_spec += self.attended_spec - inputs += attended - kwargs['attended'] = attended - - # Perform the call - output = Layer.__call__(self, inputs, **kwargs) - - # Restore original input spec - self.input_spec = input_spec - return output - else: - if initial_state: - kwargs['initial_state'] = initial_state - kwargs['attended'] = attended - return Layer.__call__(self, inputs, **kwargs) - - def call(self, - inputs, - mask=None, - training=None, - initial_state=None, - attended=None): - # TODO this method duplicates almost everything in RNN.call, - # better solution? - - # input shape: `(samples, time (padded with zeros), input_dim)` - # note that the .build() method of subclasses MUST define - # self.input_spec and self.state_spec with complete input shapes. - if isinstance(inputs, list): - inputs = inputs[0] - if initial_state is not None: - pass - elif self.stateful: - initial_state = self.states - else: - initial_state = self.get_initial_state(inputs) - - if isinstance(mask, list): - mask = mask[0] - - if len(initial_state) != len(self.states): - raise ValueError('Layer has ' + str(len(self.states)) + - ' states but was passed ' + - str(len(initial_state)) + - ' initial states.') - input_shape = K.int_shape(inputs) - timesteps = input_shape[1] - if self.unroll and timesteps in [None, 1]: - raise ValueError('Cannot unroll a RNN if the ' - 'time dimension is undefined or equal to 1. \n' - '- If using a Sequential model, ' - 'specify the time dimension by passing ' - 'an `input_shape` or `batch_input_shape` ' - 'argument to your first layer. 
If your ' - 'first layer is an Embedding, you can ' - 'also use the `input_length` argument.\n' - '- If using the functional API, specify ' - 'the time dimension by passing a `shape` ' - 'or `batch_shape` argument to your Input layer.') - - cell_kwargs = {'attended': attended} - if has_arg(self.cell.call, 'training'): - cell_kwargs['training'] = training - - # NOTE: by passing the attended implicitly into the K.rnn it is not - # possible for theano backend to optimise the scan op, see section: - # "Explicitly passing inputs of the inner function to scan" in: - # http://deeplearning.net/software/theano/library/scan.html#lib-scan-shared-variables - # but on the other hand we are not passing weights (shared variables) - # of the cell transformation anyway. - step = functools.partial(self.cell.call, **cell_kwargs) - - last_output, outputs, states = K.rnn(step, - inputs, - initial_state, - go_backwards=self.go_backwards, - mask=mask, - unroll=self.unroll, - input_length=timesteps) - if self.stateful: - updates = [] - for i in range(len(states)): - updates.append((self.states[i], states[i])) - self.add_update(updates, inputs) - - if self.return_sequences: - output = outputs - else: - output = last_output - - # Properly set learning phase - if getattr(last_output, '_uses_learning_phase', False): - output._uses_learning_phase = True - - if self.return_state: - if not isinstance(states, (list, tuple)): - states = [states] - else: - states = list(states) - return [output] + states - else: - return output - - def _normalize_args(self, inputs, initial_state, attended): - """The inputs `initial_state` and `attended` can be passed to - AttentionRNN.__call__ either by separate arguments or as part of - `inputs`. In this case `inputs` is a list of tensors of which the first - one is the actual (sequence) input followed by initial states followed - by the attended. - - This method separates and normalizes the different groups of inputs. - - # Arguments - inputs: tensor of list/tuple of tensors - initial_state: tensor or list of tensors or None - attended: tensor or list of tensors or None - - # Returns - inputs: tensor - initial_state: list of tensors or None - attended: list of tensors or None - """ - if isinstance(inputs, (list, tuple)): - remaining_inputs = inputs[1:] - inputs = inputs[0] - if remaining_inputs and initial_state is None: - if isinstance(self.state_spec, list): - n_states = len(self.state_spec) - else: - n_states = 1 - initial_state = remaining_inputs[:n_states] - remaining_inputs = remaining_inputs[n_states:] - if remaining_inputs and attended is None: - attended = remaining_inputs - if len(remaining_inputs) > 0: - raise ValueError('too many inputs were passed') - - initial_state = _to_list_or_none(initial_state) - attended = _to_list_or_none(attended) - - return inputs, initial_state, attended - - -def _to_list_or_none(x): # TODO move? 
Very similar to topology._to_list - if x is None or isinstance(x, list): - return x - if isinstance(x, tuple): - return list(x) - return [x] diff --git a/tests/keras/layers/recurrent_test.py b/tests/keras/layers/recurrent_test.py index eab72ba44163..24aa68d4d761 100644 --- a/tests/keras/layers/recurrent_test.py +++ b/tests/keras/layers/recurrent_test.py @@ -568,20 +568,20 @@ def test_batch_size_equal_one(layer_class): model.train_on_batch(x, y) -def test_attention_rnn(): +def test_rnn_cell_with_constants_layer(): - class AttentionRNNCell(keras.layers.Layer): + class RNNCellWithConstants(keras.layers.Layer): def __init__(self, units, **kwargs): self.units = units self.state_size = units - super(AttentionRNNCell, self).__init__(**kwargs) + super(RNNCellWithConstants, self).__init__(**kwargs) def build(self, input_shape): if not isinstance(input_shape, list): - raise TypeError('expects shape of attended') - [input_shape, attended_shape] = input_shape - # will (and should) raise if more than one attended tensor passed + raise TypeError('expects constants shape') + [input_shape, constant_shape] = input_shape + # will (and should) raise if more than one constant passed self.input_kernel = self.add_weight( shape=(input_shape[-1], self.units), @@ -591,33 +591,33 @@ def build(self, input_shape): shape=(self.units, self.units), initializer='uniform', name='recurrent_kernel') - self.attended_kernel = self.add_weight( - shape=(attended_shape[-1], self.units), + self.constant_kernel = self.add_weight( + shape=(constant_shape[-1], self.units), initializer='uniform', - name='attended_kernel') + name='constant_kernel') self.built = True - def call(self, inputs, states, attended): + def call(self, inputs, states, constants): [prev_output] = states - [attended] = attended + [constant] = constants h_input = keras.backend.dot(inputs, self.input_kernel) h_state = keras.backend.dot(prev_output, self.recurrent_kernel) - h_const = keras.backend.dot(attended, self.attended_kernel) + h_const = keras.backend.dot(constant, self.constant_kernel) output = h_input + h_state + h_const return output, [output] def get_config(self): config = {'units': self.units} - base_config = super(AttentionRNNCell, self).get_config() + base_config = super(RNNCellWithConstants, self).get_config() return dict(list(base_config.items()) + list(config.items())) # Test basic case. x = keras.Input((None, 5)) - attended = keras.Input((3,)) - cell = AttentionRNNCell(32) - layer = recurrent.AttentionRNN(cell) - y = layer(x, attended=attended) - model = keras.models.Model([x, attended], y) + c = keras.Input((3,)) + cell = RNNCellWithConstants(32) + layer = recurrent.RNN(cell) + y = layer(x, constants=c) + model = keras.models.Model([x, c], y) model.compile(optimizer='rmsprop', loss='mse') model.train_on_batch( [np.zeros((6, 5, 5)), np.zeros((6, 3))], @@ -626,73 +626,19 @@ def get_config(self): # Test basic case serialization. 
x_np = np.random.random((6, 5, 5)) - attended_np = np.random.random((6, 3)) - y_np = model.predict([x_np, attended_np]) + c_np = np.random.random((6, 3)) + y_np = model.predict([x_np, c_np]) weights = model.get_weights() config = layer.get_config() - with keras.utils.CustomObjectScope({'AttentionRNNCell': AttentionRNNCell}): - layer = recurrent.AttentionRNN.from_config(config) - y = layer(x, attended=attended) - model = keras.models.Model([x, attended], y) + with keras.utils.CustomObjectScope( + {'RNNCellWithConstants': RNNCellWithConstants}): + layer = recurrent.RNN.from_config(config) + y = layer(x, constants=c) + model = keras.models.Model([x, c], y) model.set_weights(weights) - y_np_2 = model.predict([x_np, attended_np]) + y_np_2 = model.predict([x_np, c_np]) assert_allclose(y_np, y_np_2, atol=1e-4) -def test_functional_rnn_cell(): - layers = keras.layers - - # Create the cell: - units = 8 - input_size = 5 - x = Input((input_size,)) - h_tm1 = Input((units,)) - h_ = layers.add([layers.Dense(units)(x), layers.Dense(units)(h_tm1)]) - h = layers.Activation('tanh')(h_) - cell = recurrent.FunctionalRNNCell(inputs=x, - outputs=h, - input_states=h_tm1, - output_states=h) - # Test basic case. - x_seq = Input((None, input_size)) - layer = recurrent.RNN(cell) - y = layer(x_seq) - model = keras.models.Model(x_seq, y) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch(np.zeros((6, 5, input_size)), np.zeros((6, units))) - - -def test_functional_rnn_cell_with_attended(): - layers = keras.layers - - # Create the cell: - units = 8 - input_size = 5 - constant_shape = (10,) - x = Input((input_size,)) - h_tm1 = Input((units,)) - attended = Input(constant_shape) - h_ = layers.add([layers.Dense(units)(x), - layers.Dense(units)(h_tm1), - layers.Dense(units)(attended)]) - h = layers.Activation('tanh')(h_) - - cell = recurrent.FunctionalRNNCell(inputs=x, - outputs=h, - input_states=h_tm1, - output_states=h, - attended=attended) - # Test basic case. 
- x_seq = Input((None, input_size)) - layer = recurrent.AttentionRNN(cell) - y = layer(x_seq, attended=attended) - model = keras.models.Model([x_seq, attended], y) - model.compile(optimizer='rmsprop', loss='mse') - model.train_on_batch( - [np.zeros((6, 5, input_size)), np.zeros((6, constant_shape[0]))], - np.zeros((6, units)) - ) - - if __name__ == '__main__': pytest.main([__file__]) From 95c2359b6fc1c447047fdcfbbbdcc614fcd8b1b1 Mon Sep 17 00:00:00 2001 From: andhus Date: Sun, 22 Oct 2017 11:07:46 +0200 Subject: [PATCH 11/13] fixed PEP8 violations --- keras/layers/recurrent.py | 4 ++-- tests/keras/layers/recurrent_test.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py index 867563cd94a2..31c670db9abf 100644 --- a/keras/layers/recurrent.py +++ b/keras/layers/recurrent.py @@ -375,8 +375,8 @@ def __init__(self, cell, self.state_spec = None self._states = None self.constants_spec = None - self._n_constants = None # used for splitting inputs after - # serialization of layer + self._n_constants = None + @property def states(self): if self._states is None: diff --git a/tests/keras/layers/recurrent_test.py b/tests/keras/layers/recurrent_test.py index 24aa68d4d761..d8256f9af851 100644 --- a/tests/keras/layers/recurrent_test.py +++ b/tests/keras/layers/recurrent_test.py @@ -630,8 +630,8 @@ def get_config(self): y_np = model.predict([x_np, c_np]) weights = model.get_weights() config = layer.get_config() - with keras.utils.CustomObjectScope( - {'RNNCellWithConstants': RNNCellWithConstants}): + custom_objects = {'RNNCellWithConstants': RNNCellWithConstants} + with keras.utils.CustomObjectScope(custom_objects): layer = recurrent.RNN.from_config(config) y = layer(x, constants=c) model = keras.models.Model([x, c], y) From 86fdd939079cb53e466ccc45485fc11dfd250b15 Mon Sep 17 00:00:00 2001 From: andhus Date: Wed, 25 Oct 2017 01:02:03 +0200 Subject: [PATCH 12/13] fixed minor review comments --- keras/layers/recurrent.py | 82 +++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 43 deletions(-) diff --git a/keras/layers/recurrent.py b/keras/layers/recurrent.py index 31c670db9abf..d3c1119fecd1 100644 --- a/keras/layers/recurrent.py +++ b/keras/layers/recurrent.py @@ -295,11 +295,11 @@ class RNN(Layer): # Note on passing external constants to RNNs You can pass "external" constants to the cell using the `constants` - keyword argument of RNN.__call__ (as well as RNN.call) method. This + keyword argument of `RNN.__call__` (as well as `RNN.call`) method. This requires that the `cell.call` method accepts the same keyword argument `constants`. Such constants can be used to condition the cell - transformation on additional static inputs (not changing over time) - (a.k.a. as attention mechanism). + transformation on additional static inputs (not changing over time), + a.k.a. an attention mechanism. # Examples @@ -375,7 +375,7 @@ def __init__(self, cell, self.state_spec = None self._states = None self.constants_spec = None - self._n_constants = None + self._num_constants = None @property def states(self): @@ -424,8 +424,8 @@ def compute_mask(self, inputs, mask): def build(self, input_shape): # Note input_shape will be list of shapes of initial states and # constants if these are passed in __call__. 
- if self._n_constants is not None: - constants_shape = input_shape[-self._n_constants:] + if self._num_constants is not None: + constants_shape = input_shape[-self._num_constants:] else: constants_shape = None @@ -475,7 +475,7 @@ def get_initial_state(self, inputs): return [K.tile(initial_state, [1, self.cell.state_size])] def __call__(self, inputs, initial_state=None, constants=None, **kwargs): - inputs, initial_state, constants = self._normalize_args( + inputs, initial_state, constants = self._standardize_args( inputs, initial_state, constants) if initial_state is None and constants is None: @@ -485,36 +485,33 @@ def __call__(self, inputs, initial_state=None, constants=None, **kwargs): # tensors, then add them to the inputs and temporarily modify the # input_spec to include them. - check_list = [] + additional_inputs = [] + additional_specs = [] if initial_state is not None: kwargs['initial_state'] = initial_state - check_list += initial_state + additional_inputs += initial_state self.state_spec = [InputSpec(shape=K.int_shape(state)) for state in initial_state] + additional_specs += self.state_spec if constants is not None: kwargs['constants'] = constants - check_list += constants + additional_inputs += constants self.constants_spec = [InputSpec(shape=K.int_shape(constant)) for constant in constants] - self._n_constants = len(constants) - # at this point check_list cannot be empty - is_keras_tensor = hasattr(check_list[0], '_keras_history') - for tensor in check_list: + self._num_constants = len(constants) + additional_specs += self.constants_spec + # at this point additional_inputs cannot be empty + is_keras_tensor = hasattr(additional_inputs[0], '_keras_history') + for tensor in additional_inputs: if hasattr(tensor, '_keras_history') != is_keras_tensor: - raise ValueError('The initial state and constants of an RNN' + raise ValueError('The initial state or constants of an RNN' ' layer cannot be specified with a mix of' ' Keras tensors and non-Keras tensors') if is_keras_tensor: # Compute the full input spec, including state and constants - full_input = [inputs] - full_input_spec = self.input_spec - if initial_state: - full_input += initial_state - full_input_spec += self.state_spec - if constants: - full_input += constants - full_input_spec += self.constants_spec + full_input = [inputs] + additional_inputs + full_input_spec = self.input_spec + additional_specs # Perform the call with temporarily replaced input_spec original_input_spec = self.input_spec self.input_spec = full_input_spec @@ -574,8 +571,8 @@ def call(self, raise ValueError('RNN cell does not support constants') def step(inputs, states): - constants = states[-self._n_constants:] - states = states[:-self._n_constants] + constants = states[-self._num_constants:] + states = states[:-self._num_constants] return self.cell.call(inputs, states, constants=constants, **kwargs) else: @@ -614,19 +611,18 @@ def step(inputs, states): else: return output - def _normalize_args(self, inputs, initial_state, constants): - """When running a model loaded from file, the input tensors - `initial_state` and `constants` can be passed to RNN.__call__ as part - of `inputs` in stead of by the dedicated keyword argumetes. In this - case `inputs` is a list of tensors of which the first one is the - actual (sequence) input followed by initial states, followed by - constants. + def _standardize_args(self, inputs, initial_state, constants): + """Brings the arguments of `__call__` that can contain input tensors to + standard format. 
- This method makes sure initial_states and constants are separated from - inputs and that the are lists of tensors (or None). + When running a model loaded from file, the input tensors + `initial_state` and `constants` can be passed to `RNN.__call__` as part + of `inputs` instead of by the dedicated keyword arguments. This method + makes sure the arguments are separated and that `initial_state` and + `constants` are lists of tensors (or None). # Arguments - inputs: tensor of list/tuple of tensors + inputs: tensor or list/tuple of tensors initial_state: tensor or list of tensors or None constants: tensor or list of tensors or None @@ -637,14 +633,14 @@ def _normalize_args(self, inputs, initial_state, constants): """ if isinstance(inputs, list): assert initial_state is None and constants is None - if self._n_constants is not None: - constants = inputs[-self._n_constants:] - inputs = inputs[:-self._n_constants] + if self._num_constants is not None: + constants = inputs[-self._num_constants:] + inputs = inputs[:-self._num_constants] if len(inputs) > 1: initial_state = inputs[1:] inputs = inputs[0] - def to_list_or_none(x): # TODO break out? + def to_list_or_none(x): if x is None or isinstance(x, list): return x if isinstance(x, tuple): @@ -714,8 +710,8 @@ def get_config(self): 'go_backwards': self.go_backwards, 'stateful': self.stateful, 'unroll': self.unroll} - if self._n_constants is not None: - config['_n_constants'] = self._n_constants + if self._num_constants is not None: + config['num_constants'] = self._num_constants cell_config = self.cell.get_config() config['cell'] = {'class_name': self.cell.__class__.__name__, @@ -728,9 +724,9 @@ def from_config(cls, config, custom_objects=None): from . import deserialize as deserialize_layer cell = deserialize_layer(config.pop('cell'), custom_objects=custom_objects) - n_constants = config.pop('_n_constants', None) + num_constants = config.pop('num_constants', None) layer = cls(cell, **config) - layer._n_constants = n_constants + layer._num_constants = num_constants return layer @property From d33d919590f66ae7770be8f23e4746b037fbe2c5 Mon Sep 17 00:00:00 2001 From: andhus Date: Wed, 25 Oct 2017 01:47:21 +0200 Subject: [PATCH 13/13] added test case for when both inital_state and constants are passed to RNN.__call__ --- tests/keras/layers/recurrent_test.py | 99 +++++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/tests/keras/layers/recurrent_test.py b/tests/keras/layers/recurrent_test.py index d8256f9af851..19d318a060a3 100644 --- a/tests/keras/layers/recurrent_test.py +++ b/tests/keras/layers/recurrent_test.py @@ -632,13 +632,110 @@ def get_config(self): config = layer.get_config() custom_objects = {'RNNCellWithConstants': RNNCellWithConstants} with keras.utils.CustomObjectScope(custom_objects): - layer = recurrent.RNN.from_config(config) + layer = recurrent.RNN.from_config(config.copy()) y = layer(x, constants=c) model = keras.models.Model([x, c], y) model.set_weights(weights) y_np_2 = model.predict([x_np, c_np]) assert_allclose(y_np, y_np_2, atol=1e-4) + # test flat list inputs + with keras.utils.CustomObjectScope(custom_objects): + layer = recurrent.RNN.from_config(config.copy()) + y = layer([x, c]) + model = keras.models.Model([x, c], y) + model.set_weights(weights) + y_np_3 = model.predict([x_np, c_np]) + assert_allclose(y_np, y_np_3, atol=1e-4) + + +def test_rnn_cell_with_constants_layer_passing_initial_state(): + + class RNNCellWithConstants(keras.layers.Layer): + + def __init__(self, units, **kwargs): + 
self.units = units + self.state_size = units + super(RNNCellWithConstants, self).__init__(**kwargs) + + def build(self, input_shape): + if not isinstance(input_shape, list): + raise TypeError('expects constants shape') + [input_shape, constant_shape] = input_shape + # will (and should) raise if more than one constant passed + + self.input_kernel = self.add_weight( + shape=(input_shape[-1], self.units), + initializer='uniform', + name='kernel') + self.recurrent_kernel = self.add_weight( + shape=(self.units, self.units), + initializer='uniform', + name='recurrent_kernel') + self.constant_kernel = self.add_weight( + shape=(constant_shape[-1], self.units), + initializer='uniform', + name='constant_kernel') + self.built = True + + def call(self, inputs, states, constants): + [prev_output] = states + [constant] = constants + h_input = keras.backend.dot(inputs, self.input_kernel) + h_state = keras.backend.dot(prev_output, self.recurrent_kernel) + h_const = keras.backend.dot(constant, self.constant_kernel) + output = h_input + h_state + h_const + return output, [output] + + def get_config(self): + config = {'units': self.units} + base_config = super(RNNCellWithConstants, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + # Test basic case. + x = keras.Input((None, 5)) + c = keras.Input((3,)) + s = keras.Input((32,)) + cell = RNNCellWithConstants(32) + layer = recurrent.RNN(cell) + y = layer(x, initial_state=s, constants=c) + model = keras.models.Model([x, s, c], y) + model.compile(optimizer='rmsprop', loss='mse') + model.train_on_batch( + [np.zeros((6, 5, 5)), np.zeros((6, 32)), np.zeros((6, 3))], + np.zeros((6, 32)) + ) + + # Test basic case serialization. + x_np = np.random.random((6, 5, 5)) + s_np = np.random.random((6, 32)) + c_np = np.random.random((6, 3)) + y_np = model.predict([x_np, s_np, c_np]) + weights = model.get_weights() + config = layer.get_config() + custom_objects = {'RNNCellWithConstants': RNNCellWithConstants} + with keras.utils.CustomObjectScope(custom_objects): + layer = recurrent.RNN.from_config(config.copy()) + y = layer(x, initial_state=s, constants=c) + model = keras.models.Model([x, s, c], y) + model.set_weights(weights) + y_np_2 = model.predict([x_np, s_np, c_np]) + assert_allclose(y_np, y_np_2, atol=1e-4) + + # verify that state is used + y_np_2_different_s = model.predict([x_np, s_np + 10., c_np]) + with pytest.raises(AssertionError): + assert_allclose(y_np, y_np_2_different_s, atol=1e-4) + + # test flat list inputs + with keras.utils.CustomObjectScope(custom_objects): + layer = recurrent.RNN.from_config(config.copy()) + y = layer([x, s, c]) + model = keras.models.Model([x, s, c], y) + model.set_weights(weights) + y_np_3 = model.predict([x_np, s_np, c_np]) + assert_allclose(y_np, y_np_3, atol=1e-4) + if __name__ == '__main__': pytest.main([__file__])
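
The end state of this patch series can be exercised with a short stand-alone script. The sketch below is illustrative only and is not part of any patch: the cell name `MinimalCellWithConstants` and all shapes are invented for the example, and it assumes a Keras build that includes the changes above; it simply condenses the `RNNCellWithConstants` test into user-facing form.

```python
import numpy as np
import keras
from keras.layers import recurrent


class MinimalCellWithConstants(keras.layers.Layer):
    """Vanilla RNN cell that is additionally conditioned on one constant."""

    def __init__(self, units, **kwargs):
        self.units = units
        self.state_size = units
        super(MinimalCellWithConstants, self).__init__(**kwargs)

    def build(self, input_shape):
        # When constants are passed to RNN.__call__, the cell is built with
        # [step_input_shape, constant_shape].
        [step_input_shape, constant_shape] = input_shape
        self.kernel = self.add_weight(
            shape=(step_input_shape[-1], self.units),
            initializer='uniform', name='kernel')
        self.recurrent_kernel = self.add_weight(
            shape=(self.units, self.units),
            initializer='uniform', name='recurrent_kernel')
        self.constant_kernel = self.add_weight(
            shape=(constant_shape[-1], self.units),
            initializer='uniform', name='constant_kernel')
        self.built = True

    def call(self, inputs, states, constants):
        [prev_output] = states
        [constant] = constants  # the same tensor is presented at every step
        output = (keras.backend.dot(inputs, self.kernel) +
                  keras.backend.dot(prev_output, self.recurrent_kernel) +
                  keras.backend.dot(constant, self.constant_kernel))
        return output, [output]

    def get_config(self):
        config = {'units': self.units}
        base_config = super(MinimalCellWithConstants, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


# The constant is passed to the RNN layer, which forwards it to the cell.
x = keras.Input((None, 5))   # input sequence
c = keras.Input((3,))        # static "external" constant, no time dimension
layer = recurrent.RNN(MinimalCellWithConstants(units=8))
y = layer(x, constants=c)

model = keras.models.Model([x, c], y)
model.compile(optimizer='rmsprop', loss='mse')
model.train_on_batch([np.zeros((6, 4, 5)), np.zeros((6, 3))],
                     np.zeros((6, 8)))
```

As the tests above demonstrate, serialization follows the same path as for other layers: `get_config()` records `num_constants`, so a layer restored via `RNN.from_config` can be called either as `layer(x, constants=c)` or with the flat list form `layer([x, c])`.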