From 361fb42b9ad571bd3e9cfbed0d1dda5d75c8b6a1 Mon Sep 17 00:00:00 2001 From: Scott Cyphers Date: Wed, 31 Oct 2018 08:33:27 -0700 Subject: [PATCH 01/10] Doc for Batchnorm, fix argument order, cleanup some comments --- doc/sphinx/ngraph.doxyfile | 2 +- doc/sphinx/source/ops/batch_norm.rst | 105 - .../source/ops/batch_norm_inference.rst | 80 + doc/sphinx/source/ops/batch_norm_training.rst | 89 + .../ops/batch_norm_training_backprop.rst | 71 + doc/sphinx/source/ops/index.rst | 8 +- src/ngraph/descriptor/input.hpp | 4 +- .../descriptor/layout/tensor_layout.hpp | 4 +- src/ngraph/descriptor/output.hpp | 2 +- src/ngraph/descriptor/tensor.hpp | 2 +- src/ngraph/op/batch_norm.cpp | 180 +- src/ngraph/op/batch_norm.hpp | 33 +- src/ngraph/serializer.cpp | 6 +- test/autodiff.in.cpp | 2 +- test/backend_test.in.cpp | 22 +- test/backend_test.in.cpp-9bfce850 | 5571 +++++++++++++++++ test/cpu_fusion.cpp | 10 +- test/cpu_fusion.cpp-41c1ba06 | 3132 +++++++++ test/type_prop.cpp | 80 +- 19 files changed, 9148 insertions(+), 255 deletions(-) delete mode 100644 doc/sphinx/source/ops/batch_norm.rst create mode 100644 doc/sphinx/source/ops/batch_norm_inference.rst create mode 100644 doc/sphinx/source/ops/batch_norm_training.rst create mode 100644 doc/sphinx/source/ops/batch_norm_training_backprop.rst create mode 100644 test/backend_test.in.cpp-9bfce850 create mode 100644 test/cpu_fusion.cpp-41c1ba06 diff --git a/doc/sphinx/ngraph.doxyfile b/doc/sphinx/ngraph.doxyfile index 88da4a97846..ffea9942355 100644 --- a/doc/sphinx/ngraph.doxyfile +++ b/doc/sphinx/ngraph.doxyfile @@ -1807,7 +1807,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. 
-INCLUDE_PATH = +INCLUDE_PATH = ../../src # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the diff --git a/doc/sphinx/source/ops/batch_norm.rst b/doc/sphinx/source/ops/batch_norm.rst deleted file mode 100644 index 53b117f389a..00000000000 --- a/doc/sphinx/source/ops/batch_norm.rst +++ /dev/null @@ -1,105 +0,0 @@ -.. batch_norm.rst: - -######### -BatchNorm -######### - -.. code-block:: cpp - - BatchNorm // Produces a normalized output - - -Description -=========== - -Produces a normalized output. - -Inputs ------- - -+---------------------+-------------------------+-----------------------------+ -| Name | Element Type | Shape | -+=====================+=========================+=============================+ -| ``input`` | same as ``gamma`` | \(..., C, ...\) | -+---------------------+-------------------------+-----------------------------+ -| ``gamma`` | any | \(C\) | -+---------------------+-------------------------+-----------------------------+ -| ``beta`` | same as ``gamma`` | \(C\) | -+---------------------+-------------------------+-----------------------------+ -| ``global_mean`` | same as ``gamma`` | \(C\) | -+---------------------+-------------------------+-----------------------------+ -| ``global_variance`` | same as ``gamma`` | \(C\) | -+---------------------+-------------------------+-----------------------------+ -| ``use_global`` | ``bool`` | \(\) | -+---------------------+-------------------------+-----------------------------+ - - -Attributes ----------- - -+------------------+--------------------+---------------------+ -| Name | Type | Notes | -+==================+====================+=====================+ -| ``epsilon`` | same as ``input`` | Bias for variance | -+------------------+--------------------+---------------------+ -| ``channel_axis`` | size_t | Channel axis | -+------------------+--------------------+---------------------+ - -Outputs -------- - 
-+---------------------+-------------------------+-----------------------------+ -| Name | Element Type | Shape | -+=====================+=========================+=============================+ -| ``normalized`` | same as ``gamma`` | same as ``input`` | -+---------------------+-------------------------+-----------------------------+ -| ``batch_mean`` | same as ``gamma`` | \(C\) | -+---------------------+-------------------------+-----------------------------+ -| ``batch_variance`` | same as ``gamma`` | \(C\) | -+---------------------+-------------------------+-----------------------------+ - -The ``batch_mean`` and ``batch_variance`` outputs are computed per-channel from -``input``. The values only need to be computed if ``use_global`` is ``false``, -or if they are used. - - -Mathematical Definition -======================= - -The axes of the input fall into two categories: positional and channel, with -channel being axis 1. For each position, there are :math:`C` channel values, -each normalized independently. - -Normalization of a channel sample is controlled by two values: - -* the mean :math:`\mu`, and -* the variance :math:`\sigma^2`; - -and by two scaling attributes: :math:`\gamma` and :math:`\beta`. - -The values for :math:`\mu` and :math:`\sigma^2` come either from computing the -mean and variance of ``input``, or from ``global_mean`` and ``global_variance``, -depending on the value of ``use_global``. - -.. math:: - - y_c = \frac{x_c-\mu_c}{\sqrt{\sigma^2_c+\epsilon}}\gamma_c+\beta_c - -The mean and variance can be arguments, or they may be computed for each channel -of ``input`` over the positional axes. When computed from ``input``, the mean -and variance per-channel are available as outputs. - - -C++ Interface -============== - -.. doxygenclass:: ngraph::op::BatchNormTraining - :project: ngraph - :members: - - -.. 
doxygenclass:: ngraph::op::BatchNormInference - :project: ngraph - :members: - - diff --git a/doc/sphinx/source/ops/batch_norm_inference.rst b/doc/sphinx/source/ops/batch_norm_inference.rst new file mode 100644 index 00000000000..9017ac19c20 --- /dev/null +++ b/doc/sphinx/source/ops/batch_norm_inference.rst @@ -0,0 +1,80 @@ +.. batch_norm_inference.rst: + +################## +BatchNormInference +################## + +.. code-block:: cpp + + BatchNormInference // Adjust input for mean and variance + + +Description +=========== + + + +Inputs +------ + ++---------------------+-------------------------+------------------------------+ +| Name | Element Type | Shape | ++=====================+=========================+==============================+ +| ``input`` | real | :math:`(\bullet, C, \ldots)` | ++---------------------+-------------------------+------------------------------+ +| ``gamma`` | same as ``input`` | :math:`(C)` | ++---------------------+-------------------------+------------------------------+ +| ``beta`` | same as ``input`` | :math:`(C)` | ++---------------------+-------------------------+------------------------------+ +| ``mean`` | same as ``input`` | :math:`(C)` | ++---------------------+-------------------------+------------------------------+ +| ``variance`` | same as ``input`` | :math:`(C)` | ++---------------------+-------------------------+------------------------------+ + + +Attributes +---------- + ++------------------+--------------------+--------------------------------------------------------+ +| Name | Type | Notes | ++==================+====================+========================================================+ +| ``epsilon`` | ``double`` | Small bias added to variance to avoid division by 0. 
| ++------------------+--------------------+--------------------------------------------------------+ + +Outputs +------- + ++---------------------+-------------------------+-----------------------------+ +| Name | Element Type | Shape | ++=====================+=========================+=============================+ +| ``normalized`` | same as ``gamma`` | Same as ``input`` | ++---------------------+-------------------------+-----------------------------+ + +Mathematical Definition +======================= + +The axes of the input fall into two categories: positional and channel, with +channel being axis 1. For each position, there are :math:`C` channel values, +each normalized independently. + +Normalization of a channel sample is controlled by two values: + +* the `mean` :math:`\mu`, and + +* the `variance` :math:`\sigma^2`; + +and by two scaling attributes: :math:`\gamma` and :math:`\beta`. + +.. math:: + + \mathtt{normalized}_{\bullet, c, \ldots} = \frac{\mathtt{input}_{\bullet, c, \ldots}-\mu_c}{\sqrt{\sigma^2_c+\epsilon}}\gamma_c+\beta_c + + +C++ Interface +============== + +.. doxygenclass:: ngraph::op::BatchNormInference + :project: ngraph + :members: + + diff --git a/doc/sphinx/source/ops/batch_norm_training.rst b/doc/sphinx/source/ops/batch_norm_training.rst new file mode 100644 index 00000000000..a458d54a061 --- /dev/null +++ b/doc/sphinx/source/ops/batch_norm_training.rst @@ -0,0 +1,89 @@ +.. batch_norm_training.rst: + +################# +BatchNormTraining +################# + +.. code-block:: cpp + + BatchNormTraining // Compute mean and variance from the input. 
+ + +Description +=========== + + + +Inputs +------ + ++---------------------+-------------------------+------------------------------+ +| Name | Element Type | Shape | ++=====================+=========================+==============================+ +| ``input`` | real | :math:`(\bullet, C, \ldots)` | ++---------------------+-------------------------+------------------------------+ +| ``gamma`` | same as ``input`` | :math:`(C)` | ++---------------------+-------------------------+------------------------------+ +| ``beta`` | same as ``input`` | :math:`(C)` | ++---------------------+-------------------------+------------------------------+ + + +Attributes +---------- + ++------------------+--------------------+--------------------------------------------------------+ +| Name | Type | Notes | ++==================+====================+========================================================+ +| ``epsilon`` | ``double`` | Small bias added to variance to avoid division by 0. | ++------------------+--------------------+--------------------------------------------------------+ + +Outputs +------- + ++---------------------+-------------------------+-----------------------------+ +| Name | Element Type | Shape | ++=====================+=========================+=============================+ +| ``normalized`` | same as ``gamma`` | Same as ``input`` | ++---------------------+-------------------------+-----------------------------+ +| ``batch_mean`` | same as ``gamma`` | :math:`(C)` | ++---------------------+-------------------------+-----------------------------+ +| ``batch_variance`` | same as ``gamma`` | :math:`(C)` | ++---------------------+-------------------------+-----------------------------+ + +The ``batch_mean`` and ``batch_variance`` outputs are computed per-channel from +``input``. + + +Mathematical Definition +======================= + +The axes of the input fall into two categories: positional and channel, with +channel being axis 1. 
For each position, there are :math:`C` channel values, +each normalized independently. + +Normalization of a channel sample is controlled by two values: + +* the `batch_mean` :math:`\mu`, and + +* the `batch_variance` :math:`\sigma^2`; + +and by two scaling attributes: :math:`\gamma` and :math:`\beta`. + +The values for :math:`\mu` and :math:`\sigma^2` come from computing the +mean and variance of ``input``. + +.. math:: + + \mu_c &= \mathop{\mathbb{E}}\left(\mathtt{input}_{\bullet, c, \ldots}\right)\\ + \sigma^2_c &= \mathop{\mathtt{Var}}\left(\mathtt{input}_{\bullet, c, \ldots}\right)\\ + \mathtt{normalized}_{\bullet, c, \ldots} &= \frac{\mathtt{input}_{\bullet, c, \ldots}-\mu_c}{\sqrt{\sigma^2_c+\epsilon}}\gamma_c+\beta_c + + +C++ Interface +============== + +.. doxygenclass:: ngraph::op::BatchNormTraining + :project: ngraph + :members: + + diff --git a/doc/sphinx/source/ops/batch_norm_training_backprop.rst b/doc/sphinx/source/ops/batch_norm_training_backprop.rst new file mode 100644 index 00000000000..68004bbf092 --- /dev/null +++ b/doc/sphinx/source/ops/batch_norm_training_backprop.rst @@ -0,0 +1,71 @@ +.. batch_norm_training_backprop.rst: + +######################### +BatchNormTrainingBackprop +######################### + +.. code-block:: cpp + + BatchNormTrainingBackprop // Compute mean and variance backprop from the input. 
+ + +Description +=========== + + + +Inputs +------ + ++----------------------+-------------------------+------------------------------+ +| Name | Element Type | Shape | ++======================+=========================+==============================+ +| ``input`` | real | :math:`(\bullet, C, \ldots)` | ++----------------------+-------------------------+------------------------------+ +| ``gamma`` | same as ``input`` | :math:`(C)` | ++----------------------+-------------------------+------------------------------+ +| ``beta`` | same as ``input`` | :math:`(C)` | ++----------------------+-------------------------+------------------------------+ +| ``mean`` | same as ``input`` | :math:`(C)` | ++----------------------+-------------------------+------------------------------+ +| ``variance`` | same as ``input`` | :math:`(C)` | ++----------------------+-------------------------+------------------------------+ +| ``normalized_delta`` | same as ``input`` | Same as ``input`` | ++----------------------+-------------------------+------------------------------+ + + +Attributes +---------- + ++------------------+--------------------+--------------------------------------------------------+ +| Name | Type | Notes | ++==================+====================+========================================================+ +| ``epsilon`` | ``double`` | Small bias added to variance to avoid division by 0. 
| ++------------------+--------------------+--------------------------------------------------------+ + +Outputs +------- + ++---------------------+-------------------------+-----------------------------+ +| Name | Element Type | Shape | ++=====================+=========================+=============================+ +| ``input_delta`` | same as ``input`` | Same as ``input`` | ++---------------------+-------------------------+-----------------------------+ +| ``gamma_delta`` | same as ``gamma`` | :math:`(C)` | ++---------------------+-------------------------+-----------------------------+ +| ``beta_delta`` | same as ``beta`` | :math:`(C)` | ++---------------------+-------------------------+-----------------------------+ + + +Mathematical Definition +======================= + + +C++ Interface +============== + +.. doxygenclass:: ngraph::op::BatchNormTrainingBackprop + :project: ngraph + :members: + + diff --git a/doc/sphinx/source/ops/index.rst b/doc/sphinx/source/ops/index.rst index 3b70d1e058f..8f0423dd082 100644 --- a/doc/sphinx/source/ops/index.rst +++ b/doc/sphinx/source/ops/index.rst @@ -56,7 +56,9 @@ Not currently a comprehensive list. * :doc:`atan` * :doc:`avg_pool` * :doc:`avg_pool_backprop` - * :doc:`batch_norm` + * :doc:`batch_norm_inference` + * :doc:`batch_norm_training` + * :doc:`batch_norm_training_backprop` * :doc:`broadcast` * :doc:`ceiling` * :doc:`concat` @@ -123,7 +125,9 @@ Not currently a comprehensive list. 
atan.rst avg_pool.rst avg_pool_backprop.rst - batch_norm.rst + batch_norm_inference.rst + batch_norm_training.rst + batch_norm_training_backprop.rst broadcast.rst ceiling.rst concat.rst diff --git a/src/ngraph/descriptor/input.hpp b/src/ngraph/descriptor/input.hpp index e4ef4508778..74d96cd81ba 100644 --- a/src/ngraph/descriptor/input.hpp +++ b/src/ngraph/descriptor/input.hpp @@ -60,10 +60,10 @@ namespace ngraph void replace_output(Output& output); protected: - /// \return the tensor view for the connected output + /// \return the tensor for the connected output std::shared_ptr get_tensor_ptr() const; - /// \return the tensor view for the connected output + /// \return the tensor for the connected output std::shared_ptr get_tensor_ptr(); public: diff --git a/src/ngraph/descriptor/layout/tensor_layout.hpp b/src/ngraph/descriptor/layout/tensor_layout.hpp index 780a31d501c..5dc38fc5763 100644 --- a/src/ngraph/descriptor/layout/tensor_layout.hpp +++ b/src/ngraph/descriptor/layout/tensor_layout.hpp @@ -32,7 +32,7 @@ namespace ngraph { namespace layout { - /// \brief Interface for describing implementations of tensor views. + /// \brief Interface for describing implementations of tensors. /// /// Kernel selection will need to pay attention to the layout. class TensorLayout @@ -44,7 +44,7 @@ namespace ngraph public: virtual ~TensorLayout() {} - /// Extent of this view in buffer. + /// Extent of this tensor in buffer. /// /// When we support non-linear buffers, this will need to be something other than size_t. size_t get_size() const; diff --git a/src/ngraph/descriptor/output.hpp b/src/ngraph/descriptor/output.hpp index b145fb528bb..f251a0b84ed 100644 --- a/src/ngraph/descriptor/output.hpp +++ b/src/ngraph/descriptor/output.hpp @@ -39,7 +39,7 @@ namespace ngraph public: /// \param node Node that owns this output. 
/// \param index Position of the output tensor in all output tensors - /// \param tensor The view of this tensor; where the value will be written + /// \param tensor The tensor where the value will be written Output(Node* node, size_t index, const std::shared_ptr& tensor); std::shared_ptr get_node() const; diff --git a/src/ngraph/descriptor/tensor.hpp b/src/ngraph/descriptor/tensor.hpp index 3a5ee901e23..1bf57dcb561 100644 --- a/src/ngraph/descriptor/tensor.hpp +++ b/src/ngraph/descriptor/tensor.hpp @@ -35,7 +35,7 @@ namespace ngraph class TensorLayout; } - /// \brief Compile-time descriptor of a first-class value that is a view of a tensor. + /// \brief Compile-time descriptor of a first-class value that is a tensor. class Tensor { Tensor(const Tensor&) = delete; diff --git a/src/ngraph/op/batch_norm.cpp b/src/ngraph/op/batch_norm.cpp index 251c86e418b..4f09e461bef 100644 --- a/src/ngraph/op/batch_norm.cpp +++ b/src/ngraph/op/batch_norm.cpp @@ -22,19 +22,17 @@ #include "ngraph/op/get_output_element.hpp" #include "ngraph/validation_util.hpp" -ngraph::op::BatchNormInference::BatchNormInference(double eps, - std::shared_ptr gamma, - std::shared_ptr beta, - std::shared_ptr input, - std::shared_ptr mean, - std::shared_ptr variance) - : Op("BatchNormInference", check_single_output_args({gamma, beta, input, mean, variance})) - , m_epsilon(eps) +ngraph::op::BatchNormTraining::BatchNormTraining(std::shared_ptr input, + std::shared_ptr gamma, + std::shared_ptr beta, + double epsilon) + : Op("BatchNormTraining", check_single_output_args({gamma, beta, input})) + , m_epsilon(epsilon) { - set_output_size(1); constructor_validate_and_infer_types(); } +// DEPRECATED ngraph::op::BatchNormTraining::BatchNormTraining(double eps, std::shared_ptr gamma, std::shared_ptr beta, @@ -42,50 +40,124 @@ ngraph::op::BatchNormTraining::BatchNormTraining(double eps, : Op("BatchNormTraining", check_single_output_args({gamma, beta, input})) , m_epsilon(eps) { - set_output_size(3); 
constructor_validate_and_infer_types(); } -void ngraph::op::BatchNormInference::validate_and_infer_types() +void ngraph::op::BatchNormTraining::validate_and_infer_types() { element::Type result_et; PartialShape result_batch_shape; - PartialShape result_channel_shape; // unused here + PartialShape result_channel_shape; + set_output_size(3); std::tie(result_et, result_batch_shape, result_channel_shape) = infer_batch_norm_forward(this, get_input_element_type(INPUT_DATA), get_input_element_type(INPUT_GAMMA), get_input_element_type(INPUT_BETA), - get_input_element_type(INPUT_MEAN), - get_input_element_type(INPUT_VARIANCE), get_input_partial_shape(INPUT_DATA), get_input_partial_shape(INPUT_GAMMA), - get_input_partial_shape(INPUT_BETA), - get_input_partial_shape(INPUT_MEAN), - get_input_partial_shape(INPUT_VARIANCE)); + get_input_partial_shape(INPUT_BETA)); set_output_type(0, result_et, result_batch_shape); + set_output_type(1, result_et, result_channel_shape); + set_output_type(2, result_et, result_channel_shape); } -void ngraph::op::BatchNormTraining::validate_and_infer_types() +std::shared_ptr + ngraph::op::BatchNormTraining::copy_with_new_args(const NodeVector& new_args) const +{ + check_new_args_count(this, new_args); + return std::make_shared( + new_args.at(2), new_args.at(0), new_args.at(1), m_epsilon); +} + +void ngraph::op::BatchNormTraining::generate_adjoints(autodiff::Adjoints& adjoints, + const NodeVector& deltas) +{ + auto gamma = get_argument(0); + auto beta = get_argument(1); + auto input = get_argument(2); + std::shared_ptr mean = nullptr; + std::shared_ptr var = nullptr; + + // Extract mean and variance outputs from BatchNormBase + // as these are used by BatchNormTrainingBackprop. 
+ // The users of the outputs (GetOutputElements' Inputs) aren't sorted + // and get_n() is used to sort the inputs in the same order as Batchnorm's outputs + // Next, Mean and Variance (`at(1)` and `at(2)`) are extracted + // Please see `add_output` in `BatchNormBase::BatchNormBase` for more details + + auto goes = op::get_output_elements(shared_from_this()); + mean = goes.at(1); + var = goes.at(2); + if (!mean) + { + throw ngraph_error("GetOutputElement for mean is missing"); + } + + if (!var) + { + throw ngraph_error("GetOutputElement for variance is missing"); + } + + auto bbn = std::make_shared( + input, gamma, beta, mean, var, deltas.at(0), get_eps_value()); + auto dinput = std::make_shared(bbn, 0); + auto dgamma = std::make_shared(bbn, 1); + auto dbeta = std::make_shared(bbn, 2); + + adjoints.add_delta(input, dinput); + adjoints.add_delta(gamma, dgamma); + adjoints.add_delta(beta, dbeta); +} + +ngraph::op::BatchNormInference::BatchNormInference(std::shared_ptr input, + std::shared_ptr gamma, + std::shared_ptr beta, + std::shared_ptr mean, + std::shared_ptr variance, + double epsilon) + : Op("BatchNormInference", check_single_output_args({gamma, beta, input, mean, variance})) + , m_epsilon(epsilon) +{ + constructor_validate_and_infer_types(); +} + +// DEPRECATED +ngraph::op::BatchNormInference::BatchNormInference(double eps, + std::shared_ptr gamma, + std::shared_ptr beta, + std::shared_ptr input, + std::shared_ptr mean, + std::shared_ptr variance) + : Op("BatchNormInference", check_single_output_args({gamma, beta, input, mean, variance})) + , m_epsilon(eps) +{ + constructor_validate_and_infer_types(); +} + +void ngraph::op::BatchNormInference::validate_and_infer_types() { element::Type result_et; PartialShape result_batch_shape; - PartialShape result_channel_shape; + PartialShape result_channel_shape; // unused here + set_output_size(1); std::tie(result_et, result_batch_shape, result_channel_shape) = infer_batch_norm_forward(this, 
get_input_element_type(INPUT_DATA), get_input_element_type(INPUT_GAMMA), get_input_element_type(INPUT_BETA), + get_input_element_type(INPUT_MEAN), + get_input_element_type(INPUT_VARIANCE), get_input_partial_shape(INPUT_DATA), get_input_partial_shape(INPUT_GAMMA), - get_input_partial_shape(INPUT_BETA)); + get_input_partial_shape(INPUT_BETA), + get_input_partial_shape(INPUT_MEAN), + get_input_partial_shape(INPUT_VARIANCE)); set_output_type(0, result_et, result_batch_shape); - set_output_type(1, result_et, result_channel_shape); - set_output_type(2, result_et, result_channel_shape); } std::shared_ptr @@ -93,28 +165,20 @@ std::shared_ptr { check_new_args_count(this, new_args); return std::make_shared( - m_epsilon, new_args.at(0), new_args.at(1), new_args.at(2), new_args.at(3), new_args.at(4)); -} - -std::shared_ptr - ngraph::op::BatchNormTraining::copy_with_new_args(const NodeVector& new_args) const -{ - check_new_args_count(this, new_args); - return std::make_shared( - m_epsilon, new_args.at(0), new_args.at(1), new_args.at(2)); + new_args.at(2), new_args.at(0), new_args.at(1), new_args.at(3), new_args.at(4), m_epsilon); } ngraph::op::BatchNormTrainingBackprop::BatchNormTrainingBackprop( - double eps, + std::shared_ptr input, std::shared_ptr gamma, std::shared_ptr beta, - std::shared_ptr input, std::shared_ptr mean, std::shared_ptr variance, - std::shared_ptr delta) + std::shared_ptr delta, + double epsilon) : Op("BatchNormTrainingBackprop", check_single_output_args({gamma, beta, input, mean, variance, delta})) - , m_epsilon(eps) + , m_epsilon(epsilon) { set_output_size(3); @@ -167,51 +231,11 @@ std::shared_ptr ngraph::op::BatchNormTrainingBackprop::copy_with_new_args(const NodeVector& new_args) const { check_new_args_count(this, new_args); - return std::make_shared(m_epsilon, + return std::make_shared(new_args.at(2), new_args.at(0), new_args.at(1), - new_args.at(2), new_args.at(3), new_args.at(4), - new_args.at(5)); -} - -void 
ngraph::op::BatchNormTraining::generate_adjoints(autodiff::Adjoints& adjoints, - const NodeVector& deltas) -{ - auto gamma = get_argument(0); - auto beta = get_argument(1); - auto input = get_argument(2); - std::shared_ptr mean = nullptr; - std::shared_ptr var = nullptr; - - // Extract mean and variance outputs from BatchNormBase - // as these are used by BatchNormTrainingBackprop. - // The users of the outputs (GetOutputElements' Inputs) aren't sorted - // and get_n() is used to sort the inputs in the same order as Batchnorm's outputs - // Next, Mean and Variance (`at(1)` and `at(2)`) are extracted - // Please see `add_output` in `BatchNormBase::BatchNormBase` for more details - - auto goes = op::get_output_elements(shared_from_this()); - mean = goes.at(1); - var = goes.at(2); - if (!mean) - { - throw ngraph_error("GetOutputElement for mean is missing"); - } - - if (!var) - { - throw ngraph_error("GetOutputElement for variance is missing"); - } - - auto bbn = std::make_shared( - get_eps_value(), gamma, beta, input, mean, var, deltas.at(0)); - auto dinput = std::make_shared(bbn, 0); - auto dgamma = std::make_shared(bbn, 1); - auto dbeta = std::make_shared(bbn, 2); - - adjoints.add_delta(input, dinput); - adjoints.add_delta(gamma, dgamma); - adjoints.add_delta(beta, dbeta); + new_args.at(5), + m_epsilon); } diff --git a/src/ngraph/op/batch_norm.hpp b/src/ngraph/op/batch_norm.hpp index b2f30a9bd6d..1069ca2d322 100644 --- a/src/ngraph/op/batch_norm.hpp +++ b/src/ngraph/op/batch_norm.hpp @@ -27,9 +27,20 @@ namespace ngraph { namespace op { + // \brief Batchnorm for training operation class BatchNormTraining : public Op { public: + // \param input Must have rank >= 2, [., C, ...] + // \param gamma gamma scaling for normalized value. 
[C] + // \param beta bias added to the scaled normalized value [C] + // \param epsilon Avoids division by 0 if input has 0 variance + BatchNormTraining(std::shared_ptr input, + std::shared_ptr gamma, + std::shared_ptr beta, + double epsilon); + + // DEPRECATED // In this version of BatchNorm: // // MEAN AND VARIANCE: computed directly from the content of 'input'. @@ -49,6 +60,7 @@ namespace ngraph // output[0]: shall have the same shape as 'input'. // output[1]: shall have rank 1, with the same span as input's channel axis. // output[2]: shall have rank 1, with the same span as input's channel axis. + // DEPRECATED BatchNormTraining(double eps, std::shared_ptr gamma, std::shared_ptr beta, @@ -75,6 +87,20 @@ namespace ngraph class BatchNormInference : public Op { public: + // \param input [., C, ...] + // \param gamma gamma scaling for normalized value. [C] + // \param beta bias added to the scaled normalized value [C] + // \param mean value for mean normalization [C] + // \param variance value for variance normalization [C] + // \param epsilon Avoids division by 0 if input has 0 variance + BatchNormInference(std::shared_ptr input, + std::shared_ptr gamma, + std::shared_ptr beta, + std::shared_ptr mean, + std::shared_ptr variance, + double epsilon); + + // DEPRECATED // In this version of BatchNorm: // // MEAN AND VARIANCE: provided by the 'mean' and 'variance' parameters. @@ -92,6 +118,7 @@ namespace ngraph // mean: must have rank 1, with the same span as input's channel axis. // variance: must have rank 1, with the same span as input's channel axis. // output: shall have the same shape as 'input'. 
+ // DEPRECATED BatchNormInference(double eps, std::shared_ptr gamma, std::shared_ptr beta, @@ -125,13 +152,13 @@ namespace ngraph class BatchNormTrainingBackprop : public Op { public: - BatchNormTrainingBackprop(double eps, + BatchNormTrainingBackprop(std::shared_ptr input, std::shared_ptr gamma, std::shared_ptr beta, - std::shared_ptr input, std::shared_ptr mean, std::shared_ptr variance, - std::shared_ptr delta); + std::shared_ptr delta, + double epsilon); void validate_and_infer_types() override; diff --git a/src/ngraph/serializer.cpp b/src/ngraph/serializer.cpp index 4f1d611d15b..3e6e7eaf75b 100644 --- a/src/ngraph/serializer.cpp +++ b/src/ngraph/serializer.cpp @@ -531,21 +531,21 @@ static shared_ptr case OP_TYPEID::BatchNormTraining: { auto epsilon = node_js.at("eps").get(); - node = make_shared(epsilon, args[0], args[1], args[2]); + node = make_shared(args[2], args[0], args[1], epsilon); break; } case OP_TYPEID::BatchNormInference: { auto epsilon = node_js.at("eps").get(); node = make_shared( - epsilon, args[0], args[1], args[2], args[3], args[4]); + args[2], args[0], args[1], args[3], args[4], epsilon); break; } case OP_TYPEID::BatchNormTrainingBackprop: { auto epsilon = node_js.at("eps").get(); node = make_shared( - epsilon, args[0], args[1], args[2], args[3], args[4], args[5]); + args[2], args[0], args[1], args[3], args[4], args[5], epsilon); break; } case OP_TYPEID::Broadcast: diff --git a/test/autodiff.in.cpp b/test/autodiff.in.cpp index 1f86a9aab04..2e66b09eb34 100644 --- a/test/autodiff.in.cpp +++ b/test/autodiff.in.cpp @@ -1644,7 +1644,7 @@ NGRAPH_TEST(${BACKEND_NAME}, backwards_batch_norm_three_outputs) auto B = make_shared(element::f64, shape_mean); auto C = make_shared(element::f64, shape_mean); - auto BN = make_shared(1e-3, B, C, A); + auto BN = make_shared(A, B, C, 1e-3); // make sure we create GOEs for mean and variance needed for bprop goes.push_back(make_shared(BN, 1)); goes.push_back(make_shared(BN, 2)); diff --git a/test/backend_test.in.cpp 
b/test/backend_test.in.cpp index 1a48f1ab80c..1b58addb5bd 100644 --- a/test/backend_test.in.cpp +++ b/test/backend_test.in.cpp @@ -299,7 +299,7 @@ NGRAPH_TEST(${BACKEND_NAME}, batch_norm_one_output) auto Gamma = op::Constant::create(element::f64, shape_mean, {1.75437676, 0.37950502, 1.13727544}); - auto BN = make_shared(1e-3, Gamma, Beta, A, Mean, Variance); + auto BN = make_shared(A, Gamma, Beta, Mean, Variance, 1e-3); auto f = make_shared(BN, op::ParameterVector{A}); auto backend = runtime::Backend::create("${BACKEND_NAME}"); @@ -329,7 +329,7 @@ NGRAPH_TEST(${BACKEND_NAME}, batch_norm_three_outputs) auto Gamma = op::Constant::create(element::f64, shape_mean, {1.75437676, 0.37950502, 1.13727544}); - auto BN = make_shared(1e-3, Gamma, Beta, A); + auto BN = make_shared(A, Gamma, Beta, 1e-3); auto f0 = make_shared(make_shared(BN, 0), op::ParameterVector{A}); @@ -4412,7 +4412,7 @@ NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_b1c2h2w2) auto beta = make_shared(element::f32, beta_shape); double eps = 0.001; auto shape_r = Shape{1, 2, 2, 2}; - auto bn = make_shared(eps, gamma, beta, input); + auto bn = make_shared(input, gamma, beta, eps); auto output_rt = std::make_shared(bn, 0); auto mean_rt = std::make_shared(bn, 1); @@ -4475,7 +4475,7 @@ NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_b2c2h2w1) auto beta = make_shared(element::f32, beta_shape); double eps = 0.001; auto shape_r = Shape{2, 2, 2, 1}; - auto bn = make_shared(eps, gamma, beta, input); + auto bn = make_shared(input, gamma, beta, eps); auto output_rt = std::make_shared(bn, 0); auto mean_rt = std::make_shared(bn, 1); @@ -4532,7 +4532,7 @@ NGRAPH_TEST(${BACKEND_NAME}, batchnorm_bprop_n4c3h2w2) auto beta = make_shared(element::f32, beta_shape); double eps = 0.001; auto shape_r = Shape{4, 3, 2, 2}; - auto bn = make_shared(eps, gamma, beta, input); + auto bn = make_shared(input, gamma, beta, eps); auto bn_dx = make_shared(bn, 0); auto bn_dgamma = make_shared(bn, 1); auto bn_dbeta = make_shared(bn, 2); @@ -4627,7 
+4627,7 @@ NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_inference_b2c2h2w1) auto beta = make_shared(element::f32, beta_shape); double eps = 0.001; auto shape_r = Shape{2, 2, 2, 1}; - auto bn = make_shared(eps, gamma, beta, input, mean, var); + auto bn = make_shared(input, gamma, beta, mean, var, eps); auto f = make_shared(bn, op::ParameterVector{input, gamma, beta, mean, var}); auto backend = runtime::Backend::create("${BACKEND_NAME}"); @@ -4676,7 +4676,7 @@ NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_globalstats_b2c2w2h1) auto beta = make_shared(element::f32, beta_shape); double eps = 0.001; auto shape_r = Shape{2, 2, 2, 1}; - auto bn = make_shared(eps, gamma, beta, input, mean, var); + auto bn = make_shared(input, gamma, beta, mean, var, eps); auto f = make_shared(bn, op::ParameterVector{gamma, beta, input, mean, var}); auto backend = runtime::Backend::create("${BACKEND_NAME}"); @@ -5428,14 +5428,14 @@ NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_bprop) auto g = std::make_shared(element::f32, sca); auto b = std::make_shared(element::f32, sca); auto input = std::make_shared(element::f32, vec); - auto bn_fp = std::make_shared(eps, g, b, input); + auto bn_fp = std::make_shared(input, g, b, eps); auto bnorm = std::make_shared(bn_fp, 0); auto mean = std::make_shared(bn_fp, 1); auto var = std::make_shared(bn_fp, 2); auto delta = std::make_shared(element::f32, vec); auto bn_bp = - std::make_shared(eps, g, b, bnorm, mean, var, delta); + std::make_shared(bnorm, g, b, mean, var, delta, eps); auto dx = std::make_shared(bn_bp, 0); std::vector> args = { @@ -5459,7 +5459,7 @@ NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_bprop_2step) auto g = std::make_shared(element::f32, sca); auto b = std::make_shared(element::f32, sca); auto input = std::make_shared(element::f32, vec); - auto bn_fp = std::make_shared(eps, g, b, input); + auto bn_fp = std::make_shared(input, g, b, eps); auto bnorm = std::make_shared(bn_fp, 0); auto mean = std::make_shared(bn_fp, 1); auto var = 
std::make_shared(bn_fp, 2); @@ -5480,7 +5480,7 @@ NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_bprop_2step) auto m = std::make_shared(element::f32, sca); auto v = std::make_shared(element::f32, sca); auto delta = std::make_shared(element::f32, vec); - auto bn_bp = std::make_shared(eps, g, b, bn_output, m, v, delta); + auto bn_bp = std::make_shared(bn_output, g, b, m, v, delta, eps); auto dx = std::make_shared(bn_bp, 0); args.pop_back(); // remove x diff --git a/test/backend_test.in.cpp-9bfce850 b/test/backend_test.in.cpp-9bfce850 new file mode 100644 index 00000000000..1b58addb5bd --- /dev/null +++ b/test/backend_test.in.cpp-9bfce850 @@ -0,0 +1,5571 @@ +//***************************************************************************** +// Copyright 2017-2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//***************************************************************************** + +#include +#include +#include +#include +#include +#include +#include +#include "gtest/gtest.h" + +#include "ngraph/autodiff/adjoints.hpp" +#include "ngraph/graph_util.hpp" +#include "ngraph/log.hpp" +#include "ngraph/ngraph.hpp" +#include "ngraph/op/experimental/generate_mask.hpp" +#include "ngraph/serializer.hpp" +#include "ngraph/state/rng_state.hpp" +#include "util/all_close.hpp" +#include "util/all_close_f.hpp" +#include "util/ndarray.hpp" +#include "util/random.hpp" +#include "util/test_control.hpp" +#include "util/test_tools.hpp" + +using namespace std; +using namespace ngraph; + +static string s_manifest = "${MANIFEST}"; + +static const vector s_known_element_types = {element::from(), + element::from(), + element::from(), + element::from(), + element::from(), + element::from(), + element::from(), + element::from(), + element::from(), + element::from()}; + +class UnhandledOp : public ngraph::op::Op +{ +public: + UnhandledOp(const std::shared_ptr& arg) + : Op("Unsupported_op", check_single_output_args({arg})) + { + constructor_validate_and_infer_types(); + } + shared_ptr copy_with_new_args(const NodeVector& new_args) const override + { + return make_shared(new_args[0]); + } + +protected: + void validate_and_infer_types() override + { + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); + } +}; + +NGRAPH_TEST(${BACKEND_NAME}, unhandled_op) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto unhandled = make_shared(A); + auto f = make_shared(unhandled, op::ParameterVector{A}); + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + shared_ptr a = backend->create_tensor(shape); + shared_ptr result = backend->create_tensor(shape); + ASSERT_THROW(backend->call_with_validate(f, {result}, {a}), unsupported_op); +} + +NGRAPH_TEST(${BACKEND_NAME}, function_name) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); 
+ auto B = make_shared(element::f32, shape); + auto f = make_shared(A + B, op::ParameterVector{A, B}, "funky func name"); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + shared_ptr a = backend->create_tensor(shape); + shared_ptr b = backend->create_tensor(shape); + shared_ptr result = backend->create_tensor(shape); + + copy_data(a, test::NDArray({{1, 2}, {3, 4}}).get_vector()); + copy_data(b, test::NDArray({{5, 6}, {7, 8}}).get_vector()); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ(read_vector(result), + (test::NDArray({{6, 8}, {10, 12}})).get_vector()); +} + +NGRAPH_TEST(${BACKEND_NAME}, node_name) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto C = A + B; + C->set_name("a node name"); + auto f = make_shared(C, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + shared_ptr a = backend->create_tensor(element::f32, shape); + shared_ptr b = backend->create_tensor(element::f32, shape); + shared_ptr result = backend->create_tensor(element::f32, shape); + + copy_data(a, test::NDArray({{1, 2}, {3, 4}}).get_vector()); + copy_data(b, test::NDArray({{5, 6}, {7, 8}}).get_vector()); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ(read_vector(result), + (test::NDArray({{6, 8}, {10, 12}})).get_vector()); +} + +NGRAPH_TEST(${BACKEND_NAME}, aliased_output) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto C = A + B; + auto D = A * B; + auto E = op::Constant::create(element::f32, shape, {1, 2, 3, 4}); + auto f = make_shared(NodeVector{C, C, D, D, C, E, E}, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + shared_ptr a = backend->create_tensor(element::f32, shape); + 
shared_ptr b = backend->create_tensor(element::f32, shape); + shared_ptr out1 = backend->create_tensor(element::f32, shape); + shared_ptr out2 = backend->create_tensor(element::f32, shape); + shared_ptr out3 = backend->create_tensor(element::f32, shape); + shared_ptr out4 = backend->create_tensor(element::f32, shape); + shared_ptr out5 = backend->create_tensor(element::f32, shape); + shared_ptr out6 = backend->create_tensor(element::f32, shape); + shared_ptr out7 = backend->create_tensor(element::f32, shape); + + copy_data(a, vector{0, 1, 2, 3}); + copy_data(b, vector{1, 2, 3, 4}); + vector expectedC{1, 3, 5, 7}; + vector expectedD{0, 2, 6, 12}; + vector expectedE{1, 2, 3, 4}; + + backend->call_with_validate(f, {out1, out2, out3, out4, out5, out6, out7}, {a, b}); + EXPECT_EQ(expectedC, read_vector(out1)); + EXPECT_EQ(expectedC, read_vector(out2)); + EXPECT_EQ(expectedD, read_vector(out3)); + EXPECT_EQ(expectedD, read_vector(out4)); + EXPECT_EQ(expectedC, read_vector(out5)); + EXPECT_EQ(expectedE, read_vector(out6)); + EXPECT_EQ(expectedE, read_vector(out7)); +} + +NGRAPH_TEST(${BACKEND_NAME}, parameter_as_output) +{ + Shape shape{3, 4}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(A, op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + shared_ptr a = backend->create_tensor(element::f32, shape); + shared_ptr result = backend->create_tensor(element::f32, shape); + + vector expected{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + vector zero(shape_size(shape), 0); + copy_data(a, expected); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ(read_vector(result), expected); +} + +NGRAPH_TEST(${BACKEND_NAME}, abc) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto C = make_shared(element::f32, shape); + auto f = make_shared((A + B) * C, op::ParameterVector{A, B, C}); + + auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + shared_ptr a = backend->create_tensor(element::f32, shape); + shared_ptr b = backend->create_tensor(element::f32, shape); + shared_ptr c = backend->create_tensor(element::f32, shape); + shared_ptr result = backend->create_tensor(element::f32, shape); + + copy_data(a, test::NDArray({{1, 2}, {3, 4}}).get_vector()); + copy_data(b, test::NDArray({{5, 6}, {7, 8}}).get_vector()); + copy_data(c, test::NDArray({{9, 10}, {11, 12}}).get_vector()); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ(read_vector(result), + (test::NDArray({{54, 80}, {110, 144}})).get_vector()); + + backend->call_with_validate(f, {result}, {b, a, c}); + EXPECT_EQ(read_vector(result), + (test::NDArray({{54, 80}, {110, 144}})).get_vector()); + + backend->call_with_validate(f, {result}, {a, c, b}); + EXPECT_EQ(read_vector(result), + (test::NDArray({{50, 72}, {98, 128}})).get_vector()); +} + +NGRAPH_TEST(${BACKEND_NAME}, abc_int64) +{ + Shape shape{2, 2}; + auto A = make_shared(element::i64, shape); + auto B = make_shared(element::i64, shape); + auto C = make_shared(element::i64, shape); + auto f = make_shared((A + B) * C, op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::i64, shape); + copy_data(a, vector{1, 2, 3, 4}); + auto b = backend->create_tensor(element::i64, shape); + copy_data(b, vector{5, 6, 7, 8}); + auto c = backend->create_tensor(element::i64, shape); + copy_data(c, vector{9, 10, 11, 12}); + auto result = backend->create_tensor(element::i64, shape); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ((vector{54, 80, 110, 144}), read_vector(result)); + + backend->call_with_validate(f, {result}, {b, a, c}); + EXPECT_EQ((vector{54, 80, 110, 144}), read_vector(result)); + + backend->call_with_validate(f, {result}, {a, c, b}); + 
EXPECT_EQ((vector{50, 72, 98, 128}), read_vector(result)); +} + +// Multiple retrive values +NGRAPH_TEST(${BACKEND_NAME}, multiple_result) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto C = make_shared(element::f32, shape); + auto A_add_B = make_shared(A, B); + auto A_add_B_mul_C = make_shared(A_add_B, C); + + auto f = + make_shared(NodeVector{A_add_B, A_add_B_mul_C}, op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1, 2, 3, 4}); + auto b = backend->create_tensor(element::f32, shape); + copy_data(b, vector{5, 6, 7, 8}); + auto c = backend->create_tensor(element::f32, shape); + copy_data(c, vector{9, 10, 11, 12}); + + auto r0 = backend->create_tensor(element::f32, shape); + auto r1 = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {r0, r1}, {a, b, c}); + + EXPECT_EQ((vector{6, 8, 10, 12}), read_vector(r0)); + EXPECT_EQ((vector{54, 80, 110, 144}), read_vector(r1)); +} + +NGRAPH_TEST(${BACKEND_NAME}, batch_norm_one_output) +{ + auto shape_in = Shape{2, 3}; + auto shape_mean = Shape{3}; + + auto A = make_shared(element::f64, shape_in); + auto Mean = + op::Constant::create(element::f64, shape_mean, {0.00396654, -1.25294404, 1.16651872}); + auto Variance = + op::Constant::create(element::f64, shape_mean, {2.40871689, 1.44969511, 0.23469392}); + auto Beta = + op::Constant::create(element::f64, shape_mean, {2.14211921, -0.75733924, 0.42210531}); + auto Gamma = + op::Constant::create(element::f64, shape_mean, {1.75437676, 0.37950502, 1.13727544}); + + auto BN = make_shared(A, Gamma, Beta, Mean, Variance, 1e-3); + auto f = make_shared(BN, op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f64, shape_in); + copy_data( + a, + 
vector{-1.97431703, -2.06521307, 0.54122217, 2.53375939, -0.22342691, 0.45340773}); + + auto result = backend->create_tensor(element::f64, shape_in); + vector expected_result{ + -0.09365749, -1.01327395, -1.04269195, 5.00118923, -0.43295258, -1.24840283}; + + backend->call_with_validate(f, {result}, {a}); + EXPECT_TRUE(test::all_close(vector{expected_result}, read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, batch_norm_three_outputs) +{ + auto shape_in = Shape{2, 3}; + auto shape_mean = Shape{3}; + + auto A = make_shared(element::f64, shape_in); + auto Beta = + op::Constant::create(element::f64, shape_mean, {2.14211921, -0.75733924, 0.42210531}); + auto Gamma = + op::Constant::create(element::f64, shape_mean, {1.75437676, 0.37950502, 1.13727544}); + + auto BN = make_shared(A, Gamma, Beta, 1e-3); + + auto f0 = + make_shared(make_shared(BN, 0), op::ParameterVector{A}); + auto f1 = + make_shared(make_shared(BN, 1), op::ParameterVector{A}); + auto f2 = + make_shared(make_shared(BN, 2), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f64, shape_in); + copy_data( + a, + vector{-1.97431703, -2.06521307, 0.54122217, 2.53375939, -0.22342691, 0.45340773}); + + auto result0 = backend->create_tensor(element::f64, shape_in); + vector expected_result0{ + 0.3879149, -1.13662076, 1.34494817, 3.89632344, -0.37805778, -0.50073695}; + + backend->call_with_validate(f0, {result0}, {a}); + EXPECT_TRUE(test::all_close(vector{expected_result0}, read_vector(result0))); + + auto result1 = backend->create_tensor(element::f64, shape_mean); + vector expected_result1{0.27972114, -1.14431989, 0.49731493}; + + backend->call_with_validate(f1, {result1}, {a}); + EXPECT_TRUE(test::all_close(vector{expected_result1}, read_vector(result1))); + + auto result2 = backend->create_tensor(element::f64, shape_mean); + vector expected_result2{5.08068895e+00, 8.48043919e-01, 
1.92784308e-03}; + + backend->call_with_validate(f2, {result2}, {a}); + EXPECT_TRUE(test::all_close(vector{expected_result2}, read_vector(result2))); +} + +NGRAPH_TEST(${BACKEND_NAME}, concat_matrix_colwise) +{ + Shape shape_a{2, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{2, 3}; + auto B = make_shared(element::f32, shape_b); + Shape shape_c{2, 3}; + auto C = make_shared(element::f32, shape_c); + Shape shape_r{2, 8}; + auto f = make_shared(make_shared(NodeVector{A, B, C}, 1), + op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{2, 4, 8, 16}); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{1, 2, 4, 8, 16, 32}); + auto c = backend->create_tensor(element::f32, shape_c); + copy_data(c, vector{2, 3, 5, 7, 11, 13}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ((vector{2, 4, 1, 2, 4, 2, 3, 5, 8, 16, 8, 16, 32, 7, 11, 13}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, concat_matrix_rowwise) +{ + Shape shape_a{2, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{3, 2}; + auto B = make_shared(element::f32, shape_b); + Shape shape_c{3, 2}; + auto C = make_shared(element::f32, shape_c); + Shape shape_r{8, 2}; + auto f = make_shared(make_shared(NodeVector{A, B, C}, 0), + op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{2, 4, 8, 16}); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{1, 2, 4, 8, 16, 32}); + auto c = backend->create_tensor(element::f32, shape_c); + copy_data(c, vector{2, 3, 5, 7, 11, 13}); + auto result = 
backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ((vector{2, 4, 8, 16, 1, 2, 4, 8, 16, 32, 2, 3, 5, 7, 11, 13}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, concat_matrix_int64) +{ + Shape shape_a{2, 2}; + auto A = make_shared(element::i64, shape_a); + Shape shape_b{3, 2}; + auto B = make_shared(element::i64, shape_b); + Shape shape_c{3, 2}; + auto C = make_shared(element::i64, shape_c); + Shape shape_r{8, 2}; + auto f = make_shared(make_shared(NodeVector{A, B, C}, 0), + op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::i64, shape_a); + copy_data(a, vector{2, 4, 8, 16}); + auto b = backend->create_tensor(element::i64, shape_b); + copy_data(b, vector{1, 2, 4, 8, 16, 32}); + auto c = backend->create_tensor(element::i64, shape_c); + copy_data(c, vector{2, 3, 5, 7, 11, 13}); + auto result = backend->create_tensor(element::i64, shape_r); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ((vector{2, 4, 8, 16, 1, 2, 4, 8, 16, 32, 2, 3, 5, 7, 11, 13}), + read_vector(result)); +} + +// Params to drive concat_vector_large testing variations +class concat_vector_params : public ::testing::TestWithParam +{ +protected: + concat_vector_params() { num_inputs = GetParam(); } + uint32_t num_inputs; +}; + +NGRAPH_TEST_P(${BACKEND_NAME}, concat_vector_params, concat_vector_large) +{ + Shape shape_a{1}; + NodeVector inputs; + op::ParameterVector inputs_param; + for (uint32_t i = 0; i < num_inputs; i++) + { + auto A = make_shared(element::f32, shape_a); + inputs_param.push_back(A); + inputs.push_back(A); + } + Shape shape_r{num_inputs}; + auto f = make_shared(make_shared(inputs, 0), inputs_param); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + std::vector> inputs_value; + std::vector ref_result; + 
for (uint32_t i = 0; i < num_inputs; i++) + { + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{static_cast(i)}); + ref_result.push_back(static_cast(i)); + inputs_value.push_back(a); + } + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, inputs_value); + EXPECT_EQ(ref_result, read_vector(result)); +} + +// concat_vector_large case generation +// Add thhosw tests to cover paramter space overflow: +// cuda kernel parameter space have limit, if there is large number of parameters, +// there will be overflow for parameter space. +NGRAPH_INSTANTIATE_TEST_CASE_P(${BACKEND_NAME}, + input_sizes, + concat_vector_params, + testing::Values(100, 128, 999)); + +NGRAPH_TEST(${BACKEND_NAME}, concat_vector) +{ + Shape shape_a{4}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{6}; + auto B = make_shared(element::f32, shape_b); + Shape shape_c{2}; + auto C = make_shared(element::f32, shape_c); + Shape shape_r{12}; + auto f = make_shared(make_shared(NodeVector{A, B, C}, 0), + op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{2, 4, 8, 16}); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{1, 2, 4, 8, 16, 32}); + auto c = backend->create_tensor(element::f32, shape_c); + copy_data(c, vector{18, 19}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ((vector{2, 4, 8, 16, 1, 2, 4, 8, 16, 32, 18, 19}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, concat_4d_tensor) +{ + Shape shape{1, 1, 1, 1}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto C = make_shared(element::f32, shape); + Shape shape_r{3, 1, 1, 1}; + auto f = 
make_shared(make_shared(NodeVector{A, B, C}, 0), + op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1}); + auto b = backend->create_tensor(element::f32, shape); + copy_data(b, vector{2}); + auto c = backend->create_tensor(element::f32, shape); + copy_data(c, vector{3}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ((vector{1, 2, 3}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, concat_2d_tensor) +{ + Shape shape{1, 1}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto C = make_shared(element::f32, shape); + Shape shape_r{3, 1}; + auto f = make_shared(make_shared(NodeVector{A, B, C}, 0), + op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1}); + auto b = backend->create_tensor(element::f32, shape); + copy_data(b, vector{2}); + auto c = backend->create_tensor(element::f32, shape); + copy_data(c, vector{3}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ((vector{1, 2, 3}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_2d_tensor) +{ + Shape shape{1, 1}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto add1 = make_shared(A, B); + auto C = make_shared(element::f32, shape); + auto D = make_shared(element::f32, shape); + auto add2 = make_shared(C, D); + auto subtract = make_shared(C, A); + Shape shape_r{3, 1}; + auto f = make_shared(make_shared(NodeVector{add1, add2, subtract}, 0), + op::ParameterVector{A, B, C, D}); + + auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1}); + auto b = backend->create_tensor(element::f32, shape); + copy_data(b, vector{2}); + auto c = backend->create_tensor(element::f32, shape); + copy_data(c, vector{3}); + auto d = backend->create_tensor(element::f32, shape); + copy_data(d, vector{4}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b, c, d}); + EXPECT_EQ((vector{3, 7, 2}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_propagate_2d_tensor) +{ + Shape shape{1, 1}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto add1 = make_shared(A, B); + auto C = make_shared(element::f32, shape); + auto D = make_shared(element::f32, shape); + auto add2 = make_shared(C, D); + auto concat1 = make_shared(NodeVector{add1, add2}, 0); + auto subtract = make_shared(C, A); + Shape shape_r{3, 1}; + auto f = make_shared(make_shared(NodeVector{concat1, subtract}, 0), + op::ParameterVector{A, B, C, D}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1}); + auto b = backend->create_tensor(element::f32, shape); + copy_data(b, vector{2}); + auto c = backend->create_tensor(element::f32, shape); + copy_data(c, vector{3}); + auto d = backend->create_tensor(element::f32, shape); + copy_data(d, vector{4}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b, c, d}); + EXPECT_EQ((vector{3, 7, 2}), read_vector(result)); +} + +// from numpy import * +// a=linspace(1,2*3*4*3*2,2*3*4*3*2) +// b=linspace(1000+1,1000+2*3*3*3*2,2*3*3*3*2) +// c=linspace(2000+1,2000+2*3*2*3*2,2*3*2*3*2) +// a.shape=(2,3,4,3,2) +// b.shape=(2,3,3,3,2) +// 
c.shape=(2,3,2,3,2) +// z=concatenate((a,b,c),axis=2) +// z.shape=(2*3*(4+3+2)*3*2) +// set_printoptions(suppress=True) +// print(z) +// +// [ 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. +// 11. 12. 13. 14. 15. 16. 17. 18. 19. 20. +// 21. 22. 23. 24. 1001. 1002. 1003. 1004. 1005. 1006. +// 1007. 1008. 1009. 1010. 1011. 1012. 1013. 1014. 1015. 1016. +// 1017. 1018. 2001. 2002. 2003. 2004. 2005. 2006. 2007. 2008. +// 2009. 2010. 2011. 2012. 25. 26. 27. 28. 29. 30. +// 31. 32. 33. 34. 35. 36. 37. 38. 39. 40. +// 41. 42. 43. 44. 45. 46. 47. 48. 1019. 1020. +// 1021. 1022. 1023. 1024. 1025. 1026. 1027. 1028. 1029. 1030. +// 1031. 1032. 1033. 1034. 1035. 1036. 2013. 2014. 2015. 2016. +// 2017. 2018. 2019. 2020. 2021. 2022. 2023. 2024. 49. 50. +// 51. 52. 53. 54. 55. 56. 57. 58. 59. 60. +// 61. 62. 63. 64. 65. 66. 67. 68. 69. 70. +// 71. 72. 1037. 1038. 1039. 1040. 1041. 1042. 1043. 1044. +// 1045. 1046. 1047. 1048. 1049. 1050. 1051. 1052. 1053. 1054. +// 2025. 2026. 2027. 2028. 2029. 2030. 2031. 2032. 2033. 2034. +// 2035. 2036. 73. 74. 75. 76. 77. 78. 79. 80. +// 81. 82. 83. 84. 85. 86. 87. 88. 89. 90. +// 91. 92. 93. 94. 95. 96. 1055. 1056. 1057. 1058. +// 1059. 1060. 1061. 1062. 1063. 1064. 1065. 1066. 1067. 1068. +// 1069. 1070. 1071. 1072. 2037. 2038. 2039. 2040. 2041. 2042. +// 2043. 2044. 2045. 2046. 2047. 2048. 97. 98. 99. 100. +// 101. 102. 103. 104. 105. 106. 107. 108. 109. 110. +// 111. 112. 113. 114. 115. 116. 117. 118. 119. 120. +// 1073. 1074. 1075. 1076. 1077. 1078. 1079. 1080. 1081. 1082. +// 1083. 1084. 1085. 1086. 1087. 1088. 1089. 1090. 2049. 2050. +// 2051. 2052. 2053. 2054. 2055. 2056. 2057. 2058. 2059. 2060. +// 121. 122. 123. 124. 125. 126. 127. 128. 129. 130. +// 131. 132. 133. 134. 135. 136. 137. 138. 139. 140. +// 141. 142. 143. 144. 1091. 1092. 1093. 1094. 1095. 1096. +// 1097. 1098. 1099. 1100. 1101. 1102. 1103. 1104. 1105. 1106. +// 1107. 1108. 2061. 2062. 2063. 2064. 2065. 2066. 2067. 2068. +// 2069. 2070. 2071. 2072.] 
+NGRAPH_TEST(${BACKEND_NAME}, concat_5d) +{ + vector a_data(2 * 3 * 4 * 3 * 2); + for (int i = 0; i < 2 * 3 * 4 * 3 * 2; i++) + { + a_data[i] = float(i + 1); + } + + vector b_data(2 * 3 * 3 * 3 * 2); + for (int i = 0; i < 2 * 3 * 3 * 3 * 2; i++) + { + b_data[i] = 1000 + float(i + 1); + } + + vector c_data(2 * 3 * 2 * 3 * 2); + for (int i = 0; i < 2 * 3 * 2 * 3 * 2; i++) + { + c_data[i] = 2000 + float(i + 1); + } + + Shape shape_a{2, 3, 4, 3, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{2, 3, 3, 3, 2}; + auto B = make_shared(element::f32, shape_b); + Shape shape_c{2, 3, 2, 3, 2}; + auto C = make_shared(element::f32, shape_c); + Shape shape_r{2, 3, 9, 3, 2}; + + auto r = make_shared(NodeVector{A, B, C}, 2); + auto f = make_shared(r, op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, a_data); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, b_data); + auto c = backend->create_tensor(element::f32, shape_c); + copy_data(c, c_data); + + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ( + (vector{ + 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., + 1001., 1002., 1003., 1004., 1005., 1006., 1007., 1008., 1009., 1010., 1011., 1012., + 1013., 1014., 1015., 1016., 1017., 1018., 2001., 2002., 2003., 2004., 2005., 2006., + 2007., 2008., 2009., 2010., 2011., 2012., 25., 26., 27., 28., 29., 30., + 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., 42., + 43., 44., 45., 46., 47., 48., 1019., 1020., 1021., 1022., 1023., 1024., + 1025., 1026., 1027., 1028., 1029., 1030., 1031., 1032., 1033., 1034., 1035., 1036., + 2013., 2014., 2015., 2016., 2017., 2018., 2019., 2020., 2021., 2022., 2023., 2024., + 49., 50., 51., 52., 53., 54., 55., 
56., 57., 58., 59., 60., + 61., 62., 63., 64., 65., 66., 67., 68., 69., 70., 71., 72., + 1037., 1038., 1039., 1040., 1041., 1042., 1043., 1044., 1045., 1046., 1047., 1048., + 1049., 1050., 1051., 1052., 1053., 1054., 2025., 2026., 2027., 2028., 2029., 2030., + 2031., 2032., 2033., 2034., 2035., 2036., 73., 74., 75., 76., 77., 78., + 79., 80., 81., 82., 83., 84., 85., 86., 87., 88., 89., 90., + 91., 92., 93., 94., 95., 96., 1055., 1056., 1057., 1058., 1059., 1060., + 1061., 1062., 1063., 1064., 1065., 1066., 1067., 1068., 1069., 1070., 1071., 1072., + 2037., 2038., 2039., 2040., 2041., 2042., 2043., 2044., 2045., 2046., 2047., 2048., + 97., 98., 99., 100., 101., 102., 103., 104., 105., 106., 107., 108., + 109., 110., 111., 112., 113., 114., 115., 116., 117., 118., 119., 120., + 1073., 1074., 1075., 1076., 1077., 1078., 1079., 1080., 1081., 1082., 1083., 1084., + 1085., 1086., 1087., 1088., 1089., 1090., 2049., 2050., 2051., 2052., 2053., 2054., + 2055., 2056., 2057., 2058., 2059., 2060., 121., 122., 123., 124., 125., 126., + 127., 128., 129., 130., 131., 132., 133., 134., 135., 136., 137., 138., + 139., 140., 141., 142., 143., 144., 1091., 1092., 1093., 1094., 1095., 1096., + 1097., 1098., 1099., 1100., 1101., 1102., 1103., 1104., 1105., 1106., 1107., 1108., + 2061., 2062., 2063., 2064., 2065., 2066., 2067., 2068., 2069., 2070., 2071., 2072.}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, concat_zero_length_1d_last) +{ + Shape shape_a{4}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{0}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{4}; + + auto r = make_shared(NodeVector{A, B}, 0); + auto f = make_shared(r, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + vector a_data{1, 2, 3, 4}; + vector b_data(0); + + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, a_data); + auto b = backend->create_tensor(element::f32, 
shape_b); + copy_data(b, b_data); + + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, concat_zero_length_1d_middle) +{ + Shape shape_a{4}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{0}; + auto B = make_shared(element::f32, shape_b); + Shape shape_c{4}; + auto C = make_shared(element::f32, shape_c); + Shape shape_r{8}; + + auto r = make_shared(NodeVector{A, B, C}, 0); + auto f = make_shared(r, op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + vector a_data{1, 2, 3, 4}; + vector b_data(0); + vector c_data{5, 6, 7, 8}; + + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, a_data); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, b_data); + auto c = backend->create_tensor(element::f32, shape_c); + copy_data(c, c_data); + + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ((vector{1, 2, 3, 4, 5, 6, 7, 8}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, concat_zero_length_4d_middle) +{ + Shape shape_a{2, 2, 1, 1}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{2, 2, 0, 1}; + auto B = make_shared(element::f32, shape_b); + Shape shape_c{2, 2, 1, 1}; + auto C = make_shared(element::f32, shape_c); + Shape shape_r{2, 2, 2, 1}; + + auto r = make_shared(NodeVector{A, B, C}, 2); + auto f = make_shared(r, op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + vector a_data{1, 2, 3, 4}; + vector b_data(0); + vector c_data{5, 6, 7, 8}; + + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, a_data); + auto b = backend->create_tensor(element::f32, shape_b); + 
copy_data(b, b_data); + auto c = backend->create_tensor(element::f32, shape_c); + copy_data(c, c_data); + + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ((vector{1, 5, 2, 6, 3, 7, 4, 8}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, lrn) +{ + Shape shape{2, 3, 2, 1}; + auto A = make_shared(element::f32, shape); + auto lrn = make_shared(A, 1., 2., 1., 3); + auto f = make_shared(lrn, op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + vector args{0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, args); + + auto result = backend->create_tensor(element::f32, shape); + backend->call_with_validate(f, {result}, {a}); + + vector expected{0.f, + 0.05325444f, + 0.03402646f, + 0.01869806f, + 0.06805293f, + 0.03287071f, + 0.00509002f, + 0.00356153f, + 0.00174719f, + 0.0012555f, + 0.00322708f, + 0.00235574f}; + EXPECT_TRUE(test::all_close_f(expected, read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, select) +{ + Shape shape{2, 2, 2}; + auto A = make_shared(element::boolean, shape); + auto B = make_shared(element::f32, shape); + auto C = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, B, C), op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::boolean, shape); + copy_data(a, vector{0, 1, 1, 0, 0, 1, 0, 1}); + auto b = backend->create_tensor(element::f32, shape); + copy_data(b, vector{1, 2, 3, 4, 5, 6, 7, 8}); + auto c = backend->create_tensor(element::f32, shape); + copy_data(c, vector{11, 12, 13, 14, 15, 16, 17, 18}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ((vector{11, 2, 3, 14, 15, 6, 17, 8}), 
read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, tensor_constant) +{ + Shape shape{2, 2, 2}; + auto A = op::Constant::create(element::f32, shape, {1, 2, 3, 4, 5, 6, 7, 8}); + auto f = make_shared(A, op::ParameterVector{}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {}); + EXPECT_EQ((vector{1, 2, 3, 4, 5, 6, 7, 8}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, tensor_2constant) +{ + Shape shape{2, 2, 2}; + auto A = op::Constant::create(element::f32, shape, {1, 2, 3, 4, 5, 6, 7, 8}); + auto f = make_shared(NodeVector{A, A}, op::ParameterVector{}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto result0 = backend->create_tensor(element::f32, shape); + auto result1 = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result0, result1}, {}); + EXPECT_EQ((vector{1, 2, 3, 4, 5, 6, 7, 8}), read_vector(result0)); + EXPECT_EQ((vector{1, 2, 3, 4, 5, 6, 7, 8}), read_vector(result1)); +} + +NGRAPH_TEST(${BACKEND_NAME}, tensor_constant_with_op) +{ + Shape shape{2, 2, 2}; + auto A = op::Constant::create(element::f32, shape, {-1, 2, 3, -4, 5, -6, -7, 8}); + auto f = make_shared(make_shared(A), op::ParameterVector{}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {}); + EXPECT_EQ((vector{1, 2, 3, 4, 5, 6, 7, 8}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, constant_multi_use) +{ + auto A = make_shared(element::i32, Shape{}, std::vector{"388"}); + auto f = make_shared(A, op::ParameterVector{}); + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + std::shared_ptr r1 = backend->create_tensor(element::i32, 
Shape{}); + backend->call_with_validate(f, {r1}, std::vector>{}); + EXPECT_EQ(read_vector(r1), std::vector{388}); + + std::shared_ptr r2 = backend->create_tensor(element::i32, Shape{}); + backend->call_with_validate(f, {r2}, std::vector>{}); + EXPECT_EQ(read_vector(r2), std::vector{388}); +} + +NGRAPH_TEST(${BACKEND_NAME}, function_call) +{ + // First create "f(A,B,C) = (A+B)*C". + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto C = make_shared(element::f32, shape); + auto f = make_shared((A + B) * C, op::ParameterVector{A, B, C}); + + // Now make "g(X,Y,Z) = f(X,Y,Z) + f(X,Y,Z)" + auto X = make_shared(element::f32, shape); + auto Y = make_shared(element::f32, shape); + auto Z = make_shared(element::f32, shape); + auto g = + make_shared(make_shared(f, NodeVector{X + Y, Y + Z, Z + X}) + + make_shared(f, NodeVector{X, Y, Z}), + op::ParameterVector{X, Y, Z}); + + // Now call g on some test vectors. + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto x = backend->create_tensor(element::f32, shape); + copy_data(x, vector{1, 2, 3, 4}); + auto y = backend->create_tensor(element::f32, shape); + copy_data(y, vector{5, 6, 7, 8}); + auto z = backend->create_tensor(element::f32, shape); + copy_data(z, vector{9, 10, 11, 12}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(g, {result}, {x, y, z}); + EXPECT_EQ((vector{254, 368, 502, 656}), read_vector(result)); + + backend->call_with_validate(g, {result}, {y, x, z}); + EXPECT_EQ((vector{278, 400, 542, 704}), read_vector(result)); + + backend->call_with_validate(g, {result}, {x, z, y}); + EXPECT_EQ((vector{194, 296, 418, 560}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, convert_int32_float32) +{ + Shape shape{2, 2}; + auto A = make_shared(element::i32, shape); + auto f = + make_shared(make_shared(A, element::f32), op::ParameterVector{A}); + + auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::i32, shape); + copy_data(a, vector{1, 2, 3, 4}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, convert_uint16_float32) +{ + Shape shape{2, 2}; + auto A = make_shared(element::u16, shape); + auto f = + make_shared(make_shared(A, element::f32), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::u16, shape); + copy_data(a, vector{1, 2, 3, 4}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, convert_int32_bool) +{ + Shape shape{2, 2}; + auto A = make_shared(element::i32, shape); + auto f = make_shared(make_shared(A, element::boolean), + op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::i32, shape); + copy_data(a, vector{1, 2, 3, 4}); + auto result = backend->create_tensor(element::boolean, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, convert_float32_bool) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, element::boolean), + op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1, 2, 3, 4}); + auto result = backend->create_tensor(element::boolean, shape); + + 
backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, slice_scalar) +{ + Shape shape_a{}; + auto A = make_shared(element::f32, shape_a); + Shape shape_r{}; + auto r = make_shared(A, Coordinate{}, Coordinate{}); + auto f = make_shared(r, op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{312}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{312}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, slice_matrix) +{ + Shape shape_a{4, 4}; + auto A = make_shared(element::f32, shape_a); + Shape shape_r{3, 2}; + auto r = make_shared(A, Coordinate{0, 1}, Coordinate{3, 3}); + auto f = make_shared(r, op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{2, 3, 6, 7, 10, 11}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, slice_vector) +{ + Shape shape_a{16}; + auto A = make_shared(element::f32, shape_a); + Shape shape_r{12}; + auto r = make_shared(A, Coordinate{2}, Coordinate{14}); + auto f = make_shared(r, op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a}); + 
EXPECT_EQ((vector{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_axis_0_overlap) +{ + Shape shape_a{4, 4}; + auto A = make_shared(element::f32, shape_a); + auto B = make_shared(element::f32, shape_a); + auto C = make_shared(A, B); + Shape shape_r{2, 4}; + auto D = make_shared(C, Coordinate{0, 0}, Coordinate{2, 4}); + auto E = make_shared(C, Coordinate{1, 0}, Coordinate{3, 4}); + auto r = make_shared(D, E); + auto f = make_shared(r, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + auto b = backend->create_tensor(element::f32, shape_a); + copy_data(b, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((vector{12, 16, 20, 24, 28, 32, 36, 40}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_strided) +{ + Shape shape_a{4, 4}; + auto A = make_shared(element::f32, shape_a); + Shape shape_r{2, 2}; + auto r = make_shared(A, Coordinate{1, 0}, Coordinate{4, 4}, Strides{2, 3}); + auto f = make_shared(r, op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{4, 7, 12, 15}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, slice_3d) +{ + Shape shape_a{4, 4, 4}; + auto A = make_shared(element::f32, shape_a); + Shape shape_r{2, 2, 2}; + auto r = make_shared(A, Coordinate{1, 1, 1}, 
Coordinate{3, 3, 3}); + auto f = make_shared(r, op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{21, 22, 25, 26, 37, 38, 41, 42}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, slice_3d_strided) +{ + Shape shape_a{4, 4, 4}; + auto A = make_shared(element::f32, shape_a); + Shape shape_r{2, 2, 2}; + auto r = make_shared(A, Coordinate{0, 0, 0}, Coordinate{4, 4, 4}, Strides{2, 2, 2}); + auto f = make_shared(r, op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{0, 2, 8, 10, 32, 34, 40, 42}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, slice_3d_strided_different_strides) +{ + Shape shape_a{4, 4, 4}; + auto A = make_shared(element::f32, shape_a); + Shape shape_r{2, 2, 2}; + auto r = make_shared(A, Coordinate{0, 0, 0}, Coordinate{4, 4, 4}, Strides{2, 2, 3}); + auto f = make_shared(r, op::ParameterVector{A}); + + auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{0, 3, 8, 11, 32, 35, 40, 43}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, scalar_constant_float32) +{ + auto r = op::Constant::create(element::f32, Shape{}, {4.75}); + auto f = make_shared(r, op::ParameterVector{}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto result = backend->create_tensor(element::f32, Shape{}); + + backend->call_with_validate(f, {result}, {}); + EXPECT_EQ(vector{4.75f}, read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, scalar_constant_int64) +{ + auto r = op::Constant::create(element::i64, Shape{}, {2112}); + auto f = make_shared(r, op::ParameterVector{}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto result = backend->create_tensor(element::i64, Shape{}); + + backend->call_with_validate(f, {result}, {}); + EXPECT_EQ(vector{2112}, read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, tensor_constant_float32) +{ + Shape shape{2, 2}; + auto r = op::Constant::create(element::f32, shape, {4.75, 4.5, -5.25, 0.0}); + auto f = make_shared(r, op::ParameterVector{}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {}); + EXPECT_EQ((vector{4.75f, 4.5f, -5.25f, 0.0f}), 
read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, tensor_constant_int64) +{ + Shape shape{2, 2}; + auto r = op::Constant::create(element::i64, shape, {2112, 1848, 1776, 1964}); + auto f = make_shared(r, op::ParameterVector{}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto result = backend->create_tensor(element::i64, shape); + + backend->call_with_validate(f, {result}, {}); + EXPECT_EQ((vector{2112, 1848, 1776, 1964}), read_vector(result)); +} + +// TODO: Kahan sum only works in limited cases with CPU / Interpreter backend +NGRAPH_TEST(${BACKEND_NAME}, kahan_sum_to_scalar) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + float epsilon = 9.5367431640625e-7f; + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{epsilon, -1.f, 0.f, 1.f}); + auto result = backend->create_tensor(element::f32, Shape{}); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_TRUE(test::all_close_f(vector{epsilon}, read_vector(result))); +} + +// TODO: Kahan sum only works in limited cases with CPU / Interpreter backend +NGRAPH_TEST(${BACKEND_NAME}, kahan_sum_3d_to_vector) +{ + Shape shape_a{3, 3, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3}; + auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + float epsilon_a = 1.220703125e-4f; + float epsilon_b = 3.0517578125e-5f; + float epsilon_c = 7.62939453125e-6f; + copy_data(a, vector{1, 1, 1, 1, 1, 1, epsilon_a, epsilon_b, epsilon_c, + 1, 1, 1, 1, 1, 1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1}); + auto result = 
backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_TRUE(test::all_close_f(vector{epsilon_a, epsilon_b, epsilon_c}, + read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, constant_equality_bool) +{ + Shape shape{4}; + // auto A = make_shared(element::boolean, shape); + // auto B = make_shared(element::boolean, shape); + // auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); + + auto A = op::Constant::create(element::boolean, shape, {true, false, true, false}); + auto B = op::Constant::create(element::boolean, shape, {true, true, true, true}); + auto f = make_shared(make_shared(A, B), op::ParameterVector{}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto result = backend->create_tensor(element::boolean, shape); + + backend->call_with_validate(f, {result}, {}); + EXPECT_EQ((vector{true, false, true, false}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, replace_slice_scalar) +{ + Shape shape_a{}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{}; + auto r = make_shared(A, B, Coordinate{}, Coordinate{}); + auto f = make_shared(r, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{312}); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{808}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((vector{808}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, replace_slice_matrix_inplace) +{ + Shape shape_a{4, 4}; + auto A = make_shared(element::f32, shape_a); + auto abs_A = make_shared(A); + + Shape shape_b{3, 2}; + auto B = make_shared(element::f32, shape_b); + 
Shape shape_r{4, 4}; + auto r = make_shared(abs_A, B, Coordinate{0, 1}, Coordinate{3, 3}); + auto abs_r = make_shared(r); + auto f = make_shared(abs_r, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{102, 103, 106, 107, 110, 111}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((vector{1, 102, 103, 4, 5, 106, 107, 8, 9, 110, 111, 12, 13, 14, 15, 16}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, replace_slice_matrix) +{ + Shape shape_a{4, 4}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{3, 2}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{4, 4}; + auto r = make_shared(A, B, Coordinate{0, 1}, Coordinate{3, 3}); + auto f = make_shared(r, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{102, 103, 106, 107, 110, 111}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((vector{1, 102, 103, 4, 5, 106, 107, 8, 9, 110, 111, 12, 13, 14, 15, 16}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, replace_slice_vector) +{ + Shape shape_a{16}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{12}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{16}; + auto r = make_shared(A, B, Coordinate{2}, Coordinate{14}); + auto f = make_shared(r, 
op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ( + (vector{0, 1, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 14, 15}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, replace_slice_3d) +{ + Shape shape_a{4, 4, 4}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{2, 2, 2}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{4, 4, 4}; + auto r = make_shared(A, B, Coordinate{1, 1, 1}, Coordinate{3, 3, 3}); + auto f = make_shared(r, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{921, 922, 925, 926, 937, 938, 941, 942}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + + 16, 17, 18, 19, 20, 921, 922, 23, 24, 925, 926, 27, 28, 29, 30, 31, + + 32, 33, 34, 35, 36, 937, 938, 39, 40, 941, 942, 43, 44, 45, 46, 47, + + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}), + 
read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, replace_slice_3d_strided) +{ + Shape shape_a{4, 4, 4}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{2, 2, 2}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{4, 4, 4}; + auto r = make_shared( + A, B, Coordinate{0, 0, 0}, Coordinate{4, 4, 4}, Strides{2, 2, 2}); + auto f = make_shared(r, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{900, 902, 908, 910, 932, 934, 940, 942}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((vector{900, 1, 902, 3, 4, 5, 6, 7, 908, 9, 910, 11, 12, 13, 14, 15, + + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + + 932, 33, 934, 35, 36, 37, 38, 39, 940, 41, 942, 43, 44, 45, 46, 47, + + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, replace_slice_3d_strided_different_strides) +{ + Shape shape_a{4, 4, 4}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{2, 2, 2}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{4, 4, 4}; + auto r = make_shared( + A, B, Coordinate{0, 0, 0}, Coordinate{4, 4, 4}, Strides{2, 2, 3}); + auto f = make_shared(r, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{0, 
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{900, 903, 908, 911, 932, 935, 940, 943}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((vector{900, 1, 2, 903, 4, 5, 6, 7, 908, 9, 10, 911, 12, 13, 14, 15, + + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + + 932, 33, 34, 935, 36, 37, 38, 39, 940, 41, 42, 943, 44, 45, 46, 47, + + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_0d) +{ + Shape shape{}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{6}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{6}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_1d_nochange) +{ + Shape shape{8}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{0, 1, 2, 3, 4, 5, 6, 7}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_1d_0) +{ + Shape 
shape{8}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{7, 6, 5, 4, 3, 2, 1, 0}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_2d_nochange) +{ + Shape shape{4, 3}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, + test::NDArray({{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}).get_vector()); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ( + (test::NDArray({{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}).get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_2d_0) +{ + Shape shape{4, 3}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, + test::NDArray({{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}).get_vector()); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ( + (test::NDArray({{9, 10, 11}, {6, 7, 8}, {3, 4, 5}, {0, 1, 2}}).get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_2d_1) +{ + Shape shape{4, 3}; + auto A = 
make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, + test::NDArray({{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}).get_vector()); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ( + (test::NDArray({{2, 1, 0}, {5, 4, 3}, {8, 7, 6}, {11, 10, 9}}).get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_2d_01) +{ + Shape shape{4, 3}; + auto A = make_shared(element::f32, shape); + auto f = + make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, + test::NDArray({{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}).get_vector()); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ( + (test::NDArray({{11, 10, 9}, {8, 7, 6}, {5, 4, 3}, {2, 1, 0}}).get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_nochange) +{ + Shape shape{2, 4, 3}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, + test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, + {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) + .get_vector()); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, + 
{{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_0) +{ + Shape shape{2, 4, 3}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, + test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, + {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) + .get_vector()); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((test::NDArray({{{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}, + {{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_1) +{ + Shape shape{2, 4, 3}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, + test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, + {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) + .get_vector()); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((test::NDArray({{{9, 10, 11}, {6, 7, 8}, {3, 4, 5}, {0, 1, 2}}, + {{21, 22, 23}, {18, 19, 20}, {15, 16, 17}, {12, 13, 14}}}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_2) +{ + Shape shape{2, 4, 3}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{2}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create 
some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, + test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, + {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) + .get_vector()); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((test::NDArray({{{2, 1, 0}, {5, 4, 3}, {8, 7, 6}, {11, 10, 9}}, + {{14, 13, 12}, {17, 16, 15}, {20, 19, 18}, {23, 22, 21}}}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_01) +{ + Shape shape{2, 4, 3}; + auto A = make_shared(element::f32, shape); + auto f = + make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, + test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, + {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) + .get_vector()); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((test::NDArray({{{21, 22, 23}, {18, 19, 20}, {15, 16, 17}, {12, 13, 14}}, + {{9, 10, 11}, {6, 7, 8}, {3, 4, 5}, {0, 1, 2}}}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_02) +{ + Shape shape{2, 4, 3}; + auto A = make_shared(element::f32, shape); + auto f = + make_shared(make_shared(A, AxisSet{0, 2}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, + test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, + {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) + .get_vector()); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + 
EXPECT_EQ((test::NDArray({{{14, 13, 12}, {17, 16, 15}, {20, 19, 18}, {23, 22, 21}}, + {{2, 1, 0}, {5, 4, 3}, {8, 7, 6}, {11, 10, 9}}}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_12) +{ + Shape shape{2, 4, 3}; + auto A = make_shared(element::f32, shape); + auto f = + make_shared(make_shared(A, AxisSet{1, 2}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, + test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, + {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) + .get_vector()); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((test::NDArray({{{11, 10, 9}, {8, 7, 6}, {5, 4, 3}, {2, 1, 0}}, + {{23, 22, 21}, {20, 19, 18}, {17, 16, 15}, {14, 13, 12}}}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_012) +{ + Shape shape{2, 4, 3}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{0, 1, 2}), + op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, + test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, + {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) + .get_vector()); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((test::NDArray({{{23, 22, 21}, {20, 19, 18}, {17, 16, 15}, {14, 13, 12}}, + {{11, 10, 9}, {8, 7, 6}, {5, 4, 3}, {2, 1, 0}}}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, numeric_float_nan) +{ + Shape shape{5}; + auto A = op::Constant::create(element::f32, shape, {-2.5f, 25.5f, 2.25f, NAN, 6.0f}); + auto B = 
op::Constant::create(element::f32, shape, {10.0f, 5.0f, 2.25f, 10.0f, NAN}); + auto f = make_shared(make_shared(A, B), op::ParameterVector{}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto result = backend->create_tensor(element::boolean, shape); + backend->call_with_validate(f, {result}, {}); + EXPECT_EQ((vector{false, false, true, false, false}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, numeric_double_nan) +{ + Shape shape{5}; + auto A = op::Constant::create(element::f64, shape, {-2.5f, 25.5f, 2.25f, NAN, 6.0f}); + auto B = op::Constant::create(element::f64, shape, {10.0f, 5.0f, 2.25f, 10.0f, NAN}); + auto f = make_shared(make_shared(A, B), op::ParameterVector{}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto result = backend->create_tensor(element::boolean, shape); + backend->call_with_validate(f, {result}, {}); + EXPECT_EQ((vector{false, false, true, false, false}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, numeric_float_inf) +{ + Shape shape{5}; + auto A = op::Constant::create(element::f32, shape, {-2.5f, 25.5f, 2.25f, INFINITY, 6.0f}); + auto B = op::Constant::create(element::f32, shape, {10.0f, 5.0f, 2.25f, 10.0f, -INFINITY}); + auto f = make_shared(make_shared(A, B), op::ParameterVector{}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto result = backend->create_tensor(element::boolean, shape); + backend->call_with_validate(f, {result}, {}); + EXPECT_EQ((vector{false, false, true, false, false}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, numeric_double_inf) +{ + Shape shape{5}; + auto A = op::Constant::create(element::f64, shape, {-2.5f, 25.5f, 2.25f, INFINITY, 6.0f}); + auto B = op::Constant::create(element::f64, shape, {10.0f, 5.0f, 2.25f, 10.0f, -INFINITY}); + auto f = make_shared(make_shared(A, B), 
op::ParameterVector{}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto result = backend->create_tensor(element::boolean, shape); + backend->call_with_validate(f, {result}, {}); + EXPECT_EQ((vector{false, false, true, false, false}), read_vector(result)); +} + +// +// From the XLA docs: https://www.tensorflow.org/performance/xla/operation_semantics#selectandscatter +// +NGRAPH_TEST(${BACKEND_NAME}, select_and_scatter_with_overlap) +{ + Shape shape_sel_a{}; + auto SEL_A = make_shared(element::f32, shape_sel_a); + Shape shape_sel_b{}; + auto SEL_B = make_shared(element::f32, shape_sel_b); + auto sel_f = make_shared(make_shared(SEL_A, SEL_B), + op::ParameterVector{SEL_A, SEL_B}); + + Shape shape_scatter_a{}; + auto SCATTER_A = make_shared(element::f32, shape_scatter_a); + Shape shape_scatter_b{}; + auto SCATTER_B = make_shared(element::f32, shape_scatter_b); + auto scatter_f = + make_shared(SCATTER_A + SCATTER_B, op::ParameterVector{SCATTER_A, SCATTER_B}); + + Shape shape_a{4, 5}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{2, 2}; + auto B = make_shared(element::f32, shape_b); + Shape shape_c{}; + auto C = make_shared(element::f32, shape_c); + Shape shape_r{4, 5}; + Shape window_shape{2, 3}; + auto window_strides = Strides{2, 2}; + auto f = make_shared( + make_shared(A, B, C, sel_f, scatter_f, window_shape, window_strides), + op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, + test::NDArray( + {{7, 2, 5, 3, 8}, {3, 8, 9, 3, 4}, {1, 5, 7, 5, 6}, {0, 6, 2, 10, 2}}) + .get_vector()); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, test::NDArray({{2, 6}, {3, 1}}).get_vector()); + auto c = backend->create_tensor(element::f32, shape_c); + copy_data(c, vector{0}); + auto result = 
backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ((test::NDArray( + {{0, 0, 0, 0, 0}, {0, 0, 8, 0, 0}, {0, 0, 3, 0, 0}, {0, 0, 0, 1, 0}}) + .get_vector()), + read_vector(result)); +} + +// +// From the XLA docs: https://www.tensorflow.org/performance/xla/operation_semantics#selectandscatter +// +NGRAPH_TEST(${BACKEND_NAME}, select_and_scatter_without_overlap) +{ + Shape shape_sel_a{}; + auto SEL_A = make_shared(element::f32, shape_sel_a); + Shape shape_sel_b{}; + auto SEL_B = make_shared(element::f32, shape_sel_b); + auto sel_f = make_shared(make_shared(SEL_A, SEL_B), + op::ParameterVector{SEL_A, SEL_B}); + + Shape shape_scatter_a{}; + auto SCATTER_A = make_shared(element::f32, shape_scatter_a); + Shape shape_scatter_b{}; + auto SCATTER_B = make_shared(element::f32, shape_scatter_b); + auto scatter_f = + make_shared(SCATTER_A + SCATTER_B, op::ParameterVector{SCATTER_A, SCATTER_B}); + + Shape shape_a{4, 6}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{2, 2}; + auto B = make_shared(element::f32, shape_b); + Shape shape_c{}; + auto C = make_shared(element::f32, shape_c); + Shape shape_r{4, 6}; + Shape window_shape{2, 3}; + auto window_strides = Strides{2, 3}; + auto f = make_shared( + make_shared(A, B, C, sel_f, scatter_f, window_shape, window_strides), + op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, + test::NDArray( + {{7, 2, 5, 3, 10, 2}, {3, 8, 9, 3, 4, 2}, {1, 5, 7, 5, 6, 1}, {0, 6, 2, 7, 2, 8}}) + .get_vector()); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, test::NDArray({{2, 6}, {3, 1}}).get_vector()); + auto c = backend->create_tensor(element::f32, shape_c); + copy_data(c, vector{0}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, 
{result}, {a, b, c}); + EXPECT_EQ((test::NDArray( + {{0, 0, 0, 0, 6, 0}, {0, 0, 2, 0, 0, 0}, {0, 0, 3, 0, 0, 0}, {0, 0, 0, 0, 0, 1}}) + .get_vector()), + read_vector(result)); +} + +// +// Adapted from the XLA docs to provide an example in >2D: https://www.tensorflow.org/performance/xla/operation_semantics#selectandscatter +// +NGRAPH_TEST(${BACKEND_NAME}, select_and_scatter_3d_without_overlap) +{ + Shape shape_sel_a{}; + auto SEL_A = make_shared(element::f32, shape_sel_a); + Shape shape_sel_b{}; + auto SEL_B = make_shared(element::f32, shape_sel_b); + auto sel_f = make_shared(make_shared(SEL_A, SEL_B), + op::ParameterVector{SEL_A, SEL_B}); + + Shape shape_scatter_a{}; + auto SCATTER_A = make_shared(element::f32, shape_scatter_a); + Shape shape_scatter_b{}; + auto SCATTER_B = make_shared(element::f32, shape_scatter_b); + auto scatter_f = + make_shared(SCATTER_A + SCATTER_B, op::ParameterVector{SCATTER_A, SCATTER_B}); + + Shape shape_a{2, 4, 6}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{1, 2, 2}; + auto B = make_shared(element::f32, shape_b); + Shape shape_c{}; + auto C = make_shared(element::f32, shape_c); + Shape shape_r{2, 4, 6}; + Shape window_shape{2, 2, 3}; + auto window_strides = Strides{2, 2, 3}; + auto f = make_shared( + make_shared(A, B, C, sel_f, scatter_f, window_shape, window_strides), + op::ParameterVector{A, B, C}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data( + a, + test::NDArray( + {{{7, 2, 5, 3, 10, 2}, {3, 8, 9, 3, 4, 2}, {1, 5, 7, 5, 6, 1}, {0, 6, 2, 7, 2, 8}}, + {{2, 5, 8, 3, 4, 2}, {1, 2, 8, 4, 5, 2}, {10, 2, 3, 4, 1, 0}, {4, 1, 2, 4, 5, 7}}}) + .get_vector()); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, test::NDArray({{{2, 6}, {3, 1}}}).get_vector()); + auto c = backend->create_tensor(element::f32, shape_c); + copy_data(c, vector{0}); + auto result = 
backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b, c}); + EXPECT_EQ( + (test::NDArray( + {{{0, 0, 0, 0, 6, 0}, {0, 0, 2, 0, 0, 0}, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 1}}, + {{0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}, {3, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}}}) + .get_vector()), + read_vector(result)); +} + +template +void make_unary_empty_test(const string& backend_name) +{ + Shape shape{0}; + + op::ParameterVector params; + NodeVector result_list; + for (size_t i = 0; i < s_known_element_types.size(); i++) + { + shared_ptr p = make_shared(s_known_element_types[i], shape); + params.push_back(p); + result_list.push_back(make_shared(p)); + } + + auto f = make_shared(result_list, params); + auto backend = runtime::Backend::create(backend_name); + + vector> inputs; + vector> outputs; + for (size_t i = 0; i < s_known_element_types.size(); i++) + { + inputs.push_back(backend->create_tensor(s_known_element_types[i], shape)); + outputs.push_back(backend->create_tensor(s_known_element_types[i], shape)); + } + + backend->call_with_validate(f, outputs, inputs); + + EXPECT_EQ(read_vector(inputs[0]).size(), 0); + EXPECT_EQ(read_vector(inputs[1]).size(), 0); + EXPECT_EQ(read_vector(inputs[2]).size(), 0); + EXPECT_EQ(read_vector(inputs[3]).size(), 0); + EXPECT_EQ(read_vector(inputs[4]).size(), 0); + EXPECT_EQ(read_vector(inputs[5]).size(), 0); + EXPECT_EQ(read_vector(inputs[6]).size(), 0); + EXPECT_EQ(read_vector(inputs[7]).size(), 0); + EXPECT_EQ(read_vector(inputs[8]).size(), 0); + EXPECT_EQ(read_vector(inputs[9]).size(), 0); + + EXPECT_EQ(read_vector(outputs[0]).size(), 0); + EXPECT_EQ(read_vector(outputs[1]).size(), 0); + EXPECT_EQ(read_vector(outputs[2]).size(), 0); + EXPECT_EQ(read_vector(outputs[3]).size(), 0); + EXPECT_EQ(read_vector(outputs[4]).size(), 0); + EXPECT_EQ(read_vector(outputs[5]).size(), 0); + EXPECT_EQ(read_vector(outputs[6]).size(), 0); + EXPECT_EQ(read_vector(outputs[7]).size(), 0); + 
EXPECT_EQ(read_vector(outputs[8]).size(), 0); + EXPECT_EQ(read_vector(outputs[9]).size(), 0); +} + +template +void make_binary_empty_test(const string& backend_name, bool is_comparison = false) +{ + Shape shape{0}; + op::ParameterVector A; + for (size_t i = 0; i < s_known_element_types.size(); i++) + { + A.push_back(make_shared(s_known_element_types[i], shape)); + } + + NodeVector result_list; + for (shared_ptr p : A) + { + result_list.push_back(make_shared(p, p)); + } + + auto f = make_shared(result_list, A); + auto backend = runtime::Backend::create(backend_name); + + vector> inputs; + vector> outputs; + for (size_t i = 0; i < s_known_element_types.size(); i++) + { + inputs.push_back(backend->create_tensor(s_known_element_types[i], shape)); + if (is_comparison) + { + outputs.push_back(backend->create_tensor(element::from(), shape)); + } + else + { + outputs.push_back(backend->create_tensor(s_known_element_types[i], shape)); + } + } + + backend->call_with_validate(f, outputs, inputs); + + EXPECT_EQ(read_vector(inputs[0]).size(), 0); + EXPECT_EQ(read_vector(inputs[1]).size(), 0); + EXPECT_EQ(read_vector(inputs[2]).size(), 0); + EXPECT_EQ(read_vector(inputs[3]).size(), 0); + EXPECT_EQ(read_vector(inputs[4]).size(), 0); + EXPECT_EQ(read_vector(inputs[5]).size(), 0); + EXPECT_EQ(read_vector(inputs[6]).size(), 0); + EXPECT_EQ(read_vector(inputs[7]).size(), 0); + EXPECT_EQ(read_vector(inputs[8]).size(), 0); + EXPECT_EQ(read_vector(inputs[9]).size(), 0); + + if (is_comparison) + { + EXPECT_EQ(read_vector(outputs[0]).size(), 0); + EXPECT_EQ(read_vector(outputs[1]).size(), 0); + EXPECT_EQ(read_vector(outputs[2]).size(), 0); + EXPECT_EQ(read_vector(outputs[3]).size(), 0); + EXPECT_EQ(read_vector(outputs[4]).size(), 0); + EXPECT_EQ(read_vector(outputs[5]).size(), 0); + EXPECT_EQ(read_vector(outputs[6]).size(), 0); + EXPECT_EQ(read_vector(outputs[7]).size(), 0); + EXPECT_EQ(read_vector(outputs[8]).size(), 0); + EXPECT_EQ(read_vector(outputs[9]).size(), 0); + } + else + { + 
EXPECT_EQ(read_vector(outputs[0]).size(), 0); + EXPECT_EQ(read_vector(outputs[1]).size(), 0); + EXPECT_EQ(read_vector(outputs[2]).size(), 0); + EXPECT_EQ(read_vector(outputs[3]).size(), 0); + EXPECT_EQ(read_vector(outputs[4]).size(), 0); + EXPECT_EQ(read_vector(outputs[5]).size(), 0); + EXPECT_EQ(read_vector(outputs[6]).size(), 0); + EXPECT_EQ(read_vector(outputs[7]).size(), 0); + EXPECT_EQ(read_vector(outputs[8]).size(), 0); + EXPECT_EQ(read_vector(outputs[9]).size(), 0); + } +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_abs) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_ceiling) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_exp) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_floor) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_log) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_negative) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_not) +{ + Shape shape{0}; + auto A = make_shared(element::from(), shape); + auto f = make_shared(make_shared(A), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::from(), shape); + auto result = backend->create_tensor(element::from(), shape); + + backend->call_with_validate(f, {result}, {a}); + + auto in_vec = read_vector(a); + auto out_vec = read_vector(result); + + EXPECT_EQ(in_vec.size(), 0); + EXPECT_EQ(out_vec.size(), 0); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_sign) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_sqrt) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_sin) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_sinh) +{ + 
make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_cos) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_cosh) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_tan) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_tanh) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_asin) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_acos) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_atan) +{ + make_unary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_add) +{ + make_binary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_divide) +{ + make_binary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_eq) +{ + make_binary_empty_test("${BACKEND_NAME}", true); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_greater) +{ + make_binary_empty_test("${BACKEND_NAME}", true); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_greatereq) +{ + make_binary_empty_test("${BACKEND_NAME}", true); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_less) +{ + make_binary_empty_test("${BACKEND_NAME}", true); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_lesseq) +{ + make_binary_empty_test("${BACKEND_NAME}", true); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_maximum) +{ + make_binary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_minimum) +{ + make_binary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_multiply) +{ + make_binary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_not_equal) +{ + make_binary_empty_test("${BACKEND_NAME}", true); +} + +NGRAPH_TEST(${BACKEND_NAME}, zero_sized_power) +{ + make_binary_empty_test("${BACKEND_NAME}"); +} + 
+NGRAPH_TEST(${BACKEND_NAME}, zero_sized_subtract) +{ + make_binary_empty_test("${BACKEND_NAME}"); +} + +NGRAPH_TEST(${BACKEND_NAME}, convolution_outlining) +{ + Shape shape_a{1, 2, 2, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{2, 2, 1, 1}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{1, 2, 2, 2}; + auto conv1 = make_shared(A, + B, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}); + auto conv2 = make_shared(conv1, + B, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}); + auto f = make_shared(conv2, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{1.0f, 1.0f, 1.0f, 1.0f}); + auto result = backend->create_tensor(element::f32, shape_r); + + vector expected_result{4.0f, 4.0f, 4.0f, 4.0f, 4.0f, 4.0f, 4.0f, 4.0f}; + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ(vector{expected_result}, read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, computation_reuse) +{ + Shape shape_a{1, 16, 2, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{32, 16, 1, 1}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{1, 32, 2, 2}; + auto conv = make_shared(A, + B, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}); + Shape pool_shape{1, 1}; + auto pool = make_shared(conv, pool_shape); + auto bias = make_shared( + op::Constant::create(element::f32, Shape{}, {2.14}), shape_r, AxisSet{0, 1, 2, 3}); + auto result_op = make_shared(pool + bias); + auto f = make_shared(ResultVector{result_op}, op::ParameterVector{A, B}); + + auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); + + vector input(64, 1.0f); + vector weights(512, 0.5f); + vector rv(128); + + auto a = backend->create_tensor(element::f32, shape_a, input.data()); + auto b = backend->create_tensor(element::f32, shape_b, weights.data()); + auto result = backend->create_tensor(element::f32, shape_r, rv.data()); + + backend->call_with_validate(f, {result}, {a, b}); + + vector rv_saved(rv); + + b->set_stale(false); + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ(rv_saved, rv); +} + +NGRAPH_TEST(${BACKEND_NAME}, pad_interior_1d) +{ + Shape shape_a{6}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{16}; + Shape padding_below{0}; + Shape padding_above{0}; + Shape padding_interior{2}; + auto f = make_shared( + make_shared(A, B, padding_below, padding_above, padding_interior), + op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, test::NDArray({1, 2, 3, 4, 5, 6}).get_vector()); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{2112}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((test::NDArray( + {1, 2112, 2112, 2, 2112, 2112, 3, 2112, 2112, 4, 2112, 2112, 5, 2112, 2112, 6}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, pad_exterior_1d) +{ + Shape shape_a{6}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{15}; + Shape padding_below{4}; + Shape padding_above{5}; + Shape padding_interior{0}; + auto f = make_shared( + make_shared(A, B, padding_below, padding_above, padding_interior), + op::ParameterVector{A, B}); + + auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, test::NDArray({1, 2, 3, 4, 5, 6}).get_vector()); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{2112}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((test::NDArray( + {2112, 2112, 2112, 2112, 1, 2, 3, 4, 5, 6, 2112, 2112, 2112, 2112, 2112}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_1d) +{ + Shape shape_a{6}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{25}; + Shape padding_below{4}; + Shape padding_above{5}; + Shape padding_interior{2}; + auto f = make_shared( + make_shared(A, B, padding_below, padding_above, padding_interior), + op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, test::NDArray({1, 2, 3, 4, 5, 6}).get_vector()); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{2112}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((test::NDArray({2112, 2112, 2112, 2112, 1, 2112, 2112, 2, 2112, + 2112, 3, 2112, 2112, 4, 2112, 2112, 5, 2112, + 2112, 6, 2112, 2112, 2112, 2112, 2112}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_2d) +{ + Shape shape_a{2, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{7, 6}; + Shape padding_below{1, 0}; + Shape padding_above{2, 1}; + Shape padding_interior{2, 1}; + auto f = make_shared( + make_shared(A, B, padding_below, 
padding_above, padding_interior), + op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, test::NDArray({{1, 2, 3}, {4, 5, 6}}).get_vector()); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{9}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((test::NDArray({{9, 9, 9, 9, 9, 9}, + {1, 9, 2, 9, 3, 9}, + {9, 9, 9, 9, 9, 9}, + {9, 9, 9, 9, 9, 9}, + {4, 9, 5, 9, 6, 9}, + {9, 9, 9, 9, 9, 9}, + {9, 9, 9, 9, 9, 9}}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, pad_exterior_2d_0x0) +{ + Shape shape_a{0, 0}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{5, 5}; + Shape padding_below{2, 3}; + Shape padding_above{3, 2}; + Shape padding_interior{0, 0}; + auto f = make_shared( + make_shared(A, B, padding_below, padding_above, padding_interior), + op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + // copy_data(a, test::NDArray({{}}).get_vector()); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{2112}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((test::NDArray({{2112, 2112, 2112, 2112, 2112}, + {2112, 2112, 2112, 2112, 2112}, + {2112, 2112, 2112, 2112, 2112}, + {2112, 2112, 2112, 2112, 2112}, + {2112, 2112, 2112, 2112, 2112}}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, pad_exterior_2d_0x3) +{ + Shape shape_a{0, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{}; + auto B = 
make_shared(element::f32, shape_b); + Shape shape_r{5, 5}; + Shape padding_below{2, 1}; + Shape padding_above{3, 1}; + Shape padding_interior{0, 0}; + auto f = make_shared( + make_shared(A, B, padding_below, padding_above, padding_interior), + op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + // copy_data(a, test::NDArray({}).get_vector()); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{2112}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((test::NDArray({{2112, 2112, 2112, 2112, 2112}, + {2112, 2112, 2112, 2112, 2112}, + {2112, 2112, 2112, 2112, 2112}, + {2112, 2112, 2112, 2112, 2112}, + {2112, 2112, 2112, 2112, 2112}}) + .get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, pad_exterior_2d_3x0) +{ + Shape shape_a{3, 0}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{5, 5}; + Shape padding_below{1, 3}; + Shape padding_above{1, 2}; + Shape padding_interior{0, 0}; + auto f = make_shared( + make_shared(A, B, padding_below, padding_above, padding_interior), + op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + // copy_data(a, test::NDArray({}).get_vector()); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{2112}); + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((test::NDArray({{2112, 2112, 2112, 2112, 2112}, + {2112, 2112, 2112, 2112, 2112}, + {2112, 2112, 2112, 2112, 2112}, + {2112, 2112, 2112, 2112, 2112}, + {2112, 2112, 2112, 2112, 2112}}) + 
.get_vector()), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, pad_exterior_4d_1x2x2x2) +{ + Shape shape_a{1, 2, 2, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{1, 2, 4, 4}; + Shape padding_below{0, 0, 1, 1}; + Shape padding_above{0, 0, 1, 1}; + Shape padding_interior{0, 0, 0, 0}; + auto f = make_shared( + make_shared(A, B, padding_below, padding_above, padding_interior), + op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + // clang-format off + copy_data(a, test::NDArray( + { + { + { + {0.0f, 0.0f}, + {0.0f, 0.0f} + }, + { + {0.0f, 0.0f}, + {0.0f, 0.0f} + } + } + }).get_vector()); + // clang-format on + + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{42}); + + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + // clang-format off + EXPECT_EQ((test::NDArray( + { + { + { + {42.0f, 42.0f, 42.0f, 42.0f}, + {42.0f, 0.0f, 0.0f, 42.0f}, + {42.0f, 0.0f, 0.0f, 42.0f}, + {42.0f, 42.0f, 42.0f, 42.0f} + }, + { + {42.0f, 42.0f, 42.0f, 42.0f}, + {42.0f, 0.0f, 0.0f, 42.0f}, + {42.0f, 0.0f, 0.0f, 42.0f}, + {42.0f, 42.0f, 42.0f, 42.0f} + } + } + }).get_vector()), + read_vector(result)); + // clang-format on +} + +// This is a regression test for one of TF's unit tests, which was failing. +// The problem was inappropriate handling of the shape computation for a +// zero-length axis with interior padding. Rather than subtract 1 from the +// source shape and multiply by the interior padding (which causes underflow), +// we should just count the pre-interior-padding length as zero. 
+NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_4d_2x0x3x2) +{ + Shape shape_a{2, 0, 3, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{}; + auto B = make_shared(element::f32, shape_b); + Shape padding_below{1, 0, 0, 0}; + Shape padding_above{0, 2, 0, 0}; + Shape padding_interior{2, 1, 0, 0}; + Shape shape_r{5, 2, 3, 2}; + auto f = make_shared( + make_shared(A, B, padding_below, padding_above, padding_interior), + op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + // copy_data(a, test::NDArray({}).get_vector()); + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{2112}); + auto result = backend->create_tensor(element::f32, shape_r); + + vector expected(5 * 2 * 3 * 2, 2112); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ(expected, read_vector(result)); +} + +// This test covers the case with multiple image and with asymetric pad +// bug has been found on nvGPU side now covered by this test +NGRAPH_TEST(${BACKEND_NAME}, pad_2channel_2image_asym) +{ + Shape shape_a{2, 2, 4, 4}; + auto window_movement_strides = Strides{2, 2}; + Shape padding_below{0, 0, 0, 0}; + Shape padding_above{0, 0, 2, 2}; + Shape padding_interior{0, 0, 0, 0}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{2, 2, 6, 6}; + auto f = make_shared( + make_shared(A, B, padding_below, padding_above, padding_interior), + op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, + test::NDArray({{{{0, 1, 0, 2}, // img 0 chan 0 + {0, 3, 2, 0}, + {2, 0, 0, 0}, + {0, 2, 1, 0}}, + + {{0, 0, 0, 2}, // img 0 chan 1 + {0, 2, 3, 0}, + {2, 0, 1, 0}, + {2, 0, 0, 0}}}, + + {{{0, 
2, 1, 1}, // img 1 chan 0 + {0, 0, 2, 0}, + {0, 0, 1, 2}, + {0, 0, 0, 0}}, + + {{2, 1, 0, 0}, // img 1 chan 1 + {0, 2, 0, 0}, + {1, 1, 2, 0}, + {1, 0, 0, 0}}}}) + .get_vector()); + + auto b = backend->create_tensor(element::f32, shape_b); + copy_data(b, vector{42}); + + auto result = backend->create_tensor(element::f32, shape_r); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((test::NDArray({{{{0, 1, 0, 2, 42, 42}, // img 0 chan 0 + {0, 3, 2, 0, 42, 42}, + {2, 0, 0, 0, 42, 42}, + {0, 2, 1, 0, 42, 42}, + {42, 42, 42, 42, 42, 42}, + {42, 42, 42, 42, 42, 42}}, + + {{0, 0, 0, 2, 42, 42}, // img 1 chan 0 + {0, 2, 3, 0, 42, 42}, + {2, 0, 1, 0, 42, 42}, + {2, 0, 0, 0, 42, 42}, + {42, 42, 42, 42, 42, 42}, + {42, 42, 42, 42, 42, 42}}}, + + {{{0, 2, 1, 1, 42, 42}, // img 1 chan 0 + {0, 0, 2, 0, 42, 42}, + {0, 0, 1, 2, 42, 42}, + {0, 0, 0, 0, 42, 42}, + {42, 42, 42, 42, 42, 42}, + {42, 42, 42, 42, 42, 42}}, + + {{2, 1, 0, 0, 42, 42}, // img 1 chan 1 + {0, 2, 0, 0, 42, 42}, + {1, 1, 2, 0, 42, 42}, + {1, 0, 0, 0, 42, 42}, + {42, 42, 42, 42, 42, 42}, + {42, 42, 42, 42, 42, 42}}}}) + .get_vector()), + read_vector(result)); +} + +// Trivial case with no reduced axes. 
+NGRAPH_TEST(${BACKEND_NAME}, product_trivial) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1, 2, 3, 4}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); +} + +// Failure has been reported at 5D for some reason +NGRAPH_TEST(${BACKEND_NAME}, product_trivial_5d) +{ + Shape shape{2, 2, 2, 2, 2}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, product_to_scalar) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto f = + make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1, 2, 3, 4}); + auto result = backend->create_tensor(element::f32, Shape{}); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{24}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure 
reduction doesn't clobber the + // input tensors, so let's do this too. + EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, product_matrix_columns) +{ + Shape shape_a{3, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{2}; + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{15, 48}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. + EXPECT_EQ((vector{1, 2, 3, 4, 5, 6}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, product_matrix_rows) +{ + Shape shape_a{3, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3}; + auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{2, 12, 30}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. 
+ EXPECT_EQ((vector{1, 2, 3, 4, 5, 6}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, product_matrix_rows_zero) +{ + Shape shape_a{3, 0}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3}; + auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + copy_data(result, vector({3, 3, 3})); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 1, 1}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. + EXPECT_EQ((vector{}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, product_matrix_cols_zero) +{ + // Now the reduction (g(x:float32[2,2],y:float32[]) = reduce(x,y,f,axes={})). + Shape shape_a{0, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{2}; + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + copy_data(result, vector({3, 3})); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 1}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. 
+ EXPECT_EQ((vector{}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, product_vector_zero) +{ + Shape shape_a{0}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{}; + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + copy_data(result, vector({3})); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. + EXPECT_EQ((vector{}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, product_matrix_to_scalar_zero_by_zero) +{ + Shape shape_a{0, 0}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{}; + auto f = + make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + copy_data(result, vector({3})); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. 
+ EXPECT_EQ((vector{}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, product_3d_to_matrix_most_sig) +{ + Shape shape_a{3, 3, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3, 3}; + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1 * 10 * 19, + 2 * 11 * 20, + 3 * 12 * 21, + 4 * 13 * 22, + 5 * 14 * 23, + 6 * 15 * 24, + 7 * 16 * 25, + 8 * 17 * 26, + 9 * 18 * 27}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, product_3d_to_matrix_least_sig) +{ + Shape shape_a{3, 3, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3, 3}; + auto f = make_shared(make_shared(A, AxisSet{2}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1 * 2 * 3, + 4 * 5 * 6, + 7 * 8 * 9, + 10 * 11 * 12, + 13 * 14 * 15, + 16 * 17 * 18, + 19 * 20 * 21, + 22 * 23 * 24, + 25 * 26 * 27}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, product_3d_to_vector) +{ + Shape shape_a{3, 3, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3}; + auto f = + make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + 
// Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1.0f * 10.0f * 19.0f * 4.0f * 13.0f * 22.0f * 7.0f * 16.0f * 25.0f, + 2.0f * 11.0f * 20.0f * 5.0f * 14.0f * 23.0f * 8.0f * 17.0f * 26.0f, + 3.0f * 12.0f * 21.0f * 6.0f * 15.0f * 24.0f * 9.0f * 18.0f * 27.0f}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, product_3d_to_scalar) +{ + Shape shape_a{3, 3, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{}; + auto f = make_shared(make_shared(A, AxisSet{0, 1, 2}), + op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_TRUE(test::all_close(vector{1.0f * 10.0f * 9.0f * 4.0f * 13.0f * 6.0f * 7.0f * + 12.0f * 3.0f * 2.0f * 11.0f * 8.0f * 5.0f * 14.0f * + 5.0f * 8.0f * 11.0f * 2.0f * 3.0f * 12.0f * 7.0f * + 6.0f * 13.0f * 4.0f * 9.0f * 10.0f * 1.0f}, + read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, product_3d_eliminate_zero_dim) +{ + Shape shape_a{3, 0, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3, 2}; + auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + + // 
Overwrite the initial result vector to make sure we're not just coincidentally getting the right value. + copy_data(result, vector{2112, 2112, 2112, 2112, 2112, 2112}); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 1, 1, 1, 1, 1}), read_vector(result)); +} + +// Trivial case with no reduced axes. +NGRAPH_TEST(${BACKEND_NAME}, max_trivial) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1, 2, 3, 4}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); +} + +// Failure has been reported at 5D for some reason +NGRAPH_TEST(${BACKEND_NAME}, max_trivial_5d) +{ + Shape shape{2, 2, 2, 2, 2}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, max_to_scalar) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = 
backend->create_tensor(element::f32, shape); + copy_data(a, vector{1, 2, 3, 4}); + auto result = backend->create_tensor(element::f32, Shape{}); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{4}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. + EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, max_matrix_columns) +{ + Shape shape_a{3, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{2}; + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{5, 6}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. + EXPECT_EQ((vector{1, 2, 3, 4, 5, 6}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, max_matrix_rows) +{ + Shape shape_a{3, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3}; + auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{2, 4, 6}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. 
+ EXPECT_EQ((vector{1, 2, 3, 4, 5, 6}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, max_matrix_rows_zero) +{ + Shape shape_a{3, 0}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3}; + auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + copy_data(result, vector({3, 3, 3})); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{-std::numeric_limits::infinity(), + -std::numeric_limits::infinity(), + -std::numeric_limits::infinity()}), + read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. + EXPECT_EQ((vector{}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, max_matrix_cols_zero) +{ + // Now the reduction (g(x:float32[2,2],y:float32[]) = reduce(x,y,f,axes={})). + Shape shape_a{0, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{2}; + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + copy_data(result, vector({3, 3})); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{-std::numeric_limits::infinity(), + -std::numeric_limits::infinity()}), + read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. 
+ EXPECT_EQ((vector{}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, max_vector_zero) +{ + Shape shape_a{0}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{}; + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + copy_data(result, vector({3})); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{-std::numeric_limits::infinity()}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. + EXPECT_EQ((vector{}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, max_matrix_to_scalar_zero_by_zero) +{ + Shape shape_a{0, 0}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{}; + auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + copy_data(result, vector({3})); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{-std::numeric_limits::infinity()}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. 
+ EXPECT_EQ((vector{}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, max_3d_to_matrix_most_sig) +{ + Shape shape_a{3, 3, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3, 3}; + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{19, 20, 21, 22, 23, 24, 25, 26, 27}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, max_3d_to_matrix_least_sig) +{ + Shape shape_a{3, 3, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3, 3}; + auto f = make_shared(make_shared(A, AxisSet{2}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{3, 6, 9, 12, 15, 18, 21, 24, 27}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, max_3d_to_vector) +{ + Shape shape_a{3, 3, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3}; + auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 
26, 27}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{25.0f, 26.0f, 27.0f}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, max_3d_to_scalar) +{ + Shape shape_a{3, 3, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{}; + auto f = + make_shared(make_shared(A, AxisSet{0, 1, 2}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{14.0f}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, max_3d_eliminate_zero_dim) +{ + Shape shape_a{3, 0, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3, 2}; + auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + + // Overwrite the initial result vector to make sure we're not just coincidentally getting the right value. + copy_data(result, vector{2112, 2112, 2112, 2112, 2112, 2112}); + + float mi = -std::numeric_limits::infinity(); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{mi, mi, mi, mi, mi, mi}), read_vector(result)); +} + +// Trivial case with no reduced axes. 
+NGRAPH_TEST(${BACKEND_NAME}, min_trivial) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1, 2, 3, 4}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); +} + +// Failure has been reported at 5D for some reason +NGRAPH_TEST(${BACKEND_NAME}, min_trivial_5d) +{ + Shape shape{2, 2, 2, 2, 2}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}), + read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, min_to_scalar) +{ + Shape shape{2, 2}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{1, 2, 3, 4}); + auto result = backend->create_tensor(element::f32, Shape{}); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't 
clobber the + // input tensors, so let's do this too. + EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, min_matrix_columns) +{ + Shape shape_a{3, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{2}; + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 2}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. + EXPECT_EQ((vector{1, 2, 3, 4, 5, 6}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, min_matrix_rows) +{ + Shape shape_a{3, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3}; + auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 3, 5}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. 
+ EXPECT_EQ((vector{1, 2, 3, 4, 5, 6}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, min_matrix_rows_zero) +{ + Shape shape_a{3, 0}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3}; + auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + copy_data(result, vector({3, 3, 3})); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{std::numeric_limits::infinity(), + std::numeric_limits::infinity(), + std::numeric_limits::infinity()}), + read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. + EXPECT_EQ((vector{}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, min_matrix_cols_zero) +{ + // Now the reduction (g(x:float32[2,2],y:float32[]) = reduce(x,y,f,axes={})). + Shape shape_a{0, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{2}; + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + copy_data(result, vector({3, 3})); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{std::numeric_limits::infinity(), + std::numeric_limits::infinity()}), + read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. 
+ EXPECT_EQ((vector{}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, min_vector_zero) +{ + Shape shape_a{0}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{}; + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + copy_data(result, vector({3})); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{std::numeric_limits::infinity()}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. + EXPECT_EQ((vector{}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, min_matrix_to_scalar_zero_by_zero) +{ + Shape shape_a{0, 0}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{}; + auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + copy_data(result, vector({3})); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{std::numeric_limits::infinity()}), read_vector(result)); + + // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the + // input tensors, so let's do this too. 
+ EXPECT_EQ((vector{}), read_vector(a)); +} + +NGRAPH_TEST(${BACKEND_NAME}, min_3d_to_matrix_most_sig) +{ + Shape shape_a{3, 3, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3, 3}; + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 2, 3, 4, 5, 6, 7, 8, 9}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, min_3d_to_matrix_least_sig) +{ + Shape shape_a{3, 3, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3, 3}; + auto f = make_shared(make_shared(A, AxisSet{2}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 4, 7, 10, 13, 16, 19, 22, 25}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, min_3d_to_vector) +{ + Shape shape_a{3, 3, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3}; + auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); + 
auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1, 2, 3}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, min_3d_to_scalar) +{ + Shape shape_a{3, 3, 3}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{}; + auto f = + make_shared(make_shared(A, AxisSet{0, 1, 2}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}); + auto result = backend->create_tensor(element::f32, shape_rt); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{1}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, min_3d_eliminate_zero_dim) +{ + Shape shape_a{3, 0, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_rt{3, 2}; + auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{}); + auto result = backend->create_tensor(element::f32, shape_rt); + + // Overwrite the initial result vector to make sure we're not just coincidentally getting the right value. 
+ copy_data(result, vector{2112, 2112, 2112, 2112, 2112, 2112}); + + float inf = std::numeric_limits::infinity(); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ((vector{inf, inf, inf, inf, inf, inf}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, sigmoid_n1c1h2w2) +{ + auto input = make_shared(element::f32, Shape{1, 1, 2, 2}); + auto sigmoid_node = make_shared(input); + auto func = make_shared(sigmoid_node, op::ParameterVector{input}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + shared_ptr a = backend->create_tensor(element::f32, input->get_shape()); + shared_ptr result = backend->create_tensor(element::f32, input->get_shape()); + + vector dataA{1.0f, 4.0f, 1.0f, 4.0f}; + copy_data(a, dataA); + + backend->call_with_validate(func, {result}, {a}); + vector expected{0.73105858f, 0.98201379f, 0.73105858f, 0.98201379f}; + ASSERT_TRUE(read_vector(result) == expected); +} + +NGRAPH_TEST(${BACKEND_NAME}, sigmoid_n1c1h4) +{ + auto input = make_shared(element::f32, Shape{1, 1, 4}); + auto sigmoid_node = make_shared(input); + auto func = make_shared(sigmoid_node, op::ParameterVector{input}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + shared_ptr a = backend->create_tensor(element::f32, input->get_shape()); + shared_ptr result = backend->create_tensor(element::f32, input->get_shape()); + + vector dataA{1.0f, 4.0f, 1.0f, 4.0f}; + copy_data(a, dataA); + + backend->call_with_validate(func, {result}, {a}); + vector expected{0.73105858f, 0.98201379f, 0.73105858f, 0.98201379f}; + ASSERT_TRUE(read_vector(result) == expected); +} + +NGRAPH_TEST(${BACKEND_NAME}, sigmoid_bprop_n1c1h4) +{ + auto input = make_shared(element::f32, Shape{1, 1, 4}); + auto delta = make_shared(element::f32, Shape{1, 1, 4}); + auto sigmoid_node = make_shared(input, delta); + auto func = make_shared(sigmoid_node, op::ParameterVector{input, delta}); + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + shared_ptr a = 
backend->create_tensor(element::f32, input->get_shape()); + shared_ptr b = backend->create_tensor(element::f32, delta->get_shape()); + shared_ptr result = backend->create_tensor(element::f32, input->get_shape()); + + vector dataA{1.0f, 4.0f, 1.0f, 4.0f}; + vector dataB{1.0f, 1.0f, 1.0f, 1.0f}; + + copy_data(a, dataA); + copy_data(b, dataB); + backend->call_with_validate(func, {result}, {a, b}); + + vector expected{0.196612f, 0.0176627f, 0.196612f, 0.0176627f}; + EXPECT_TRUE(test::all_close(expected, read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, relu_2Dfprop) +{ + auto shape_a = Shape{2, 5}; + auto A = make_shared(element::f32, shape_a); + auto relu = make_shared(A); + auto shape_rt = Shape{2, 5}; + auto f = make_shared(relu, op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5}); + auto result = backend->create_tensor(element::f32, shape_rt); + vector expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0}; + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ(read_vector(result), expected); +} + +NGRAPH_TEST(${BACKEND_NAME}, relu_4Dfprop) +{ + auto shape_a = Shape{2, 2, 2, 2}; + auto A = make_shared(element::f32, shape_a); + auto relu = make_shared(A); + auto shape_rt = Shape{2, 2, 2, 2}; + auto f = make_shared(relu, op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1}); + auto result = backend->create_tensor(element::f32, shape_rt); + vector expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1}; + + backend->call_with_validate(f, {result}, {a}); + EXPECT_EQ(read_vector(result), expected); +} + +NGRAPH_TEST(${BACKEND_NAME}, fuse_max_with_constant_zero_input_as_relu) +{ + auto shape_a = Shape{2, 5}; + auto A = 
op::Constant::create(element::f32, shape_a, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + auto B = make_shared(element::f32, shape_a); + auto max = make_shared(A, B); + auto shape_rt = Shape{2, 5}; + auto f = make_shared(max, op::ParameterVector{B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto b = backend->create_tensor(element::f32, shape_a); + copy_data(b, vector{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5}); + auto result = backend->create_tensor(element::f32, shape_rt); + vector expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0}; + + backend->call_with_validate(f, {result}, {b}); + EXPECT_EQ(read_vector(result), expected); +} + +NGRAPH_TEST(${BACKEND_NAME}, relu_2Dbackprop) +{ + auto shape_a = Shape{2, 5}; + auto A = make_shared(element::f32, shape_a); + auto delta_val = make_shared(element::f32, shape_a); + auto relu = make_shared(A, delta_val); + auto shape_rt = Shape{2, 5}; + auto f = make_shared(relu, op::ParameterVector{A, delta_val}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5}); + auto delta = backend->create_tensor(element::f32, shape_a); + copy_data(delta, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); + auto result = backend->create_tensor(element::f32, shape_rt); + vector expected{1, 2, 0, 4, 0, 6, 7, 0, 9, 0}; + + backend->call_with_validate(f, {result}, {a, delta}); + EXPECT_EQ(read_vector(result), expected); +} + +NGRAPH_TEST(${BACKEND_NAME}, relu_4Dbackprop) +{ + auto shape_a = Shape{2, 2, 2, 2}; + auto A = make_shared(element::f32, shape_a); + auto delta_val = make_shared(element::f32, shape_a); + auto relu = make_shared(A, delta_val); + auto shape_rt = Shape{2, 2, 2, 2}; + auto f = make_shared(relu, op::ParameterVector{A, delta_val}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, shape_a); + copy_data(a, vector{1, 8, -8, 17, -0.5, 1, 8, -8, 
17, -0.5, 1, 8, -8, 17, -0.5, 1}); + auto delta = backend->create_tensor(element::f32, shape_a); + copy_data(delta, vector{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1}); + auto result = backend->create_tensor(element::f32, shape_rt); + vector expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1}; + + backend->call_with_validate(f, {result}, {a, delta}); + EXPECT_EQ(read_vector(result), expected); +} + +NGRAPH_TEST(${BACKEND_NAME}, softmax_all) +{ + Shape shape{2, 3}; + auto A = make_shared(element::f32, shape); + auto f = + make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{-3, -2, -1, 0, 1, 2}); + auto result = backend->create_tensor(element::f32, shape); + + auto d = expf(-3) + expf(-2) + expf(-1) + expf(0) + expf(1) + expf(2); + + backend->call_with_validate(f, {result}, {a}); + vector expected{ + expf(-3) / d, expf(-2) / d, expf(-1) / d, expf(0) / d, expf(1) / d, expf(2) / d}; + EXPECT_TRUE(test::all_close_f(expected, read_vector(result))); + + // empty AxisSet is the same as "full" AxisSet + f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); + backend = runtime::Backend::create("${BACKEND_NAME}"); + + backend->call_with_validate(f, {result}, {a}); + EXPECT_TRUE(test::all_close_f(expected, read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, softmax_axis_3d) +{ + Shape shape{2, 2, 3}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{-10, -20, -30, -40, -50, -60, -1, -2, -3, -4, -5, -6}); + auto result = backend->create_tensor(element::f32, shape); + + auto d0 = expf(-10) + expf(-1); + auto d1 = expf(-20) + expf(-2); + auto d2 = expf(-30) + 
expf(-3); + auto d3 = expf(-40) + expf(-4); + auto d4 = expf(-50) + expf(-5); + auto d5 = expf(-60) + expf(-6); + + backend->call_with_validate(f, {result}, {a}); + vector expected{expf(-10) / d0, + expf(-20) / d1, + expf(-30) / d2, + expf(-40) / d3, + expf(-50) / d4, + expf(-60) / d5, + expf(-1) / d0, + expf(-2) / d1, + expf(-3) / d2, + expf(-4) / d3, + expf(-5) / d4, + expf(-6) / d5}; + + EXPECT_TRUE(test::all_close(expected, read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, softmax_axis_3d_double) +{ + Shape shape{2, 2, 3}; + auto A = make_shared(element::f64, shape); + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f64, shape); + copy_data(a, vector{-10, -20, -30, -40, -50, -60, -1, -2, -3, -4, -5, -6}); + auto result = backend->create_tensor(element::f64, shape); + + auto d0 = expf(-10) + expf(-1); + auto d1 = expf(-20) + expf(-2); + auto d2 = expf(-30) + expf(-3); + auto d3 = expf(-40) + expf(-4); + auto d4 = expf(-50) + expf(-5); + auto d5 = expf(-60) + expf(-6); + + backend->call_with_validate(f, {result}, {a}); + vector expected{expf(-10) / d0, + expf(-20) / d1, + expf(-30) / d2, + expf(-40) / d3, + expf(-50) / d4, + expf(-60) / d5, + expf(-1) / d0, + expf(-2) / d1, + expf(-3) / d2, + expf(-4) / d3, + expf(-5) / d4, + expf(-6) / d5}; + + EXPECT_TRUE(test::all_close(expected, read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, softmax_axis) +{ + Shape shape{2, 3}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{-10, -20, -30, -40, -50, -60}); + auto result = backend->create_tensor(element::f32, shape); + + auto d0 = expf(-10) + expf(-20) + expf(-30); + auto d1 = expf(-40) + expf(-50) + expf(-60); 
+ + backend->call_with_validate(f, {result}, {a}); + vector expected{expf(-10) / d0, + expf(-20) / d0, + expf(-30) / d0, + expf(-40) / d1, + expf(-50) / d1, + expf(-60) / d1}; + EXPECT_TRUE(test::all_close_f(expected, read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, softmax_axis_2) +{ + Shape shape{2, 3}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{-10, -20, -30, -40, -50, -60}); + auto result = backend->create_tensor(element::f32, shape); + + auto d0 = expf(-10) + expf(-40); + auto d1 = expf(-20) + expf(-50); + auto d2 = expf(-30) + expf(-60); + + backend->call_with_validate(f, {result}, {a}); + vector expected{expf(-10) / d0, + expf(-20) / d1, + expf(-30) / d2, + expf(-40) / d0, + expf(-50) / d1, + expf(-60) / d2}; + EXPECT_TRUE(test::all_close(expected, read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, softmax_axis_3d_trivial) +{ + Shape shape{1, 2, 3}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, vector{-10, -20, -30, -40, -50, -60}); + auto result = backend->create_tensor(element::f32, shape); + + backend->call_with_validate(f, {result}, {a}); + vector expected{1, 1, 1, 1, 1, 1}; + EXPECT_TRUE(test::all_close(expected, read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, softmax_underflow) +{ + Shape shape{2, 3}; + auto A = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto low = std::numeric_limits::lowest(); + + auto a = backend->create_tensor(element::f32, shape); + copy_data(a, 
vector{low, 1, 2, 3, 4, 5}); + auto result = backend->create_tensor(element::f32, shape); + + auto d0 = expf(low) + expf(3); + auto d1 = expf(1) + expf(4); + auto d2 = expf(2) + expf(5); + + backend->call_with_validate(f, {result}, {a}); + vector expected{ + expf(low) / d0, expf(1) / d1, expf(2) / d2, expf(3) / d0, expf(4) / d1, expf(5) / d2}; + EXPECT_TRUE(test::all_close(expected, read_vector(result))); +} + +NGRAPH_TEST(${BACKEND_NAME}, multiple_backends) +{ + Shape shape{2, 2}; + auto A1 = make_shared(element::f32, shape); + auto B1 = make_shared(element::f32, shape); + auto f = make_shared(A1 + B1, op::ParameterVector{A1, B1}); + + auto A2 = make_shared(element::f32, shape); + auto B2 = make_shared(element::f32, shape); + auto g = make_shared(A2 * B2, op::ParameterVector{A2, B2}); + + auto backend1 = runtime::Backend::create("${BACKEND_NAME}"); + + auto backend2 = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + shared_ptr a1 = backend1->create_tensor(element::f32, shape); + shared_ptr b1 = backend1->create_tensor(element::f32, shape); + shared_ptr result1 = backend1->create_tensor(element::f32, shape); + + shared_ptr a2 = backend2->create_tensor(element::f32, shape); + shared_ptr b2 = backend2->create_tensor(element::f32, shape); + shared_ptr result2 = backend2->create_tensor(element::f32, shape); + + copy_data(a1, test::NDArray({{1, 2}, {3, 4}}).get_vector()); + copy_data(b1, test::NDArray({{5, 6}, {7, 8}}).get_vector()); + + copy_data(a2, test::NDArray({{1, 2}, {3, 4}}).get_vector()); + copy_data(b2, test::NDArray({{5, 6}, {7, 8}}).get_vector()); + + backend1->call_with_validate(f, {result1}, {a1, b1}); + EXPECT_EQ(read_vector(result1), + (test::NDArray({{6, 8}, {10, 12}})).get_vector()); + + backend2->call_with_validate(g, {result2}, {a2, b2}); + EXPECT_EQ(read_vector(result2), + (test::NDArray({{5, 12}, {21, 32}})).get_vector()); +} + +NGRAPH_TEST(${BACKEND_NAME}, tensorview_custom_mem) +{ + auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); + + Shape shape{2, 2}; + + auto make_external = [&]() { + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); + + return f; + }; + + auto f = make_external(); + + vector av{2, 4, 8, 16}; + vector bv{1, 2, 4, 8}; + // use custom mem with tensorview, no need to copy data + auto a = backend->create_tensor(element::f32, shape, av.data()); + auto b = backend->create_tensor(element::f32, shape, bv.data()); + + // use custom mem with result tensorview + vector rv{0, 0, 0, 0}; + auto result = backend->create_tensor(element::f32, shape, rv.data()); + + // result should be in memory without needing explict read + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((vector{2, 2, 2, 2}), rv); +} + +NGRAPH_TEST(${BACKEND_NAME}, validate_call_input_count) +{ + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + Shape shape{2, 2}; + + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); + + auto a = backend->create_tensor(element::f32, shape); + auto b = backend->create_tensor(element::f32, shape); + auto c = backend->create_tensor(element::f32, shape); + + EXPECT_ANY_THROW(backend->call_with_validate(f, {c}, {a})); +} + +NGRAPH_TEST(${BACKEND_NAME}, validate_call_input_type) +{ + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + Shape shape{2, 2}; + + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); + + auto a = backend->create_tensor(element::i32, shape); + auto b = backend->create_tensor(element::f32, shape); + auto c = backend->create_tensor(element::f32, shape); + + EXPECT_ANY_THROW(backend->call_with_validate(f, {c}, {a, b})); +} + +NGRAPH_TEST(${BACKEND_NAME}, 
validate_call_input_shape) +{ + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + Shape shape{2, 2}; + + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); + + auto a = backend->create_tensor(element::f32, {2, 3}); + auto b = backend->create_tensor(element::f32, shape); + auto c = backend->create_tensor(element::f32, shape); + + EXPECT_ANY_THROW(backend->call_with_validate(f, {c}, {a, b})); +} + +NGRAPH_TEST(${BACKEND_NAME}, validate_call_output_count) +{ + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + Shape shape{2, 2}; + + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); + + auto a = backend->create_tensor(element::f32, shape); + auto b = backend->create_tensor(element::f32, shape); + auto c = backend->create_tensor(element::f32, shape); + auto d = backend->create_tensor(element::f32, shape); + + EXPECT_ANY_THROW(backend->call_with_validate(f, {c, d}, {a, b})); +} + +NGRAPH_TEST(${BACKEND_NAME}, validate_call_output_type) +{ + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + Shape shape{2, 2}; + + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); + + auto a = backend->create_tensor(element::i32, shape); + auto b = backend->create_tensor(element::f32, shape); + auto c = backend->create_tensor(element::f32, shape); + + EXPECT_ANY_THROW(backend->call_with_validate(f, {a}, {b, c})); +} + +NGRAPH_TEST(${BACKEND_NAME}, validate_call_output_shape) +{ + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + Shape shape{2, 2}; + + auto A = make_shared(element::f32, shape); + auto B = make_shared(element::f32, shape); + auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); + + auto a = 
backend->create_tensor(element::f32, {2, 3}); + auto b = backend->create_tensor(element::f32, shape); + auto c = backend->create_tensor(element::f32, shape); + + EXPECT_ANY_THROW(backend->call_with_validate(f, {a}, {c, b})); +} + +NGRAPH_TEST(${BACKEND_NAME}, logical_and) +{ + Shape shape{2, 2, 2}; + auto A = make_shared(element::boolean, shape); + auto B = make_shared(element::boolean, shape); + auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::boolean, shape); + copy_data(a, vector{1, 0, 1, 1, 1, 0, 1, 0}); + auto b = backend->create_tensor(element::boolean, shape); + copy_data(b, vector{0, 0, 1, 0, 0, 1, 1, 0}); + auto result = backend->create_tensor(element::boolean, shape); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((vector{0, 0, 1, 0, 0, 0, 1, 0}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, logical_or) +{ + Shape shape{2, 2, 2}; + auto A = make_shared(element::boolean, shape); + auto B = make_shared(element::boolean, shape); + auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto a = backend->create_tensor(element::boolean, shape); + copy_data(a, vector{1, 0, 1, 1, 1, 0, 1, 0}); + auto b = backend->create_tensor(element::boolean, shape); + copy_data(b, vector{0, 0, 1, 0, 0, 1, 1, 0}); + auto result = backend->create_tensor(element::boolean, shape); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ((vector{1, 0, 1, 1, 1, 1, 1, 0}), read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_b1c2h2w2) +{ + auto input_shape = Shape{1, 2, 2, 2}; + auto input = make_shared(element::f32, input_shape); + auto mean_shape = Shape{2}; + auto var_shape = Shape{2}; + auto gamma_shape = Shape{2}; + auto gamma = 
make_shared(element::f32, gamma_shape); + auto beta_shape = Shape{2}; + auto beta = make_shared(element::f32, beta_shape); + double eps = 0.001; + auto shape_r = Shape{1, 2, 2, 2}; + auto bn = make_shared(input, gamma, beta, eps); + + auto output_rt = std::make_shared(bn, 0); + auto mean_rt = std::make_shared(bn, 1); + auto variance_rt = std::make_shared(bn, 2); + + auto f = make_shared(NodeVector{output_rt, mean_rt, variance_rt}, + op::ParameterVector{input, gamma, beta}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + auto _input = backend->create_tensor(element::f32, Shape{1, 2, 2, 2}); + + copy_data(_input, + vector{0.54881352f, + 0.71518934f, + 0.60276335f, + 0.54488319f, + 0.42365479f, + 0.64589411f, + 0.4375872f, + 0.89177299f}); + auto _gamma = backend->create_tensor(element::f32, gamma_shape); + copy_data(_gamma, vector{1.0f, 1.0f}); + auto _beta = backend->create_tensor(element::f32, beta_shape); + copy_data(_beta, vector{0.0f, 0.0f}); + auto bn_output = backend->create_tensor(element::f32, shape_r); + auto result_mean = backend->create_tensor(element::f32, mean_shape); + auto result_variance = backend->create_tensor(element::f32, var_shape); + + vector expected_result{-0.71498716f, + 1.48388731f, + -0.00196938f, + -0.76693159f, + -0.91316032f, + 0.23943391f, + -0.84090298f, + 1.51462936f}; + vector expected_mean{0.602912f, 0.599727f}; + vector expected_variance{0.00472505f, 0.0361782f}; + + backend->call_with_validate( + f, {bn_output, result_mean, result_variance}, {_input, _gamma, _beta}); + + EXPECT_TRUE(test::all_close(expected_result, read_vector(bn_output), 1e-5f, 1e-6f)); + EXPECT_TRUE(test::all_close(expected_mean, read_vector(result_mean), 1e-5f, 1e-6f)); + EXPECT_TRUE( + test::all_close(expected_variance, read_vector(result_variance), 1e-5f, 1e-6f)); +} + +NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_b2c2h2w1) +{ + auto input_shape = Shape{2, 2, 2, 1}; + auto input = 
make_shared(element::f32, input_shape); + auto mean_shape = Shape{2}; + auto var_shape = Shape{2}; + auto gamma_shape = Shape{2}; + auto gamma = make_shared(element::f32, gamma_shape); + auto beta_shape = Shape{2}; + auto beta = make_shared(element::f32, beta_shape); + double eps = 0.001; + auto shape_r = Shape{2, 2, 2, 1}; + auto bn = make_shared(input, gamma, beta, eps); + + auto output_rt = std::make_shared(bn, 0); + auto mean_rt = std::make_shared(bn, 1); + auto variance_rt = std::make_shared(bn, 2); + + auto f = make_shared(NodeVector{output_rt, mean_rt, variance_rt}, + op::ParameterVector{input, gamma, beta}); + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + // Create some tensors for input/output + auto _input = backend->create_tensor(element::f32, input_shape); + copy_data(_input, + vector{0.54881352f, + 0.71518934f, + 0.60276335f, + 0.54488319f, + 0.42365479f, + 0.64589411f, + 0.4375872f, + 0.89177299f}); + + auto _gamma = backend->create_tensor(element::f32, gamma_shape); + copy_data(_gamma, vector{1.0f, 1.0f}); + auto _beta = backend->create_tensor(element::f32, beta_shape); + copy_data(_beta, vector{0.0f, 0.0f}); + auto bn_output = backend->create_tensor(element::f32, shape_r); + auto result_mean = backend->create_tensor(element::f32, mean_shape); + auto result_variance = backend->create_tensor(element::f32, var_shape); + + vector expected_result{ + -0.30327f, 1.1561f, -0.0963782f, -0.434702f, -1.4011f, 0.548275f, -1.06187f, 1.59295f}; + vector expected_mean{0.583388f, 0.619252f}; + vector expected_variance{0.0119972f, 0.0282681f}; + backend->call_with_validate( + f, {bn_output, result_mean, result_variance}, {_input, _gamma, _beta}); + + EXPECT_TRUE(test::all_close(expected_result, read_vector(bn_output))); + EXPECT_TRUE(test::all_close(expected_mean, read_vector(result_mean))); + EXPECT_TRUE( + test::all_close(expected_variance, read_vector(result_variance), 1e-5f, 1e-6f)); +} + +NGRAPH_TEST(${BACKEND_NAME}, batchnorm_bprop_n4c3h2w2) +{ 
+ auto input_shape = Shape{4, 3, 2, 2}; + auto shape_mean = Shape{3}; + auto input = make_shared(element::f32, input_shape); + auto mean_shape = Shape{3}; + auto mean = make_shared(element::f32, mean_shape); + auto var_shape = Shape{3}; + auto var = make_shared(element::f32, var_shape); + auto gamma_shape = Shape{3}; + auto gamma = make_shared(element::f32, gamma_shape); + auto beta_shape = Shape{3}; + auto beta = make_shared(element::f32, beta_shape); + double eps = 0.001; + auto shape_r = Shape{4, 3, 2, 2}; + auto bn = make_shared(input, gamma, beta, eps); + auto bn_dx = make_shared(bn, 0); + auto bn_dgamma = make_shared(bn, 1); + auto bn_dbeta = make_shared(bn, 2); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto _input = backend->create_tensor(element::f32, input_shape); + vector dataInput{ + 10.76331902f, 11.51178265f, 10.31018162f, 12.2993021f, 14.17626667f, 14.63498497f, + 13.63494492f, 13.84248161f, 11.34602547f, 13.22014618f, 10.46686649f, 10.39842987f, + 12.94806862f, 11.71670246f, 14.94438076f, 13.13236618f, 13.40889645f, 12.76128387f, + 11.34430027f, 11.86629677f, 11.11464024f, 10.93221283f, 11.95324039f, 10.96581173f, + 13.05455494f, 14.41404247f, 13.11169434f, 11.26559448f, 10.89965153f, 14.08202171f, + 11.12685776f, 12.58428574f, 12.59247875f, 13.00187492f, 12.66310215f, 10.06655025f, + 12.62048626f, 14.47942352f, 13.84950638f, 10.61425877f, 11.47936344f, 13.06011772f, + 13.63069057f, 12.31748772f, 13.84555244f, 10.95815468f, 12.78933334f, 12.75389099f}; + copy_data(_input, dataInput); + auto _mean = backend->create_tensor(element::f32, mean_shape); + copy_data(_mean, vector{12.56472874f, 12.80312157f, 11.81676865f}); + auto _var = backend->create_tensor(element::f32, var_shape); + copy_data(_var, vector{1.94557643f, 1.32772446f, 1.28163588f}); + + auto _gamma = backend->create_tensor(element::f32, gamma_shape); + copy_data(_gamma, vector{2.0f, 2.0f, 2.0f}); + auto _beta = backend->create_tensor(element::f32, beta_shape); + 
copy_data(_beta, vector{1.0f, 1.0f, 1.0f}); + auto result = backend->create_tensor(element::f32, shape_r); + + shared_ptr _delta = backend->create_tensor(element::f32, shape_r); + vector deltaData(shape_size(shape_r), 20.0f); + copy_data(_delta, deltaData); + + auto f = make_shared(NodeVector{bn_dx, bn_dgamma, bn_dbeta}, + op::ParameterVector{mean, var, input, gamma, beta}); + + auto C = std::make_shared(element::f32, shape_r); + + auto zero = ngraph::make_zero(bn_dgamma->get_element_type(), bn_dgamma->get_shape()); + ngraph::autodiff::Adjoints adjoints(NodeVector{bn_dx, bn_dgamma, bn_dbeta}, + NodeVector{C, zero, zero}); + + auto dinput = adjoints.backprop_node(input); + auto dgamma = adjoints.backprop_node(gamma); + auto dbeta = adjoints.backprop_node(beta); + + auto df = make_shared(NodeVector{dinput, dgamma, dbeta}, + op::ParameterVector{mean, var, input, gamma, beta, C}); + + // roundtrip serialization + string js = serialize(df, 4); + istringstream in(js); + df = deserialize(in); + + shared_ptr _dinput = backend->create_tensor(element::f32, shape_r); + shared_ptr _dgamma = backend->create_tensor(element::f32, gamma_shape); + shared_ptr _dbeta = backend->create_tensor(element::f32, beta_shape); + + backend->call_with_validate( + df, {_dinput, _dgamma, _dbeta}, {_mean, _var, _input, _gamma, _beta, _delta}); + + vector expected_input{ + 8.17051607e-06f, 4.77576657e-06f, 1.02257760e-05f, 1.20387525e-06f, -1.73868522e-06f, + 3.84632768e-06f, -1.07932050e-05f, -2.57458956e-06f, -2.22166714e-06f, -8.38779043e-06f, + -2.48082982e-06f, 5.89238360e-06f, -2.52895109e-07f, -8.68433445e-06f, -5.82726737e-06f, + 8.84659658e-06f, 3.03944108e-05f, 4.05480879e-05f, 1.84123158e-05f, 2.30061178e-05f, + 1.34087590e-05f, -9.26072571e-07f, -3.22908454e-05f, -2.07365116e-05f, -4.21330941e-05f, + 2.83083100e-05f, -3.71039101e-05f, -4.84390640e-06f, -2.93012376e-05f, 5.68858087e-06f, + 1.83181458e-05f, -1.07494506e-05f, -2.32429103e-06f, 6.92914809e-06f, -6.66512321e-06f, + 
-7.00302840e-06f, -3.46675184e-06f, -4.36748381e-06f, 6.73822226e-07f, -4.20158993e-06f, + 3.83005061e-06f, 5.85143729e-06f, 4.17875243e-06f, -8.64167783e-06f, 1.00170803e-05f, + -4.23939666e-06f, 4.80201680e-06f, 4.62702078e-06f}; + + ASSERT_TRUE(ngraph::test::all_close(read_vector(_dinput), expected_input, 1e-3f, 1e-4f)); + vector expected_dgamma{7.06315041e-05f, -2.35289335e-04f, -5.06639481e-05f}; + ASSERT_TRUE( + ngraph::test::all_close(read_vector(_dgamma), expected_dgamma, 1e-2f, 1e-3f)); + vector expected_dbeta{320.f, 320.f, 320.f}; + ASSERT_TRUE(ngraph::test::all_close(read_vector(_dbeta), expected_dbeta, 1e-4f, 1e-8f)); +} + +NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_inference_b2c2h2w1) +{ + auto input_shape = Shape{2, 2, 2, 1}; + auto input = make_shared(element::f32, input_shape); + auto mean_shape = Shape{2}; + auto mean = make_shared(element::f32, mean_shape); + auto var_shape = Shape{2}; + auto var = make_shared(element::f32, var_shape); + auto gamma_shape = Shape{2}; + auto gamma = make_shared(element::f32, gamma_shape); + auto beta_shape = Shape{2}; + auto beta = make_shared(element::f32, beta_shape); + double eps = 0.001; + auto shape_r = Shape{2, 2, 2, 1}; + auto bn = make_shared(input, gamma, beta, mean, var, eps); + + auto f = make_shared(bn, op::ParameterVector{input, gamma, beta, mean, var}); + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + // Create some tensors for input/output + auto _input = backend->create_tensor(element::f32, input_shape); + copy_data(_input, + vector{0.54881352f, + 0.71518934f, + 0.60276335f, + 0.54488319f, + 0.42365479f, + 0.64589411f, + 0.4375872f, + 0.89177299f}); + + auto _gamma = backend->create_tensor(element::f32, gamma_shape); + copy_data(_gamma, vector{1.0f, 1.0f}); + auto _beta = backend->create_tensor(element::f32, beta_shape); + copy_data(_beta, vector{0.0f, 0.0f}); + auto _mean = backend->create_tensor(element::f32, mean_shape); + copy_data(_mean, vector{0.583388f, 0.619252f}); + auto 
_var = backend->create_tensor(element::f32, var_shape); + copy_data(_var, vector{0.0119972f, 0.0282681f}); + auto bn_output = backend->create_tensor(element::f32, shape_r); + + vector expected_result{ + -0.30327f, 1.1561f, -0.0963782f, -0.434702f, -1.4011f, 0.548275f, -1.06187f, 1.59295f}; + backend->call_with_validate(f, {bn_output}, {_input, _gamma, _beta, _mean, _var}); + + ASSERT_TRUE( + ngraph::test::all_close(expected_result, read_vector(bn_output), 1e-3f, 1e-4f)); +} + +#if 0 +NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_globalstats_b2c2w2h1) +{ + auto input_shape = Shape{2, 2, 2, 1}; + auto input = make_shared(element::f32, input_shape); + auto mean_shape = Shape{2}; + auto mean = make_shared(element::f32, mean_shape); + auto var_shape = Shape{2}; + auto var = make_shared(element::f32, var_shape); + auto gamma_shape = Shape{2}; + auto gamma = make_shared(element::f32, gamma_shape); + auto beta_shape = Shape{2}; + auto beta = make_shared(element::f32, beta_shape); + double eps = 0.001; + auto shape_r = Shape{2, 2, 2, 1}; + auto bn = make_shared(input, gamma, beta, mean, var, eps); + + auto f = make_shared(bn, op::ParameterVector{gamma, beta, input, mean, var}); + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + // Create some tensors for input/output + auto _input = backend->create_tensor(element::f32, input_shape); + copy_data(_input, + vector{0.54881352f, + 0.71518934f, + 0.60276335f, + 0.54488319f, + 0.42365479f, + 0.64589411f, + 0.4375872f, + 0.89177299f}); + + auto _gamma = backend->create_tensor(element::f32, gamma_shape); + copy_data(_gamma, vector{1.0f, 1.0f}); + auto _beta = backend->create_tensor(element::f32, beta_shape); + copy_data(_beta, vector{0.0f, 0.0f}); + auto _mean = backend->create_tensor(element::f32, mean_shape); + copy_data(_mean, vector{0.583388f, 0.619252f}); + auto _var = backend->create_tensor(element::f32, var_shape); + copy_data(_var, vector{0.0119972f, 0.0282681f}); + auto bn_output = 
backend->create_tensor(element::f32, shape_r); + + vector expected_result{ + -0.30327f, 1.1561f, -0.0963782f, -0.434702f, -1.4011f, 0.548275f, -1.06187f, 1.59295f}; + backend->call_with_validate(f, {bn_output}, {_gamma, _beta, _input, _mean, _var}); + + ASSERT_TRUE( + ngraph::test::all_close(expected_result, read_vector(bn_output), 1e-3f, 1e-4f)); +} +#endif + +NGRAPH_TEST(${BACKEND_NAME}, reverse_sequence_n2c3h4w2) +{ + Shape shape{2, 3, 4, 2}; + Shape seq_len_shape{4}; + auto A = make_shared(element::i32, shape); + auto B = make_shared(element::i32, seq_len_shape); + + size_t batch_axis = 2; + size_t sequence_axis = 1; + auto rs = std::make_shared(A, B, batch_axis, sequence_axis); + + auto f = make_shared(rs, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + shared_ptr a = backend->create_tensor(element::i32, shape); + shared_ptr b = backend->create_tensor(element::i32, seq_len_shape); + + shared_ptr result = backend->create_tensor(element::i32, shape); + + std::vector input{ + 0, 0, 3, 0, 6, 0, 9, 0, 1, 0, 4, 0, 7, 0, 10, 0, 2, 0, 5, 0, 8, 0, 11, 0, + 12, 0, 15, 0, 18, 0, 21, 0, 13, 0, 16, 0, 19, 0, 22, 0, 14, 0, 17, 0, 20, 0, 23, 0, + }; + + std::vector seq_lenghts{1, 2, 1, 2}; + copy_data(b, seq_lenghts); + + std::vector expected{ + 0, 0, 4, 0, 6, 0, 10, 0, 1, 0, 3, 0, 7, 0, 9, 0, 2, 0, 5, 0, 8, 0, 11, 0, + + 12, 0, 16, 0, 18, 0, 22, 0, 13, 0, 15, 0, 19, 0, 21, 0, 14, 0, 17, 0, 20, 0, 23, 0}; + + copy_data(a, input); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ(read_vector(result), expected); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_sequence_n4c3h2w2) +{ + Shape shape{4, 3, 2, 2}; + auto A = make_shared(element::i32, shape); + Shape seq_len_shape{4}; + auto B = make_shared(element::i32, seq_len_shape); + + size_t batch_axis = 0; + size_t sequence_axis = 1; + + auto rs = std::make_shared(A, B, batch_axis, sequence_axis); + + auto f = make_shared(rs, 
op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + shared_ptr a = backend->create_tensor(element::i32, shape); + shared_ptr b = backend->create_tensor(element::i32, seq_len_shape); + + shared_ptr result = backend->create_tensor(element::i32, shape); + + std::vector seq_lenghts{1, 2, 3, 3}; + copy_data(b, seq_lenghts); + + std::vector input{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47}; + + std::vector expected{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19, + 12, 13, 14, 15, 20, 21, 22, 23, 32, 33, 34, 35, 28, 29, 30, 31, + 24, 25, 26, 27, 44, 45, 46, 47, 40, 41, 42, 43, 36, 37, 38, 39}; + + copy_data(a, input); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ(read_vector(result), expected); +} + +NGRAPH_TEST(${BACKEND_NAME}, reverse_sequence_n4d2c3h2w2) +{ + Shape shape{4, 2, 3, 2, 2}; + auto A = make_shared(element::i32, shape); + Shape seq_len_shape{4}; + auto B = make_shared(element::i32, seq_len_shape); + + size_t batch_axis = 0; + size_t sequence_axis = 2; + + auto rs = std::make_shared(A, B, batch_axis, sequence_axis); + + auto f = make_shared(rs, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + // Create some tensors for input/output + shared_ptr a = backend->create_tensor(element::i32, shape); + shared_ptr b = backend->create_tensor(element::i32, seq_len_shape); + + shared_ptr result = backend->create_tensor(element::i32, shape); + + std::vector input{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95}; + + std::vector expected{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 28, 29, 30, 31, 24, 25, 26, 27, + 32, 33, 34, 35, 40, 41, 42, 43, 36, 37, 38, 39, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 76, 77, 78, 79, 72, 73, 74, 75, + 80, 81, 82, 83, 88, 89, 90, 91, 84, 85, 86, 87, 92, 93, 94, 95}; + + copy_data(a, input); + + std::vector seq_lenghts{1, 2, 1, 2}; + copy_data(b, seq_lenghts); + + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ(read_vector(result), expected); +} + +NGRAPH_TEST(${BACKEND_NAME}, generate_mask) +{ + Shape scalar{}; + Shape result_shape{1, 128}; + const unsigned int seed = 777; + auto training = op::Constant::create(element::f32, Shape{}, {1}); + auto gen_mask = make_shared(training, result_shape, element::f32, seed, 0.5); + auto gen_mask2 = make_shared(training, result_shape, element::f32, seed, 0.5); + auto f = make_shared(NodeVector{gen_mask, gen_mask2}, op::ParameterVector{}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto is_not_zero_or_one = [](float num) { return num != 0.f && num != 1.f; }; + + auto result_tv1 = backend->create_tensor(result_shape); + auto result_tv2 = backend->create_tensor(result_shape); + backend->call_with_validate(f, {result_tv1, result_tv2}, {}); + auto result1 = read_vector(result_tv1); + auto result2 = read_vector(result_tv2); + ASSERT_EQ(result1, result2); + ASSERT_FALSE(std::any_of(result1.begin(), result1.end(), is_not_zero_or_one)); + backend->call_with_validate(f, {result_tv1, result_tv2}, {}); + auto result1_2 = read_vector(result_tv1); + auto result2_2 = read_vector(result_tv2); + ASSERT_NE(result1, result1_2); + ASSERT_FALSE(std::any_of(result1_2.begin(), result1_2.end(), is_not_zero_or_one)); + ASSERT_NE(result2, result2_2); + ASSERT_FALSE(std::any_of(result2_2.begin(), 
result2_2.end(), is_not_zero_or_one)); +} + +NGRAPH_TEST(${BACKEND_NAME}, quantize) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::f32; + auto output_type = element::u8; + + typedef float input_c_type; + typedef uint8_t output_c_type; + + op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_INFINITY; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(input_type, scale_offset_shape, {2}); + auto offset = op::Constant::create(output_type, scale_offset_shape, {1}); + auto quantize = + make_shared(X, scale, offset, output_type, quantization_axes, round_mode); + auto f = make_shared(quantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + // divide by scale 2 2 2 2 2 2 2 2 2 2 2 2 + // equals (rounded) 0 1 1 2 2 3 3 4 4 5 5 6 + // plus offset 1 1 1 1 1 1 1 1 1 1 1 1 + // equals 1 2 2 3 3 4 4 5 5 6 6 7 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, dequantize) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::u8; + auto output_type = element::f32; + + typedef uint8_t input_c_type; + typedef float output_c_type; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(output_type, scale_offset_shape, {2}); + auto offset = op::Constant::create(input_type, scale_offset_shape, {1}); + auto dequantize = make_shared(X, scale, offset, output_type, quantization_axes); + auto f = make_shared(dequantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = 
backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7}); + // minus offset 1 1 1 1 1 1 1 1 1 1 1 1 + // eqauls 0 1 1 2 2 3 3 4 4 5 5 6 + // multiplied by scale 2 2 2 2 2 2 2 2 2 2 2 2 + // equals 0 2 2 4 4 6 6 8 8 10 10 12 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, dequantize_zero_offset) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::u8; + auto output_type = element::f32; + + typedef uint8_t input_c_type; + typedef float output_c_type; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(output_type, scale_offset_shape, {2}); + auto offset = op::Constant::create(input_type, scale_offset_shape, {0}); + auto dequantize = make_shared(X, scale, offset, output_type, quantization_axes); + auto f = make_shared(dequantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7}); + // minus offset 0 0 0 0 0 0 0 0 0 0 0 0 + // multiplied by scale 2 2 2 2 2 2 2 2 2 2 2 2 + // equals 2 4 4 6 6 8 8 10 10 12 12 14 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, quantize_axes) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape{4}; + AxisSet quantization_axes{0}; + + auto input_type = element::f32; + auto output_type = element::u8; + + typedef float input_c_type; + typedef uint8_t output_c_type; + + op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_INFINITY; + + auto X = 
make_shared(input_type, input_shape); + auto scale = op::Constant::create(input_type, scale_offset_shape, {2, 3, 4, 5}); + auto offset = op::Constant::create(output_type, scale_offset_shape, {10, 20, 30, 40}); + auto quantize = + make_shared(X, scale, offset, output_type, quantization_axes, round_mode); + auto f = make_shared(quantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + // divided by scale 2 2 2 3 3 3 4 4 4 5 5 5 + // equals (rounded) 0 1 1 1 1 2 2 2 2 2 2 2 + // plus offset 10 10 10 20 20 20 30 30 30 40 40 40 + // equals 10 11 11 21 21 22 32 32 32 42 42 42 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{10, 11, 11, 21, 21, 22, 32, 32, 32, 42, 42, 42}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, dequantize_axes) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape{4}; + AxisSet quantization_axes{0}; + + auto input_type = element::u8; + auto output_type = element::f32; + + typedef uint8_t input_c_type; + typedef float output_c_type; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(output_type, scale_offset_shape, {2, 3, 4, 5}); + auto offset = op::Constant::create(input_type, scale_offset_shape, {10, 20, 30, 40}); + auto dequantize = make_shared(X, scale, offset, output_type, quantization_axes); + auto f = make_shared(dequantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{10, 11, 11, 21, 21, 22, 32, 32, 32, 42, 42, 42}); + // minus offset 10 10 10 20 20 20 30 30 30 40 40 40 + // equals 0 1 1 1 1 2 2 2 2 2 2 2 + // multiplied by scale 2 2 2 3 3 3 4 4 4 5 5 5 + // equals 0 2 
2 3 3 6 8 8 8 10 10 10 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{0, 2, 2, 3, 3, 6, 8, 8, 8, 10, 10, 10}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, quantize_int8) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::f32; + auto output_type = element::i8; + + typedef float input_c_type; + typedef int8_t output_c_type; + + op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_INFINITY; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(input_type, scale_offset_shape, {2}); + auto offset = op::Constant::create(output_type, scale_offset_shape, {1}); + auto quantize = + make_shared(X, scale, offset, output_type, quantization_axes, round_mode); + auto f = make_shared(quantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{0, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11}); + // divide by scale 2 2 2 2 2 2 2 2 2 2 2 2 + // equals (rounded) 0 -1 1 -2 2 -3 3 -4 4 -5 5 -6 + // plus offset 1 1 1 1 1 1 1 1 1 1 1 1 + // equals 1 0 2 -1 3 -2 4 -3 5 -4 6 -5 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{1, 0, 2, -1, 3, -2, 4, -3, 5, -4, 6, -5}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, dequantize_int8) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::i8; + auto output_type = element::f32; + + typedef int8_t input_c_type; + typedef float output_c_type; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(output_type, scale_offset_shape, {2}); + auto offset = op::Constant::create(input_type, scale_offset_shape, {1}); + auto dequantize = make_shared(X, scale, offset, output_type, quantization_axes); + auto f 
= make_shared(dequantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{1, 0, 2, -1, 3, -2, 4, -3, 5, -4, 6, -5}); + // minus offset 1 1 1 1 1 1 1 1 1 1 1 1 + // equals 0 -1 1 -2 2 -3 3 -4 4 -5 5 -6 + // multiplied by scale 2 2 2 2 2 2 2 2 2 2 2 2 + // equals 0 -2 2 -4 4 -6 6 -8 8 -10 10 -12 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{0, -2, 2, -4, 4, -6, 6, -8, 8, -10, 10, -12}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, quantize_clamp) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::f32; + auto output_type = element::i8; + + typedef float input_c_type; + typedef int8_t output_c_type; + + op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_INFINITY; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(input_type, scale_offset_shape, {0.00001}); + auto offset = op::Constant::create(output_type, scale_offset_shape, {1}); + auto quantize = + make_shared(X, scale, offset, output_type, quantization_axes, round_mode); + auto f = make_shared(quantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{0, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11}); + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ( + (vector{1, -128, 127, -128, 127, -128, 127, -128, 127, -128, 127, -128}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_NEAREST_TOWARD_ZERO) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::f32; + auto output_type = element::i8; + + typedef float 
input_c_type; + typedef int8_t output_c_type; + + op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_ZERO; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); + auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); + auto quantize = + make_shared(X, scale, offset, output_type, quantization_axes, round_mode); + auto f = make_shared(quantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); + // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 + // equals (rounded) 2 2 3 -2 -2 -3 3 3 4 -3 -3 -4 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{2, 2, 3, -2, -2, -3, 3, 3, 4, -3, -3, -4}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_NEAREST_UPWARD) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::f32; + auto output_type = element::i8; + + typedef float input_c_type; + typedef int8_t output_c_type; + + op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_UPWARD; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); + auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); + auto quantize = + make_shared(X, scale, offset, output_type, quantization_axes, round_mode); + auto f = make_shared(quantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); + // divide by scale 4 4 
4 4 4 4 4 4 4 4 4 4 + // equals (rounded) 2 3 3 -2 -2 -3 3 4 4 -3 -3 -4 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{2, 3, 3, -2, -2, -3, 3, 4, 4, -3, -3, -4}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_NEAREST_DOWNWARD) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::f32; + auto output_type = element::i8; + + typedef float input_c_type; + typedef int8_t output_c_type; + + op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_DOWNWARD; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); + auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); + auto quantize = + make_shared(X, scale, offset, output_type, quantization_axes, round_mode); + auto f = make_shared(quantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); + // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 + // equals (rounded) 2 2 3 -2 -3 -3 3 3 4 -3 -4 -4 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{2, 2, 3, -2, -3, -3, 3, 3, 4, -3, -4, -4}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_NEAREST_TOWARD_EVEN) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::f32; + auto output_type = element::i8; + + typedef float input_c_type; + typedef int8_t output_c_type; + + op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); + auto offset = op::Constant::create(output_type, 
scale_offset_shape, {0}); + auto quantize = + make_shared(X, scale, offset, output_type, quantization_axes, round_mode); + auto f = make_shared(quantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); + // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 + // equals (rounded) 2 2 3 -2 -2 -3 3 4 4 -3 -4 -4 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{2, 2, 3, -2, -2, -3, 3, 4, 4, -3, -4, -4}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_TOWARD_INFINITY) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::f32; + auto output_type = element::i8; + + typedef float input_c_type; + typedef int8_t output_c_type; + + op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_TOWARD_INFINITY; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); + auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); + auto quantize = make_shared( + X, + scale, + offset, + output_type, + quantization_axes, + static_cast(static_cast(round_mode))); + auto f = make_shared(quantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); + // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 + // equals (rounded) 3 3 3 -3 -3 -3 4 4 4 -4 -4 -4 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{3, 3, 3, -3, -3, -3, 4, 4, 4, -4, -4, -4}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_TOWARD_ZERO) +{ + Shape 
input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::f32; + auto output_type = element::i8; + + typedef float input_c_type; + typedef int8_t output_c_type; + + op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_TOWARD_ZERO; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); + auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); + auto quantize = make_shared( + X, + scale, + offset, + output_type, + quantization_axes, + static_cast(static_cast(round_mode))); + auto f = make_shared(quantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); + // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 + // equals (rounded) 2 2 2 -2 -2 -2 3 3 3 -3 -3 -3 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{2, 2, 2, -2, -2, -2, 3, 3, 3, -3, -3, -3}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_UP) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::f32; + auto output_type = element::i8; + + typedef float input_c_type; + typedef int8_t output_c_type; + + op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_UP; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); + auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); + auto quantize = + make_shared(X, scale, offset, output_type, quantization_axes, round_mode); + auto f = make_shared(quantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, 
input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); + // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 + // equals (rounded) 3 3 3 -2 -2 -2 4 4 4 -3 -3 -3 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{3, 3, 3, -2, -2, -2, 4, 4, 4, -3, -3, -3}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_DOWN) +{ + Shape input_shape{4, 3}; + Shape scale_offset_shape; + AxisSet quantization_axes; + + auto input_type = element::f32; + auto output_type = element::i8; + + typedef float input_c_type; + typedef int8_t output_c_type; + + op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_DOWN; + + auto X = make_shared(input_type, input_shape); + auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); + auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); + auto quantize = + make_shared(X, scale, offset, output_type, quantization_axes, round_mode); + auto f = make_shared(quantize, op::ParameterVector{X}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + auto x = backend->create_tensor(input_type, input_shape); + auto y = backend->create_tensor(output_type, input_shape); + + copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); + // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 + // equals (rounded) 2 2 2 -3 -3 -3 3 3 3 -4 -4 -4 + + backend->call_with_validate(f, {y}, {x}); + EXPECT_EQ((vector{2, 2, 2, -3, -3, -3, 3, 3, 3, -4, -4, -4}), + read_vector(y)); +} + +NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_bprop) +{ + Shape sca{1}; + Shape vec{1, 1, 1, 2}; + double eps = 1.0e-04; + + auto g = std::make_shared(element::f32, sca); + auto b = std::make_shared(element::f32, sca); + auto input = std::make_shared(element::f32, vec); + auto bn_fp = std::make_shared(input, g, b, eps); + auto bnorm = std::make_shared(bn_fp, 0); + auto mean = std::make_shared(bn_fp, 1); + auto var 
= std::make_shared(bn_fp, 2); + + auto delta = std::make_shared(element::f32, vec); + auto bn_bp = + std::make_shared(bnorm, g, b, mean, var, delta, eps); + auto dx = std::make_shared(bn_bp, 0); + + std::vector> args = { + {1.0f}, // gamma + {1.0f}, // beta + {1.1f, 1.0f}, // x + {1.0f, 1.0f}, // dy + }; + + auto func = std::make_shared(dx, op::ParameterVector{g, b, input, delta}); + auto results = execute(func, args, "${BACKEND_NAME}"); + EXPECT_TRUE(test::all_close_f(std::vector{350.957, -388.67}, results.at(0))); +} + +NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_bprop_2step) +{ + Shape sca{1}; + Shape vec{1, 1, 1, 2}; + double eps = 1.0e-04; + + auto g = std::make_shared(element::f32, sca); + auto b = std::make_shared(element::f32, sca); + auto input = std::make_shared(element::f32, vec); + auto bn_fp = std::make_shared(input, g, b, eps); + auto bnorm = std::make_shared(bn_fp, 0); + auto mean = std::make_shared(bn_fp, 1); + auto var = std::make_shared(bn_fp, 2); + + auto func_bn = + std::make_shared(NodeVector{bnorm, mean, var}, op::ParameterVector{g, b, input}); + + std::vector> args = { + {1.0f}, // gamma + {1.0f}, // beta + {1.1f, 1.0f}, // x + }; + auto results = execute(func_bn, args, "${BACKEND_NAME}"); + + g = std::make_shared(element::f32, sca); + b = std::make_shared(element::f32, sca); + auto bn_output = std::make_shared(element::f32, vec); + auto m = std::make_shared(element::f32, sca); + auto v = std::make_shared(element::f32, sca); + auto delta = std::make_shared(element::f32, vec); + auto bn_bp = std::make_shared(bn_output, g, b, m, v, delta, eps); + auto dx = std::make_shared(bn_bp, 0); + + args.pop_back(); // remove x + args.push_back(results.at(0)); // bn_output + args.push_back(results.at(1)); // m + args.push_back(results.at(2)); // v + args.push_back({1.0f, 1.0f}); // dy + + auto func = std::make_shared(dx, op::ParameterVector{g, b, bn_output, m, v, delta}); + results = execute(func, args, "${BACKEND_NAME}"); + 
EXPECT_TRUE(test::all_close_f(std::vector{350.957, -388.67}, results.at(0))); +} + +NGRAPH_TEST(${BACKEND_NAME}, shape_of_scalar) +{ + Shape input_shape{}; + Shape output_shape{0}; + + auto A = std::make_shared(element::f32, input_shape); + auto f = std::make_shared(std::make_shared(A), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, input_shape); + copy_data(a, vector{0}); + auto result = backend->create_tensor(element::u64, output_shape); + + backend->call_with_validate(f, {result}, {a}); + vector expected{}; + EXPECT_EQ(expected, read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, shape_of_vector) +{ + Shape input_shape{2}; + Shape output_shape{1}; + + auto A = std::make_shared(element::f32, input_shape); + auto f = std::make_shared(std::make_shared(A), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, input_shape); + copy_data(a, vector(2, 0)); + auto result = backend->create_tensor(element::u64, output_shape); + + backend->call_with_validate(f, {result}, {a}); + vector expected{2}; + EXPECT_EQ(expected, read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, shape_of_matrix) +{ + Shape input_shape{2, 4}; + Shape output_shape{2}; + + auto A = std::make_shared(element::f32, input_shape); + auto f = std::make_shared(std::make_shared(A), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, input_shape); + copy_data(a, vector(2 * 4, 0)); + auto result = backend->create_tensor(element::u64, output_shape); + + backend->call_with_validate(f, {result}, {a}); + vector expected{2, 4}; + EXPECT_EQ(expected, read_vector(result)); +} + +NGRAPH_TEST(${BACKEND_NAME}, shape_of_5d) +{ + Shape input_shape{2, 4, 8, 16, 32}; + Shape output_shape{5}; + + auto A = std::make_shared(element::f32, input_shape); + auto 
f = std::make_shared(std::make_shared(A), op::ParameterVector{A}); + + auto backend = runtime::Backend::create("${BACKEND_NAME}"); + + auto a = backend->create_tensor(element::f32, input_shape); + copy_data(a, vector(2 * 4 * 8 * 16 * 32, 0)); + auto result = backend->create_tensor(element::u64, output_shape); + + backend->call_with_validate(f, {result}, {a}); + vector expected{2, 4, 8, 16, 32}; + EXPECT_EQ(expected, read_vector(result)); +} diff --git a/test/cpu_fusion.cpp b/test/cpu_fusion.cpp index 481814c8d7e..5c7957dde7e 100644 --- a/test/cpu_fusion.cpp +++ b/test/cpu_fusion.cpp @@ -718,7 +718,7 @@ TEST(cpu_fusion, batchnorm_fprop_relu_b1c2h2w2) auto beta = make_shared(element::f32, beta_shape); double eps = 0.001; auto shape_r = Shape{1, 2, 2, 2}; - auto bn = make_shared(eps, gamma, beta, input); + auto bn = make_shared(input, gamma, beta, eps); auto output_rt = std::make_shared(bn, 0); // Note, op::Splice is used to break Relu(BatchNorm) fusion @@ -1082,8 +1082,8 @@ shared_ptr gen_groupconv_batchnorm(const bool add_goe, // Adding a goe will stop fusion since the patterns wont expect to see this op auto bn = - add_goe ? std::make_shared(eps, gamma, beta, goe_bn, mean, var) - : std::make_shared(eps, gamma, beta, group_conv, mean, var); + add_goe ? 
std::make_shared(goe_bn, gamma, beta, mean, var, eps) + : std::make_shared(group_conv, gamma, beta, mean, var, eps); if (with_relu) { auto prelu = std::make_shared(bn); @@ -1767,7 +1767,7 @@ TEST(cpu_fusion, conv_batch_norm_folding) auto mean = std::make_shared(element::f32, shape_norm); auto var = std::make_shared(element::f32, shape_norm); auto conv = std::make_shared(input, weights, Strides{1, 1}, Strides{1, 1}); - auto bn = std::make_shared(eps, gamma, beta, conv, mean, var); + auto bn = std::make_shared(conv, gamma, beta, mean, var, eps); auto f = make_shared(NodeVector{bn}, op::ParameterVector{input, weights, gamma, beta, mean, var}); return f; @@ -1829,7 +1829,7 @@ TEST(cpu_fusion, convbias_batch_norm_folding) auto conv = std::make_shared(input, weights, Strides{1, 1}, Strides{1, 1}); auto convbias = conv + std::make_shared(bias, conv->get_shape(), AxisSet{0, 2, 3}); - auto bn = std::make_shared(eps, gamma, beta, convbias, mean, var); + auto bn = std::make_shared(convbias, gamma, beta, mean, var, eps); auto f = make_shared( NodeVector{bn}, op::ParameterVector{input, weights, bias, gamma, beta, mean, var}); return f; diff --git a/test/cpu_fusion.cpp-41c1ba06 b/test/cpu_fusion.cpp-41c1ba06 new file mode 100644 index 00000000000..e377ab0f432 --- /dev/null +++ b/test/cpu_fusion.cpp-41c1ba06 @@ -0,0 +1,3132 @@ +//***************************************************************************** +// Copyright 2017-2018 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +//***************************************************************************** + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "ngraph/autodiff/adjoints.hpp" +#include "ngraph/file_util.hpp" +#include "ngraph/graph_util.hpp" +#include "ngraph/log.hpp" +#include "ngraph/ngraph.hpp" +#include "ngraph/op/batch_norm.hpp" +#include "ngraph/op/concat.hpp" +#include "ngraph/op/get_output_element.hpp" +#include "ngraph/op/max_pool.hpp" +#include "ngraph/op/negative.hpp" +#include "ngraph/op/parameter.hpp" +#include "ngraph/op/relu.hpp" +#include "ngraph/op/sigmoid.hpp" +#include "ngraph/op/sum.hpp" +#include "ngraph/op/tanh.hpp" +#include "ngraph/pass/algebraic_simplification.hpp" +#include "ngraph/pass/core_fusion.hpp" +#include "ngraph/pass/graph_rewrite.hpp" +#include "ngraph/pass/manager.hpp" +#include "ngraph/pass/reshape_elimination.hpp" +#include "ngraph/pass/visualize_tree.hpp" +#include "ngraph/pattern/matcher.hpp" +#include "ngraph/pattern/op/label.hpp" +#include "ngraph/pattern/op/skip.hpp" +#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp" +#include "ngraph/runtime/cpu/cpu_tensor_view.hpp" +#include "ngraph/runtime/cpu/op/batch_dot.hpp" +#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp" +#include "ngraph/runtime/cpu/op/bounded_relu.hpp" +#include "ngraph/runtime/cpu/op/conv_add.hpp" +#include "ngraph/runtime/cpu/op/conv_bias.hpp" +#include "ngraph/runtime/cpu/op/conv_relu.hpp" +#include "ngraph/runtime/cpu/op/convert_layout.hpp" +#include "ngraph/runtime/cpu/op/group_conv.hpp" +#include "ngraph/runtime/cpu/op/group_conv_bias.hpp" +#include "ngraph/runtime/cpu/op/loop_kernel.hpp" +#include "ngraph/runtime/cpu/op/lstm.hpp" +#include "ngraph/runtime/cpu/op/matmul_bias.hpp" +#include "ngraph/runtime/cpu/op/rnn.hpp" +#include "ngraph/runtime/cpu/op/sigmoid_mul.hpp" +#include 
"ngraph/runtime/cpu/pass/cpu_concat_inputs.hpp" +#include "ngraph/runtime/cpu/pass/cpu_fusion.hpp" +#include "ngraph/runtime/cpu/pass/cpu_loop_kernel_fusion.hpp" +#include "ngraph/runtime/cpu/pass/cpu_mat_fusion.hpp" +#include "ngraph/runtime/cpu/pass/cpu_post_layout_optimizations.hpp" +#include "ngraph/runtime/cpu/pass/cpu_rnn_fusion.hpp" +#include "ngraph/runtime/cpu/pass/cpu_workspace_insertion.hpp" +#include "ngraph/serializer.hpp" +#include "ngraph/util.hpp" +#include "nlohmann/json.hpp" +#include "util/all_close.hpp" +#include "util/autodiff/backprop_function.hpp" +#include "util/autodiff/numeric_compare.hpp" +#include "util/matcher.hpp" +#include "util/random.hpp" +#include "util/random.hpp" +#include "util/test_tools.hpp" + +using namespace ngraph; +using namespace std; + +TEST(cpu_fusion, gemm_pattern) +{ + Shape shape_w{2, 4}; + Shape shape_x{4, 1}; + Shape shape_b{1}; + auto A = make_shared(element::f32, shape_w); + auto B = make_shared(element::f32, shape_x); + auto C = make_shared(element::f32, shape_b); + + auto dot = make_shared(A, B); + auto broadcast = make_shared(C, dot->get_shape(), AxisSet{0}); + auto add = dot + broadcast; + + auto W = std::make_shared(A); + auto x = std::make_shared(B); + + auto reshape_pred = [](std::shared_ptr n) { + return static_cast(std::dynamic_pointer_cast(n)); + }; + + auto skip_w = std::make_shared(W, reshape_pred); + auto skip_x = std::make_shared(x, reshape_pred); + + auto pdot = make_shared(skip_w, skip_x); + auto b = std::make_shared(C); + auto pbroadcast = make_shared(b, dot->get_shape(), AxisSet{0}); + auto padd = pdot + pbroadcast; + + TestMatcher n(nullptr); + ASSERT_TRUE(n.match(padd, add)); + ASSERT_EQ(n.get_pattern_map()[W], A); + ASSERT_EQ(n.get_pattern_map()[x], B); + ASSERT_EQ(n.get_pattern_map()[b], C); + + auto reshape_w = make_shared(A, AxisVector{1, 0}, W->get_shape()); + auto reshape_x = make_shared(B, AxisVector{1, 0}, x->get_shape()); + auto re_dot = make_shared(reshape_w, reshape_x); + auto 
re_add = re_dot + broadcast; + ASSERT_TRUE(n.match(padd, re_add)); + ASSERT_EQ(n.get_pattern_map()[W], A); + ASSERT_EQ(n.get_pattern_map()[x], B); + ASSERT_EQ(n.get_pattern_map()[b], C); + + auto cg = make_shared( + W, x, C, W->get_shape(), x->get_shape(), false, false, AxisSet{0}); +} + +TEST(cpu_fusion, gemm_cpu_broadcast_row) +{ + Shape shapeA{3, 2}; + Shape shapeB{2, 3}; + Shape shapeC{2, 2}; + auto A = make_shared(element::f32, shapeA); + auto B = make_shared(element::f32, shapeB); + + auto bias = op::Constant::create(element::f32, Shape{2}, std::vector{2.0f, 3.0f}); + + auto cg = make_shared( + A, B, bias, A->get_shape(), B->get_shape(), true, true, AxisSet{0}); + + auto f = make_shared(cg, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("CPU"); + + shared_ptr a = backend->create_tensor(element::f32, shapeA); + shared_ptr b = backend->create_tensor(element::f32, shapeB); + shared_ptr result = backend->create_tensor(element::f32, shapeC); + + vector dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f}; + vector dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f}; + copy_data(a, dataA); + copy_data(b, dataB); + + backend->call_with_validate(f, {result}, {a, b}); + vector expected{11, 30, 38, 111}; + EXPECT_EQ(read_vector(result), expected); +} + +TEST(cpu_fusion, gemm_cpu_broadcast_column) +{ + Shape shapeA{3, 2}; + Shape shapeB{2, 3}; + Shape shapeC{2, 2}; + auto A = make_shared(element::f32, shapeA); + auto B = make_shared(element::f32, shapeB); + + auto bias = op::Constant::create(element::f32, Shape{2}, std::vector{2.0f, 3.0f}); + + auto cg = make_shared( + A, B, bias, A->get_shape(), B->get_shape(), true, true, AxisSet{1}); + + auto f = make_shared(cg, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("CPU"); + + shared_ptr a = backend->create_tensor(element::f32, shapeA); + shared_ptr b = backend->create_tensor(element::f32, shapeB); + shared_ptr result = backend->create_tensor(element::f32, shapeC); + + vector dataA{1.0f, 4.0f, 
1.0f, 4.0f, 1.0f, 4.0f}; + vector dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f}; + copy_data(a, dataA); + copy_data(b, dataB); + + backend->call_with_validate(f, {result}, {a, b}); + vector expected{11, 29, 39, 111}; + EXPECT_EQ(read_vector(result), expected); +} + +TEST(cpu_fusion, gemm_cpu_broadcast_matrix) +{ + Shape shapeA{3, 2}; + Shape shapeB{2, 3}; + Shape shapeC{2, 2}; + auto A = make_shared(element::f32, shapeA); + auto B = make_shared(element::f32, shapeB); + + auto reshape_w = make_shared(A, AxisVector{1, 0}, Shape{2, 3}); + auto reshape_x = make_shared(B, AxisVector{1, 0}, Shape{3, 2}); + + auto one = op::Constant::create(element::f32, Shape{}, std::vector{1.0f}); + + auto broadcast = make_shared(one, shapeC, AxisSet{0, 1}); + auto cg = make_shared( + A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0, 1}); + + auto f = make_shared(cg, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("CPU"); + + shared_ptr a = backend->create_tensor(element::f32, shapeA); + shared_ptr b = backend->create_tensor(element::f32, shapeB); + shared_ptr result = backend->create_tensor(element::f32, shapeC); + + vector dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f}; + vector dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f}; + copy_data(a, dataA); + copy_data(b, dataB); + + backend->call_with_validate(f, {result}, {a, b}); + vector expected{10, 28, 37, 109}; + ASSERT_TRUE(read_vector(result) == expected); +} + +TEST(cpu_fusion, gemm_cpu_no_bias) +{ + auto shapeA = Shape{3, 2}; + auto shapeB = Shape{2, 3}; + auto shapeC = Shape{2, 2}; + auto A = make_shared(element::f32, shapeA); + auto B = make_shared(element::f32, shapeB); + + auto reshape_w = make_shared(A, AxisVector{1, 0}, Shape{2, 3}); + auto reshape_x = make_shared(B, AxisVector{1, 0}, Shape{3, 2}); + + auto cg = + make_shared(A, B, nullptr, A->get_shape(), B->get_shape(), true, true); + + auto f = make_shared(cg, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("CPU"); + + 
shared_ptr a = backend->create_tensor(element::f32, shapeA); + shared_ptr b = backend->create_tensor(element::f32, shapeB); + shared_ptr result = backend->create_tensor(element::f32, shapeC); + + vector dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f}; + vector dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f}; + copy_data(a, dataA); + copy_data(b, dataB); + + backend->call_with_validate(f, {result}, {a, b}); + vector expected{9, 27, 36, 108}; + ASSERT_TRUE(read_vector(result) == expected); +} + +TEST(cpu_fusion, cpu_fusion_pass_basic) +{ + Shape shape{}; + Shape shape_w{2, 4}; + Shape shape_x{4, 1}; + Shape shape_b{1}; + auto A = make_shared(element::f32, shape_w); + auto B = make_shared(element::f32, shape_x); + auto C = make_shared(element::f32, shape_b); + + auto dot = make_shared(A, B); + auto broadcast = make_shared(C, dot->get_shape(), AxisSet{0}); + auto add = dot + broadcast; + auto graph = make_shared(add); + pass::Manager pass_manager; + pass_manager.register_pass( + runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); + auto func = make_shared(graph, op::ParameterVector{A, B, C}); + pass_manager.run_passes(func); + ASSERT_NE(std::dynamic_pointer_cast(graph->get_argument(0)), nullptr); +} + +TEST(cpu_fusion, commutative_matmul_bias) +{ + Shape shape{}; + Shape shape_w{2, 4}; + Shape shape_x{4, 1}; + Shape shape_b{1}; + auto A = make_shared(element::f32, shape_w); + auto B = make_shared(element::f32, shape_x); + auto C = make_shared(element::f32, shape_b); + + auto dot = make_shared(A, B); + auto broadcast = make_shared(C, dot->get_shape(), AxisSet{0}); + auto add = broadcast + dot; + auto graph = make_shared(add); + pass::Manager pass_manager; + pass_manager.register_pass( + runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); + auto func = make_shared(graph, op::ParameterVector{A, B, C}); + pass_manager.run_passes(func); + ASSERT_NE(std::dynamic_pointer_cast(graph->get_argument(0)), nullptr); +} + +TEST(cpu_fusion, cpu_fusion_pass_matmul_bias) +{ + Shape shape_w{2, 4}; + 
Shape shape_x{4, 1}; + Shape shape_b{1}; + auto W = make_shared(element::f32, shape_w); + auto x = make_shared(element::f32, shape_x); + auto b = make_shared(element::f32, shape_b); + + auto mmb = std::make_shared( + W, x, nullptr, W->get_shape(), x->get_shape(), false, false); + auto broadcast = std::make_shared(b, mmb->get_shape(), AxisSet{0}); + auto add = mmb + broadcast; + + auto graph = make_shared(add); + pass::Manager pass_manager; + pass_manager.register_pass( + runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); + auto func = make_shared(graph, op::ParameterVector{W, x, b}); + pass_manager.run_passes(func); + auto gmm = graph->get_argument(0); + ASSERT_TRUE(std::dynamic_pointer_cast(gmm)); + ASSERT_EQ(gmm->get_argument(2), b); +} + +TEST(cpu_fusion, cpu_fusion_pass_matmul_no_bias) +{ + Shape shape_w{4, 2}; + Shape shape_x{1, 4}; + auto W = make_shared(element::f32, shape_w); + auto x = make_shared(element::f32, shape_x); + + auto reshape_w = std::make_shared(W, AxisVector{1, 0}, Shape{2, 4}); + auto reshape_x = std::make_shared(x, AxisVector{1, 0}, Shape{4, 1}); + auto re_dot = make_shared(reshape_w, reshape_x); + auto graph = make_shared(re_dot); + + pass::Manager pass_manager; + pass_manager.register_pass( + runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); + auto func = make_shared(graph, op::ParameterVector{W, x}); + pass_manager.run_passes(func); + size_t mmb = count_ops_of_type(func); + ASSERT_EQ(mmb, 1); +} + +TEST(cpu_fusion, gemm_mlp) +{ + const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/mnist_mlp_forward.json"); + const string json_string = file_util::read_file_to_string(json_path); + stringstream ss(json_string); + shared_ptr func = ngraph::deserialize(ss); + pass::Manager pass_manager; + pass_manager.register_pass( + runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); + pass_manager.run_passes(func); + auto mmbs = count_ops_of_type(func); + ASSERT_EQ(mmbs, 3); +} + +TEST(cpu_fusion, fuse_fprop_bn) +{ + pass::Manager pass_manager; 
+ pass_manager.register_pass("bn_fprop_before_fusion.png"); + pass_manager.register_pass(); + pass_manager.register_pass( + runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); + pass_manager.register_pass("bn_fprop_after_fusion.png"); + const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/bn_fprop_b2c3h2w2.json"); + const string json_string = file_util::read_file_to_string(json_path); + stringstream ss(json_string); + shared_ptr func = ngraph::deserialize(ss); + pass_manager.run_passes(func); + size_t ccg = count_ops_of_type(func); + ASSERT_EQ(ccg, 1); +} + +TEST(cpu_fusion, zero_padded_reshaped_conv) +{ + auto X = make_shared(element::f32, Shape{1, 2, 2, 1}); + auto F = make_shared(element::f32, Shape{1, 1, 1, 1}); + + auto pad_value = op::Constant::create(element::f32, Shape{}, std::vector{0.0f}); + + auto pad = + make_shared(X, pad_value, Shape{0, 1, 0, 0}, Shape{0, 0, 1, 0}, Shape{0, 0, 0, 0}); + + auto reshape = make_shared(pad, AxisVector{0, 3, 1, 2}, Shape{1, 1, 3, 3}); + + auto conv = make_shared(reshape, + F, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}); + + auto func = make_shared(conv, op::ParameterVector{X, F}); + + ASSERT_EQ(count_ops_of_type(func), 1); + + auto backend = runtime::Backend::create("CPU"); + backend->compile(func); + + ASSERT_EQ(count_ops_of_type(func), 0); +} + +TEST(cpu_fusion, zero_padded_conv) +{ + auto X = make_shared(element::f32, Shape{1, 1, 2, 2}); + auto F = make_shared(element::f32, Shape{1, 1, 1, 1}); + + auto pad_value = op::Constant::create(element::f32, Shape{}, std::vector{0.0f}); + + auto pad = + make_shared(X, pad_value, Shape{0, 0, 0, 1}, Shape{0, 0, 1, 0}, Shape{0, 0, 0, 0}); + + auto conv = make_shared(pad, + F, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}); + + auto func = make_shared(conv, op::ParameterVector{X, F}); + + ASSERT_EQ(count_ops_of_type(func), 1); + + auto backend = 
runtime::Backend::create("CPU"); + backend->compile(func); + + ASSERT_EQ(count_ops_of_type(func), 0); +} + +TEST(cpu_fusion, non_zero_padded_conv) +{ + auto X = make_shared(element::f32, Shape{1, 1, 2, 2}); + auto F = make_shared(element::f32, Shape{1, 1, 1, 1}); + + auto pad_value = op::Constant::create(element::f32, Shape{}, std::vector{1.0f}); + + auto pad = + make_shared(X, pad_value, Shape{0, 0, 0, 1}, Shape{0, 0, 1, 0}, Shape{0, 0, 0, 0}); + + auto conv = make_shared(pad, + F, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}); + + auto func = make_shared(conv, op::ParameterVector{X, F}); + + ASSERT_EQ(count_ops_of_type(func), 1); + + auto backend = runtime::Backend::create("CPU"); + backend->compile(func); + + ASSERT_EQ(count_ops_of_type(func), 1); +} + +TEST(cpu_fusion, zero_padded_conv_backprop_filters) +{ + auto X = make_shared(element::f32, Shape{1, 1, 2, 2}); + auto F = make_shared(element::f32, Shape{1, 1, 2, 2}); + + auto pad_value = op::Constant::create(element::f32, Shape{}, std::vector{0.0f}); + + auto pad = + make_shared(X, pad_value, Shape{0, 0, 0, 1}, Shape{0, 0, 1, 0}, Shape{0, 0, 0, 0}); + + auto conv = make_shared(pad, + Shape{1, 1, 2, 2}, + F, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}); + + auto func = make_shared(conv, op::ParameterVector{X, F}); + + ASSERT_EQ(count_ops_of_type(func), 1); + + auto backend = runtime::Backend::create("CPU"); + backend->compile(func); + + ASSERT_EQ(count_ops_of_type(func), 0); +} + +TEST(cpu_fusion, fuse_conv_bias) +{ + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.register_pass( + runtime::cpu::pass::CPUFusion::DIFFERENTIABLE_FUSIONS); + const string json_path = file_util::path_join(SERIALIZED_ZOO, "conv_bias.json"); + const string json_string = file_util::read_file_to_string(json_path); + stringstream ss(json_string); + shared_ptr func = ngraph::deserialize(ss); + 
pass_manager.run_passes(func); + size_t cb = count_ops_of_type(func); + ASSERT_GT(cb, 0); +} + +struct ConvolutionBiasTestData +{ + size_t n{0}; + size_t c{0}; + size_t filter{0}; + size_t kernel_size{0}; + size_t w{0}; + size_t h{0}; + shared_ptr data_val; + shared_ptr weights_val; + shared_ptr bias_val; + shared_ptr result_val; + shared_ptr delta_val; + shared_ptr d_data_val; + shared_ptr d_weights_val; + shared_ptr d_bias_val; + vector expected_result_val; + vector expected_d_data_val; + vector expected_d_weights_val; + vector expected_d_bias_val; + + Shape data_shape; + Shape weights_shape; + Shape bias_shape; + Shape result_shape; + shared_ptr data; + shared_ptr weights; + shared_ptr bias; + shared_ptr delta; + + void n1c1h3w3(runtime::Backend* backend) + { + n = 1; + c = 1; + filter = 1; + kernel_size = 3; + w = 3; + h = w; + + data_shape = Shape{n, c, h, w}; + data = make_shared(element::f32, data_shape); + weights_shape = Shape{filter, c, kernel_size, kernel_size}; + weights = make_shared(element::f32, weights_shape); + bias_shape = Shape{filter}; + bias = make_shared(element::f32, bias_shape); + result_shape = Shape{n, filter, 1, 1}; + + data_val = backend->create_tensor(element::f32, data_shape); + copy_data(data_val, + vector{-0.67765152f, + 0.10073948f, + 0.57595438f, + -0.3469252f, + -0.22134334f, + -1.80471897f, + -0.80642909f, + 1.22033095f, + 2.23235631f}); + weights_val = backend->create_tensor(element::f32, weights_shape); + copy_data(weights_val, + vector{0.20070229f, + -0.54968649f, + -0.19819015f, + -0.38577855f, + 1.37109005f, + -0.23789984f, + 0.14867957f, + -0.49851316f, + -0.84815776f}); + bias_val = backend->create_tensor(element::f32, bias_shape); + copy_data(bias_val, vector{0.07811152f}); + + result_val = backend->create_tensor(element::f32, result_shape); + copy_data(result_val, vector{0}); + + delta = make_shared(element::f32, result_shape); + delta_val = backend->create_tensor(element::f32, result_shape); + copy_data(delta_val, 
vector{-2.58936238f}); + + d_data_val = backend->create_tensor(element::f32, data_shape); + copy_data(d_data_val, vector{0, 0, 0, 0, 0, 0, 0, 0, 0}); + + d_weights_val = backend->create_tensor(element::f32, weights_shape); + copy_data(d_weights_val, vector{0, 0, 0, 0, 0, 0, 0, 0, 0}); + + d_bias_val = backend->create_tensor(element::f32, bias_shape); + copy_data(d_bias_val, vector{0}); + + expected_result_val = vector{-2.58936238f}; + expected_d_data_val = vector{-0.51969099f, + 1.42333758f, + 0.5131861f, + 0.99892044f, + -3.5502491f, + 0.61600888f, + -0.3849853f, + 1.29083121f, + 2.19618773f}; + expected_d_weights_val = vector{1.7546854f, + -0.26085103f, + -1.49135458f, + 0.89831507f, + 0.57313812f, + 4.67307138f, + 2.08813715f, + -3.15987897f, + -5.7803793f}; + expected_d_bias_val = vector{-2.58936238f}; + } +}; + +TEST(cpu_fusion, conv_bias_fprop_n1c1h3w3) +{ + auto backend = runtime::Backend::create("CPU"); + + ConvolutionBiasTestData conv_test; + conv_test.n1c1h3w3(backend.get()); + + auto convolution = make_shared(conv_test.data, conv_test.weights); + auto convolution_bias = make_shared(convolution, conv_test.bias); + + auto f = make_shared( + convolution_bias, op::ParameterVector{conv_test.data, conv_test.weights, conv_test.bias}); + + backend->call_with_validate( + f, {conv_test.result_val}, {conv_test.data_val, conv_test.weights_val, conv_test.bias_val}); + auto result_vec = read_vector(conv_test.result_val); + + EXPECT_TRUE( + test::all_close(conv_test.expected_result_val, read_vector(conv_test.result_val))); +} + +TEST(cpu_fusion, conv_bias_bprop_n1c1h3w3) +{ + auto backend = runtime::Backend::create("CPU"); + + ConvolutionBiasTestData conv_test; + conv_test.n1c1h3w3(backend.get()); + + auto convolution = make_shared(conv_test.data, conv_test.weights); + auto convolution_bias = make_shared(convolution, conv_test.bias); + + auto f = make_shared( + convolution_bias, op::ParameterVector{conv_test.data, conv_test.weights, conv_test.bias}); + + 
ngraph::autodiff::Adjoints adjoints(NodeVector{convolution_bias}, NodeVector{conv_test.delta}); + + auto d_data = adjoints.backprop_node(conv_test.data); + auto d_weights = adjoints.backprop_node(conv_test.weights); + auto d_bias = adjoints.backprop_node(conv_test.bias); + + auto df = make_shared( + NodeVector{d_data, d_weights, d_bias}, + op::ParameterVector{conv_test.data, conv_test.weights, conv_test.bias, conv_test.delta}); + backend->call_with_validate( + df, + {conv_test.d_data_val, conv_test.d_weights_val, conv_test.d_bias_val}, + {conv_test.data_val, conv_test.weights_val, conv_test.bias_val, conv_test.delta_val}); + + EXPECT_TRUE( + test::all_close(conv_test.expected_d_data_val, read_vector(conv_test.d_data_val))); + EXPECT_TRUE(test::all_close(conv_test.expected_d_weights_val, + read_vector(conv_test.d_weights_val))); + EXPECT_TRUE( + test::all_close(conv_test.expected_d_bias_val, read_vector(conv_test.d_bias_val))); +} + +TEST(cpu_fusion, conv_bias_bprop) +{ + Shape shape{2, 2, 1, 1}; + auto data_batch = std::make_shared(element::f32, shape); + auto filters = std::make_shared(element::f32, shape); + auto delta = std::make_shared(element::f32, shape); + auto bias = make_shared(element::f32, Shape{shape[0]}); + auto pbroadcast = std::make_shared(bias, shape, AxisSet{1, 2, 3}); + auto conv = std::make_shared(data_batch, filters); + auto conv_bias = std::make_shared(conv, pbroadcast); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.register_pass("conv_bias_bprop_fusion"); + auto f = make_shared(conv_bias, op::ParameterVector{data_batch, filters, bias}); + + ngraph::autodiff::Adjoints adjoints(NodeVector{conv_bias}, NodeVector{delta}); + + auto d_data = adjoints.backprop_node(data_batch); + auto d_weights = adjoints.backprop_node(filters); + auto d_bias = adjoints.backprop_node(bias); + + auto df = make_shared(NodeVector{d_data, d_weights, d_bias}, + op::ParameterVector{data_batch, filters, bias, delta}); + + 
pass_manager.run_passes(df); + size_t ccg = count_ops_of_type(df); + ASSERT_EQ(ccg, 1); +} + +TEST(cpu_fusion, batchnorm_fprop_relu_b1c2h2w2) +{ + auto input_shape = Shape{1, 2, 2, 2}; + auto input = make_shared(element::f32, input_shape); + auto mean_shape = Shape{2}; + auto var_shape = Shape{2}; + auto gamma_shape = Shape{2}; + auto gamma = make_shared(element::f32, gamma_shape); + auto beta_shape = Shape{2}; + auto beta = make_shared(element::f32, beta_shape); + double eps = 0.001; + auto shape_r = Shape{1, 2, 2, 2}; + auto bn = make_shared(input, gamma, beta, eps); + + auto output_rt = std::make_shared(bn, 0); + // Note, op::Splice is used to break Relu(BatchNorm) fusion + // otherwise we will be comparing two BatchNormRelus + // Unfortunately, we can't use INTERPRETER for + // verifying the results as it doesn't implement + // BatchNorm op. + auto slice = + std::make_shared(output_rt, Coordinate{0, 0, 0, 0}, Coordinate{1, 2, 2, 2}); + auto output_relu = std::make_shared(slice); + auto mean_rt = std::make_shared(bn, 1); + auto variance_rt = std::make_shared(bn, 2); + + auto bn_relu = make_shared(input, gamma, beta, eps); + auto output_rt_bnr = std::make_shared(bn_relu, 0); + auto mean_rt_bnr = std::make_shared(bn_relu, 1); + auto variance_rt_bnr = std::make_shared(bn_relu, 2); + + auto f = make_shared( + NodeVector{output_relu, mean_rt, variance_rt, output_rt_bnr, mean_rt_bnr, variance_rt_bnr}, + op::ParameterVector{input, gamma, beta}); + auto backend = runtime::Backend::create("CPU"); + + // Create some tensors for input/output + auto input_t = backend->create_tensor(element::f32, Shape{1, 2, 2, 2}); + + copy_data(input_t, + vector{0.54881352f, + 0.71518934f, + 0.60276335f, + 0.54488319f, + 0.42365479f, + 0.64589411f, + 0.4375872f, + 0.89177299f}); + auto gamma_t = backend->create_tensor(element::f32, gamma_shape); + copy_data(gamma_t, vector{1.0f, 1.0f}); + auto beta_t = backend->create_tensor(element::f32, beta_shape); + copy_data(beta_t, vector{0.0f, 
0.0f}); + auto bn_output = backend->create_tensor(element::f32, shape_r); + auto result_mean = backend->create_tensor(element::f32, mean_shape); + auto result_variance = backend->create_tensor(element::f32, var_shape); + + auto bn_output_bnr = backend->create_tensor(element::f32, shape_r); + auto result_mean_bnr = backend->create_tensor(element::f32, mean_shape); + auto result_variance_bnr = backend->create_tensor(element::f32, var_shape); + + backend->call_with_validate(f, + {bn_output, + result_mean, + result_variance, + bn_output_bnr, + result_mean_bnr, + result_variance_bnr}, + {input_t, gamma_t, beta_t}); + + EXPECT_TRUE(test::all_close(read_vector(bn_output), read_vector(bn_output_bnr))); + EXPECT_TRUE( + test::all_close(read_vector(result_mean), read_vector(result_mean_bnr))); + EXPECT_TRUE(test::all_close(read_vector(result_variance), + read_vector(result_variance_bnr))); +} + +TEST(cpu_fusion, fuse_conv_relu) +{ + auto A = std::make_shared(element::f32, Shape{2, 1, 2, 2}); + auto weights = std::make_shared(element::f32, Shape{1, 1, 2, 2}); + auto convolution = std::make_shared(A, weights, Strides{1, 1}, Strides{1, 1}); + auto relu = std::make_shared(convolution); + auto abs_node = + std::make_shared(std::make_shared(std::make_shared(relu))); + auto func = make_shared(abs_node, op::ParameterVector{A, weights}); + + pass::Manager pass_manager; + pass_manager.register_pass( + runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); + pass_manager.run_passes(func); + size_t cb = count_ops_of_type(func); + ASSERT_GT(cb, 0); +} + +TEST(cpu_fusion, conv_relu_n2c1h2w2_2) +{ + Shape shape_a{2, 1, 6, 6}; + Shape shape_weights{1, 1, 2, 2}; + + auto make_int_function = [shape_a, shape_weights]() { + auto A = std::make_shared(element::f32, shape_a); + auto weights = std::make_shared(element::f32, shape_weights); + auto conv = std::make_shared(A, weights, Strides{2, 2}, Strides{1, 1}); + auto relu = std::make_shared(conv); + auto f = make_shared(NodeVector{relu}, 
op::ParameterVector{A, weights}); + return f; + }; + + auto int_f = make_int_function(); + + auto make_cpu_function = [shape_a, shape_weights]() { + auto A = std::make_shared(element::f32, shape_a); + auto weights = std::make_shared(element::f32, shape_weights); + auto conv = std::make_shared(A, weights, Strides{2, 2}, Strides{1, 1}); + auto conv_relu = std::make_shared(conv); + auto f = make_shared(NodeVector{conv_relu}, op::ParameterVector{A, weights}); + return f; + }; + + auto cpu_f = make_cpu_function(); + + vector> args{ + {1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f, 7.25f, 8.25f, -1.25f, + -1.25f, 1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f, -4.25f, 2.25f, + 4.25f, 4.25f, 0.f, 0.f, -1.f, 0.f, 2.f, 2.f, 0.f, 0.f, 0.f, + 0.f, 2.f, 2.f, 1.25f, 2.25f, 5.25f, 6.25f, 1.25f, 1.25f, 3.25f, 4.25f, + -7.25f, 8.25f, 1.25f, -1.25f, -1.25f, 2.25f, 3.25f, 2.25f, -4.25f, -4.25f, -1.25f, + -2.25f, 4.25f, 2.25f, 4.25f, 4.25f, 0.f, 0.f, 1.f, 0.f, -2.f, 2.f, + 0.f, 0.f, 0.f, 0.f, -2.f, -2.f}, + {2., 2., 2., 2.}}; + + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); +} + +TEST(cpu_fusion, conv_bias_relu_n2c1h2w2_2) +{ + Shape shape_a{2, 1, 6, 6}; + Shape shape_weights{1, 1, 2, 2}; + Shape shape_bias{1}; + + auto make_int_function = [shape_a, shape_weights, shape_bias]() { + auto A = std::make_shared(element::f32, shape_a); + auto weights = std::make_shared(element::f32, shape_weights); + auto conv = std::make_shared(A, weights, Strides{2, 2}, Strides{1, 1}); + auto bias = std::make_shared(element::f32, shape_bias); + auto conv_bias = + conv + std::make_shared(bias, conv->get_shape(), AxisSet{0, 2, 3}); + auto relu = std::make_shared(conv_bias); + auto f = make_shared(NodeVector{relu}, op::ParameterVector{A, weights, bias}); + return f; + }; + + auto int_f = make_int_function(); + + auto make_cpu_function = [shape_a, 
shape_weights, shape_bias]() { + auto A = std::make_shared(element::f32, shape_a); + auto weights = std::make_shared(element::f32, shape_weights); + auto bias = std::make_shared(element::f32, shape_bias); + auto conv = std::make_shared(A, weights, Strides{2, 2}, Strides{1, 1}); + auto conv_bias_relu = std::make_shared(conv, bias, true); + auto f = make_shared(NodeVector{conv_bias_relu}, + op::ParameterVector{A, weights, bias}); + return f; + }; + + auto cpu_f = make_cpu_function(); + + vector> args{ + {1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f, 7.25f, 8.25f, -1.25f, + -1.25f, 1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f, -4.25f, 2.25f, + 4.25f, 4.25f, 0.f, 0.f, -1.f, 0.f, 2.f, 2.f, 0.f, 0.f, 0.f, + 0.f, 2.f, 2.f, 1.25f, 2.25f, 5.25f, 6.25f, 1.25f, 1.25f, 3.25f, 4.25f, + -7.25f, 8.25f, 1.25f, -1.25f, -1.25f, 2.25f, 3.25f, 2.25f, -4.25f, -4.25f, -1.25f, + -2.25f, 4.25f, 2.25f, 4.25f, 4.25f, 0.f, 0.f, 1.f, 0.f, -2.f, 2.f, + 0.f, 0.f, 0.f, 0.f, -2.f, -2.f}, + {2., 2., 2., 2.}, + {0.1f}}; + + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); +} + +TEST(cpu_fusion, conv_horizontal_fusion) +{ + Shape shape_a{2, 1, 6, 6}; + Shape shape_weights{1, 1, 2, 2}; + Shape shape_bias{1}; + + auto make_function = [shape_a, shape_weights, shape_bias]() { + auto A = std::make_shared(element::f32, shape_a); + auto weights1 = std::make_shared(element::f32, shape_weights); + auto conv1 = std::make_shared(A, weights1, Strides{2, 2}, Strides{1, 1}); + auto bias1 = std::make_shared(element::f32, shape_bias); + auto conv_bias1 = + conv1 + std::make_shared(bias1, conv1->get_shape(), AxisSet{0, 2, 3}); + auto relu1 = std::make_shared(conv_bias1); + + auto weights2 = std::make_shared(element::f32, shape_weights); + auto conv2 = std::make_shared(A, weights2, Strides{2, 2}, Strides{1, 1}); + auto bias2 = std::make_shared(element::f32, 
shape_bias); + auto conv_bias2 = + conv2 + std::make_shared(bias2, conv2->get_shape(), AxisSet{0, 2, 3}); + auto relu2 = std::make_shared(conv_bias2); + + auto concat = std::make_shared(NodeVector{relu1, relu2}, 1); + auto f = make_shared(NodeVector{concat}, + op::ParameterVector{A, weights1, bias1, weights2, bias2}); + return f; + }; + auto int_f = make_function(); + auto cpu_f = make_function(); + + vector> args{ + {1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f, 7.25f, 8.25f, -1.25f, + -1.25f, 1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f, -4.25f, 2.25f, + 4.25f, 4.25f, 0.f, 0.f, -1.f, 0.f, 2.f, 2.f, 0.f, 0.f, 0.f, + 0.f, 2.f, 2.f, 1.25f, 2.25f, 5.25f, 6.25f, 1.25f, 1.25f, 3.25f, 4.25f, + -7.25f, 8.25f, 1.25f, -1.25f, -1.25f, 2.25f, 3.25f, 2.25f, -4.25f, -4.25f, -1.25f, + -2.25f, 4.25f, 2.25f, 4.25f, 4.25f, 0.f, 0.f, 1.f, 0.f, -2.f, 2.f, + 0.f, 0.f, 0.f, 0.f, -2.f, -2.f}, + {2., 2., 2., 2.}, + {0.1f}, + {3., 3., 3., 3.}, + {0.2f}}; + + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); + + size_t cpu_cb = count_ops_of_type(cpu_f); + ASSERT_EQ(cpu_cb, 1); +} + +// ConvolutionBiasAdd relies on an in-place fused MKLDNN kernel. +// Need to ensure that it is fused only when in-place buffer allocation is feasible +shared_ptr gen_conv_bias_add(bool param_input, bool result_output) +{ + auto A = make_shared(element::f32, Shape{2, 1, 2, 2}); + auto weights = make_shared(element::f32, Shape{1, 1, 1, 1}); + auto bias = make_shared(element::f32, Shape{1}); + auto conv = make_shared(A, weights, Strides{1, 1}, Strides{1, 1}); + auto bias_broadcast = make_shared(bias, conv->get_shape(), AxisSet{0, 2, 3}); + auto convbias = conv + bias_broadcast; + auto B = make_shared(element::f32, Shape{2, 1, 2, 2}); + auto abs_B = make_shared(B); + auto add = + param_input ? 
make_shared(convbias, B) : make_shared(convbias, abs_B); + auto abs = make_shared(add); + + return result_output ? make_shared(add, op::ParameterVector{A, weights, bias, B}) + : make_shared(abs, op::ParameterVector{A, weights, bias, B}); +} + +TEST(cpu_fusion, fuse_conv_bias_add) +{ + auto func_fuse = gen_conv_bias_add(false, false); + auto func_nofuse1 = gen_conv_bias_add(true, false); + auto func_nofuse2 = gen_conv_bias_add(false, true); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(func_fuse); + ASSERT_EQ(count_ops_of_type(func_fuse), 1); + + pass_manager.run_passes(func_nofuse1); + ASSERT_EQ(count_ops_of_type(func_nofuse1), 0); + + pass_manager.run_passes(func_nofuse2); + ASSERT_EQ(count_ops_of_type(func_nofuse2), 1); +} + +TEST(cpu_fusion, conv_bias_add) +{ + auto int_f = gen_conv_bias_add(false, false); + auto cpu_f = gen_conv_bias_add(false, false); + + vector> args{{1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f}, + {-1.25f}, + {2.25f}, + {1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f}}; + + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); +} + +// ConvolutionAdd relies on an in-place fused MKLDNN kernel. +// Need to ensure that it is fused only when in-place buffer allocation is feasible +shared_ptr gen_conv_add(bool param_input, bool result_output) +{ + auto A = make_shared(element::f32, Shape{2, 1, 2, 2}); + auto weights = make_shared(element::f32, Shape{1, 1, 1, 1}); + auto conv = make_shared(A, weights, Strides{1, 1}, Strides{1, 1}); + auto B = make_shared(element::f32, Shape{2, 1, 2, 2}); + auto abs_B = make_shared(B); + auto add = param_input ? make_shared(conv, B) : make_shared(conv, abs_B); + auto abs = make_shared(add); + + return result_output ? 
make_shared(add, op::ParameterVector{A, weights, B}) + : make_shared(abs, op::ParameterVector{A, weights, B}); +} + +TEST(cpu_fusion, fuse_conv_add) +{ + auto func_fuse = gen_conv_add(false, false); + auto func_nofuse1 = gen_conv_add(true, false); + auto func_nofuse2 = gen_conv_add(false, true); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(func_fuse); + ASSERT_EQ(count_ops_of_type(func_fuse), 1); + + pass_manager.run_passes(func_nofuse1); + ASSERT_EQ(count_ops_of_type(func_nofuse1), 0); + + pass_manager.run_passes(func_nofuse2); + ASSERT_EQ(count_ops_of_type(func_nofuse2), 1); +} + +TEST(cpu_fusion, conv_add) +{ + auto int_f = gen_conv_add(false, false); + auto cpu_f = gen_conv_add(false, false); + + vector> args{{1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f}, + {-1.25f}, + {1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f}}; + + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); + + int_f = gen_conv_add(false, true); + cpu_f = gen_conv_add(false, true); + + int_results = execute(int_f, args, "INTERPRETER"); + cpu_results = execute(cpu_f, args, "CPU"); + EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); +} + +shared_ptr gen_groupconv_batchnorm(const bool add_goe, + const bool with_relu, + const Shape shape_in, + const Shape shape_weights, + const Shape shape_out, + const size_t groups) +{ + auto input = make_shared(element::f32, shape_in); + auto weights = make_shared(element::f32, shape_weights); + + unsigned long OC = shape_out.at(1); + Shape shape_bn{OC}; + auto group_conv = make_shared(input, + weights, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}, + groups, + shape_out); + + double eps = 0.001; + auto gamma = std::make_shared(element::f32, shape_bn); + auto beta = std::make_shared(element::f32, 
shape_bn); + auto mean = std::make_shared(element::f32, shape_bn); + auto var = std::make_shared(element::f32, shape_bn); + + auto goe_bn = std::make_shared(group_conv, 0); + + // Adding a goe will stop fusion since the patterns wont expect to see this op + auto bn = + add_goe ? std::make_shared(goe_bn, gamma, beta, mean, var, eps) + : std::make_shared(group_conv, gamma, beta, mean, var, eps); + if (with_relu) + { + auto prelu = std::make_shared(bn); + auto f = make_shared(NodeVector{prelu}, + op::ParameterVector{input, weights, gamma, beta, mean, var}); + return f; + } + else + { + auto f = make_shared(NodeVector{bn}, + op::ParameterVector{input, weights, gamma, beta, mean, var}); + return f; + } +} + +void fuse_groupconv_batchnorm_helper(Shape shape_in, + Shape shape_weights, + Shape shape_r, + size_t groups) +{ + auto func_fuse = + gen_groupconv_batchnorm(false, false, shape_in, shape_weights, shape_r, groups); + auto func_fuse2 = + gen_groupconv_batchnorm(false, true, shape_in, shape_weights, shape_r, groups); + + { + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(func_fuse); + ASSERT_EQ(count_ops_of_type(func_fuse), 1); + } + + { + // test groupconv + batchnorm + relu fusion + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(func_fuse2); + ASSERT_EQ(count_ops_of_type(func_fuse2), 1); + ASSERT_EQ(count_ops_of_type(func_fuse2), 0); + } +} + +void groupconv_batchnorm_test_val_helper( + const bool with_relu, Shape shape_in, Shape shape_weights, Shape shape_r, size_t groups) +{ + shared_ptr fuse_func = + gen_groupconv_batchnorm(false, with_relu, shape_in, shape_weights, shape_r, groups); + shared_ptr nofuse_func = + gen_groupconv_batchnorm(true, with_relu, shape_in, shape_weights, shape_r, groups); + + test::Uniform rng(1.0f, 100.0f); + vector> args; + for (shared_ptr param : fuse_func->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); 
+ args.push_back(tensor_val); + } + + auto fuse_results = execute(fuse_func, args, "CPU"); + auto nofuse_results = execute(nofuse_func, args, "CPU"); + + EXPECT_TRUE(test::all_close(fuse_results.at(0), nofuse_results.at(0))); +} + +TEST(cpu_fusion, fuse_groupconv_batchnorm1) +{ + Shape shape_in{1, 20, 5, 5}; + Shape shape_weights{8, 10, 3, 3}; + Shape shape_r{1, 8, 3, 3}; + fuse_groupconv_batchnorm_helper(shape_in, shape_weights, shape_r, 2); + groupconv_batchnorm_test_val_helper(false, shape_in, shape_weights, shape_r, 2); + groupconv_batchnorm_test_val_helper(true, shape_in, shape_weights, shape_r, 2); +} + +TEST(cpu_fusion, fuse_groupconv_batchnorm2) +{ + Shape shape_in{1, 20, 5, 5}; + Shape shape_weights{5, 4, 3, 3}; + Shape shape_r{1, 5, 3, 3}; + fuse_groupconv_batchnorm_helper(shape_in, shape_weights, shape_r, 5); + groupconv_batchnorm_test_val_helper(false, shape_in, shape_weights, shape_r, 5); + groupconv_batchnorm_test_val_helper(true, shape_in, shape_weights, shape_r, 5); +} + +TEST(cpu_fusion, fuse_groupconv_batchnorm3) +{ + Shape shape_in{1, 20, 5, 5}; + Shape shape_weights{20, 1, 3, 3}; + Shape shape_r{1, 20, 3, 3}; + fuse_groupconv_batchnorm_helper(shape_in, shape_weights, shape_r, 20); + groupconv_batchnorm_test_val_helper(false, shape_in, shape_weights, shape_r, 20); + groupconv_batchnorm_test_val_helper(true, shape_in, shape_weights, shape_r, 20); +} + +TEST(cpu_fusion, fuse_groupconv_batchnorm4) +{ + Shape shape_in{1, 20, 4, 4}; + Shape shape_weights{5, 20, 1, 1}; + Shape shape_r{1, 5, 4, 4}; + fuse_groupconv_batchnorm_helper(shape_in, shape_weights, shape_r, 1); + groupconv_batchnorm_test_val_helper(false, shape_in, shape_weights, shape_r, 1); + groupconv_batchnorm_test_val_helper(true, shape_in, shape_weights, shape_r, 1); +} + +std::vector> rnn_matrix_fusion_eval(const size_t time_steps, + const Shape& data_shape, + const Shape& weights_shape, + const Shape& bias_shape, + const vector& data_val, + const vector& weights_val, + const vector& 
bias_val, + const bool enable_pass) +{ + auto data = make_shared(element::f32, data_shape); + auto weights = make_shared(element::f32, weights_shape); + auto bias = make_shared(element::f32, bias_shape); + + // results from each time step + NodeVector results; + for (size_t t = 0; t < time_steps; ++t) + { + auto data_slice = make_shared( + data, Coordinate{0, t, 0}, Coordinate{data_shape[0], t + 1, data_shape[2]}); + auto data_reshape = make_shared( + data_slice, AxisVector{0, 1, 2}, Shape{data_shape[0], data_shape[2]}); + auto weights_reshape = make_shared( + weights, AxisVector{1, 0}, Shape{weights_shape[1], weights_shape[0]}); + auto dot = make_shared(data_reshape, weights_reshape); + auto bias_broadcast = make_shared(bias, dot->get_shape(), AxisSet{0}); + auto add = make_shared(dot, bias_broadcast); + results.push_back(add); + } + auto func = make_shared(results, op::ParameterVector{data, weights, bias}); + if (enable_pass) + { + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.register_pass( + runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); + pass_manager.run_passes(func); + // check all of our dot/add are converted to a single MatmulBias op. 
+ size_t count = count_ops_of_type(func); + EXPECT_EQ(count, 1); + } + + auto backend = runtime::Backend::create("CPU"); + + shared_ptr data_tensor = + backend->create_tensor(element::f32, data->get_shape()); + shared_ptr weights_tensor = + backend->create_tensor(element::f32, weights->get_shape()); + shared_ptr bias_tensor = + backend->create_tensor(element::f32, bias->get_shape()); + + std::vector> result_tensors; + for (auto r : results) + { + result_tensors.push_back(backend->create_tensor(element::f32, r->get_shape())); + } + + copy_data(data_tensor, data_val); + copy_data(weights_tensor, weights_val); + copy_data(bias_tensor, bias_val); + backend->call_with_validate(func, result_tensors, {data_tensor, weights_tensor, bias_tensor}); + return result_tensors; +} + +TEST(cpu_fusion, rnn_matrix_fusion_eval_pass) +{ + const size_t time_steps = 4; + Shape data_shape{3, time_steps, 5}; + Shape weights_shape{6, data_shape[2]}; + Shape bias_shape{6}; + + test::Uniform rng{0, 1, 0}; + vector data_val(shape_size(data_shape)); + vector weights_val(shape_size(weights_shape)); + vector bias_val(shape_size(bias_shape)); + rng.initialize(data_val); + rng.initialize(weights_val); + rng.initialize(bias_val); + + std::vector> result_expected = rnn_matrix_fusion_eval( + time_steps, data_shape, weights_shape, bias_shape, data_val, weights_val, bias_val, false); + std::vector> result_fused = rnn_matrix_fusion_eval( + time_steps, data_shape, weights_shape, bias_shape, data_val, weights_val, bias_val, true); + for (size_t i = 0; i < result_expected.size(); ++i) + { + EXPECT_TRUE(test::all_close(result_expected[i], result_fused[i])); + } +} + +TEST(cpu_fusion, rnn_fusion_from_json_model) +{ + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.register_pass( + runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); + const string json_path = + file_util::path_join(SERIALIZED_ZOO, "mxnet/rnn-10-step-fusion-test.json"); + const string json_string = 
file_util::read_file_to_string(json_path); + stringstream ss(json_string); + shared_ptr func = ngraph::deserialize(ss); + pass_manager.run_passes(func); + const size_t NUM_STEPS = 10; + auto mmb_predicate = [](std::shared_ptr node) { + auto users = node->get_users(); + return users.size() == NUM_STEPS && + std::all_of(begin(users), end(users), [](std::shared_ptr n) { + return std::dynamic_pointer_cast(n) != nullptr; + }); + }; + + auto mmbs = get_ops_of_type(func); + ASSERT_TRUE(std::any_of(begin(mmbs), end(mmbs), mmb_predicate)); +} + +TEST(cpu_fusion, weight_fusion) +{ + auto param = std::make_shared(element::f32, Shape{64}); + auto reshape_conv = + std::make_shared(param, AxisVector{0}, Shape{16, 4, 1, 1}); + auto data_conv = std::make_shared(element::f32, Shape{16, 4, 7, 7}); + auto tvt = reshape_conv->get_outputs().at(0).get_tensor_ptr().get(); + auto lt_desc = std::make_shared(*tvt); + auto cvt_lt_conv = std::make_shared(reshape_conv, lt_desc); + auto conv = std::make_shared( + data_conv, cvt_lt_conv, Strides{1, 1}, Strides{1, 1}); + + auto reshape_conv_bprop = + std::make_shared(param, AxisVector{0}, Shape{16, 4, 1, 1}); + auto dummy_arg_conv_bprop = std::make_shared(element::f32, Shape{1, 16, 7, 7}); + auto tvt_bprop = reshape_conv_bprop->get_outputs().at(0).get_tensor_ptr().get(); + auto lt_desc_bprop = std::make_shared(*tvt_bprop); + auto cvt_lt_conv_bprop = + std::make_shared(reshape_conv_bprop, lt_desc_bprop); + auto conv_bprop = std::make_shared(Shape{1, 4, 7, 7}, + cvt_lt_conv_bprop, + dummy_arg_conv_bprop, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}); + + auto conv_relu = std::make_shared(conv); + auto conv_bprop_abs = std::make_shared(conv_bprop); + + auto f = make_shared(NodeVector{conv_relu, conv_bprop_abs}, + op::ParameterVector{param, data_conv, dummy_arg_conv_bprop}); + + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.run_passes(f); + + auto new_conv_bprop_data = 
conv_bprop_abs->get_argument(0); + auto new_convert_layout = new_conv_bprop_data->get_argument(0); + + ASSERT_EQ(std::dynamic_pointer_cast( + new_convert_layout->get_argument(0)), + cvt_lt_conv); +} + +TEST(cpu_fusion, max_pool_with_indices) +{ + Shape shape_a{10, 3, 28, 28}; + auto input = std::make_shared(element::f32, shape_a); + Shape window_shape{2, 2}; + auto max_pool = std::make_shared(input, window_shape); + auto C = std::make_shared(element::f32, max_pool->get_shape()); + + ngraph::autodiff::Adjoints adjoints(NodeVector{max_pool}, NodeVector{C}); + + auto dinput = adjoints.backprop_node(input); + + auto df = std::make_shared(NodeVector{dinput}, op::ParameterVector{input, C}); + + auto f = std::make_shared(NodeVector{max_pool}, op::ParameterVector{input}); + + { + pass::Manager pass_manager; + pass_manager.register_pass("max_pool_fprop_before.pdf"); + pass_manager.run_passes(f); + } + + { + NodeVector nv_cwi; + pass::Manager pass_manager; + pass_manager.register_pass("max_pool_bprop_before.pdf"); + pass_manager.register_pass(nv_cwi); + pass_manager.register_pass("max_pool_bprop_after.pdf"); + pass_manager.run_passes(df); + } + + { + pass::Manager pass_manager; + pass_manager.register_pass("max_pool_fprop_after.pdf"); + pass_manager.run_passes(f); + } + + auto maxpool_goe_output = + std::dynamic_pointer_cast(f->get_results().at(0)->get_argument(0)); + ASSERT_TRUE(maxpool_goe_output); + ASSERT_EQ(maxpool_goe_output->get_n(), 0); + auto maxpool_with_indices = df->get_results().at(0)->get_argument(0); + auto maxpool_goe_indices = + std::dynamic_pointer_cast(maxpool_with_indices->get_argument(2)); + ASSERT_TRUE(maxpool_goe_indices); + ASSERT_EQ(maxpool_goe_indices->get_n(), 1); +} + +TEST(cpu_fusion, backwards_maxpool_with_indices_n4_c1_hw4_2x2_max) +{ + Shape shape_a{1, 4, 4, 4}; + Shape maxpool_shape{1, 4, 3, 3}; + auto A = std::make_shared(element::f32, shape_a); + Shape window_shape{2, 2}; + auto window_movement_strides = Strides{1, 1}; + auto maxpool = 
std::make_shared(A, window_shape, window_movement_strides); + auto f = std::make_shared(maxpool, op::ParameterVector{A}); + + auto backend = runtime::Backend::create("CPU"); + shared_ptr ep = backend->create_tensor(element::f32, maxpool_shape); + vector dataEp(shape_size(maxpool_shape), 4); + + shared_ptr input = backend->create_tensor(element::f32, shape_a); + shared_ptr output = backend->create_tensor(element::f32, shape_a); + + vector dataInput{11.f, 31.f, 40.f, 47.f, 13.f, 61.f, 48.f, 59.f, 17.f, 39.f, 64.f, + 62.f, 45.f, 55.f, 36.f, 19.f, 65.f, 33.f, 49.f, 30.f, 56.f, 41.f, + 53.f, 58.f, 22.f, 35.f, 52.f, 50.f, 63.f, 54.f, 12.f, 26.f, 44.f, + 21.f, 69.f, 24.f, 46.f, 25.f, 51.f, 29.f, 72.f, 15.f, 73.f, 10.f, + 16.f, 37.f, 70.f, 32.f, 28.f, 66.f, 57.f, 27.f, 60.f, 42.f, 43.f, + 71.f, 18.f, 38.f, 67.f, 68.f, 14.f, 20.f, 34.f, 23.f}; + + vector expected{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 12.0f, 0.0f, 4.0f, 0.0f, 0.0f, 16.0f, + 0.0f, 0.0f, 4.0f, 0.0f, 0.0f, 4.0f, 0.0f, 0.0f, 0.0f, 4.0f, 0.0f, + 8.0f, 8.0f, 0.0f, 0.0f, 4.0f, 0.0f, 4.0f, 4.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 8.0f, 0.0f, 4.0f, 0.0f, 0.0f, 0.0f, 8.0f, 0.0f, 16.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 8.0f, 0.0f, 0.0f, 4.0f, 0.0f, 0.0f, + 8.0f, 0.0f, 4.0f, 8.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + + copy_data(ep, dataEp); + copy_data(input, dataInput); + + auto C = std::make_shared(element::f32, maxpool_shape); + auto df = autodiff::backprop_function(f); + + { + NodeVector nv_cwi; + pass::Manager pass_manager; + pass_manager.register_pass("max_pool_bprop_before2.pdf"); + pass_manager.register_pass(nv_cwi); + pass_manager.register_pass("max_pool_bprop_after2.pdf"); + pass_manager.run_passes(df); + } + + backend->call_with_validate(df, {output}, {input, ep}); + ASSERT_TRUE(read_vector(output) == expected); +} + +#if 0 +TEST(cpu_fusion, loop_kernel_one_input_one_output) +{ + Shape shapeA{2, 2}; + auto A = make_shared(element::i32, shapeA); + auto neg_a = make_shared(A); + auto lk = make_shared( + NodeVector{neg_a}, 
NodeVector{neg_a}, NodeVector{A}); + auto f = make_shared(NodeVector{lk}, op::ParameterVector{A}); + + auto backend = runtime::Backend::create("CPU"); + shared_ptr a = backend->create_tensor(element::i32, shapeA); + shared_ptr result = backend->create_tensor(element::i32, shapeA); + + vector dataA{1, 4, 1, 4}; + copy_data(a, dataA); + vector expected{-1, -4, -1, -4}; + + backend->call_with_validate(f, {result}, {a}); + + EXPECT_EQ(read_vector(result), expected); +} + +TEST(cpu_fusion, loop_kernel_embedded_graph) +{ + Shape shapeA{2, 2}; + auto A = make_shared(element::i32, shapeA); + auto B = make_shared(element::i32, shapeA); + auto neg_a = make_shared(A); + auto neg_b = make_shared(B); + auto add = neg_a + neg_b; + auto lk = make_shared( + NodeVector{add}, NodeVector{add}, NodeVector{neg_a, neg_b}); + auto f = make_shared(NodeVector{lk}, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("CPU"); + shared_ptr a = backend->create_tensor(element::i32, shapeA); + shared_ptr b = backend->create_tensor(element::i32, shapeA); + shared_ptr result = backend->create_tensor(element::i32, shapeA); + + vector dataA{1, 4, 1, 4}; + copy_data(a, dataA); + vector dataB{1, 2, 3, 4}; + copy_data(b, dataB); + vector expected{-2, -6, -4, -8}; + backend->call_with_validate(f, {result}, {a, b}); + EXPECT_EQ(read_vector(result), expected); +} + +TEST(cpu_fusion, loop_kernel_two_inputs_one_output) +{ + Shape shapeA{2, 2}; + auto A = make_shared(element::i32, shapeA); + auto B = make_shared(element::i32, shapeA); + auto add = A + B; + auto lk = make_shared( + NodeVector{add}, NodeVector{add}, NodeVector{A, B}); + auto f = make_shared(NodeVector{lk}, op::ParameterVector{A, B}); + + auto backend = runtime::Backend::create("CPU"); + shared_ptr a = backend->create_tensor(element::i32, shapeA); + shared_ptr b = backend->create_tensor(element::i32, shapeA); + shared_ptr result = backend->create_tensor(element::i32, shapeA); + + vector dataA{1, 4, 1, 4}; + copy_data(a, 
dataA); + vector dataB{1, 2, 3, 4}; + copy_data(b, dataB); + vector expected{2, 6, 4, 8}; + + backend->call_with_validate(f, {result}, {a, b}); + + EXPECT_EQ(read_vector(result), expected); +} + +TEST(cpu_fusion, loop_kernel_multiple_outputs) +{ + Shape shapeA{2, 2}; + auto A = make_shared(element::i32, shapeA); + auto B = make_shared(element::i32, shapeA); + auto C = make_shared(element::i32, shapeA); + auto D = make_shared(element::i32, shapeA); + + auto neg_a = make_shared(A); + auto neg_b = make_shared(B); + auto add_ab = neg_a + neg_b; + auto add_cd = C + B; + auto add_cd_abs = make_shared(add_cd); + auto add_ab_abs = make_shared(add_ab); + auto add_aab = add_ab_abs + A; + auto add_cdd = add_cd_abs + D; + + auto lk = make_shared( + NodeVector{neg_a, neg_b, add_ab, add_cd, add_cd_abs, add_ab_abs, add_aab, add_cdd}, + NodeVector{add_aab, add_cdd, neg_b}, + NodeVector{A, B, C, D}); + auto add_aab_goe = std::make_shared(lk, 0); + auto add_cdd_goe = std::make_shared(lk, 1); + auto neg_b_goe = std::make_shared(lk, 2); + + auto f = make_shared(NodeVector{add_aab_goe, add_cdd_goe, neg_b_goe}, + op::ParameterVector{A, B, C, D}); + + auto backend = runtime::Backend::create("CPU"); + + shared_ptr a = backend->create_tensor(element::i32, shapeA); + shared_ptr b = backend->create_tensor(element::i32, shapeA); + shared_ptr c = backend->create_tensor(element::i32, shapeA); + shared_ptr d = backend->create_tensor(element::i32, shapeA); + shared_ptr r1 = backend->create_tensor(element::i32, shapeA); + shared_ptr r2 = backend->create_tensor(element::i32, shapeA); + shared_ptr r3 = backend->create_tensor(element::i32, shapeA); + + vector dataA{1, 4, 1, 4}; + vector dataB{3, 3, 3, 9}; + vector dataC{1, 2, 3, 4}; + vector dataD{-2, 2, -1, 1}; + copy_data(a, dataA); + copy_data(b, dataB); + copy_data(c, dataC); + copy_data(d, dataD); + + backend->call_with_validate(f, {r1, r2, r3}, {a, b, c, d}); + + vector expected1{5, 11, 5, 17}; + vector expected2{2, 7, 5, 14}; + vector 
expected3{-3, -3, -3, -9}; + EXPECT_EQ(read_vector(r1), expected1); + EXPECT_EQ(read_vector(r2), expected2); + EXPECT_EQ(read_vector(r3), expected3); +} + +TEST(cpu_fusion, loop_kernel_copy_with_new_args) +{ + Shape shapeA{2, 2}; + auto A = make_shared(element::i32, shapeA); + auto B = make_shared(element::i32, shapeA); + auto C = make_shared(element::i32, shapeA); + auto D = make_shared(element::i32, shapeA); + + auto neg_a = make_shared(A); + auto neg_b = make_shared(B); + auto add_ab = neg_a + neg_b; + auto add_cd = C + B; + auto add_cd_abs = make_shared(add_cd); + auto add_ab_abs = make_shared(add_ab); + auto add_aab = add_ab_abs + A; + auto add_cdd = add_cd_abs + D; + + auto lk = make_shared( + NodeVector{neg_a, neg_b, add_ab, add_cd, add_cd_abs, add_ab_abs, add_aab, add_cdd}, + NodeVector{add_aab, add_cdd, neg_b}, + NodeVector{A, B, C, D}); + auto add_aab_goe = std::make_shared(lk, 0); + auto add_cdd_goe = std::make_shared(lk, 1); + auto neg_b_goe = std::make_shared(lk, 2); + + auto f = make_shared(NodeVector{add_aab_goe, add_cdd_goe, neg_b_goe}, + op::ParameterVector{A, B, C, D}); + + auto copy_f = clone_function(*f); + + auto backend = runtime::Backend::create("CPU"); + + shared_ptr a = backend->create_tensor(element::i32, shapeA); + shared_ptr b = backend->create_tensor(element::i32, shapeA); + shared_ptr c = backend->create_tensor(element::i32, shapeA); + shared_ptr d = backend->create_tensor(element::i32, shapeA); + shared_ptr r1 = backend->create_tensor(element::i32, shapeA); + shared_ptr r2 = backend->create_tensor(element::i32, shapeA); + shared_ptr r3 = backend->create_tensor(element::i32, shapeA); + shared_ptr copy_r1 = backend->create_tensor(element::i32, shapeA); + shared_ptr copy_r2 = backend->create_tensor(element::i32, shapeA); + shared_ptr copy_r3 = backend->create_tensor(element::i32, shapeA); + + vector dataA{1, 4, 1, 4}; + vector dataB{3, 3, 3, 9}; + vector dataC{1, 2, 3, 4}; + vector dataD{-2, 2, -1, 1}; + copy_data(a, dataA); + 
copy_data(b, dataB); + copy_data(c, dataC); + copy_data(d, dataD); + + backend->call_with_validate(f, {r1, r2, r3}, {a, b, c, d}); + backend->call_with_validate(copy_f, {copy_r1, copy_r2, copy_r3}, {a, b, c, d}); + + EXPECT_EQ(read_vector(r1), read_vector(copy_r1)); + EXPECT_EQ(read_vector(r2), read_vector(copy_r2)); + EXPECT_EQ(read_vector(r3), read_vector(copy_r3)); +} + +#endif + +static std::shared_ptr make_forward_function() +{ + Shape shape_a{10, 3, 28, 28}; + auto input = std::make_shared(element::f32, shape_a); + Shape window_shape{2, 2}; + auto max_pool = std::make_shared(input, window_shape); + auto neg = std::make_shared(max_pool); + auto absn = std::make_shared(max_pool); + return std::make_shared(NodeVector{max_pool, neg, absn}, op::ParameterVector{input}); +} + +static std::pair, std::vector>> + make_backward_function(std::shared_ptr f) +{ + // get parameters + std::vector> back_parameters = f->get_parameters(); + + ngraph::NodeVector adjoints; + ngraph::NodeVector outputs; + for (auto Y : f->get_results()) + { + // Get the output + // Create the Adjoint + auto C = std::make_shared(Y->get_element_type(), Y->get_shape()); + outputs.push_back(Y); + adjoints.push_back(C); + } + + ngraph::autodiff::Adjoints adjoint{outputs, adjoints}; + + // Perform autodiff + std::vector> dYdXs(back_parameters.size()); + transform(back_parameters.begin(), + back_parameters.end(), + dYdXs.begin(), + [&adjoint](const std::shared_ptr& X) { return adjoint.backprop_node(X); }); + + // create the backward function + std::vector> param_adjoints; + for (auto n : adjoints) + param_adjoints.push_back(std::dynamic_pointer_cast(n)); + back_parameters.insert(back_parameters.begin(), param_adjoints.begin(), param_adjoints.end()); + + return {std::make_shared(dYdXs, back_parameters), adjoints}; +} + +void optimize_graph(std::shared_ptr& f, std::shared_ptr bf) +{ + // start by removing excess reshapes + NodeVector nv_cwi; + ngraph::pass::Manager pass_manager; + 
pass_manager.register_pass(); + pass_manager.register_pass(); + pass_manager.register_pass(nv_cwi); + pass_manager.register_pass("before.fprop_cache.pdf"); + + pass_manager.run_passes(f); + pass_manager.run_passes(bf); + if (nv_cwi.size() > 0) + { + NodeVector new_outputs; + for (auto r : f->get_results()) + { + new_outputs.push_back(r->get_argument(0)); + } + + new_outputs.insert(new_outputs.end(), nv_cwi.begin(), nv_cwi.end()); + f = std::make_shared(new_outputs, f->get_parameters()); + } + + ngraph::NodeVector dYdXs; + for (size_t i = 0; i < bf->get_output_size(); ++i) + { + dYdXs.push_back(bf->get_output_op(i)->get_argument(0)); + } + + ngraph::NodeVector combined_outputs; + for (auto r : f->get_results()) + { + combined_outputs.push_back(r->get_argument(0)); + } + + combined_outputs.insert(combined_outputs.end(), dYdXs.begin(), dYdXs.end()); + + std::vector> combined_parameters = f->get_parameters(); + std::vector> back_parameters = bf->get_parameters(); + + combined_parameters.insert( + combined_parameters.end(), back_parameters.begin(), back_parameters.end()); + auto combinedf = std::make_shared(combined_outputs, combined_parameters); + // rerun Reshape elimination to help simplify the graph again, run CPUFusion + // this replaces nodes in both f and bf due to shared-ptr - ness + ngraph::pass::Manager pass_manager_comb; + pass_manager_comb.register_pass(); + pass_manager_comb.register_pass(); + pass_manager_comb.run_passes(combinedf); +} + +TEST(cpu_fusion, maxpool_with_indices_in_mxnet) +{ + auto f = make_forward_function(); + auto bfa = make_backward_function(f); + auto maybe_bf = bfa.first; + auto adjoints = bfa.second; + optimize_graph(f, maybe_bf); + auto fprop_cache = ngraph::cache_fprop(f, maybe_bf); + + auto mpwi_bprop = fprop_cache.bprop->get_results().at(0)->get_argument(0); + ASSERT_TRUE(std::dynamic_pointer_cast(mpwi_bprop->get_argument(0))); + ASSERT_TRUE(std::dynamic_pointer_cast(mpwi_bprop->get_argument(2))); +} + +TEST(cpu_fusion, 
conv_batch_norm_folding) +{ + Shape shape_input{1, 8, 3, 3}; + Shape shape_weights{2, 8, 1, 1}; + Shape shape_norm{2}; + + auto make_function = [shape_input, shape_weights, shape_norm]() { + auto input = std::make_shared(element::f32, shape_input); + auto weights = std::make_shared(element::f32, shape_weights); + double eps = 0.001; + auto gamma = std::make_shared(element::f32, shape_norm); + auto beta = std::make_shared(element::f32, shape_norm); + auto mean = std::make_shared(element::f32, shape_norm); + auto var = std::make_shared(element::f32, shape_norm); + auto conv = std::make_shared(input, weights, Strides{1, 1}, Strides{1, 1}); + auto bn = std::make_shared(conv, gamma, beta, mean, var, eps); + auto f = make_shared(NodeVector{bn}, + op::ParameterVector{input, weights, gamma, beta, mean, var}); + return f; + }; + + auto int_f = make_function(); + auto cpu_f = make_function(); + + vector> args{ + {1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f, 7.25f, 8.25f, -1.25f, + -1.25f, 1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f, -4.25f, 2.25f, + 4.25f, 4.25f, 0.f, 0.f, -1.f, 0.f, 2.f, 2.f, 0.f, 0.f, 0.f, + 0.f, 2.f, 2.f, 1.25f, 2.25f, 5.25f, 6.25f, 1.25f, 1.25f, 3.25f, 4.25f, + -7.25f, 8.25f, 1.25f, -1.25f, -1.25f, 2.25f, 3.25f, 2.25f, -4.25f, -4.25f, -1.25f, + -2.25f, 4.25f, 2.25f, 4.25f, 4.25f, 0.f, 0.f, 1.f, 0.f, -2.f, 2.f, + 0.f, 0.f, 0.f, 0.f, -2.f, -2.f}, + {1.25f, + 2.25f, + 5.25f, + 6.25f, + -1.25f, + -1.25f, + 3.25f, + -4.25f, + 7.25f, + 8.25f, + -1.25f, + 0.f, + 0.f, + 0.f, + 0.f, + -2.f}, + {-0.9384f, 0.01875f}, + {11.0f, 1.3f}, + {0.12f, 0.31f}, + {0.01f, 0.11f}, + }; + + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); +} + +TEST(cpu_fusion, convbias_batch_norm_folding) +{ + Shape shape_input{2, 8, 5, 5}; + Shape shape_weights{2, 8, 2, 2}; + Shape shape_norm{2}; + + auto make_function = [shape_input, 
shape_weights, shape_norm]() { + auto input = std::make_shared(element::f32, shape_input); + auto weights = std::make_shared(element::f32, shape_weights); + auto bias = std::make_shared(element::f32, Shape{2}); + double eps = 1.01; + auto gamma = std::make_shared(element::f32, shape_norm); + auto beta = std::make_shared(element::f32, shape_norm); + auto mean = std::make_shared(element::f32, shape_norm); + auto var = std::make_shared(element::f32, shape_norm); + auto conv = std::make_shared(input, weights, Strides{1, 1}, Strides{1, 1}); + auto convbias = + conv + std::make_shared(bias, conv->get_shape(), AxisSet{0, 2, 3}); + auto bn = std::make_shared(convbias, gamma, beta, mean, var, eps); + auto f = make_shared( + NodeVector{bn}, op::ParameterVector{input, weights, bias, gamma, beta, mean, var}); + return f; + }; + + auto int_f = make_function(); + auto cpu_f = make_function(); + + test::Uniform rng(1.0f, 100.0f); + vector> args; + for (shared_ptr param : cpu_f->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + args.push_back(tensor_val); + } + + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); +} + +TEST(cpu_fusion, conv_affine_folding) +{ + Shape shape_input{1, 8, 3, 3}; + Shape shape_weights{2, 8, 1, 1}; + Shape shape_norm{2}; + + auto make_function = [shape_input, shape_weights, shape_norm]() { + auto input = std::make_shared(element::f32, shape_input); + auto weights = std::make_shared(element::f32, shape_weights); + + auto a = std::make_shared(element::f32, shape_norm); + auto b = std::make_shared(element::f32, shape_norm); + auto conv = std::make_shared(input, weights, Strides{1, 1}, Strides{1, 1}); + auto out = std::make_shared( + std::make_shared( + conv, std::make_shared(a, conv->get_shape(), AxisSet{0, 2, 3})), + std::make_shared(b, conv->get_shape(), AxisSet{0, 2, 3})); + 
auto f = make_shared(NodeVector{out}, op::ParameterVector{input, weights, a, b}); + return f; + }; + + auto int_f = make_function(); + auto cpu_f = make_function(); + + vector> args{ + {1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f, 7.25f, 8.25f, -1.25f, + -1.25f, 1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f, -4.25f, 2.25f, + 4.25f, 4.25f, 0.f, 0.f, -1.f, 0.f, 2.f, 2.f, 0.f, 0.f, 0.f, + 0.f, 2.f, 2.f, 1.25f, 2.25f, 5.25f, 6.25f, 1.25f, 1.25f, 3.25f, 4.25f, + -7.25f, 8.25f, 1.25f, -1.25f, -1.25f, 2.25f, 3.25f, 2.25f, -4.25f, -4.25f, -1.25f, + -2.25f, 4.25f, 2.25f, 4.25f, 4.25f, 0.f, 0.f, 1.f, 0.f, -2.f, 2.f, + 0.f, 0.f, 0.f, 0.f, -2.f, -2.f}, + {1.25f, + 2.25f, + 5.25f, + 6.25f, + -1.25f, + -1.25f, + 3.25f, + -4.25f, + 7.25f, + 8.25f, + -1.25f, + 0.f, + 0.f, + 0.f, + 0.f, + -2.f}, + {-0.9384f, 0.01875f}, + {11.0f, 1.3f}, + }; + + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); +} + +TEST(cpu_fusion, convbias_affine_folding) +{ + Shape shape_input{1, 6, 3, 3}; + Shape shape_weights{3, 6, 1, 1}; + Shape shape_norm{3}; + + auto make_function = [shape_input, shape_weights, shape_norm]() { + auto input = std::make_shared(element::f32, shape_input); + auto weights = std::make_shared(element::f32, shape_weights); + auto bias = std::make_shared(element::f32, Shape{3}); + + auto a = std::make_shared(element::f32, shape_norm); + auto b = std::make_shared(element::f32, shape_norm); + auto conv = std::make_shared(input, weights, Strides{1, 1}, Strides{1, 1}); + auto convbias = + conv + std::make_shared(bias, conv->get_shape(), AxisSet{0, 2, 3}); + auto out = std::make_shared( + std::make_shared( + convbias, std::make_shared(a, conv->get_shape(), AxisSet{0, 2, 3})), + std::make_shared(b, conv->get_shape(), AxisSet{0, 2, 3})); + auto f = + make_shared(NodeVector{out}, op::ParameterVector{input, weights, bias, a, b}); + return f; + 
}; + + auto int_f = make_function(); + auto cpu_f = make_function(); + + test::Uniform rng(20.0f, 300.0f); + vector> args; + for (shared_ptr param : cpu_f->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + args.push_back(tensor_val); + } + + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); +} + +TEST(cpu_fusion, group_convolution_fusion) +{ + Shape shape_a{1, 32, 2, 2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{2, 16, 1, 1}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{1, 2, 2, 2}; + + auto a_slice0 = std::make_shared(A, Coordinate{0, 0, 0, 0}, Coordinate{1, 16, 2, 2}); + auto a_slice1 = + std::make_shared(A, Coordinate{0, 16, 0, 0}, Coordinate{1, 32, 2, 2}); + + auto b_slice0 = std::make_shared(B, Coordinate{0, 0, 0, 0}, Coordinate{1, 16, 1, 1}); + auto b_slice1 = std::make_shared(B, Coordinate{1, 0, 0, 0}, Coordinate{2, 16, 1, 1}); + + auto conv_lower = make_shared(a_slice0, + b_slice0, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}); + + auto conv_upper = make_shared(a_slice1, + b_slice1, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}); + + auto concat = make_shared(NodeVector{conv_lower, conv_upper}, 1); + + auto f = make_shared(NodeVector{concat}, op::ParameterVector{A, B}); + pass::Manager pass_manager; + pass_manager.register_pass("before_group.pdf"); + pass_manager.register_pass(); + pass_manager.register_pass("after_group.pdf"); + pass_manager.run_passes(f); + auto gc = + std::dynamic_pointer_cast(f->get_results().at(0)->get_argument(0)); + ASSERT_TRUE(gc); +} + +TEST(cpu_fusion, group_convolution) +{ + auto backend = runtime::Backend::create("CPU"); + test::Uniform rng(2.0f, 10.0f); + + const size_t GROUPS = 2; + Shape shape_a{1, 32, 2, 
2}; + auto A = make_shared(element::f32, shape_a); + Shape shape_b{2, 16, 1, 1}; + auto B = make_shared(element::f32, shape_b); + Shape shape_r{1, 2, 2, 2}; + auto group_conv = make_shared(A, + B, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}, + GROUPS, + shape_r); + + Shape shape_c{1, 16, 2, 2}; + auto C = make_shared(element::f32, shape_c); + Shape shape_d{1, 16, 1, 1}; + auto D = make_shared(element::f32, shape_d); + auto conv_lower = make_shared(C, + D, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}); + + auto E = make_shared(element::f32, shape_c); + auto F = make_shared(element::f32, shape_d); + auto conv_upper = make_shared(E, + F, + Strides{1, 1}, + Strides{1, 1}, + CoordinateDiff{0, 0}, + CoordinateDiff{0, 0}, + Strides{1, 1}); + + auto f = make_shared(NodeVector{group_conv, conv_lower, conv_upper}, + op::ParameterVector{A, B, C, D, E, F}); + + auto a_ = rng.initialize(backend->create_tensor(element::f32, shape_a)); + auto b_ = rng.initialize(backend->create_tensor(element::f32, shape_b)); + + vector rv(shape_size(shape_r), 0); + auto group_result = std::dynamic_pointer_cast( + backend->create_tensor(element::f32, shape_r, rv.data())); + + auto av = read_vector(a_); + auto bv = read_vector(b_); + auto c_ = backend->create_tensor(element::f32, shape_c, av.data()); // lower data + auto d_ = backend->create_tensor(element::f32, shape_d, bv.data()); // upper data + + auto e_ = + backend->create_tensor(element::f32, shape_c, av.data() + av.size() / 2); // lower weights + auto f_ = + backend->create_tensor(element::f32, shape_d, bv.data() + bv.size() / 2); // upper weights + + Shape shape_ur{1, 1, 2, 2}; + // allocate a contigious storage for both lower and upper halves. 
+ vector erv(shape_size(shape_r), 0); + auto lower_result = std::dynamic_pointer_cast( + backend->create_tensor(element::f32, shape_ur, erv.data())); + auto upper_result = std::dynamic_pointer_cast( + backend->create_tensor(element::f32, shape_ur, erv.data() + erv.size() / 2)); + backend->call_with_validate( + f, {group_result, lower_result, upper_result}, {a_, b_, c_, d_, e_, f_}); + ASSERT_EQ(rv, erv); +} + +//TODO(Pruthvi) enable this test after MKLDNN RNN bug is fixed +#if 0 +TEST(cpu_fusion, rnn_fprop_1_lstm_cell) +{ + auto src_layer = make_shared(element::f32, Shape{10, 100}); + auto src_iter = make_shared(element::f32, Shape{20, 100}); + auto weights_layer = make_shared(element::f32, Shape{400, 100}); + auto weights_iter = make_shared(element::f32, Shape{400, 100}); + auto biases = make_shared(element::f32, Shape{400}); + const int number_of_timesteps = 1; + const int number_of_gates_per_cell = 4; + const int src_seq_length = 1; + const int src_layer_feature_size = 100; + const int feature_size = 100; + const int num_rnn_cell_states = 2; + const int rnn_direction = 1; + const int num_of_rnn_fused_layer = 1; + auto rnn_node = make_shared(src_layer, + src_iter, + weights_layer, + weights_iter, + biases, + number_of_timesteps, + number_of_gates_per_cell, + src_seq_length, + src_layer_feature_size, + feature_size, + num_rnn_cell_states, + rnn_direction, + num_of_rnn_fused_layer); + auto rnn_ht_output = make_shared(rnn_node, 0); + auto rnn_ct_output = make_shared(rnn_node, 1); + + auto func = make_shared( + NodeVector{rnn_ht_output, rnn_ct_output}, + op::ParameterVector{src_layer, src_iter, weights_layer, weights_iter, biases}); + auto backend = runtime::Backend::create("CPU"); + + shared_ptr src_layer_t = + backend->create_tensor(element::f32, src_layer->get_shape()); + shared_ptr src_iter_t = + backend->create_tensor(element::f32, src_iter->get_shape()); + shared_ptr weights_layer_t = + backend->create_tensor(element::f32, weights_layer->get_shape()); + 
shared_ptr weights_iter_t = + backend->create_tensor(element::f32, weights_iter->get_shape()); + shared_ptr biases_t = + backend->create_tensor(element::f32, biases->get_shape()); + shared_ptr result_ht = backend->create_tensor(element::f32, {10, 100}); + shared_ptr result_ct = + backend->create_tensor(element::f32, Shape{20, 100}); + + copy_data(src_layer_t, vector(1000, 1)); + copy_data(src_iter_t, vector(2000, 1)); + copy_data(weights_layer_t, vector(400 * 100, 1)); + copy_data(weights_iter_t, vector(400 * 100, 1)); + copy_data(biases_t, vector(400, 1)); + + backend->call_with_validate( + func, + {result_ht, result_ct}, + {src_layer_t, src_iter_t, weights_layer_t, weights_iter_t, biases_t}); + vector expected_ht(10 * 100, 0.964028f); + vector expected_ct; + for (size_t i = 0; i < 20 * 100; i++) + { + if (i < 1000) + { + expected_ct.push_back(0.964028f); + } + else + { + expected_ct.push_back(2.0f); + } + } + + EXPECT_TRUE(test::all_close(expected_ht, read_vector(result_ht))); + EXPECT_TRUE(test::all_close(expected_ct, read_vector(result_ct))); +} +#endif + +TEST(cpu_fusion, fuse_lstm_cells) +{ + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.register_pass(); + const string json_path = + file_util::path_join(SERIALIZED_ZOO, "mxnet/2rnn_layer_3lstm_cell.json"); + const string json_string = file_util::read_file_to_string(json_path); + stringstream ss(json_string); + shared_ptr func = ngraph::deserialize(ss); + pass_manager.run_passes(func); + auto lstm_ops = get_ops_of_type(func); + EXPECT_EQ(lstm_ops.size(), 6); +} + +TEST(cpu_fusion, fuse_2_layer_rnn) +{ + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.register_pass(); + const string json_path = + file_util::path_join(SERIALIZED_ZOO, "mxnet/2rnn_layer_3lstm_cell.json"); + const string json_string = file_util::read_file_to_string(json_path); + stringstream ss(json_string); + shared_ptr func = ngraph::deserialize(ss); + pass_manager.run_passes(func); + size_t 
count = count_ops_of_type(func); + auto rnn_ops = get_ops_of_type(func); + EXPECT_EQ(rnn_ops.size(), count); + for (auto& node : rnn_ops) + { + EXPECT_EQ(node->get_num_timesteps(), node->get_src_sequence_length()); + EXPECT_EQ(node->get_num_cell_states(), node->get_argument(1)->get_arguments().size()); + } +} + +TEST(cpu_fusion, fuse_1_layer_rnn) +{ + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.register_pass(); + const string json_path = + file_util::path_join(SERIALIZED_ZOO, "mxnet/1rnn_layer_3lstm_cell.json"); + const string json_string = file_util::read_file_to_string(json_path); + stringstream ss(json_string); + shared_ptr func = ngraph::deserialize(ss); + pass_manager.run_passes(func); + size_t count = count_ops_of_type(func); + auto rnn_ops = get_ops_of_type(func); + EXPECT_EQ(rnn_ops.size(), 1); + EXPECT_EQ(rnn_ops.size(), count); + for (auto& node : rnn_ops) + { + EXPECT_EQ(node->get_num_timesteps(), node->get_src_sequence_length()); + EXPECT_EQ(node->get_num_cell_states(), node->get_argument(1)->get_arguments().size()); + } +} + +static std::shared_ptr make_function(const std::string& file_name) +{ + const string json_path = file_util::path_join(SERIALIZED_ZOO, file_name); + const string json_string = file_util::read_file_to_string(json_path); + stringstream ss(json_string); + shared_ptr func = ngraph::deserialize(ss); + return func; +} + +TEST(cpu_fusion, rnn_fusion_inter_vs_cpu_1lstm_cell) +{ + const std::string file_name("mxnet/1_lstm_cell_forward.json"); + auto cpu_f = make_function(file_name); + auto int_f = make_function(file_name); + test::Uniform rng(0.0f, 1.0f); + vector> args; + + for (shared_ptr param : int_f->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + args.push_back(tensor_val); + } + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + for (size_t i = 0; i < cpu_results.size(); i++) + { + 
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); + } +} + +TEST(cpu_fusion, rnn_fusion_inter_vs_cpu_1rnn_layer_3lstm_cell) +{ + const std::string file_name("mxnet/1rnn_layer_3lstm_cell.json"); + auto cpu_f = make_function(file_name); + auto int_f = make_function(file_name); + test::Uniform rng(0.0f, 1.0f); + vector> args; + + for (shared_ptr param : int_f->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + args.push_back(tensor_val); + } + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + for (size_t i = 0; i < cpu_results.size(); i++) + { + EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); + } +} + +TEST(cpu_fusion, rnn_fusion_inter_vs_cpu_2rnn_layer_3lstm_cell) +{ + const std::string file_name("mxnet/2rnn_layer_3lstm_cell.json"); + auto cpu_f = make_function(file_name); + auto int_f = make_function(file_name); + test::Uniform rng(0.0f, 1.0f); + vector> args; + + for (shared_ptr param : int_f->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + args.push_back(tensor_val); + } + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + for (size_t i = 0; i < cpu_results.size(); i++) + { + EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); + } +} + +#if 0 + +TEST(cpu_fusion, loop_kernel_fusion_multiple_groups_pruned) +{ + auto make_function = []() -> std::shared_ptr { + Shape shape{}; + auto a = make_shared(element::f32, shape); + auto b = make_shared(element::f32, shape); + auto c = make_shared(element::f32, shape); + auto add_ab = a + b; + auto add_abs = std::make_shared(add_ab); + auto abs_neg = std::make_shared(add_abs); + auto sub_c_neg = c - abs_neg; + + auto d = make_shared(element::f32, shape); + auto d_abs = std::make_shared(d); + auto 
add_d = d_abs + add_ab; + auto neg_d = std::make_shared(add_d); + + auto mul_cd = neg_d * sub_c_neg; + auto f = + std::make_shared(ngraph::NodeVector{mul_cd}, op::ParameterVector{a, b, c, d}); + + return f; + }; + + pass::Manager pass_manager; + pass_manager.register_pass(3); + auto cpu_f = make_function(); + auto int_f = make_function(); + pass_manager.run_passes(cpu_f); + test::Uniform rng(-100.0f, 100.0f); + vector> args; + + size_t lkn = count_ops_of_type(cpu_f); + ASSERT_GT(lkn, 0); + + for (shared_ptr param : cpu_f->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + args.push_back(tensor_val); + } + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + for (size_t i = 0; i < cpu_results.size(); i++) + { + EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); + } +} + +TEST(cpu_fusion, loop_kernel_fusion_bounded_relu) +{ + auto make_function = []() -> std::shared_ptr { + Shape shape{}; + auto a = make_shared(element::f32, shape); + auto relu = make_shared(a); + auto upper_bound = + op::Constant::create(element::f32, shape, std::vector{6.0f}); + auto minn = make_shared(relu, upper_bound); + auto absn = make_shared(minn); + auto negn = std::make_shared(absn); + + auto f = std::make_shared(ngraph::NodeVector{negn}, op::ParameterVector{a}); + + return f; + }; + + pass::Manager pass_manager; + pass_manager.register_pass("before_relu_fusion.pdf"); + pass_manager.register_pass(3); + pass_manager.register_pass("after_relu_fusion.pdf"); + auto cpu_f = make_function(); + auto int_f = make_function(); + pass_manager.run_passes(cpu_f); + test::Uniform rng(-100.0f, 100.0f); + vector> args; + + size_t lkn = count_ops_of_type(cpu_f); + ASSERT_GT(lkn, 0); + + for (shared_ptr param : cpu_f->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + args.push_back(tensor_val); + } + auto 
int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + for (size_t i = 0; i < cpu_results.size(); i++) + { + EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); + } +} + +TEST(cpu_fusion, loop_kernel_fusion_multiple_groups) +{ + auto make_function = []() -> std::shared_ptr { + Shape shape{}; + auto a = make_shared(element::f32, shape); + auto b = make_shared(element::f32, shape); + auto c = make_shared(element::f32, shape); + auto add_ab = a + b; + auto add_abs = std::make_shared(add_ab); + auto abs_neg = std::make_shared(add_abs); + auto sub_c_neg = c - abs_neg; + + auto d = make_shared(element::f32, shape); + auto d_abs = std::make_shared(d); + auto add_d = d_abs + add_ab; + auto neg_d = std::make_shared(add_d); + + auto mul_cd = neg_d * sub_c_neg; + auto f = + std::make_shared(ngraph::NodeVector{mul_cd}, op::ParameterVector{a, b, c, d}); + + return f; + }; + + pass::Manager pass_manager; + pass_manager.register_pass(2); + auto cpu_f = make_function(); + auto int_f = make_function(); + pass_manager.run_passes(cpu_f); + test::Uniform rng(-100.0f, 100.0f); + vector> args; + + size_t lkn = count_ops_of_type(cpu_f); + ASSERT_GT(lkn, 0); + + for (shared_ptr param : cpu_f->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + args.push_back(tensor_val); + } + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + for (size_t i = 0; i < cpu_results.size(); i++) + { + EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); + } +} + +TEST(cpu_fusion, loop_kernel_fusion_one_group) +{ + auto make_function = []() -> std::shared_ptr { + Shape shape{}; + auto a = make_shared(element::f32, shape); + auto b = make_shared(element::f32, shape); + auto c = make_shared(element::f32, shape); + auto add_ab = a + b; + auto add_abs = std::make_shared(add_ab); + auto 
abs_neg = std::make_shared(add_abs); + auto sub_c_neg = c - abs_neg; + auto d = make_shared(element::f32, shape); + auto add_d = sub_c_neg + d; + auto abs_add_d = std::make_shared(add_d); + auto e = make_shared(element::f32, shape); + auto add_e = e + abs_add_d; + auto neg_e = std::make_shared(add_e); + + auto f = std::make_shared(ngraph::NodeVector{neg_e}, + op::ParameterVector{a, b, c, d, e}); + + return f; + + }; + + pass::Manager pass_manager; + pass_manager.register_pass(2); + auto cpu_f = make_function(); + auto int_f = make_function(); + pass_manager.run_passes(cpu_f); + test::Uniform rng(-100.0f, 100.0f); + vector> args; + + size_t lkn = count_ops_of_type(cpu_f); + ASSERT_GT(lkn, 0); + + for (shared_ptr param : cpu_f->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + args.push_back(tensor_val); + } + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + for (size_t i = 0; i < cpu_results.size(); i++) + { + EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); + } +} + +#endif + +TEST(cpu_fusion, sigmoid_multiply_fusion) +{ + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.register_pass(); + const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/3_lstm_cell_forward.json"); + const string json_string = file_util::read_file_to_string(json_path); + stringstream ss(json_string); + shared_ptr func = ngraph::deserialize(ss); + pass_manager.run_passes(func); + size_t ccg = count_ops_of_type(func); + ASSERT_EQ(ccg, 18); +} + +void sigmoid_multiply_fusion_forward_compute(runtime::Backend* backend, + const op::ParameterVector& input_params, + const vector>& input_data, + const vector& input_shapes, + const Shape& result_shape, + shared_ptr input_0_node, + shared_ptr input_1_node, + const vector& expected) +{ + shared_ptr result_tensor = backend->create_tensor(element::f32, result_shape); 
+ + vector> input_tensors; + for (int i = 0; i < input_params.size(); ++i) + { + input_tensors.push_back(backend->create_tensor(element::f32, input_shapes[i])); + copy_data(input_tensors[i], input_data[i]); + } + + auto mul_node = input_0_node * input_1_node; + auto func = make_shared(mul_node, input_params); + backend->call_with_validate(func, {result_tensor}, input_tensors); + EXPECT_TRUE(test::all_close(read_vector(result_tensor), expected)); +} + +TEST(cpu_fusion, sigmoid_multiply_fusion_forward) +{ + auto backend = runtime::Backend::create("CPU"); + + Shape data_shape{1, 1, 2, 2}; + Shape const_shape{1}; + + vector input_0_data{1.f, 2.f, 3.f, 4.f}; + vector input_1_data{1.2f, 2.3f, 3.5f, 4.7f}; + vector const_data{1.2f}; + { + auto input_0_param = make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, data_shape); + auto input_2_param = make_shared(element::f32, data_shape); + auto sigmoid_0 = make_shared(input_0_param); + auto sigmoid_1 = make_shared(input_1_param, input_2_param); + vector expected{1.60833f, 3.78743f, 6.19173f, 8.54352f}; + op::ParameterVector input_params{input_0_param, input_1_param, input_2_param}; + vector> input_data{input_0_data, input_0_data, input_1_data}; + vector input_shapes{data_shape, data_shape, data_shape}; + sigmoid_multiply_fusion_forward_compute(backend.get(), + input_params, + input_data, + input_shapes, + data_shape, + sigmoid_0, + sigmoid_1, + expected); + } + { + auto input_0_param = make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, const_shape); + auto sigmoid_0 = make_shared(input_1_param, data_shape, AxisSet{1, 2, 3}); + auto sigmoid_1 = make_shared(input_0_param); + vector expected{0.87727f, 1.05696f, 1.14309f, 1.17842f}; + op::ParameterVector input_params{input_0_param, input_1_param}; + vector> input_data{input_0_data, const_data}; + vector input_shapes{data_shape, const_shape}; + sigmoid_multiply_fusion_forward_compute(backend.get(), + 
input_params, + input_data, + input_shapes, + data_shape, + sigmoid_0, + sigmoid_1, + expected); + } + { + auto input_0_param = make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, const_shape); + auto sigmoid_0 = make_shared(input_0_param); + auto sigmoid_1 = make_shared(input_1_param, data_shape, AxisSet{1, 2, 3}); + vector expected{0.87727f, 1.05696f, 1.14309f, 1.17842f}; + op::ParameterVector input_params{input_0_param, input_1_param}; + vector> input_data{input_0_data, const_data}; + vector input_shapes{data_shape, const_shape}; + sigmoid_multiply_fusion_forward_compute(backend.get(), + input_params, + input_data, + input_shapes, + data_shape, + sigmoid_0, + sigmoid_1, + expected); + } + { + auto input_0_param = make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, data_shape); + auto sigmoid_0 = make_shared(input_0_param); + auto sigmoid_1 = make_shared(input_1_param); + vector expected{0.561837f, 0.800536f, 0.924652f, 0.973163f}; + op::ParameterVector input_params{input_0_param, input_1_param}; + vector> input_data{input_0_data, input_1_data}; + vector input_shapes{data_shape, data_shape}; + sigmoid_multiply_fusion_forward_compute(backend.get(), + input_params, + input_data, + input_shapes, + data_shape, + sigmoid_0, + sigmoid_1, + expected); + } + { + auto input_0_param = make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, data_shape); + auto sigmoid_0 = make_shared(input_0_param); + auto sigmoid_1 = make_shared(input_1_param); + vector expected{0.60945f, 0.863266f, 0.950838f, 0.981851f}; + op::ParameterVector input_params{input_0_param, input_1_param}; + vector> input_data{input_0_data, input_1_data}; + vector input_shapes{data_shape, data_shape}; + sigmoid_multiply_fusion_forward_compute(backend.get(), + input_params, + input_data, + input_shapes, + data_shape, + sigmoid_0, + sigmoid_1, + expected); + } + { + auto input_0_param = 
make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, data_shape); + auto sigmoid_0 = make_shared(input_0_param); + auto sigmoid_1 = make_shared(input_1_param); + vector expected{0.585304f, 0.876182f, 0.965887f, 0.990322f}; + op::ParameterVector input_params{input_0_param, input_1_param}; + vector> input_data{input_0_data, input_1_data}; + vector input_shapes{data_shape, data_shape}; + sigmoid_multiply_fusion_forward_compute(backend.get(), + input_params, + input_data, + input_shapes, + data_shape, + sigmoid_0, + sigmoid_1, + expected); + } + { + auto input_0_param = make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, data_shape); + auto sigmoid_0 = make_shared(input_0_param); + auto sigmoid_1 = make_shared(input_1_param); + vector expected{0.634907f, 0.94484f, 0.993242f, 0.999164f}; + op::ParameterVector input_params{input_0_param, input_1_param}; + vector> input_data{input_0_data, input_1_data}; + vector input_shapes{data_shape, data_shape}; + sigmoid_multiply_fusion_forward_compute(backend.get(), + input_params, + input_data, + input_shapes, + data_shape, + sigmoid_0, + sigmoid_1, + expected); + } +} + +void sigmoid_multiply_fusion_backward_compute(runtime::Backend* backend, + const op::ParameterVector& input_params, + const vector>& input_data, + const vector& input_shapes, + const vector delta_data, + const Shape& delta_shape, + const Shape& d_input_0_shape, + const Shape& d_input_1_shape, + shared_ptr input_0_node, + shared_ptr input_1_node, + shared_ptr input_0_adjoint, + shared_ptr input_1_adjoint, + const vector& expected_0, + const vector& expected_1) +{ + vector> input_tensors; + for (int i = 0; i < input_params.size(); ++i) + { + input_tensors.push_back(backend->create_tensor(element::f32, input_shapes[i])); + copy_data(input_tensors[i], input_data[i]); + } + + auto delta_param = make_shared(element::f32, delta_shape); + shared_ptr delta_tensor = backend->create_tensor(element::f32, 
delta_shape); + copy_data(delta_tensor, delta_data); + + op::ParameterVector back_params(input_params); + back_params.push_back(delta_param); + input_tensors.push_back(delta_tensor); + + shared_ptr d_input_0_tensor = + backend->create_tensor(element::f32, d_input_0_shape); + shared_ptr d_input_1_tensor = + backend->create_tensor(element::f32, d_input_1_shape); + + using FunctionType = op::SigmoidMultiply::FunctionType; + auto input_0_type = op::SigmoidMultiply::identify_node_type(input_0_node); + auto input_1_type = op::SigmoidMultiply::identify_node_type(input_1_node); + // for Identity functions, we use the node itself, otherwise use its input + // where we will apply the function of input node + auto input_0_alt = + (input_0_type == FunctionType::Identity) ? input_0_node : input_0_node->get_argument(0); + auto input_1_alt = + (input_1_type == FunctionType::Identity) ? input_1_node : input_1_node->get_argument(0); + auto sigmoid_mul = + make_shared(input_0_alt, input_1_alt, input_0_type, input_1_type); + + ngraph::autodiff::Adjoints adjoints(NodeVector{sigmoid_mul}, NodeVector{delta_param}); + auto d_input_0 = adjoints.backprop_node(input_0_adjoint); + auto d_input_1 = adjoints.backprop_node(input_1_adjoint); + auto df = make_shared(NodeVector{d_input_0, d_input_1}, back_params); + backend->call_with_validate(df, {d_input_0_tensor, d_input_1_tensor}, input_tensors); + EXPECT_TRUE(test::all_close(read_vector(d_input_0_tensor), expected_0)); + EXPECT_TRUE(test::all_close(read_vector(d_input_1_tensor), expected_1)); +} + +TEST(cpu_fusion, sigmoid_multiply_fusion_backward) +{ + auto backend = runtime::Backend::create("CPU"); + + Shape data_shape{1, 1, 2, 2}; + Shape const_shape{1}; + + vector input_0_data{1.f, 2.f, 3.f, 4.f}; + vector input_1_data{1.2f, 2.2f, 3.2f, 4.2f}; + vector const_data{1.2f}; + vector delta_data(shape_size(data_shape), 20.0f); + + { + auto input_0_param = make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, 
data_shape); + auto input_2_param = make_shared(element::f32, data_shape); + auto sigmoid_0 = make_shared(input_0_param); + auto sigmoid_1 = make_shared(input_1_param, input_2_param); + vector expected_0{8.65093f, 8.81946f, 5.60191f, 2.89668f}; + vector expected_1{14.6212f, 17.6159f, 19.0515f, 19.6403f}; + op::ParameterVector input_params{input_0_param, input_1_param, input_2_param}; + vector> input_data{input_0_data, input_0_data, input_1_data}; + vector input_shapes{data_shape, data_shape, data_shape}; + sigmoid_multiply_fusion_backward_compute(backend.get(), + input_params, + input_data, + input_shapes, + delta_data, + data_shape, + data_shape, + data_shape, + sigmoid_0, + sigmoid_1, + input_0_param, + sigmoid_1, + expected_0, + expected_1); + } + { + auto input_0_param = make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, const_shape); + auto sigmoid_0 = make_shared(input_1_param, data_shape, AxisSet{1, 2, 3}); + auto sigmoid_1 = make_shared(input_0_param); + vector expected_0{15.2319f, 19.2806f, 19.9011f, 19.9866f}; + vector expected_1{10.0794f, 1.69562f, 0.236785f, 0.0321828f}; + op::ParameterVector input_params{input_0_param, input_1_param}; + vector> input_data{input_0_data, const_data}; + vector input_shapes{data_shape, const_shape}; + sigmoid_multiply_fusion_backward_compute(backend.get(), + input_params, + input_data, + input_shapes, + delta_data, + data_shape, + data_shape, + data_shape, + sigmoid_0, + sigmoid_1, + sigmoid_0, + input_0_param, + expected_0, + expected_1); + } + { + auto input_0_param = make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, const_shape); + auto sigmoid_0 = make_shared(input_0_param); + auto sigmoid_1 = make_shared(input_1_param, data_shape, AxisSet{1, 2, 3}); + vector expected_0{10.0794f, 1.69562f, 0.236785f, 0.0321828f}; + vector expected_1{15.2319f, 19.2806f, 19.9011f, 19.9866f}; + op::ParameterVector input_params{input_0_param, input_1_param}; + 
vector> input_data{input_0_data, const_data}; + vector input_shapes{data_shape, const_shape}; + sigmoid_multiply_fusion_backward_compute(backend.get(), + input_params, + input_data, + input_shapes, + delta_data, + data_shape, + data_shape, + data_shape, + sigmoid_0, + sigmoid_1, + input_0_param, + sigmoid_1, + expected_0, + expected_1); + } + { + auto input_0_param = make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, data_shape); + auto sigmoid_0 = make_shared(input_0_param); + auto sigmoid_1 = make_shared(input_1_param); + vector expected_0{3.02202f, 1.89041f, 0.868146f, 0.348035f}; + vector expected_1{2.60102f, 1.58192f, 0.716941f, 0.285879f}; + op::ParameterVector input_params{input_0_param, input_1_param}; + vector> input_data{input_0_data, input_1_data}; + vector input_shapes{data_shape, data_shape}; + sigmoid_multiply_fusion_backward_compute(backend.get(), + input_params, + input_data, + input_shapes, + delta_data, + data_shape, + data_shape, + data_shape, + sigmoid_0, + sigmoid_1, + input_0_param, + input_1_param, + expected_0, + expected_1); + } + { + auto input_0_param = make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, data_shape); + auto sigmoid_0 = make_shared(input_0_param); + auto sigmoid_1 = make_shared(input_1_param); + vector expected_0{3.27813f, 2.04894f, 0.900536f, 0.353095f}; + vector expected_1{4.45975f, 0.84425f, 0.126201f, 0.0176579f}; + op::ParameterVector input_params{input_0_param, input_1_param}; + vector> input_data{input_0_data, input_1_data}; + vector input_shapes{data_shape, data_shape}; + sigmoid_multiply_fusion_backward_compute(backend.get(), + input_params, + input_data, + input_shapes, + delta_data, + data_shape, + data_shape, + data_shape, + sigmoid_0, + sigmoid_1, + input_0_param, + input_1_param, + expected_0, + expected_1); + } + { + auto input_0_param = make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, data_shape); 
+ auto sigmoid_0 = make_shared(input_0_param); + auto sigmoid_1 = make_shared(input_1_param); + vector expected_0{6.45521f, 1.27207f, 0.189593f, 0.0264228f}; + vector expected_1{2.70967f, 1.7314f, 0.748913f, 0.29092f}; + op::ParameterVector input_params{input_0_param, input_1_param}; + vector> input_data{input_0_data, input_1_data}; + vector input_shapes{data_shape, data_shape}; + sigmoid_multiply_fusion_backward_compute(backend.get(), + input_params, + input_data, + input_shapes, + delta_data, + data_shape, + data_shape, + data_shape, + sigmoid_0, + sigmoid_1, + input_0_param, + input_1_param, + expected_0, + expected_1); + } + { + auto input_0_param = make_shared(element::f32, data_shape); + auto input_1_param = make_shared(element::f32, data_shape); + auto sigmoid_0 = make_shared(input_0_param); + auto sigmoid_1 = make_shared(input_1_param); + vector expected_0{7.00227f, 1.37874f, 0.196666f, 0.026807f}; + vector expected_1{4.64603f, 0.924027f, 0.131829f, 0.0179692f}; + op::ParameterVector input_params{input_0_param, input_1_param}; + vector> input_data{input_0_data, input_1_data}; + vector input_shapes{data_shape, data_shape}; + sigmoid_multiply_fusion_backward_compute(backend.get(), + input_params, + input_data, + input_shapes, + delta_data, + data_shape, + data_shape, + data_shape, + sigmoid_0, + sigmoid_1, + input_0_param, + input_1_param, + expected_0, + expected_1); + } +} + +TEST(cpu_fusion, fuse_batch_dot) +{ + pass::Manager pass_manager; + pass_manager.register_pass(); + const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/batch_dot_3.json"); + const string json_string = file_util::read_file_to_string(json_path); + stringstream ss(json_string); + shared_ptr func = ngraph::deserialize(ss); + pass_manager.run_passes(func); + size_t ccg = count_ops_of_type(func); + ASSERT_EQ(ccg, 1); +} + +TEST(cpu_fusion, fuse_batch_dot_forward) +{ + pass::Manager pass_manager; + pass_manager.register_pass(); + + const std::string 
file_name("mxnet/batch_dot_3.json"); + auto cpu_f = make_function(file_name); + auto int_f = make_function(file_name); + pass_manager.run_passes(cpu_f); + test::Uniform rng(0.0f, 1.0f); + vector> args; + + for (shared_ptr param : int_f->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + args.push_back(tensor_val); + } + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + for (size_t i = 0; i < int_results.size(); i++) + { + EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); + } +} + +TEST(cpu_fusion, fuse_rnn_across_layer) +{ + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.register_pass(); + pass_manager.register_pass(); + pass_manager.register_pass(); + const string json_path = + file_util::path_join(SERIALIZED_ZOO, "mxnet/2rnn_layer_1timestep.json"); + const string json_string = file_util::read_file_to_string(json_path); + stringstream ss(json_string); + shared_ptr func = ngraph::deserialize(ss); + pass_manager.run_passes(func); + size_t ref_rnn_count = 1; + auto rnn_count = count_ops_of_type(func); + EXPECT_EQ(ref_rnn_count, rnn_count); +} + +TEST(cpu_fusion, fuse_rnn_across_2layer_1timestep) +{ + const std::string file_name("mxnet/2rnn_layer_1timestep.json"); + auto cpu_f = make_function(file_name); + auto int_f = make_function(file_name); + test::Uniform rng(0.0f, 1.0f); + vector> args; + + for (shared_ptr param : int_f->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + args.push_back(tensor_val); + } + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + + // TODO (pruthvi): Enable this after fixing failing + // mxnet rnn unit tests + // EXPECT_EQ(1, count_ops_of_type(cpu_f)); + for (size_t i = 0; i < cpu_results.size(); i++) + { + 
EXPECT_TRUE(test::all_close(cpu_results.at(1), int_results.at(1), 1.0e-4f, 1.0e-4f)); + } +} + +static void check_bounded_relu(Shape param_shape, float constant_val) +{ + auto make_function = [](Shape input_shape, float alpha_val) { + auto relu_input = std::make_shared(element::f32, input_shape); + auto relu = std::make_shared(relu_input); + auto alpha = op::Constant::create( + element::f32, input_shape, std::vector(1.0f, alpha_val)); + auto min = std::make_shared(relu, alpha); + auto f = make_shared(NodeVector{min}, op::ParameterVector{relu_input}); + return f; + }; + + auto cpu_f = make_function(param_shape, constant_val); + auto int_f = make_function(param_shape, constant_val); + test::Uniform rng(-10.0f, 10.0f); + vector> args; + + for (shared_ptr param : int_f->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + args.push_back(tensor_val); + } + auto int_results = execute(int_f, args, "INTERPRETER"); + auto cpu_results = execute(cpu_f, args, "CPU"); + + EXPECT_EQ(1, count_ops_of_type(cpu_f)); + EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0), 1.0e-4f, 1.0e-4f)); +} + +TEST(cpu_fusion, fuse_bounded_relu_inter_vs_cpu) +{ + check_bounded_relu(Shape{4, 3, 2, 2}, 6.0f); + check_bounded_relu(Shape{4, 3}, 4.0f); + check_bounded_relu(Shape{4, 3, 2}, 2.0f); +} + +TEST(cpu_fusion, dot_batch_forward) +{ + const Shape shape_a{2, 3, 2}; + const Shape shape_b{2, 3}; + + auto generate_func = [&shape_a, &shape_b]() -> shared_ptr { + auto a = make_shared(element::f32, shape_a); + auto b = make_shared(element::f32, shape_b); + auto dot = make_shared(a, b); + return make_shared(dot, op::ParameterVector{a, b}); + }; + shared_ptr cpu_func = generate_func(); + shared_ptr int_func = generate_func(); + + test::Uniform rng(0.0f, 1.0f); + vector> args; + for (shared_ptr param : int_func->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + 
args.push_back(tensor_val); + } + + auto int_results = execute(int_func, args, "INTERPRETER"); + auto cpu_results = execute(cpu_func, args, "CPU"); + for (size_t i = 0; i < cpu_results.size(); i++) + { + EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); + } +} +static std::shared_ptr + create_rnn_input_linear_transformation_function(size_t num_timesteps, bool data_is_4d = false) +{ + auto W = std::make_shared(element::f32, Shape{400, 50}); + auto bias = std::make_shared(element::f32, Shape{400}); + op::ParameterVector params{W, bias}; + auto create_graph = [&]() -> std::shared_ptr { + + auto data_param = (data_is_4d) + ? std::make_shared(element::f32, Shape{2, 5, 1, 50}) + : std::make_shared(element::f32, Shape{10, 1, 50}); + params.push_back(data_param); + auto reshape_axis_order = data_is_4d ? AxisVector{0, 1, 2, 3} : AxisVector{0, 1, 2}; + auto data_param_reshape = + std::make_shared(data_param, reshape_axis_order, Shape{10, 50}); + auto W_reshape = std::make_shared(W, AxisVector{1, 0}, Shape{50, 400}); + auto dot = std::make_shared(data_param_reshape, W_reshape); + auto bias_broadcast = make_shared(bias, dot->get_shape(), AxisSet{0}); + auto add_bias = std::make_shared(dot, bias_broadcast); + return add_bias; + + }; + + NodeVector graph_nodes; + for (size_t i = 0; i < num_timesteps; i++) + { + graph_nodes.push_back(create_graph()); + } + auto concat = std::make_shared(graph_nodes, 0); + return make_shared(NodeVector{concat}, params); +} + +TEST(cpu_fusion, fuse_rnn_input_across_time_steps) +{ + auto func = create_rnn_input_linear_transformation_function(10); + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.register_pass(); + pass_manager.run_passes(func); + size_t ref_matmulbias_count = 1; + auto matmulbias_count = count_ops_of_type(func); + EXPECT_EQ(ref_matmulbias_count, matmulbias_count); +} + +TEST(cpu_fusion, fuse_rnn_input_across_time_steps_4d_data) +{ + auto func = 
create_rnn_input_linear_transformation_function(10, true); + pass::Manager pass_manager; + pass_manager.register_pass(); + pass_manager.register_pass(); + pass_manager.run_passes(func); + size_t ref_matmulbias_count = 10; // no CPURnnMatFusion transformations + auto matmulbias_count = count_ops_of_type(func); + EXPECT_EQ(ref_matmulbias_count, matmulbias_count); +} + +TEST(cpu_fusion, rnn_input_fusion_inter_vs_cpu) +{ + shared_ptr cpu_func = create_rnn_input_linear_transformation_function(10); + shared_ptr int_func = create_rnn_input_linear_transformation_function(10); + + test::Uniform rng(-10.0f, 10.0f); + vector> args; + for (shared_ptr param : int_func->get_parameters()) + { + vector tensor_val(shape_size(param->get_shape())); + rng.initialize(tensor_val); + args.push_back(tensor_val); + } + + auto int_results = execute(int_func, args, "INTERPRETER"); + auto cpu_results = execute(cpu_func, args, "CPU"); + for (size_t i = 0; i < cpu_results.size(); i++) + { + EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); + } +} diff --git a/test/type_prop.cpp b/test/type_prop.cpp index e2dcf156109..282ff1622f6 100644 --- a/test/type_prop.cpp +++ b/test/type_prop.cpp @@ -208,7 +208,7 @@ TEST(type_prop, batchnorm_training_rank_less_than_2) auto dummy = make_shared(element::f32, Shape{1}); try { - auto bc = make_shared(0.001, dummy, dummy, dummy); + auto bc = make_shared(dummy, dummy, dummy, 0.001); FAIL() << "BatchNorm c-tor should throw for tensors whose rank is less than 2"; } catch (const NodeValidationError& error) @@ -229,7 +229,7 @@ TEST(type_prop, batchnorm_training_zero_channel_check) auto beta = make_shared(element::f32, Shape{0}); try { - auto bc = make_shared(0.001, gamma, beta, data_batch); + auto bc = make_shared(data_batch, gamma, beta, 0.001); FAIL() << "BatchNorm c-tor should throw for tensors w/ zero-dimension channels"; } catch (const NodeValidationError& error) @@ -250,7 +250,7 @@ TEST(type_prop, 
batchnorm_training_et_check) try { - auto bc = make_shared(0.001, gamma, beta, data_batch); + auto bc = make_shared(data_batch, gamma, beta, 0.001); FAIL() << "BatchNorm c-tor should throw for different element types"; } catch (const NodeValidationError& error) @@ -271,7 +271,7 @@ TEST(type_prop, batchnorm_training_shape_check) try { - auto bc = make_shared(0.001, gamma, beta, data_batch); + auto bc = make_shared(data_batch, gamma, beta, 0.001); FAIL() << "BatchNorm c-tor should throw if gamma and beta shapes don't match"; } catch (const NodeValidationError& error) @@ -296,7 +296,7 @@ TEST(type_prop, batchnorm_training_backprop_et_check) try { auto bc = make_shared( - 0.001, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, 0.001); FAIL() << "Deduced type should disagree with c-tor arguments"; } catch (const NodeValidationError& error) @@ -321,7 +321,7 @@ TEST(type_prop, batchnorm_training_backprop_shape_check) try { auto bc = make_shared( - 0.001, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, 0.001); FAIL() << "Deduced type should disagree with c-tor arguments"; } catch (const NodeValidationError& error) @@ -345,7 +345,7 @@ TEST(type_prop, batchnorm_training_backprop_delta_check) try { auto bc = make_shared( - 0.001, dummy, dummy, param, dummy, dummy, delta); + param, dummy, dummy, dummy, dummy, delta, 0.001); FAIL() << "Deduced type should disagree with c-tor arguments"; } catch (const NodeValidationError& error) @@ -379,7 +379,7 @@ TEST(type_prop, batchnorm_inference_partial_all_rank_dynamic) auto mean = make_shared(mean_et, mean_shape); auto variance = make_shared(variance_et, variance_shape); - auto bn = make_shared(epsilon, gamma, beta, data_batch, mean, variance); + auto bn = make_shared(data_batch, gamma, beta, mean, variance, epsilon); ASSERT_EQ(bn->get_output_size(), 1); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -407,7 +407,7 @@ 
TEST(type_prop, batchnorm_inference_partial_input_rank_static_dynamic_ok) auto mean = make_shared(mean_et, mean_shape); auto variance = make_shared(variance_et, variance_shape); - auto bn = make_shared(epsilon, gamma, beta, data_batch, mean, variance); + auto bn = make_shared(data_batch, gamma, beta, mean, variance, epsilon); ASSERT_EQ(bn->get_output_size(), 1); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -439,7 +439,7 @@ TEST(type_prop, batchnorm_inference_partial_input_rank_static_dynamic_zero_chann try { auto bn = - make_shared(epsilon, gamma, beta, data_batch, mean, variance); + make_shared(data_batch, gamma, beta, mean, variance, epsilon); FAIL() << "Zero channel count not detected"; } catch (const NodeValidationError& error) @@ -472,7 +472,7 @@ TEST(type_prop, batchnorm_inference_partial_input_rank_dynamic_some_rank_static_ auto mean = make_shared(mean_et, mean_shape); auto variance = make_shared(variance_et, variance_shape); - auto bn = make_shared(epsilon, gamma, beta, data_batch, mean, variance); + auto bn = make_shared(data_batch, gamma, beta, mean, variance, epsilon); ASSERT_EQ(bn->get_output_size(), 1); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -502,7 +502,7 @@ TEST(type_prop, batchnorm_inference_partial_input_rank_dynamic_some_rank_static_ try { auto bn = - make_shared(epsilon, gamma, beta, data_batch, mean, variance); + make_shared(data_batch, gamma, beta, mean, variance, epsilon); FAIL() << "Wrong gamma/beta/mean/variance shape not detected"; } catch (const NodeValidationError& error) @@ -541,7 +541,7 @@ TEST(type_prop, try { auto bn = - make_shared(epsilon, gamma, beta, data_batch, mean, variance); + make_shared(data_batch, gamma, beta, mean, variance, epsilon); FAIL() << "Inconsistent gamma/beta/mean/variance shape not detected"; } catch (const NodeValidationError& error) @@ -579,7 +579,7 @@ TEST(type_prop, try { auto bn = - make_shared(epsilon, gamma, beta, data_batch, mean, variance); + make_shared(data_batch, 
gamma, beta, mean, variance, epsilon); FAIL() << "Inconsistent gamma/beta/mean/variance channel count not detected"; } catch (const NodeValidationError& error) @@ -613,7 +613,7 @@ TEST(type_prop, batchnorm_inference_partial_input_rank_static_dynamic_some_stati auto mean = make_shared(mean_et, mean_shape); auto variance = make_shared(variance_et, variance_shape); - auto bn = make_shared(epsilon, gamma, beta, data_batch, mean, variance); + auto bn = make_shared(data_batch, gamma, beta, mean, variance, epsilon); ASSERT_EQ(bn->get_output_size(), 1); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -645,7 +645,7 @@ TEST(type_prop, try { auto bn = - make_shared(epsilon, gamma, beta, data_batch, mean, variance); + make_shared(data_batch, gamma, beta, mean, variance, epsilon); FAIL() << "Inconsistent input/gamma/beta/mean/variance channel count not detected"; } catch (const NodeValidationError& error) @@ -674,7 +674,7 @@ TEST(type_prop, batchnorm_training_partial_all_rank_dynamic) auto gamma = make_shared(gamma_et, gamma_shape); auto beta = make_shared(beta_et, beta_shape); - auto bn = make_shared(epsilon, gamma, beta, data_batch); + auto bn = make_shared(data_batch, gamma, beta, epsilon); ASSERT_EQ(bn->get_output_size(), 3); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -700,7 +700,7 @@ TEST(type_prop, batchnorm_training_partial_input_rank_static_dynamic_batch_size_ auto gamma = make_shared(gamma_et, gamma_shape); auto beta = make_shared(beta_et, beta_shape); - auto bn = make_shared(epsilon, gamma, beta, data_batch); + auto bn = make_shared(data_batch, gamma, beta, epsilon); ASSERT_EQ(bn->get_output_size(), 3); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -727,7 +727,7 @@ TEST(type_prop, batchnorm_training_partial_input_rank_static_dynamic_channel_cou auto gamma = make_shared(gamma_et, gamma_shape); auto beta = make_shared(beta_et, beta_shape); - auto bn = make_shared(epsilon, gamma, beta, data_batch); + auto bn = 
make_shared(data_batch, gamma, beta, epsilon); ASSERT_EQ(bn->get_output_size(), 3); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -755,7 +755,7 @@ TEST(type_prop, batchnorm_training_partial_input_rank_static_dynamic_zero_channe auto beta = make_shared(beta_et, beta_shape); try { - auto bn = make_shared(epsilon, gamma, beta, data_batch); + auto bn = make_shared(data_batch, gamma, beta, epsilon); FAIL() << "Zero channel count not detected"; } catch (const NodeValidationError& error) @@ -782,7 +782,7 @@ TEST(type_prop, batchnorm_training_partial_input_rank_dynamic_some_rank_static_d auto gamma = make_shared(gamma_et, gamma_shape); auto beta = make_shared(beta_et, beta_shape); - auto bn = make_shared(epsilon, gamma, beta, data_batch); + auto bn = make_shared(data_batch, gamma, beta, epsilon); ASSERT_EQ(bn->get_output_size(), 3); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -809,7 +809,7 @@ TEST(type_prop, batchnorm_training_partial_input_rank_dynamic_some_rank_static_d try { - auto bn = make_shared(epsilon, gamma, beta, data_batch); + auto bn = make_shared(data_batch, gamma, beta, epsilon); FAIL() << "Wrong gamma/beta shape not detected"; } catch (const NodeValidationError& error) @@ -840,7 +840,7 @@ TEST(type_prop, try { - auto bn = make_shared(epsilon, gamma, beta, data_batch); + auto bn = make_shared(data_batch, gamma, beta, epsilon); FAIL() << "Inconsistent gamma/beta shape not detected"; } catch (const NodeValidationError& error) @@ -870,7 +870,7 @@ TEST(type_prop, try { - auto bn = make_shared(epsilon, gamma, beta, data_batch); + auto bn = make_shared(data_batch, gamma, beta, epsilon); FAIL() << "Inconsistent gamma/beta channel count not detected"; } catch (const NodeValidationError& error) @@ -897,7 +897,7 @@ TEST(type_prop, batchnorm_training_partial_input_rank_static_dynamic_some_static auto gamma = make_shared(gamma_et, gamma_shape); auto beta = make_shared(beta_et, beta_shape); - auto bn = make_shared(epsilon, gamma, beta, 
data_batch); + auto bn = make_shared(data_batch, gamma, beta, epsilon); ASSERT_EQ(bn->get_output_size(), 3); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -926,7 +926,7 @@ TEST(type_prop, try { - auto bn = make_shared(epsilon, gamma, beta, data_batch); + auto bn = make_shared(data_batch, gamma, beta, epsilon); FAIL() << "Inconsistent input/gamma/beta channel count not detected"; } catch (const NodeValidationError& error) @@ -970,7 +970,7 @@ TEST(type_prop, batchnorm_training_backprop_partial_all_rank_dynamic) auto delta = make_shared(delta_et, delta_shape); auto bn = make_shared( - epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); ASSERT_EQ(bn->get_output_size(), 3); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -1006,7 +1006,7 @@ TEST(type_prop, batchnorm_training_backprop_partial_input_rank_static_dynamic_ok auto delta = make_shared(delta_et, delta_shape); auto bn = make_shared( - epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); ASSERT_EQ(bn->get_output_size(), 3); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -1045,7 +1045,7 @@ TEST(type_prop, batchnorm_training_backprop_partial_input_rank_static_dynamic_ze try { auto bn = make_shared( - epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); FAIL() << "Zero channel count not detected"; } catch (const NodeValidationError& error) @@ -1082,7 +1082,7 @@ TEST(type_prop, batchnorm_training_backprop_partial_delta_rank_static_dynamic_ok auto delta = make_shared(delta_et, delta_shape); auto bn = make_shared( - epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); ASSERT_EQ(bn->get_output_size(), 3); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -1118,7 +1118,7 @@ TEST(type_prop, 
batchnorm_training_backprop_partial_delta_rank_static_dynamic_ch auto delta = make_shared(delta_et, delta_shape); auto bn = make_shared( - epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); ASSERT_EQ(bn->get_output_size(), 3); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -1156,7 +1156,7 @@ TEST(type_prop, batchnorm_training_backprop_partial_delta_rank_static_dynamic_ze try { auto bn = make_shared( - epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); FAIL() << "Zero channel count not detected"; } catch (const NodeValidationError& error) @@ -1194,7 +1194,7 @@ TEST(type_prop, auto delta = make_shared(delta_et, delta_shape); auto bn = make_shared( - epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); ASSERT_EQ(bn->get_output_size(), 3); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -1233,7 +1233,7 @@ TEST( try { auto bn = make_shared( - epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); FAIL() << "Wrong gamma/beta/mean/variance shape not detected"; } catch (const NodeValidationError& error) @@ -1276,7 +1276,7 @@ TEST( try { auto bn = make_shared( - epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); FAIL() << "Wrong gamma/beta/mean/variance shape not detected"; } catch (const NodeValidationError& error) @@ -1318,7 +1318,7 @@ TEST( try { auto bn = make_shared( - epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); FAIL() << "nconsistent gamma/beta/mean/variance channel count not detected"; } catch (const NodeValidationError& error) @@ -1357,7 +1357,7 @@ TEST(type_prop, auto delta = make_shared(delta_et, delta_shape); auto bn = make_shared( - 
epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); ASSERT_EQ(bn->get_output_size(), 3); ASSERT_EQ(bn->get_output_element_type(0), data_batch_et); @@ -1396,7 +1396,7 @@ TEST( try { auto bn = make_shared( - epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); FAIL() << "Inconsistent delta/gamma/beta/mean/variance channel count not detected"; } catch (const NodeValidationError& error) @@ -1439,7 +1439,7 @@ TEST( try { auto bn = make_shared( - epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); FAIL() << "Inconsistent input/delta batch size not detected"; } catch (const NodeValidationError& error) @@ -1483,7 +1483,7 @@ TEST( try { auto bn = make_shared( - epsilon, gamma, beta, data_batch, mean, variance, delta); + data_batch, gamma, beta, mean, variance, delta, epsilon); FAIL() << "Inconsistent input/delta spatial dimensions not detected"; } catch (const NodeValidationError& error) From 0c7ff9c858b25dedff2f215ce20d535b3cff1cc9 Mon Sep 17 00:00:00 2001 From: Adam Procter Date: Mon, 12 Nov 2018 16:02:06 -0800 Subject: [PATCH 02/10] Update doc/sphinx/source/ops/batch_norm_training_backprop.rst Co-Authored-By: diyessi --- doc/sphinx/source/ops/batch_norm_training_backprop.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/sphinx/source/ops/batch_norm_training_backprop.rst b/doc/sphinx/source/ops/batch_norm_training_backprop.rst index 68004bbf092..f759cc1fbf1 100644 --- a/doc/sphinx/source/ops/batch_norm_training_backprop.rst +++ b/doc/sphinx/source/ops/batch_norm_training_backprop.rst @@ -30,7 +30,7 @@ Inputs +----------------------+-------------------------+------------------------------+ | ``variance`` | same as ``input`` | :math:`(C)` | +----------------------+-------------------------+------------------------------+ -| ``normalized_delta`` | same as 
``input`` | :math:`input` | +| ``normalized_delta`` | same as ``input`` | same as ``input`` | +----------------------+-------------------------+------------------------------+ From 9ac7c3d6825bd846541a6eac7889964d0ad178f6 Mon Sep 17 00:00:00 2001 From: Scott Cyphers Date: Mon, 12 Nov 2018 16:04:35 -0800 Subject: [PATCH 03/10] Remove unwanted files --- test/backend_test.in.cpp-9bfce850 | 5571 ----------------------------- test/cpu_fusion.cpp-41c1ba06 | 3132 ---------------- 2 files changed, 8703 deletions(-) delete mode 100644 test/backend_test.in.cpp-9bfce850 delete mode 100644 test/cpu_fusion.cpp-41c1ba06 diff --git a/test/backend_test.in.cpp-9bfce850 b/test/backend_test.in.cpp-9bfce850 deleted file mode 100644 index 1b58addb5bd..00000000000 --- a/test/backend_test.in.cpp-9bfce850 +++ /dev/null @@ -1,5571 +0,0 @@ -//***************************************************************************** -// Copyright 2017-2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-//***************************************************************************** - -#include -#include -#include -#include -#include -#include -#include -#include "gtest/gtest.h" - -#include "ngraph/autodiff/adjoints.hpp" -#include "ngraph/graph_util.hpp" -#include "ngraph/log.hpp" -#include "ngraph/ngraph.hpp" -#include "ngraph/op/experimental/generate_mask.hpp" -#include "ngraph/serializer.hpp" -#include "ngraph/state/rng_state.hpp" -#include "util/all_close.hpp" -#include "util/all_close_f.hpp" -#include "util/ndarray.hpp" -#include "util/random.hpp" -#include "util/test_control.hpp" -#include "util/test_tools.hpp" - -using namespace std; -using namespace ngraph; - -static string s_manifest = "${MANIFEST}"; - -static const vector s_known_element_types = {element::from(), - element::from(), - element::from(), - element::from(), - element::from(), - element::from(), - element::from(), - element::from(), - element::from(), - element::from()}; - -class UnhandledOp : public ngraph::op::Op -{ -public: - UnhandledOp(const std::shared_ptr& arg) - : Op("Unsupported_op", check_single_output_args({arg})) - { - constructor_validate_and_infer_types(); - } - shared_ptr copy_with_new_args(const NodeVector& new_args) const override - { - return make_shared(new_args[0]); - } - -protected: - void validate_and_infer_types() override - { - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); - } -}; - -NGRAPH_TEST(${BACKEND_NAME}, unhandled_op) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto unhandled = make_shared(A); - auto f = make_shared(unhandled, op::ParameterVector{A}); - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - shared_ptr a = backend->create_tensor(shape); - shared_ptr result = backend->create_tensor(shape); - ASSERT_THROW(backend->call_with_validate(f, {result}, {a}), unsupported_op); -} - -NGRAPH_TEST(${BACKEND_NAME}, function_name) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); 
- auto B = make_shared(element::f32, shape); - auto f = make_shared(A + B, op::ParameterVector{A, B}, "funky func name"); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - shared_ptr a = backend->create_tensor(shape); - shared_ptr b = backend->create_tensor(shape); - shared_ptr result = backend->create_tensor(shape); - - copy_data(a, test::NDArray({{1, 2}, {3, 4}}).get_vector()); - copy_data(b, test::NDArray({{5, 6}, {7, 8}}).get_vector()); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ(read_vector(result), - (test::NDArray({{6, 8}, {10, 12}})).get_vector()); -} - -NGRAPH_TEST(${BACKEND_NAME}, node_name) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto C = A + B; - C->set_name("a node name"); - auto f = make_shared(C, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - shared_ptr a = backend->create_tensor(element::f32, shape); - shared_ptr b = backend->create_tensor(element::f32, shape); - shared_ptr result = backend->create_tensor(element::f32, shape); - - copy_data(a, test::NDArray({{1, 2}, {3, 4}}).get_vector()); - copy_data(b, test::NDArray({{5, 6}, {7, 8}}).get_vector()); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ(read_vector(result), - (test::NDArray({{6, 8}, {10, 12}})).get_vector()); -} - -NGRAPH_TEST(${BACKEND_NAME}, aliased_output) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto C = A + B; - auto D = A * B; - auto E = op::Constant::create(element::f32, shape, {1, 2, 3, 4}); - auto f = make_shared(NodeVector{C, C, D, D, C, E, E}, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - shared_ptr a = backend->create_tensor(element::f32, shape); - 
shared_ptr b = backend->create_tensor(element::f32, shape); - shared_ptr out1 = backend->create_tensor(element::f32, shape); - shared_ptr out2 = backend->create_tensor(element::f32, shape); - shared_ptr out3 = backend->create_tensor(element::f32, shape); - shared_ptr out4 = backend->create_tensor(element::f32, shape); - shared_ptr out5 = backend->create_tensor(element::f32, shape); - shared_ptr out6 = backend->create_tensor(element::f32, shape); - shared_ptr out7 = backend->create_tensor(element::f32, shape); - - copy_data(a, vector{0, 1, 2, 3}); - copy_data(b, vector{1, 2, 3, 4}); - vector expectedC{1, 3, 5, 7}; - vector expectedD{0, 2, 6, 12}; - vector expectedE{1, 2, 3, 4}; - - backend->call_with_validate(f, {out1, out2, out3, out4, out5, out6, out7}, {a, b}); - EXPECT_EQ(expectedC, read_vector(out1)); - EXPECT_EQ(expectedC, read_vector(out2)); - EXPECT_EQ(expectedD, read_vector(out3)); - EXPECT_EQ(expectedD, read_vector(out4)); - EXPECT_EQ(expectedC, read_vector(out5)); - EXPECT_EQ(expectedE, read_vector(out6)); - EXPECT_EQ(expectedE, read_vector(out7)); -} - -NGRAPH_TEST(${BACKEND_NAME}, parameter_as_output) -{ - Shape shape{3, 4}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(A, op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - shared_ptr a = backend->create_tensor(element::f32, shape); - shared_ptr result = backend->create_tensor(element::f32, shape); - - vector expected{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - vector zero(shape_size(shape), 0); - copy_data(a, expected); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ(read_vector(result), expected); -} - -NGRAPH_TEST(${BACKEND_NAME}, abc) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto C = make_shared(element::f32, shape); - auto f = make_shared((A + B) * C, op::ParameterVector{A, B, C}); - - auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - shared_ptr a = backend->create_tensor(element::f32, shape); - shared_ptr b = backend->create_tensor(element::f32, shape); - shared_ptr c = backend->create_tensor(element::f32, shape); - shared_ptr result = backend->create_tensor(element::f32, shape); - - copy_data(a, test::NDArray({{1, 2}, {3, 4}}).get_vector()); - copy_data(b, test::NDArray({{5, 6}, {7, 8}}).get_vector()); - copy_data(c, test::NDArray({{9, 10}, {11, 12}}).get_vector()); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ(read_vector(result), - (test::NDArray({{54, 80}, {110, 144}})).get_vector()); - - backend->call_with_validate(f, {result}, {b, a, c}); - EXPECT_EQ(read_vector(result), - (test::NDArray({{54, 80}, {110, 144}})).get_vector()); - - backend->call_with_validate(f, {result}, {a, c, b}); - EXPECT_EQ(read_vector(result), - (test::NDArray({{50, 72}, {98, 128}})).get_vector()); -} - -NGRAPH_TEST(${BACKEND_NAME}, abc_int64) -{ - Shape shape{2, 2}; - auto A = make_shared(element::i64, shape); - auto B = make_shared(element::i64, shape); - auto C = make_shared(element::i64, shape); - auto f = make_shared((A + B) * C, op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::i64, shape); - copy_data(a, vector{1, 2, 3, 4}); - auto b = backend->create_tensor(element::i64, shape); - copy_data(b, vector{5, 6, 7, 8}); - auto c = backend->create_tensor(element::i64, shape); - copy_data(c, vector{9, 10, 11, 12}); - auto result = backend->create_tensor(element::i64, shape); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ((vector{54, 80, 110, 144}), read_vector(result)); - - backend->call_with_validate(f, {result}, {b, a, c}); - EXPECT_EQ((vector{54, 80, 110, 144}), read_vector(result)); - - backend->call_with_validate(f, {result}, {a, c, b}); - 
EXPECT_EQ((vector{50, 72, 98, 128}), read_vector(result)); -} - -// Multiple retrive values -NGRAPH_TEST(${BACKEND_NAME}, multiple_result) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto C = make_shared(element::f32, shape); - auto A_add_B = make_shared(A, B); - auto A_add_B_mul_C = make_shared(A_add_B, C); - - auto f = - make_shared(NodeVector{A_add_B, A_add_B_mul_C}, op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1, 2, 3, 4}); - auto b = backend->create_tensor(element::f32, shape); - copy_data(b, vector{5, 6, 7, 8}); - auto c = backend->create_tensor(element::f32, shape); - copy_data(c, vector{9, 10, 11, 12}); - - auto r0 = backend->create_tensor(element::f32, shape); - auto r1 = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {r0, r1}, {a, b, c}); - - EXPECT_EQ((vector{6, 8, 10, 12}), read_vector(r0)); - EXPECT_EQ((vector{54, 80, 110, 144}), read_vector(r1)); -} - -NGRAPH_TEST(${BACKEND_NAME}, batch_norm_one_output) -{ - auto shape_in = Shape{2, 3}; - auto shape_mean = Shape{3}; - - auto A = make_shared(element::f64, shape_in); - auto Mean = - op::Constant::create(element::f64, shape_mean, {0.00396654, -1.25294404, 1.16651872}); - auto Variance = - op::Constant::create(element::f64, shape_mean, {2.40871689, 1.44969511, 0.23469392}); - auto Beta = - op::Constant::create(element::f64, shape_mean, {2.14211921, -0.75733924, 0.42210531}); - auto Gamma = - op::Constant::create(element::f64, shape_mean, {1.75437676, 0.37950502, 1.13727544}); - - auto BN = make_shared(A, Gamma, Beta, Mean, Variance, 1e-3); - auto f = make_shared(BN, op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f64, shape_in); - copy_data( - a, - 
vector{-1.97431703, -2.06521307, 0.54122217, 2.53375939, -0.22342691, 0.45340773}); - - auto result = backend->create_tensor(element::f64, shape_in); - vector expected_result{ - -0.09365749, -1.01327395, -1.04269195, 5.00118923, -0.43295258, -1.24840283}; - - backend->call_with_validate(f, {result}, {a}); - EXPECT_TRUE(test::all_close(vector{expected_result}, read_vector(result))); -} - -NGRAPH_TEST(${BACKEND_NAME}, batch_norm_three_outputs) -{ - auto shape_in = Shape{2, 3}; - auto shape_mean = Shape{3}; - - auto A = make_shared(element::f64, shape_in); - auto Beta = - op::Constant::create(element::f64, shape_mean, {2.14211921, -0.75733924, 0.42210531}); - auto Gamma = - op::Constant::create(element::f64, shape_mean, {1.75437676, 0.37950502, 1.13727544}); - - auto BN = make_shared(A, Gamma, Beta, 1e-3); - - auto f0 = - make_shared(make_shared(BN, 0), op::ParameterVector{A}); - auto f1 = - make_shared(make_shared(BN, 1), op::ParameterVector{A}); - auto f2 = - make_shared(make_shared(BN, 2), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f64, shape_in); - copy_data( - a, - vector{-1.97431703, -2.06521307, 0.54122217, 2.53375939, -0.22342691, 0.45340773}); - - auto result0 = backend->create_tensor(element::f64, shape_in); - vector expected_result0{ - 0.3879149, -1.13662076, 1.34494817, 3.89632344, -0.37805778, -0.50073695}; - - backend->call_with_validate(f0, {result0}, {a}); - EXPECT_TRUE(test::all_close(vector{expected_result0}, read_vector(result0))); - - auto result1 = backend->create_tensor(element::f64, shape_mean); - vector expected_result1{0.27972114, -1.14431989, 0.49731493}; - - backend->call_with_validate(f1, {result1}, {a}); - EXPECT_TRUE(test::all_close(vector{expected_result1}, read_vector(result1))); - - auto result2 = backend->create_tensor(element::f64, shape_mean); - vector expected_result2{5.08068895e+00, 8.48043919e-01, 
1.92784308e-03}; - - backend->call_with_validate(f2, {result2}, {a}); - EXPECT_TRUE(test::all_close(vector{expected_result2}, read_vector(result2))); -} - -NGRAPH_TEST(${BACKEND_NAME}, concat_matrix_colwise) -{ - Shape shape_a{2, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{2, 3}; - auto B = make_shared(element::f32, shape_b); - Shape shape_c{2, 3}; - auto C = make_shared(element::f32, shape_c); - Shape shape_r{2, 8}; - auto f = make_shared(make_shared(NodeVector{A, B, C}, 1), - op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{2, 4, 8, 16}); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{1, 2, 4, 8, 16, 32}); - auto c = backend->create_tensor(element::f32, shape_c); - copy_data(c, vector{2, 3, 5, 7, 11, 13}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ((vector{2, 4, 1, 2, 4, 2, 3, 5, 8, 16, 8, 16, 32, 7, 11, 13}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, concat_matrix_rowwise) -{ - Shape shape_a{2, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{3, 2}; - auto B = make_shared(element::f32, shape_b); - Shape shape_c{3, 2}; - auto C = make_shared(element::f32, shape_c); - Shape shape_r{8, 2}; - auto f = make_shared(make_shared(NodeVector{A, B, C}, 0), - op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{2, 4, 8, 16}); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{1, 2, 4, 8, 16, 32}); - auto c = backend->create_tensor(element::f32, shape_c); - copy_data(c, vector{2, 3, 5, 7, 11, 13}); - auto result = 
backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ((vector{2, 4, 8, 16, 1, 2, 4, 8, 16, 32, 2, 3, 5, 7, 11, 13}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, concat_matrix_int64) -{ - Shape shape_a{2, 2}; - auto A = make_shared(element::i64, shape_a); - Shape shape_b{3, 2}; - auto B = make_shared(element::i64, shape_b); - Shape shape_c{3, 2}; - auto C = make_shared(element::i64, shape_c); - Shape shape_r{8, 2}; - auto f = make_shared(make_shared(NodeVector{A, B, C}, 0), - op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::i64, shape_a); - copy_data(a, vector{2, 4, 8, 16}); - auto b = backend->create_tensor(element::i64, shape_b); - copy_data(b, vector{1, 2, 4, 8, 16, 32}); - auto c = backend->create_tensor(element::i64, shape_c); - copy_data(c, vector{2, 3, 5, 7, 11, 13}); - auto result = backend->create_tensor(element::i64, shape_r); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ((vector{2, 4, 8, 16, 1, 2, 4, 8, 16, 32, 2, 3, 5, 7, 11, 13}), - read_vector(result)); -} - -// Params to drive concat_vector_large testing variations -class concat_vector_params : public ::testing::TestWithParam -{ -protected: - concat_vector_params() { num_inputs = GetParam(); } - uint32_t num_inputs; -}; - -NGRAPH_TEST_P(${BACKEND_NAME}, concat_vector_params, concat_vector_large) -{ - Shape shape_a{1}; - NodeVector inputs; - op::ParameterVector inputs_param; - for (uint32_t i = 0; i < num_inputs; i++) - { - auto A = make_shared(element::f32, shape_a); - inputs_param.push_back(A); - inputs.push_back(A); - } - Shape shape_r{num_inputs}; - auto f = make_shared(make_shared(inputs, 0), inputs_param); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - std::vector> inputs_value; - std::vector ref_result; - 
for (uint32_t i = 0; i < num_inputs; i++) - { - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{static_cast(i)}); - ref_result.push_back(static_cast(i)); - inputs_value.push_back(a); - } - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, inputs_value); - EXPECT_EQ(ref_result, read_vector(result)); -} - -// concat_vector_large case generation -// Add thhosw tests to cover paramter space overflow: -// cuda kernel parameter space have limit, if there is large number of parameters, -// there will be overflow for parameter space. -NGRAPH_INSTANTIATE_TEST_CASE_P(${BACKEND_NAME}, - input_sizes, - concat_vector_params, - testing::Values(100, 128, 999)); - -NGRAPH_TEST(${BACKEND_NAME}, concat_vector) -{ - Shape shape_a{4}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{6}; - auto B = make_shared(element::f32, shape_b); - Shape shape_c{2}; - auto C = make_shared(element::f32, shape_c); - Shape shape_r{12}; - auto f = make_shared(make_shared(NodeVector{A, B, C}, 0), - op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{2, 4, 8, 16}); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{1, 2, 4, 8, 16, 32}); - auto c = backend->create_tensor(element::f32, shape_c); - copy_data(c, vector{18, 19}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ((vector{2, 4, 8, 16, 1, 2, 4, 8, 16, 32, 18, 19}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, concat_4d_tensor) -{ - Shape shape{1, 1, 1, 1}; - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto C = make_shared(element::f32, shape); - Shape shape_r{3, 1, 1, 1}; - auto f = 
make_shared(make_shared(NodeVector{A, B, C}, 0), - op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1}); - auto b = backend->create_tensor(element::f32, shape); - copy_data(b, vector{2}); - auto c = backend->create_tensor(element::f32, shape); - copy_data(c, vector{3}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ((vector{1, 2, 3}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, concat_2d_tensor) -{ - Shape shape{1, 1}; - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto C = make_shared(element::f32, shape); - Shape shape_r{3, 1}; - auto f = make_shared(make_shared(NodeVector{A, B, C}, 0), - op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1}); - auto b = backend->create_tensor(element::f32, shape); - copy_data(b, vector{2}); - auto c = backend->create_tensor(element::f32, shape); - copy_data(c, vector{3}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ((vector{1, 2, 3}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_2d_tensor) -{ - Shape shape{1, 1}; - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto add1 = make_shared(A, B); - auto C = make_shared(element::f32, shape); - auto D = make_shared(element::f32, shape); - auto add2 = make_shared(C, D); - auto subtract = make_shared(C, A); - Shape shape_r{3, 1}; - auto f = make_shared(make_shared(NodeVector{add1, add2, subtract}, 0), - op::ParameterVector{A, B, C, D}); - - auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1}); - auto b = backend->create_tensor(element::f32, shape); - copy_data(b, vector{2}); - auto c = backend->create_tensor(element::f32, shape); - copy_data(c, vector{3}); - auto d = backend->create_tensor(element::f32, shape); - copy_data(d, vector{4}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b, c, d}); - EXPECT_EQ((vector{3, 7, 2}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_propagate_2d_tensor) -{ - Shape shape{1, 1}; - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto add1 = make_shared(A, B); - auto C = make_shared(element::f32, shape); - auto D = make_shared(element::f32, shape); - auto add2 = make_shared(C, D); - auto concat1 = make_shared(NodeVector{add1, add2}, 0); - auto subtract = make_shared(C, A); - Shape shape_r{3, 1}; - auto f = make_shared(make_shared(NodeVector{concat1, subtract}, 0), - op::ParameterVector{A, B, C, D}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1}); - auto b = backend->create_tensor(element::f32, shape); - copy_data(b, vector{2}); - auto c = backend->create_tensor(element::f32, shape); - copy_data(c, vector{3}); - auto d = backend->create_tensor(element::f32, shape); - copy_data(d, vector{4}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b, c, d}); - EXPECT_EQ((vector{3, 7, 2}), read_vector(result)); -} - -// from numpy import * -// a=linspace(1,2*3*4*3*2,2*3*4*3*2) -// b=linspace(1000+1,1000+2*3*3*3*2,2*3*3*3*2) -// c=linspace(2000+1,2000+2*3*2*3*2,2*3*2*3*2) -// a.shape=(2,3,4,3,2) -// b.shape=(2,3,3,3,2) -// 
c.shape=(2,3,2,3,2) -// z=concatenate((a,b,c),axis=2) -// z.shape=(2*3*(4+3+2)*3*2) -// set_printoptions(suppress=True) -// print(z) -// -// [ 1. 2. 3. 4. 5. 6. 7. 8. 9. 10. -// 11. 12. 13. 14. 15. 16. 17. 18. 19. 20. -// 21. 22. 23. 24. 1001. 1002. 1003. 1004. 1005. 1006. -// 1007. 1008. 1009. 1010. 1011. 1012. 1013. 1014. 1015. 1016. -// 1017. 1018. 2001. 2002. 2003. 2004. 2005. 2006. 2007. 2008. -// 2009. 2010. 2011. 2012. 25. 26. 27. 28. 29. 30. -// 31. 32. 33. 34. 35. 36. 37. 38. 39. 40. -// 41. 42. 43. 44. 45. 46. 47. 48. 1019. 1020. -// 1021. 1022. 1023. 1024. 1025. 1026. 1027. 1028. 1029. 1030. -// 1031. 1032. 1033. 1034. 1035. 1036. 2013. 2014. 2015. 2016. -// 2017. 2018. 2019. 2020. 2021. 2022. 2023. 2024. 49. 50. -// 51. 52. 53. 54. 55. 56. 57. 58. 59. 60. -// 61. 62. 63. 64. 65. 66. 67. 68. 69. 70. -// 71. 72. 1037. 1038. 1039. 1040. 1041. 1042. 1043. 1044. -// 1045. 1046. 1047. 1048. 1049. 1050. 1051. 1052. 1053. 1054. -// 2025. 2026. 2027. 2028. 2029. 2030. 2031. 2032. 2033. 2034. -// 2035. 2036. 73. 74. 75. 76. 77. 78. 79. 80. -// 81. 82. 83. 84. 85. 86. 87. 88. 89. 90. -// 91. 92. 93. 94. 95. 96. 1055. 1056. 1057. 1058. -// 1059. 1060. 1061. 1062. 1063. 1064. 1065. 1066. 1067. 1068. -// 1069. 1070. 1071. 1072. 2037. 2038. 2039. 2040. 2041. 2042. -// 2043. 2044. 2045. 2046. 2047. 2048. 97. 98. 99. 100. -// 101. 102. 103. 104. 105. 106. 107. 108. 109. 110. -// 111. 112. 113. 114. 115. 116. 117. 118. 119. 120. -// 1073. 1074. 1075. 1076. 1077. 1078. 1079. 1080. 1081. 1082. -// 1083. 1084. 1085. 1086. 1087. 1088. 1089. 1090. 2049. 2050. -// 2051. 2052. 2053. 2054. 2055. 2056. 2057. 2058. 2059. 2060. -// 121. 122. 123. 124. 125. 126. 127. 128. 129. 130. -// 131. 132. 133. 134. 135. 136. 137. 138. 139. 140. -// 141. 142. 143. 144. 1091. 1092. 1093. 1094. 1095. 1096. -// 1097. 1098. 1099. 1100. 1101. 1102. 1103. 1104. 1105. 1106. -// 1107. 1108. 2061. 2062. 2063. 2064. 2065. 2066. 2067. 2068. -// 2069. 2070. 2071. 2072.] 
-NGRAPH_TEST(${BACKEND_NAME}, concat_5d) -{ - vector a_data(2 * 3 * 4 * 3 * 2); - for (int i = 0; i < 2 * 3 * 4 * 3 * 2; i++) - { - a_data[i] = float(i + 1); - } - - vector b_data(2 * 3 * 3 * 3 * 2); - for (int i = 0; i < 2 * 3 * 3 * 3 * 2; i++) - { - b_data[i] = 1000 + float(i + 1); - } - - vector c_data(2 * 3 * 2 * 3 * 2); - for (int i = 0; i < 2 * 3 * 2 * 3 * 2; i++) - { - c_data[i] = 2000 + float(i + 1); - } - - Shape shape_a{2, 3, 4, 3, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{2, 3, 3, 3, 2}; - auto B = make_shared(element::f32, shape_b); - Shape shape_c{2, 3, 2, 3, 2}; - auto C = make_shared(element::f32, shape_c); - Shape shape_r{2, 3, 9, 3, 2}; - - auto r = make_shared(NodeVector{A, B, C}, 2); - auto f = make_shared(r, op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, a_data); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, b_data); - auto c = backend->create_tensor(element::f32, shape_c); - copy_data(c, c_data); - - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ( - (vector{ - 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., - 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., - 1001., 1002., 1003., 1004., 1005., 1006., 1007., 1008., 1009., 1010., 1011., 1012., - 1013., 1014., 1015., 1016., 1017., 1018., 2001., 2002., 2003., 2004., 2005., 2006., - 2007., 2008., 2009., 2010., 2011., 2012., 25., 26., 27., 28., 29., 30., - 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41., 42., - 43., 44., 45., 46., 47., 48., 1019., 1020., 1021., 1022., 1023., 1024., - 1025., 1026., 1027., 1028., 1029., 1030., 1031., 1032., 1033., 1034., 1035., 1036., - 2013., 2014., 2015., 2016., 2017., 2018., 2019., 2020., 2021., 2022., 2023., 2024., - 49., 50., 51., 52., 53., 54., 55., 
56., 57., 58., 59., 60., - 61., 62., 63., 64., 65., 66., 67., 68., 69., 70., 71., 72., - 1037., 1038., 1039., 1040., 1041., 1042., 1043., 1044., 1045., 1046., 1047., 1048., - 1049., 1050., 1051., 1052., 1053., 1054., 2025., 2026., 2027., 2028., 2029., 2030., - 2031., 2032., 2033., 2034., 2035., 2036., 73., 74., 75., 76., 77., 78., - 79., 80., 81., 82., 83., 84., 85., 86., 87., 88., 89., 90., - 91., 92., 93., 94., 95., 96., 1055., 1056., 1057., 1058., 1059., 1060., - 1061., 1062., 1063., 1064., 1065., 1066., 1067., 1068., 1069., 1070., 1071., 1072., - 2037., 2038., 2039., 2040., 2041., 2042., 2043., 2044., 2045., 2046., 2047., 2048., - 97., 98., 99., 100., 101., 102., 103., 104., 105., 106., 107., 108., - 109., 110., 111., 112., 113., 114., 115., 116., 117., 118., 119., 120., - 1073., 1074., 1075., 1076., 1077., 1078., 1079., 1080., 1081., 1082., 1083., 1084., - 1085., 1086., 1087., 1088., 1089., 1090., 2049., 2050., 2051., 2052., 2053., 2054., - 2055., 2056., 2057., 2058., 2059., 2060., 121., 122., 123., 124., 125., 126., - 127., 128., 129., 130., 131., 132., 133., 134., 135., 136., 137., 138., - 139., 140., 141., 142., 143., 144., 1091., 1092., 1093., 1094., 1095., 1096., - 1097., 1098., 1099., 1100., 1101., 1102., 1103., 1104., 1105., 1106., 1107., 1108., - 2061., 2062., 2063., 2064., 2065., 2066., 2067., 2068., 2069., 2070., 2071., 2072.}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, concat_zero_length_1d_last) -{ - Shape shape_a{4}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{0}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{4}; - - auto r = make_shared(NodeVector{A, B}, 0); - auto f = make_shared(r, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - vector a_data{1, 2, 3, 4}; - vector b_data(0); - - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, a_data); - auto b = backend->create_tensor(element::f32, 
shape_b); - copy_data(b, b_data); - - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, concat_zero_length_1d_middle) -{ - Shape shape_a{4}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{0}; - auto B = make_shared(element::f32, shape_b); - Shape shape_c{4}; - auto C = make_shared(element::f32, shape_c); - Shape shape_r{8}; - - auto r = make_shared(NodeVector{A, B, C}, 0); - auto f = make_shared(r, op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - vector a_data{1, 2, 3, 4}; - vector b_data(0); - vector c_data{5, 6, 7, 8}; - - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, a_data); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, b_data); - auto c = backend->create_tensor(element::f32, shape_c); - copy_data(c, c_data); - - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ((vector{1, 2, 3, 4, 5, 6, 7, 8}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, concat_zero_length_4d_middle) -{ - Shape shape_a{2, 2, 1, 1}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{2, 2, 0, 1}; - auto B = make_shared(element::f32, shape_b); - Shape shape_c{2, 2, 1, 1}; - auto C = make_shared(element::f32, shape_c); - Shape shape_r{2, 2, 2, 1}; - - auto r = make_shared(NodeVector{A, B, C}, 2); - auto f = make_shared(r, op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - vector a_data{1, 2, 3, 4}; - vector b_data(0); - vector c_data{5, 6, 7, 8}; - - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, a_data); - auto b = backend->create_tensor(element::f32, shape_b); - 
copy_data(b, b_data); - auto c = backend->create_tensor(element::f32, shape_c); - copy_data(c, c_data); - - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ((vector{1, 5, 2, 6, 3, 7, 4, 8}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, lrn) -{ - Shape shape{2, 3, 2, 1}; - auto A = make_shared(element::f32, shape); - auto lrn = make_shared(A, 1., 2., 1., 3); - auto f = make_shared(lrn, op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - vector args{0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f}; - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, args); - - auto result = backend->create_tensor(element::f32, shape); - backend->call_with_validate(f, {result}, {a}); - - vector expected{0.f, - 0.05325444f, - 0.03402646f, - 0.01869806f, - 0.06805293f, - 0.03287071f, - 0.00509002f, - 0.00356153f, - 0.00174719f, - 0.0012555f, - 0.00322708f, - 0.00235574f}; - EXPECT_TRUE(test::all_close_f(expected, read_vector(result))); -} - -NGRAPH_TEST(${BACKEND_NAME}, select) -{ - Shape shape{2, 2, 2}; - auto A = make_shared(element::boolean, shape); - auto B = make_shared(element::f32, shape); - auto C = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, B, C), op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::boolean, shape); - copy_data(a, vector{0, 1, 1, 0, 0, 1, 0, 1}); - auto b = backend->create_tensor(element::f32, shape); - copy_data(b, vector{1, 2, 3, 4, 5, 6, 7, 8}); - auto c = backend->create_tensor(element::f32, shape); - copy_data(c, vector{11, 12, 13, 14, 15, 16, 17, 18}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ((vector{11, 2, 3, 14, 15, 6, 17, 8}), 
read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, tensor_constant) -{ - Shape shape{2, 2, 2}; - auto A = op::Constant::create(element::f32, shape, {1, 2, 3, 4, 5, 6, 7, 8}); - auto f = make_shared(A, op::ParameterVector{}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {}); - EXPECT_EQ((vector{1, 2, 3, 4, 5, 6, 7, 8}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, tensor_2constant) -{ - Shape shape{2, 2, 2}; - auto A = op::Constant::create(element::f32, shape, {1, 2, 3, 4, 5, 6, 7, 8}); - auto f = make_shared(NodeVector{A, A}, op::ParameterVector{}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto result0 = backend->create_tensor(element::f32, shape); - auto result1 = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result0, result1}, {}); - EXPECT_EQ((vector{1, 2, 3, 4, 5, 6, 7, 8}), read_vector(result0)); - EXPECT_EQ((vector{1, 2, 3, 4, 5, 6, 7, 8}), read_vector(result1)); -} - -NGRAPH_TEST(${BACKEND_NAME}, tensor_constant_with_op) -{ - Shape shape{2, 2, 2}; - auto A = op::Constant::create(element::f32, shape, {-1, 2, 3, -4, 5, -6, -7, 8}); - auto f = make_shared(make_shared(A), op::ParameterVector{}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {}); - EXPECT_EQ((vector{1, 2, 3, 4, 5, 6, 7, 8}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, constant_multi_use) -{ - auto A = make_shared(element::i32, Shape{}, std::vector{"388"}); - auto f = make_shared(A, op::ParameterVector{}); - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - std::shared_ptr r1 = backend->create_tensor(element::i32, 
Shape{}); - backend->call_with_validate(f, {r1}, std::vector>{}); - EXPECT_EQ(read_vector(r1), std::vector{388}); - - std::shared_ptr r2 = backend->create_tensor(element::i32, Shape{}); - backend->call_with_validate(f, {r2}, std::vector>{}); - EXPECT_EQ(read_vector(r2), std::vector{388}); -} - -NGRAPH_TEST(${BACKEND_NAME}, function_call) -{ - // First create "f(A,B,C) = (A+B)*C". - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto C = make_shared(element::f32, shape); - auto f = make_shared((A + B) * C, op::ParameterVector{A, B, C}); - - // Now make "g(X,Y,Z) = f(X,Y,Z) + f(X,Y,Z)" - auto X = make_shared(element::f32, shape); - auto Y = make_shared(element::f32, shape); - auto Z = make_shared(element::f32, shape); - auto g = - make_shared(make_shared(f, NodeVector{X + Y, Y + Z, Z + X}) + - make_shared(f, NodeVector{X, Y, Z}), - op::ParameterVector{X, Y, Z}); - - // Now call g on some test vectors. - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto x = backend->create_tensor(element::f32, shape); - copy_data(x, vector{1, 2, 3, 4}); - auto y = backend->create_tensor(element::f32, shape); - copy_data(y, vector{5, 6, 7, 8}); - auto z = backend->create_tensor(element::f32, shape); - copy_data(z, vector{9, 10, 11, 12}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(g, {result}, {x, y, z}); - EXPECT_EQ((vector{254, 368, 502, 656}), read_vector(result)); - - backend->call_with_validate(g, {result}, {y, x, z}); - EXPECT_EQ((vector{278, 400, 542, 704}), read_vector(result)); - - backend->call_with_validate(g, {result}, {x, z, y}); - EXPECT_EQ((vector{194, 296, 418, 560}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, convert_int32_float32) -{ - Shape shape{2, 2}; - auto A = make_shared(element::i32, shape); - auto f = - make_shared(make_shared(A, element::f32), op::ParameterVector{A}); - - auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::i32, shape); - copy_data(a, vector{1, 2, 3, 4}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, convert_uint16_float32) -{ - Shape shape{2, 2}; - auto A = make_shared(element::u16, shape); - auto f = - make_shared(make_shared(A, element::f32), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::u16, shape); - copy_data(a, vector{1, 2, 3, 4}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, convert_int32_bool) -{ - Shape shape{2, 2}; - auto A = make_shared(element::i32, shape); - auto f = make_shared(make_shared(A, element::boolean), - op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::i32, shape); - copy_data(a, vector{1, 2, 3, 4}); - auto result = backend->create_tensor(element::boolean, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, convert_float32_bool) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, element::boolean), - op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1, 2, 3, 4}); - auto result = backend->create_tensor(element::boolean, shape); - - 
backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, slice_scalar) -{ - Shape shape_a{}; - auto A = make_shared(element::f32, shape_a); - Shape shape_r{}; - auto r = make_shared(A, Coordinate{}, Coordinate{}); - auto f = make_shared(r, op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{312}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{312}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, slice_matrix) -{ - Shape shape_a{4, 4}; - auto A = make_shared(element::f32, shape_a); - Shape shape_r{3, 2}; - auto r = make_shared(A, Coordinate{0, 1}, Coordinate{3, 3}); - auto f = make_shared(r, op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{2, 3, 6, 7, 10, 11}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, slice_vector) -{ - Shape shape_a{16}; - auto A = make_shared(element::f32, shape_a); - Shape shape_r{12}; - auto r = make_shared(A, Coordinate{2}, Coordinate{14}); - auto f = make_shared(r, op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a}); - 
EXPECT_EQ((vector{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_axis_0_overlap) -{ - Shape shape_a{4, 4}; - auto A = make_shared(element::f32, shape_a); - auto B = make_shared(element::f32, shape_a); - auto C = make_shared(A, B); - Shape shape_r{2, 4}; - auto D = make_shared(C, Coordinate{0, 0}, Coordinate{2, 4}); - auto E = make_shared(C, Coordinate{1, 0}, Coordinate{3, 4}); - auto r = make_shared(D, E); - auto f = make_shared(r, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - auto b = backend->create_tensor(element::f32, shape_a); - copy_data(b, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((vector{12, 16, 20, 24, 28, 32, 36, 40}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_strided) -{ - Shape shape_a{4, 4}; - auto A = make_shared(element::f32, shape_a); - Shape shape_r{2, 2}; - auto r = make_shared(A, Coordinate{1, 0}, Coordinate{4, 4}, Strides{2, 3}); - auto f = make_shared(r, op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{4, 7, 12, 15}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, slice_3d) -{ - Shape shape_a{4, 4, 4}; - auto A = make_shared(element::f32, shape_a); - Shape shape_r{2, 2, 2}; - auto r = make_shared(A, Coordinate{1, 1, 1}, 
Coordinate{3, 3, 3}); - auto f = make_shared(r, op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{21, 22, 25, 26, 37, 38, 41, 42}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, slice_3d_strided) -{ - Shape shape_a{4, 4, 4}; - auto A = make_shared(element::f32, shape_a); - Shape shape_r{2, 2, 2}; - auto r = make_shared(A, Coordinate{0, 0, 0}, Coordinate{4, 4, 4}, Strides{2, 2, 2}); - auto f = make_shared(r, op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{0, 2, 8, 10, 32, 34, 40, 42}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, slice_3d_strided_different_strides) -{ - Shape shape_a{4, 4, 4}; - auto A = make_shared(element::f32, shape_a); - Shape shape_r{2, 2, 2}; - auto r = make_shared(A, Coordinate{0, 0, 0}, Coordinate{4, 4, 4}, Strides{2, 2, 3}); - auto f = make_shared(r, op::ParameterVector{A}); - - auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{0, 3, 8, 11, 32, 35, 40, 43}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, scalar_constant_float32) -{ - auto r = op::Constant::create(element::f32, Shape{}, {4.75}); - auto f = make_shared(r, op::ParameterVector{}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto result = backend->create_tensor(element::f32, Shape{}); - - backend->call_with_validate(f, {result}, {}); - EXPECT_EQ(vector{4.75f}, read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, scalar_constant_int64) -{ - auto r = op::Constant::create(element::i64, Shape{}, {2112}); - auto f = make_shared(r, op::ParameterVector{}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto result = backend->create_tensor(element::i64, Shape{}); - - backend->call_with_validate(f, {result}, {}); - EXPECT_EQ(vector{2112}, read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, tensor_constant_float32) -{ - Shape shape{2, 2}; - auto r = op::Constant::create(element::f32, shape, {4.75, 4.5, -5.25, 0.0}); - auto f = make_shared(r, op::ParameterVector{}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {}); - EXPECT_EQ((vector{4.75f, 4.5f, -5.25f, 0.0f}), 
read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, tensor_constant_int64) -{ - Shape shape{2, 2}; - auto r = op::Constant::create(element::i64, shape, {2112, 1848, 1776, 1964}); - auto f = make_shared(r, op::ParameterVector{}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto result = backend->create_tensor(element::i64, shape); - - backend->call_with_validate(f, {result}, {}); - EXPECT_EQ((vector{2112, 1848, 1776, 1964}), read_vector(result)); -} - -// TODO: Kahan sum only works in limited cases with CPU / Interpreter backend -NGRAPH_TEST(${BACKEND_NAME}, kahan_sum_to_scalar) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - float epsilon = 9.5367431640625e-7f; - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{epsilon, -1.f, 0.f, 1.f}); - auto result = backend->create_tensor(element::f32, Shape{}); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_TRUE(test::all_close_f(vector{epsilon}, read_vector(result))); -} - -// TODO: Kahan sum only works in limited cases with CPU / Interpreter backend -NGRAPH_TEST(${BACKEND_NAME}, kahan_sum_3d_to_vector) -{ - Shape shape_a{3, 3, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3}; - auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - float epsilon_a = 1.220703125e-4f; - float epsilon_b = 3.0517578125e-5f; - float epsilon_c = 7.62939453125e-6f; - copy_data(a, vector{1, 1, 1, 1, 1, 1, epsilon_a, epsilon_b, epsilon_c, - 1, 1, 1, 1, 1, 1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1}); - auto result = 
backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_TRUE(test::all_close_f(vector{epsilon_a, epsilon_b, epsilon_c}, - read_vector(result))); -} - -NGRAPH_TEST(${BACKEND_NAME}, constant_equality_bool) -{ - Shape shape{4}; - // auto A = make_shared(element::boolean, shape); - // auto B = make_shared(element::boolean, shape); - // auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); - - auto A = op::Constant::create(element::boolean, shape, {true, false, true, false}); - auto B = op::Constant::create(element::boolean, shape, {true, true, true, true}); - auto f = make_shared(make_shared(A, B), op::ParameterVector{}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto result = backend->create_tensor(element::boolean, shape); - - backend->call_with_validate(f, {result}, {}); - EXPECT_EQ((vector{true, false, true, false}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, replace_slice_scalar) -{ - Shape shape_a{}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{}; - auto r = make_shared(A, B, Coordinate{}, Coordinate{}); - auto f = make_shared(r, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{312}); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{808}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((vector{808}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, replace_slice_matrix_inplace) -{ - Shape shape_a{4, 4}; - auto A = make_shared(element::f32, shape_a); - auto abs_A = make_shared(A); - - Shape shape_b{3, 2}; - auto B = make_shared(element::f32, shape_b); - 
Shape shape_r{4, 4}; - auto r = make_shared(abs_A, B, Coordinate{0, 1}, Coordinate{3, 3}); - auto abs_r = make_shared(r); - auto f = make_shared(abs_r, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{102, 103, 106, 107, 110, 111}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((vector{1, 102, 103, 4, 5, 106, 107, 8, 9, 110, 111, 12, 13, 14, 15, 16}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, replace_slice_matrix) -{ - Shape shape_a{4, 4}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{3, 2}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{4, 4}; - auto r = make_shared(A, B, Coordinate{0, 1}, Coordinate{3, 3}); - auto f = make_shared(r, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{102, 103, 106, 107, 110, 111}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((vector{1, 102, 103, 4, 5, 106, 107, 8, 9, 110, 111, 12, 13, 14, 15, 16}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, replace_slice_vector) -{ - Shape shape_a{16}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{12}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{16}; - auto r = make_shared(A, B, Coordinate{2}, Coordinate{14}); - auto f = make_shared(r, 
op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ( - (vector{0, 1, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 14, 15}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, replace_slice_3d) -{ - Shape shape_a{4, 4, 4}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{2, 2, 2}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{4, 4, 4}; - auto r = make_shared(A, B, Coordinate{1, 1, 1}, Coordinate{3, 3, 3}); - auto f = make_shared(r, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{921, 922, 925, 926, 937, 938, 941, 942}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - - 16, 17, 18, 19, 20, 921, 922, 23, 24, 925, 926, 27, 28, 29, 30, 31, - - 32, 33, 34, 35, 36, 937, 938, 39, 40, 941, 942, 43, 44, 45, 46, 47, - - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}), - 
read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, replace_slice_3d_strided) -{ - Shape shape_a{4, 4, 4}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{2, 2, 2}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{4, 4, 4}; - auto r = make_shared( - A, B, Coordinate{0, 0, 0}, Coordinate{4, 4, 4}, Strides{2, 2, 2}); - auto f = make_shared(r, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{900, 902, 908, 910, 932, 934, 940, 942}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((vector{900, 1, 902, 3, 4, 5, 6, 7, 908, 9, 910, 11, 12, 13, 14, 15, - - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - - 932, 33, 934, 35, 36, 37, 38, 39, 940, 41, 942, 43, 44, 45, 46, 47, - - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, replace_slice_3d_strided_different_strides) -{ - Shape shape_a{4, 4, 4}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{2, 2, 2}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{4, 4, 4}; - auto r = make_shared( - A, B, Coordinate{0, 0, 0}, Coordinate{4, 4, 4}, Strides{2, 2, 3}); - auto f = make_shared(r, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{0, 
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{900, 903, 908, 911, 932, 935, 940, 943}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((vector{900, 1, 2, 903, 4, 5, 6, 7, 908, 9, 10, 911, 12, 13, 14, 15, - - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - - 932, 33, 34, 935, 36, 37, 38, 39, 940, 41, 42, 943, 44, 45, 46, 47, - - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_0d) -{ - Shape shape{}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{6}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{6}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_1d_nochange) -{ - Shape shape{8}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{0, 1, 2, 3, 4, 5, 6, 7}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_1d_0) -{ - Shape 
shape{8}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{0, 1, 2, 3, 4, 5, 6, 7}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{7, 6, 5, 4, 3, 2, 1, 0}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_2d_nochange) -{ - Shape shape{4, 3}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, - test::NDArray({{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}).get_vector()); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ( - (test::NDArray({{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}).get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_2d_0) -{ - Shape shape{4, 3}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, - test::NDArray({{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}).get_vector()); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ( - (test::NDArray({{9, 10, 11}, {6, 7, 8}, {3, 4, 5}, {0, 1, 2}}).get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_2d_1) -{ - Shape shape{4, 3}; - auto A = 
make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, - test::NDArray({{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}).get_vector()); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ( - (test::NDArray({{2, 1, 0}, {5, 4, 3}, {8, 7, 6}, {11, 10, 9}}).get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_2d_01) -{ - Shape shape{4, 3}; - auto A = make_shared(element::f32, shape); - auto f = - make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, - test::NDArray({{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}).get_vector()); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ( - (test::NDArray({{11, 10, 9}, {8, 7, 6}, {5, 4, 3}, {2, 1, 0}}).get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_nochange) -{ - Shape shape{2, 4, 3}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, - test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, - {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) - .get_vector()); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, - 
{{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_0) -{ - Shape shape{2, 4, 3}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, - test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, - {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) - .get_vector()); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((test::NDArray({{{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}, - {{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_1) -{ - Shape shape{2, 4, 3}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, - test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, - {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) - .get_vector()); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((test::NDArray({{{9, 10, 11}, {6, 7, 8}, {3, 4, 5}, {0, 1, 2}}, - {{21, 22, 23}, {18, 19, 20}, {15, 16, 17}, {12, 13, 14}}}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_2) -{ - Shape shape{2, 4, 3}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{2}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create 
some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, - test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, - {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) - .get_vector()); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((test::NDArray({{{2, 1, 0}, {5, 4, 3}, {8, 7, 6}, {11, 10, 9}}, - {{14, 13, 12}, {17, 16, 15}, {20, 19, 18}, {23, 22, 21}}}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_01) -{ - Shape shape{2, 4, 3}; - auto A = make_shared(element::f32, shape); - auto f = - make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, - test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, - {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) - .get_vector()); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((test::NDArray({{{21, 22, 23}, {18, 19, 20}, {15, 16, 17}, {12, 13, 14}}, - {{9, 10, 11}, {6, 7, 8}, {3, 4, 5}, {0, 1, 2}}}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_02) -{ - Shape shape{2, 4, 3}; - auto A = make_shared(element::f32, shape); - auto f = - make_shared(make_shared(A, AxisSet{0, 2}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, - test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, - {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) - .get_vector()); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - 
EXPECT_EQ((test::NDArray({{{14, 13, 12}, {17, 16, 15}, {20, 19, 18}, {23, 22, 21}}, - {{2, 1, 0}, {5, 4, 3}, {8, 7, 6}, {11, 10, 9}}}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_12) -{ - Shape shape{2, 4, 3}; - auto A = make_shared(element::f32, shape); - auto f = - make_shared(make_shared(A, AxisSet{1, 2}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, - test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, - {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) - .get_vector()); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((test::NDArray({{{11, 10, 9}, {8, 7, 6}, {5, 4, 3}, {2, 1, 0}}, - {{23, 22, 21}, {20, 19, 18}, {17, 16, 15}, {14, 13, 12}}}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_3d_012) -{ - Shape shape{2, 4, 3}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{0, 1, 2}), - op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, - test::NDArray({{{0, 1, 2}, {3, 4, 5}, {6, 7, 8}, {9, 10, 11}}, - {{12, 13, 14}, {15, 16, 17}, {18, 19, 20}, {21, 22, 23}}}) - .get_vector()); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((test::NDArray({{{23, 22, 21}, {20, 19, 18}, {17, 16, 15}, {14, 13, 12}}, - {{11, 10, 9}, {8, 7, 6}, {5, 4, 3}, {2, 1, 0}}}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, numeric_float_nan) -{ - Shape shape{5}; - auto A = op::Constant::create(element::f32, shape, {-2.5f, 25.5f, 2.25f, NAN, 6.0f}); - auto B = 
op::Constant::create(element::f32, shape, {10.0f, 5.0f, 2.25f, 10.0f, NAN}); - auto f = make_shared(make_shared(A, B), op::ParameterVector{}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto result = backend->create_tensor(element::boolean, shape); - backend->call_with_validate(f, {result}, {}); - EXPECT_EQ((vector{false, false, true, false, false}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, numeric_double_nan) -{ - Shape shape{5}; - auto A = op::Constant::create(element::f64, shape, {-2.5f, 25.5f, 2.25f, NAN, 6.0f}); - auto B = op::Constant::create(element::f64, shape, {10.0f, 5.0f, 2.25f, 10.0f, NAN}); - auto f = make_shared(make_shared(A, B), op::ParameterVector{}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto result = backend->create_tensor(element::boolean, shape); - backend->call_with_validate(f, {result}, {}); - EXPECT_EQ((vector{false, false, true, false, false}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, numeric_float_inf) -{ - Shape shape{5}; - auto A = op::Constant::create(element::f32, shape, {-2.5f, 25.5f, 2.25f, INFINITY, 6.0f}); - auto B = op::Constant::create(element::f32, shape, {10.0f, 5.0f, 2.25f, 10.0f, -INFINITY}); - auto f = make_shared(make_shared(A, B), op::ParameterVector{}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto result = backend->create_tensor(element::boolean, shape); - backend->call_with_validate(f, {result}, {}); - EXPECT_EQ((vector{false, false, true, false, false}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, numeric_double_inf) -{ - Shape shape{5}; - auto A = op::Constant::create(element::f64, shape, {-2.5f, 25.5f, 2.25f, INFINITY, 6.0f}); - auto B = op::Constant::create(element::f64, shape, {10.0f, 5.0f, 2.25f, 10.0f, -INFINITY}); - auto f = make_shared(make_shared(A, B), 
op::ParameterVector{}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto result = backend->create_tensor(element::boolean, shape); - backend->call_with_validate(f, {result}, {}); - EXPECT_EQ((vector{false, false, true, false, false}), read_vector(result)); -} - -// -// From the XLA docs: https://www.tensorflow.org/performance/xla/operation_semantics#selectandscatter -// -NGRAPH_TEST(${BACKEND_NAME}, select_and_scatter_with_overlap) -{ - Shape shape_sel_a{}; - auto SEL_A = make_shared(element::f32, shape_sel_a); - Shape shape_sel_b{}; - auto SEL_B = make_shared(element::f32, shape_sel_b); - auto sel_f = make_shared(make_shared(SEL_A, SEL_B), - op::ParameterVector{SEL_A, SEL_B}); - - Shape shape_scatter_a{}; - auto SCATTER_A = make_shared(element::f32, shape_scatter_a); - Shape shape_scatter_b{}; - auto SCATTER_B = make_shared(element::f32, shape_scatter_b); - auto scatter_f = - make_shared(SCATTER_A + SCATTER_B, op::ParameterVector{SCATTER_A, SCATTER_B}); - - Shape shape_a{4, 5}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{2, 2}; - auto B = make_shared(element::f32, shape_b); - Shape shape_c{}; - auto C = make_shared(element::f32, shape_c); - Shape shape_r{4, 5}; - Shape window_shape{2, 3}; - auto window_strides = Strides{2, 2}; - auto f = make_shared( - make_shared(A, B, C, sel_f, scatter_f, window_shape, window_strides), - op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, - test::NDArray( - {{7, 2, 5, 3, 8}, {3, 8, 9, 3, 4}, {1, 5, 7, 5, 6}, {0, 6, 2, 10, 2}}) - .get_vector()); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, test::NDArray({{2, 6}, {3, 1}}).get_vector()); - auto c = backend->create_tensor(element::f32, shape_c); - copy_data(c, vector{0}); - auto result = 
backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ((test::NDArray( - {{0, 0, 0, 0, 0}, {0, 0, 8, 0, 0}, {0, 0, 3, 0, 0}, {0, 0, 0, 1, 0}}) - .get_vector()), - read_vector(result)); -} - -// -// From the XLA docs: https://www.tensorflow.org/performance/xla/operation_semantics#selectandscatter -// -NGRAPH_TEST(${BACKEND_NAME}, select_and_scatter_without_overlap) -{ - Shape shape_sel_a{}; - auto SEL_A = make_shared(element::f32, shape_sel_a); - Shape shape_sel_b{}; - auto SEL_B = make_shared(element::f32, shape_sel_b); - auto sel_f = make_shared(make_shared(SEL_A, SEL_B), - op::ParameterVector{SEL_A, SEL_B}); - - Shape shape_scatter_a{}; - auto SCATTER_A = make_shared(element::f32, shape_scatter_a); - Shape shape_scatter_b{}; - auto SCATTER_B = make_shared(element::f32, shape_scatter_b); - auto scatter_f = - make_shared(SCATTER_A + SCATTER_B, op::ParameterVector{SCATTER_A, SCATTER_B}); - - Shape shape_a{4, 6}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{2, 2}; - auto B = make_shared(element::f32, shape_b); - Shape shape_c{}; - auto C = make_shared(element::f32, shape_c); - Shape shape_r{4, 6}; - Shape window_shape{2, 3}; - auto window_strides = Strides{2, 3}; - auto f = make_shared( - make_shared(A, B, C, sel_f, scatter_f, window_shape, window_strides), - op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, - test::NDArray( - {{7, 2, 5, 3, 10, 2}, {3, 8, 9, 3, 4, 2}, {1, 5, 7, 5, 6, 1}, {0, 6, 2, 7, 2, 8}}) - .get_vector()); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, test::NDArray({{2, 6}, {3, 1}}).get_vector()); - auto c = backend->create_tensor(element::f32, shape_c); - copy_data(c, vector{0}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, 
{result}, {a, b, c}); - EXPECT_EQ((test::NDArray( - {{0, 0, 0, 0, 6, 0}, {0, 0, 2, 0, 0, 0}, {0, 0, 3, 0, 0, 0}, {0, 0, 0, 0, 0, 1}}) - .get_vector()), - read_vector(result)); -} - -// -// Adapted from the XLA docs to provide an example in >2D: https://www.tensorflow.org/performance/xla/operation_semantics#selectandscatter -// -NGRAPH_TEST(${BACKEND_NAME}, select_and_scatter_3d_without_overlap) -{ - Shape shape_sel_a{}; - auto SEL_A = make_shared(element::f32, shape_sel_a); - Shape shape_sel_b{}; - auto SEL_B = make_shared(element::f32, shape_sel_b); - auto sel_f = make_shared(make_shared(SEL_A, SEL_B), - op::ParameterVector{SEL_A, SEL_B}); - - Shape shape_scatter_a{}; - auto SCATTER_A = make_shared(element::f32, shape_scatter_a); - Shape shape_scatter_b{}; - auto SCATTER_B = make_shared(element::f32, shape_scatter_b); - auto scatter_f = - make_shared(SCATTER_A + SCATTER_B, op::ParameterVector{SCATTER_A, SCATTER_B}); - - Shape shape_a{2, 4, 6}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{1, 2, 2}; - auto B = make_shared(element::f32, shape_b); - Shape shape_c{}; - auto C = make_shared(element::f32, shape_c); - Shape shape_r{2, 4, 6}; - Shape window_shape{2, 2, 3}; - auto window_strides = Strides{2, 2, 3}; - auto f = make_shared( - make_shared(A, B, C, sel_f, scatter_f, window_shape, window_strides), - op::ParameterVector{A, B, C}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data( - a, - test::NDArray( - {{{7, 2, 5, 3, 10, 2}, {3, 8, 9, 3, 4, 2}, {1, 5, 7, 5, 6, 1}, {0, 6, 2, 7, 2, 8}}, - {{2, 5, 8, 3, 4, 2}, {1, 2, 8, 4, 5, 2}, {10, 2, 3, 4, 1, 0}, {4, 1, 2, 4, 5, 7}}}) - .get_vector()); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, test::NDArray({{{2, 6}, {3, 1}}}).get_vector()); - auto c = backend->create_tensor(element::f32, shape_c); - copy_data(c, vector{0}); - auto result = 
backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b, c}); - EXPECT_EQ( - (test::NDArray( - {{{0, 0, 0, 0, 6, 0}, {0, 0, 2, 0, 0, 0}, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 1}}, - {{0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}, {3, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0}}}) - .get_vector()), - read_vector(result)); -} - -template -void make_unary_empty_test(const string& backend_name) -{ - Shape shape{0}; - - op::ParameterVector params; - NodeVector result_list; - for (size_t i = 0; i < s_known_element_types.size(); i++) - { - shared_ptr p = make_shared(s_known_element_types[i], shape); - params.push_back(p); - result_list.push_back(make_shared(p)); - } - - auto f = make_shared(result_list, params); - auto backend = runtime::Backend::create(backend_name); - - vector> inputs; - vector> outputs; - for (size_t i = 0; i < s_known_element_types.size(); i++) - { - inputs.push_back(backend->create_tensor(s_known_element_types[i], shape)); - outputs.push_back(backend->create_tensor(s_known_element_types[i], shape)); - } - - backend->call_with_validate(f, outputs, inputs); - - EXPECT_EQ(read_vector(inputs[0]).size(), 0); - EXPECT_EQ(read_vector(inputs[1]).size(), 0); - EXPECT_EQ(read_vector(inputs[2]).size(), 0); - EXPECT_EQ(read_vector(inputs[3]).size(), 0); - EXPECT_EQ(read_vector(inputs[4]).size(), 0); - EXPECT_EQ(read_vector(inputs[5]).size(), 0); - EXPECT_EQ(read_vector(inputs[6]).size(), 0); - EXPECT_EQ(read_vector(inputs[7]).size(), 0); - EXPECT_EQ(read_vector(inputs[8]).size(), 0); - EXPECT_EQ(read_vector(inputs[9]).size(), 0); - - EXPECT_EQ(read_vector(outputs[0]).size(), 0); - EXPECT_EQ(read_vector(outputs[1]).size(), 0); - EXPECT_EQ(read_vector(outputs[2]).size(), 0); - EXPECT_EQ(read_vector(outputs[3]).size(), 0); - EXPECT_EQ(read_vector(outputs[4]).size(), 0); - EXPECT_EQ(read_vector(outputs[5]).size(), 0); - EXPECT_EQ(read_vector(outputs[6]).size(), 0); - EXPECT_EQ(read_vector(outputs[7]).size(), 0); - 
EXPECT_EQ(read_vector(outputs[8]).size(), 0); - EXPECT_EQ(read_vector(outputs[9]).size(), 0); -} - -template -void make_binary_empty_test(const string& backend_name, bool is_comparison = false) -{ - Shape shape{0}; - op::ParameterVector A; - for (size_t i = 0; i < s_known_element_types.size(); i++) - { - A.push_back(make_shared(s_known_element_types[i], shape)); - } - - NodeVector result_list; - for (shared_ptr p : A) - { - result_list.push_back(make_shared(p, p)); - } - - auto f = make_shared(result_list, A); - auto backend = runtime::Backend::create(backend_name); - - vector> inputs; - vector> outputs; - for (size_t i = 0; i < s_known_element_types.size(); i++) - { - inputs.push_back(backend->create_tensor(s_known_element_types[i], shape)); - if (is_comparison) - { - outputs.push_back(backend->create_tensor(element::from(), shape)); - } - else - { - outputs.push_back(backend->create_tensor(s_known_element_types[i], shape)); - } - } - - backend->call_with_validate(f, outputs, inputs); - - EXPECT_EQ(read_vector(inputs[0]).size(), 0); - EXPECT_EQ(read_vector(inputs[1]).size(), 0); - EXPECT_EQ(read_vector(inputs[2]).size(), 0); - EXPECT_EQ(read_vector(inputs[3]).size(), 0); - EXPECT_EQ(read_vector(inputs[4]).size(), 0); - EXPECT_EQ(read_vector(inputs[5]).size(), 0); - EXPECT_EQ(read_vector(inputs[6]).size(), 0); - EXPECT_EQ(read_vector(inputs[7]).size(), 0); - EXPECT_EQ(read_vector(inputs[8]).size(), 0); - EXPECT_EQ(read_vector(inputs[9]).size(), 0); - - if (is_comparison) - { - EXPECT_EQ(read_vector(outputs[0]).size(), 0); - EXPECT_EQ(read_vector(outputs[1]).size(), 0); - EXPECT_EQ(read_vector(outputs[2]).size(), 0); - EXPECT_EQ(read_vector(outputs[3]).size(), 0); - EXPECT_EQ(read_vector(outputs[4]).size(), 0); - EXPECT_EQ(read_vector(outputs[5]).size(), 0); - EXPECT_EQ(read_vector(outputs[6]).size(), 0); - EXPECT_EQ(read_vector(outputs[7]).size(), 0); - EXPECT_EQ(read_vector(outputs[8]).size(), 0); - EXPECT_EQ(read_vector(outputs[9]).size(), 0); - } - else - { - 
EXPECT_EQ(read_vector(outputs[0]).size(), 0); - EXPECT_EQ(read_vector(outputs[1]).size(), 0); - EXPECT_EQ(read_vector(outputs[2]).size(), 0); - EXPECT_EQ(read_vector(outputs[3]).size(), 0); - EXPECT_EQ(read_vector(outputs[4]).size(), 0); - EXPECT_EQ(read_vector(outputs[5]).size(), 0); - EXPECT_EQ(read_vector(outputs[6]).size(), 0); - EXPECT_EQ(read_vector(outputs[7]).size(), 0); - EXPECT_EQ(read_vector(outputs[8]).size(), 0); - EXPECT_EQ(read_vector(outputs[9]).size(), 0); - } -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_abs) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_ceiling) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_exp) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_floor) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_log) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_negative) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_not) -{ - Shape shape{0}; - auto A = make_shared(element::from(), shape); - auto f = make_shared(make_shared(A), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::from(), shape); - auto result = backend->create_tensor(element::from(), shape); - - backend->call_with_validate(f, {result}, {a}); - - auto in_vec = read_vector(a); - auto out_vec = read_vector(result); - - EXPECT_EQ(in_vec.size(), 0); - EXPECT_EQ(out_vec.size(), 0); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_sign) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_sqrt) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_sin) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_sinh) -{ - 
make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_cos) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_cosh) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_tan) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_tanh) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_asin) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_acos) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_atan) -{ - make_unary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_add) -{ - make_binary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_divide) -{ - make_binary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_eq) -{ - make_binary_empty_test("${BACKEND_NAME}", true); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_greater) -{ - make_binary_empty_test("${BACKEND_NAME}", true); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_greatereq) -{ - make_binary_empty_test("${BACKEND_NAME}", true); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_less) -{ - make_binary_empty_test("${BACKEND_NAME}", true); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_lesseq) -{ - make_binary_empty_test("${BACKEND_NAME}", true); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_maximum) -{ - make_binary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_minimum) -{ - make_binary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_multiply) -{ - make_binary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_not_equal) -{ - make_binary_empty_test("${BACKEND_NAME}", true); -} - -NGRAPH_TEST(${BACKEND_NAME}, zero_sized_power) -{ - make_binary_empty_test("${BACKEND_NAME}"); -} - 
-NGRAPH_TEST(${BACKEND_NAME}, zero_sized_subtract) -{ - make_binary_empty_test("${BACKEND_NAME}"); -} - -NGRAPH_TEST(${BACKEND_NAME}, convolution_outlining) -{ - Shape shape_a{1, 2, 2, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{2, 2, 1, 1}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{1, 2, 2, 2}; - auto conv1 = make_shared(A, - B, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}); - auto conv2 = make_shared(conv1, - B, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}); - auto f = make_shared(conv2, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{1.0f, 1.0f, 1.0f, 1.0f}); - auto result = backend->create_tensor(element::f32, shape_r); - - vector expected_result{4.0f, 4.0f, 4.0f, 4.0f, 4.0f, 4.0f, 4.0f, 4.0f}; - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ(vector{expected_result}, read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, computation_reuse) -{ - Shape shape_a{1, 16, 2, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{32, 16, 1, 1}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{1, 32, 2, 2}; - auto conv = make_shared(A, - B, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}); - Shape pool_shape{1, 1}; - auto pool = make_shared(conv, pool_shape); - auto bias = make_shared( - op::Constant::create(element::f32, Shape{}, {2.14}), shape_r, AxisSet{0, 1, 2, 3}); - auto result_op = make_shared(pool + bias); - auto f = make_shared(ResultVector{result_op}, op::ParameterVector{A, B}); - - auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); - - vector input(64, 1.0f); - vector weights(512, 0.5f); - vector rv(128); - - auto a = backend->create_tensor(element::f32, shape_a, input.data()); - auto b = backend->create_tensor(element::f32, shape_b, weights.data()); - auto result = backend->create_tensor(element::f32, shape_r, rv.data()); - - backend->call_with_validate(f, {result}, {a, b}); - - vector rv_saved(rv); - - b->set_stale(false); - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ(rv_saved, rv); -} - -NGRAPH_TEST(${BACKEND_NAME}, pad_interior_1d) -{ - Shape shape_a{6}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{16}; - Shape padding_below{0}; - Shape padding_above{0}; - Shape padding_interior{2}; - auto f = make_shared( - make_shared(A, B, padding_below, padding_above, padding_interior), - op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, test::NDArray({1, 2, 3, 4, 5, 6}).get_vector()); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{2112}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((test::NDArray( - {1, 2112, 2112, 2, 2112, 2112, 3, 2112, 2112, 4, 2112, 2112, 5, 2112, 2112, 6}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, pad_exterior_1d) -{ - Shape shape_a{6}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{15}; - Shape padding_below{4}; - Shape padding_above{5}; - Shape padding_interior{0}; - auto f = make_shared( - make_shared(A, B, padding_below, padding_above, padding_interior), - op::ParameterVector{A, B}); - - auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, test::NDArray({1, 2, 3, 4, 5, 6}).get_vector()); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{2112}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((test::NDArray( - {2112, 2112, 2112, 2112, 1, 2, 3, 4, 5, 6, 2112, 2112, 2112, 2112, 2112}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_1d) -{ - Shape shape_a{6}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{25}; - Shape padding_below{4}; - Shape padding_above{5}; - Shape padding_interior{2}; - auto f = make_shared( - make_shared(A, B, padding_below, padding_above, padding_interior), - op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, test::NDArray({1, 2, 3, 4, 5, 6}).get_vector()); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{2112}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((test::NDArray({2112, 2112, 2112, 2112, 1, 2112, 2112, 2, 2112, - 2112, 3, 2112, 2112, 4, 2112, 2112, 5, 2112, - 2112, 6, 2112, 2112, 2112, 2112, 2112}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_2d) -{ - Shape shape_a{2, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{7, 6}; - Shape padding_below{1, 0}; - Shape padding_above{2, 1}; - Shape padding_interior{2, 1}; - auto f = make_shared( - make_shared(A, B, padding_below, 
padding_above, padding_interior), - op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, test::NDArray({{1, 2, 3}, {4, 5, 6}}).get_vector()); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{9}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((test::NDArray({{9, 9, 9, 9, 9, 9}, - {1, 9, 2, 9, 3, 9}, - {9, 9, 9, 9, 9, 9}, - {9, 9, 9, 9, 9, 9}, - {4, 9, 5, 9, 6, 9}, - {9, 9, 9, 9, 9, 9}, - {9, 9, 9, 9, 9, 9}}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, pad_exterior_2d_0x0) -{ - Shape shape_a{0, 0}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{5, 5}; - Shape padding_below{2, 3}; - Shape padding_above{3, 2}; - Shape padding_interior{0, 0}; - auto f = make_shared( - make_shared(A, B, padding_below, padding_above, padding_interior), - op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - // copy_data(a, test::NDArray({{}}).get_vector()); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{2112}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((test::NDArray({{2112, 2112, 2112, 2112, 2112}, - {2112, 2112, 2112, 2112, 2112}, - {2112, 2112, 2112, 2112, 2112}, - {2112, 2112, 2112, 2112, 2112}, - {2112, 2112, 2112, 2112, 2112}}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, pad_exterior_2d_0x3) -{ - Shape shape_a{0, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{}; - auto B = 
make_shared(element::f32, shape_b); - Shape shape_r{5, 5}; - Shape padding_below{2, 1}; - Shape padding_above{3, 1}; - Shape padding_interior{0, 0}; - auto f = make_shared( - make_shared(A, B, padding_below, padding_above, padding_interior), - op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - // copy_data(a, test::NDArray({}).get_vector()); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{2112}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((test::NDArray({{2112, 2112, 2112, 2112, 2112}, - {2112, 2112, 2112, 2112, 2112}, - {2112, 2112, 2112, 2112, 2112}, - {2112, 2112, 2112, 2112, 2112}, - {2112, 2112, 2112, 2112, 2112}}) - .get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, pad_exterior_2d_3x0) -{ - Shape shape_a{3, 0}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{5, 5}; - Shape padding_below{1, 3}; - Shape padding_above{1, 2}; - Shape padding_interior{0, 0}; - auto f = make_shared( - make_shared(A, B, padding_below, padding_above, padding_interior), - op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - // copy_data(a, test::NDArray({}).get_vector()); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{2112}); - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((test::NDArray({{2112, 2112, 2112, 2112, 2112}, - {2112, 2112, 2112, 2112, 2112}, - {2112, 2112, 2112, 2112, 2112}, - {2112, 2112, 2112, 2112, 2112}, - {2112, 2112, 2112, 2112, 2112}}) - 
.get_vector()), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, pad_exterior_4d_1x2x2x2) -{ - Shape shape_a{1, 2, 2, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{1, 2, 4, 4}; - Shape padding_below{0, 0, 1, 1}; - Shape padding_above{0, 0, 1, 1}; - Shape padding_interior{0, 0, 0, 0}; - auto f = make_shared( - make_shared(A, B, padding_below, padding_above, padding_interior), - op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - // clang-format off - copy_data(a, test::NDArray( - { - { - { - {0.0f, 0.0f}, - {0.0f, 0.0f} - }, - { - {0.0f, 0.0f}, - {0.0f, 0.0f} - } - } - }).get_vector()); - // clang-format on - - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{42}); - - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - // clang-format off - EXPECT_EQ((test::NDArray( - { - { - { - {42.0f, 42.0f, 42.0f, 42.0f}, - {42.0f, 0.0f, 0.0f, 42.0f}, - {42.0f, 0.0f, 0.0f, 42.0f}, - {42.0f, 42.0f, 42.0f, 42.0f} - }, - { - {42.0f, 42.0f, 42.0f, 42.0f}, - {42.0f, 0.0f, 0.0f, 42.0f}, - {42.0f, 0.0f, 0.0f, 42.0f}, - {42.0f, 42.0f, 42.0f, 42.0f} - } - } - }).get_vector()), - read_vector(result)); - // clang-format on -} - -// This is a regression test for one of TF's unit tests, which was failing. -// The problem was inappropriate handling of the shape computation for a -// zero-length axis with interior padding. Rather than subtract 1 from the -// source shape and multiply by the interior padding (which causes underflow), -// we should just count the pre-interior-padding length as zero. 
-NGRAPH_TEST(${BACKEND_NAME}, pad_interior_exterior_4d_2x0x3x2) -{ - Shape shape_a{2, 0, 3, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{}; - auto B = make_shared(element::f32, shape_b); - Shape padding_below{1, 0, 0, 0}; - Shape padding_above{0, 2, 0, 0}; - Shape padding_interior{2, 1, 0, 0}; - Shape shape_r{5, 2, 3, 2}; - auto f = make_shared( - make_shared(A, B, padding_below, padding_above, padding_interior), - op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - // copy_data(a, test::NDArray({}).get_vector()); - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{2112}); - auto result = backend->create_tensor(element::f32, shape_r); - - vector expected(5 * 2 * 3 * 2, 2112); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ(expected, read_vector(result)); -} - -// This test covers the case with multiple image and with asymetric pad -// bug has been found on nvGPU side now covered by this test -NGRAPH_TEST(${BACKEND_NAME}, pad_2channel_2image_asym) -{ - Shape shape_a{2, 2, 4, 4}; - auto window_movement_strides = Strides{2, 2}; - Shape padding_below{0, 0, 0, 0}; - Shape padding_above{0, 0, 2, 2}; - Shape padding_interior{0, 0, 0, 0}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{2, 2, 6, 6}; - auto f = make_shared( - make_shared(A, B, padding_below, padding_above, padding_interior), - op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, - test::NDArray({{{{0, 1, 0, 2}, // img 0 chan 0 - {0, 3, 2, 0}, - {2, 0, 0, 0}, - {0, 2, 1, 0}}, - - {{0, 0, 0, 2}, // img 0 chan 1 - {0, 2, 3, 0}, - {2, 0, 1, 0}, - {2, 0, 0, 0}}}, - - {{{0, 
2, 1, 1}, // img 1 chan 0 - {0, 0, 2, 0}, - {0, 0, 1, 2}, - {0, 0, 0, 0}}, - - {{2, 1, 0, 0}, // img 1 chan 1 - {0, 2, 0, 0}, - {1, 1, 2, 0}, - {1, 0, 0, 0}}}}) - .get_vector()); - - auto b = backend->create_tensor(element::f32, shape_b); - copy_data(b, vector{42}); - - auto result = backend->create_tensor(element::f32, shape_r); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((test::NDArray({{{{0, 1, 0, 2, 42, 42}, // img 0 chan 0 - {0, 3, 2, 0, 42, 42}, - {2, 0, 0, 0, 42, 42}, - {0, 2, 1, 0, 42, 42}, - {42, 42, 42, 42, 42, 42}, - {42, 42, 42, 42, 42, 42}}, - - {{0, 0, 0, 2, 42, 42}, // img 1 chan 0 - {0, 2, 3, 0, 42, 42}, - {2, 0, 1, 0, 42, 42}, - {2, 0, 0, 0, 42, 42}, - {42, 42, 42, 42, 42, 42}, - {42, 42, 42, 42, 42, 42}}}, - - {{{0, 2, 1, 1, 42, 42}, // img 1 chan 0 - {0, 0, 2, 0, 42, 42}, - {0, 0, 1, 2, 42, 42}, - {0, 0, 0, 0, 42, 42}, - {42, 42, 42, 42, 42, 42}, - {42, 42, 42, 42, 42, 42}}, - - {{2, 1, 0, 0, 42, 42}, // img 1 chan 1 - {0, 2, 0, 0, 42, 42}, - {1, 1, 2, 0, 42, 42}, - {1, 0, 0, 0, 42, 42}, - {42, 42, 42, 42, 42, 42}, - {42, 42, 42, 42, 42, 42}}}}) - .get_vector()), - read_vector(result)); -} - -// Trivial case with no reduced axes. 
-NGRAPH_TEST(${BACKEND_NAME}, product_trivial) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1, 2, 3, 4}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); -} - -// Failure has been reported at 5D for some reason -NGRAPH_TEST(${BACKEND_NAME}, product_trivial_5d) -{ - Shape shape{2, 2, 2, 2, 2}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, product_to_scalar) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto f = - make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1, 2, 3, 4}); - auto result = backend->create_tensor(element::f32, Shape{}); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{24}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure 
reduction doesn't clobber the - // input tensors, so let's do this too. - EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, product_matrix_columns) -{ - Shape shape_a{3, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{2}; - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{15, 48}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. - EXPECT_EQ((vector{1, 2, 3, 4, 5, 6}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, product_matrix_rows) -{ - Shape shape_a{3, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3}; - auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{2, 12, 30}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. 
- EXPECT_EQ((vector{1, 2, 3, 4, 5, 6}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, product_matrix_rows_zero) -{ - Shape shape_a{3, 0}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3}; - auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - copy_data(result, vector({3, 3, 3})); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 1, 1}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. - EXPECT_EQ((vector{}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, product_matrix_cols_zero) -{ - // Now the reduction (g(x:float32[2,2],y:float32[]) = reduce(x,y,f,axes={})). - Shape shape_a{0, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{2}; - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - copy_data(result, vector({3, 3})); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 1}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. 
- EXPECT_EQ((vector{}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, product_vector_zero) -{ - Shape shape_a{0}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{}; - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - copy_data(result, vector({3})); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. - EXPECT_EQ((vector{}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, product_matrix_to_scalar_zero_by_zero) -{ - Shape shape_a{0, 0}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{}; - auto f = - make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - copy_data(result, vector({3})); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. 
- EXPECT_EQ((vector{}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, product_3d_to_matrix_most_sig) -{ - Shape shape_a{3, 3, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3, 3}; - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1 * 10 * 19, - 2 * 11 * 20, - 3 * 12 * 21, - 4 * 13 * 22, - 5 * 14 * 23, - 6 * 15 * 24, - 7 * 16 * 25, - 8 * 17 * 26, - 9 * 18 * 27}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, product_3d_to_matrix_least_sig) -{ - Shape shape_a{3, 3, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3, 3}; - auto f = make_shared(make_shared(A, AxisSet{2}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1 * 2 * 3, - 4 * 5 * 6, - 7 * 8 * 9, - 10 * 11 * 12, - 13 * 14 * 15, - 16 * 17 * 18, - 19 * 20 * 21, - 22 * 23 * 24, - 25 * 26 * 27}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, product_3d_to_vector) -{ - Shape shape_a{3, 3, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3}; - auto f = - make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - 
// Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1.0f * 10.0f * 19.0f * 4.0f * 13.0f * 22.0f * 7.0f * 16.0f * 25.0f, - 2.0f * 11.0f * 20.0f * 5.0f * 14.0f * 23.0f * 8.0f * 17.0f * 26.0f, - 3.0f * 12.0f * 21.0f * 6.0f * 15.0f * 24.0f * 9.0f * 18.0f * 27.0f}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, product_3d_to_scalar) -{ - Shape shape_a{3, 3, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{}; - auto f = make_shared(make_shared(A, AxisSet{0, 1, 2}), - op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_TRUE(test::all_close(vector{1.0f * 10.0f * 9.0f * 4.0f * 13.0f * 6.0f * 7.0f * - 12.0f * 3.0f * 2.0f * 11.0f * 8.0f * 5.0f * 14.0f * - 5.0f * 8.0f * 11.0f * 2.0f * 3.0f * 12.0f * 7.0f * - 6.0f * 13.0f * 4.0f * 9.0f * 10.0f * 1.0f}, - read_vector(result))); -} - -NGRAPH_TEST(${BACKEND_NAME}, product_3d_eliminate_zero_dim) -{ - Shape shape_a{3, 0, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3, 2}; - auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - - // 
Overwrite the initial result vector to make sure we're not just coincidentally getting the right value. - copy_data(result, vector{2112, 2112, 2112, 2112, 2112, 2112}); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 1, 1, 1, 1, 1}), read_vector(result)); -} - -// Trivial case with no reduced axes. -NGRAPH_TEST(${BACKEND_NAME}, max_trivial) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1, 2, 3, 4}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); -} - -// Failure has been reported at 5D for some reason -NGRAPH_TEST(${BACKEND_NAME}, max_trivial_5d) -{ - Shape shape{2, 2, 2, 2, 2}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, max_to_scalar) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = 
backend->create_tensor(element::f32, shape); - copy_data(a, vector{1, 2, 3, 4}); - auto result = backend->create_tensor(element::f32, Shape{}); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{4}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. - EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, max_matrix_columns) -{ - Shape shape_a{3, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{2}; - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{5, 6}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. - EXPECT_EQ((vector{1, 2, 3, 4, 5, 6}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, max_matrix_rows) -{ - Shape shape_a{3, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3}; - auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{2, 4, 6}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. 
- EXPECT_EQ((vector{1, 2, 3, 4, 5, 6}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, max_matrix_rows_zero) -{ - Shape shape_a{3, 0}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3}; - auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - copy_data(result, vector({3, 3, 3})); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{-std::numeric_limits::infinity(), - -std::numeric_limits::infinity(), - -std::numeric_limits::infinity()}), - read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. - EXPECT_EQ((vector{}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, max_matrix_cols_zero) -{ - // Now the reduction (g(x:float32[2,2],y:float32[]) = reduce(x,y,f,axes={})). - Shape shape_a{0, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{2}; - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - copy_data(result, vector({3, 3})); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{-std::numeric_limits::infinity(), - -std::numeric_limits::infinity()}), - read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. 
- EXPECT_EQ((vector{}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, max_vector_zero) -{ - Shape shape_a{0}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{}; - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - copy_data(result, vector({3})); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{-std::numeric_limits::infinity()}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. - EXPECT_EQ((vector{}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, max_matrix_to_scalar_zero_by_zero) -{ - Shape shape_a{0, 0}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{}; - auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - copy_data(result, vector({3})); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{-std::numeric_limits::infinity()}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. 
- EXPECT_EQ((vector{}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, max_3d_to_matrix_most_sig) -{ - Shape shape_a{3, 3, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3, 3}; - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{19, 20, 21, 22, 23, 24, 25, 26, 27}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, max_3d_to_matrix_least_sig) -{ - Shape shape_a{3, 3, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3, 3}; - auto f = make_shared(make_shared(A, AxisSet{2}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{3, 6, 9, 12, 15, 18, 21, 24, 27}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, max_3d_to_vector) -{ - Shape shape_a{3, 3, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3}; - auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 
26, 27}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{25.0f, 26.0f, 27.0f}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, max_3d_to_scalar) -{ - Shape shape_a{3, 3, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{}; - auto f = - make_shared(make_shared(A, AxisSet{0, 1, 2}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{14.0f}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, max_3d_eliminate_zero_dim) -{ - Shape shape_a{3, 0, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3, 2}; - auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - - // Overwrite the initial result vector to make sure we're not just coincidentally getting the right value. - copy_data(result, vector{2112, 2112, 2112, 2112, 2112, 2112}); - - float mi = -std::numeric_limits::infinity(); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{mi, mi, mi, mi, mi, mi}), read_vector(result)); -} - -// Trivial case with no reduced axes. 
-NGRAPH_TEST(${BACKEND_NAME}, min_trivial) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1, 2, 3, 4}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(result)); -} - -// Failure has been reported at 5D for some reason -NGRAPH_TEST(${BACKEND_NAME}, min_trivial_5d) -{ - Shape shape{2, 2, 2, 2, 2}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}), - read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, min_to_scalar) -{ - Shape shape{2, 2}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{1, 2, 3, 4}); - auto result = backend->create_tensor(element::f32, Shape{}); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't 
clobber the - // input tensors, so let's do this too. - EXPECT_EQ((vector{1, 2, 3, 4}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, min_matrix_columns) -{ - Shape shape_a{3, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{2}; - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 2}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. - EXPECT_EQ((vector{1, 2, 3, 4, 5, 6}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, min_matrix_rows) -{ - Shape shape_a{3, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3}; - auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 3, 5}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. 
- EXPECT_EQ((vector{1, 2, 3, 4, 5, 6}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, min_matrix_rows_zero) -{ - Shape shape_a{3, 0}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3}; - auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - copy_data(result, vector({3, 3, 3})); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{std::numeric_limits::infinity(), - std::numeric_limits::infinity(), - std::numeric_limits::infinity()}), - read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. - EXPECT_EQ((vector{}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, min_matrix_cols_zero) -{ - // Now the reduction (g(x:float32[2,2],y:float32[]) = reduce(x,y,f,axes={})). - Shape shape_a{0, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{2}; - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - copy_data(result, vector({3, 3})); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{std::numeric_limits::infinity(), - std::numeric_limits::infinity()}), - read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. 
- EXPECT_EQ((vector{}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, min_vector_zero) -{ - Shape shape_a{0}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{}; - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - copy_data(result, vector({3})); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{std::numeric_limits::infinity()}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. - EXPECT_EQ((vector{}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, min_matrix_to_scalar_zero_by_zero) -{ - Shape shape_a{0, 0}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{}; - auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - copy_data(result, vector({3})); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{std::numeric_limits::infinity()}), read_vector(result)); - - // For some reason I'm feeling extra paranoid about making sure reduction doesn't clobber the - // input tensors, so let's do this too. 
- EXPECT_EQ((vector{}), read_vector(a)); -} - -NGRAPH_TEST(${BACKEND_NAME}, min_3d_to_matrix_most_sig) -{ - Shape shape_a{3, 3, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3, 3}; - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 2, 3, 4, 5, 6, 7, 8, 9}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, min_3d_to_matrix_least_sig) -{ - Shape shape_a{3, 3, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3, 3}; - auto f = make_shared(make_shared(A, AxisSet{2}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 4, 7, 10, 13, 16, 19, 22, 25}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, min_3d_to_vector) -{ - Shape shape_a{3, 3, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3}; - auto f = make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}); - 
auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1, 2, 3}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, min_3d_to_scalar) -{ - Shape shape_a{3, 3, 3}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{}; - auto f = - make_shared(make_shared(A, AxisSet{0, 1, 2}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}); - auto result = backend->create_tensor(element::f32, shape_rt); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{1}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, min_3d_eliminate_zero_dim) -{ - Shape shape_a{3, 0, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_rt{3, 2}; - auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{}); - auto result = backend->create_tensor(element::f32, shape_rt); - - // Overwrite the initial result vector to make sure we're not just coincidentally getting the right value. 
- copy_data(result, vector{2112, 2112, 2112, 2112, 2112, 2112}); - - float inf = std::numeric_limits::infinity(); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ((vector{inf, inf, inf, inf, inf, inf}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, sigmoid_n1c1h2w2) -{ - auto input = make_shared(element::f32, Shape{1, 1, 2, 2}); - auto sigmoid_node = make_shared(input); - auto func = make_shared(sigmoid_node, op::ParameterVector{input}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - shared_ptr a = backend->create_tensor(element::f32, input->get_shape()); - shared_ptr result = backend->create_tensor(element::f32, input->get_shape()); - - vector dataA{1.0f, 4.0f, 1.0f, 4.0f}; - copy_data(a, dataA); - - backend->call_with_validate(func, {result}, {a}); - vector expected{0.73105858f, 0.98201379f, 0.73105858f, 0.98201379f}; - ASSERT_TRUE(read_vector(result) == expected); -} - -NGRAPH_TEST(${BACKEND_NAME}, sigmoid_n1c1h4) -{ - auto input = make_shared(element::f32, Shape{1, 1, 4}); - auto sigmoid_node = make_shared(input); - auto func = make_shared(sigmoid_node, op::ParameterVector{input}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - shared_ptr a = backend->create_tensor(element::f32, input->get_shape()); - shared_ptr result = backend->create_tensor(element::f32, input->get_shape()); - - vector dataA{1.0f, 4.0f, 1.0f, 4.0f}; - copy_data(a, dataA); - - backend->call_with_validate(func, {result}, {a}); - vector expected{0.73105858f, 0.98201379f, 0.73105858f, 0.98201379f}; - ASSERT_TRUE(read_vector(result) == expected); -} - -NGRAPH_TEST(${BACKEND_NAME}, sigmoid_bprop_n1c1h4) -{ - auto input = make_shared(element::f32, Shape{1, 1, 4}); - auto delta = make_shared(element::f32, Shape{1, 1, 4}); - auto sigmoid_node = make_shared(input, delta); - auto func = make_shared(sigmoid_node, op::ParameterVector{input, delta}); - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - shared_ptr a = 
backend->create_tensor(element::f32, input->get_shape()); - shared_ptr b = backend->create_tensor(element::f32, delta->get_shape()); - shared_ptr result = backend->create_tensor(element::f32, input->get_shape()); - - vector dataA{1.0f, 4.0f, 1.0f, 4.0f}; - vector dataB{1.0f, 1.0f, 1.0f, 1.0f}; - - copy_data(a, dataA); - copy_data(b, dataB); - backend->call_with_validate(func, {result}, {a, b}); - - vector expected{0.196612f, 0.0176627f, 0.196612f, 0.0176627f}; - EXPECT_TRUE(test::all_close(expected, read_vector(result))); -} - -NGRAPH_TEST(${BACKEND_NAME}, relu_2Dfprop) -{ - auto shape_a = Shape{2, 5}; - auto A = make_shared(element::f32, shape_a); - auto relu = make_shared(A); - auto shape_rt = Shape{2, 5}; - auto f = make_shared(relu, op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5}); - auto result = backend->create_tensor(element::f32, shape_rt); - vector expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0}; - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ(read_vector(result), expected); -} - -NGRAPH_TEST(${BACKEND_NAME}, relu_4Dfprop) -{ - auto shape_a = Shape{2, 2, 2, 2}; - auto A = make_shared(element::f32, shape_a); - auto relu = make_shared(A); - auto shape_rt = Shape{2, 2, 2, 2}; - auto f = make_shared(relu, op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1}); - auto result = backend->create_tensor(element::f32, shape_rt); - vector expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1}; - - backend->call_with_validate(f, {result}, {a}); - EXPECT_EQ(read_vector(result), expected); -} - -NGRAPH_TEST(${BACKEND_NAME}, fuse_max_with_constant_zero_input_as_relu) -{ - auto shape_a = Shape{2, 5}; - auto A = 
op::Constant::create(element::f32, shape_a, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); - auto B = make_shared(element::f32, shape_a); - auto max = make_shared(A, B); - auto shape_rt = Shape{2, 5}; - auto f = make_shared(max, op::ParameterVector{B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto b = backend->create_tensor(element::f32, shape_a); - copy_data(b, vector{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5}); - auto result = backend->create_tensor(element::f32, shape_rt); - vector expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0}; - - backend->call_with_validate(f, {result}, {b}); - EXPECT_EQ(read_vector(result), expected); -} - -NGRAPH_TEST(${BACKEND_NAME}, relu_2Dbackprop) -{ - auto shape_a = Shape{2, 5}; - auto A = make_shared(element::f32, shape_a); - auto delta_val = make_shared(element::f32, shape_a); - auto relu = make_shared(A, delta_val); - auto shape_rt = Shape{2, 5}; - auto f = make_shared(relu, op::ParameterVector{A, delta_val}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5}); - auto delta = backend->create_tensor(element::f32, shape_a); - copy_data(delta, vector{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); - auto result = backend->create_tensor(element::f32, shape_rt); - vector expected{1, 2, 0, 4, 0, 6, 7, 0, 9, 0}; - - backend->call_with_validate(f, {result}, {a, delta}); - EXPECT_EQ(read_vector(result), expected); -} - -NGRAPH_TEST(${BACKEND_NAME}, relu_4Dbackprop) -{ - auto shape_a = Shape{2, 2, 2, 2}; - auto A = make_shared(element::f32, shape_a); - auto delta_val = make_shared(element::f32, shape_a); - auto relu = make_shared(A, delta_val); - auto shape_rt = Shape{2, 2, 2, 2}; - auto f = make_shared(relu, op::ParameterVector{A, delta_val}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, shape_a); - copy_data(a, vector{1, 8, -8, 17, -0.5, 1, 8, -8, 
17, -0.5, 1, 8, -8, 17, -0.5, 1}); - auto delta = backend->create_tensor(element::f32, shape_a); - copy_data(delta, vector{1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1, 8, -8, 17, -0.5, 1}); - auto result = backend->create_tensor(element::f32, shape_rt); - vector expected{1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1, 8, 0, 17, 0, 1}; - - backend->call_with_validate(f, {result}, {a, delta}); - EXPECT_EQ(read_vector(result), expected); -} - -NGRAPH_TEST(${BACKEND_NAME}, softmax_all) -{ - Shape shape{2, 3}; - auto A = make_shared(element::f32, shape); - auto f = - make_shared(make_shared(A, AxisSet{0, 1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{-3, -2, -1, 0, 1, 2}); - auto result = backend->create_tensor(element::f32, shape); - - auto d = expf(-3) + expf(-2) + expf(-1) + expf(0) + expf(1) + expf(2); - - backend->call_with_validate(f, {result}, {a}); - vector expected{ - expf(-3) / d, expf(-2) / d, expf(-1) / d, expf(0) / d, expf(1) / d, expf(2) / d}; - EXPECT_TRUE(test::all_close_f(expected, read_vector(result))); - - // empty AxisSet is the same as "full" AxisSet - f = make_shared(make_shared(A, AxisSet{}), op::ParameterVector{A}); - backend = runtime::Backend::create("${BACKEND_NAME}"); - - backend->call_with_validate(f, {result}, {a}); - EXPECT_TRUE(test::all_close_f(expected, read_vector(result))); -} - -NGRAPH_TEST(${BACKEND_NAME}, softmax_axis_3d) -{ - Shape shape{2, 2, 3}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{-10, -20, -30, -40, -50, -60, -1, -2, -3, -4, -5, -6}); - auto result = backend->create_tensor(element::f32, shape); - - auto d0 = expf(-10) + expf(-1); - auto d1 = expf(-20) + expf(-2); - auto d2 = expf(-30) + 
expf(-3); - auto d3 = expf(-40) + expf(-4); - auto d4 = expf(-50) + expf(-5); - auto d5 = expf(-60) + expf(-6); - - backend->call_with_validate(f, {result}, {a}); - vector expected{expf(-10) / d0, - expf(-20) / d1, - expf(-30) / d2, - expf(-40) / d3, - expf(-50) / d4, - expf(-60) / d5, - expf(-1) / d0, - expf(-2) / d1, - expf(-3) / d2, - expf(-4) / d3, - expf(-5) / d4, - expf(-6) / d5}; - - EXPECT_TRUE(test::all_close(expected, read_vector(result))); -} - -NGRAPH_TEST(${BACKEND_NAME}, softmax_axis_3d_double) -{ - Shape shape{2, 2, 3}; - auto A = make_shared(element::f64, shape); - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f64, shape); - copy_data(a, vector{-10, -20, -30, -40, -50, -60, -1, -2, -3, -4, -5, -6}); - auto result = backend->create_tensor(element::f64, shape); - - auto d0 = expf(-10) + expf(-1); - auto d1 = expf(-20) + expf(-2); - auto d2 = expf(-30) + expf(-3); - auto d3 = expf(-40) + expf(-4); - auto d4 = expf(-50) + expf(-5); - auto d5 = expf(-60) + expf(-6); - - backend->call_with_validate(f, {result}, {a}); - vector expected{expf(-10) / d0, - expf(-20) / d1, - expf(-30) / d2, - expf(-40) / d3, - expf(-50) / d4, - expf(-60) / d5, - expf(-1) / d0, - expf(-2) / d1, - expf(-3) / d2, - expf(-4) / d3, - expf(-5) / d4, - expf(-6) / d5}; - - EXPECT_TRUE(test::all_close(expected, read_vector(result))); -} - -NGRAPH_TEST(${BACKEND_NAME}, softmax_axis) -{ - Shape shape{2, 3}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{1}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{-10, -20, -30, -40, -50, -60}); - auto result = backend->create_tensor(element::f32, shape); - - auto d0 = expf(-10) + expf(-20) + expf(-30); - auto d1 = expf(-40) + expf(-50) + expf(-60); 
- - backend->call_with_validate(f, {result}, {a}); - vector expected{expf(-10) / d0, - expf(-20) / d0, - expf(-30) / d0, - expf(-40) / d1, - expf(-50) / d1, - expf(-60) / d1}; - EXPECT_TRUE(test::all_close_f(expected, read_vector(result))); -} - -NGRAPH_TEST(${BACKEND_NAME}, softmax_axis_2) -{ - Shape shape{2, 3}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{-10, -20, -30, -40, -50, -60}); - auto result = backend->create_tensor(element::f32, shape); - - auto d0 = expf(-10) + expf(-40); - auto d1 = expf(-20) + expf(-50); - auto d2 = expf(-30) + expf(-60); - - backend->call_with_validate(f, {result}, {a}); - vector expected{expf(-10) / d0, - expf(-20) / d1, - expf(-30) / d2, - expf(-40) / d0, - expf(-50) / d1, - expf(-60) / d2}; - EXPECT_TRUE(test::all_close(expected, read_vector(result))); -} - -NGRAPH_TEST(${BACKEND_NAME}, softmax_axis_3d_trivial) -{ - Shape shape{1, 2, 3}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, vector{-10, -20, -30, -40, -50, -60}); - auto result = backend->create_tensor(element::f32, shape); - - backend->call_with_validate(f, {result}, {a}); - vector expected{1, 1, 1, 1, 1, 1}; - EXPECT_TRUE(test::all_close(expected, read_vector(result))); -} - -NGRAPH_TEST(${BACKEND_NAME}, softmax_underflow) -{ - Shape shape{2, 3}; - auto A = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, AxisSet{0}), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto low = std::numeric_limits::lowest(); - - auto a = backend->create_tensor(element::f32, shape); - copy_data(a, 
vector{low, 1, 2, 3, 4, 5}); - auto result = backend->create_tensor(element::f32, shape); - - auto d0 = expf(low) + expf(3); - auto d1 = expf(1) + expf(4); - auto d2 = expf(2) + expf(5); - - backend->call_with_validate(f, {result}, {a}); - vector expected{ - expf(low) / d0, expf(1) / d1, expf(2) / d2, expf(3) / d0, expf(4) / d1, expf(5) / d2}; - EXPECT_TRUE(test::all_close(expected, read_vector(result))); -} - -NGRAPH_TEST(${BACKEND_NAME}, multiple_backends) -{ - Shape shape{2, 2}; - auto A1 = make_shared(element::f32, shape); - auto B1 = make_shared(element::f32, shape); - auto f = make_shared(A1 + B1, op::ParameterVector{A1, B1}); - - auto A2 = make_shared(element::f32, shape); - auto B2 = make_shared(element::f32, shape); - auto g = make_shared(A2 * B2, op::ParameterVector{A2, B2}); - - auto backend1 = runtime::Backend::create("${BACKEND_NAME}"); - - auto backend2 = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - shared_ptr a1 = backend1->create_tensor(element::f32, shape); - shared_ptr b1 = backend1->create_tensor(element::f32, shape); - shared_ptr result1 = backend1->create_tensor(element::f32, shape); - - shared_ptr a2 = backend2->create_tensor(element::f32, shape); - shared_ptr b2 = backend2->create_tensor(element::f32, shape); - shared_ptr result2 = backend2->create_tensor(element::f32, shape); - - copy_data(a1, test::NDArray({{1, 2}, {3, 4}}).get_vector()); - copy_data(b1, test::NDArray({{5, 6}, {7, 8}}).get_vector()); - - copy_data(a2, test::NDArray({{1, 2}, {3, 4}}).get_vector()); - copy_data(b2, test::NDArray({{5, 6}, {7, 8}}).get_vector()); - - backend1->call_with_validate(f, {result1}, {a1, b1}); - EXPECT_EQ(read_vector(result1), - (test::NDArray({{6, 8}, {10, 12}})).get_vector()); - - backend2->call_with_validate(g, {result2}, {a2, b2}); - EXPECT_EQ(read_vector(result2), - (test::NDArray({{5, 12}, {21, 32}})).get_vector()); -} - -NGRAPH_TEST(${BACKEND_NAME}, tensorview_custom_mem) -{ - auto backend = 
runtime::Backend::create("${BACKEND_NAME}"); - - Shape shape{2, 2}; - - auto make_external = [&]() { - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); - - return f; - }; - - auto f = make_external(); - - vector av{2, 4, 8, 16}; - vector bv{1, 2, 4, 8}; - // use custom mem with tensorview, no need to copy data - auto a = backend->create_tensor(element::f32, shape, av.data()); - auto b = backend->create_tensor(element::f32, shape, bv.data()); - - // use custom mem with result tensorview - vector rv{0, 0, 0, 0}; - auto result = backend->create_tensor(element::f32, shape, rv.data()); - - // result should be in memory without needing explict read - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((vector{2, 2, 2, 2}), rv); -} - -NGRAPH_TEST(${BACKEND_NAME}, validate_call_input_count) -{ - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - Shape shape{2, 2}; - - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); - - auto a = backend->create_tensor(element::f32, shape); - auto b = backend->create_tensor(element::f32, shape); - auto c = backend->create_tensor(element::f32, shape); - - EXPECT_ANY_THROW(backend->call_with_validate(f, {c}, {a})); -} - -NGRAPH_TEST(${BACKEND_NAME}, validate_call_input_type) -{ - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - Shape shape{2, 2}; - - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); - - auto a = backend->create_tensor(element::i32, shape); - auto b = backend->create_tensor(element::f32, shape); - auto c = backend->create_tensor(element::f32, shape); - - EXPECT_ANY_THROW(backend->call_with_validate(f, {c}, {a, b})); -} - -NGRAPH_TEST(${BACKEND_NAME}, 
validate_call_input_shape) -{ - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - Shape shape{2, 2}; - - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); - - auto a = backend->create_tensor(element::f32, {2, 3}); - auto b = backend->create_tensor(element::f32, shape); - auto c = backend->create_tensor(element::f32, shape); - - EXPECT_ANY_THROW(backend->call_with_validate(f, {c}, {a, b})); -} - -NGRAPH_TEST(${BACKEND_NAME}, validate_call_output_count) -{ - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - Shape shape{2, 2}; - - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); - - auto a = backend->create_tensor(element::f32, shape); - auto b = backend->create_tensor(element::f32, shape); - auto c = backend->create_tensor(element::f32, shape); - auto d = backend->create_tensor(element::f32, shape); - - EXPECT_ANY_THROW(backend->call_with_validate(f, {c, d}, {a, b})); -} - -NGRAPH_TEST(${BACKEND_NAME}, validate_call_output_type) -{ - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - Shape shape{2, 2}; - - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); - - auto a = backend->create_tensor(element::i32, shape); - auto b = backend->create_tensor(element::f32, shape); - auto c = backend->create_tensor(element::f32, shape); - - EXPECT_ANY_THROW(backend->call_with_validate(f, {a}, {b, c})); -} - -NGRAPH_TEST(${BACKEND_NAME}, validate_call_output_shape) -{ - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - Shape shape{2, 2}; - - auto A = make_shared(element::f32, shape); - auto B = make_shared(element::f32, shape); - auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); - - auto a = 
backend->create_tensor(element::f32, {2, 3}); - auto b = backend->create_tensor(element::f32, shape); - auto c = backend->create_tensor(element::f32, shape); - - EXPECT_ANY_THROW(backend->call_with_validate(f, {a}, {c, b})); -} - -NGRAPH_TEST(${BACKEND_NAME}, logical_and) -{ - Shape shape{2, 2, 2}; - auto A = make_shared(element::boolean, shape); - auto B = make_shared(element::boolean, shape); - auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::boolean, shape); - copy_data(a, vector{1, 0, 1, 1, 1, 0, 1, 0}); - auto b = backend->create_tensor(element::boolean, shape); - copy_data(b, vector{0, 0, 1, 0, 0, 1, 1, 0}); - auto result = backend->create_tensor(element::boolean, shape); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((vector{0, 0, 1, 0, 0, 0, 1, 0}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, logical_or) -{ - Shape shape{2, 2, 2}; - auto A = make_shared(element::boolean, shape); - auto B = make_shared(element::boolean, shape); - auto f = make_shared(make_shared(A, B), op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto a = backend->create_tensor(element::boolean, shape); - copy_data(a, vector{1, 0, 1, 1, 1, 0, 1, 0}); - auto b = backend->create_tensor(element::boolean, shape); - copy_data(b, vector{0, 0, 1, 0, 0, 1, 1, 0}); - auto result = backend->create_tensor(element::boolean, shape); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ((vector{1, 0, 1, 1, 1, 1, 1, 0}), read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_b1c2h2w2) -{ - auto input_shape = Shape{1, 2, 2, 2}; - auto input = make_shared(element::f32, input_shape); - auto mean_shape = Shape{2}; - auto var_shape = Shape{2}; - auto gamma_shape = Shape{2}; - auto gamma = 
make_shared(element::f32, gamma_shape); - auto beta_shape = Shape{2}; - auto beta = make_shared(element::f32, beta_shape); - double eps = 0.001; - auto shape_r = Shape{1, 2, 2, 2}; - auto bn = make_shared(input, gamma, beta, eps); - - auto output_rt = std::make_shared(bn, 0); - auto mean_rt = std::make_shared(bn, 1); - auto variance_rt = std::make_shared(bn, 2); - - auto f = make_shared(NodeVector{output_rt, mean_rt, variance_rt}, - op::ParameterVector{input, gamma, beta}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - auto _input = backend->create_tensor(element::f32, Shape{1, 2, 2, 2}); - - copy_data(_input, - vector{0.54881352f, - 0.71518934f, - 0.60276335f, - 0.54488319f, - 0.42365479f, - 0.64589411f, - 0.4375872f, - 0.89177299f}); - auto _gamma = backend->create_tensor(element::f32, gamma_shape); - copy_data(_gamma, vector{1.0f, 1.0f}); - auto _beta = backend->create_tensor(element::f32, beta_shape); - copy_data(_beta, vector{0.0f, 0.0f}); - auto bn_output = backend->create_tensor(element::f32, shape_r); - auto result_mean = backend->create_tensor(element::f32, mean_shape); - auto result_variance = backend->create_tensor(element::f32, var_shape); - - vector expected_result{-0.71498716f, - 1.48388731f, - -0.00196938f, - -0.76693159f, - -0.91316032f, - 0.23943391f, - -0.84090298f, - 1.51462936f}; - vector expected_mean{0.602912f, 0.599727f}; - vector expected_variance{0.00472505f, 0.0361782f}; - - backend->call_with_validate( - f, {bn_output, result_mean, result_variance}, {_input, _gamma, _beta}); - - EXPECT_TRUE(test::all_close(expected_result, read_vector(bn_output), 1e-5f, 1e-6f)); - EXPECT_TRUE(test::all_close(expected_mean, read_vector(result_mean), 1e-5f, 1e-6f)); - EXPECT_TRUE( - test::all_close(expected_variance, read_vector(result_variance), 1e-5f, 1e-6f)); -} - -NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_b2c2h2w1) -{ - auto input_shape = Shape{2, 2, 2, 1}; - auto input = 
make_shared(element::f32, input_shape); - auto mean_shape = Shape{2}; - auto var_shape = Shape{2}; - auto gamma_shape = Shape{2}; - auto gamma = make_shared(element::f32, gamma_shape); - auto beta_shape = Shape{2}; - auto beta = make_shared(element::f32, beta_shape); - double eps = 0.001; - auto shape_r = Shape{2, 2, 2, 1}; - auto bn = make_shared(input, gamma, beta, eps); - - auto output_rt = std::make_shared(bn, 0); - auto mean_rt = std::make_shared(bn, 1); - auto variance_rt = std::make_shared(bn, 2); - - auto f = make_shared(NodeVector{output_rt, mean_rt, variance_rt}, - op::ParameterVector{input, gamma, beta}); - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - // Create some tensors for input/output - auto _input = backend->create_tensor(element::f32, input_shape); - copy_data(_input, - vector{0.54881352f, - 0.71518934f, - 0.60276335f, - 0.54488319f, - 0.42365479f, - 0.64589411f, - 0.4375872f, - 0.89177299f}); - - auto _gamma = backend->create_tensor(element::f32, gamma_shape); - copy_data(_gamma, vector{1.0f, 1.0f}); - auto _beta = backend->create_tensor(element::f32, beta_shape); - copy_data(_beta, vector{0.0f, 0.0f}); - auto bn_output = backend->create_tensor(element::f32, shape_r); - auto result_mean = backend->create_tensor(element::f32, mean_shape); - auto result_variance = backend->create_tensor(element::f32, var_shape); - - vector expected_result{ - -0.30327f, 1.1561f, -0.0963782f, -0.434702f, -1.4011f, 0.548275f, -1.06187f, 1.59295f}; - vector expected_mean{0.583388f, 0.619252f}; - vector expected_variance{0.0119972f, 0.0282681f}; - backend->call_with_validate( - f, {bn_output, result_mean, result_variance}, {_input, _gamma, _beta}); - - EXPECT_TRUE(test::all_close(expected_result, read_vector(bn_output))); - EXPECT_TRUE(test::all_close(expected_mean, read_vector(result_mean))); - EXPECT_TRUE( - test::all_close(expected_variance, read_vector(result_variance), 1e-5f, 1e-6f)); -} - -NGRAPH_TEST(${BACKEND_NAME}, batchnorm_bprop_n4c3h2w2) -{ 
- auto input_shape = Shape{4, 3, 2, 2}; - auto shape_mean = Shape{3}; - auto input = make_shared(element::f32, input_shape); - auto mean_shape = Shape{3}; - auto mean = make_shared(element::f32, mean_shape); - auto var_shape = Shape{3}; - auto var = make_shared(element::f32, var_shape); - auto gamma_shape = Shape{3}; - auto gamma = make_shared(element::f32, gamma_shape); - auto beta_shape = Shape{3}; - auto beta = make_shared(element::f32, beta_shape); - double eps = 0.001; - auto shape_r = Shape{4, 3, 2, 2}; - auto bn = make_shared(input, gamma, beta, eps); - auto bn_dx = make_shared(bn, 0); - auto bn_dgamma = make_shared(bn, 1); - auto bn_dbeta = make_shared(bn, 2); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto _input = backend->create_tensor(element::f32, input_shape); - vector dataInput{ - 10.76331902f, 11.51178265f, 10.31018162f, 12.2993021f, 14.17626667f, 14.63498497f, - 13.63494492f, 13.84248161f, 11.34602547f, 13.22014618f, 10.46686649f, 10.39842987f, - 12.94806862f, 11.71670246f, 14.94438076f, 13.13236618f, 13.40889645f, 12.76128387f, - 11.34430027f, 11.86629677f, 11.11464024f, 10.93221283f, 11.95324039f, 10.96581173f, - 13.05455494f, 14.41404247f, 13.11169434f, 11.26559448f, 10.89965153f, 14.08202171f, - 11.12685776f, 12.58428574f, 12.59247875f, 13.00187492f, 12.66310215f, 10.06655025f, - 12.62048626f, 14.47942352f, 13.84950638f, 10.61425877f, 11.47936344f, 13.06011772f, - 13.63069057f, 12.31748772f, 13.84555244f, 10.95815468f, 12.78933334f, 12.75389099f}; - copy_data(_input, dataInput); - auto _mean = backend->create_tensor(element::f32, mean_shape); - copy_data(_mean, vector{12.56472874f, 12.80312157f, 11.81676865f}); - auto _var = backend->create_tensor(element::f32, var_shape); - copy_data(_var, vector{1.94557643f, 1.32772446f, 1.28163588f}); - - auto _gamma = backend->create_tensor(element::f32, gamma_shape); - copy_data(_gamma, vector{2.0f, 2.0f, 2.0f}); - auto _beta = backend->create_tensor(element::f32, beta_shape); - 
copy_data(_beta, vector{1.0f, 1.0f, 1.0f}); - auto result = backend->create_tensor(element::f32, shape_r); - - shared_ptr _delta = backend->create_tensor(element::f32, shape_r); - vector deltaData(shape_size(shape_r), 20.0f); - copy_data(_delta, deltaData); - - auto f = make_shared(NodeVector{bn_dx, bn_dgamma, bn_dbeta}, - op::ParameterVector{mean, var, input, gamma, beta}); - - auto C = std::make_shared(element::f32, shape_r); - - auto zero = ngraph::make_zero(bn_dgamma->get_element_type(), bn_dgamma->get_shape()); - ngraph::autodiff::Adjoints adjoints(NodeVector{bn_dx, bn_dgamma, bn_dbeta}, - NodeVector{C, zero, zero}); - - auto dinput = adjoints.backprop_node(input); - auto dgamma = adjoints.backprop_node(gamma); - auto dbeta = adjoints.backprop_node(beta); - - auto df = make_shared(NodeVector{dinput, dgamma, dbeta}, - op::ParameterVector{mean, var, input, gamma, beta, C}); - - // roundtrip serialization - string js = serialize(df, 4); - istringstream in(js); - df = deserialize(in); - - shared_ptr _dinput = backend->create_tensor(element::f32, shape_r); - shared_ptr _dgamma = backend->create_tensor(element::f32, gamma_shape); - shared_ptr _dbeta = backend->create_tensor(element::f32, beta_shape); - - backend->call_with_validate( - df, {_dinput, _dgamma, _dbeta}, {_mean, _var, _input, _gamma, _beta, _delta}); - - vector expected_input{ - 8.17051607e-06f, 4.77576657e-06f, 1.02257760e-05f, 1.20387525e-06f, -1.73868522e-06f, - 3.84632768e-06f, -1.07932050e-05f, -2.57458956e-06f, -2.22166714e-06f, -8.38779043e-06f, - -2.48082982e-06f, 5.89238360e-06f, -2.52895109e-07f, -8.68433445e-06f, -5.82726737e-06f, - 8.84659658e-06f, 3.03944108e-05f, 4.05480879e-05f, 1.84123158e-05f, 2.30061178e-05f, - 1.34087590e-05f, -9.26072571e-07f, -3.22908454e-05f, -2.07365116e-05f, -4.21330941e-05f, - 2.83083100e-05f, -3.71039101e-05f, -4.84390640e-06f, -2.93012376e-05f, 5.68858087e-06f, - 1.83181458e-05f, -1.07494506e-05f, -2.32429103e-06f, 6.92914809e-06f, -6.66512321e-06f, - 
-7.00302840e-06f, -3.46675184e-06f, -4.36748381e-06f, 6.73822226e-07f, -4.20158993e-06f, - 3.83005061e-06f, 5.85143729e-06f, 4.17875243e-06f, -8.64167783e-06f, 1.00170803e-05f, - -4.23939666e-06f, 4.80201680e-06f, 4.62702078e-06f}; - - ASSERT_TRUE(ngraph::test::all_close(read_vector(_dinput), expected_input, 1e-3f, 1e-4f)); - vector expected_dgamma{7.06315041e-05f, -2.35289335e-04f, -5.06639481e-05f}; - ASSERT_TRUE( - ngraph::test::all_close(read_vector(_dgamma), expected_dgamma, 1e-2f, 1e-3f)); - vector expected_dbeta{320.f, 320.f, 320.f}; - ASSERT_TRUE(ngraph::test::all_close(read_vector(_dbeta), expected_dbeta, 1e-4f, 1e-8f)); -} - -NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_inference_b2c2h2w1) -{ - auto input_shape = Shape{2, 2, 2, 1}; - auto input = make_shared(element::f32, input_shape); - auto mean_shape = Shape{2}; - auto mean = make_shared(element::f32, mean_shape); - auto var_shape = Shape{2}; - auto var = make_shared(element::f32, var_shape); - auto gamma_shape = Shape{2}; - auto gamma = make_shared(element::f32, gamma_shape); - auto beta_shape = Shape{2}; - auto beta = make_shared(element::f32, beta_shape); - double eps = 0.001; - auto shape_r = Shape{2, 2, 2, 1}; - auto bn = make_shared(input, gamma, beta, mean, var, eps); - - auto f = make_shared(bn, op::ParameterVector{input, gamma, beta, mean, var}); - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - // Create some tensors for input/output - auto _input = backend->create_tensor(element::f32, input_shape); - copy_data(_input, - vector{0.54881352f, - 0.71518934f, - 0.60276335f, - 0.54488319f, - 0.42365479f, - 0.64589411f, - 0.4375872f, - 0.89177299f}); - - auto _gamma = backend->create_tensor(element::f32, gamma_shape); - copy_data(_gamma, vector{1.0f, 1.0f}); - auto _beta = backend->create_tensor(element::f32, beta_shape); - copy_data(_beta, vector{0.0f, 0.0f}); - auto _mean = backend->create_tensor(element::f32, mean_shape); - copy_data(_mean, vector{0.583388f, 0.619252f}); - auto 
_var = backend->create_tensor(element::f32, var_shape); - copy_data(_var, vector{0.0119972f, 0.0282681f}); - auto bn_output = backend->create_tensor(element::f32, shape_r); - - vector expected_result{ - -0.30327f, 1.1561f, -0.0963782f, -0.434702f, -1.4011f, 0.548275f, -1.06187f, 1.59295f}; - backend->call_with_validate(f, {bn_output}, {_input, _gamma, _beta, _mean, _var}); - - ASSERT_TRUE( - ngraph::test::all_close(expected_result, read_vector(bn_output), 1e-3f, 1e-4f)); -} - -#if 0 -NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_globalstats_b2c2w2h1) -{ - auto input_shape = Shape{2, 2, 2, 1}; - auto input = make_shared(element::f32, input_shape); - auto mean_shape = Shape{2}; - auto mean = make_shared(element::f32, mean_shape); - auto var_shape = Shape{2}; - auto var = make_shared(element::f32, var_shape); - auto gamma_shape = Shape{2}; - auto gamma = make_shared(element::f32, gamma_shape); - auto beta_shape = Shape{2}; - auto beta = make_shared(element::f32, beta_shape); - double eps = 0.001; - auto shape_r = Shape{2, 2, 2, 1}; - auto bn = make_shared(input, gamma, beta, mean, var, eps); - - auto f = make_shared(bn, op::ParameterVector{gamma, beta, input, mean, var}); - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - // Create some tensors for input/output - auto _input = backend->create_tensor(element::f32, input_shape); - copy_data(_input, - vector{0.54881352f, - 0.71518934f, - 0.60276335f, - 0.54488319f, - 0.42365479f, - 0.64589411f, - 0.4375872f, - 0.89177299f}); - - auto _gamma = backend->create_tensor(element::f32, gamma_shape); - copy_data(_gamma, vector{1.0f, 1.0f}); - auto _beta = backend->create_tensor(element::f32, beta_shape); - copy_data(_beta, vector{0.0f, 0.0f}); - auto _mean = backend->create_tensor(element::f32, mean_shape); - copy_data(_mean, vector{0.583388f, 0.619252f}); - auto _var = backend->create_tensor(element::f32, var_shape); - copy_data(_var, vector{0.0119972f, 0.0282681f}); - auto bn_output = 
backend->create_tensor(element::f32, shape_r); - - vector expected_result{ - -0.30327f, 1.1561f, -0.0963782f, -0.434702f, -1.4011f, 0.548275f, -1.06187f, 1.59295f}; - backend->call_with_validate(f, {bn_output}, {_gamma, _beta, _input, _mean, _var}); - - ASSERT_TRUE( - ngraph::test::all_close(expected_result, read_vector(bn_output), 1e-3f, 1e-4f)); -} -#endif - -NGRAPH_TEST(${BACKEND_NAME}, reverse_sequence_n2c3h4w2) -{ - Shape shape{2, 3, 4, 2}; - Shape seq_len_shape{4}; - auto A = make_shared(element::i32, shape); - auto B = make_shared(element::i32, seq_len_shape); - - size_t batch_axis = 2; - size_t sequence_axis = 1; - auto rs = std::make_shared(A, B, batch_axis, sequence_axis); - - auto f = make_shared(rs, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - shared_ptr a = backend->create_tensor(element::i32, shape); - shared_ptr b = backend->create_tensor(element::i32, seq_len_shape); - - shared_ptr result = backend->create_tensor(element::i32, shape); - - std::vector input{ - 0, 0, 3, 0, 6, 0, 9, 0, 1, 0, 4, 0, 7, 0, 10, 0, 2, 0, 5, 0, 8, 0, 11, 0, - 12, 0, 15, 0, 18, 0, 21, 0, 13, 0, 16, 0, 19, 0, 22, 0, 14, 0, 17, 0, 20, 0, 23, 0, - }; - - std::vector seq_lenghts{1, 2, 1, 2}; - copy_data(b, seq_lenghts); - - std::vector expected{ - 0, 0, 4, 0, 6, 0, 10, 0, 1, 0, 3, 0, 7, 0, 9, 0, 2, 0, 5, 0, 8, 0, 11, 0, - - 12, 0, 16, 0, 18, 0, 22, 0, 13, 0, 15, 0, 19, 0, 21, 0, 14, 0, 17, 0, 20, 0, 23, 0}; - - copy_data(a, input); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ(read_vector(result), expected); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_sequence_n4c3h2w2) -{ - Shape shape{4, 3, 2, 2}; - auto A = make_shared(element::i32, shape); - Shape seq_len_shape{4}; - auto B = make_shared(element::i32, seq_len_shape); - - size_t batch_axis = 0; - size_t sequence_axis = 1; - - auto rs = std::make_shared(A, B, batch_axis, sequence_axis); - - auto f = make_shared(rs, 
op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - shared_ptr a = backend->create_tensor(element::i32, shape); - shared_ptr b = backend->create_tensor(element::i32, seq_len_shape); - - shared_ptr result = backend->create_tensor(element::i32, shape); - - std::vector seq_lenghts{1, 2, 3, 3}; - copy_data(b, seq_lenghts); - - std::vector input{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47}; - - std::vector expected{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19, - 12, 13, 14, 15, 20, 21, 22, 23, 32, 33, 34, 35, 28, 29, 30, 31, - 24, 25, 26, 27, 44, 45, 46, 47, 40, 41, 42, 43, 36, 37, 38, 39}; - - copy_data(a, input); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ(read_vector(result), expected); -} - -NGRAPH_TEST(${BACKEND_NAME}, reverse_sequence_n4d2c3h2w2) -{ - Shape shape{4, 2, 3, 2, 2}; - auto A = make_shared(element::i32, shape); - Shape seq_len_shape{4}; - auto B = make_shared(element::i32, seq_len_shape); - - size_t batch_axis = 0; - size_t sequence_axis = 2; - - auto rs = std::make_shared(A, B, batch_axis, sequence_axis); - - auto f = make_shared(rs, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - // Create some tensors for input/output - shared_ptr a = backend->create_tensor(element::i32, shape); - shared_ptr b = backend->create_tensor(element::i32, seq_len_shape); - - shared_ptr result = backend->create_tensor(element::i32, shape); - - std::vector input{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, - 
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95}; - - std::vector expected{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, 28, 29, 30, 31, 24, 25, 26, 27, - 32, 33, 34, 35, 40, 41, 42, 43, 36, 37, 38, 39, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, - 64, 65, 66, 67, 68, 69, 70, 71, 76, 77, 78, 79, 72, 73, 74, 75, - 80, 81, 82, 83, 88, 89, 90, 91, 84, 85, 86, 87, 92, 93, 94, 95}; - - copy_data(a, input); - - std::vector seq_lenghts{1, 2, 1, 2}; - copy_data(b, seq_lenghts); - - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ(read_vector(result), expected); -} - -NGRAPH_TEST(${BACKEND_NAME}, generate_mask) -{ - Shape scalar{}; - Shape result_shape{1, 128}; - const unsigned int seed = 777; - auto training = op::Constant::create(element::f32, Shape{}, {1}); - auto gen_mask = make_shared(training, result_shape, element::f32, seed, 0.5); - auto gen_mask2 = make_shared(training, result_shape, element::f32, seed, 0.5); - auto f = make_shared(NodeVector{gen_mask, gen_mask2}, op::ParameterVector{}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto is_not_zero_or_one = [](float num) { return num != 0.f && num != 1.f; }; - - auto result_tv1 = backend->create_tensor(result_shape); - auto result_tv2 = backend->create_tensor(result_shape); - backend->call_with_validate(f, {result_tv1, result_tv2}, {}); - auto result1 = read_vector(result_tv1); - auto result2 = read_vector(result_tv2); - ASSERT_EQ(result1, result2); - ASSERT_FALSE(std::any_of(result1.begin(), result1.end(), is_not_zero_or_one)); - backend->call_with_validate(f, {result_tv1, result_tv2}, {}); - auto result1_2 = read_vector(result_tv1); - auto result2_2 = read_vector(result_tv2); - ASSERT_NE(result1, result1_2); - ASSERT_FALSE(std::any_of(result1_2.begin(), result1_2.end(), is_not_zero_or_one)); - ASSERT_NE(result2, result2_2); - ASSERT_FALSE(std::any_of(result2_2.begin(), 
result2_2.end(), is_not_zero_or_one)); -} - -NGRAPH_TEST(${BACKEND_NAME}, quantize) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::f32; - auto output_type = element::u8; - - typedef float input_c_type; - typedef uint8_t output_c_type; - - op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_INFINITY; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(input_type, scale_offset_shape, {2}); - auto offset = op::Constant::create(output_type, scale_offset_shape, {1}); - auto quantize = - make_shared(X, scale, offset, output_type, quantization_axes, round_mode); - auto f = make_shared(quantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); - // divide by scale 2 2 2 2 2 2 2 2 2 2 2 2 - // equals (rounded) 0 1 1 2 2 3 3 4 4 5 5 6 - // plus offset 1 1 1 1 1 1 1 1 1 1 1 1 - // equals 1 2 2 3 3 4 4 5 5 6 6 7 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, dequantize) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::u8; - auto output_type = element::f32; - - typedef uint8_t input_c_type; - typedef float output_c_type; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(output_type, scale_offset_shape, {2}); - auto offset = op::Constant::create(input_type, scale_offset_shape, {1}); - auto dequantize = make_shared(X, scale, offset, output_type, quantization_axes); - auto f = make_shared(dequantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = 
backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7}); - // minus offset 1 1 1 1 1 1 1 1 1 1 1 1 - // eqauls 0 1 1 2 2 3 3 4 4 5 5 6 - // multiplied by scale 2 2 2 2 2 2 2 2 2 2 2 2 - // equals 0 2 2 4 4 6 6 8 8 10 10 12 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, dequantize_zero_offset) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::u8; - auto output_type = element::f32; - - typedef uint8_t input_c_type; - typedef float output_c_type; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(output_type, scale_offset_shape, {2}); - auto offset = op::Constant::create(input_type, scale_offset_shape, {0}); - auto dequantize = make_shared(X, scale, offset, output_type, quantization_axes); - auto f = make_shared(dequantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7}); - // minus offset 0 0 0 0 0 0 0 0 0 0 0 0 - // multiplied by scale 2 2 2 2 2 2 2 2 2 2 2 2 - // equals 2 4 4 6 6 8 8 10 10 12 12 14 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, quantize_axes) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape{4}; - AxisSet quantization_axes{0}; - - auto input_type = element::f32; - auto output_type = element::u8; - - typedef float input_c_type; - typedef uint8_t output_c_type; - - op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_INFINITY; - - auto X = 
make_shared(input_type, input_shape); - auto scale = op::Constant::create(input_type, scale_offset_shape, {2, 3, 4, 5}); - auto offset = op::Constant::create(output_type, scale_offset_shape, {10, 20, 30, 40}); - auto quantize = - make_shared(X, scale, offset, output_type, quantization_axes, round_mode); - auto f = make_shared(quantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); - // divided by scale 2 2 2 3 3 3 4 4 4 5 5 5 - // equals (rounded) 0 1 1 1 1 2 2 2 2 2 2 2 - // plus offset 10 10 10 20 20 20 30 30 30 40 40 40 - // equals 10 11 11 21 21 22 32 32 32 42 42 42 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{10, 11, 11, 21, 21, 22, 32, 32, 32, 42, 42, 42}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, dequantize_axes) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape{4}; - AxisSet quantization_axes{0}; - - auto input_type = element::u8; - auto output_type = element::f32; - - typedef uint8_t input_c_type; - typedef float output_c_type; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(output_type, scale_offset_shape, {2, 3, 4, 5}); - auto offset = op::Constant::create(input_type, scale_offset_shape, {10, 20, 30, 40}); - auto dequantize = make_shared(X, scale, offset, output_type, quantization_axes); - auto f = make_shared(dequantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{10, 11, 11, 21, 21, 22, 32, 32, 32, 42, 42, 42}); - // minus offset 10 10 10 20 20 20 30 30 30 40 40 40 - // equals 0 1 1 1 1 2 2 2 2 2 2 2 - // multiplied by scale 2 2 2 3 3 3 4 4 4 5 5 5 - // equals 0 2 
2 3 3 6 8 8 8 10 10 10 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{0, 2, 2, 3, 3, 6, 8, 8, 8, 10, 10, 10}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, quantize_int8) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::f32; - auto output_type = element::i8; - - typedef float input_c_type; - typedef int8_t output_c_type; - - op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_INFINITY; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(input_type, scale_offset_shape, {2}); - auto offset = op::Constant::create(output_type, scale_offset_shape, {1}); - auto quantize = - make_shared(X, scale, offset, output_type, quantization_axes, round_mode); - auto f = make_shared(quantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{0, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11}); - // divide by scale 2 2 2 2 2 2 2 2 2 2 2 2 - // equals (rounded) 0 -1 1 -2 2 -3 3 -4 4 -5 5 -6 - // plus offset 1 1 1 1 1 1 1 1 1 1 1 1 - // equals 1 0 2 -1 3 -2 4 -3 5 -4 6 -5 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{1, 0, 2, -1, 3, -2, 4, -3, 5, -4, 6, -5}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, dequantize_int8) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::i8; - auto output_type = element::f32; - - typedef int8_t input_c_type; - typedef float output_c_type; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(output_type, scale_offset_shape, {2}); - auto offset = op::Constant::create(input_type, scale_offset_shape, {1}); - auto dequantize = make_shared(X, scale, offset, output_type, quantization_axes); - auto f 
= make_shared(dequantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{1, 0, 2, -1, 3, -2, 4, -3, 5, -4, 6, -5}); - // minus offset 1 1 1 1 1 1 1 1 1 1 1 1 - // equals 0 -1 1 -2 2 -3 3 -4 4 -5 5 -6 - // multiplied by scale 2 2 2 2 2 2 2 2 2 2 2 2 - // equals 0 -2 2 -4 4 -6 6 -8 8 -10 10 -12 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{0, -2, 2, -4, 4, -6, 6, -8, 8, -10, 10, -12}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, quantize_clamp) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::f32; - auto output_type = element::i8; - - typedef float input_c_type; - typedef int8_t output_c_type; - - op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_INFINITY; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(input_type, scale_offset_shape, {0.00001}); - auto offset = op::Constant::create(output_type, scale_offset_shape, {1}); - auto quantize = - make_shared(X, scale, offset, output_type, quantization_axes, round_mode); - auto f = make_shared(quantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{0, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11}); - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ( - (vector{1, -128, 127, -128, 127, -128, 127, -128, 127, -128, 127, -128}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_NEAREST_TOWARD_ZERO) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::f32; - auto output_type = element::i8; - - typedef float 
input_c_type; - typedef int8_t output_c_type; - - op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_ZERO; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); - auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); - auto quantize = - make_shared(X, scale, offset, output_type, quantization_axes, round_mode); - auto f = make_shared(quantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); - // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 - // equals (rounded) 2 2 3 -2 -2 -3 3 3 4 -3 -3 -4 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{2, 2, 3, -2, -2, -3, 3, 3, 4, -3, -3, -4}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_NEAREST_UPWARD) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::f32; - auto output_type = element::i8; - - typedef float input_c_type; - typedef int8_t output_c_type; - - op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_UPWARD; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); - auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); - auto quantize = - make_shared(X, scale, offset, output_type, quantization_axes, round_mode); - auto f = make_shared(quantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); - // divide by scale 4 4 
4 4 4 4 4 4 4 4 4 4 - // equals (rounded) 2 3 3 -2 -2 -3 3 4 4 -3 -3 -4 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{2, 3, 3, -2, -2, -3, 3, 4, 4, -3, -3, -4}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_NEAREST_DOWNWARD) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::f32; - auto output_type = element::i8; - - typedef float input_c_type; - typedef int8_t output_c_type; - - op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_DOWNWARD; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); - auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); - auto quantize = - make_shared(X, scale, offset, output_type, quantization_axes, round_mode); - auto f = make_shared(quantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); - // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 - // equals (rounded) 2 2 3 -2 -3 -3 3 3 4 -3 -4 -4 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{2, 2, 3, -2, -3, -3, 3, 3, 4, -3, -4, -4}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_NEAREST_TOWARD_EVEN) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::f32; - auto output_type = element::i8; - - typedef float input_c_type; - typedef int8_t output_c_type; - - op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); - auto offset = op::Constant::create(output_type, 
scale_offset_shape, {0}); - auto quantize = - make_shared(X, scale, offset, output_type, quantization_axes, round_mode); - auto f = make_shared(quantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); - // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 - // equals (rounded) 2 2 3 -2 -2 -3 3 4 4 -3 -4 -4 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{2, 2, 3, -2, -2, -3, 3, 4, 4, -3, -4, -4}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_TOWARD_INFINITY) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::f32; - auto output_type = element::i8; - - typedef float input_c_type; - typedef int8_t output_c_type; - - op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_TOWARD_INFINITY; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); - auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); - auto quantize = make_shared( - X, - scale, - offset, - output_type, - quantization_axes, - static_cast(static_cast(round_mode))); - auto f = make_shared(quantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); - // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 - // equals (rounded) 3 3 3 -3 -3 -3 4 4 4 -4 -4 -4 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{3, 3, 3, -3, -3, -3, 4, 4, 4, -4, -4, -4}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_TOWARD_ZERO) -{ - Shape 
input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::f32; - auto output_type = element::i8; - - typedef float input_c_type; - typedef int8_t output_c_type; - - op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_TOWARD_ZERO; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); - auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); - auto quantize = make_shared( - X, - scale, - offset, - output_type, - quantization_axes, - static_cast(static_cast(round_mode))); - auto f = make_shared(quantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); - // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 - // equals (rounded) 2 2 2 -2 -2 -2 3 3 3 -3 -3 -3 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{2, 2, 2, -2, -2, -2, 3, 3, 3, -3, -3, -3}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_UP) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::f32; - auto output_type = element::i8; - - typedef float input_c_type; - typedef int8_t output_c_type; - - op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_UP; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); - auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); - auto quantize = - make_shared(X, scale, offset, output_type, quantization_axes, round_mode); - auto f = make_shared(quantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, 
input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); - // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 - // equals (rounded) 3 3 3 -2 -2 -2 4 4 4 -3 -3 -3 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{3, 3, 3, -2, -2, -2, 4, 4, 4, -3, -3, -3}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, quantize_ROUND_DOWN) -{ - Shape input_shape{4, 3}; - Shape scale_offset_shape; - AxisSet quantization_axes; - - auto input_type = element::f32; - auto output_type = element::i8; - - typedef float input_c_type; - typedef int8_t output_c_type; - - op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_DOWN; - - auto X = make_shared(input_type, input_shape); - auto scale = op::Constant::create(input_type, scale_offset_shape, {4}); - auto offset = op::Constant::create(output_type, scale_offset_shape, {0}); - auto quantize = - make_shared(X, scale, offset, output_type, quantization_axes, round_mode); - auto f = make_shared(quantize, op::ParameterVector{X}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - auto x = backend->create_tensor(input_type, input_shape); - auto y = backend->create_tensor(output_type, input_shape); - - copy_data(x, vector{9, 10, 11, -9, -10, -11, 13, 14, 15, -13, -14, -15}); - // divide by scale 4 4 4 4 4 4 4 4 4 4 4 4 - // equals (rounded) 2 2 2 -3 -3 -3 3 3 3 -4 -4 -4 - - backend->call_with_validate(f, {y}, {x}); - EXPECT_EQ((vector{2, 2, 2, -3, -3, -3, 3, 3, 3, -4, -4, -4}), - read_vector(y)); -} - -NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_bprop) -{ - Shape sca{1}; - Shape vec{1, 1, 1, 2}; - double eps = 1.0e-04; - - auto g = std::make_shared(element::f32, sca); - auto b = std::make_shared(element::f32, sca); - auto input = std::make_shared(element::f32, vec); - auto bn_fp = std::make_shared(input, g, b, eps); - auto bnorm = std::make_shared(bn_fp, 0); - auto mean = std::make_shared(bn_fp, 1); - auto var 
= std::make_shared(bn_fp, 2); - - auto delta = std::make_shared(element::f32, vec); - auto bn_bp = - std::make_shared(bnorm, g, b, mean, var, delta, eps); - auto dx = std::make_shared(bn_bp, 0); - - std::vector> args = { - {1.0f}, // gamma - {1.0f}, // beta - {1.1f, 1.0f}, // x - {1.0f, 1.0f}, // dy - }; - - auto func = std::make_shared(dx, op::ParameterVector{g, b, input, delta}); - auto results = execute(func, args, "${BACKEND_NAME}"); - EXPECT_TRUE(test::all_close_f(std::vector{350.957, -388.67}, results.at(0))); -} - -NGRAPH_TEST(${BACKEND_NAME}, batchnorm_fprop_bprop_2step) -{ - Shape sca{1}; - Shape vec{1, 1, 1, 2}; - double eps = 1.0e-04; - - auto g = std::make_shared(element::f32, sca); - auto b = std::make_shared(element::f32, sca); - auto input = std::make_shared(element::f32, vec); - auto bn_fp = std::make_shared(input, g, b, eps); - auto bnorm = std::make_shared(bn_fp, 0); - auto mean = std::make_shared(bn_fp, 1); - auto var = std::make_shared(bn_fp, 2); - - auto func_bn = - std::make_shared(NodeVector{bnorm, mean, var}, op::ParameterVector{g, b, input}); - - std::vector> args = { - {1.0f}, // gamma - {1.0f}, // beta - {1.1f, 1.0f}, // x - }; - auto results = execute(func_bn, args, "${BACKEND_NAME}"); - - g = std::make_shared(element::f32, sca); - b = std::make_shared(element::f32, sca); - auto bn_output = std::make_shared(element::f32, vec); - auto m = std::make_shared(element::f32, sca); - auto v = std::make_shared(element::f32, sca); - auto delta = std::make_shared(element::f32, vec); - auto bn_bp = std::make_shared(bn_output, g, b, m, v, delta, eps); - auto dx = std::make_shared(bn_bp, 0); - - args.pop_back(); // remove x - args.push_back(results.at(0)); // bn_output - args.push_back(results.at(1)); // m - args.push_back(results.at(2)); // v - args.push_back({1.0f, 1.0f}); // dy - - auto func = std::make_shared(dx, op::ParameterVector{g, b, bn_output, m, v, delta}); - results = execute(func, args, "${BACKEND_NAME}"); - 
EXPECT_TRUE(test::all_close_f(std::vector{350.957, -388.67}, results.at(0))); -} - -NGRAPH_TEST(${BACKEND_NAME}, shape_of_scalar) -{ - Shape input_shape{}; - Shape output_shape{0}; - - auto A = std::make_shared(element::f32, input_shape); - auto f = std::make_shared(std::make_shared(A), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, input_shape); - copy_data(a, vector{0}); - auto result = backend->create_tensor(element::u64, output_shape); - - backend->call_with_validate(f, {result}, {a}); - vector expected{}; - EXPECT_EQ(expected, read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, shape_of_vector) -{ - Shape input_shape{2}; - Shape output_shape{1}; - - auto A = std::make_shared(element::f32, input_shape); - auto f = std::make_shared(std::make_shared(A), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, input_shape); - copy_data(a, vector(2, 0)); - auto result = backend->create_tensor(element::u64, output_shape); - - backend->call_with_validate(f, {result}, {a}); - vector expected{2}; - EXPECT_EQ(expected, read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, shape_of_matrix) -{ - Shape input_shape{2, 4}; - Shape output_shape{2}; - - auto A = std::make_shared(element::f32, input_shape); - auto f = std::make_shared(std::make_shared(A), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, input_shape); - copy_data(a, vector(2 * 4, 0)); - auto result = backend->create_tensor(element::u64, output_shape); - - backend->call_with_validate(f, {result}, {a}); - vector expected{2, 4}; - EXPECT_EQ(expected, read_vector(result)); -} - -NGRAPH_TEST(${BACKEND_NAME}, shape_of_5d) -{ - Shape input_shape{2, 4, 8, 16, 32}; - Shape output_shape{5}; - - auto A = std::make_shared(element::f32, input_shape); - auto 
f = std::make_shared(std::make_shared(A), op::ParameterVector{A}); - - auto backend = runtime::Backend::create("${BACKEND_NAME}"); - - auto a = backend->create_tensor(element::f32, input_shape); - copy_data(a, vector(2 * 4 * 8 * 16 * 32, 0)); - auto result = backend->create_tensor(element::u64, output_shape); - - backend->call_with_validate(f, {result}, {a}); - vector expected{2, 4, 8, 16, 32}; - EXPECT_EQ(expected, read_vector(result)); -} diff --git a/test/cpu_fusion.cpp-41c1ba06 b/test/cpu_fusion.cpp-41c1ba06 deleted file mode 100644 index e377ab0f432..00000000000 --- a/test/cpu_fusion.cpp-41c1ba06 +++ /dev/null @@ -1,3132 +0,0 @@ -//***************************************************************************** -// Copyright 2017-2018 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-//***************************************************************************** - -#include -#include -#include -#include -#include - -#include "gtest/gtest.h" -#include "ngraph/autodiff/adjoints.hpp" -#include "ngraph/file_util.hpp" -#include "ngraph/graph_util.hpp" -#include "ngraph/log.hpp" -#include "ngraph/ngraph.hpp" -#include "ngraph/op/batch_norm.hpp" -#include "ngraph/op/concat.hpp" -#include "ngraph/op/get_output_element.hpp" -#include "ngraph/op/max_pool.hpp" -#include "ngraph/op/negative.hpp" -#include "ngraph/op/parameter.hpp" -#include "ngraph/op/relu.hpp" -#include "ngraph/op/sigmoid.hpp" -#include "ngraph/op/sum.hpp" -#include "ngraph/op/tanh.hpp" -#include "ngraph/pass/algebraic_simplification.hpp" -#include "ngraph/pass/core_fusion.hpp" -#include "ngraph/pass/graph_rewrite.hpp" -#include "ngraph/pass/manager.hpp" -#include "ngraph/pass/reshape_elimination.hpp" -#include "ngraph/pass/visualize_tree.hpp" -#include "ngraph/pattern/matcher.hpp" -#include "ngraph/pattern/op/label.hpp" -#include "ngraph/pattern/op/skip.hpp" -#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp" -#include "ngraph/runtime/cpu/cpu_tensor_view.hpp" -#include "ngraph/runtime/cpu/op/batch_dot.hpp" -#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp" -#include "ngraph/runtime/cpu/op/bounded_relu.hpp" -#include "ngraph/runtime/cpu/op/conv_add.hpp" -#include "ngraph/runtime/cpu/op/conv_bias.hpp" -#include "ngraph/runtime/cpu/op/conv_relu.hpp" -#include "ngraph/runtime/cpu/op/convert_layout.hpp" -#include "ngraph/runtime/cpu/op/group_conv.hpp" -#include "ngraph/runtime/cpu/op/group_conv_bias.hpp" -#include "ngraph/runtime/cpu/op/loop_kernel.hpp" -#include "ngraph/runtime/cpu/op/lstm.hpp" -#include "ngraph/runtime/cpu/op/matmul_bias.hpp" -#include "ngraph/runtime/cpu/op/rnn.hpp" -#include "ngraph/runtime/cpu/op/sigmoid_mul.hpp" -#include "ngraph/runtime/cpu/pass/cpu_concat_inputs.hpp" -#include "ngraph/runtime/cpu/pass/cpu_fusion.hpp" -#include 
"ngraph/runtime/cpu/pass/cpu_loop_kernel_fusion.hpp" -#include "ngraph/runtime/cpu/pass/cpu_mat_fusion.hpp" -#include "ngraph/runtime/cpu/pass/cpu_post_layout_optimizations.hpp" -#include "ngraph/runtime/cpu/pass/cpu_rnn_fusion.hpp" -#include "ngraph/runtime/cpu/pass/cpu_workspace_insertion.hpp" -#include "ngraph/serializer.hpp" -#include "ngraph/util.hpp" -#include "nlohmann/json.hpp" -#include "util/all_close.hpp" -#include "util/autodiff/backprop_function.hpp" -#include "util/autodiff/numeric_compare.hpp" -#include "util/matcher.hpp" -#include "util/random.hpp" -#include "util/random.hpp" -#include "util/test_tools.hpp" - -using namespace ngraph; -using namespace std; - -TEST(cpu_fusion, gemm_pattern) -{ - Shape shape_w{2, 4}; - Shape shape_x{4, 1}; - Shape shape_b{1}; - auto A = make_shared(element::f32, shape_w); - auto B = make_shared(element::f32, shape_x); - auto C = make_shared(element::f32, shape_b); - - auto dot = make_shared(A, B); - auto broadcast = make_shared(C, dot->get_shape(), AxisSet{0}); - auto add = dot + broadcast; - - auto W = std::make_shared(A); - auto x = std::make_shared(B); - - auto reshape_pred = [](std::shared_ptr n) { - return static_cast(std::dynamic_pointer_cast(n)); - }; - - auto skip_w = std::make_shared(W, reshape_pred); - auto skip_x = std::make_shared(x, reshape_pred); - - auto pdot = make_shared(skip_w, skip_x); - auto b = std::make_shared(C); - auto pbroadcast = make_shared(b, dot->get_shape(), AxisSet{0}); - auto padd = pdot + pbroadcast; - - TestMatcher n(nullptr); - ASSERT_TRUE(n.match(padd, add)); - ASSERT_EQ(n.get_pattern_map()[W], A); - ASSERT_EQ(n.get_pattern_map()[x], B); - ASSERT_EQ(n.get_pattern_map()[b], C); - - auto reshape_w = make_shared(A, AxisVector{1, 0}, W->get_shape()); - auto reshape_x = make_shared(B, AxisVector{1, 0}, x->get_shape()); - auto re_dot = make_shared(reshape_w, reshape_x); - auto re_add = re_dot + broadcast; - ASSERT_TRUE(n.match(padd, re_add)); - ASSERT_EQ(n.get_pattern_map()[W], A); - 
ASSERT_EQ(n.get_pattern_map()[x], B); - ASSERT_EQ(n.get_pattern_map()[b], C); - - auto cg = make_shared( - W, x, C, W->get_shape(), x->get_shape(), false, false, AxisSet{0}); -} - -TEST(cpu_fusion, gemm_cpu_broadcast_row) -{ - Shape shapeA{3, 2}; - Shape shapeB{2, 3}; - Shape shapeC{2, 2}; - auto A = make_shared(element::f32, shapeA); - auto B = make_shared(element::f32, shapeB); - - auto bias = op::Constant::create(element::f32, Shape{2}, std::vector{2.0f, 3.0f}); - - auto cg = make_shared( - A, B, bias, A->get_shape(), B->get_shape(), true, true, AxisSet{0}); - - auto f = make_shared(cg, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("CPU"); - - shared_ptr a = backend->create_tensor(element::f32, shapeA); - shared_ptr b = backend->create_tensor(element::f32, shapeB); - shared_ptr result = backend->create_tensor(element::f32, shapeC); - - vector dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f}; - vector dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f}; - copy_data(a, dataA); - copy_data(b, dataB); - - backend->call_with_validate(f, {result}, {a, b}); - vector expected{11, 30, 38, 111}; - EXPECT_EQ(read_vector(result), expected); -} - -TEST(cpu_fusion, gemm_cpu_broadcast_column) -{ - Shape shapeA{3, 2}; - Shape shapeB{2, 3}; - Shape shapeC{2, 2}; - auto A = make_shared(element::f32, shapeA); - auto B = make_shared(element::f32, shapeB); - - auto bias = op::Constant::create(element::f32, Shape{2}, std::vector{2.0f, 3.0f}); - - auto cg = make_shared( - A, B, bias, A->get_shape(), B->get_shape(), true, true, AxisSet{1}); - - auto f = make_shared(cg, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("CPU"); - - shared_ptr a = backend->create_tensor(element::f32, shapeA); - shared_ptr b = backend->create_tensor(element::f32, shapeB); - shared_ptr result = backend->create_tensor(element::f32, shapeC); - - vector dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f}; - vector dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f}; - copy_data(a, dataA); - 
copy_data(b, dataB); - - backend->call_with_validate(f, {result}, {a, b}); - vector expected{11, 29, 39, 111}; - EXPECT_EQ(read_vector(result), expected); -} - -TEST(cpu_fusion, gemm_cpu_broadcast_matrix) -{ - Shape shapeA{3, 2}; - Shape shapeB{2, 3}; - Shape shapeC{2, 2}; - auto A = make_shared(element::f32, shapeA); - auto B = make_shared(element::f32, shapeB); - - auto reshape_w = make_shared(A, AxisVector{1, 0}, Shape{2, 3}); - auto reshape_x = make_shared(B, AxisVector{1, 0}, Shape{3, 2}); - - auto one = op::Constant::create(element::f32, Shape{}, std::vector{1.0f}); - - auto broadcast = make_shared(one, shapeC, AxisSet{0, 1}); - auto cg = make_shared( - A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0, 1}); - - auto f = make_shared(cg, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("CPU"); - - shared_ptr a = backend->create_tensor(element::f32, shapeA); - shared_ptr b = backend->create_tensor(element::f32, shapeB); - shared_ptr result = backend->create_tensor(element::f32, shapeC); - - vector dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f}; - vector dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f}; - copy_data(a, dataA); - copy_data(b, dataB); - - backend->call_with_validate(f, {result}, {a, b}); - vector expected{10, 28, 37, 109}; - ASSERT_TRUE(read_vector(result) == expected); -} - -TEST(cpu_fusion, gemm_cpu_no_bias) -{ - auto shapeA = Shape{3, 2}; - auto shapeB = Shape{2, 3}; - auto shapeC = Shape{2, 2}; - auto A = make_shared(element::f32, shapeA); - auto B = make_shared(element::f32, shapeB); - - auto reshape_w = make_shared(A, AxisVector{1, 0}, Shape{2, 3}); - auto reshape_x = make_shared(B, AxisVector{1, 0}, Shape{3, 2}); - - auto cg = - make_shared(A, B, nullptr, A->get_shape(), B->get_shape(), true, true); - - auto f = make_shared(cg, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("CPU"); - - shared_ptr a = backend->create_tensor(element::f32, shapeA); - shared_ptr b = 
backend->create_tensor(element::f32, shapeB); - shared_ptr result = backend->create_tensor(element::f32, shapeC); - - vector dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f}; - vector dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f}; - copy_data(a, dataA); - copy_data(b, dataB); - - backend->call_with_validate(f, {result}, {a, b}); - vector expected{9, 27, 36, 108}; - ASSERT_TRUE(read_vector(result) == expected); -} - -TEST(cpu_fusion, cpu_fusion_pass_basic) -{ - Shape shape{}; - Shape shape_w{2, 4}; - Shape shape_x{4, 1}; - Shape shape_b{1}; - auto A = make_shared(element::f32, shape_w); - auto B = make_shared(element::f32, shape_x); - auto C = make_shared(element::f32, shape_b); - - auto dot = make_shared(A, B); - auto broadcast = make_shared(C, dot->get_shape(), AxisSet{0}); - auto add = dot + broadcast; - auto graph = make_shared(add); - pass::Manager pass_manager; - pass_manager.register_pass( - runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); - auto func = make_shared(graph, op::ParameterVector{A, B, C}); - pass_manager.run_passes(func); - ASSERT_NE(std::dynamic_pointer_cast(graph->get_argument(0)), nullptr); -} - -TEST(cpu_fusion, commutative_matmul_bias) -{ - Shape shape{}; - Shape shape_w{2, 4}; - Shape shape_x{4, 1}; - Shape shape_b{1}; - auto A = make_shared(element::f32, shape_w); - auto B = make_shared(element::f32, shape_x); - auto C = make_shared(element::f32, shape_b); - - auto dot = make_shared(A, B); - auto broadcast = make_shared(C, dot->get_shape(), AxisSet{0}); - auto add = broadcast + dot; - auto graph = make_shared(add); - pass::Manager pass_manager; - pass_manager.register_pass( - runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); - auto func = make_shared(graph, op::ParameterVector{A, B, C}); - pass_manager.run_passes(func); - ASSERT_NE(std::dynamic_pointer_cast(graph->get_argument(0)), nullptr); -} - -TEST(cpu_fusion, cpu_fusion_pass_matmul_bias) -{ - Shape shape_w{2, 4}; - Shape shape_x{4, 1}; - Shape shape_b{1}; - auto W = make_shared(element::f32, 
shape_w); - auto x = make_shared(element::f32, shape_x); - auto b = make_shared(element::f32, shape_b); - - auto mmb = std::make_shared( - W, x, nullptr, W->get_shape(), x->get_shape(), false, false); - auto broadcast = std::make_shared(b, mmb->get_shape(), AxisSet{0}); - auto add = mmb + broadcast; - - auto graph = make_shared(add); - pass::Manager pass_manager; - pass_manager.register_pass( - runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); - auto func = make_shared(graph, op::ParameterVector{W, x, b}); - pass_manager.run_passes(func); - auto gmm = graph->get_argument(0); - ASSERT_TRUE(std::dynamic_pointer_cast(gmm)); - ASSERT_EQ(gmm->get_argument(2), b); -} - -TEST(cpu_fusion, cpu_fusion_pass_matmul_no_bias) -{ - Shape shape_w{4, 2}; - Shape shape_x{1, 4}; - auto W = make_shared(element::f32, shape_w); - auto x = make_shared(element::f32, shape_x); - - auto reshape_w = std::make_shared(W, AxisVector{1, 0}, Shape{2, 4}); - auto reshape_x = std::make_shared(x, AxisVector{1, 0}, Shape{4, 1}); - auto re_dot = make_shared(reshape_w, reshape_x); - auto graph = make_shared(re_dot); - - pass::Manager pass_manager; - pass_manager.register_pass( - runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); - auto func = make_shared(graph, op::ParameterVector{W, x}); - pass_manager.run_passes(func); - size_t mmb = count_ops_of_type(func); - ASSERT_EQ(mmb, 1); -} - -TEST(cpu_fusion, gemm_mlp) -{ - const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/mnist_mlp_forward.json"); - const string json_string = file_util::read_file_to_string(json_path); - stringstream ss(json_string); - shared_ptr func = ngraph::deserialize(ss); - pass::Manager pass_manager; - pass_manager.register_pass( - runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); - pass_manager.run_passes(func); - auto mmbs = count_ops_of_type(func); - ASSERT_EQ(mmbs, 3); -} - -TEST(cpu_fusion, fuse_fprop_bn) -{ - pass::Manager pass_manager; - pass_manager.register_pass("bn_fprop_before_fusion.png"); - 
pass_manager.register_pass(); - pass_manager.register_pass( - runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); - pass_manager.register_pass("bn_fprop_after_fusion.png"); - const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/bn_fprop_b2c3h2w2.json"); - const string json_string = file_util::read_file_to_string(json_path); - stringstream ss(json_string); - shared_ptr func = ngraph::deserialize(ss); - pass_manager.run_passes(func); - size_t ccg = count_ops_of_type(func); - ASSERT_EQ(ccg, 1); -} - -TEST(cpu_fusion, zero_padded_reshaped_conv) -{ - auto X = make_shared(element::f32, Shape{1, 2, 2, 1}); - auto F = make_shared(element::f32, Shape{1, 1, 1, 1}); - - auto pad_value = op::Constant::create(element::f32, Shape{}, std::vector{0.0f}); - - auto pad = - make_shared(X, pad_value, Shape{0, 1, 0, 0}, Shape{0, 0, 1, 0}, Shape{0, 0, 0, 0}); - - auto reshape = make_shared(pad, AxisVector{0, 3, 1, 2}, Shape{1, 1, 3, 3}); - - auto conv = make_shared(reshape, - F, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}); - - auto func = make_shared(conv, op::ParameterVector{X, F}); - - ASSERT_EQ(count_ops_of_type(func), 1); - - auto backend = runtime::Backend::create("CPU"); - backend->compile(func); - - ASSERT_EQ(count_ops_of_type(func), 0); -} - -TEST(cpu_fusion, zero_padded_conv) -{ - auto X = make_shared(element::f32, Shape{1, 1, 2, 2}); - auto F = make_shared(element::f32, Shape{1, 1, 1, 1}); - - auto pad_value = op::Constant::create(element::f32, Shape{}, std::vector{0.0f}); - - auto pad = - make_shared(X, pad_value, Shape{0, 0, 0, 1}, Shape{0, 0, 1, 0}, Shape{0, 0, 0, 0}); - - auto conv = make_shared(pad, - F, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}); - - auto func = make_shared(conv, op::ParameterVector{X, F}); - - ASSERT_EQ(count_ops_of_type(func), 1); - - auto backend = runtime::Backend::create("CPU"); - backend->compile(func); - - 
ASSERT_EQ(count_ops_of_type(func), 0); -} - -TEST(cpu_fusion, non_zero_padded_conv) -{ - auto X = make_shared(element::f32, Shape{1, 1, 2, 2}); - auto F = make_shared(element::f32, Shape{1, 1, 1, 1}); - - auto pad_value = op::Constant::create(element::f32, Shape{}, std::vector{1.0f}); - - auto pad = - make_shared(X, pad_value, Shape{0, 0, 0, 1}, Shape{0, 0, 1, 0}, Shape{0, 0, 0, 0}); - - auto conv = make_shared(pad, - F, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}); - - auto func = make_shared(conv, op::ParameterVector{X, F}); - - ASSERT_EQ(count_ops_of_type(func), 1); - - auto backend = runtime::Backend::create("CPU"); - backend->compile(func); - - ASSERT_EQ(count_ops_of_type(func), 1); -} - -TEST(cpu_fusion, zero_padded_conv_backprop_filters) -{ - auto X = make_shared(element::f32, Shape{1, 1, 2, 2}); - auto F = make_shared(element::f32, Shape{1, 1, 2, 2}); - - auto pad_value = op::Constant::create(element::f32, Shape{}, std::vector{0.0f}); - - auto pad = - make_shared(X, pad_value, Shape{0, 0, 0, 1}, Shape{0, 0, 1, 0}, Shape{0, 0, 0, 0}); - - auto conv = make_shared(pad, - Shape{1, 1, 2, 2}, - F, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}); - - auto func = make_shared(conv, op::ParameterVector{X, F}); - - ASSERT_EQ(count_ops_of_type(func), 1); - - auto backend = runtime::Backend::create("CPU"); - backend->compile(func); - - ASSERT_EQ(count_ops_of_type(func), 0); -} - -TEST(cpu_fusion, fuse_conv_bias) -{ - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.register_pass( - runtime::cpu::pass::CPUFusion::DIFFERENTIABLE_FUSIONS); - const string json_path = file_util::path_join(SERIALIZED_ZOO, "conv_bias.json"); - const string json_string = file_util::read_file_to_string(json_path); - stringstream ss(json_string); - shared_ptr func = ngraph::deserialize(ss); - pass_manager.run_passes(func); - size_t cb = count_ops_of_type(func); - 
ASSERT_GT(cb, 0); -} - -struct ConvolutionBiasTestData -{ - size_t n{0}; - size_t c{0}; - size_t filter{0}; - size_t kernel_size{0}; - size_t w{0}; - size_t h{0}; - shared_ptr data_val; - shared_ptr weights_val; - shared_ptr bias_val; - shared_ptr result_val; - shared_ptr delta_val; - shared_ptr d_data_val; - shared_ptr d_weights_val; - shared_ptr d_bias_val; - vector expected_result_val; - vector expected_d_data_val; - vector expected_d_weights_val; - vector expected_d_bias_val; - - Shape data_shape; - Shape weights_shape; - Shape bias_shape; - Shape result_shape; - shared_ptr data; - shared_ptr weights; - shared_ptr bias; - shared_ptr delta; - - void n1c1h3w3(runtime::Backend* backend) - { - n = 1; - c = 1; - filter = 1; - kernel_size = 3; - w = 3; - h = w; - - data_shape = Shape{n, c, h, w}; - data = make_shared(element::f32, data_shape); - weights_shape = Shape{filter, c, kernel_size, kernel_size}; - weights = make_shared(element::f32, weights_shape); - bias_shape = Shape{filter}; - bias = make_shared(element::f32, bias_shape); - result_shape = Shape{n, filter, 1, 1}; - - data_val = backend->create_tensor(element::f32, data_shape); - copy_data(data_val, - vector{-0.67765152f, - 0.10073948f, - 0.57595438f, - -0.3469252f, - -0.22134334f, - -1.80471897f, - -0.80642909f, - 1.22033095f, - 2.23235631f}); - weights_val = backend->create_tensor(element::f32, weights_shape); - copy_data(weights_val, - vector{0.20070229f, - -0.54968649f, - -0.19819015f, - -0.38577855f, - 1.37109005f, - -0.23789984f, - 0.14867957f, - -0.49851316f, - -0.84815776f}); - bias_val = backend->create_tensor(element::f32, bias_shape); - copy_data(bias_val, vector{0.07811152f}); - - result_val = backend->create_tensor(element::f32, result_shape); - copy_data(result_val, vector{0}); - - delta = make_shared(element::f32, result_shape); - delta_val = backend->create_tensor(element::f32, result_shape); - copy_data(delta_val, vector{-2.58936238f}); - - d_data_val = backend->create_tensor(element::f32, 
data_shape); - copy_data(d_data_val, vector{0, 0, 0, 0, 0, 0, 0, 0, 0}); - - d_weights_val = backend->create_tensor(element::f32, weights_shape); - copy_data(d_weights_val, vector{0, 0, 0, 0, 0, 0, 0, 0, 0}); - - d_bias_val = backend->create_tensor(element::f32, bias_shape); - copy_data(d_bias_val, vector{0}); - - expected_result_val = vector{-2.58936238f}; - expected_d_data_val = vector{-0.51969099f, - 1.42333758f, - 0.5131861f, - 0.99892044f, - -3.5502491f, - 0.61600888f, - -0.3849853f, - 1.29083121f, - 2.19618773f}; - expected_d_weights_val = vector{1.7546854f, - -0.26085103f, - -1.49135458f, - 0.89831507f, - 0.57313812f, - 4.67307138f, - 2.08813715f, - -3.15987897f, - -5.7803793f}; - expected_d_bias_val = vector{-2.58936238f}; - } -}; - -TEST(cpu_fusion, conv_bias_fprop_n1c1h3w3) -{ - auto backend = runtime::Backend::create("CPU"); - - ConvolutionBiasTestData conv_test; - conv_test.n1c1h3w3(backend.get()); - - auto convolution = make_shared(conv_test.data, conv_test.weights); - auto convolution_bias = make_shared(convolution, conv_test.bias); - - auto f = make_shared( - convolution_bias, op::ParameterVector{conv_test.data, conv_test.weights, conv_test.bias}); - - backend->call_with_validate( - f, {conv_test.result_val}, {conv_test.data_val, conv_test.weights_val, conv_test.bias_val}); - auto result_vec = read_vector(conv_test.result_val); - - EXPECT_TRUE( - test::all_close(conv_test.expected_result_val, read_vector(conv_test.result_val))); -} - -TEST(cpu_fusion, conv_bias_bprop_n1c1h3w3) -{ - auto backend = runtime::Backend::create("CPU"); - - ConvolutionBiasTestData conv_test; - conv_test.n1c1h3w3(backend.get()); - - auto convolution = make_shared(conv_test.data, conv_test.weights); - auto convolution_bias = make_shared(convolution, conv_test.bias); - - auto f = make_shared( - convolution_bias, op::ParameterVector{conv_test.data, conv_test.weights, conv_test.bias}); - - ngraph::autodiff::Adjoints adjoints(NodeVector{convolution_bias}, 
NodeVector{conv_test.delta}); - - auto d_data = adjoints.backprop_node(conv_test.data); - auto d_weights = adjoints.backprop_node(conv_test.weights); - auto d_bias = adjoints.backprop_node(conv_test.bias); - - auto df = make_shared( - NodeVector{d_data, d_weights, d_bias}, - op::ParameterVector{conv_test.data, conv_test.weights, conv_test.bias, conv_test.delta}); - backend->call_with_validate( - df, - {conv_test.d_data_val, conv_test.d_weights_val, conv_test.d_bias_val}, - {conv_test.data_val, conv_test.weights_val, conv_test.bias_val, conv_test.delta_val}); - - EXPECT_TRUE( - test::all_close(conv_test.expected_d_data_val, read_vector(conv_test.d_data_val))); - EXPECT_TRUE(test::all_close(conv_test.expected_d_weights_val, - read_vector(conv_test.d_weights_val))); - EXPECT_TRUE( - test::all_close(conv_test.expected_d_bias_val, read_vector(conv_test.d_bias_val))); -} - -TEST(cpu_fusion, conv_bias_bprop) -{ - Shape shape{2, 2, 1, 1}; - auto data_batch = std::make_shared(element::f32, shape); - auto filters = std::make_shared(element::f32, shape); - auto delta = std::make_shared(element::f32, shape); - auto bias = make_shared(element::f32, Shape{shape[0]}); - auto pbroadcast = std::make_shared(bias, shape, AxisSet{1, 2, 3}); - auto conv = std::make_shared(data_batch, filters); - auto conv_bias = std::make_shared(conv, pbroadcast); - - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.register_pass("conv_bias_bprop_fusion"); - auto f = make_shared(conv_bias, op::ParameterVector{data_batch, filters, bias}); - - ngraph::autodiff::Adjoints adjoints(NodeVector{conv_bias}, NodeVector{delta}); - - auto d_data = adjoints.backprop_node(data_batch); - auto d_weights = adjoints.backprop_node(filters); - auto d_bias = adjoints.backprop_node(bias); - - auto df = make_shared(NodeVector{d_data, d_weights, d_bias}, - op::ParameterVector{data_batch, filters, bias, delta}); - - pass_manager.run_passes(df); - size_t ccg = count_ops_of_type(df); - ASSERT_EQ(ccg, 
1); -} - -TEST(cpu_fusion, batchnorm_fprop_relu_b1c2h2w2) -{ - auto input_shape = Shape{1, 2, 2, 2}; - auto input = make_shared(element::f32, input_shape); - auto mean_shape = Shape{2}; - auto var_shape = Shape{2}; - auto gamma_shape = Shape{2}; - auto gamma = make_shared(element::f32, gamma_shape); - auto beta_shape = Shape{2}; - auto beta = make_shared(element::f32, beta_shape); - double eps = 0.001; - auto shape_r = Shape{1, 2, 2, 2}; - auto bn = make_shared(input, gamma, beta, eps); - - auto output_rt = std::make_shared(bn, 0); - // Note, op::Splice is used to break Relu(BatchNorm) fusion - // otherwise we will be comparing two BatchNormRelus - // Unfortunately, we can't use INTERPRETER for - // verifying the results as it doesn't implement - // BatchNorm op. - auto slice = - std::make_shared(output_rt, Coordinate{0, 0, 0, 0}, Coordinate{1, 2, 2, 2}); - auto output_relu = std::make_shared(slice); - auto mean_rt = std::make_shared(bn, 1); - auto variance_rt = std::make_shared(bn, 2); - - auto bn_relu = make_shared(input, gamma, beta, eps); - auto output_rt_bnr = std::make_shared(bn_relu, 0); - auto mean_rt_bnr = std::make_shared(bn_relu, 1); - auto variance_rt_bnr = std::make_shared(bn_relu, 2); - - auto f = make_shared( - NodeVector{output_relu, mean_rt, variance_rt, output_rt_bnr, mean_rt_bnr, variance_rt_bnr}, - op::ParameterVector{input, gamma, beta}); - auto backend = runtime::Backend::create("CPU"); - - // Create some tensors for input/output - auto input_t = backend->create_tensor(element::f32, Shape{1, 2, 2, 2}); - - copy_data(input_t, - vector{0.54881352f, - 0.71518934f, - 0.60276335f, - 0.54488319f, - 0.42365479f, - 0.64589411f, - 0.4375872f, - 0.89177299f}); - auto gamma_t = backend->create_tensor(element::f32, gamma_shape); - copy_data(gamma_t, vector{1.0f, 1.0f}); - auto beta_t = backend->create_tensor(element::f32, beta_shape); - copy_data(beta_t, vector{0.0f, 0.0f}); - auto bn_output = backend->create_tensor(element::f32, shape_r); - auto 
result_mean = backend->create_tensor(element::f32, mean_shape); - auto result_variance = backend->create_tensor(element::f32, var_shape); - - auto bn_output_bnr = backend->create_tensor(element::f32, shape_r); - auto result_mean_bnr = backend->create_tensor(element::f32, mean_shape); - auto result_variance_bnr = backend->create_tensor(element::f32, var_shape); - - backend->call_with_validate(f, - {bn_output, - result_mean, - result_variance, - bn_output_bnr, - result_mean_bnr, - result_variance_bnr}, - {input_t, gamma_t, beta_t}); - - EXPECT_TRUE(test::all_close(read_vector(bn_output), read_vector(bn_output_bnr))); - EXPECT_TRUE( - test::all_close(read_vector(result_mean), read_vector(result_mean_bnr))); - EXPECT_TRUE(test::all_close(read_vector(result_variance), - read_vector(result_variance_bnr))); -} - -TEST(cpu_fusion, fuse_conv_relu) -{ - auto A = std::make_shared(element::f32, Shape{2, 1, 2, 2}); - auto weights = std::make_shared(element::f32, Shape{1, 1, 2, 2}); - auto convolution = std::make_shared(A, weights, Strides{1, 1}, Strides{1, 1}); - auto relu = std::make_shared(convolution); - auto abs_node = - std::make_shared(std::make_shared(std::make_shared(relu))); - auto func = make_shared(abs_node, op::ParameterVector{A, weights}); - - pass::Manager pass_manager; - pass_manager.register_pass( - runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); - pass_manager.run_passes(func); - size_t cb = count_ops_of_type(func); - ASSERT_GT(cb, 0); -} - -TEST(cpu_fusion, conv_relu_n2c1h2w2_2) -{ - Shape shape_a{2, 1, 6, 6}; - Shape shape_weights{1, 1, 2, 2}; - - auto make_int_function = [shape_a, shape_weights]() { - auto A = std::make_shared(element::f32, shape_a); - auto weights = std::make_shared(element::f32, shape_weights); - auto conv = std::make_shared(A, weights, Strides{2, 2}, Strides{1, 1}); - auto relu = std::make_shared(conv); - auto f = make_shared(NodeVector{relu}, op::ParameterVector{A, weights}); - return f; - }; - - auto int_f = make_int_function(); - - 
auto make_cpu_function = [shape_a, shape_weights]() { - auto A = std::make_shared(element::f32, shape_a); - auto weights = std::make_shared(element::f32, shape_weights); - auto conv = std::make_shared(A, weights, Strides{2, 2}, Strides{1, 1}); - auto conv_relu = std::make_shared(conv); - auto f = make_shared(NodeVector{conv_relu}, op::ParameterVector{A, weights}); - return f; - }; - - auto cpu_f = make_cpu_function(); - - vector> args{ - {1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f, 7.25f, 8.25f, -1.25f, - -1.25f, 1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f, -4.25f, 2.25f, - 4.25f, 4.25f, 0.f, 0.f, -1.f, 0.f, 2.f, 2.f, 0.f, 0.f, 0.f, - 0.f, 2.f, 2.f, 1.25f, 2.25f, 5.25f, 6.25f, 1.25f, 1.25f, 3.25f, 4.25f, - -7.25f, 8.25f, 1.25f, -1.25f, -1.25f, 2.25f, 3.25f, 2.25f, -4.25f, -4.25f, -1.25f, - -2.25f, 4.25f, 2.25f, 4.25f, 4.25f, 0.f, 0.f, 1.f, 0.f, -2.f, 2.f, - 0.f, 0.f, 0.f, 0.f, -2.f, -2.f}, - {2., 2., 2., 2.}}; - - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); -} - -TEST(cpu_fusion, conv_bias_relu_n2c1h2w2_2) -{ - Shape shape_a{2, 1, 6, 6}; - Shape shape_weights{1, 1, 2, 2}; - Shape shape_bias{1}; - - auto make_int_function = [shape_a, shape_weights, shape_bias]() { - auto A = std::make_shared(element::f32, shape_a); - auto weights = std::make_shared(element::f32, shape_weights); - auto conv = std::make_shared(A, weights, Strides{2, 2}, Strides{1, 1}); - auto bias = std::make_shared(element::f32, shape_bias); - auto conv_bias = - conv + std::make_shared(bias, conv->get_shape(), AxisSet{0, 2, 3}); - auto relu = std::make_shared(conv_bias); - auto f = make_shared(NodeVector{relu}, op::ParameterVector{A, weights, bias}); - return f; - }; - - auto int_f = make_int_function(); - - auto make_cpu_function = [shape_a, shape_weights, shape_bias]() { - auto A = std::make_shared(element::f32, shape_a); - auto weights = 
std::make_shared(element::f32, shape_weights); - auto bias = std::make_shared(element::f32, shape_bias); - auto conv = std::make_shared(A, weights, Strides{2, 2}, Strides{1, 1}); - auto conv_bias_relu = std::make_shared(conv, bias, true); - auto f = make_shared(NodeVector{conv_bias_relu}, - op::ParameterVector{A, weights, bias}); - return f; - }; - - auto cpu_f = make_cpu_function(); - - vector> args{ - {1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f, 7.25f, 8.25f, -1.25f, - -1.25f, 1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f, -4.25f, 2.25f, - 4.25f, 4.25f, 0.f, 0.f, -1.f, 0.f, 2.f, 2.f, 0.f, 0.f, 0.f, - 0.f, 2.f, 2.f, 1.25f, 2.25f, 5.25f, 6.25f, 1.25f, 1.25f, 3.25f, 4.25f, - -7.25f, 8.25f, 1.25f, -1.25f, -1.25f, 2.25f, 3.25f, 2.25f, -4.25f, -4.25f, -1.25f, - -2.25f, 4.25f, 2.25f, 4.25f, 4.25f, 0.f, 0.f, 1.f, 0.f, -2.f, 2.f, - 0.f, 0.f, 0.f, 0.f, -2.f, -2.f}, - {2., 2., 2., 2.}, - {0.1f}}; - - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); -} - -TEST(cpu_fusion, conv_horizontal_fusion) -{ - Shape shape_a{2, 1, 6, 6}; - Shape shape_weights{1, 1, 2, 2}; - Shape shape_bias{1}; - - auto make_function = [shape_a, shape_weights, shape_bias]() { - auto A = std::make_shared(element::f32, shape_a); - auto weights1 = std::make_shared(element::f32, shape_weights); - auto conv1 = std::make_shared(A, weights1, Strides{2, 2}, Strides{1, 1}); - auto bias1 = std::make_shared(element::f32, shape_bias); - auto conv_bias1 = - conv1 + std::make_shared(bias1, conv1->get_shape(), AxisSet{0, 2, 3}); - auto relu1 = std::make_shared(conv_bias1); - - auto weights2 = std::make_shared(element::f32, shape_weights); - auto conv2 = std::make_shared(A, weights2, Strides{2, 2}, Strides{1, 1}); - auto bias2 = std::make_shared(element::f32, shape_bias); - auto conv_bias2 = - conv2 + std::make_shared(bias2, conv2->get_shape(), AxisSet{0, 2, 3}); - 
auto relu2 = std::make_shared(conv_bias2); - - auto concat = std::make_shared(NodeVector{relu1, relu2}, 1); - auto f = make_shared(NodeVector{concat}, - op::ParameterVector{A, weights1, bias1, weights2, bias2}); - return f; - }; - auto int_f = make_function(); - auto cpu_f = make_function(); - - vector> args{ - {1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f, 7.25f, 8.25f, -1.25f, - -1.25f, 1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f, -4.25f, 2.25f, - 4.25f, 4.25f, 0.f, 0.f, -1.f, 0.f, 2.f, 2.f, 0.f, 0.f, 0.f, - 0.f, 2.f, 2.f, 1.25f, 2.25f, 5.25f, 6.25f, 1.25f, 1.25f, 3.25f, 4.25f, - -7.25f, 8.25f, 1.25f, -1.25f, -1.25f, 2.25f, 3.25f, 2.25f, -4.25f, -4.25f, -1.25f, - -2.25f, 4.25f, 2.25f, 4.25f, 4.25f, 0.f, 0.f, 1.f, 0.f, -2.f, 2.f, - 0.f, 0.f, 0.f, 0.f, -2.f, -2.f}, - {2., 2., 2., 2.}, - {0.1f}, - {3., 3., 3., 3.}, - {0.2f}}; - - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); - - size_t cpu_cb = count_ops_of_type(cpu_f); - ASSERT_EQ(cpu_cb, 1); -} - -// ConvolutionBiasAdd relies on an in-place fused MKLDNN kernel. -// Need to ensure that it is fused only when in-place buffer allocation is feasible -shared_ptr gen_conv_bias_add(bool param_input, bool result_output) -{ - auto A = make_shared(element::f32, Shape{2, 1, 2, 2}); - auto weights = make_shared(element::f32, Shape{1, 1, 1, 1}); - auto bias = make_shared(element::f32, Shape{1}); - auto conv = make_shared(A, weights, Strides{1, 1}, Strides{1, 1}); - auto bias_broadcast = make_shared(bias, conv->get_shape(), AxisSet{0, 2, 3}); - auto convbias = conv + bias_broadcast; - auto B = make_shared(element::f32, Shape{2, 1, 2, 2}); - auto abs_B = make_shared(B); - auto add = - param_input ? make_shared(convbias, B) : make_shared(convbias, abs_B); - auto abs = make_shared(add); - - return result_output ? 
make_shared(add, op::ParameterVector{A, weights, bias, B}) - : make_shared(abs, op::ParameterVector{A, weights, bias, B}); -} - -TEST(cpu_fusion, fuse_conv_bias_add) -{ - auto func_fuse = gen_conv_bias_add(false, false); - auto func_nofuse1 = gen_conv_bias_add(true, false); - auto func_nofuse2 = gen_conv_bias_add(false, true); - - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.run_passes(func_fuse); - ASSERT_EQ(count_ops_of_type(func_fuse), 1); - - pass_manager.run_passes(func_nofuse1); - ASSERT_EQ(count_ops_of_type(func_nofuse1), 0); - - pass_manager.run_passes(func_nofuse2); - ASSERT_EQ(count_ops_of_type(func_nofuse2), 1); -} - -TEST(cpu_fusion, conv_bias_add) -{ - auto int_f = gen_conv_bias_add(false, false); - auto cpu_f = gen_conv_bias_add(false, false); - - vector> args{{1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f}, - {-1.25f}, - {2.25f}, - {1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f}}; - - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); -} - -// ConvolutionAdd relies on an in-place fused MKLDNN kernel. -// Need to ensure that it is fused only when in-place buffer allocation is feasible -shared_ptr gen_conv_add(bool param_input, bool result_output) -{ - auto A = make_shared(element::f32, Shape{2, 1, 2, 2}); - auto weights = make_shared(element::f32, Shape{1, 1, 1, 1}); - auto conv = make_shared(A, weights, Strides{1, 1}, Strides{1, 1}); - auto B = make_shared(element::f32, Shape{2, 1, 2, 2}); - auto abs_B = make_shared(B); - auto add = param_input ? make_shared(conv, B) : make_shared(conv, abs_B); - auto abs = make_shared(add); - - return result_output ? 
make_shared(add, op::ParameterVector{A, weights, B}) - : make_shared(abs, op::ParameterVector{A, weights, B}); -} - -TEST(cpu_fusion, fuse_conv_add) -{ - auto func_fuse = gen_conv_add(false, false); - auto func_nofuse1 = gen_conv_add(true, false); - auto func_nofuse2 = gen_conv_add(false, true); - - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.run_passes(func_fuse); - ASSERT_EQ(count_ops_of_type(func_fuse), 1); - - pass_manager.run_passes(func_nofuse1); - ASSERT_EQ(count_ops_of_type(func_nofuse1), 0); - - pass_manager.run_passes(func_nofuse2); - ASSERT_EQ(count_ops_of_type(func_nofuse2), 1); -} - -TEST(cpu_fusion, conv_add) -{ - auto int_f = gen_conv_add(false, false); - auto cpu_f = gen_conv_add(false, false); - - vector> args{{1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f}, - {-1.25f}, - {1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f}}; - - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); - - int_f = gen_conv_add(false, true); - cpu_f = gen_conv_add(false, true); - - int_results = execute(int_f, args, "INTERPRETER"); - cpu_results = execute(cpu_f, args, "CPU"); - EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); -} - -shared_ptr gen_groupconv_batchnorm(const bool add_goe, - const bool with_relu, - const Shape shape_in, - const Shape shape_weights, - const Shape shape_out, - const size_t groups) -{ - auto input = make_shared(element::f32, shape_in); - auto weights = make_shared(element::f32, shape_weights); - - unsigned long OC = shape_out.at(1); - Shape shape_bn{OC}; - auto group_conv = make_shared(input, - weights, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}, - groups, - shape_out); - - double eps = 0.001; - auto gamma = std::make_shared(element::f32, shape_bn); - auto beta = std::make_shared(element::f32, 
shape_bn); - auto mean = std::make_shared(element::f32, shape_bn); - auto var = std::make_shared(element::f32, shape_bn); - - auto goe_bn = std::make_shared(group_conv, 0); - - // Adding a goe will stop fusion since the patterns wont expect to see this op - auto bn = - add_goe ? std::make_shared(goe_bn, gamma, beta, mean, var, eps) - : std::make_shared(group_conv, gamma, beta, mean, var, eps); - if (with_relu) - { - auto prelu = std::make_shared(bn); - auto f = make_shared(NodeVector{prelu}, - op::ParameterVector{input, weights, gamma, beta, mean, var}); - return f; - } - else - { - auto f = make_shared(NodeVector{bn}, - op::ParameterVector{input, weights, gamma, beta, mean, var}); - return f; - } -} - -void fuse_groupconv_batchnorm_helper(Shape shape_in, - Shape shape_weights, - Shape shape_r, - size_t groups) -{ - auto func_fuse = - gen_groupconv_batchnorm(false, false, shape_in, shape_weights, shape_r, groups); - auto func_fuse2 = - gen_groupconv_batchnorm(false, true, shape_in, shape_weights, shape_r, groups); - - { - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.run_passes(func_fuse); - ASSERT_EQ(count_ops_of_type(func_fuse), 1); - } - - { - // test groupconv + batchnorm + relu fusion - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.run_passes(func_fuse2); - ASSERT_EQ(count_ops_of_type(func_fuse2), 1); - ASSERT_EQ(count_ops_of_type(func_fuse2), 0); - } -} - -void groupconv_batchnorm_test_val_helper( - const bool with_relu, Shape shape_in, Shape shape_weights, Shape shape_r, size_t groups) -{ - shared_ptr fuse_func = - gen_groupconv_batchnorm(false, with_relu, shape_in, shape_weights, shape_r, groups); - shared_ptr nofuse_func = - gen_groupconv_batchnorm(true, with_relu, shape_in, shape_weights, shape_r, groups); - - test::Uniform rng(1.0f, 100.0f); - vector> args; - for (shared_ptr param : fuse_func->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); 
- args.push_back(tensor_val); - } - - auto fuse_results = execute(fuse_func, args, "CPU"); - auto nofuse_results = execute(nofuse_func, args, "CPU"); - - EXPECT_TRUE(test::all_close(fuse_results.at(0), nofuse_results.at(0))); -} - -TEST(cpu_fusion, fuse_groupconv_batchnorm1) -{ - Shape shape_in{1, 20, 5, 5}; - Shape shape_weights{8, 10, 3, 3}; - Shape shape_r{1, 8, 3, 3}; - fuse_groupconv_batchnorm_helper(shape_in, shape_weights, shape_r, 2); - groupconv_batchnorm_test_val_helper(false, shape_in, shape_weights, shape_r, 2); - groupconv_batchnorm_test_val_helper(true, shape_in, shape_weights, shape_r, 2); -} - -TEST(cpu_fusion, fuse_groupconv_batchnorm2) -{ - Shape shape_in{1, 20, 5, 5}; - Shape shape_weights{5, 4, 3, 3}; - Shape shape_r{1, 5, 3, 3}; - fuse_groupconv_batchnorm_helper(shape_in, shape_weights, shape_r, 5); - groupconv_batchnorm_test_val_helper(false, shape_in, shape_weights, shape_r, 5); - groupconv_batchnorm_test_val_helper(true, shape_in, shape_weights, shape_r, 5); -} - -TEST(cpu_fusion, fuse_groupconv_batchnorm3) -{ - Shape shape_in{1, 20, 5, 5}; - Shape shape_weights{20, 1, 3, 3}; - Shape shape_r{1, 20, 3, 3}; - fuse_groupconv_batchnorm_helper(shape_in, shape_weights, shape_r, 20); - groupconv_batchnorm_test_val_helper(false, shape_in, shape_weights, shape_r, 20); - groupconv_batchnorm_test_val_helper(true, shape_in, shape_weights, shape_r, 20); -} - -TEST(cpu_fusion, fuse_groupconv_batchnorm4) -{ - Shape shape_in{1, 20, 4, 4}; - Shape shape_weights{5, 20, 1, 1}; - Shape shape_r{1, 5, 4, 4}; - fuse_groupconv_batchnorm_helper(shape_in, shape_weights, shape_r, 1); - groupconv_batchnorm_test_val_helper(false, shape_in, shape_weights, shape_r, 1); - groupconv_batchnorm_test_val_helper(true, shape_in, shape_weights, shape_r, 1); -} - -std::vector> rnn_matrix_fusion_eval(const size_t time_steps, - const Shape& data_shape, - const Shape& weights_shape, - const Shape& bias_shape, - const vector& data_val, - const vector& weights_val, - const vector& 
bias_val, - const bool enable_pass) -{ - auto data = make_shared(element::f32, data_shape); - auto weights = make_shared(element::f32, weights_shape); - auto bias = make_shared(element::f32, bias_shape); - - // results from each time step - NodeVector results; - for (size_t t = 0; t < time_steps; ++t) - { - auto data_slice = make_shared( - data, Coordinate{0, t, 0}, Coordinate{data_shape[0], t + 1, data_shape[2]}); - auto data_reshape = make_shared( - data_slice, AxisVector{0, 1, 2}, Shape{data_shape[0], data_shape[2]}); - auto weights_reshape = make_shared( - weights, AxisVector{1, 0}, Shape{weights_shape[1], weights_shape[0]}); - auto dot = make_shared(data_reshape, weights_reshape); - auto bias_broadcast = make_shared(bias, dot->get_shape(), AxisSet{0}); - auto add = make_shared(dot, bias_broadcast); - results.push_back(add); - } - auto func = make_shared(results, op::ParameterVector{data, weights, bias}); - if (enable_pass) - { - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.register_pass( - runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); - pass_manager.run_passes(func); - // check all of our dot/add are converted to a single MatmulBias op. 
- size_t count = count_ops_of_type(func); - EXPECT_EQ(count, 1); - } - - auto backend = runtime::Backend::create("CPU"); - - shared_ptr data_tensor = - backend->create_tensor(element::f32, data->get_shape()); - shared_ptr weights_tensor = - backend->create_tensor(element::f32, weights->get_shape()); - shared_ptr bias_tensor = - backend->create_tensor(element::f32, bias->get_shape()); - - std::vector> result_tensors; - for (auto r : results) - { - result_tensors.push_back(backend->create_tensor(element::f32, r->get_shape())); - } - - copy_data(data_tensor, data_val); - copy_data(weights_tensor, weights_val); - copy_data(bias_tensor, bias_val); - backend->call_with_validate(func, result_tensors, {data_tensor, weights_tensor, bias_tensor}); - return result_tensors; -} - -TEST(cpu_fusion, rnn_matrix_fusion_eval_pass) -{ - const size_t time_steps = 4; - Shape data_shape{3, time_steps, 5}; - Shape weights_shape{6, data_shape[2]}; - Shape bias_shape{6}; - - test::Uniform rng{0, 1, 0}; - vector data_val(shape_size(data_shape)); - vector weights_val(shape_size(weights_shape)); - vector bias_val(shape_size(bias_shape)); - rng.initialize(data_val); - rng.initialize(weights_val); - rng.initialize(bias_val); - - std::vector> result_expected = rnn_matrix_fusion_eval( - time_steps, data_shape, weights_shape, bias_shape, data_val, weights_val, bias_val, false); - std::vector> result_fused = rnn_matrix_fusion_eval( - time_steps, data_shape, weights_shape, bias_shape, data_val, weights_val, bias_val, true); - for (size_t i = 0; i < result_expected.size(); ++i) - { - EXPECT_TRUE(test::all_close(result_expected[i], result_fused[i])); - } -} - -TEST(cpu_fusion, rnn_fusion_from_json_model) -{ - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.register_pass( - runtime::cpu::pass::CPUFusion::REGULAR_FUSIONS); - const string json_path = - file_util::path_join(SERIALIZED_ZOO, "mxnet/rnn-10-step-fusion-test.json"); - const string json_string = 
file_util::read_file_to_string(json_path); - stringstream ss(json_string); - shared_ptr func = ngraph::deserialize(ss); - pass_manager.run_passes(func); - const size_t NUM_STEPS = 10; - auto mmb_predicate = [](std::shared_ptr node) { - auto users = node->get_users(); - return users.size() == NUM_STEPS && - std::all_of(begin(users), end(users), [](std::shared_ptr n) { - return std::dynamic_pointer_cast(n) != nullptr; - }); - }; - - auto mmbs = get_ops_of_type(func); - ASSERT_TRUE(std::any_of(begin(mmbs), end(mmbs), mmb_predicate)); -} - -TEST(cpu_fusion, weight_fusion) -{ - auto param = std::make_shared(element::f32, Shape{64}); - auto reshape_conv = - std::make_shared(param, AxisVector{0}, Shape{16, 4, 1, 1}); - auto data_conv = std::make_shared(element::f32, Shape{16, 4, 7, 7}); - auto tvt = reshape_conv->get_outputs().at(0).get_tensor_ptr().get(); - auto lt_desc = std::make_shared(*tvt); - auto cvt_lt_conv = std::make_shared(reshape_conv, lt_desc); - auto conv = std::make_shared( - data_conv, cvt_lt_conv, Strides{1, 1}, Strides{1, 1}); - - auto reshape_conv_bprop = - std::make_shared(param, AxisVector{0}, Shape{16, 4, 1, 1}); - auto dummy_arg_conv_bprop = std::make_shared(element::f32, Shape{1, 16, 7, 7}); - auto tvt_bprop = reshape_conv_bprop->get_outputs().at(0).get_tensor_ptr().get(); - auto lt_desc_bprop = std::make_shared(*tvt_bprop); - auto cvt_lt_conv_bprop = - std::make_shared(reshape_conv_bprop, lt_desc_bprop); - auto conv_bprop = std::make_shared(Shape{1, 4, 7, 7}, - cvt_lt_conv_bprop, - dummy_arg_conv_bprop, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}); - - auto conv_relu = std::make_shared(conv); - auto conv_bprop_abs = std::make_shared(conv_bprop); - - auto f = make_shared(NodeVector{conv_relu, conv_bprop_abs}, - op::ParameterVector{param, data_conv, dummy_arg_conv_bprop}); - - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.run_passes(f); - - auto new_conv_bprop_data = 
conv_bprop_abs->get_argument(0); - auto new_convert_layout = new_conv_bprop_data->get_argument(0); - - ASSERT_EQ(std::dynamic_pointer_cast( - new_convert_layout->get_argument(0)), - cvt_lt_conv); -} - -TEST(cpu_fusion, max_pool_with_indices) -{ - Shape shape_a{10, 3, 28, 28}; - auto input = std::make_shared(element::f32, shape_a); - Shape window_shape{2, 2}; - auto max_pool = std::make_shared(input, window_shape); - auto C = std::make_shared(element::f32, max_pool->get_shape()); - - ngraph::autodiff::Adjoints adjoints(NodeVector{max_pool}, NodeVector{C}); - - auto dinput = adjoints.backprop_node(input); - - auto df = std::make_shared(NodeVector{dinput}, op::ParameterVector{input, C}); - - auto f = std::make_shared(NodeVector{max_pool}, op::ParameterVector{input}); - - { - pass::Manager pass_manager; - pass_manager.register_pass("max_pool_fprop_before.pdf"); - pass_manager.run_passes(f); - } - - { - NodeVector nv_cwi; - pass::Manager pass_manager; - pass_manager.register_pass("max_pool_bprop_before.pdf"); - pass_manager.register_pass(nv_cwi); - pass_manager.register_pass("max_pool_bprop_after.pdf"); - pass_manager.run_passes(df); - } - - { - pass::Manager pass_manager; - pass_manager.register_pass("max_pool_fprop_after.pdf"); - pass_manager.run_passes(f); - } - - auto maxpool_goe_output = - std::dynamic_pointer_cast(f->get_results().at(0)->get_argument(0)); - ASSERT_TRUE(maxpool_goe_output); - ASSERT_EQ(maxpool_goe_output->get_n(), 0); - auto maxpool_with_indices = df->get_results().at(0)->get_argument(0); - auto maxpool_goe_indices = - std::dynamic_pointer_cast(maxpool_with_indices->get_argument(2)); - ASSERT_TRUE(maxpool_goe_indices); - ASSERT_EQ(maxpool_goe_indices->get_n(), 1); -} - -TEST(cpu_fusion, backwards_maxpool_with_indices_n4_c1_hw4_2x2_max) -{ - Shape shape_a{1, 4, 4, 4}; - Shape maxpool_shape{1, 4, 3, 3}; - auto A = std::make_shared(element::f32, shape_a); - Shape window_shape{2, 2}; - auto window_movement_strides = Strides{1, 1}; - auto maxpool = 
std::make_shared(A, window_shape, window_movement_strides); - auto f = std::make_shared(maxpool, op::ParameterVector{A}); - - auto backend = runtime::Backend::create("CPU"); - shared_ptr ep = backend->create_tensor(element::f32, maxpool_shape); - vector dataEp(shape_size(maxpool_shape), 4); - - shared_ptr input = backend->create_tensor(element::f32, shape_a); - shared_ptr output = backend->create_tensor(element::f32, shape_a); - - vector dataInput{11.f, 31.f, 40.f, 47.f, 13.f, 61.f, 48.f, 59.f, 17.f, 39.f, 64.f, - 62.f, 45.f, 55.f, 36.f, 19.f, 65.f, 33.f, 49.f, 30.f, 56.f, 41.f, - 53.f, 58.f, 22.f, 35.f, 52.f, 50.f, 63.f, 54.f, 12.f, 26.f, 44.f, - 21.f, 69.f, 24.f, 46.f, 25.f, 51.f, 29.f, 72.f, 15.f, 73.f, 10.f, - 16.f, 37.f, 70.f, 32.f, 28.f, 66.f, 57.f, 27.f, 60.f, 42.f, 43.f, - 71.f, 18.f, 38.f, 67.f, 68.f, 14.f, 20.f, 34.f, 23.f}; - - vector expected{0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 12.0f, 0.0f, 4.0f, 0.0f, 0.0f, 16.0f, - 0.0f, 0.0f, 4.0f, 0.0f, 0.0f, 4.0f, 0.0f, 0.0f, 0.0f, 4.0f, 0.0f, - 8.0f, 8.0f, 0.0f, 0.0f, 4.0f, 0.0f, 4.0f, 4.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 8.0f, 0.0f, 4.0f, 0.0f, 0.0f, 0.0f, 8.0f, 0.0f, 16.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 8.0f, 0.0f, 0.0f, 4.0f, 0.0f, 0.0f, - 8.0f, 0.0f, 4.0f, 8.0f, 4.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - - copy_data(ep, dataEp); - copy_data(input, dataInput); - - auto C = std::make_shared(element::f32, maxpool_shape); - auto df = autodiff::backprop_function(f); - - { - NodeVector nv_cwi; - pass::Manager pass_manager; - pass_manager.register_pass("max_pool_bprop_before2.pdf"); - pass_manager.register_pass(nv_cwi); - pass_manager.register_pass("max_pool_bprop_after2.pdf"); - pass_manager.run_passes(df); - } - - backend->call_with_validate(df, {output}, {input, ep}); - ASSERT_TRUE(read_vector(output) == expected); -} - -#if 0 -TEST(cpu_fusion, loop_kernel_one_input_one_output) -{ - Shape shapeA{2, 2}; - auto A = make_shared(element::i32, shapeA); - auto neg_a = make_shared(A); - auto lk = make_shared( - NodeVector{neg_a}, 
NodeVector{neg_a}, NodeVector{A}); - auto f = make_shared(NodeVector{lk}, op::ParameterVector{A}); - - auto backend = runtime::Backend::create("CPU"); - shared_ptr a = backend->create_tensor(element::i32, shapeA); - shared_ptr result = backend->create_tensor(element::i32, shapeA); - - vector dataA{1, 4, 1, 4}; - copy_data(a, dataA); - vector expected{-1, -4, -1, -4}; - - backend->call_with_validate(f, {result}, {a}); - - EXPECT_EQ(read_vector(result), expected); -} - -TEST(cpu_fusion, loop_kernel_embedded_graph) -{ - Shape shapeA{2, 2}; - auto A = make_shared(element::i32, shapeA); - auto B = make_shared(element::i32, shapeA); - auto neg_a = make_shared(A); - auto neg_b = make_shared(B); - auto add = neg_a + neg_b; - auto lk = make_shared( - NodeVector{add}, NodeVector{add}, NodeVector{neg_a, neg_b}); - auto f = make_shared(NodeVector{lk}, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("CPU"); - shared_ptr a = backend->create_tensor(element::i32, shapeA); - shared_ptr b = backend->create_tensor(element::i32, shapeA); - shared_ptr result = backend->create_tensor(element::i32, shapeA); - - vector dataA{1, 4, 1, 4}; - copy_data(a, dataA); - vector dataB{1, 2, 3, 4}; - copy_data(b, dataB); - vector expected{-2, -6, -4, -8}; - backend->call_with_validate(f, {result}, {a, b}); - EXPECT_EQ(read_vector(result), expected); -} - -TEST(cpu_fusion, loop_kernel_two_inputs_one_output) -{ - Shape shapeA{2, 2}; - auto A = make_shared(element::i32, shapeA); - auto B = make_shared(element::i32, shapeA); - auto add = A + B; - auto lk = make_shared( - NodeVector{add}, NodeVector{add}, NodeVector{A, B}); - auto f = make_shared(NodeVector{lk}, op::ParameterVector{A, B}); - - auto backend = runtime::Backend::create("CPU"); - shared_ptr a = backend->create_tensor(element::i32, shapeA); - shared_ptr b = backend->create_tensor(element::i32, shapeA); - shared_ptr result = backend->create_tensor(element::i32, shapeA); - - vector dataA{1, 4, 1, 4}; - copy_data(a, 
dataA); - vector dataB{1, 2, 3, 4}; - copy_data(b, dataB); - vector expected{2, 6, 4, 8}; - - backend->call_with_validate(f, {result}, {a, b}); - - EXPECT_EQ(read_vector(result), expected); -} - -TEST(cpu_fusion, loop_kernel_multiple_outputs) -{ - Shape shapeA{2, 2}; - auto A = make_shared(element::i32, shapeA); - auto B = make_shared(element::i32, shapeA); - auto C = make_shared(element::i32, shapeA); - auto D = make_shared(element::i32, shapeA); - - auto neg_a = make_shared(A); - auto neg_b = make_shared(B); - auto add_ab = neg_a + neg_b; - auto add_cd = C + B; - auto add_cd_abs = make_shared(add_cd); - auto add_ab_abs = make_shared(add_ab); - auto add_aab = add_ab_abs + A; - auto add_cdd = add_cd_abs + D; - - auto lk = make_shared( - NodeVector{neg_a, neg_b, add_ab, add_cd, add_cd_abs, add_ab_abs, add_aab, add_cdd}, - NodeVector{add_aab, add_cdd, neg_b}, - NodeVector{A, B, C, D}); - auto add_aab_goe = std::make_shared(lk, 0); - auto add_cdd_goe = std::make_shared(lk, 1); - auto neg_b_goe = std::make_shared(lk, 2); - - auto f = make_shared(NodeVector{add_aab_goe, add_cdd_goe, neg_b_goe}, - op::ParameterVector{A, B, C, D}); - - auto backend = runtime::Backend::create("CPU"); - - shared_ptr a = backend->create_tensor(element::i32, shapeA); - shared_ptr b = backend->create_tensor(element::i32, shapeA); - shared_ptr c = backend->create_tensor(element::i32, shapeA); - shared_ptr d = backend->create_tensor(element::i32, shapeA); - shared_ptr r1 = backend->create_tensor(element::i32, shapeA); - shared_ptr r2 = backend->create_tensor(element::i32, shapeA); - shared_ptr r3 = backend->create_tensor(element::i32, shapeA); - - vector dataA{1, 4, 1, 4}; - vector dataB{3, 3, 3, 9}; - vector dataC{1, 2, 3, 4}; - vector dataD{-2, 2, -1, 1}; - copy_data(a, dataA); - copy_data(b, dataB); - copy_data(c, dataC); - copy_data(d, dataD); - - backend->call_with_validate(f, {r1, r2, r3}, {a, b, c, d}); - - vector expected1{5, 11, 5, 17}; - vector expected2{2, 7, 5, 14}; - vector 
expected3{-3, -3, -3, -9}; - EXPECT_EQ(read_vector(r1), expected1); - EXPECT_EQ(read_vector(r2), expected2); - EXPECT_EQ(read_vector(r3), expected3); -} - -TEST(cpu_fusion, loop_kernel_copy_with_new_args) -{ - Shape shapeA{2, 2}; - auto A = make_shared(element::i32, shapeA); - auto B = make_shared(element::i32, shapeA); - auto C = make_shared(element::i32, shapeA); - auto D = make_shared(element::i32, shapeA); - - auto neg_a = make_shared(A); - auto neg_b = make_shared(B); - auto add_ab = neg_a + neg_b; - auto add_cd = C + B; - auto add_cd_abs = make_shared(add_cd); - auto add_ab_abs = make_shared(add_ab); - auto add_aab = add_ab_abs + A; - auto add_cdd = add_cd_abs + D; - - auto lk = make_shared( - NodeVector{neg_a, neg_b, add_ab, add_cd, add_cd_abs, add_ab_abs, add_aab, add_cdd}, - NodeVector{add_aab, add_cdd, neg_b}, - NodeVector{A, B, C, D}); - auto add_aab_goe = std::make_shared(lk, 0); - auto add_cdd_goe = std::make_shared(lk, 1); - auto neg_b_goe = std::make_shared(lk, 2); - - auto f = make_shared(NodeVector{add_aab_goe, add_cdd_goe, neg_b_goe}, - op::ParameterVector{A, B, C, D}); - - auto copy_f = clone_function(*f); - - auto backend = runtime::Backend::create("CPU"); - - shared_ptr a = backend->create_tensor(element::i32, shapeA); - shared_ptr b = backend->create_tensor(element::i32, shapeA); - shared_ptr c = backend->create_tensor(element::i32, shapeA); - shared_ptr d = backend->create_tensor(element::i32, shapeA); - shared_ptr r1 = backend->create_tensor(element::i32, shapeA); - shared_ptr r2 = backend->create_tensor(element::i32, shapeA); - shared_ptr r3 = backend->create_tensor(element::i32, shapeA); - shared_ptr copy_r1 = backend->create_tensor(element::i32, shapeA); - shared_ptr copy_r2 = backend->create_tensor(element::i32, shapeA); - shared_ptr copy_r3 = backend->create_tensor(element::i32, shapeA); - - vector dataA{1, 4, 1, 4}; - vector dataB{3, 3, 3, 9}; - vector dataC{1, 2, 3, 4}; - vector dataD{-2, 2, -1, 1}; - copy_data(a, dataA); - 
copy_data(b, dataB); - copy_data(c, dataC); - copy_data(d, dataD); - - backend->call_with_validate(f, {r1, r2, r3}, {a, b, c, d}); - backend->call_with_validate(copy_f, {copy_r1, copy_r2, copy_r3}, {a, b, c, d}); - - EXPECT_EQ(read_vector(r1), read_vector(copy_r1)); - EXPECT_EQ(read_vector(r2), read_vector(copy_r2)); - EXPECT_EQ(read_vector(r3), read_vector(copy_r3)); -} - -#endif - -static std::shared_ptr make_forward_function() -{ - Shape shape_a{10, 3, 28, 28}; - auto input = std::make_shared(element::f32, shape_a); - Shape window_shape{2, 2}; - auto max_pool = std::make_shared(input, window_shape); - auto neg = std::make_shared(max_pool); - auto absn = std::make_shared(max_pool); - return std::make_shared(NodeVector{max_pool, neg, absn}, op::ParameterVector{input}); -} - -static std::pair, std::vector>> - make_backward_function(std::shared_ptr f) -{ - // get parameters - std::vector> back_parameters = f->get_parameters(); - - ngraph::NodeVector adjoints; - ngraph::NodeVector outputs; - for (auto Y : f->get_results()) - { - // Get the output - // Create the Adjoint - auto C = std::make_shared(Y->get_element_type(), Y->get_shape()); - outputs.push_back(Y); - adjoints.push_back(C); - } - - ngraph::autodiff::Adjoints adjoint{outputs, adjoints}; - - // Perform autodiff - std::vector> dYdXs(back_parameters.size()); - transform(back_parameters.begin(), - back_parameters.end(), - dYdXs.begin(), - [&adjoint](const std::shared_ptr& X) { return adjoint.backprop_node(X); }); - - // create the backward function - std::vector> param_adjoints; - for (auto n : adjoints) - param_adjoints.push_back(std::dynamic_pointer_cast(n)); - back_parameters.insert(back_parameters.begin(), param_adjoints.begin(), param_adjoints.end()); - - return {std::make_shared(dYdXs, back_parameters), adjoints}; -} - -void optimize_graph(std::shared_ptr& f, std::shared_ptr bf) -{ - // start by removing excess reshapes - NodeVector nv_cwi; - ngraph::pass::Manager pass_manager; - 
pass_manager.register_pass(); - pass_manager.register_pass(); - pass_manager.register_pass(nv_cwi); - pass_manager.register_pass("before.fprop_cache.pdf"); - - pass_manager.run_passes(f); - pass_manager.run_passes(bf); - if (nv_cwi.size() > 0) - { - NodeVector new_outputs; - for (auto r : f->get_results()) - { - new_outputs.push_back(r->get_argument(0)); - } - - new_outputs.insert(new_outputs.end(), nv_cwi.begin(), nv_cwi.end()); - f = std::make_shared(new_outputs, f->get_parameters()); - } - - ngraph::NodeVector dYdXs; - for (size_t i = 0; i < bf->get_output_size(); ++i) - { - dYdXs.push_back(bf->get_output_op(i)->get_argument(0)); - } - - ngraph::NodeVector combined_outputs; - for (auto r : f->get_results()) - { - combined_outputs.push_back(r->get_argument(0)); - } - - combined_outputs.insert(combined_outputs.end(), dYdXs.begin(), dYdXs.end()); - - std::vector> combined_parameters = f->get_parameters(); - std::vector> back_parameters = bf->get_parameters(); - - combined_parameters.insert( - combined_parameters.end(), back_parameters.begin(), back_parameters.end()); - auto combinedf = std::make_shared(combined_outputs, combined_parameters); - // rerun Reshape elimination to help simplify the graph again, run CPUFusion - // this replaces nodes in both f and bf due to shared-ptr - ness - ngraph::pass::Manager pass_manager_comb; - pass_manager_comb.register_pass(); - pass_manager_comb.register_pass(); - pass_manager_comb.run_passes(combinedf); -} - -TEST(cpu_fusion, maxpool_with_indices_in_mxnet) -{ - auto f = make_forward_function(); - auto bfa = make_backward_function(f); - auto maybe_bf = bfa.first; - auto adjoints = bfa.second; - optimize_graph(f, maybe_bf); - auto fprop_cache = ngraph::cache_fprop(f, maybe_bf); - - auto mpwi_bprop = fprop_cache.bprop->get_results().at(0)->get_argument(0); - ASSERT_TRUE(std::dynamic_pointer_cast(mpwi_bprop->get_argument(0))); - ASSERT_TRUE(std::dynamic_pointer_cast(mpwi_bprop->get_argument(2))); -} - -TEST(cpu_fusion, 
conv_batch_norm_folding) -{ - Shape shape_input{1, 8, 3, 3}; - Shape shape_weights{2, 8, 1, 1}; - Shape shape_norm{2}; - - auto make_function = [shape_input, shape_weights, shape_norm]() { - auto input = std::make_shared(element::f32, shape_input); - auto weights = std::make_shared(element::f32, shape_weights); - double eps = 0.001; - auto gamma = std::make_shared(element::f32, shape_norm); - auto beta = std::make_shared(element::f32, shape_norm); - auto mean = std::make_shared(element::f32, shape_norm); - auto var = std::make_shared(element::f32, shape_norm); - auto conv = std::make_shared(input, weights, Strides{1, 1}, Strides{1, 1}); - auto bn = std::make_shared(conv, gamma, beta, mean, var, eps); - auto f = make_shared(NodeVector{bn}, - op::ParameterVector{input, weights, gamma, beta, mean, var}); - return f; - }; - - auto int_f = make_function(); - auto cpu_f = make_function(); - - vector> args{ - {1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f, 7.25f, 8.25f, -1.25f, - -1.25f, 1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f, -4.25f, 2.25f, - 4.25f, 4.25f, 0.f, 0.f, -1.f, 0.f, 2.f, 2.f, 0.f, 0.f, 0.f, - 0.f, 2.f, 2.f, 1.25f, 2.25f, 5.25f, 6.25f, 1.25f, 1.25f, 3.25f, 4.25f, - -7.25f, 8.25f, 1.25f, -1.25f, -1.25f, 2.25f, 3.25f, 2.25f, -4.25f, -4.25f, -1.25f, - -2.25f, 4.25f, 2.25f, 4.25f, 4.25f, 0.f, 0.f, 1.f, 0.f, -2.f, 2.f, - 0.f, 0.f, 0.f, 0.f, -2.f, -2.f}, - {1.25f, - 2.25f, - 5.25f, - 6.25f, - -1.25f, - -1.25f, - 3.25f, - -4.25f, - 7.25f, - 8.25f, - -1.25f, - 0.f, - 0.f, - 0.f, - 0.f, - -2.f}, - {-0.9384f, 0.01875f}, - {11.0f, 1.3f}, - {0.12f, 0.31f}, - {0.01f, 0.11f}, - }; - - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); -} - -TEST(cpu_fusion, convbias_batch_norm_folding) -{ - Shape shape_input{2, 8, 5, 5}; - Shape shape_weights{2, 8, 2, 2}; - Shape shape_norm{2}; - - auto make_function = [shape_input, 
shape_weights, shape_norm]() { - auto input = std::make_shared(element::f32, shape_input); - auto weights = std::make_shared(element::f32, shape_weights); - auto bias = std::make_shared(element::f32, Shape{2}); - double eps = 1.01; - auto gamma = std::make_shared(element::f32, shape_norm); - auto beta = std::make_shared(element::f32, shape_norm); - auto mean = std::make_shared(element::f32, shape_norm); - auto var = std::make_shared(element::f32, shape_norm); - auto conv = std::make_shared(input, weights, Strides{1, 1}, Strides{1, 1}); - auto convbias = - conv + std::make_shared(bias, conv->get_shape(), AxisSet{0, 2, 3}); - auto bn = std::make_shared(convbias, gamma, beta, mean, var, eps); - auto f = make_shared( - NodeVector{bn}, op::ParameterVector{input, weights, bias, gamma, beta, mean, var}); - return f; - }; - - auto int_f = make_function(); - auto cpu_f = make_function(); - - test::Uniform rng(1.0f, 100.0f); - vector> args; - for (shared_ptr param : cpu_f->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - args.push_back(tensor_val); - } - - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); -} - -TEST(cpu_fusion, conv_affine_folding) -{ - Shape shape_input{1, 8, 3, 3}; - Shape shape_weights{2, 8, 1, 1}; - Shape shape_norm{2}; - - auto make_function = [shape_input, shape_weights, shape_norm]() { - auto input = std::make_shared(element::f32, shape_input); - auto weights = std::make_shared(element::f32, shape_weights); - - auto a = std::make_shared(element::f32, shape_norm); - auto b = std::make_shared(element::f32, shape_norm); - auto conv = std::make_shared(input, weights, Strides{1, 1}, Strides{1, 1}); - auto out = std::make_shared( - std::make_shared( - conv, std::make_shared(a, conv->get_shape(), AxisSet{0, 2, 3})), - std::make_shared(b, conv->get_shape(), AxisSet{0, 2, 3})); - 
auto f = make_shared(NodeVector{out}, op::ParameterVector{input, weights, a, b}); - return f; - }; - - auto int_f = make_function(); - auto cpu_f = make_function(); - - vector> args{ - {1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f, 7.25f, 8.25f, -1.25f, - -1.25f, 1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f, -4.25f, 2.25f, - 4.25f, 4.25f, 0.f, 0.f, -1.f, 0.f, 2.f, 2.f, 0.f, 0.f, 0.f, - 0.f, 2.f, 2.f, 1.25f, 2.25f, 5.25f, 6.25f, 1.25f, 1.25f, 3.25f, 4.25f, - -7.25f, 8.25f, 1.25f, -1.25f, -1.25f, 2.25f, 3.25f, 2.25f, -4.25f, -4.25f, -1.25f, - -2.25f, 4.25f, 2.25f, 4.25f, 4.25f, 0.f, 0.f, 1.f, 0.f, -2.f, 2.f, - 0.f, 0.f, 0.f, 0.f, -2.f, -2.f}, - {1.25f, - 2.25f, - 5.25f, - 6.25f, - -1.25f, - -1.25f, - 3.25f, - -4.25f, - 7.25f, - 8.25f, - -1.25f, - 0.f, - 0.f, - 0.f, - 0.f, - -2.f}, - {-0.9384f, 0.01875f}, - {11.0f, 1.3f}, - }; - - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); -} - -TEST(cpu_fusion, convbias_affine_folding) -{ - Shape shape_input{1, 6, 3, 3}; - Shape shape_weights{3, 6, 1, 1}; - Shape shape_norm{3}; - - auto make_function = [shape_input, shape_weights, shape_norm]() { - auto input = std::make_shared(element::f32, shape_input); - auto weights = std::make_shared(element::f32, shape_weights); - auto bias = std::make_shared(element::f32, Shape{3}); - - auto a = std::make_shared(element::f32, shape_norm); - auto b = std::make_shared(element::f32, shape_norm); - auto conv = std::make_shared(input, weights, Strides{1, 1}, Strides{1, 1}); - auto convbias = - conv + std::make_shared(bias, conv->get_shape(), AxisSet{0, 2, 3}); - auto out = std::make_shared( - std::make_shared( - convbias, std::make_shared(a, conv->get_shape(), AxisSet{0, 2, 3})), - std::make_shared(b, conv->get_shape(), AxisSet{0, 2, 3})); - auto f = - make_shared(NodeVector{out}, op::ParameterVector{input, weights, bias, a, b}); - return f; - 
}; - - auto int_f = make_function(); - auto cpu_f = make_function(); - - test::Uniform rng(20.0f, 300.0f); - vector> args; - for (shared_ptr param : cpu_f->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - args.push_back(tensor_val); - } - - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0))); -} - -TEST(cpu_fusion, group_convolution_fusion) -{ - Shape shape_a{1, 32, 2, 2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{2, 16, 1, 1}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{1, 2, 2, 2}; - - auto a_slice0 = std::make_shared(A, Coordinate{0, 0, 0, 0}, Coordinate{1, 16, 2, 2}); - auto a_slice1 = - std::make_shared(A, Coordinate{0, 16, 0, 0}, Coordinate{1, 32, 2, 2}); - - auto b_slice0 = std::make_shared(B, Coordinate{0, 0, 0, 0}, Coordinate{1, 16, 1, 1}); - auto b_slice1 = std::make_shared(B, Coordinate{1, 0, 0, 0}, Coordinate{2, 16, 1, 1}); - - auto conv_lower = make_shared(a_slice0, - b_slice0, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}); - - auto conv_upper = make_shared(a_slice1, - b_slice1, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}); - - auto concat = make_shared(NodeVector{conv_lower, conv_upper}, 1); - - auto f = make_shared(NodeVector{concat}, op::ParameterVector{A, B}); - pass::Manager pass_manager; - pass_manager.register_pass("before_group.pdf"); - pass_manager.register_pass(); - pass_manager.register_pass("after_group.pdf"); - pass_manager.run_passes(f); - auto gc = - std::dynamic_pointer_cast(f->get_results().at(0)->get_argument(0)); - ASSERT_TRUE(gc); -} - -TEST(cpu_fusion, group_convolution) -{ - auto backend = runtime::Backend::create("CPU"); - test::Uniform rng(2.0f, 10.0f); - - const size_t GROUPS = 2; - Shape shape_a{1, 32, 2, 
2}; - auto A = make_shared(element::f32, shape_a); - Shape shape_b{2, 16, 1, 1}; - auto B = make_shared(element::f32, shape_b); - Shape shape_r{1, 2, 2, 2}; - auto group_conv = make_shared(A, - B, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}, - GROUPS, - shape_r); - - Shape shape_c{1, 16, 2, 2}; - auto C = make_shared(element::f32, shape_c); - Shape shape_d{1, 16, 1, 1}; - auto D = make_shared(element::f32, shape_d); - auto conv_lower = make_shared(C, - D, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}); - - auto E = make_shared(element::f32, shape_c); - auto F = make_shared(element::f32, shape_d); - auto conv_upper = make_shared(E, - F, - Strides{1, 1}, - Strides{1, 1}, - CoordinateDiff{0, 0}, - CoordinateDiff{0, 0}, - Strides{1, 1}); - - auto f = make_shared(NodeVector{group_conv, conv_lower, conv_upper}, - op::ParameterVector{A, B, C, D, E, F}); - - auto a_ = rng.initialize(backend->create_tensor(element::f32, shape_a)); - auto b_ = rng.initialize(backend->create_tensor(element::f32, shape_b)); - - vector rv(shape_size(shape_r), 0); - auto group_result = std::dynamic_pointer_cast( - backend->create_tensor(element::f32, shape_r, rv.data())); - - auto av = read_vector(a_); - auto bv = read_vector(b_); - auto c_ = backend->create_tensor(element::f32, shape_c, av.data()); // lower data - auto d_ = backend->create_tensor(element::f32, shape_d, bv.data()); // upper data - - auto e_ = - backend->create_tensor(element::f32, shape_c, av.data() + av.size() / 2); // lower weights - auto f_ = - backend->create_tensor(element::f32, shape_d, bv.data() + bv.size() / 2); // upper weights - - Shape shape_ur{1, 1, 2, 2}; - // allocate a contigious storage for both lower and upper halves. 
- vector erv(shape_size(shape_r), 0); - auto lower_result = std::dynamic_pointer_cast( - backend->create_tensor(element::f32, shape_ur, erv.data())); - auto upper_result = std::dynamic_pointer_cast( - backend->create_tensor(element::f32, shape_ur, erv.data() + erv.size() / 2)); - backend->call_with_validate( - f, {group_result, lower_result, upper_result}, {a_, b_, c_, d_, e_, f_}); - ASSERT_EQ(rv, erv); -} - -//TODO(Pruthvi) enable this test after MKLDNN RNN bug is fixed -#if 0 -TEST(cpu_fusion, rnn_fprop_1_lstm_cell) -{ - auto src_layer = make_shared(element::f32, Shape{10, 100}); - auto src_iter = make_shared(element::f32, Shape{20, 100}); - auto weights_layer = make_shared(element::f32, Shape{400, 100}); - auto weights_iter = make_shared(element::f32, Shape{400, 100}); - auto biases = make_shared(element::f32, Shape{400}); - const int number_of_timesteps = 1; - const int number_of_gates_per_cell = 4; - const int src_seq_length = 1; - const int src_layer_feature_size = 100; - const int feature_size = 100; - const int num_rnn_cell_states = 2; - const int rnn_direction = 1; - const int num_of_rnn_fused_layer = 1; - auto rnn_node = make_shared(src_layer, - src_iter, - weights_layer, - weights_iter, - biases, - number_of_timesteps, - number_of_gates_per_cell, - src_seq_length, - src_layer_feature_size, - feature_size, - num_rnn_cell_states, - rnn_direction, - num_of_rnn_fused_layer); - auto rnn_ht_output = make_shared(rnn_node, 0); - auto rnn_ct_output = make_shared(rnn_node, 1); - - auto func = make_shared( - NodeVector{rnn_ht_output, rnn_ct_output}, - op::ParameterVector{src_layer, src_iter, weights_layer, weights_iter, biases}); - auto backend = runtime::Backend::create("CPU"); - - shared_ptr src_layer_t = - backend->create_tensor(element::f32, src_layer->get_shape()); - shared_ptr src_iter_t = - backend->create_tensor(element::f32, src_iter->get_shape()); - shared_ptr weights_layer_t = - backend->create_tensor(element::f32, weights_layer->get_shape()); - 
shared_ptr weights_iter_t = - backend->create_tensor(element::f32, weights_iter->get_shape()); - shared_ptr biases_t = - backend->create_tensor(element::f32, biases->get_shape()); - shared_ptr result_ht = backend->create_tensor(element::f32, {10, 100}); - shared_ptr result_ct = - backend->create_tensor(element::f32, Shape{20, 100}); - - copy_data(src_layer_t, vector(1000, 1)); - copy_data(src_iter_t, vector(2000, 1)); - copy_data(weights_layer_t, vector(400 * 100, 1)); - copy_data(weights_iter_t, vector(400 * 100, 1)); - copy_data(biases_t, vector(400, 1)); - - backend->call_with_validate( - func, - {result_ht, result_ct}, - {src_layer_t, src_iter_t, weights_layer_t, weights_iter_t, biases_t}); - vector expected_ht(10 * 100, 0.964028f); - vector expected_ct; - for (size_t i = 0; i < 20 * 100; i++) - { - if (i < 1000) - { - expected_ct.push_back(0.964028f); - } - else - { - expected_ct.push_back(2.0f); - } - } - - EXPECT_TRUE(test::all_close(expected_ht, read_vector(result_ht))); - EXPECT_TRUE(test::all_close(expected_ct, read_vector(result_ct))); -} -#endif - -TEST(cpu_fusion, fuse_lstm_cells) -{ - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.register_pass(); - const string json_path = - file_util::path_join(SERIALIZED_ZOO, "mxnet/2rnn_layer_3lstm_cell.json"); - const string json_string = file_util::read_file_to_string(json_path); - stringstream ss(json_string); - shared_ptr func = ngraph::deserialize(ss); - pass_manager.run_passes(func); - auto lstm_ops = get_ops_of_type(func); - EXPECT_EQ(lstm_ops.size(), 6); -} - -TEST(cpu_fusion, fuse_2_layer_rnn) -{ - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.register_pass(); - const string json_path = - file_util::path_join(SERIALIZED_ZOO, "mxnet/2rnn_layer_3lstm_cell.json"); - const string json_string = file_util::read_file_to_string(json_path); - stringstream ss(json_string); - shared_ptr func = ngraph::deserialize(ss); - pass_manager.run_passes(func); - size_t 
count = count_ops_of_type(func); - auto rnn_ops = get_ops_of_type(func); - EXPECT_EQ(rnn_ops.size(), count); - for (auto& node : rnn_ops) - { - EXPECT_EQ(node->get_num_timesteps(), node->get_src_sequence_length()); - EXPECT_EQ(node->get_num_cell_states(), node->get_argument(1)->get_arguments().size()); - } -} - -TEST(cpu_fusion, fuse_1_layer_rnn) -{ - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.register_pass(); - const string json_path = - file_util::path_join(SERIALIZED_ZOO, "mxnet/1rnn_layer_3lstm_cell.json"); - const string json_string = file_util::read_file_to_string(json_path); - stringstream ss(json_string); - shared_ptr func = ngraph::deserialize(ss); - pass_manager.run_passes(func); - size_t count = count_ops_of_type(func); - auto rnn_ops = get_ops_of_type(func); - EXPECT_EQ(rnn_ops.size(), 1); - EXPECT_EQ(rnn_ops.size(), count); - for (auto& node : rnn_ops) - { - EXPECT_EQ(node->get_num_timesteps(), node->get_src_sequence_length()); - EXPECT_EQ(node->get_num_cell_states(), node->get_argument(1)->get_arguments().size()); - } -} - -static std::shared_ptr make_function(const std::string& file_name) -{ - const string json_path = file_util::path_join(SERIALIZED_ZOO, file_name); - const string json_string = file_util::read_file_to_string(json_path); - stringstream ss(json_string); - shared_ptr func = ngraph::deserialize(ss); - return func; -} - -TEST(cpu_fusion, rnn_fusion_inter_vs_cpu_1lstm_cell) -{ - const std::string file_name("mxnet/1_lstm_cell_forward.json"); - auto cpu_f = make_function(file_name); - auto int_f = make_function(file_name); - test::Uniform rng(0.0f, 1.0f); - vector> args; - - for (shared_ptr param : int_f->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - args.push_back(tensor_val); - } - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - for (size_t i = 0; i < cpu_results.size(); i++) - { - 
EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); - } -} - -TEST(cpu_fusion, rnn_fusion_inter_vs_cpu_1rnn_layer_3lstm_cell) -{ - const std::string file_name("mxnet/1rnn_layer_3lstm_cell.json"); - auto cpu_f = make_function(file_name); - auto int_f = make_function(file_name); - test::Uniform rng(0.0f, 1.0f); - vector> args; - - for (shared_ptr param : int_f->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - args.push_back(tensor_val); - } - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - for (size_t i = 0; i < cpu_results.size(); i++) - { - EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); - } -} - -TEST(cpu_fusion, rnn_fusion_inter_vs_cpu_2rnn_layer_3lstm_cell) -{ - const std::string file_name("mxnet/2rnn_layer_3lstm_cell.json"); - auto cpu_f = make_function(file_name); - auto int_f = make_function(file_name); - test::Uniform rng(0.0f, 1.0f); - vector> args; - - for (shared_ptr param : int_f->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - args.push_back(tensor_val); - } - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - for (size_t i = 0; i < cpu_results.size(); i++) - { - EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); - } -} - -#if 0 - -TEST(cpu_fusion, loop_kernel_fusion_multiple_groups_pruned) -{ - auto make_function = []() -> std::shared_ptr { - Shape shape{}; - auto a = make_shared(element::f32, shape); - auto b = make_shared(element::f32, shape); - auto c = make_shared(element::f32, shape); - auto add_ab = a + b; - auto add_abs = std::make_shared(add_ab); - auto abs_neg = std::make_shared(add_abs); - auto sub_c_neg = c - abs_neg; - - auto d = make_shared(element::f32, shape); - auto d_abs = std::make_shared(d); - auto 
add_d = d_abs + add_ab; - auto neg_d = std::make_shared(add_d); - - auto mul_cd = neg_d * sub_c_neg; - auto f = - std::make_shared(ngraph::NodeVector{mul_cd}, op::ParameterVector{a, b, c, d}); - - return f; - }; - - pass::Manager pass_manager; - pass_manager.register_pass(3); - auto cpu_f = make_function(); - auto int_f = make_function(); - pass_manager.run_passes(cpu_f); - test::Uniform rng(-100.0f, 100.0f); - vector> args; - - size_t lkn = count_ops_of_type(cpu_f); - ASSERT_GT(lkn, 0); - - for (shared_ptr param : cpu_f->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - args.push_back(tensor_val); - } - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - for (size_t i = 0; i < cpu_results.size(); i++) - { - EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); - } -} - -TEST(cpu_fusion, loop_kernel_fusion_bounded_relu) -{ - auto make_function = []() -> std::shared_ptr { - Shape shape{}; - auto a = make_shared(element::f32, shape); - auto relu = make_shared(a); - auto upper_bound = - op::Constant::create(element::f32, shape, std::vector{6.0f}); - auto minn = make_shared(relu, upper_bound); - auto absn = make_shared(minn); - auto negn = std::make_shared(absn); - - auto f = std::make_shared(ngraph::NodeVector{negn}, op::ParameterVector{a}); - - return f; - }; - - pass::Manager pass_manager; - pass_manager.register_pass("before_relu_fusion.pdf"); - pass_manager.register_pass(3); - pass_manager.register_pass("after_relu_fusion.pdf"); - auto cpu_f = make_function(); - auto int_f = make_function(); - pass_manager.run_passes(cpu_f); - test::Uniform rng(-100.0f, 100.0f); - vector> args; - - size_t lkn = count_ops_of_type(cpu_f); - ASSERT_GT(lkn, 0); - - for (shared_ptr param : cpu_f->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - args.push_back(tensor_val); - } - auto 
int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - for (size_t i = 0; i < cpu_results.size(); i++) - { - EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); - } -} - -TEST(cpu_fusion, loop_kernel_fusion_multiple_groups) -{ - auto make_function = []() -> std::shared_ptr { - Shape shape{}; - auto a = make_shared(element::f32, shape); - auto b = make_shared(element::f32, shape); - auto c = make_shared(element::f32, shape); - auto add_ab = a + b; - auto add_abs = std::make_shared(add_ab); - auto abs_neg = std::make_shared(add_abs); - auto sub_c_neg = c - abs_neg; - - auto d = make_shared(element::f32, shape); - auto d_abs = std::make_shared(d); - auto add_d = d_abs + add_ab; - auto neg_d = std::make_shared(add_d); - - auto mul_cd = neg_d * sub_c_neg; - auto f = - std::make_shared(ngraph::NodeVector{mul_cd}, op::ParameterVector{a, b, c, d}); - - return f; - }; - - pass::Manager pass_manager; - pass_manager.register_pass(2); - auto cpu_f = make_function(); - auto int_f = make_function(); - pass_manager.run_passes(cpu_f); - test::Uniform rng(-100.0f, 100.0f); - vector> args; - - size_t lkn = count_ops_of_type(cpu_f); - ASSERT_GT(lkn, 0); - - for (shared_ptr param : cpu_f->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - args.push_back(tensor_val); - } - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - for (size_t i = 0; i < cpu_results.size(); i++) - { - EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); - } -} - -TEST(cpu_fusion, loop_kernel_fusion_one_group) -{ - auto make_function = []() -> std::shared_ptr { - Shape shape{}; - auto a = make_shared(element::f32, shape); - auto b = make_shared(element::f32, shape); - auto c = make_shared(element::f32, shape); - auto add_ab = a + b; - auto add_abs = std::make_shared(add_ab); - auto 
abs_neg = std::make_shared(add_abs); - auto sub_c_neg = c - abs_neg; - auto d = make_shared(element::f32, shape); - auto add_d = sub_c_neg + d; - auto abs_add_d = std::make_shared(add_d); - auto e = make_shared(element::f32, shape); - auto add_e = e + abs_add_d; - auto neg_e = std::make_shared(add_e); - - auto f = std::make_shared(ngraph::NodeVector{neg_e}, - op::ParameterVector{a, b, c, d, e}); - - return f; - - }; - - pass::Manager pass_manager; - pass_manager.register_pass(2); - auto cpu_f = make_function(); - auto int_f = make_function(); - pass_manager.run_passes(cpu_f); - test::Uniform rng(-100.0f, 100.0f); - vector> args; - - size_t lkn = count_ops_of_type(cpu_f); - ASSERT_GT(lkn, 0); - - for (shared_ptr param : cpu_f->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - args.push_back(tensor_val); - } - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - for (size_t i = 0; i < cpu_results.size(); i++) - { - EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); - } -} - -#endif - -TEST(cpu_fusion, sigmoid_multiply_fusion) -{ - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.register_pass(); - const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/3_lstm_cell_forward.json"); - const string json_string = file_util::read_file_to_string(json_path); - stringstream ss(json_string); - shared_ptr func = ngraph::deserialize(ss); - pass_manager.run_passes(func); - size_t ccg = count_ops_of_type(func); - ASSERT_EQ(ccg, 18); -} - -void sigmoid_multiply_fusion_forward_compute(runtime::Backend* backend, - const op::ParameterVector& input_params, - const vector>& input_data, - const vector& input_shapes, - const Shape& result_shape, - shared_ptr input_0_node, - shared_ptr input_1_node, - const vector& expected) -{ - shared_ptr result_tensor = backend->create_tensor(element::f32, result_shape); 
- - vector> input_tensors; - for (int i = 0; i < input_params.size(); ++i) - { - input_tensors.push_back(backend->create_tensor(element::f32, input_shapes[i])); - copy_data(input_tensors[i], input_data[i]); - } - - auto mul_node = input_0_node * input_1_node; - auto func = make_shared(mul_node, input_params); - backend->call_with_validate(func, {result_tensor}, input_tensors); - EXPECT_TRUE(test::all_close(read_vector(result_tensor), expected)); -} - -TEST(cpu_fusion, sigmoid_multiply_fusion_forward) -{ - auto backend = runtime::Backend::create("CPU"); - - Shape data_shape{1, 1, 2, 2}; - Shape const_shape{1}; - - vector input_0_data{1.f, 2.f, 3.f, 4.f}; - vector input_1_data{1.2f, 2.3f, 3.5f, 4.7f}; - vector const_data{1.2f}; - { - auto input_0_param = make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, data_shape); - auto input_2_param = make_shared(element::f32, data_shape); - auto sigmoid_0 = make_shared(input_0_param); - auto sigmoid_1 = make_shared(input_1_param, input_2_param); - vector expected{1.60833f, 3.78743f, 6.19173f, 8.54352f}; - op::ParameterVector input_params{input_0_param, input_1_param, input_2_param}; - vector> input_data{input_0_data, input_0_data, input_1_data}; - vector input_shapes{data_shape, data_shape, data_shape}; - sigmoid_multiply_fusion_forward_compute(backend.get(), - input_params, - input_data, - input_shapes, - data_shape, - sigmoid_0, - sigmoid_1, - expected); - } - { - auto input_0_param = make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, const_shape); - auto sigmoid_0 = make_shared(input_1_param, data_shape, AxisSet{1, 2, 3}); - auto sigmoid_1 = make_shared(input_0_param); - vector expected{0.87727f, 1.05696f, 1.14309f, 1.17842f}; - op::ParameterVector input_params{input_0_param, input_1_param}; - vector> input_data{input_0_data, const_data}; - vector input_shapes{data_shape, const_shape}; - sigmoid_multiply_fusion_forward_compute(backend.get(), - 
input_params, - input_data, - input_shapes, - data_shape, - sigmoid_0, - sigmoid_1, - expected); - } - { - auto input_0_param = make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, const_shape); - auto sigmoid_0 = make_shared(input_0_param); - auto sigmoid_1 = make_shared(input_1_param, data_shape, AxisSet{1, 2, 3}); - vector expected{0.87727f, 1.05696f, 1.14309f, 1.17842f}; - op::ParameterVector input_params{input_0_param, input_1_param}; - vector> input_data{input_0_data, const_data}; - vector input_shapes{data_shape, const_shape}; - sigmoid_multiply_fusion_forward_compute(backend.get(), - input_params, - input_data, - input_shapes, - data_shape, - sigmoid_0, - sigmoid_1, - expected); - } - { - auto input_0_param = make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, data_shape); - auto sigmoid_0 = make_shared(input_0_param); - auto sigmoid_1 = make_shared(input_1_param); - vector expected{0.561837f, 0.800536f, 0.924652f, 0.973163f}; - op::ParameterVector input_params{input_0_param, input_1_param}; - vector> input_data{input_0_data, input_1_data}; - vector input_shapes{data_shape, data_shape}; - sigmoid_multiply_fusion_forward_compute(backend.get(), - input_params, - input_data, - input_shapes, - data_shape, - sigmoid_0, - sigmoid_1, - expected); - } - { - auto input_0_param = make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, data_shape); - auto sigmoid_0 = make_shared(input_0_param); - auto sigmoid_1 = make_shared(input_1_param); - vector expected{0.60945f, 0.863266f, 0.950838f, 0.981851f}; - op::ParameterVector input_params{input_0_param, input_1_param}; - vector> input_data{input_0_data, input_1_data}; - vector input_shapes{data_shape, data_shape}; - sigmoid_multiply_fusion_forward_compute(backend.get(), - input_params, - input_data, - input_shapes, - data_shape, - sigmoid_0, - sigmoid_1, - expected); - } - { - auto input_0_param = 
make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, data_shape); - auto sigmoid_0 = make_shared(input_0_param); - auto sigmoid_1 = make_shared(input_1_param); - vector expected{0.585304f, 0.876182f, 0.965887f, 0.990322f}; - op::ParameterVector input_params{input_0_param, input_1_param}; - vector> input_data{input_0_data, input_1_data}; - vector input_shapes{data_shape, data_shape}; - sigmoid_multiply_fusion_forward_compute(backend.get(), - input_params, - input_data, - input_shapes, - data_shape, - sigmoid_0, - sigmoid_1, - expected); - } - { - auto input_0_param = make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, data_shape); - auto sigmoid_0 = make_shared(input_0_param); - auto sigmoid_1 = make_shared(input_1_param); - vector expected{0.634907f, 0.94484f, 0.993242f, 0.999164f}; - op::ParameterVector input_params{input_0_param, input_1_param}; - vector> input_data{input_0_data, input_1_data}; - vector input_shapes{data_shape, data_shape}; - sigmoid_multiply_fusion_forward_compute(backend.get(), - input_params, - input_data, - input_shapes, - data_shape, - sigmoid_0, - sigmoid_1, - expected); - } -} - -void sigmoid_multiply_fusion_backward_compute(runtime::Backend* backend, - const op::ParameterVector& input_params, - const vector>& input_data, - const vector& input_shapes, - const vector delta_data, - const Shape& delta_shape, - const Shape& d_input_0_shape, - const Shape& d_input_1_shape, - shared_ptr input_0_node, - shared_ptr input_1_node, - shared_ptr input_0_adjoint, - shared_ptr input_1_adjoint, - const vector& expected_0, - const vector& expected_1) -{ - vector> input_tensors; - for (int i = 0; i < input_params.size(); ++i) - { - input_tensors.push_back(backend->create_tensor(element::f32, input_shapes[i])); - copy_data(input_tensors[i], input_data[i]); - } - - auto delta_param = make_shared(element::f32, delta_shape); - shared_ptr delta_tensor = backend->create_tensor(element::f32, 
delta_shape); - copy_data(delta_tensor, delta_data); - - op::ParameterVector back_params(input_params); - back_params.push_back(delta_param); - input_tensors.push_back(delta_tensor); - - shared_ptr d_input_0_tensor = - backend->create_tensor(element::f32, d_input_0_shape); - shared_ptr d_input_1_tensor = - backend->create_tensor(element::f32, d_input_1_shape); - - using FunctionType = op::SigmoidMultiply::FunctionType; - auto input_0_type = op::SigmoidMultiply::identify_node_type(input_0_node); - auto input_1_type = op::SigmoidMultiply::identify_node_type(input_1_node); - // for Identity functions, we use the node itself, otherwise use its input - // where we will apply the function of input node - auto input_0_alt = - (input_0_type == FunctionType::Identity) ? input_0_node : input_0_node->get_argument(0); - auto input_1_alt = - (input_1_type == FunctionType::Identity) ? input_1_node : input_1_node->get_argument(0); - auto sigmoid_mul = - make_shared(input_0_alt, input_1_alt, input_0_type, input_1_type); - - ngraph::autodiff::Adjoints adjoints(NodeVector{sigmoid_mul}, NodeVector{delta_param}); - auto d_input_0 = adjoints.backprop_node(input_0_adjoint); - auto d_input_1 = adjoints.backprop_node(input_1_adjoint); - auto df = make_shared(NodeVector{d_input_0, d_input_1}, back_params); - backend->call_with_validate(df, {d_input_0_tensor, d_input_1_tensor}, input_tensors); - EXPECT_TRUE(test::all_close(read_vector(d_input_0_tensor), expected_0)); - EXPECT_TRUE(test::all_close(read_vector(d_input_1_tensor), expected_1)); -} - -TEST(cpu_fusion, sigmoid_multiply_fusion_backward) -{ - auto backend = runtime::Backend::create("CPU"); - - Shape data_shape{1, 1, 2, 2}; - Shape const_shape{1}; - - vector input_0_data{1.f, 2.f, 3.f, 4.f}; - vector input_1_data{1.2f, 2.2f, 3.2f, 4.2f}; - vector const_data{1.2f}; - vector delta_data(shape_size(data_shape), 20.0f); - - { - auto input_0_param = make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, 
data_shape); - auto input_2_param = make_shared(element::f32, data_shape); - auto sigmoid_0 = make_shared(input_0_param); - auto sigmoid_1 = make_shared(input_1_param, input_2_param); - vector expected_0{8.65093f, 8.81946f, 5.60191f, 2.89668f}; - vector expected_1{14.6212f, 17.6159f, 19.0515f, 19.6403f}; - op::ParameterVector input_params{input_0_param, input_1_param, input_2_param}; - vector> input_data{input_0_data, input_0_data, input_1_data}; - vector input_shapes{data_shape, data_shape, data_shape}; - sigmoid_multiply_fusion_backward_compute(backend.get(), - input_params, - input_data, - input_shapes, - delta_data, - data_shape, - data_shape, - data_shape, - sigmoid_0, - sigmoid_1, - input_0_param, - sigmoid_1, - expected_0, - expected_1); - } - { - auto input_0_param = make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, const_shape); - auto sigmoid_0 = make_shared(input_1_param, data_shape, AxisSet{1, 2, 3}); - auto sigmoid_1 = make_shared(input_0_param); - vector expected_0{15.2319f, 19.2806f, 19.9011f, 19.9866f}; - vector expected_1{10.0794f, 1.69562f, 0.236785f, 0.0321828f}; - op::ParameterVector input_params{input_0_param, input_1_param}; - vector> input_data{input_0_data, const_data}; - vector input_shapes{data_shape, const_shape}; - sigmoid_multiply_fusion_backward_compute(backend.get(), - input_params, - input_data, - input_shapes, - delta_data, - data_shape, - data_shape, - data_shape, - sigmoid_0, - sigmoid_1, - sigmoid_0, - input_0_param, - expected_0, - expected_1); - } - { - auto input_0_param = make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, const_shape); - auto sigmoid_0 = make_shared(input_0_param); - auto sigmoid_1 = make_shared(input_1_param, data_shape, AxisSet{1, 2, 3}); - vector expected_0{10.0794f, 1.69562f, 0.236785f, 0.0321828f}; - vector expected_1{15.2319f, 19.2806f, 19.9011f, 19.9866f}; - op::ParameterVector input_params{input_0_param, input_1_param}; - 
vector> input_data{input_0_data, const_data}; - vector input_shapes{data_shape, const_shape}; - sigmoid_multiply_fusion_backward_compute(backend.get(), - input_params, - input_data, - input_shapes, - delta_data, - data_shape, - data_shape, - data_shape, - sigmoid_0, - sigmoid_1, - input_0_param, - sigmoid_1, - expected_0, - expected_1); - } - { - auto input_0_param = make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, data_shape); - auto sigmoid_0 = make_shared(input_0_param); - auto sigmoid_1 = make_shared(input_1_param); - vector expected_0{3.02202f, 1.89041f, 0.868146f, 0.348035f}; - vector expected_1{2.60102f, 1.58192f, 0.716941f, 0.285879f}; - op::ParameterVector input_params{input_0_param, input_1_param}; - vector> input_data{input_0_data, input_1_data}; - vector input_shapes{data_shape, data_shape}; - sigmoid_multiply_fusion_backward_compute(backend.get(), - input_params, - input_data, - input_shapes, - delta_data, - data_shape, - data_shape, - data_shape, - sigmoid_0, - sigmoid_1, - input_0_param, - input_1_param, - expected_0, - expected_1); - } - { - auto input_0_param = make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, data_shape); - auto sigmoid_0 = make_shared(input_0_param); - auto sigmoid_1 = make_shared(input_1_param); - vector expected_0{3.27813f, 2.04894f, 0.900536f, 0.353095f}; - vector expected_1{4.45975f, 0.84425f, 0.126201f, 0.0176579f}; - op::ParameterVector input_params{input_0_param, input_1_param}; - vector> input_data{input_0_data, input_1_data}; - vector input_shapes{data_shape, data_shape}; - sigmoid_multiply_fusion_backward_compute(backend.get(), - input_params, - input_data, - input_shapes, - delta_data, - data_shape, - data_shape, - data_shape, - sigmoid_0, - sigmoid_1, - input_0_param, - input_1_param, - expected_0, - expected_1); - } - { - auto input_0_param = make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, data_shape); 
- auto sigmoid_0 = make_shared(input_0_param); - auto sigmoid_1 = make_shared(input_1_param); - vector expected_0{6.45521f, 1.27207f, 0.189593f, 0.0264228f}; - vector expected_1{2.70967f, 1.7314f, 0.748913f, 0.29092f}; - op::ParameterVector input_params{input_0_param, input_1_param}; - vector> input_data{input_0_data, input_1_data}; - vector input_shapes{data_shape, data_shape}; - sigmoid_multiply_fusion_backward_compute(backend.get(), - input_params, - input_data, - input_shapes, - delta_data, - data_shape, - data_shape, - data_shape, - sigmoid_0, - sigmoid_1, - input_0_param, - input_1_param, - expected_0, - expected_1); - } - { - auto input_0_param = make_shared(element::f32, data_shape); - auto input_1_param = make_shared(element::f32, data_shape); - auto sigmoid_0 = make_shared(input_0_param); - auto sigmoid_1 = make_shared(input_1_param); - vector expected_0{7.00227f, 1.37874f, 0.196666f, 0.026807f}; - vector expected_1{4.64603f, 0.924027f, 0.131829f, 0.0179692f}; - op::ParameterVector input_params{input_0_param, input_1_param}; - vector> input_data{input_0_data, input_1_data}; - vector input_shapes{data_shape, data_shape}; - sigmoid_multiply_fusion_backward_compute(backend.get(), - input_params, - input_data, - input_shapes, - delta_data, - data_shape, - data_shape, - data_shape, - sigmoid_0, - sigmoid_1, - input_0_param, - input_1_param, - expected_0, - expected_1); - } -} - -TEST(cpu_fusion, fuse_batch_dot) -{ - pass::Manager pass_manager; - pass_manager.register_pass(); - const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/batch_dot_3.json"); - const string json_string = file_util::read_file_to_string(json_path); - stringstream ss(json_string); - shared_ptr func = ngraph::deserialize(ss); - pass_manager.run_passes(func); - size_t ccg = count_ops_of_type(func); - ASSERT_EQ(ccg, 1); -} - -TEST(cpu_fusion, fuse_batch_dot_forward) -{ - pass::Manager pass_manager; - pass_manager.register_pass(); - - const std::string 
file_name("mxnet/batch_dot_3.json"); - auto cpu_f = make_function(file_name); - auto int_f = make_function(file_name); - pass_manager.run_passes(cpu_f); - test::Uniform rng(0.0f, 1.0f); - vector> args; - - for (shared_ptr param : int_f->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - args.push_back(tensor_val); - } - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - for (size_t i = 0; i < int_results.size(); i++) - { - EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); - } -} - -TEST(cpu_fusion, fuse_rnn_across_layer) -{ - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.register_pass(); - pass_manager.register_pass(); - pass_manager.register_pass(); - const string json_path = - file_util::path_join(SERIALIZED_ZOO, "mxnet/2rnn_layer_1timestep.json"); - const string json_string = file_util::read_file_to_string(json_path); - stringstream ss(json_string); - shared_ptr func = ngraph::deserialize(ss); - pass_manager.run_passes(func); - size_t ref_rnn_count = 1; - auto rnn_count = count_ops_of_type(func); - EXPECT_EQ(ref_rnn_count, rnn_count); -} - -TEST(cpu_fusion, fuse_rnn_across_2layer_1timestep) -{ - const std::string file_name("mxnet/2rnn_layer_1timestep.json"); - auto cpu_f = make_function(file_name); - auto int_f = make_function(file_name); - test::Uniform rng(0.0f, 1.0f); - vector> args; - - for (shared_ptr param : int_f->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - args.push_back(tensor_val); - } - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - - // TODO (pruthvi): Enable this after fixing failing - // mxnet rnn unit tests - // EXPECT_EQ(1, count_ops_of_type(cpu_f)); - for (size_t i = 0; i < cpu_results.size(); i++) - { - 
EXPECT_TRUE(test::all_close(cpu_results.at(1), int_results.at(1), 1.0e-4f, 1.0e-4f)); - } -} - -static void check_bounded_relu(Shape param_shape, float constant_val) -{ - auto make_function = [](Shape input_shape, float alpha_val) { - auto relu_input = std::make_shared(element::f32, input_shape); - auto relu = std::make_shared(relu_input); - auto alpha = op::Constant::create( - element::f32, input_shape, std::vector(1.0f, alpha_val)); - auto min = std::make_shared(relu, alpha); - auto f = make_shared(NodeVector{min}, op::ParameterVector{relu_input}); - return f; - }; - - auto cpu_f = make_function(param_shape, constant_val); - auto int_f = make_function(param_shape, constant_val); - test::Uniform rng(-10.0f, 10.0f); - vector> args; - - for (shared_ptr param : int_f->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - args.push_back(tensor_val); - } - auto int_results = execute(int_f, args, "INTERPRETER"); - auto cpu_results = execute(cpu_f, args, "CPU"); - - EXPECT_EQ(1, count_ops_of_type(cpu_f)); - EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0), 1.0e-4f, 1.0e-4f)); -} - -TEST(cpu_fusion, fuse_bounded_relu_inter_vs_cpu) -{ - check_bounded_relu(Shape{4, 3, 2, 2}, 6.0f); - check_bounded_relu(Shape{4, 3}, 4.0f); - check_bounded_relu(Shape{4, 3, 2}, 2.0f); -} - -TEST(cpu_fusion, dot_batch_forward) -{ - const Shape shape_a{2, 3, 2}; - const Shape shape_b{2, 3}; - - auto generate_func = [&shape_a, &shape_b]() -> shared_ptr { - auto a = make_shared(element::f32, shape_a); - auto b = make_shared(element::f32, shape_b); - auto dot = make_shared(a, b); - return make_shared(dot, op::ParameterVector{a, b}); - }; - shared_ptr cpu_func = generate_func(); - shared_ptr int_func = generate_func(); - - test::Uniform rng(0.0f, 1.0f); - vector> args; - for (shared_ptr param : int_func->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - 
args.push_back(tensor_val); - } - - auto int_results = execute(int_func, args, "INTERPRETER"); - auto cpu_results = execute(cpu_func, args, "CPU"); - for (size_t i = 0; i < cpu_results.size(); i++) - { - EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); - } -} -static std::shared_ptr - create_rnn_input_linear_transformation_function(size_t num_timesteps, bool data_is_4d = false) -{ - auto W = std::make_shared(element::f32, Shape{400, 50}); - auto bias = std::make_shared(element::f32, Shape{400}); - op::ParameterVector params{W, bias}; - auto create_graph = [&]() -> std::shared_ptr { - - auto data_param = (data_is_4d) - ? std::make_shared(element::f32, Shape{2, 5, 1, 50}) - : std::make_shared(element::f32, Shape{10, 1, 50}); - params.push_back(data_param); - auto reshape_axis_order = data_is_4d ? AxisVector{0, 1, 2, 3} : AxisVector{0, 1, 2}; - auto data_param_reshape = - std::make_shared(data_param, reshape_axis_order, Shape{10, 50}); - auto W_reshape = std::make_shared(W, AxisVector{1, 0}, Shape{50, 400}); - auto dot = std::make_shared(data_param_reshape, W_reshape); - auto bias_broadcast = make_shared(bias, dot->get_shape(), AxisSet{0}); - auto add_bias = std::make_shared(dot, bias_broadcast); - return add_bias; - - }; - - NodeVector graph_nodes; - for (size_t i = 0; i < num_timesteps; i++) - { - graph_nodes.push_back(create_graph()); - } - auto concat = std::make_shared(graph_nodes, 0); - return make_shared(NodeVector{concat}, params); -} - -TEST(cpu_fusion, fuse_rnn_input_across_time_steps) -{ - auto func = create_rnn_input_linear_transformation_function(10); - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.register_pass(); - pass_manager.run_passes(func); - size_t ref_matmulbias_count = 1; - auto matmulbias_count = count_ops_of_type(func); - EXPECT_EQ(ref_matmulbias_count, matmulbias_count); -} - -TEST(cpu_fusion, fuse_rnn_input_across_time_steps_4d_data) -{ - auto func = 
create_rnn_input_linear_transformation_function(10, true); - pass::Manager pass_manager; - pass_manager.register_pass(); - pass_manager.register_pass(); - pass_manager.run_passes(func); - size_t ref_matmulbias_count = 10; // no CPURnnMatFusion transformations - auto matmulbias_count = count_ops_of_type(func); - EXPECT_EQ(ref_matmulbias_count, matmulbias_count); -} - -TEST(cpu_fusion, rnn_input_fusion_inter_vs_cpu) -{ - shared_ptr cpu_func = create_rnn_input_linear_transformation_function(10); - shared_ptr int_func = create_rnn_input_linear_transformation_function(10); - - test::Uniform rng(-10.0f, 10.0f); - vector> args; - for (shared_ptr param : int_func->get_parameters()) - { - vector tensor_val(shape_size(param->get_shape())); - rng.initialize(tensor_val); - args.push_back(tensor_val); - } - - auto int_results = execute(int_func, args, "INTERPRETER"); - auto cpu_results = execute(cpu_func, args, "CPU"); - for (size_t i = 0; i < cpu_results.size(); i++) - { - EXPECT_TRUE(test::all_close(cpu_results.at(i), int_results.at(i), 1.0e-4f, 1.0e-4f)); - } -} From 23f68d06235fb948b82d15be1b8ad8fb72817513 Mon Sep 17 00:00:00 2001 From: Scott Cyphers Date: Tue, 13 Nov 2018 13:30:29 -0800 Subject: [PATCH 04/10] BatchNorm Python API changes. 
--- python/pyngraph/ops/batch_norm.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyngraph/ops/batch_norm.cpp b/python/pyngraph/ops/batch_norm.cpp index 11fb21c2e07..15bec7f6ae5 100644 --- a/python/pyngraph/ops/batch_norm.cpp +++ b/python/pyngraph/ops/batch_norm.cpp @@ -30,9 +30,9 @@ void regclass_pyngraph_op_BatchNormTraining(py::module m) batch_norm_training(m, "BatchNormTraining"); batch_norm_training.doc() = "ngraph.impl.op.BatchNormTraining wraps ngraph::op::BatchNormTraining"; - batch_norm_training.def(py::init&, + batch_norm_training.def(py::init&, const std::shared_ptr&, + double, const std::shared_ptr&>()); } @@ -45,11 +45,11 @@ void regclass_pyngraph_op_BatchNormInference(py::module m) batch_norm_inference.doc() = "ngraph.impl.op.BatchNormInference wraps ngraph::op::BatchNormInference"; - batch_norm_inference.def(py::init&, + batch_norm_inference.def(py::init&, const std::shared_ptr&, const std::shared_ptr&, const std::shared_ptr&, + double, const std::shared_ptr&>()); } @@ -61,11 +61,11 @@ void regclass_pyngraph_op_BatchNormTrainingBackprop(py::module m) batch_norm_training_backprop(m, "BatchNormTrainingBackprop"); batch_norm_training_backprop.doc() = "ngraph.impl.op.BatchNormTrainingBackprop wraps ngraph::op::BatchNormTrainingBackprop"; - batch_norm_training_backprop.def(py::init&, + batch_norm_training_backprop.def(py::init&, const std::shared_ptr&, const std::shared_ptr&, const std::shared_ptr&, const std::shared_ptr&, + double, const std::shared_ptr&>()); } From cc0295a25dcf6195bae5e588677193cd2a1e533e Mon Sep 17 00:00:00 2001 From: Scott Cyphers Date: Tue, 13 Nov 2018 14:39:42 -0800 Subject: [PATCH 05/10] Fix python signatures --- python/pyngraph/ops/batch_norm.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyngraph/ops/batch_norm.cpp b/python/pyngraph/ops/batch_norm.cpp index 15bec7f6ae5..df600f0b173 100644 --- a/python/pyngraph/ops/batch_norm.cpp +++ 
b/python/pyngraph/ops/batch_norm.cpp @@ -32,8 +32,8 @@ void regclass_pyngraph_op_BatchNormTraining(py::module m) "ngraph.impl.op.BatchNormTraining wraps ngraph::op::BatchNormTraining"; batch_norm_training.def(py::init&, const std::shared_ptr&, - double, - const std::shared_ptr&>()); + const std::shared_ptr&, + double>()); } void regclass_pyngraph_op_BatchNormInference(py::module m) @@ -49,8 +49,8 @@ void regclass_pyngraph_op_BatchNormInference(py::module m) const std::shared_ptr&, const std::shared_ptr&, const std::shared_ptr&, - double, - const std::shared_ptr&>()); + const std::shared_ptr&, + double>()); } void regclass_pyngraph_op_BatchNormTrainingBackprop(py::module m) @@ -66,6 +66,6 @@ void regclass_pyngraph_op_BatchNormTrainingBackprop(py::module m) const std::shared_ptr&, const std::shared_ptr&, const std::shared_ptr&, - double, - const std::shared_ptr&>()); + const std::shared_ptr&, + double>()); } From 78243d68b1608c93779d5ed8283ff65f0f76250f Mon Sep 17 00:00:00 2001 From: Scott Cyphers Date: Tue, 13 Nov 2018 15:22:59 -0800 Subject: [PATCH 06/10] Fix order --- python/ngraph/ops.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/ngraph/ops.py b/python/ngraph/ops.py index 0b6bb262c18..9c08b14bd0b 100644 --- a/python/ngraph/ops.py +++ b/python/ngraph/ops.py @@ -913,20 +913,20 @@ def reverse(node, reversed_axes, name=None): # type: (Node, List[int], str) -> @nameable_op -def batch_norm(eps, # type: float +def batch_norm(data, # type: Node gamma, # type: Node beta, # type: Node - data, # type: Node mean=None, # type: Node variance=None, # type: Node name=None, # type: str + eps=1e-5, # type: float ): # type: (...) 
-> Node """Return batch normalization node.""" if mean is None and variance is None: - return BatchNormTraining(eps, gamma, beta, data) + return BatchNormTraining(data, gamma, beta, eps) else: - return BatchNormInference(eps, gamma, beta, data, mean, variance) + return BatchNormInference(data, gamma, beta, mean, variance, eps) @nameable_op From 7175c3a55d1caf5a9694799d8474bf9f53ea296d Mon Sep 17 00:00:00 2001 From: Scott Cyphers Date: Tue, 13 Nov 2018 15:41:58 -0800 Subject: [PATCH 07/10] onnx batch_norm --- src/ngraph/frontend/onnx_import/op/batch_norm.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ngraph/frontend/onnx_import/op/batch_norm.cpp b/src/ngraph/frontend/onnx_import/op/batch_norm.cpp index 11338882026..0bf423bd9d0 100644 --- a/src/ngraph/frontend/onnx_import/op/batch_norm.cpp +++ b/src/ngraph/frontend/onnx_import/op/batch_norm.cpp @@ -54,11 +54,11 @@ namespace ngraph mean = inputs.at(3); var = inputs.at(4); return {std::make_shared( - epsilon, scale, bias, x, mean, var)}; + x, scale, bias, mean, var, epsilon)}; } return { - std::make_shared(epsilon, scale, bias, x)}; + std::make_shared(x, scale, bias, epsilon)}; } } // namespace set_1 From 476c063447e71d286634b4d18e73e5dd263f96d5 Mon Sep 17 00:00:00 2001 From: Scott Cyphers Date: Tue, 13 Nov 2018 16:42:33 -0800 Subject: [PATCH 08/10] style --- python/pyngraph/ops/batch_norm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyngraph/ops/batch_norm.cpp b/python/pyngraph/ops/batch_norm.cpp index df600f0b173..2e7e32d97bd 100644 --- a/python/pyngraph/ops/batch_norm.cpp +++ b/python/pyngraph/ops/batch_norm.cpp @@ -32,7 +32,7 @@ void regclass_pyngraph_op_BatchNormTraining(py::module m) "ngraph.impl.op.BatchNormTraining wraps ngraph::op::BatchNormTraining"; batch_norm_training.def(py::init&, const std::shared_ptr&, - const std::shared_ptr&, + const std::shared_ptr&, double>()); } From c1ef39b0516069e5cf95cef026d58f73e00d0f4c Mon Sep 17 00:00:00 2001 
From: Scott Cyphers Date: Tue, 13 Nov 2018 17:23:20 -0800 Subject: [PATCH 09/10] Revert python batch_norm args order --- python/ngraph/ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/ngraph/ops.py b/python/ngraph/ops.py index 9c08b14bd0b..eae9664055c 100644 --- a/python/ngraph/ops.py +++ b/python/ngraph/ops.py @@ -913,13 +913,13 @@ def reverse(node, reversed_axes, name=None): # type: (Node, List[int], str) -> @nameable_op -def batch_norm(data, # type: Node +def batch_norm(eps=1e-5, # type: float gamma, # type: Node beta, # type: Node + data, # type: Node mean=None, # type: Node variance=None, # type: Node name=None, # type: str - eps=1e-5, # type: float ): # type: (...) -> Node """Return batch normalization node.""" From dc119a975e8796087f0c59db00591f865f6e96ed Mon Sep 17 00:00:00 2001 From: Scott Cyphers Date: Tue, 13 Nov 2018 17:45:54 -0800 Subject: [PATCH 10/10] No default on required op --- python/ngraph/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ngraph/ops.py b/python/ngraph/ops.py index eae9664055c..1cab4de7179 100644 --- a/python/ngraph/ops.py +++ b/python/ngraph/ops.py @@ -913,7 +913,7 @@ def reverse(node, reversed_axes, name=None): # type: (Node, List[int], str) -> @nameable_op -def batch_norm(eps=1e-5, # type: float +def batch_norm(eps, # type: float gamma, # type: Node beta, # type: Node data, # type: Node