Transpose2d, Concatenate2d, and up to 3 Clones for io_stream #402

Merged · 5 commits · Jun 17, 2022
22 changes: 13 additions & 9 deletions hls4ml/backends/vivado/passes/clone.py
@@ -11,21 +11,25 @@ class Clone(Layer):

     def initialize(self):
         inp = self.get_input_variable()
-        self.add_output_variable(inp.shape, inp.dim_names, out_name=self.outputs[0], var_name='layer{index}_cpy1')
-        self.add_output_variable(inp.shape, inp.dim_names, out_name=self.outputs[1], var_name='layer{index}_cpy2')
+        for i, out_name in enumerate(self.outputs):
+            self.add_output_variable(inp.shape, inp.dim_names, out_name=out_name, var_name='layer{index}_cpy' + str(i + 1))

-clone_function_template = 'nnet::clone_stream<{input_t}, {output_t}, {size}>({input}, {output1}, {output2});'
 clone_include_list = ['nnet_utils/nnet_stream.h']

 class CloneFunctionTemplate(FunctionCallTemplate):
     def __init__(self):
         super().__init__(Clone, include_header=clone_include_list)
-        self.template = clone_function_template
+        self.template = None  # to be filled in once the number of clones is known

     def format(self, node):
         params = self._default_function_params(node)
-        params['output1'] = node.variables[node.outputs[0]].name
-        params['output2'] = node.variables[node.outputs[1]].name
+        for i, output in enumerate(node.outputs):
+            params['output' + str(i + 1)] = node.variables[node.outputs[i]].name
+
+        if self.template is None:
+            self.template = 'nnet::clone_stream<{input_t}, {output_t}, {size}>({input}, ' + \
+                ', '.join(['{output' + str(i + 1) + '}' for i in range(len(node.outputs))]) + \
+                ');'

         return self.template.format(**params)

@@ -63,8 +67,8 @@ def transform(self, model, node):
         transformed = False
         for output in node.outputs:
             if len(output_map[output]) > 1:
-                if len(output_map[output]) > 2:
-                    print('WARN: Cannot clone output {} of {} ({})'.format(output, node.class_name, node.name))
+                if len(output_map[output]) > 3:
+                    print('WARNING: Cloning output {} of {} ({}) more than 3 times is not currently supported'.format(output, node.__class__.__name__, node.name))
                     return False
                 out_var = node.get_output_variable(output)
                 for i, layer in enumerate(output_map[output], 1):
@@ -73,7 +77,7 @@
                     }
                     idx = layer.inputs.index(output)
                     layer.inputs[idx] = output + '_cpy' + str(i)
-                    clone_layer = model.make_node(Clone, 'clone_' + node.name, attrs, [output], [output + '_cpy1', output + '_cpy2'])
+                    clone_layer = model.make_node(Clone, 'clone_' + node.name, attrs, [output], [output + '_cpy' + str(i + 1) for i in range(len(output_map[output]))])
                     model.insert_node(clone_layer)
                     transformed = True

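For reference, a minimal standalone sketch (not part of the diff) of the call string the updated CloneFunctionTemplate now builds, here for a three-way clone; every name below is illustrative rather than taken from a real model:

# Rebuild the template string the same way format() does for a node with three outputs.
outputs = ['layer2_cpy1', 'layer2_cpy2', 'layer2_cpy3']
template = 'nnet::clone_stream<{input_t}, {output_t}, {size}>({input}, ' + \
    ', '.join(['{output' + str(i + 1) + '}' for i in range(len(outputs))]) + \
    ');'
params = {'input_t': 'layer1_t', 'output_t': 'layer2_t', 'size': 'N_LAYER_1', 'input': 'layer1_out'}
params.update({'output' + str(i + 1): name for i, name in enumerate(outputs)})
print(template.format(**params))
# nnet::clone_stream<layer1_t, layer2_t, N_LAYER_1>(layer1_out, layer2_cpy1, layer2_cpy2, layer2_cpy3);

The three-argument call resolves to the new three-output clone_stream overload added in nnet_stream.h below.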
6 changes: 3 additions & 3 deletions hls4ml/backends/vivado/passes/reshaping_templates.py
@@ -101,17 +101,17 @@ def format(self, node):
     static constexpr unsigned perm[3] = {{{perm_str}}};
 }};\n"""

-transpose_function_template = 'nnet::transpose_{dim}<{input_t}, {config}>({input}, {output});'
+transpose_function_template = 'nnet::transpose_{dim}<{input_t}, {output_t}, {config}>({input}, {output});'

-transpose_include_list = ['nnet_utils/nnet_array.h']
+transpose_include_list = ['nnet_utils/nnet_array.h', 'nnet_utils/nnet_stream.h']

 class TransposeConfigTemplate(LayerConfigTemplate):
     def __init__(self):
         super().__init__(Transpose)
         self.template = transpose_config_template

     def format(self, node):
-        params = self._default_config_params()
+        params = self._default_config_params(node)

         return self.template.format(**params)
4 changes: 2 additions & 2 deletions hls4ml/model/optimizer/__init__.py
@@ -14,10 +14,10 @@
 try:
     import qkeras
     register_flow('convert', ['fuse_bias_add', 'remove_useless_transpose', 'output_rounding_saturation_mode', 'qkeras_factorize_alpha', 'extract_ternary_threshold', 'fuse_consecutive_batch_normalization']) # TODO Maybe not all QKeras optmizers belong here?
-    register_flow('optimize', ['eliminate_linear_activation', 'fuse_consecutive_batch_normalization', 'fuse_batch_normalization', 'replace_multidimensional_dense_with_conv'], requires=['convert'])
+    register_flow('optimize', ['eliminate_linear_activation', 'fuse_consecutive_batch_normalization', 'fuse_batch_normalization', 'replace_multidimensional_dense_with_conv', 'set_precision_concat'], requires=['convert'])
 except:
     register_flow('convert', ['fuse_bias_add', 'remove_useless_transpose'])
-    register_flow('optimize', ['eliminate_linear_activation', 'fuse_batch_normalization', 'replace_multidimensional_dense_with_conv'], requires=['convert'])
+    register_flow('optimize', ['eliminate_linear_activation', 'fuse_batch_normalization', 'replace_multidimensional_dense_with_conv', 'set_precision_concat'], requires=['convert'])

 del opt_path
 del module_path
39 changes: 39 additions & 0 deletions hls4ml/model/optimizer/passes/precision_merge.py
@@ -0,0 +1,39 @@
from hls4ml.model.optimizer import OptimizerPass
from hls4ml.model.types import FixedPrecisionType

def get_concat_type(itype1, itype2):
    newwidth = max(itype1.width, itype2.width)
    newint = max(itype1.integer, itype2.integer)
    if (itype1.signed ^ itype2.signed):  # XOR
        newint += 1
        newwidth += 1
    newrmode = itype1.rounding_mode if itype1.rounding_mode is not None else itype2.rounding_mode
    newsmode = itype1.saturation_mode if itype1.saturation_mode is not None else itype2.saturation_mode
    newsbits = itype1.saturation_bits if itype1.saturation_bits is not None else itype2.saturation_bits

    newtype = FixedPrecisionType(newwidth, newint, itype1.signed or itype2.signed,
                                 newrmode, newsmode, newsbits)
    return newtype

class SetPrecisionConcat(OptimizerPass):
    def match(self, node):
        if node.__class__.__name__ == 'Concatenate':
            otype = node.get_output_variable().type.precision
            itype1 = node.get_input_variable(node.inputs[0]).type.precision
            itype2 = node.get_input_variable(node.inputs[1]).type.precision
            if isinstance(otype, FixedPrecisionType) and otype != get_concat_type(itype1, itype2):
                return True
        return False

    def transform(self, model, node):
        """
        Set concat output precision
        """
        otype = node.get_output_variable().type.precision
        itype1 = node.get_input_variable(node.inputs[0]).type.precision
        itype2 = node.get_input_variable(node.inputs[1]).type.precision
        newtype = get_concat_type(itype1, itype2)
        print("Found {} in the model, optimizing {} to {}...".format(node.name, otype, newtype))
        node.get_output_variable().type.precision = newtype

        return True
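As a sanity check of the merge rule, a self-contained sketch using plain tuples in place of FixedPrecisionType (values illustrative):

# Standalone sketch of get_concat_type's width/sign arithmetic; no hls4ml import needed.
from collections import namedtuple

FP = namedtuple('FP', 'width integer signed')

def concat_precision(a, b):
    width, integer = max(a.width, b.width), max(a.integer, b.integer)
    if a.signed ^ b.signed:  # exactly one side signed: add a bit so both ranges fit
        width, integer = width + 1, integer + 1
    return FP(width, integer, a.signed or b.signed)

# ap_fixed<16,6> merged with ap_ufixed<14,4> -> ap_fixed<17,7>
print(concat_precision(FP(16, 6, True), FP(14, 4, False)))  # FP(width=17, integer=7, signed=True)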
8 changes: 4 additions & 4 deletions hls4ml/templates/vivado/nnet_utils/nnet_array.h
@@ -12,10 +12,10 @@ struct transpose_config {
     static constexpr unsigned perm[3] = {2, 0, 1};
 };

-template<class data_T, typename CONFIG_T>
+template<class data_T, class res_T, typename CONFIG_T>
 void transpose_2d(
     data_T data[CONFIG_T::height * CONFIG_T::width],
-    data_T data_t[CONFIG_T::height * CONFIG_T::width]
+    res_T data_t[CONFIG_T::height * CONFIG_T::width]
 ) {
     #pragma HLS PIPELINE

@@ -26,10 +26,10 @@ void transpose_2d(
     }
 }

-template<class data_T, typename CONFIG_T>
+template<class data_T, class res_T, typename CONFIG_T>
 void transpose_3d(
     data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width],
-    data_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]
+    res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]
 ) {
     unsigned dims[3] = { CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width };
     unsigned dims_t[3];
104 changes: 104 additions & 0 deletions hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h
@@ -299,6 +299,110 @@ void concatenate3d(
    }
}

template<class input1_T, class input2_T, class res_T, typename CONFIG_T>
void concatenate2d_0(
    hls::stream<input1_T> &data1,
    hls::stream<input2_T> &data2,
    hls::stream<res_T> &res)
{
    ConcatLoopHeight1: for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
        #pragma HLS PIPELINE II=1

        input1_T in_data1 = data1.read();
        res_T out_data;
        #pragma HLS DATA_PACK variable=out_data

        ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) {
            #pragma HLS UNROLL
            out_data[k] = in_data1[k];
        }

        res.write(out_data);
    }
    ConcatLoopHeight2: for (int i = 0; i < CONFIG_T::n_elem2_0; i++) {
        #pragma HLS PIPELINE II=1

        input2_T in_data2 = data2.read();
        res_T out_data;
        #pragma HLS DATA_PACK variable=out_data

        ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) {
            #pragma HLS UNROLL
            out_data[k] = in_data2[k];
        }

        res.write(out_data);
    }
}

template<class input1_T, class input2_T, class res_T, typename CONFIG_T>
void concatenate2d_1(
    hls::stream<input1_T> &data1,
    hls::stream<input2_T> &data2,
    hls::stream<res_T> &res)
{
    ConcatLoopHeight: for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
        #pragma HLS PIPELINE II=1

        input1_T in_data1 = data1.read();
        input2_T in_data2 = data2.read();
        res_T out_data;
        #pragma HLS DATA_PACK variable=out_data

        ConcatPackInput1: for (int k = 0; k < input1_T::size; k++) {
            #pragma HLS UNROLL
            out_data[k] = in_data1[k];
        }

        ConcatPackInput2: for (int k = 0; k < input2_T::size; k++) {
            #pragma HLS UNROLL
            out_data[input1_T::size + k] = in_data2[k];
        }

        res.write(out_data);
    }
}

template<class input1_T, class input2_T, class res_T, typename CONFIG_T>
void concatenate2d(
    hls::stream<input1_T> &data1,
    hls::stream<input2_T> &data2,
    hls::stream<res_T> &res)
{
    if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) {
        concatenate2d_1<input1_T, input2_T, res_T, CONFIG_T>(data1, data2, res);
    } else {
        concatenate2d_0<input1_T, input2_T, res_T, CONFIG_T>(data1, data2, res);
    }
}
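Behaviorally (this reference is not part of the PR), the two variants correspond to NumPy concatenation along the first and last axes, assuming each stream packet carries one row:

import numpy as np

a = np.arange(6).reshape(2, 3)       # data1: one row per packet
b = np.arange(10, 16).reshape(2, 3)  # data2: one row per packet

# concatenate2d_0: forward all of data1's packets, then all of data2's
out_axis0 = np.concatenate([a, b], axis=0)  # shape (4, 3)

# concatenate2d_1: read one packet from each stream per iteration and append
# data2's elements after data1's in the same output packet
out_axis1 = np.concatenate([a, b], axis=1)  # shape (2, 6)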

template<class input1_T, class input2_T, class res_T, typename CONFIG_T>
void concatenate1d(
    hls::stream<input1_T> &data1,
    hls::stream<input2_T> &data2,
    hls::stream<res_T> &res)
{
    res_T out_data;
    #pragma HLS DATA_PACK variable=out_data
    ConcatLoop1: for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) {
        #pragma HLS PIPELINE
        input1_T in_data1 = data1.read();
        ConcatPack1: for (int j = 0; j < res_T::size; j++) {
            #pragma HLS UNROLL
            out_data[j] = in_data1[j];
        }
        res.write(out_data);
    }
    ConcatLoop2: for (int i = 0; i < CONFIG_T::n_elem2_0 / input2_T::size; i++) {
        #pragma HLS PIPELINE
        input2_T in_data2 = data2.read();
        ConcatPack2: for (int j = 0; j < res_T::size; j++) {
            #pragma HLS UNROLL
            out_data[j] = in_data2[j];
        }
        res.write(out_data);
    }
}
}

#endif
50 changes: 50 additions & 0 deletions hls4ml/templates/vivado/nnet_utils/nnet_stream.h
@@ -38,6 +38,32 @@ void clone_stream(hls::stream<data_T> &data, hls::stream<res_T> &res1, hls::stream<res_T> &res2) {
    }
}

template<class data_T, class res_T, int N>
void clone_stream(hls::stream<data_T> &data, hls::stream<res_T> &res1, hls::stream<res_T> &res2, hls::stream<res_T> &res3) {
    CloneLoop: for (int i = 0; i < N / data_T::size; i++) {
        #pragma HLS PIPELINE

        data_T in_data = data.read();
        res_T out_data1;
        res_T out_data2;
        res_T out_data3;
        #pragma HLS DATA_PACK variable=out_data1
        #pragma HLS DATA_PACK variable=out_data2
        #pragma HLS DATA_PACK variable=out_data3

        ClonePack: for (int j = 0; j < data_T::size; j++) {
            #pragma HLS UNROLL
            out_data1[j] = in_data[j];
            out_data2[j] = in_data[j];
            out_data3[j] = in_data[j];
        }

        res1.write(out_data1);
        res2.write(out_data2);
        res3.write(out_data3);
    }
}

template<class data_T, class res_T, int N>
void repack_stream(hls::stream<data_T> &data, hls::stream<res_T> &res) {
    if (data_T::size == res_T::size) {

@@ -145,6 +171,30 @@ void broadcast_stream(hls::stream<data_T> &data, hls::stream<res_T> &res) {
        broadcast_stream_HxWx1<data_T, res_T, CONFIG_T>(data, res);
    }
}

template<class data_T, class res_T, typename CONFIG_T>
void transpose_2d(hls::stream<data_T> &data, hls::stream<res_T> &res) {
    typename data_T::value_type data_array[CONFIG_T::height * CONFIG_T::width];
    #pragma HLS ARRAY_PARTITION variable=data_array complete

    for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / data_T::size; i++) {
        #pragma HLS PIPELINE
        data_T in_data = data.read();
        for (int j = 0; j < data_T::size; j++) {
            data_array[i * data_T::size + j] = typename data_T::value_type(in_data[j]);
        }
    }

    for (int i = 0; i < CONFIG_T::height * CONFIG_T::width / res_T::size; i++) {
        #pragma HLS PIPELINE
        res_T out_data;
        #pragma HLS DATA_PACK variable=out_data
        for (int j = 0; j < res_T::size; j++) {
            out_data[j] = typename res_T::value_type(data_array[j * data_T::size + i]);
        }
        res.write(out_data);
    }
}
}

#endif
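A NumPy sketch (again not part of the PR) of what the streaming transpose_2d computes, assuming one input packet per row (data_T::size == width) and one output packet per column (res_T::size == height):

import numpy as np

H, W = 2, 3
x = np.arange(H * W).reshape(H, W)

# First loop: incoming packets are flattened row-major into data_array.
buf = x.reshape(-1)

# Second loop: output packet i holds out_data[j] = buf[j * W + i],
# i.e. column i of the input, so the packets together form the transpose.
out = np.stack([buf[np.arange(H) * W + i] for i in range(W)])
assert (out == x.T).all()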
48 changes: 48 additions & 0 deletions test/pytest/test_transpose_concat.py
@@ -0,0 +1,48 @@
import pytest
import hls4ml
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Permute, Concatenate, Activation

@pytest.fixture(scope='module')
def data():
    X = np.random.rand(100, 2, 3)
    return X

@pytest.fixture(scope='module')
def keras_model():
    inp = Input(shape=(2, 3), name='input_1')
    x = Permute((2, 1))(inp)
    y = Concatenate(axis=1)([x, x])
    x = Activation('relu', name='relu')(x)
    out = Concatenate(axis=1)([x, y])
    model = Model(inputs=inp, outputs=out)
    return model

# Parametrize through the fixture: pytest.mark.parametrize has no effect on
# fixtures, so the io_type variants are supplied via request.param instead.
@pytest.fixture(params=['io_parallel', 'io_stream'])
def hls_model(keras_model, request):
    io_type = request.param
    hls_config = hls4ml.utils.config_from_keras_model(keras_model,
                                                      default_precision='ap_fixed<16,3,AP_RND_CONV,AP_SAT>',
                                                      granularity='name')
    hls_config['LayerName']['relu']['Precision'] = 'ap_ufixed<17,3>'
    hls_model = hls4ml.converters.convert_from_keras_model(keras_model,
                                                           hls_config=hls_config,
                                                           io_type=io_type,
                                                           output_dir='hls4mlprj_transpose_{}'.format(io_type))

    hls_model.compile()
    return hls_model

def test_accuracy(data, keras_model, hls_model):
    X = data
    model = keras_model
    # model under test predictions and accuracy
    y_keras = model.predict(X)
    y_hls4ml = hls_model.predict(X).reshape(y_keras.shape)
    # "accuracy" of hls4ml predictions vs keras
    np.testing.assert_allclose(y_keras, y_hls4ml, rtol=0, atol=1e-04, verbose=True)
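For orientation, a quick NumPy trace (illustrative, batch size 1) of the shapes flowing through the test model:

import numpy as np

x = np.random.rand(1, 2, 3)
p = np.transpose(x, (0, 2, 1))        # Permute((2, 1)): (1, 3, 2)
y = np.concatenate([p, p], axis=1)    # first Concatenate: (1, 6, 2)
r = np.maximum(p, 0)                  # relu: (1, 3, 2)
out = np.concatenate([r, y], axis=1)  # final Concatenate: (1, 9, 2)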