Quartus Streaming Conv, Pooling & Image layers #656

Merged: 20 commits, Nov 14, 2022
4 changes: 2 additions & 2 deletions hls4ml/backends/quartus/passes/convolution_templates.py
@@ -59,7 +59,7 @@
"""

conv1d_function_template = 'nnet::conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
conv1d_include_list = ['nnet_utils/nnet_conv1d.h']
conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h']

class Conv1DConfigTemplate(LayerConfigTemplate):
def __init__(self):
@@ -134,7 +134,7 @@ def format(self, node):
}};\n"""

conv2d_function_template = 'nnet::conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
conv2d_include_list = ['nnet_utils/nnet_conv2d.h']
conv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_conv2d_stream.h']

class Conv2DConfigTemplate(LayerConfigTemplate):
def __init__(self):
8 changes: 5 additions & 3 deletions hls4ml/backends/quartus/passes/convolution_winograd.py
@@ -15,7 +15,9 @@ def match(self, node):
weights_transformed = node.get_attr('_weights_transposed', False) == True

# User opted for Winograd
implementation_is_winograd = node.get_attr('implementation', 'combination') == 'combination' or node.get_attr('implementation', 'combination') == 'winograd'

parallel_io_type = node.model.config.get_config_value('IOType') == 'io_parallel'

# Winograd algorithm-specific conditions
if isinstance(node, Conv1D):
@@ -29,7 +31,7 @@
# HLS Compiler fails to pipeline the entire component if Winograd loop only executes once
loop_itr_gt_one = node.get_attr('out_width') > 2

winograd_conditions = filter_size_matches and stride_is_one and loop_itr_gt_one
winograd_conditions = filter_size_matches and stride_is_one and loop_itr_gt_one and parallel_io_type

elif isinstance(node, (Conv2D)):
# Winograd only applies to specific kernel sizes
@@ -44,7 +46,7 @@

padding_is_equal = node.get_attr('pad_top', 0) == node.get_attr('pad_bottom', 0) and node.get_attr('pad_left', 0) == node.get_attr('pad_right', 0)

winograd_conditions = filter_size_matches and stride_is_one and padding_is_equal and loop_itr_gt_one
winograd_conditions = filter_size_matches and stride_is_one and padding_is_equal and loop_itr_gt_one and parallel_io_type

else:
winograd_conditions = False
7 changes: 3 additions & 4 deletions hls4ml/backends/quartus/passes/pointwise.py
@@ -58,16 +58,15 @@ class OptimizePointwiseConv(OptimizerPass):
def match(self, node):
return node.class_name in ('Conv1D', 'Conv2D') and \
node.get_attr('filt_height', 1) == 1 and \
node.get_attr('filt_width') == 1
node.get_attr('filt_width') == 1 and \
node.model.config.get_config_value('IOType') == 'io_parallel'

def transform(self, model, node):
dim = node.__class__.__name__[-2:] # '1D' or '2D'
pw_node = model.make_node('PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy(), outputs=node.outputs.copy())
if len(node.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D
pw_node.weights['weight'].data = np.expand_dims(node.weights['weight'].data, axis=(0,1))
pw_node.weights['bias'].data = node.weights['bias'].data
# pw_node.weights['bias'].data = node.weights['bias'].data
print("Here")
model.replace_node(node, pw_node)

return True
13 changes: 12 additions & 1 deletion hls4ml/backends/quartus/passes/pooling_templates.py
@@ -9,13 +9,18 @@

static const unsigned n_in = {n_in};
static const unsigned n_out = {n_out};
static const unsigned filt_width = {pool_width};

static const unsigned n_filt = {n_filt};
static const unsigned n_chan = {n_filt};

static const unsigned in_width = {n_in};

static const unsigned pad_left = {pad_left};
static const unsigned pad_right = {pad_right};

static const nnet::Pool_Op pool_op = nnet::{pool_op};
typedef {accum_t.name} accum_t;
}};\n"""

pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{
@@ -24,41 +29,47 @@

static const unsigned pool_height = {pool_height};
static const unsigned pool_width = {pool_width};
static const unsigned filt_height = {pool_height};
static const unsigned filt_width = {pool_width};

static const unsigned in_height = {in_height};
static const unsigned in_width = {in_width};
static const unsigned out_height = {out_height};
static const unsigned out_width = {out_width};

static const unsigned n_filt = {n_filt};
static const unsigned n_chan = {n_filt};

static const unsigned pad_top = {pad_top};
static const unsigned pad_bottom = {pad_bottom};
static const unsigned pad_left = {pad_left};
static const unsigned pad_right = {pad_right};

static const nnet::Pool_Op pool_op = nnet::{pool_op};
typedef {accum_t.name} accum_t;
}};\n"""

global_pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{
static const unsigned n_in = {n_in};
static const unsigned n_filt = {n_filt};
static const nnet::Pool_Op pool_op = nnet::{pool_op};
typedef {accum_t.name} accum_t;
}};\n"""

global_pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{
static const unsigned in_height = {in_height};
static const unsigned in_width = {in_width};
static const unsigned n_filt = {n_filt};
static const nnet::Pool_Op pool_op = nnet::{pool_op};
typedef {accum_t.name} accum_t;
}};\n"""

pooling1d_function_template = 'nnet::pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});'
pooling2d_function_template = 'nnet::pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});'
global_pooling1d_function_template = 'nnet::global_pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});'
global_pooling2d_function_template = 'nnet::global_pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});'

pooling_include_list = ['nnet_utils/nnet_pooling.h']
pooling_include_list = ['nnet_utils/nnet_pooling.h', 'nnet_utils/nnet_pooling_stream.h']

class PoolingConfigTemplate(LayerConfigTemplate):
def __init__(self):
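For reference, a config rendered from the pooling1d template above might look like the following. The layer index, widths and precision are made-up values, and nnet::pooling1d_config, nnet::Pool_Op and ac_fixed come from the existing nnet_utils headers; this is only a sketch of a generated parameters.h entry, not output copied from the tool.

// Hypothetical rendering of pooling1d_config_template: max pooling, 64 -> 32
// elements, pool/filter width 2, 16 channels, no padding (illustrative values)
struct config4 : nnet::pooling1d_config {
    static const unsigned n_in = 64;
    static const unsigned n_out = 32;
    static const unsigned filt_width = 2;

    static const unsigned n_filt = 16;
    static const unsigned n_chan = 16;

    static const unsigned in_width = 64;

    static const unsigned pad_left = 0;
    static const unsigned pad_right = 0;

    static const nnet::Pool_Op pool_op = nnet::Max;
    typedef ac_fixed<18, 8, true> accum_t;
};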
6 changes: 3 additions & 3 deletions hls4ml/backends/quartus/passes/reshaping_templates.py
@@ -28,7 +28,7 @@
zeropad1d_function_template = 'nnet::zeropad1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});'
zeropad2d_function_template = 'nnet::zeropad2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});'

padding_include_list = ['nnet_utils/nnet_padding.h']
padding_include_list = ['nnet_utils/nnet_padding.h', 'nnet_utils/nnet_padding_stream.h']

class ZeroPaddingConfigTemplate(LayerConfigTemplate):
def __init__(self):
@@ -72,7 +72,7 @@ def format(self, node):
}};\n"""

resize_function_template = 'nnet::resize_{algorithm}<{input_t}, {config}>({input}, {output});'
resize_include_list = ['nnet_utils/nnet_resize.h']
resize_include_list = ['nnet_utils/nnet_resize.h', 'nnet_utils/nnet_resize_stream.h']

class ResizeConfigTemplate(LayerConfigTemplate):
def __init__(self):
@@ -108,7 +108,7 @@ def format(self, node):
}};\n"""

transpose_function_template = 'nnet::transpose_{dim}<{input_t}, {output_t}, {config}>({input}, {output});'
transpose_include_list = ['nnet_utils/nnet_transpose.h']
transpose_include_list = ['nnet_utils/nnet_transpose.h', 'nnet_utils/nnet_transpose_stream.h']

class TransposeConfigTemplate(LayerConfigTemplate):
def __init__(self):
1 change: 1 addition & 0 deletions hls4ml/templates/quartus/firmware/defines.h
@@ -50,5 +50,6 @@ using stream_out = ihc::stream_out<T>;

#define DIV_ROUNDUP(n,d) ((n + d - 1) / d)
#define MIN(n,d) (n > d ? d : n)
#define MAX(n,d) (n < d ? d : n)

#endif
15 changes: 12 additions & 3 deletions hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h
@@ -131,10 +131,12 @@ enum class softmax_implementation {latency=0, legacy=1, stable=2, argmax=3};
template<class data_T, typename CONFIG_T>
inline unsigned softmax_stable_idx_from_real_val(const data_T x){
// Number of address bits for table
static constexpr int N = ceillog2(CONFIG_T::table_size);

// Slice the top N bits of the input
hls_register ac_int<N, false> y = x.template slc<N>(x.width-N-1);
// If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
if (x != 0 && y == 0) y[0] = 1;
return y.to_uint();
}
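
As a plain-integer sketch of what the slice above computes (the real code works on an ac_fixed value and N = ceillog2(table_size) address bits; the width and input pattern below are illustrative):

#include <cstdint>
#include <iostream>

int main() {
    constexpr int width = 18;   // stand-in for data_T::width
    constexpr int N = 10;       // stand-in for ceillog2(table_size), table_size = 1024

    int32_t raw = -37;          // raw two's-complement bit pattern of the input

    // Take the N bits just below the sign bit, i.e. bits [width-2 : width-N-1],
    // mirroring x.template slc<N>(x.width - N - 1)
    uint32_t idx = (static_cast<uint32_t>(raw) >> (width - N - 1)) & ((1u << N) - 1);

    // The most negative input slices to 0; force a non-zero index as in the fix above
    if (raw != 0 && idx == 0) idx |= 1u;

    std::cout << "table index = " << idx << std::endl;   // 1023 for this pattern
    return 0;
}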

@@ -158,11 +160,18 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
Op_max<data_T> op_max;
hls_register data_T x_max = reduce<data_T, CONFIG_T::n_in, Op_max<data_T>>(data, op_max);

// For the diffs, use the same type as the input but force rounding and saturation
hls_register ac_fixed<data_T::width, data_T::i_width, true, AC_RND, AC_SAT> d_xi_xmax[CONFIG_T::n_in];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++){
d_xi_xmax[i] = data[i] - x_max;
}

// Calculate all the e^x's
hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
exp_res[i] = exp_table[softmax_stable_idx_from_real_val<data_T, CONFIG_T>(data[i] - x_max)];
exp_res[i] = exp_table[softmax_stable_idx_from_real_val<data_T, CONFIG_T>(d_xi_xmax[i])];
}

// Explicitly sum previously calculated exponentials with an adder tree
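A standalone check of why the differences are buffered with AC_RND and AC_SAT rather than the default truncate-and-wrap behaviour; the widths and values are illustrative, and the include assumes the Algorithmic C datatype headers (or HLS/ac_fixed.h with the Intel HLS compiler) are on the path.

#include <ac_fixed.h>   // assumption: open-source AC datatypes; "HLS/ac_fixed.h" under i++
#include <iostream>

int main() {
    // 10 bits total, 4 integer bits: representable range is roughly [-8, 8)
    typedef ac_fixed<10, 4, true> wrap_t;                  // default AC_TRN, AC_WRAP
    typedef ac_fixed<10, 4, true, AC_RND, AC_SAT> sat_t;   // modes used for d_xi_xmax

    double x = -7.9, x_max = 3.2;   // x - x_max = -11.1, outside the representable range
    wrap_t w = x - x_max;           // wraps around to a positive value
    sat_t  s = x - x_max;           // clamps to the most negative representable value

    std::cout << w.to_double() << " vs " << s.to_double() << std::endl;
    return 0;
}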
178 changes: 178 additions & 0 deletions hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d_stream.h
@@ -0,0 +1,178 @@
#ifndef NNET_CONV1D_STREAM_H_
#define NNET_CONV1D_STREAM_H_

#include "nnet_types.h"
#include "nnet_dense.h"

namespace nnet {

/*
* void kernel_shift(shift_buffer, kernel_window)
*
* Args:
* shift_buffer - array of elements popped from the line buffer during the shift line buffer operation
* kernel_window - array of values from the input currently being convolved with the kernel
*
* Values from shift_buffer are inserted into kernel_window, updating the values to be convolved
*/
template <class data_T, typename CONFIG_T>
void kernel_shift_1d(
typename data_T::value_type shift_buffer[CONFIG_T::n_chan],
typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan]
) {
/*
* Manually shift kernel_window by one step to the left
* Not possible to use nnet::shift_reg<T, N> as the kernel window is convolved with the kernel weights using dense matrix multiplication
* Dense matrix multiplication is only implemented for arrays
* However, provided certain timing constraints are met, Intel HLS automatically infers a shift operation and implements kernel_window as a shift register
* To verify, see synthesis report in report.html > Area Analysis of System
*/
KernelShiftWidth:
#pragma unroll
for (int col = 0; col < CONFIG_T::filt_width - 1; col++) {
KernelShiftChannel:
#pragma unroll
for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
kernel_window[col * CONFIG_T::n_chan + channel] = kernel_window[(col + 1) * CONFIG_T::n_chan + channel];
}
}

// Insert shift_buffer values into the last column of the kernel window
KernelPushChannel:
#pragma unroll
for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
kernel_window[(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + channel] = shift_buffer[channel];
}
}
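
A plain-array sketch of the window update above, with the HLS attributes stripped and filt_width = 3, n_chan = 2 chosen for illustration; it shows the channel-interleaved layout of kernel_window before and after one shift.

#include <iostream>

int main() {
    constexpr int filt_width = 3, n_chan = 2;

    // Window layout: [col0 ch0, col0 ch1, col1 ch0, col1 ch1, col2 ch0, col2 ch1]
    float kernel_window[filt_width * n_chan] = {0, 1, 2, 3, 4, 5};
    float shift_buffer[n_chan] = {6, 7};   // newest pixel, one value per channel

    // Shift every column one step to the left
    for (int col = 0; col < filt_width - 1; col++)
        for (int ch = 0; ch < n_chan; ch++)
            kernel_window[col * n_chan + ch] = kernel_window[(col + 1) * n_chan + ch];

    // Insert the new pixel into the last column
    for (int ch = 0; ch < n_chan; ch++)
        kernel_window[(filt_width - 1) * n_chan + ch] = shift_buffer[ch];

    for (int i = 0; i < filt_width * n_chan; i++)
        std::cout << kernel_window[i] << " ";   // prints: 2 3 4 5 6 7
    std::cout << std::endl;
    return 0;
}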

/*
* void shift_line_buffer(in_element, line_buffer, shift_buffer)
*
* Args:
* in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number of channels
* line_buffer - chained array of shift registers, one for each row of the kernel and channel
* shift_buffer - array of elements popped from the line buffer during the shift operation
*
* Values from in_element are inserted into the line buffer, causing all other elements to be shifted by one
* Popped elements are later used to update the kernel window, during the kernel_shift operation
*/
template <class data_T, typename CONFIG_T>
void shift_line_buffer_1d(
const data_T &in_elem,
nnet::shift_reg<typename data_T::value_type, CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right> line_buffer[CONFIG_T::n_chan],
typename data_T::value_type shift_buffer[CONFIG_T::n_chan]
) {
// For every channel, insert the incoming pixel at end of the shift buffer
UpdateBuffer:
#pragma unroll
for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
shift_buffer[channel] = in_elem[channel];
}
}

/*
* void compute_output_buffer(in_element, res_stream, line_buffer, kernel_window, weights, biases)
*
* Args:
* in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number of channels
* res_stream - output stream, passed by reference to allow direct writing
* line_buffer - chained array of shift registers, one for each row of the kernel and channel
* kernel_window - array of values from the input currently being convolved with the kernel
* weights - Conv1D layer weights
* biases - Conv1D layer biases
*
* Function executes 4 steps:
* (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last elements
* (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from the line buffer
* (3) Matrix multiplication - performs dense matrix multiplication between the current input window and kernel weights
* (4) Counter housekeeping - keeps track of current pixel and stride
*/
template<class data_T, class res_T, typename CONFIG_T>
void compute_output_buffer_1d(
const data_T &in_elem,
stream<res_T> &res_stream,
nnet::shift_reg<typename data_T::value_type, CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right> line_buffer[CONFIG_T::n_chan],
typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan],
const typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt],
const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]
) {
// Thresholds
static constexpr int lShiftX = CONFIG_T::filt_width - 1;

// X position pixel
static int pX = 0;

// X strides
static int sX = 0;

// Step 1 - Shift line buffer
hls_register typename data_T::value_type shift_buffer[CONFIG_T::n_chan];
nnet::shift_line_buffer_1d<data_T, CONFIG_T>(in_elem, line_buffer, shift_buffer);

// Step 2 - Kernel shift
nnet::kernel_shift_1d<data_T, CONFIG_T>(shift_buffer, kernel_window);

// Check to see if we have a full kernel
if ((sX - lShiftX) == 0 && pX > (lShiftX - 1)) {
// Step 3 - Dense matrix multiplication
hls_register typename res_T::value_type res_out[CONFIG_T::n_filt];
dense_resource<typename data_T::value_type, typename res_T::value_type, typename CONFIG_T::mult_config>(kernel_window, res_out, weights, biases);

// Write result to output stream
hls_register res_T res_pack;
CastLoop:
#pragma unroll
for (int channel = 0; channel < CONFIG_T::n_filt; channel++) {
res_pack[channel] = res_out[channel];
}
res_stream.write(res_pack);
}

// Reached end of image
if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) {
pX = 0;
sX = 0;
// Move to the right
} else {
pX++;
sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1);
}
}
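
The counter housekeeping in step 4 is easiest to follow in isolation. The sketch below drops the buffers and HLS attributes and only reports the input positions at which a full window is available; filt_width = 3, stride_width = 2, in_width = 8 and zero padding are illustrative choices.

#include <iostream>

int main() {
    constexpr int filt_width = 3, stride_width = 2, in_width = 8;
    constexpr int lShiftX = filt_width - 1;
    int pX = 0, sX = 0;

    for (int col = 0; col < in_width; col++) {
        if ((sX - lShiftX) == 0 && pX > (lShiftX - 1))
            std::cout << "window complete at input position " << pX << std::endl;

        if ((pX + 1) == in_width) {   // end of the (padded) image: reset for the next one
            pX = 0; sX = 0;
        } else {
            pX++;
            sX = ((sX - lShiftX) == 0) ? (sX - stride_width + 1) : (sX + 1);
        }
    }
    // Prints positions 2, 4 and 6: three outputs, matching out_width for these parameters
    return 0;
}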


template <class data_T, class res_T, typename CONFIG_T>
void conv_1d_cl(
stream<data_T> &data,
stream<res_T> &res,
const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]
) {
// Line buffer and kernel window
hls_register static nnet::shift_reg<typename data_T::value_type, CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right> line_buffer[CONFIG_T::n_chan];
hls_register static typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan];

// An array of length CONFIG_T::n_chan, with elements set to zero (padding for each channel)
static const data_T padds(0);

// Input image left-side padding
PaddingLeftWidth:
for (int col = 0; col < CONFIG_T::pad_left; col++) {
compute_output_buffer_1d<data_T, res_T, CONFIG_T>(padds, res, line_buffer, kernel_window, weights, biases);
}

// Read input image
ReadInputWidth:
for (int col = 0; col < CONFIG_T::in_width; col++) {
compute_output_buffer_1d<data_T, res_T, CONFIG_T>(data.read(), res, line_buffer, kernel_window, weights, biases);
}

// Input image right-side padding
PaddingRightWidth:
for (int col = 0; col < CONFIG_T::pad_right; col++) {
compute_output_buffer_1d<data_T, res_T, CONFIG_T>(padds, res, line_buffer, kernel_window, weights, biases);
}
}

}

#endif