Quartus Streaming Conv, Pooling & Image layers #656

Merged: 20 commits, Nov 14, 2022
4 changes: 2 additions & 2 deletions hls4ml/backends/quartus/passes/convolution_templates.py
@@ -59,7 +59,7 @@
"""

conv1d_function_template = 'nnet::conv_1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
conv1d_include_list = ['nnet_utils/nnet_conv1d.h']
conv1d_include_list = ['nnet_utils/nnet_conv1d.h', 'nnet_utils/nnet_conv1d_stream.h']

class Conv1DConfigTemplate(LayerConfigTemplate):
def __init__(self):
@@ -134,7 +134,7 @@ def format(self, node):
}};\n"""

conv2d_function_template = 'nnet::conv_2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
conv2d_include_list = ['nnet_utils/nnet_conv2d.h']
conv2d_include_list = ['nnet_utils/nnet_conv2d.h', 'nnet_utils/nnet_conv2d_stream.h']

class Conv2DConfigTemplate(LayerConfigTemplate):
def __init__(self):
8 changes: 5 additions & 3 deletions hls4ml/backends/quartus/passes/convolution_winograd.py
@@ -15,7 +15,9 @@ def match(self, node):
weights_transformed = node.get_attr('_weights_transposed', False) == True

# User opted for Winograd
implementation_is_winograd = node.get_attr('implementation', 'combination') == 'combination' or node.get_attr('implementation', 'combination') == 'winograd'

parallel_io_type = node.model.config.get_config_value('IOType') == 'io_parallel'

# Winograd algorithm-specific conditions
if isinstance(node, Conv1D):
@@ -29,7 +31,7 @@
# HLS Compiler fails to pipeline the entire component if Winograd loop only executes once
loop_itr_gt_one = node.get_attr('out_width') > 2

winograd_conditions = filter_size_matches and stride_is_one and loop_itr_gt_one
winograd_conditions = filter_size_matches and stride_is_one and loop_itr_gt_one and parallel_io_type

elif isinstance(node, (Conv2D)):
# Winograd only applies to specific kernel sizes
@@ -44,7 +46,7 @@

padding_is_equal = node.get_attr('pad_top', 0) == node.get_attr('pad_bottom', 0) and node.get_attr('pad_left', 0) == node.get_attr('pad_right', 0)

winograd_conditions = filter_size_matches and stride_is_one and padding_is_equal and loop_itr_gt_one
winograd_conditions = filter_size_matches and stride_is_one and padding_is_equal and loop_itr_gt_one and parallel_io_type

else:
winograd_conditions = False
7 changes: 3 additions & 4 deletions hls4ml/backends/quartus/passes/pointwise.py
@@ -58,16 +58,15 @@ class OptimizePointwiseConv(OptimizerPass):
def match(self, node):
return node.class_name in ('Conv1D', 'Conv2D') and \
node.get_attr('filt_height', 1) == 1 and \
node.get_attr('filt_width') == 1
node.get_attr('filt_width') == 1 and \
node.model.config.get_config_value('IOType') == 'io_parallel'

def transform(self, model, node):
dim = node.__class__.__name__[-2:] # '1D' or '2D'
pw_node = model.make_node('PointwiseConv' + dim, node.name, copy(node.attributes), node.inputs.copy(), outputs=node.outputs.copy())
if len(node.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D
pw_node.weights['weight'].data = np.expand_dims(node.weights['weight'].data, axis=(0,1))
pw_node.weights['bias'].data = node.weights['bias'].data
# pw_node.weights['bias'].data = node.weights['bias'].data
print("Here")
model.replace_node(node, pw_node)

return True
13 changes: 12 additions & 1 deletion hls4ml/backends/quartus/passes/pooling_templates.py
@@ -9,13 +9,18 @@

static const unsigned n_in = {n_in};
static const unsigned n_out = {n_out};
static const unsigned filt_width = {pool_width};

static const unsigned n_filt = {n_filt};
static const unsigned n_chan = {n_filt};

static const unsigned in_width = {n_in};

static const unsigned pad_left = {pad_left};
static const unsigned pad_right = {pad_right};

static const nnet::Pool_Op pool_op = nnet::{pool_op};
typedef {accum_t.name} accum_t;
}};\n"""

pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{
@@ -24,41 +29,47 @@

static const unsigned pool_height = {pool_height};
static const unsigned pool_width = {pool_width};
static const unsigned filt_height = {pool_height};
static const unsigned filt_width = {pool_width};

static const unsigned in_height = {in_height};
static const unsigned in_width = {in_width};
static const unsigned out_height = {out_height};
static const unsigned out_width = {out_width};

static const unsigned n_filt = {n_filt};
static const unsigned n_chan = {n_filt};

static const unsigned pad_top = {pad_top};
static const unsigned pad_bottom = {pad_bottom};
static const unsigned pad_left = {pad_left};
static const unsigned pad_right = {pad_right};

static const nnet::Pool_Op pool_op = nnet::{pool_op};
typedef {accum_t.name} accum_t;
}};\n"""

global_pooling1d_config_template = """struct config{index} : nnet::pooling1d_config {{
static const unsigned n_in = {n_in};
static const unsigned n_filt = {n_filt};
static const nnet::Pool_Op pool_op = nnet::{pool_op};
typedef {accum_t.name} accum_t;
}};\n"""

global_pooling2d_config_template = """struct config{index} : nnet::pooling2d_config {{
static const unsigned in_height = {in_height};
static const unsigned in_width = {in_width};
static const unsigned n_filt = {n_filt};
static const nnet::Pool_Op pool_op = nnet::{pool_op};
typedef {accum_t.name} accum_t;
}};\n"""

pooling1d_function_template = 'nnet::pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});'
pooling2d_function_template = 'nnet::pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});'
global_pooling1d_function_template = 'nnet::global_pooling1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});'
global_pooling2d_function_template = 'nnet::global_pooling2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});'

pooling_include_list = ['nnet_utils/nnet_pooling.h']
pooling_include_list = ['nnet_utils/nnet_pooling.h', 'nnet_utils/nnet_pooling_stream.h']

class PoolingConfigTemplate(LayerConfigTemplate):
def __init__(self):
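For reference, a config rendered from the pooling1d template above might look like the following. The layer index, widths and precision are made-up values, and nnet::pooling1d_config, nnet::Pool_Op and ac_fixed come from the existing nnet_utils headers; this is only a sketch of a generated parameters.h entry, not output copied from the tool.

// Hypothetical rendering of pooling1d_config_template: max pooling, 64 -> 32
// elements, pool/filter width 2, 16 channels, no padding (illustrative values)
struct config4 : nnet::pooling1d_config {
    static const unsigned n_in = 64;
    static const unsigned n_out = 32;
    static const unsigned filt_width = 2;

    static const unsigned n_filt = 16;
    static const unsigned n_chan = 16;

    static const unsigned in_width = 64;

    static const unsigned pad_left = 0;
    static const unsigned pad_right = 0;

    static const nnet::Pool_Op pool_op = nnet::Max;
    typedef ac_fixed<18, 8, true> accum_t;
};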
6 changes: 3 additions & 3 deletions hls4ml/backends/quartus/passes/reshaping_templates.py
@@ -28,7 +28,7 @@
zeropad1d_function_template = 'nnet::zeropad1d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});'
zeropad2d_function_template = 'nnet::zeropad2d_{data_format}<{input_t}, {output_t}, {config}>({input}, {output});'

padding_include_list = ['nnet_utils/nnet_padding.h']
padding_include_list = ['nnet_utils/nnet_padding.h', 'nnet_utils/nnet_padding_stream.h']

class ZeroPaddingConfigTemplate(LayerConfigTemplate):
def __init__(self):
@@ -72,7 +72,7 @@ def format(self, node):
}};\n"""

resize_function_template = 'nnet::resize_{algorithm}<{input_t}, {config}>({input}, {output});'
resize_include_list = ['nnet_utils/nnet_resize.h']
resize_include_list = ['nnet_utils/nnet_resize.h', 'nnet_utils/nnet_resize_stream.h']

class ResizeConfigTemplate(LayerConfigTemplate):
def __init__(self):
@@ -108,7 +108,7 @@ def format(self, node):
}};\n"""

transpose_function_template = 'nnet::transpose_{dim}<{input_t}, {output_t}, {config}>({input}, {output});'
transpose_include_list = ['nnet_utils/nnet_transpose.h']
transpose_include_list = ['nnet_utils/nnet_transpose.h', 'nnet_utils/nnet_transpose_stream.h']

class TransposeConfigTemplate(LayerConfigTemplate):
def __init__(self):
1 change: 1 addition & 0 deletions hls4ml/templates/quartus/firmware/defines.h
@@ -50,5 +50,6 @@ using stream_out = ihc::stream_out<T>;

#define DIV_ROUNDUP(n,d) ((n + d - 1) / d)
#define MIN(n,d) (n > d ? d : n)
#define MAX(n,d) (n < d ? d : n)

#endif
15 changes: 12 additions & 3 deletions hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h
@@ -131,10 +131,12 @@ enum class softmax_implementation {latency=0, legacy=1, stable=2, argmax=3};
template<class data_T, typename CONFIG_T>
inline unsigned softmax_stable_idx_from_real_val(const data_T x){
// Number of address bits for table
static constexpr int N = ceillog2(CONFIG_T::table_size);

// Slice the top N bits of the input
hls_register ac_int<N, false> y = x.template slc<N>(x.width-N-1);
// If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
if (x != 0 && y == 0) y[0] = 1;
return y.to_uint();
}
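
As a plain-integer sketch of what the slice above computes (the real code works on an ac_fixed value and N = ceillog2(table_size) address bits; the width and input pattern below are illustrative):

#include <cstdint>
#include <iostream>

int main() {
    constexpr int width = 18;   // stand-in for data_T::width
    constexpr int N = 10;       // stand-in for ceillog2(table_size), table_size = 1024

    int32_t raw = -37;          // raw two's-complement bit pattern of the input

    // Take the N bits just below the sign bit, i.e. bits [width-2 : width-N-1],
    // mirroring x.template slc<N>(x.width - N - 1)
    uint32_t idx = (static_cast<uint32_t>(raw) >> (width - N - 1)) & ((1u << N) - 1);

    // The most negative input slices to 0; force a non-zero index as in the fix above
    if (raw != 0 && idx == 0) idx |= 1u;

    std::cout << "table index = " << idx << std::endl;   // 1023 for this pattern
    return 0;
}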

@@ -158,11 +160,18 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
Op_max<data_T> op_max;
hls_register data_T x_max = reduce<data_T, CONFIG_T::n_in, Op_max<data_T>>(data, op_max);

// For the diffs, use the same type as the input but force rounding and saturation
hls_register ac_fixed<data_T::width, data_T::i_width, true, AC_RND, AC_SAT> d_xi_xmax[CONFIG_T::n_in];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++){
d_xi_xmax[i] = data[i] - x_max;
}

// Calculate all the e^x's
hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
exp_res[i] = exp_table[softmax_stable_idx_from_real_val<data_T, CONFIG_T>(data[i] - x_max)];
exp_res[i] = exp_table[softmax_stable_idx_from_real_val<data_T, CONFIG_T>(d_xi_xmax[i])];
}

// Explicitly sum previously calculated exponentials with an adder tree
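A standalone check of why the differences are buffered with AC_RND and AC_SAT rather than the default truncate-and-wrap behaviour; the widths and values are illustrative, and the include assumes the Algorithmic C datatype headers (or HLS/ac_fixed.h with the Intel HLS compiler) are on the path.

#include <ac_fixed.h>   // assumption: open-source AC datatypes; "HLS/ac_fixed.h" under i++
#include <iostream>

int main() {
    // 10 bits total, 4 integer bits: representable range is roughly [-8, 8)
    typedef ac_fixed<10, 4, true> wrap_t;                  // default AC_TRN, AC_WRAP
    typedef ac_fixed<10, 4, true, AC_RND, AC_SAT> sat_t;   // modes used for d_xi_xmax

    double x = -7.9, x_max = 3.2;   // x - x_max = -11.1, outside the representable range
    wrap_t w = x - x_max;           // wraps around to a positive value
    sat_t  s = x - x_max;           // clamps to the most negative representable value

    std::cout << w.to_double() << " vs " << s.to_double() << std::endl;
    return 0;
}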
178 changes: 178 additions & 0 deletions hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d_stream.h
@@ -0,0 +1,178 @@
#ifndef NNET_CONV1D_STREAM_H_
#define NNET_CONV1D_STREAM_H_

#include "nnet_types.h"
#include "nnet_dense.h"

namespace nnet {

/*
* void kernel_shift(shift_buffer, kernel_window)
*
* Args:
* shift_buffer - array of elements popped from the line buffer during the shift line buffer operation
* kernel_window - array of values from the input currently being convolved with the kernel
*
* Values from shift_buffer are inserted into kernel_window, updating the values to be convolved
*/
template <class data_T, typename CONFIG_T>
void kernel_shift_1d(
typename data_T::value_type shift_buffer[CONFIG_T::n_chan],
typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan]
) {
/*
* Manually shift kernel_window by one step to the left
* Not possible to use nnet::shift_reg<T, N> as the kernel window is convolved with the kernel weights using dense matrix multiplication
* Dense matrix multiplication is only implemented for arrays
* However, provided certain timing constraints are met, Intel HLS automatically infers a shift operation and implements kernel_window as a shift register
* To verify, see synthesis report in report.html > Area Analysis of System
*/
KernelShiftWidth:
#pragma unroll
for (int col = 0; col < CONFIG_T::filt_width - 1; col++) {
KernelShiftChannel:
#pragma unroll
for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
kernel_window[col * CONFIG_T::n_chan + channel] = kernel_window[(col + 1) * CONFIG_T::n_chan + channel];
}
}

// Insert shift_buffer values into the last column of the kernel window
KernelPushChannel:
#pragma unroll
for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
kernel_window[(CONFIG_T::filt_width - 1) * CONFIG_T::n_chan + channel] = shift_buffer[channel];
}
}
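
A plain-array sketch of the window update above, with the HLS attributes stripped and filt_width = 3, n_chan = 2 chosen for illustration; it shows the channel-interleaved layout of kernel_window before and after one shift.

#include <iostream>

int main() {
    constexpr int filt_width = 3, n_chan = 2;

    // Window layout: [col0 ch0, col0 ch1, col1 ch0, col1 ch1, col2 ch0, col2 ch1]
    float kernel_window[filt_width * n_chan] = {0, 1, 2, 3, 4, 5};
    float shift_buffer[n_chan] = {6, 7};   // newest pixel, one value per channel

    // Shift every column one step to the left
    for (int col = 0; col < filt_width - 1; col++)
        for (int ch = 0; ch < n_chan; ch++)
            kernel_window[col * n_chan + ch] = kernel_window[(col + 1) * n_chan + ch];

    // Insert the new pixel into the last column
    for (int ch = 0; ch < n_chan; ch++)
        kernel_window[(filt_width - 1) * n_chan + ch] = shift_buffer[ch];

    for (int i = 0; i < filt_width * n_chan; i++)
        std::cout << kernel_window[i] << " ";   // prints: 2 3 4 5 6 7
    std::cout << std::endl;
    return 0;
}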

/*
* void shift_line_buffer(in_element, line_buffer, shift_buffer)
*
* Args:
* in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number of channels
* line_buffer - chained array of shift registers, one for each row of the kernel and channel
* shift_buffer - array of elements popped from the line buffer during the shift operation
*
* Values from in_element are inserted into the line buffer, causing all other elements to be shifted by one
* Popped elements are later used to update the kernel window, during the kernel_shift operation
*/
template <class data_T, typename CONFIG_T>
void shift_line_buffer_1d(
const data_T &in_elem,
nnet::shift_reg<typename data_T::value_type, CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right> line_buffer[CONFIG_T::n_chan],
typename data_T::value_type shift_buffer[CONFIG_T::n_chan]
) {
// For every channel, insert the incoming pixel at end of the shift buffer
UpdateBuffer:
#pragma unroll
for (int channel = 0; channel < CONFIG_T::n_chan; channel++) {
shift_buffer[channel] = in_elem[channel];
}
}

/*
* void compute_output_buffer(in_element, res_stream, line_buffer, kernel_window, weights, biases)
*
* Args:
* in_element - current elements from input image, data_T type is usually nnet::array, size of array corresponds to number of channels
* res_stream - output stream, passed by reference to allow direct writing
* line_buffer - chained array of shift registers, one for each row of the kernel and channel
* kernel_window - array of values from the input currently being convolved with the kernel
* weights - Conv1D layer weights
* biases - Conv1D layer biases
*
* Function executes 4 steps:
* (1) Shift line buffer - updates the contents of the chained shift registers, inserting the new inputs and removing last elements
* (2) Kernel shift - updates the elements of the kernel window, by storing the new inputs and popped elements from the line buffer
* (3) Matrix multiplication - performs dense matrix multiplication between the current input window and kernel weights
* (4) Counter housekeeping - keeps track of current pixel and stride
*/
template<class data_T, class res_T, typename CONFIG_T>
void compute_output_buffer_1d(
const data_T &in_elem,
stream<res_T> &res_stream,
nnet::shift_reg<typename data_T::value_type, CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right> line_buffer[CONFIG_T::n_chan],
typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan],
const typename CONFIG_T::weight_t weights[CONFIG_T::kernel_size * CONFIG_T::n_chan * CONFIG_T::n_filt],
const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]
) {
// Thresholds
static constexpr int lShiftX = CONFIG_T::filt_width - 1;

// X position pixel
static int pX = 0;

// X strides
static int sX = 0;

// Step 1 - Shift line buffer
hls_register typename data_T::value_type shift_buffer[CONFIG_T::n_chan];
nnet::shift_line_buffer_1d<data_T, CONFIG_T>(in_elem, line_buffer, shift_buffer);

// Step 2 - Kernel shift
nnet::kernel_shift_1d<data_T, CONFIG_T>(shift_buffer, kernel_window);

// Check to see if we have a full kernel
if ((sX - lShiftX) == 0 && pX > (lShiftX - 1)) {
// Step 3 - Dense matrix multiplication
hls_register typename res_T::value_type res_out[CONFIG_T::n_filt];
dense_resource<typename data_T::value_type, typename res_T::value_type, typename CONFIG_T::mult_config>(kernel_window, res_out, weights, biases);

// Write result to output stream
hls_register res_T res_pack;
CastLoop:
#pragma unroll
for (int channel = 0; channel < CONFIG_T::n_filt; channel++) {
res_pack[channel] = res_out[channel];
}
res_stream.write(res_pack);
}

// Reached end of image
if ((pX + 1) == (CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right)) {
pX = 0;
sX = 0;
// Move to the right
} else {
pX++;
sX = ((sX - lShiftX) == 0) ? (sX - CONFIG_T::stride_width + 1) : (sX + 1);
}
}
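
The counter housekeeping in step 4 is easiest to follow in isolation. The sketch below drops the buffers and HLS attributes and only reports the input positions at which a full window is available; filt_width = 3, stride_width = 2, in_width = 8 and zero padding are illustrative choices.

#include <iostream>

int main() {
    constexpr int filt_width = 3, stride_width = 2, in_width = 8;
    constexpr int lShiftX = filt_width - 1;
    int pX = 0, sX = 0;

    for (int col = 0; col < in_width; col++) {
        if ((sX - lShiftX) == 0 && pX > (lShiftX - 1))
            std::cout << "window complete at input position " << pX << std::endl;

        if ((pX + 1) == in_width) {   // end of the (padded) image: reset for the next one
            pX = 0; sX = 0;
        } else {
            pX++;
            sX = ((sX - lShiftX) == 0) ? (sX - stride_width + 1) : (sX + 1);
        }
    }
    // Prints positions 2, 4 and 6: three outputs, matching out_width for these parameters
    return 0;
}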


template <class data_T, class res_T, typename CONFIG_T>
void conv_1d_cl(
stream<data_T> &data,
stream<res_T> &res,
const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]
) {
// Line buffer and kernel window
hls_register static nnet::shift_reg<typename data_T::value_type, CONFIG_T::pad_left + CONFIG_T::in_width + CONFIG_T::pad_right> line_buffer[CONFIG_T::n_chan];
hls_register static typename data_T::value_type kernel_window[CONFIG_T::filt_width * CONFIG_T::n_chan];

// An array of length CONFIG_T::n_chan, with elements set to zero (padding for each channel)
static const data_T padds(0);

// Input image left-side padding
PaddingLeftWidth:
for (int col = 0; col < CONFIG_T::pad_left; col++) {
compute_output_buffer_1d<data_T, res_T, CONFIG_T>(padds, res, line_buffer, kernel_window, weights, biases);
}

// Read input image
ReadInputWidth:
for (int col = 0; col < CONFIG_T::in_width; col++) {
compute_output_buffer_1d<data_T, res_T, CONFIG_T>(data.read(), res, line_buffer, kernel_window, weights, biases);
}

// Input image right-side padding
PaddingRightWidth:
for (int col = 0; col < CONFIG_T::pad_right; col++) {
compute_output_buffer_1d<data_T, res_T, CONFIG_T>(padds, res, line_buffer, kernel_window, weights, biases);
}
}

}

#endif