diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h index f897113281..23109343dd 100755 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h @@ -130,7 +130,17 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) enum class softmax_implementation {latency=0, legacy=1, stable=2}; template -inline unsigned softmax_idx_from_real_val(const data_T x){ +inline unsigned softmax_stable_idx_from_real_val(const data_T x){ + // Number of address bits for table + static constexpr int N = ceillog2(CONFIG_T::table_size); + + // Slice the top N bits of the input + hls_register ac_int y = x.template slc(x.width-N-1); + return y.to_uint(); +} + +template +inline unsigned softmax_latency_idx_from_real_val(const data_T x){ // Number of address bits for table static constexpr int N = ceillog2(CONFIG_T::table_size); @@ -148,19 +158,12 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ // Find maximum Op_max op_max; hls_register data_T x_max = reduce>(data, op_max); - - // Calculate differences from the maximum, forcing rounding and saturation for better accuracy - hls_register ac_fixed d_xi_xmax[CONFIG_T::n_in]; - #pragma unroll - for(unsigned i = 0; i < CONFIG_T::n_in; i++) { - d_xi_xmax[i] = data[i] - x_max; - } // Calculate all the e^x's hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma unroll for(unsigned i = 0; i < CONFIG_T::n_in; i++) { - exp_res[i] = exp_table[softmax_idx_from_real_val(d_xi_xmax[i])]; + exp_res[i] = exp_table[softmax_stable_idx_from_real_val(data[i] - x_max)]; } // Explicitly sum previously calculated exponentials with an adder tree @@ -168,7 +171,7 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); // Multiply previously calculated exponetials with the reciprocal of the sum - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val(exp_sum)]; #pragma unroll for(unsigned i = 0; i < CONFIG_T::n_in; i++) { res[i] = exp_res[i] * inv_exp_sum; @@ -178,15 +181,6 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ // TODO - Improve accuracy template void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ - /* - * Note: The latency tables are equivalent to stable tables - * However, the compiler cannot include the same table twice - * Therefore, an out-of-scope exception is thrown in one of the functions - * Temporary solution - Create the same table twice in quartus_writer.py - * Long-term solution - Only create tables needed by the network; - * Currently, quartus-writer.py generates LUTs for all activations, - * Regardless if they are present in the network or not - */ #include "activation_tables/exp_table_latency.tb" #include "activation_tables/invert_table_latency.tb" @@ -194,7 +188,7 @@ void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; #pragma unroll for(unsigned i = 0; i < CONFIG_T::n_in; i++) { - exp_res[i] = exp_table_latency[softmax_idx_from_real_val(data[i])]; + exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val(data[i])]; } // Explicitly sum the results with an adder tree. @@ -202,7 +196,7 @@ void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); // Multiply previously calculated exponetials with the reciprocal of the sum - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_idx_from_real_val(exp_sum)]; + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; #pragma unroll for(unsigned i = 0; i < CONFIG_T::n_in; i++){ res[i] = exp_res[i] * inv_exp_sum; diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h index 03e0dc1e51..c5d0400006 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation_stream.h @@ -283,7 +283,7 @@ void softmax_stable(stream &data, stream &res) { hls_register typename CONFIG_T::exp_table_t exp_res[data_T::size]; #pragma unroll for(unsigned j = 0; j < data_T::size; j++) { - exp_res[j] = exp_table[softmax_idx_from_real_val(d_xi_xmax[j])]; + exp_res[j] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[j])]; } // Explicitly sum the results with an adder tree. @@ -291,7 +291,7 @@ void softmax_stable(stream &data, stream &res) { Op_add op_add; hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val(exp_sum)]; res_T out_pack; SoftmaxInvPackLoop: @@ -327,7 +327,7 @@ void softmax_latency(stream &data, stream &res){ SoftmaxExpPackLoop: #pragma unroll for(unsigned j = 0; j < data_T::size; j++) { - exp_res[j] = exp_table_latency[softmax_idx_from_real_val(in_pack[j])]; + exp_res[j] = exp_table_latency[softmax_latency_idx_from_real_val(in_pack[j])]; } // Explicitly sum the results with an adder tree. @@ -336,7 +336,7 @@ void softmax_latency(stream &data, stream &res){ hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); // Multiply previously calculated exponetials with the reciprocal of the sum - hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_idx_from_real_val(exp_sum)]; + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val(exp_sum)]; res_T out_pack; SoftmaxInvPackLoop: diff --git a/hls4ml/utils/fixed_point_utils.py b/hls4ml/utils/fixed_point_utils.py index 0060cc5360..d383dfda99 100644 --- a/hls4ml/utils/fixed_point_utils.py +++ b/hls4ml/utils/fixed_point_utils.py @@ -65,7 +65,7 @@ def set_msb_bits(self, bits): for i in range(0, len(bits)): if i < self.I: self.integer_bits[i] = bits[i] - elif i >= self.I and i= self.I and i