@@ -130,7 +130,17 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in])
130
130
enum class softmax_implementation {latency=0 , legacy=1 , stable=2 };
131
131
132
132
template <class data_T , typename CONFIG_T>
133
- inline unsigned softmax_idx_from_real_val (const data_T x){
133
+ inline unsigned softmax_stable_idx_from_real_val (const data_T x){
134
+ // Converts x into an unsigned lookup-table index. N is the number of table address bits (ceillog2 is defined elsewhere; presumably ceil(log2(table_size)) - TODO confirm)
135
+ static constexpr int N = ceillog2 (CONFIG_T::table_size);
136
+
137
+ // Take N bits of x starting at bit index (x.width - N - 1), i.e. bits [width-N-1, width-2]; the most significant bit (the sign bit when data_T is signed) is left out. Assumes x.width > N - TODO confirm
138
+ hls_register ac_int<N, false > y = x.template slc <N>(x.width -N-1 );
139
+ return y.to_uint ();
140
+ }
141
+
142
+ template <class data_T , typename CONFIG_T>
143
+ inline unsigned softmax_latency_idx_from_real_val (const data_T x){
134
144
// Number of address bits for table
135
145
static constexpr int N = ceillog2 (CONFIG_T::table_size);
136
146
@@ -148,27 +158,20 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
148
158
// Find maximum
149
159
Op_max<data_T> op_max;
150
160
hls_register data_T x_max = reduce<data_T, CONFIG_T::n_in, Op_max<data_T>>(data, op_max);
151
-
152
- // Calculate differences from the maximum, forcing rounding and saturation for better accuracy
153
- hls_register ac_fixed<data_T::width, data_T::i_width, true , AC_RND, AC_SAT> d_xi_xmax[CONFIG_T::n_in];
154
- #pragma unroll
155
- for (unsigned i = 0 ; i < CONFIG_T::n_in; i++) {
156
- d_xi_xmax[i] = data[i] - x_max;
157
- }
158
161
159
162
// Calculate all the e^x's
160
163
hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
161
164
#pragma unroll
162
165
for (unsigned i = 0 ; i < CONFIG_T::n_in; i++) {
163
- exp_res[i] = exp_table[softmax_idx_from_real_val <data_T, CONFIG_T>(d_xi_xmax [i])];
166
+ exp_res[i] = exp_table[softmax_stable_idx_from_real_val <data_T, CONFIG_T>(data [i] - x_max )];
164
167
}
165
168
166
169
// Explicitly sum previously calculated exponentials with an adder tree
167
170
Op_add<typename CONFIG_T::exp_table_t > op_add;
168
171
hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t , CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t >>(exp_res, op_add);
169
172
170
173
// Multiply previously calculated exponentials with the reciprocal of the sum
171
- hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val <typename CONFIG_T::exp_table_t ,CONFIG_T>(exp_sum)];
174
+ hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val <typename CONFIG_T::exp_table_t ,CONFIG_T>(exp_sum)];
172
175
#pragma unroll
173
176
for (unsigned i = 0 ; i < CONFIG_T::n_in; i++) {
174
177
res[i] = exp_res[i] * inv_exp_sum;
@@ -178,31 +181,22 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
178
181
// TODO - Improve accuracy
179
182
template <class data_T , class res_T , typename CONFIG_T>
180
183
void softmax_latency (data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
181
- /*
182
- * Note: The latency tables are equivalent to stable tables
183
- * However, the compiler cannot include the same table twice
184
- * Therefore, an out-of-scope exception is thrown in one of the functions
185
- * Temporary solution - Create the same table twice in quartus_writer.py
186
- * Long-term solution - Only create tables needed by the network;
187
- * Currently, quartus-writer.py generates LUTs for all activations,
188
- * Regardless if they are present in the network or not
189
- */
190
184
#include " activation_tables/exp_table_latency.tb"
191
185
#include " activation_tables/invert_table_latency.tb"
192
186
193
187
// Calculate all the e^x's
194
188
hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
195
189
#pragma unroll
196
190
for (unsigned i = 0 ; i < CONFIG_T::n_in; i++) {
197
- exp_res[i] = exp_table_latency[softmax_idx_from_real_val <data_T, CONFIG_T>(data[i])];
191
+ exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val <data_T, CONFIG_T>(data[i])];
198
192
}
199
193
200
194
// Explicitly sum the results with an adder tree.
201
195
Op_add<typename CONFIG_T::exp_table_t > op_add;
202
196
hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t , CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t >>(exp_res, op_add);
203
197
204
198
// Multiply previously calculated exponentials with the reciprocal of the sum
205
- hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_idx_from_real_val <typename CONFIG_T::exp_table_t ,CONFIG_T>(exp_sum)];
199
+ hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val <typename CONFIG_T::exp_table_t ,CONFIG_T>(exp_sum)];
206
200
#pragma unroll
207
201
for (unsigned i = 0 ; i < CONFIG_T::n_in; i++){
208
202
res[i] = exp_res[i] * inv_exp_sum;
0 commit comments