@@ -130,7 +130,18 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in])
130
130
enum class softmax_implementation {latency=0 , legacy=1 , stable=2 };
131
131
132
132
// Converts a fixed-point value x into an unsigned lookup-table address; softmax_stable uses the result to index exp_table and invert_table.
template <class data_T , typename CONFIG_T>
133
- inline unsigned softmax_idx_from_real_val (const data_T x){
133
+ inline unsigned softmax_stable_idx_from_real_val (const data_T x){
134
+ // Number of address bits for the table, derived from CONFIG_T::table_size (presumably ceil(log2(table_size)) - confirm against ceillog2)
135
+ static constexpr int N = ceillog2 (CONFIG_T::table_size);
136
+
137
+ // Slice N bits starting at bit index (x.width - N - 1), i.e. the N bits just below the MSB; the MSB itself is excluded (NOTE(review): presumably the sign bit - confirm data_T is signed)
138
+ hls_register ac_int<N, false > y = x.template slc <N>(x.width -N-1 );
139
+ return y.to_uint ();
140
+ }
141
+
142
+
143
+ template <class data_T , typename CONFIG_T>
144
+ inline unsigned softmax_latency_idx_from_real_val (const data_T x){
134
145
// Number of address bits for table
135
146
static constexpr int N = ceillog2 (CONFIG_T::table_size);
136
147
@@ -148,27 +159,20 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
148
159
// Find maximum
149
160
Op_max<data_T> op_max;
150
161
hls_register data_T x_max = reduce<data_T, CONFIG_T::n_in, Op_max<data_T>>(data, op_max);
151
-
152
- // Calculate differences from the maximum, forcing rounding and saturation for better accuracy
153
- hls_register ac_fixed<data_T::width, data_T::i_width, true , AC_RND, AC_SAT> d_xi_xmax[CONFIG_T::n_in];
154
- #pragma unroll
155
- for (unsigned i = 0 ; i < CONFIG_T::n_in; i++) {
156
- d_xi_xmax[i] = data[i] - x_max;
157
- }
158
162
159
163
// Calculate all the e^x's
160
164
hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
161
165
#pragma unroll
162
166
for (unsigned i = 0 ; i < CONFIG_T::n_in; i++) {
163
- exp_res[i] = exp_table[softmax_idx_from_real_val <data_T, CONFIG_T>(d_xi_xmax [i])];
167
+ exp_res[i] = exp_table[softmax_stable_idx_from_real_val <data_T, CONFIG_T>(data [i] - x_max )];
164
168
}
165
169
166
170
// Explicitly sum previously calculated exponentials with an adder tree
167
171
Op_add<typename CONFIG_T::exp_table_t > op_add;
168
172
hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t , CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t >>(exp_res, op_add);
169
173
170
174
// Multiply previously calculated exponentials with the reciprocal of the sum
171
- hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val <typename CONFIG_T::exp_table_t ,CONFIG_T>(exp_sum)];
175
+ hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_stable_idx_from_real_val <typename CONFIG_T::exp_table_t ,CONFIG_T>(exp_sum)];
172
176
#pragma unroll
173
177
for (unsigned i = 0 ; i < CONFIG_T::n_in; i++) {
174
178
res[i] = exp_res[i] * inv_exp_sum;
@@ -178,31 +182,22 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
178
182
// TODO - Improve accuracy
179
183
template <class data_T , class res_T , typename CONFIG_T>
180
184
void softmax_latency (data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
181
- /*
182
- * Note: The latency tables are equivalent to stable tables
183
- * However, the compiler cannot include the same table twice
184
- * Therefore, an out-of-scope exception is thrown in one of the functions
185
- * Temporary solution - Create the same table twice in quartus_writer.py
186
- * Long-term solution - Only create tables needed by the network;
187
- * Currently, quartus-writer.py generates LUTs for all activations,
188
- * Regardless if they are present in the network or not
189
- */
190
185
#include " activation_tables/exp_table_latency.tb"
191
186
#include " activation_tables/invert_table_latency.tb"
192
187
193
188
// Calculate all the e^x's
194
189
hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
195
190
#pragma unroll
196
191
for (unsigned i = 0 ; i < CONFIG_T::n_in; i++) {
197
- exp_res[i] = exp_table_latency[softmax_idx_from_real_val <data_T, CONFIG_T>(data[i])];
192
+ exp_res[i] = exp_table_latency[softmax_latency_idx_from_real_val <data_T, CONFIG_T>(data[i])];
198
193
}
199
194
200
195
// Explicitly sum the results with an adder tree.
201
196
Op_add<typename CONFIG_T::exp_table_t > op_add;
202
197
hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t , CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t >>(exp_res, op_add);
203
198
204
199
// Multiply previously calculated exponentials with the reciprocal of the sum
205
- hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_idx_from_real_val <typename CONFIG_T::exp_table_t ,CONFIG_T>(exp_sum)];
200
+ hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_latency_idx_from_real_val <typename CONFIG_T::exp_table_t ,CONFIG_T>(exp_sum)];
206
201
#pragma unroll
207
202
for (unsigned i = 0 ; i < CONFIG_T::n_in; i++){
208
203
res[i] = exp_res[i] * inv_exp_sum;
0 commit comments