@@ -68,9 +68,11 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
6868#define FWD_XNARY_GENERIC_DT (po_kind , algorithm , result , result_elem_dt , \
6969 arg0_ptr , arg0_len , arg1_ptr , arg1_len , alpha , beta , scale ) \
7070 { \
71- const unsigned out_len = max((unsigned)arg0_len, (unsigned)arg1_len); \
71+ auto ty = arg0_len + arg1_len; \
72+ const typeof(ty) out_len \
73+ = max((typeof(ty))arg0_len, (typeof(ty))arg1_len); \
7274 result_elem_dt *res_ptr = (result_elem_dt *)(&result); \
73- unroll_for(unsigned idx = 0; idx < out_len; ++idx) { \
75+ unroll_for(typeof(out_len + 0) idx = 0; idx < out_len; ++idx) { \
7476 if (arg0_len == 1 && arg1_len == 1) { \
7577 *res_ptr = fwd_Xnary(po_kind, algorithm, \
7678 convert_float(*arg0_ptr), convert_float(*arg1_ptr), \
@@ -104,7 +106,7 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
104106
105107#define FMA_MIXED (acc_nof_elems , a , a_elem_dt , b , acc , acc_elem_dt ) \
106108 { \
107- unsigned nof_elems = acc_nof_elems; \
109+ auto nof_elems = acc_nof_elems; \
108110 a_elem_dt *a_ptr = (a_elem_dt *)(&a); \
109111 acc_elem_dt *acc_ptr = (acc_elem_dt *)(&acc); \
110112 FMA_BLOCK(8, nof_elems, acc_ptr, acc_elem_dt, a_ptr, a_elem_dt, b); \
@@ -115,18 +117,20 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
115117
116118#define FILL_BIN_ARG_SERIAL (idx , dest_ptr , x0 , x0_s , x1 , x1_s , x1_incr , x2 , \
117119 x2_s , x3 , x3_s , x4 , x4_s , x5 , x5_s ) \
118- unroll_for(unsigned x0_idx = x0, bin_arg_offset = 0; x0_idx < x0 + x0_s ; \
119- ++x0_idx) { \
120- unroll_for(unsigned x1_idx = x1; x1_idx < x1 + x1_s; \
120+ unroll_for(typeof(x0 + x0_s) x0_idx = x0, bin_arg_offset = 0; \
121+ x0_idx < x0 + x0_s; ++x0_idx) { \
122+ unroll_for(typeof(x1 + x1_s) x1_idx = x1; x1_idx < x1 + x1_s; \
121123 x1_idx += x1_incr) { \
122- unroll_for(unsigned x2_idx = x2; x2_idx < x2 + x2_s; ++x2_idx) { \
123- unroll_for(unsigned x3_idx = x3; x3_idx < x3 + x3_s; \
124+ unroll_for(typeof(x2 + x2_s) x2_idx = x2; x2_idx < x2 + x2_s; \
125+ ++x2_idx) { \
126+ unroll_for(typeof(x3 + x3_s) x3_idx = x3; x3_idx < x3 + x3_s; \
124127 ++x3_idx) { \
125- unroll_for(unsigned x4_idx = x4; x4_idx < x4 + x4_s; \
126- ++x4_idx) { \
127- unroll_for(unsigned x5_idx = x5; x5_idx < x5 + x5_s; \
128+ unroll_for(typeof(x4 + x4_s) x4_idx = x4; \
129+ x4_idx < x4 + x4_s; ++x4_idx) { \
130+ unroll_for(typeof(x5 + x5_s) x5_idx = x5; \
131+ x5_idx < x5 + x5_s; \
128132 ++x5_idx, ++bin_arg_offset) { \
129- const unsigned bin_arg_glob_off = OFF_MD( \
133+ const auto bin_arg_glob_off = OFF_MD( \
130134 CONCAT3(PO_, idx, _BIN_ARG), \
131135 x0_idx % CONCAT3(PO_, idx, _BIN_ARG_D0), \
132136 x1_idx % CONCAT3(PO_, idx, _BIN_ARG_D1), \
@@ -173,7 +177,8 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
173177 = CONCAT2(intel_sub_group_block_read, nelem)( \
174178 (__global uint *)(src_ptr)); \
175179 } \
176- unroll_for(unsigned s_index = 0; s_index < nelem; ++s_index) { \
180+ unroll_for(typeof(nelem + 0) s_index = 0; s_index < nelem; \
181+ ++s_index) { \
177182 dst_ptr[s_index] \
178183 = CONV_BIN_ARG_TO_FLOAT(idx, tmp_storage[s_index]); \
179184 } \
@@ -189,16 +194,15 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
189194#define FILL_BIN_ARG_TRY_BLOCK (idx , dest_ptr , dest_size , x0 , x0_s , x1 , x1_s , \
190195 x1_incr , x2 , x2_s , x3 , x3_s , x4 , x4_s , x5 , x5_s ) \
191196 { \
192- unroll_for(unsigned x0_idx = x0, arg_off = 0; x0_idx < x0 + x0_s; \
193- ++x0_idx, arg_off += X_NELEMS(x1_s)) { \
194- const unsigned bin_arg_glob_off \
195- = OFF_MD(CONCAT3(PO_, idx, _BIN_ARG), \
196- x0_idx % CONCAT3(PO_, idx, _BIN_ARG_D0), \
197- x1 % CONCAT3(PO_, idx, _BIN_ARG_D1), \
198- x2 % CONCAT3(PO_, idx, _BIN_ARG_D2), \
199- x3 % CONCAT3(PO_, idx, _BIN_ARG_D3), \
200- x4 % CONCAT3(PO_, idx, _BIN_ARG_D4), \
201- x5 % CONCAT3(PO_, idx, _BIN_ARG_D5)); \
197+ unroll_for(typeof(x0 + x0_s) x0_idx = x0, arg_off = 0; \
198+ x0_idx < x0 + x0_s; ++x0_idx, arg_off += X_NELEMS(x1_s)) { \
199+ const auto bin_arg_glob_off = OFF_MD(CONCAT3(PO_, idx, _BIN_ARG), \
200+ x0_idx % CONCAT3(PO_, idx, _BIN_ARG_D0), \
201+ x1 % CONCAT3(PO_, idx, _BIN_ARG_D1), \
202+ x2 % CONCAT3(PO_, idx, _BIN_ARG_D2), \
203+ x3 % CONCAT3(PO_, idx, _BIN_ARG_D3), \
204+ x4 % CONCAT3(PO_, idx, _BIN_ARG_D4), \
205+ x5 % CONCAT3(PO_, idx, _BIN_ARG_D5)); \
202206\
203207 CONDITIONAL_FILL(idx, x1_s, 1, \
204208 (CONCAT3(po_, idx, _binary_arg) + bin_arg_glob_off), \
@@ -215,8 +219,9 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
215219#define REPLICATE_DATA ( \
216220 dest_ptr , dest_size , x0_s , x1_s , x2_s , x3_s , x4_s , x5_s ) \
217221 { \
218- const unsigned copy_size = x0_s * x1_s * x2_s * x3_s * x4_s * x5_s; \
219- unroll_for(unsigned fid = copy_size; fid < dest_size; ++fid) { \
222+ const auto copy_size = x0_s * x1_s * x2_s * x3_s * x4_s * x5_s; \
223+ unroll_for(typeof(dest_size + 0) fid = copy_size; fid < dest_size; \
224+ ++fid) { \
220225 *(dest_ptr + fid) = *(dest_ptr + (fid % copy_size)); \
221226 } \
222227 }
@@ -265,8 +270,8 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
265270 REPLICATE_DATA(bin_arg_ptr, bin_arg_size, x0_s, X_NELEMS(x1_s), \
266271 x2_s, x3_s, x4_s, x5_s); \
267272 } else { \
268- const unsigned x1_jump = is_burst ? get_sub_group_size() : 1; \
269- const unsigned x1_size = x1_s / x1_jump; \
273+ const auto x1_jump = is_burst ? get_sub_group_size() : 1; \
274+ const auto x1_size = x1_s / x1_jump; \
270275 FILL_BIN_ARG_SERIAL(idx, bin_arg_ptr, x0, x0_s, (x1 + x1_incr), \
271276 x1_s, x1_jump, x2, x2_s, x3, x3_s, x4, x4_s, x5, x5_s); \
272277 REPLICATE_DATA(bin_arg_ptr, bin_arg_size, x0_s, x1_size, x2_s, \
0 commit comments