Skip to content

Commit 161d2b6

Browse files
rjoursler authored and vpirogov committed
gpu: ocl: use input type for iteration in post_ops
1 parent a5ef078 commit 161d2b6

File tree

2 files changed

+36
-28
lines changed

2 files changed

+36
-28
lines changed

src/gpu/ocl/ocl_post_ops.h

Lines changed: 32 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -68,9 +68,11 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
6868
#define FWD_XNARY_GENERIC_DT(po_kind, algorithm, result, result_elem_dt, \
6969
arg0_ptr, arg0_len, arg1_ptr, arg1_len, alpha, beta, scale) \
7070
{ \
71-
const unsigned out_len = max((unsigned)arg0_len, (unsigned)arg1_len); \
71+
auto ty = arg0_len + arg1_len; \
72+
const typeof(ty) out_len \
73+
= max((typeof(ty))arg0_len, (typeof(ty))arg1_len); \
7274
result_elem_dt *res_ptr = (result_elem_dt *)(&result); \
73-
unroll_for(unsigned idx = 0; idx < out_len; ++idx) { \
75+
unroll_for(typeof(out_len + 0) idx = 0; idx < out_len; ++idx) { \
7476
if (arg0_len == 1 && arg1_len == 1) { \
7577
*res_ptr = fwd_Xnary(po_kind, algorithm, \
7678
convert_float(*arg0_ptr), convert_float(*arg1_ptr), \
@@ -104,7 +106,7 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
104106

105107
#define FMA_MIXED(acc_nof_elems, a, a_elem_dt, b, acc, acc_elem_dt) \
106108
{ \
107-
unsigned nof_elems = acc_nof_elems; \
109+
auto nof_elems = acc_nof_elems; \
108110
a_elem_dt *a_ptr = (a_elem_dt *)(&a); \
109111
acc_elem_dt *acc_ptr = (acc_elem_dt *)(&acc); \
110112
FMA_BLOCK(8, nof_elems, acc_ptr, acc_elem_dt, a_ptr, a_elem_dt, b); \
@@ -115,18 +117,20 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
115117

116118
#define FILL_BIN_ARG_SERIAL(idx, dest_ptr, x0, x0_s, x1, x1_s, x1_incr, x2, \
117119
x2_s, x3, x3_s, x4, x4_s, x5, x5_s) \
118-
unroll_for(unsigned x0_idx = x0, bin_arg_offset = 0; x0_idx < x0 + x0_s; \
119-
++x0_idx) { \
120-
unroll_for(unsigned x1_idx = x1; x1_idx < x1 + x1_s; \
120+
unroll_for(typeof(x0 + x0_s) x0_idx = x0, bin_arg_offset = 0; \
121+
x0_idx < x0 + x0_s; ++x0_idx) { \
122+
unroll_for(typeof(x1 + x1_s) x1_idx = x1; x1_idx < x1 + x1_s; \
121123
x1_idx += x1_incr) { \
122-
unroll_for(unsigned x2_idx = x2; x2_idx < x2 + x2_s; ++x2_idx) { \
123-
unroll_for(unsigned x3_idx = x3; x3_idx < x3 + x3_s; \
124+
unroll_for(typeof(x2 + x2_s) x2_idx = x2; x2_idx < x2 + x2_s; \
125+
++x2_idx) { \
126+
unroll_for(typeof(x3 + x3_s) x3_idx = x3; x3_idx < x3 + x3_s; \
124127
++x3_idx) { \
125-
unroll_for(unsigned x4_idx = x4; x4_idx < x4 + x4_s; \
126-
++x4_idx) { \
127-
unroll_for(unsigned x5_idx = x5; x5_idx < x5 + x5_s; \
128+
unroll_for(typeof(x4 + x4_s) x4_idx = x4; \
129+
x4_idx < x4 + x4_s; ++x4_idx) { \
130+
unroll_for(typeof(x5 + x5_s) x5_idx = x5; \
131+
x5_idx < x5 + x5_s; \
128132
++x5_idx, ++bin_arg_offset) { \
129-
const unsigned bin_arg_glob_off = OFF_MD( \
133+
const auto bin_arg_glob_off = OFF_MD( \
130134
CONCAT3(PO_, idx, _BIN_ARG), \
131135
x0_idx % CONCAT3(PO_, idx, _BIN_ARG_D0), \
132136
x1_idx % CONCAT3(PO_, idx, _BIN_ARG_D1), \
@@ -173,7 +177,8 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
173177
= CONCAT2(intel_sub_group_block_read, nelem)( \
174178
(__global uint *)(src_ptr)); \
175179
} \
176-
unroll_for(unsigned s_index = 0; s_index < nelem; ++s_index) { \
180+
unroll_for(typeof(nelem + 0) s_index = 0; s_index < nelem; \
181+
++s_index) { \
177182
dst_ptr[s_index] \
178183
= CONV_BIN_ARG_TO_FLOAT(idx, tmp_storage[s_index]); \
179184
} \
@@ -189,16 +194,15 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
189194
#define FILL_BIN_ARG_TRY_BLOCK(idx, dest_ptr, dest_size, x0, x0_s, x1, x1_s, \
190195
x1_incr, x2, x2_s, x3, x3_s, x4, x4_s, x5, x5_s) \
191196
{ \
192-
unroll_for(unsigned x0_idx = x0, arg_off = 0; x0_idx < x0 + x0_s; \
193-
++x0_idx, arg_off += X_NELEMS(x1_s)) { \
194-
const unsigned bin_arg_glob_off \
195-
= OFF_MD(CONCAT3(PO_, idx, _BIN_ARG), \
196-
x0_idx % CONCAT3(PO_, idx, _BIN_ARG_D0), \
197-
x1 % CONCAT3(PO_, idx, _BIN_ARG_D1), \
198-
x2 % CONCAT3(PO_, idx, _BIN_ARG_D2), \
199-
x3 % CONCAT3(PO_, idx, _BIN_ARG_D3), \
200-
x4 % CONCAT3(PO_, idx, _BIN_ARG_D4), \
201-
x5 % CONCAT3(PO_, idx, _BIN_ARG_D5)); \
197+
unroll_for(typeof(x0 + x0_s) x0_idx = x0, arg_off = 0; \
198+
x0_idx < x0 + x0_s; ++x0_idx, arg_off += X_NELEMS(x1_s)) { \
199+
const auto bin_arg_glob_off = OFF_MD(CONCAT3(PO_, idx, _BIN_ARG), \
200+
x0_idx % CONCAT3(PO_, idx, _BIN_ARG_D0), \
201+
x1 % CONCAT3(PO_, idx, _BIN_ARG_D1), \
202+
x2 % CONCAT3(PO_, idx, _BIN_ARG_D2), \
203+
x3 % CONCAT3(PO_, idx, _BIN_ARG_D3), \
204+
x4 % CONCAT3(PO_, idx, _BIN_ARG_D4), \
205+
x5 % CONCAT3(PO_, idx, _BIN_ARG_D5)); \
202206
\
203207
CONDITIONAL_FILL(idx, x1_s, 1, \
204208
(CONCAT3(po_, idx, _binary_arg) + bin_arg_glob_off), \
@@ -215,8 +219,9 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
215219
#define REPLICATE_DATA( \
216220
dest_ptr, dest_size, x0_s, x1_s, x2_s, x3_s, x4_s, x5_s) \
217221
{ \
218-
const unsigned copy_size = x0_s * x1_s * x2_s * x3_s * x4_s * x5_s; \
219-
unroll_for(unsigned fid = copy_size; fid < dest_size; ++fid) { \
222+
const auto copy_size = x0_s * x1_s * x2_s * x3_s * x4_s * x5_s; \
223+
unroll_for(typeof(dest_size + 0) fid = copy_size; fid < dest_size; \
224+
++fid) { \
220225
*(dest_ptr + fid) = *(dest_ptr + (fid % copy_size)); \
221226
} \
222227
}
@@ -265,8 +270,8 @@ float fwd_Xnary(unsigned kind, unsigned algorithm, float x, float y,
265270
REPLICATE_DATA(bin_arg_ptr, bin_arg_size, x0_s, X_NELEMS(x1_s), \
266271
x2_s, x3_s, x4_s, x5_s); \
267272
} else { \
268-
const unsigned x1_jump = is_burst ? get_sub_group_size() : 1; \
269-
const unsigned x1_size = x1_s / x1_jump; \
273+
const auto x1_jump = is_burst ? get_sub_group_size() : 1; \
274+
const auto x1_size = x1_s / x1_jump; \
270275
FILL_BIN_ARG_SERIAL(idx, bin_arg_ptr, x0, x0_s, (x1 + x1_incr), \
271276
x1_s, x1_jump, x2, x2_s, x3, x3_s, x4, x4_s, x5, x5_s); \
272277
REPLICATE_DATA(bin_arg_ptr, bin_arg_size, x0_s, x1_size, x2_s, \

src/gpu/ocl/ocl_types.h

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*******************************************************************************
2-
* Copyright 2019-2022 Intel Corporation
2+
* Copyright 2019-2023 Intel Corporation
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -19,6 +19,9 @@
1919

2020
#include "gpu/ocl/ocl_math_utils.h"
2121

22+
#define auto __auto_type
23+
#define typeof(x) __typeof__(x)
24+
2225
#define unroll_for __attribute__((opencl_unroll_hint)) for
2326

2427
#define for_ for

0 commit comments

Comments
 (0)