
Commit 3372892

hyunback authored and AsyaPronina committed
[GPU] Support fsv16 Shape agnostic convolution. (openvinotoolkit#25020)
### Details:
- Stable Diffusion in dpas has bad first-inference latency because all onednn convolutions are compiled at the first inference. We can resolve this bottleneck with a shape-agnostic kernel. The target kernel is convolution_fsv16_1x1.

### Tickets:
- *143317*

---------

Signed-off-by: hyunback <[email protected]>
1 parent 48e4837 · commit 3372892

File tree: 8 files changed, +417 −91 lines changed

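To make the trade-off from the commit description concrete, here is a small standalone sketch; the class and function names below are hypothetical illustrations, not the plugin's actual API. A static-kernel path must JIT-compile one kernel per concrete shape, which is what inflates the first inference for dynamic models like Stable Diffusion, while a shape-agnostic kernel is compiled once and reads the shapes at execution time.

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Hypothetical illustration only: these are not OpenVINO GPU plugin classes.
struct Shape { int b, f, y, x; };

// Static approach: one compiled kernel per concrete shape, compiled lazily.
struct StaticKernelCache {
    std::map<std::string, bool> compiled;
    void execute(const Shape& s) {
        std::string key = std::to_string(s.b) + "x" + std::to_string(s.f) + "x" +
                          std::to_string(s.y) + "x" + std::to_string(s.x);
        if (!compiled.count(key)) {
            std::printf("JIT-compiling static kernel for %s (adds first-inference latency)\n", key.c_str());
            compiled[key] = true;  // stand-in for an expensive OpenCL program build
        }
        std::printf("run static kernel %s\n", key.c_str());
    }
};

// Shape-agnostic approach: compiled once, shape passed as a runtime argument.
struct ShapeAgnosticKernel {
    ShapeAgnosticKernel() { std::printf("compile shape-agnostic kernel once\n"); }
    void execute(const Shape& s) {
        std::printf("run agnostic kernel with runtime shape %dx%dx%dx%d\n", s.b, s.f, s.y, s.x);
    }
};

int main() {
    std::vector<Shape> shapes = {{1, 320, 64, 64}, {1, 320, 32, 32}, {2, 640, 16, 16}};
    StaticKernelCache cache;
    for (auto& s : shapes) cache.execute(s);   // compiles three kernels at first use
    ShapeAgnosticKernel k;
    for (auto& s : shapes) k.execute(s);       // single compilation amortized over all shapes
    return 0;
}
```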

src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp
Lines changed: 11 additions & 2 deletions

@@ -45,7 +45,6 @@ void compile_graph::run(program& p) {
     GPU_DEBUG_IF(debug_config->disable_onednn_permute_fusion == 1)
         disable_permute_fuse_onednn_gemm = true;

-
     for (size_t idx = 0; idx < proc_order.size(); idx++) {
         auto& node = *(std::next(proc_order.begin(), idx));
         const bool use_shape_agnostic_impl = !p.get_config().get_property(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape);
@@ -70,6 +69,14 @@ void compile_graph::run(program& p) {
                     change_initial_impl = false;
                 }
             }
+            if (node->is_type<convolution>()) {
+                auto w_layout = node->as<convolution>().weights().get_output_layout();
+                // Convolution_fsv16_1x1 is only available shape agnostic kernel for onednn convolution which uses the block format.(fsv16)
+                // Onednn convolution doesn't support input padding but most of cldnn optimized convolution require input padding except fsv16_1x1.
+                if (w_layout.spatial(0) != 1 || w_layout.spatial(1) != 1) {
+                    change_initial_impl = false;
+                }
+            }
         }

         if (change_initial_impl)
@@ -104,8 +111,10 @@ void compile_graph::run(program& p) {

         bool is_planar = format::is_default_format(node->get_output_layout().format);

-        if (node->is_dynamic() && !is_planar)
+        if ((node->is_dynamic() && !is_planar &&
+            (!node->is_type<convolution>() || (node->is_type<convolution>() && node->get_output_layout().format != cldnn::format::b_fs_yx_fsv16)))) {
             can_select_impl = false;
+        }

         if (node->is_type<condition>() || node->is_type<loop>() || node->is_type<proposal>())
             can_select_impl = true;
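A minimal standalone sketch of the gating rule added above; the struct and helper name are hypothetical, not the plugin's types. The initial impl is switched to the shape-agnostic OCL kernel only when the convolution weights are 1x1, because convolution_fsv16_1x1 is the only blocked-format (fsv16) shape-agnostic kernel and, unlike the other optimized cldnn convolution kernels, it does not require input padding that an onednn-produced layout cannot provide.

```cpp
#include <cassert>

// Hypothetical stand-in for the weights layout queried in compile_graph.cpp.
struct WeightsLayout {
    int spatial_x;  // kernel width
    int spatial_y;  // kernel height
};

// Keep the shape-agnostic initial impl only for 1x1 convolutions; any other
// kernel size would need a cldnn kernel that expects padded input.
bool keep_shape_agnostic_initial_impl(const WeightsLayout& w) {
    return w.spatial_x == 1 && w.spatial_y == 1;
}

int main() {
    assert(keep_shape_agnostic_initial_impl({1, 1}));   // 1x1: fsv16_1x1 applies
    assert(!keep_shape_agnostic_initial_impl({3, 3}));  // 3x3: keep the onednn static impl
    return 0;
}
```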

src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp
Lines changed: 23 additions & 19 deletions

@@ -196,26 +196,30 @@ void prepare_primitive_fusing::fuse_bias(program &p) {


         if (node->get_output_layout().is_dynamic()) {
-            auto broadcast_type = eltw_node.get_primitive()->broadcast_spec.m_type;
-            if (!eltw_node.get_dependency(non_const_dep_idx).is_type<fully_connected>())
-                continue;
-            if (broadcast_type != ov::op::AutoBroadcastType::NUMPY && broadcast_type != ov::op::AutoBroadcastType::NONE)
-                continue;
-            // Numpy broadcast rule requires the dimension size which is not one to be same as the corresponding dimension of the other operand.
-            // So we can ensure that the feature size is same for this broadcasting rule, thereby being considered as bias.
-            auto const_shape = eltw_node.get_dependency(const_dep_idx).get_output_layout().get_shape();
-            int32_t count_elements_not_one = 0;
-            int32_t idx_element_not_one = -1;
-            for (size_t i = 0; i < const_shape.size(); ++i) {
-                if (const_shape[i] != 1) {
-                    count_elements_not_one++;
-                    idx_element_not_one = static_cast<int32_t>(i);
+            if (eltw_node.get_dependency(non_const_dep_idx).is_type<fully_connected>()) {
+                auto broadcast_type = eltw_node.get_primitive()->broadcast_spec.m_type;
+                if (broadcast_type != ov::op::AutoBroadcastType::NUMPY && broadcast_type != ov::op::AutoBroadcastType::NONE)
+                    continue;
+
+                // Numpy broadcast rule requires the dimension size which is not one to be same as the corresponding dimension of the other operand.
+                // So we can ensure that the feature size is same for this broadcasting rule, thereby being considered as bias.
+                auto const_shape = eltw_node.get_dependency(const_dep_idx).get_output_layout().get_shape();
+                int32_t count_elements_not_one = 0;
+                int32_t idx_element_not_one = -1;
+                for (size_t i = 0; i < const_shape.size(); ++i) {
+                    if (const_shape[i] != 1) {
+                        count_elements_not_one++;
+                        idx_element_not_one = static_cast<int32_t>(i);
+                    }
+                    if (count_elements_not_one > 1)
+                        break;
                 }
-                if (count_elements_not_one > 1)
-                    break;
-            }
-            if (count_elements_not_one != 1 ||
-                (idx_element_not_one != (static_cast<int32_t>(const_shape.size()) - 1))) {
+
+                if (count_elements_not_one != 1 ||
+                    (idx_element_not_one != (static_cast<int32_t>(const_shape.size()) - 1))) {
+                    continue;
+                }
+            } else if (!eltw_node.get_dependency(non_const_dep_idx).is_type<convolution>()) {
                 continue;
             }
         } else {
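The check that the refactored block keeps for fully connected nodes boils down to a small predicate: the constant eltwise operand is treated as a bias only if exactly one of its dimensions is larger than one and that dimension is the last (feature) one, so a NUMPY/NONE broadcast keeps the feature size aligned with the other operand. A standalone sketch of that rule follows, using a plain std::vector instead of ov::Shape and a hypothetical function name.

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Returns true when the constant operand looks like a bias: exactly one
// non-unit dimension, and it is the last (feature) dimension.
bool is_bias_like(const std::vector<int64_t>& const_shape) {
    int32_t count_elements_not_one = 0;
    int32_t idx_element_not_one = -1;
    for (size_t i = 0; i < const_shape.size(); ++i) {
        if (const_shape[i] != 1) {
            ++count_elements_not_one;
            idx_element_not_one = static_cast<int32_t>(i);
        }
        if (count_elements_not_one > 1)
            break;
    }
    return count_elements_not_one == 1 &&
           idx_element_not_one == static_cast<int32_t>(const_shape.size()) - 1;
}

int main() {
    assert(is_bias_like({1, 1, 320}));   // per-feature constant -> fusable as bias
    assert(!is_bias_like({1, 320, 1}));  // non-unit dim is not the last one
    assert(!is_bias_like({2, 1, 320}));  // more than one non-unit dim
    return 0;
}
```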

src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp
Lines changed: 2 additions & 1 deletion

@@ -356,7 +356,8 @@ attach_convolution_impl::attach_convolution_impl() {
     };
     auto dyn_formats = {
         format::bfyx,
-        format::bfzyx
+        format::bfzyx,
+        format::b_fs_yx_fsv16
     };

     implementation_map<convolution>::add(impl_types::ocl,

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl
Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@
 REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
 __attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
 KERNEL(convolution_b_fs_yx_fsv16_1x1)(
+    OPTIONAL_SHAPE_INFO_ARG
     __global INPUT0_TYPE* input,
     __global OUTPUT_TYPE* output,
     __global FILTER_TYPE* weights

src/plugins/intel_gpu/src/kernel_selector/jitter.cpp
Lines changed: 2 additions & 1 deletion

@@ -363,7 +363,8 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const {
     if (_tensor.is_dynamic()) {
         if (_tensor.GetLayout() == DataLayout::bf || _tensor.GetLayout() == DataLayout::bfyx ||
             _tensor.GetLayout() == DataLayout::bfzyx || _tensor.GetLayout() == DataLayout::bfwzyx ||
-            _tensor.GetLayout() == DataLayout::bfuwzyx || _tensor.GetLayout() == DataLayout::bfvuwzyx) {
+            _tensor.GetLayout() == DataLayout::bfuwzyx || _tensor.GetLayout() == DataLayout::bfvuwzyx ||
+            _tensor.GetLayout() == DataLayout::b_fs_yx_fsv16) {
             definitions.push_back({_name + "_X_PITCH", "1"});
             definitions.push_back({_name + "_Y_PITCH", dims_padded.x()});
             definitions.push_back({_name + "_Z_PITCH", toVectorMulString({dims_padded.x(), dims_padded.y()})});
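For context on the b_fs_yx_fsv16 pitch definitions added above: the layout stores features in blocks of 16, with the outer order batch, feature-block, y, x, and the 16 features of a block contiguous in memory. Below is a minimal sketch of how an element offset is typically computed under that blocking scheme, ignoring padding (which the real pitch definitions account for via dims_padded); the function name is a hypothetical illustration, not the kernel selector's API.

```cpp
#include <cassert>
#include <cstddef>

// Offset of element (b, f, y, x) in a b_fs_yx_fsv16 buffer with logical
// dimensions B x F x Y x X, ignoring padding. Features are grouped into
// blocks of 16, and the 16 features of a block are adjacent in memory.
size_t offset_b_fs_yx_fsv16(size_t b, size_t f, size_t y, size_t x,
                            size_t F, size_t Y, size_t X) {
    const size_t fsv = 16;
    const size_t f_blocks = (F + fsv - 1) / fsv;  // number of feature blocks
    const size_t fs = f / fsv;                    // feature block index
    const size_t fi = f % fsv;                    // index inside the block
    return (((b * f_blocks + fs) * Y + y) * X + x) * fsv + fi;
}

int main() {
    // 1x32x4x4 tensor: feature 17 lives in block 1, slot 1.
    assert(offset_b_fs_yx_fsv16(0, 17, 0, 0, 32, 4, 4) == 257);
    return 0;
}
```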
