From 9bcc99ab5d0fe1c90ccae14bd760dac37a57500c Mon Sep 17 00:00:00 2001 From: hyunback Date: Fri, 14 Jun 2024 14:37:12 +0900 Subject: [PATCH 01/12] [GPU] Support fsv16 Shape agnostic convolution. Signed-off-by: hyunback --- .../graph/graph_optimizer/compile_graph.cpp | 23 ++++++++++++++++++- .../src/graph/impls/ocl/convolution.cpp | 3 ++- .../cl_kernels/convolution_gpu_bfyx_f16.cl | 1 + .../convolution_gpu_bfyx_f16_1x1.cl | 1 + .../convolution_gpu_bfyx_f16_depthwise.cl | 1 + .../intel_gpu/src/kernel_selector/jitter.cpp | 3 ++- .../convolution_kernel_b_fs_yx_fsv16.cpp | 1 + .../convolution_kernel_b_fs_yx_fsv16_1x1.cpp | 1 + ...olution_kernel_b_fs_yx_fsv16_depthwise.cpp | 1 + 9 files changed, 32 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp index 0e369974d95e42..99d42cad6eeb62 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp @@ -70,6 +70,27 @@ void compile_graph::run(program& p) { change_initial_impl = false; } } + // if (node->is_type()) { + // std::vector conv_list = { + // "__module.conv_in/aten::_convolution/Convolution", + // "__module.down_blocks.0.resnets.0.conv1/aten::_convolution/Convolution", + // "__module.down_blocks.0.resnets.0.conv2/aten::_convolution/Convolution", + // "__module.down_blocks.0.attentions.0.proj_in/aten::_convolution/Convolution", + // "__module.down_blocks.0.attentions.0.proj_out/aten::_convolution/Convolution", + // "__module.down_blocks.0.resnets.1.conv1/aten::_convolution/Convolution", + // "__module.down_blocks.0.resnets.1.conv2/aten::_convolution/Convolution", + // }; + // bool is_hit = false; + // for (auto conv_id : conv_list) { + // if (node->id().find(conv_id) != std::string::npos) { + // is_hit = true; + // break; + // } + // } + // if (!is_hit) { + // change_initial_impl = false; + // } + // } } if 
(change_initial_impl) @@ -100,7 +121,7 @@ void compile_graph::run(program& p) { bool is_planar = format::is_default_format(node->get_output_layout().format); - if (node->is_dynamic() && !is_planar) + if (!node->is_type() && node->is_dynamic() && !is_planar) can_select_impl = false; if (node->is_type() || node->is_type() || node->is_type()) diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp index 27b1a40fcf370e..2c05c3c8ae26d5 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp @@ -356,7 +356,8 @@ attach_convolution_impl::attach_convolution_impl() { }; auto dyn_formats = { format::bfyx, - format::bfzyx + format::bfzyx, + format::b_fs_yx_fsv16 }; implementation_map::add(impl_types::ocl, diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16.cl index 4f41685b936d0c..e8a3f713effc44 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16.cl @@ -44,6 +44,7 @@ REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE) __attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1))) KERNEL(convolution_bfyx_f16)( + OPTIONAL_SHAPE_INFO_ARG __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, __global FILTER_TYPE* weights diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl index ae3731a962244b..542fa69ebc241b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl @@ -26,6 +26,7 @@ REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE) 
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1))) KERNEL(convolution_b_fs_yx_fsv16_1x1)( + OPTIONAL_SHAPE_INFO_ARG __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, __global FILTER_TYPE* weights diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_depthwise.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_depthwise.cl index c057e319f2f8fe..a1dc29498dc678 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_depthwise.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_depthwise.cl @@ -25,6 +25,7 @@ REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE) __attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1))) KERNEL(convolution_gpu_bfyx_f16_depthwise)( + OPTIONAL_SHAPE_INFO_ARG __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, __global FILTER_TYPE* weights diff --git a/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp b/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp index 18e9485a183fc9..df83c1778e5e24 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/jitter.cpp @@ -363,7 +363,8 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const { if (_tensor.is_dynamic()) { if (_tensor.GetLayout() == DataLayout::bf || _tensor.GetLayout() == DataLayout::bfyx || _tensor.GetLayout() == DataLayout::bfzyx || _tensor.GetLayout() == DataLayout::bfwzyx || - _tensor.GetLayout() == DataLayout::bfuwzyx || _tensor.GetLayout() == DataLayout::bfvuwzyx) { + _tensor.GetLayout() == DataLayout::bfuwzyx || _tensor.GetLayout() == DataLayout::bfvuwzyx || + _tensor.GetLayout() == DataLayout::b_fs_yx_fsv16) { definitions.push_back({_name + "_X_PITCH", "1"}); definitions.push_back({_name + "_Y_PITCH", dims_padded.x()}); definitions.push_back({_name + "_Z_PITCH", toVectorMulString({dims_padded.x(), dims_padded.y()})}); diff --git 
a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp index 5f4b6128fcb692..9300d83d219448 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp @@ -120,6 +120,7 @@ ParamsKey ConvolutionKernel_b_fs_yx_fsv16::GetSupportedKey() const { k.EnableNonBiasTerm(); k.EnableBatching(); k.EnableGroupedConvolution(); + // k.EnableDynamicShapesSupport(); return k; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index a332090dbd0906..2a9420f0ec73a6 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -92,6 +92,7 @@ ParamsKey ConvolutionKernel_b_fs_yx_fsv16_1x1::GetSupportedKey() const { k.EnableBiasPerFeature(); k.EnableNonBiasTerm(); k.EnableBatching(); + // k.EnableDynamicShapesSupport(); return k; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_depthwise.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_depthwise.cpp index efeebb300bb9f9..50ae880d5f1e62 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_depthwise.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_depthwise.cpp @@ -35,6 +35,7 @@ ParamsKey ConvolutionKernel_b_fs_yx_fsv16_depthwise::GetSupportedKey() const { k.EnableGroupedConvolution(); 
k.EnableDilation(); k.EnableDifferentTypes(); + // k.EnableDynamicShapesSupport(); return k; } From 1a7c7b6236365580bbcbe04910ee3e7e34b62a3c Mon Sep 17 00:00:00 2001 From: hyunback Date: Tue, 18 Jun 2024 20:08:16 +0900 Subject: [PATCH 02/12] Fixing accuracy issue Find the root cause and fixing.. Signed-off-by: hyunback --- .../graph/graph_optimizer/compile_graph.cpp | 55 ++++++++++------ .../prepare_primitive_fusing.cpp | 2 + .../intel_gpu/src/graph/layout_optimizer.cpp | 4 ++ .../convolution_kernel_b_fs_yx_fsv16_1x1.cpp | 66 +++++++++++++++---- .../convolution/convolution_kernel_base.cpp | 1 + 5 files changed, 95 insertions(+), 33 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp index 99d42cad6eeb62..abc3877de16b10 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp @@ -70,27 +70,40 @@ void compile_graph::run(program& p) { change_initial_impl = false; } } - // if (node->is_type()) { - // std::vector conv_list = { - // "__module.conv_in/aten::_convolution/Convolution", - // "__module.down_blocks.0.resnets.0.conv1/aten::_convolution/Convolution", - // "__module.down_blocks.0.resnets.0.conv2/aten::_convolution/Convolution", - // "__module.down_blocks.0.attentions.0.proj_in/aten::_convolution/Convolution", - // "__module.down_blocks.0.attentions.0.proj_out/aten::_convolution/Convolution", - // "__module.down_blocks.0.resnets.1.conv1/aten::_convolution/Convolution", - // "__module.down_blocks.0.resnets.1.conv2/aten::_convolution/Convolution", - // }; - // bool is_hit = false; - // for (auto conv_id : conv_list) { - // if (node->id().find(conv_id) != std::string::npos) { - // is_hit = true; - // break; - // } - // } - // if (!is_hit) { - // change_initial_impl = false; - // } - // } + if (node->is_type()) { +#if 0 + std::vector conv_list = { + // 
"__module.conv_in/aten::_convolution/Convolution", + // "__module.down_blocks.0.resnets.0.conv1/aten::_convolution/Convolution", + // "__module.down_blocks.0.resnets.0.conv2/aten::_convolution/Convolution", + // "__module.down_blocks.0.attentions.0.proj_in/aten::_convolution/Convolution", + // "__module.down_blocks.0.attentions.0.proj_out/aten::_convolution/Convolution", + // "__module.down_blocks.0.resnets.1.conv1/aten::_convolution/Convolution", + // "__module.down_blocks.0.resnets.1.conv2/aten::_convolution/Convolution", + "__module.down_blocks.1.resnets.0.conv_shortcut/aten::_convolution/Convolution", + }; + bool is_hit = false; + for (auto conv_id : conv_list) { + if (node->id().find(conv_id) != std::string::npos) { + is_hit = true; + break; + } + } + if (!is_hit) { + change_initial_impl = false; + } +#else + bool is_hit = false; + auto w_layout = node->as().weights().get_output_layout(); + if (w_layout.spatial(0) == 1 && w_layout.spatial(1) == 1) { + is_hit = true; + GPU_DEBUG_COUT << node->id() << ": " << w_layout.to_short_string() << std::endl; + } + if (!is_hit) { + change_initial_impl = false; + } +#endif + } } if (change_initial_impl) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index c4f8d1acdd440c..1b3debad2e8a4b 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -53,6 +53,8 @@ using namespace cldnn; void prepare_primitive_fusing::run(program& p) { + // temporarily disable fusion because of conv_fsv16_1x1 has an issue with block_size > 1 + return; fuse_reorders(p); remove_redundant_reshape(p); fuse_bias(p); diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index b7d4b7230247cd..16630e7f753f5e 100644 --- 
a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -1742,6 +1742,10 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format preferred_impl = impl_types::ocl; } + // if (node.is_type()) { + // preferred_impl = impl_types::ocl; + // } + return preferred_impl; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index 2a9420f0ec73a6..e52708ece8b9d4 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -60,19 +60,31 @@ float ConvolutionKernel_b_fs_yx_fsv16_1x1::EstimateOccupancy(const convolution_p ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionTuningData ConvolutionKernel_b_fs_yx_fsv16_1x1::GetTuningParams(const convolution_params& params) const { ConvolutionTuningData tuning_data; - const auto& input = params.inputs[0]; + // GPU_DEBUG_INFO << params.has_dynamic_inputs() << ", " << params.has_dynamic_outputs() << ", " << params.has_dynamic_tensors() << std::endl; + // if (!params.has_dynamic_tensors()) { + const auto& input = params.inputs[0]; + + size_t ic_blocks = CeilDiv(input.Feature().v, tuning_data.feature_block_size); - size_t ic_blocks = CeilDiv(input.Feature().v, tuning_data.feature_block_size); + size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size; + bool block_size_one_is_better = params.outputs[0].X().v == 1 && params.outputs[0].Y().v == 1 && input.Feature().v >= 2048; - size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size; - bool block_size_one_is_better = params.outputs[0].X().v == 1 && params.outputs[0].Y().v == 1 && input.Feature().v >= 2048; - 
if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.supports_imad && !block_size_one_is_better) - while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) && - EstimateOccupancy(params, tuning_data) < 4.0) - tuning_data.slm_div_factor *= 2; + // if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.supports_imad && !block_size_one_is_better) + if (params.engineInfo.supports_imad && !block_size_one_is_better) + while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) && + EstimateOccupancy(params, tuning_data) < 4.0) + tuning_data.slm_div_factor *= 2; - tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size; + tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size; + + GPU_DEBUG_INFO << params.layerID << " : " << static_cast(params.engineInfo.deviceType) << ", " + << params.engineInfo.supports_imad << ", " << block_size_one_is_better << " : " + << tuning_data.work_group_size << " = " << tuning_data.slm_div_factor << " * " << tuning_data.sub_group_size << " : " + << params.outputs[0].X().v << " , " << params.outputs[0].Y().v << ", " << input.Feature().v << " : " + << max_slm_div_factor << " = " << params.engineInfo.maxWorkGroupSize << " / " << tuning_data.sub_group_size << " : " + << ic_blocks << ", " << input.Feature().v << ", " << tuning_data.feature_block_size << std::endl; + // } return tuning_data; } @@ -92,7 +104,7 @@ ParamsKey ConvolutionKernel_b_fs_yx_fsv16_1x1::GetSupportedKey() const { k.EnableBiasPerFeature(); k.EnableNonBiasTerm(); k.EnableBatching(); - // k.EnableDynamicShapesSupport(); + k.EnableDynamicShapesSupport(); return k; } @@ -127,6 +139,9 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa dispatchData.lws[1] = tuning_data.work_group_size; dispatchData.lws[2] = 1; + GPU_DEBUG_INFO 
<< "gws: " << dispatchData.gws[0] << ", " << dispatchData.gws[1] << ", " << dispatchData.gws[2] << std::endl; + GPU_DEBUG_INFO << "lws: " << dispatchData.lws[0] << ", " << dispatchData.lws[1] << ", " << dispatchData.lws[2] << std::endl; + return dispatchData; } @@ -216,6 +231,8 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut jit.Merge(MakeFusedOpsJitConstants(params, { conf_vec, conf_scalar1, conf_scalar2 })); } + GPU_DEBUG_INFO << params.layerID << " : params.fused_ops.empty(): " << params.fused_ops.empty() << std::endl; + jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size)); jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0)); @@ -239,9 +256,34 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut jit.AddConstant(MakeJitConstant("PADDED_OUTPUT", padded_output)); jit.AddConstant(MakeJitConstant("NON_UNIT_FUSED_OP_SPATIAL", non_unit_fused_op_spatial)); + if (params.has_dynamic_tensors()) { + // const convolution_params& cp = static_cast(params); + // DimensionAccessHelperJit dims0(cp.outputs[0]); + // auto x = dims0.x(); + // auto y = dims0.y(); + // auto f = dims0.f(); + + // auto blockWidth_str = "(" + x + + // if (x == 1 && y == 1) { + // return { 1, EXE_MODE_DEFAULT }; + // } else if (x * f <= 256) { + // if (x < 8 || x * f <= 128) + // return { 2, EXE_MODE_DEFAULT }; + // else + // return { 4, EXE_MODE_DEFAULT }; + // } else if (x * f <= 1536) { + // return { 4, EXE_MODE_DEFAULT }; + // } else { + // return { 8, EXE_MODE_DEFAULT }; + // } + jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", 8)); + jit.AddConstant(MakeJitConstant("X_BLOCKS", 8)); + } else { + jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth)); + jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(params.outputs[0].X().v, blockWidth))); + } - jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth)); - jit.AddConstant(MakeJitConstant("X_BLOCKS", 
CeilDiv(params.outputs[0].X().v, blockWidth))); jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor)); jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size)); jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, tuning_data.feature_block_size))); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp index adba7b625a4455..a35873d123c6f8 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp @@ -163,6 +163,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernelBase::SetDefault(const conv void ConvolutionKernelBase::GetUpdateDispatchDataFunc(KernelData& kd) const { kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + GPU_DEBUG_INFO << "kd.update_dispatch_data_func !!!" << std::endl; const auto& prim_params = static_cast(params); auto dispatchData = SetDefault(prim_params); OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); From 7a2895f4a3fe25b48c06c0118fc5aa523bdacb78 Mon Sep 17 00:00:00 2001 From: hyunback Date: Wed, 19 Jun 2024 09:07:04 +0900 Subject: [PATCH 03/12] Temporarily enable fusions, SA Conv fsv16_1x1 and add debugging env.
Signed-off-by: hyunback --- .../graph/graph_optimizer/compile_graph.cpp | 69 +++++++++++-------- .../prepare_primitive_fusing.cpp | 8 ++- .../convolution_kernel_b_fs_yx_fsv16_1x1.cpp | 14 ++-- 3 files changed, 53 insertions(+), 38 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp index abc3877de16b10..ef5ed2f3c7a16e 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp @@ -27,6 +27,15 @@ using namespace cldnn; +int get_env(std::string key, int &val); +int get_env(std::string key, int &val) { + if (const auto env_var = std::getenv(key.c_str())) { + val = std::atoi(env_var); + return true; + } + return false; +} + void compile_graph::run(program& p) { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "pass::CompileGraph"); for (auto& node : p.get_processing_order()) { @@ -45,6 +54,8 @@ void compile_graph::run(program& p) { GPU_DEBUG_IF(debug_config->disable_onednn_permute_fusion == 1) disable_permute_fuse_onednn_gemm = true; + int conv_sa = 0; + get_env("CONV_SA", conv_sa); for (size_t idx = 0; idx < proc_order.size(); idx++) { auto& node = *(std::next(proc_order.begin(), idx)); @@ -71,38 +82,38 @@ void compile_graph::run(program& p) { } } if (node->is_type()) { -#if 0 - std::vector conv_list = { - // "__module.conv_in/aten::_convolution/Convolution", - // "__module.down_blocks.0.resnets.0.conv1/aten::_convolution/Convolution", - // "__module.down_blocks.0.resnets.0.conv2/aten::_convolution/Convolution", - // "__module.down_blocks.0.attentions.0.proj_in/aten::_convolution/Convolution", - // "__module.down_blocks.0.attentions.0.proj_out/aten::_convolution/Convolution", - // "__module.down_blocks.0.resnets.1.conv1/aten::_convolution/Convolution", - // "__module.down_blocks.0.resnets.1.conv2/aten::_convolution/Convolution", - 
"__module.down_blocks.1.resnets.0.conv_shortcut/aten::_convolution/Convolution", - }; - bool is_hit = false; - for (auto conv_id : conv_list) { - if (node->id().find(conv_id) != std::string::npos) { + if (!conv_sa) { + std::vector conv_list = { + // "__module.conv_in/aten::_convolution/Convolution", + // "__module.down_blocks.0.resnets.0.conv1/aten::_convolution/Convolution", + // "__module.down_blocks.0.resnets.0.conv2/aten::_convolution/Convolution", + // "__module.down_blocks.0.attentions.0.proj_in/aten::_convolution/Convolution", + // "__module.down_blocks.0.attentions.0.proj_out/aten::_convolution/Convolution", + // "__module.down_blocks.0.resnets.1.conv1/aten::_convolution/Convolution", + // "__module.down_blocks.0.resnets.1.conv2/aten::_convolution/Convolution", + // "__module.down_blocks.1.resnets.0.conv_shortcut/aten::_convolution/Convolution", + }; + bool is_hit = false; + for (auto conv_id : conv_list) { + if (node->id().find(conv_id) != std::string::npos) { + is_hit = true; + break; + } + } + if (!is_hit) { + change_initial_impl = false; + } + } else { + bool is_hit = false; + auto w_layout = node->as().weights().get_output_layout(); + if (w_layout.spatial(0) == 1 && w_layout.spatial(1) == 1) { is_hit = true; - break; + GPU_DEBUG_INFO << node->id() << ": " << w_layout.to_short_string() << std::endl; + } + if (!is_hit) { + change_initial_impl = false; } } - if (!is_hit) { - change_initial_impl = false; - } -#else - bool is_hit = false; - auto w_layout = node->as().weights().get_output_layout(); - if (w_layout.spatial(0) == 1 && w_layout.spatial(1) == 1) { - is_hit = true; - GPU_DEBUG_COUT << node->id() << ": " << w_layout.to_short_string() << std::endl; - } - if (!is_hit) { - change_initial_impl = false; - } -#endif } } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 1b3debad2e8a4b..61d797f043594f 100644 --- 
a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -54,7 +54,7 @@ using namespace cldnn; void prepare_primitive_fusing::run(program& p) { // temporarily disable fusion because of conv_fsv16_1x1 has an issue with block_size > 1 - return; + // return; fuse_reorders(p); remove_redundant_reshape(p); fuse_bias(p); @@ -198,9 +198,10 @@ void prepare_primitive_fusing::fuse_bias(program &p) { if (node->get_output_layout().is_dynamic()) { + #if 0 auto broadcast_type = eltw_node.get_primitive()->broadcast_spec.m_type; - if (!eltw_node.get_dependency(non_const_dep_idx).is_type()) - continue; + // if (!eltw_node.get_dependency(non_const_dep_idx).is_type()) + // continue; if (broadcast_type != ov::op::AutoBroadcastType::NUMPY && broadcast_type != ov::op::AutoBroadcastType::NONE) continue; // Numpy broadcast rule requires the dimension size which is not one to be same as the corresponding dimension of the other operand. 
@@ -220,6 +221,7 @@ void prepare_primitive_fusing::fuse_bias(program &p) { (idx_element_not_one != (static_cast(const_shape.size()) - 1))) { continue; } + #endif } else { cldnn::tensor::value_type out_features = node->get_output_layout().feature(); bool is_3d_fc = false; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index e52708ece8b9d4..30adf74d291df6 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -69,12 +69,14 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionTuningData ConvolutionKernel_b_f size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size; bool block_size_one_is_better = params.outputs[0].X().v == 1 && params.outputs[0].Y().v == 1 && input.Feature().v >= 2048; - - // if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.supports_imad && !block_size_one_is_better) - if (params.engineInfo.supports_imad && !block_size_one_is_better) - while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) && - EstimateOccupancy(params, tuning_data) < 4.0) - tuning_data.slm_div_factor *= 2; + // clEnqueueNDRangeKernel, error code: -54 + // because of invalid SLM_DIV_FACTOR in __attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1))) + // Need to update proper SLM_DIV_FACTOR after shape updated. 
+ // // if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.supports_imad && !block_size_one_is_better) + // if (params.engineInfo.supports_imad && !block_size_one_is_better) + // while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) && + // EstimateOccupancy(params, tuning_data) < 4.0) + // tuning_data.slm_div_factor *= 2; tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size; From 01e85b5a8bea74aea58589c7f81c23bb84d2f4f2 Mon Sep 17 00:00:00 2001 From: hyunback Date: Thu, 20 Jun 2024 09:14:49 +0900 Subject: [PATCH 04/12] Update to parameterize for hardcoding. Signed-off-by: hyunback --- .../prepare_primitive_fusing.cpp | 46 ++-- .../intel_gpu/src/graph/layout_optimizer.cpp | 4 - .../cl_kernels/convolution_gpu_bfyx_f16.cl | 1 - .../convolution_gpu_bfyx_f16_depthwise.cl | 1 - .../convolution_kernel_b_fs_yx_fsv16.cpp | 1 - .../convolution_kernel_b_fs_yx_fsv16_1x1.cpp | 222 +++++++++--------- ...olution_kernel_b_fs_yx_fsv16_depthwise.cpp | 1 - .../convolution/convolution_kernel_base.cpp | 1 - 8 files changed, 140 insertions(+), 137 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp index 61d797f043594f..eb04e1972c9cdd 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_primitive_fusing.cpp @@ -53,8 +53,6 @@ using namespace cldnn; void prepare_primitive_fusing::run(program& p) { - // temporarily disable fusion because of conv_fsv16_1x1 has an issue with block_size > 1 - // return; fuse_reorders(p); remove_redundant_reshape(p); fuse_bias(p); @@ -198,30 +196,32 @@ void prepare_primitive_fusing::fuse_bias(program &p) { if (node->get_output_layout().is_dynamic()) { - #if 0 - auto broadcast_type = 
eltw_node.get_primitive()->broadcast_spec.m_type; - // if (!eltw_node.get_dependency(non_const_dep_idx).is_type()) - // continue; - if (broadcast_type != ov::op::AutoBroadcastType::NUMPY && broadcast_type != ov::op::AutoBroadcastType::NONE) - continue; - // Numpy broadcast rule requires the dimension size which is not one to be same as the corresponding dimension of the other operand. - // So we can ensure that the feature size is same for this broadcasting rule, thereby being considered as bias. - auto const_shape = eltw_node.get_dependency(const_dep_idx).get_output_layout().get_shape(); - int32_t count_elements_not_one = 0; - int32_t idx_element_not_one = -1; - for (size_t i = 0; i < const_shape.size(); ++i) { - if (const_shape[i] != 1) { - count_elements_not_one++; - idx_element_not_one = static_cast(i); + if (eltw_node.get_dependency(non_const_dep_idx).is_type()) { + auto broadcast_type = eltw_node.get_primitive()->broadcast_spec.m_type; + if (broadcast_type != ov::op::AutoBroadcastType::NUMPY && broadcast_type != ov::op::AutoBroadcastType::NONE) + continue; + + // Numpy broadcast rule requires the dimension size which is not one to be same as the corresponding dimension of the other operand. + // So we can ensure that the feature size is same for this broadcasting rule, thereby being considered as bias. 
+ auto const_shape = eltw_node.get_dependency(const_dep_idx).get_output_layout().get_shape(); + int32_t count_elements_not_one = 0; + int32_t idx_element_not_one = -1; + for (size_t i = 0; i < const_shape.size(); ++i) { + if (const_shape[i] != 1) { + count_elements_not_one++; + idx_element_not_one = static_cast(i); + } + if (count_elements_not_one > 1) + break; } - if (count_elements_not_one > 1) - break; - } - if (count_elements_not_one != 1 || - (idx_element_not_one != (static_cast(const_shape.size()) - 1))) { + + if (count_elements_not_one != 1 || + (idx_element_not_one != (static_cast(const_shape.size()) - 1))) { + continue; + } + } else if (!eltw_node.get_dependency(non_const_dep_idx).is_type()) { continue; } - #endif } else { cldnn::tensor::value_type out_features = node->get_output_layout().feature(); bool is_3d_fc = false; diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index 16630e7f753f5e..b7d4b7230247cd 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -1742,10 +1742,6 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format preferred_impl = impl_types::ocl; } - // if (node.is_type()) { - // preferred_impl = impl_types::ocl; - // } - return preferred_impl; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16.cl index e8a3f713effc44..4f41685b936d0c 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16.cl @@ -44,7 +44,6 @@ REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE) __attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1))) KERNEL(convolution_bfyx_f16)( - OPTIONAL_SHAPE_INFO_ARG __global INPUT0_TYPE* input, __global OUTPUT_TYPE* 
output, __global FILTER_TYPE* weights diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_depthwise.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_depthwise.cl index a1dc29498dc678..c057e319f2f8fe 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_depthwise.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/convolution_gpu_bfyx_f16_depthwise.cl @@ -25,7 +25,6 @@ REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE) __attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE, 1))) KERNEL(convolution_gpu_bfyx_f16_depthwise)( - OPTIONAL_SHAPE_INFO_ARG __global INPUT0_TYPE* input, __global OUTPUT_TYPE* output, __global FILTER_TYPE* weights diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp index 9300d83d219448..5f4b6128fcb692 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16.cpp @@ -120,7 +120,6 @@ ParamsKey ConvolutionKernel_b_fs_yx_fsv16::GetSupportedKey() const { k.EnableNonBiasTerm(); k.EnableBatching(); k.EnableGroupedConvolution(); - // k.EnableDynamicShapesSupport(); return k; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index 30adf74d291df6..a102e09001c7ee 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -22,22 +22,28 @@ 
ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionKernel_b_fs_yx_fsv16_1x1() : Con ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16_1x1::GetAutoTuneOptions(const Params& params, int /*autoTuneIndex*/) const { - const convolution_params& cp = static_cast(params); - - auto x = cp.outputs[0].X().v; - auto y = cp.outputs[0].Y().v; - auto f = cp.outputs[0].Feature().v; - - if (x == 1 && y == 1) { - return { 1, EXE_MODE_DEFAULT }; - } else if (x * f <= 256) { - if (x < 8 || x * f <= 128) - return { 2, EXE_MODE_DEFAULT }; - else + if (!params.is_shape_agnostic) { + const convolution_params& cp = static_cast(params); + + auto x = cp.outputs[0].X().v; + auto y = cp.outputs[0].Y().v; + auto f = cp.outputs[0].Feature().v; + + if (x == 1 && y == 1) { + return { 1, EXE_MODE_DEFAULT }; + } else if (x * f <= 256) { + if (x < 8 || x * f <= 128) + return { 2, EXE_MODE_DEFAULT }; + else + return { 4, EXE_MODE_DEFAULT }; + } else if (x * f <= 1536) { return { 4, EXE_MODE_DEFAULT }; - } else if (x * f <= 1536) { - return { 4, EXE_MODE_DEFAULT }; + } else { + return { 8, EXE_MODE_DEFAULT }; + } } else { + // In shape agnostic kernel, the output shape cannot be specified at build time, + // So we set blockWidth to 8, which is the most commonly used. 
return { 8, EXE_MODE_DEFAULT }; } } @@ -60,33 +66,21 @@ float ConvolutionKernel_b_fs_yx_fsv16_1x1::EstimateOccupancy(const convolution_p ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionTuningData ConvolutionKernel_b_fs_yx_fsv16_1x1::GetTuningParams(const convolution_params& params) const { ConvolutionTuningData tuning_data; - // GPU_DEBUG_INFO << params.has_dynamic_inputs() << ", " << params.has_dynamic_outputs() << ", " << params.has_dynamic_tensors() << std::endl; - // if (!params.has_dynamic_tensors()) { + if (!params.is_shape_agnostic) { const auto& input = params.inputs[0]; + bool block_size_one_is_better = params.outputs[0].X().v == 1 && params.outputs[0].Y().v == 1 && input.Feature().v >= 2048; - size_t ic_blocks = CeilDiv(input.Feature().v, tuning_data.feature_block_size); + if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.supports_imad && !block_size_one_is_better) { + size_t ic_blocks = CeilDiv(input.Feature().v, tuning_data.feature_block_size); + size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size; - size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size; - bool block_size_one_is_better = params.outputs[0].X().v == 1 && params.outputs[0].Y().v == 1 && input.Feature().v >= 2048; + while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) && + EstimateOccupancy(params, tuning_data) < 4.0) + tuning_data.slm_div_factor *= 2; + } + } - // clEnqueueNDRangeKernel, error code: -54 - // because of invalid SLM_DIV_FACTOR in __attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1))) - // Need to update proper SLM_DIV_FACTOR after shape updated. 
- // // if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.supports_imad && !block_size_one_is_better) - // if (params.engineInfo.supports_imad && !block_size_one_is_better) - // while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) && - // EstimateOccupancy(params, tuning_data) < 4.0) - // tuning_data.slm_div_factor *= 2; - - tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size; - - GPU_DEBUG_INFO << params.layerID << " : " << static_cast(params.engineInfo.deviceType) << ", " - << params.engineInfo.supports_imad << ", " << block_size_one_is_better << " : " - << tuning_data.work_group_size << " = " << tuning_data.slm_div_factor << " * " << tuning_data.sub_group_size << " : " - << params.outputs[0].X().v << " , " << params.outputs[0].Y().v << ", " << input.Feature().v << " : " - << max_slm_div_factor << " = " << params.engineInfo.maxWorkGroupSize << " / " << tuning_data.sub_group_size << " : " - << ic_blocks << ", " << input.Feature().v << ", " << tuning_data.feature_block_size << std::endl; - // } + tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size; return tuning_data; } @@ -148,24 +142,28 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa } KernelsPriority ConvolutionKernel_b_fs_yx_fsv16_1x1::GetKernelsPriority(const Params& params) const { - const auto& p = static_cast(params); - auto autoTune = GetAutoTuneOptions(params, -1); - - const auto& input = p.inputs[0]; - const auto& out = p.outputs[0]; - - auto bBlockSizeX = out.X().v % autoTune.blockWidth == 0; - auto bBlockSizeXY = out.X().pad.Total() + out.Y().pad.Total() == 0; - auto bInputPad = input.X().pad.Total() + input.Y().pad.Total() != 0; - - if (out.Batch().v == 1) { - if ((bBlockSizeX || bBlockSizeXY) && !bInputPad) { - return FORCE_PRIORITY_1; + if (!params.is_shape_agnostic) { + const auto& p = 
static_cast(params); + auto autoTune = GetAutoTuneOptions(params, -1); + + const auto& input = p.inputs[0]; + const auto& out = p.outputs[0]; + + auto bBlockSizeX = out.X().v % autoTune.blockWidth == 0; + auto bBlockSizeXY = out.X().pad.Total() + out.Y().pad.Total() == 0; + auto bInputPad = input.X().pad.Total() + input.Y().pad.Total() != 0; + + if (out.Batch().v == 1) { + if ((bBlockSizeX || bBlockSizeXY) && !bInputPad) { + return FORCE_PRIORITY_1; + } else { + return FORCE_PRIORITY_3; + } } else { - return FORCE_PRIORITY_3; + return FORCE_PRIORITY_7; } } else { - return FORCE_PRIORITY_7; + return FORCE_PRIORITY_1; } } @@ -181,7 +179,8 @@ bool ConvolutionKernel_b_fs_yx_fsv16_1x1::Validate(const Params& p) const { const auto& input = params.inputs[0]; const auto& output = params.outputs[0]; - const bool bOutputSizes = output.X().v != input.X().v || output.Y().v != input.Y().v || output.Feature().v % 16 != 0; + const bool bOutputSizes = (!params.is_shape_agnostic && (output.X().v != input.X().v || output.Y().v != input.Y().v)) || + output.Feature().v % 16 != 0; const bool bFilterSize = params.filterSize.x != 1 || params.filterSize.y != 1; const bool bStride = params.stride.x != 1 || params.stride.y != 1; const bool bPadding = input.Feature().pad.before % tuning_data.feature_block_size != 0 || @@ -235,65 +234,78 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut GPU_DEBUG_INFO << params.layerID << " : params.fused_ops.empty(): " << params.fused_ops.empty() << std::endl; + jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth)); + jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor)); jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size)); - jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0)); - - bool padded_output = params.outputs[0].X().pad.Total() != 0; - bool non_unit_fused_op_spatial = false; + 
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size)); - // Set padded_output to true when fused inputs have paddings to have correct blocked loads - for (auto& fused_op : params.fused_ops) { - for (auto& t : fused_op.tensors) { - if (t.PitchesDifferFromLogicalDims()) { - padded_output = true; - } - if ((t.X().v > 1) || - (t.Y().v > 1) || - (t.Z().v > 1) || - (t.W().v > 1)) { - non_unit_fused_op_spatial = true; + if (!params.has_dynamic_inputs()) { + jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0)); + + bool padded_output = params.outputs[0].X().pad.Total() != 0; + bool non_unit_fused_op_spatial = false; + + // Set padded_output to true when fused inputs have paddings to have correct blocked loads + for (auto& fused_op : params.fused_ops) { + for (auto& t : fused_op.tensors) { + if (t.PitchesDifferFromLogicalDims()) { + padded_output = true; + } + if ((t.X().v > 1) || + (t.Y().v > 1) || + (t.Z().v > 1) || + (t.W().v > 1)) { + non_unit_fused_op_spatial = true; + } } } - } - jit.AddConstant(MakeJitConstant("PADDED_OUTPUT", padded_output)); - jit.AddConstant(MakeJitConstant("NON_UNIT_FUSED_OP_SPATIAL", non_unit_fused_op_spatial)); - if (params.has_dynamic_tensors()) { - // const convolution_params& cp = static_cast(params); - // DimensionAccessHelperJit dims0(cp.outputs[0]); - // auto x = dims0.x(); - // auto y = dims0.y(); - // auto f = dims0.f(); - - // auto blockWidth_str = "(" + x - - // if (x == 1 && y == 1) { - // return { 1, EXE_MODE_DEFAULT }; - // } else if (x * f <= 256) { - // if (x < 8 || x * f <= 128) - // return { 2, EXE_MODE_DEFAULT }; - // else - // return { 4, EXE_MODE_DEFAULT }; - // } else if (x * f <= 1536) { - // return { 4, EXE_MODE_DEFAULT }; - // } else { - // return { 8, EXE_MODE_DEFAULT }; - // } - jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", 8)); - jit.AddConstant(MakeJitConstant("X_BLOCKS", 8)); + jit.AddConstant(MakeJitConstant("PADDED_OUTPUT", padded_output)); + 
jit.AddConstant(MakeJitConstant("NON_UNIT_FUSED_OP_SPATIAL", non_unit_fused_op_spatial)); + + jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, tuning_data.feature_block_size))); + if (params.outputs[0].Feature().v % tuning_data.feature_block_size != 0) { + jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1)); + } + if (params.inputs[0].Feature().v % tuning_data.feature_block_size != 0) { + jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1)); + } + } } else { - jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth)); - jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(params.outputs[0].X().v, blockWidth))); - } + DimensionAccessHelperJit input0_dims(params.inputs[0]); + DimensionAccessHelperJit input0_padded_dims(params.inputs[0], true); + DimensionAccessHelperJit output_dims(params.outputs[0]); + DimensionAccessHelperJit output_padded_dims(params.outputs[0], true); + + const auto padded_input = "(" + input0_padded_dims.x_pad().first + "+" + input0_padded_dims.x_pad().first + ") != 0"; + jit.AddConstant(MakeJitConstant("PADDED_INPUT", padded_input)); + + const auto padded_output = "(" + output_padded_dims.x_pad().first + "+" + output_padded_dims.x_pad().first + ") != 0"; + jit.AddConstant(MakeJitConstant("PADDED_OUTPUT", padded_output)); + + // In shape agnostic kernel, the fused shape cannot be specified at build time or run time. + // Currently simply check whether fused_op is dynamic. Need to further follow up like static behavior.
+ bool non_unit_fused_op_spatial = false; + for (auto& fused_op : params.fused_ops) { + for (auto& t : fused_op.tensors) { + if (t.is_dynamic()) { + non_unit_fused_op_spatial = true; + break; + } + } + } + jit.AddConstant(MakeJitConstant("NON_UNIT_FUSED_OP_SPATIAL", non_unit_fused_op_spatial)); - jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor)); - jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size)); - jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, tuning_data.feature_block_size))); - if (params.outputs[0].Feature().v % tuning_data.feature_block_size != 0) { - jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1)); - } - if (params.inputs[0].Feature().v % tuning_data.feature_block_size != 0) { - jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1)); + const auto feature_block_size = std::to_string(tuning_data.feature_block_size); + const auto ic_blocks = "(" + input0_dims.f() + "+" + feature_block_size + " - 1) / " + feature_block_size; + jit.AddConstant(MakeJitConstant("IC_BLOCKS", ic_blocks)); + + const auto output_leftover_num = "(" + output_dims.f() + "%" + feature_block_size + ")"; + const auto output_leftover = "(" + output_leftover_num + "!= 0)"; + jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", output_leftover)); + + const auto input_leftover_num = "(" + input0_dims.f() + "%" + feature_block_size + ")"; + const auto input_leftover = "(" + input_leftover_num + "!= 0)"; + jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", input_leftover)); } return jit; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_depthwise.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_depthwise.cpp index 50ae880d5f1e62..efeebb300bb9f9 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_depthwise.cpp +++ 
b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_depthwise.cpp @@ -35,7 +35,6 @@ ParamsKey ConvolutionKernel_b_fs_yx_fsv16_depthwise::GetSupportedKey() const { k.EnableGroupedConvolution(); k.EnableDilation(); k.EnableDifferentTypes(); - // k.EnableDynamicShapesSupport(); return k; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp index a35873d123c6f8..adba7b625a4455 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_base.cpp @@ -163,7 +163,6 @@ ConvolutionKernelBase::DispatchData ConvolutionKernelBase::SetDefault(const conv void ConvolutionKernelBase::GetUpdateDispatchDataFunc(KernelData& kd) const { kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { - GPU_DEBUG_INFO << "kd.update_dispatch_data_func !!!" << std::endl; const auto& prim_params = static_cast(params); auto dispatchData = SetDefault(prim_params); OPENVINO_ASSERT(kd.kernels.size() == 1, "[GPU] Invalid kernels size for update dispatch data func"); From ae79d77de1b9710a1fb0d07b4b5385898e403746 Mon Sep 17 00:00:00 2001 From: hyunback Date: Fri, 21 Jun 2024 21:22:00 +0900 Subject: [PATCH 05/12] Add test case. 
Signed-off-by: hyunback --- .../graph/graph_optimizer/compile_graph.cpp | 47 +------ .../convolution_kernel_b_fs_yx_fsv16_1x1.cpp | 8 ++ .../unit/test_cases/convolution_gpu_test.cpp | 123 ++++++++++++++++++ 3 files changed, 136 insertions(+), 42 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp index ef5ed2f3c7a16e..a825162e1dd0b6 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp @@ -27,15 +27,6 @@ using namespace cldnn; -int get_env(std::string key, int &val); -int get_env(std::string key, int &val) { - if (const auto env_var = std::getenv(key.c_str())) { - val = std::atoi(env_var); - return true; - } - return false; -} - void compile_graph::run(program& p) { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "pass::CompileGraph"); for (auto& node : p.get_processing_order()) { @@ -54,9 +45,6 @@ void compile_graph::run(program& p) { GPU_DEBUG_IF(debug_config->disable_onednn_permute_fusion == 1) disable_permute_fuse_onednn_gemm = true; - int conv_sa = 0; - get_env("CONV_SA", conv_sa); - for (size_t idx = 0; idx < proc_order.size(); idx++) { auto& node = *(std::next(proc_order.begin(), idx)); const bool use_shape_agnostic_impl = !p.get_config().get_property(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape); @@ -82,37 +70,12 @@ void compile_graph::run(program& p) { } } if (node->is_type()) { - if (!conv_sa) { - std::vector conv_list = { - // "__module.conv_in/aten::_convolution/Convolution", - // "__module.down_blocks.0.resnets.0.conv1/aten::_convolution/Convolution", - // "__module.down_blocks.0.resnets.0.conv2/aten::_convolution/Convolution", - // "__module.down_blocks.0.attentions.0.proj_in/aten::_convolution/Convolution", - // "__module.down_blocks.0.attentions.0.proj_out/aten::_convolution/Convolution", - // 
"__module.down_blocks.0.resnets.1.conv1/aten::_convolution/Convolution", - // "__module.down_blocks.0.resnets.1.conv2/aten::_convolution/Convolution", - // "__module.down_blocks.1.resnets.0.conv_shortcut/aten::_convolution/Convolution", - }; - bool is_hit = false; - for (auto conv_id : conv_list) { - if (node->id().find(conv_id) != std::string::npos) { - is_hit = true; - break; - } - } - if (!is_hit) { - change_initial_impl = false; - } + auto w_layout = node->as().weights().get_output_layout(); + if (w_layout.spatial(0) != 1 || w_layout.spatial(1) != 1) { + change_initial_impl = false; } else { - bool is_hit = false; - auto w_layout = node->as().weights().get_output_layout(); - if (w_layout.spatial(0) == 1 && w_layout.spatial(1) == 1) { - is_hit = true; - GPU_DEBUG_INFO << node->id() << ": " << w_layout.to_short_string() << std::endl; - } - if (!is_hit) { - change_initial_impl = false; - } + // will be removed.. + GPU_DEBUG_INFO << node->id() << ": " << w_layout.to_short_string() << std::endl; } } } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index a102e09001c7ee..e4d4f0e5f9cc57 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -179,6 +179,9 @@ bool ConvolutionKernel_b_fs_yx_fsv16_1x1::Validate(const Params& p) const { const auto& input = params.inputs[0]; const auto& output = params.outputs[0]; + GPU_DEBUG_INFO << "input: " << input.Batch().v << ", " << input.Feature().v << ", " << input.Y().v << ", " << input.X().v << std::endl; + GPU_DEBUG_INFO << "output: " << output.Batch().v << ", " << output.Feature().v << ", " << output.Y().v << ", " << output.X().v << std::endl; + const bool bOutputSizes = 
(!params.is_shape_agnostic && (output.X().v != input.X().v || output.Y().v != input.Y().v)) || output.Feature().v % 16 != 0; const bool bFilterSize = params.filterSize.x != 1 || params.filterSize.y != 1; @@ -186,6 +189,11 @@ bool ConvolutionKernel_b_fs_yx_fsv16_1x1::Validate(const Params& p) const { const bool bPadding = input.Feature().pad.before % tuning_data.feature_block_size != 0 || output.Feature().pad.before % tuning_data.feature_block_size != 0; + GPU_DEBUG_INFO << bOutputSizes << ", " << bFilterSize << ", " << bStride << ", " << bPadding << std::endl; + if (bOutputSizes) { + GPU_DEBUG_INFO << params.is_shape_agnostic << " && " << output.X().v << " != " << input.X().v << ", " + << output.Y().v << " != " << input.Y().v << " || " << output.Feature().v << "% 16 != 0" << std::endl; + } if (bOutputSizes || bFilterSize || bStride || bPadding) { return false; } diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp index 0a98a1c371b0cb..cd33393c834c2b 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp @@ -10326,6 +10326,125 @@ TEST_P(conv_dyn_test, convolution_gpu_bfyx_os_iyx_osv32_no_bias) { } } +TEST_P(conv_dyn_test, convolution_gpu_fsv16_1x1_no_bias) { + auto& engine = get_test_engine(); + auto p = GetParam(); + + auto is_grouped = p.wei_shape.size() == 5; + + if (is_grouped) { + std::cout << "[ SKIPPED ] The test is skipped (group convolution is not supported)." 
<< std::endl; + ASSERT_EQ(1, 1); + return; + } + auto groups_num = 1; + + auto is_weight_1x1 = (p.wei_shape[p.wei_shape.size() - 1] == 1 && p.wei_shape[p.wei_shape.size() - 2] == 1); + auto is_valid_output = p.wei_shape[0] % 16 == 0; + auto is_valid_strid = p.stride[0] == 1 && p.stride[1] == 1; + auto is_valid_padding = p.pad_begin[0] == 0 && p.pad_begin[1] == 0 && p.pad_end[0] == 0 && p.pad_end[1] == 0; + + if (!is_weight_1x1 || !is_valid_output || !is_valid_strid || !is_valid_padding) { + std::cout << "[ SKIPPED ] The test is skipped (is_weight_1x1:" << is_weight_1x1 << ", is_valid_output" << is_valid_output + << ", is_valid_strid: " << is_valid_strid << ", is_valid_padding: " << is_valid_padding << std::endl; + ASSERT_EQ(1, 1); + return; + } + + auto calculate_ref = [&](memory::ptr input, memory::ptr weights, ExecutionConfig config) { + auto in_layout = input->get_layout(); + + topology topology_ref( + input_layout("input", in_layout), + data("weights", weights), + convolution("conv", input_info("input"), "weights", no_bias, groups_num, p.stride, p.dilation, p.pad_begin, p.pad_end, is_grouped)); + + network network_ref(engine, topology_ref, config); + network_ref.set_input_data("input", input); + + auto outputs_ref = network_ref.execute(); + + return outputs_ref.at("conv").get_memory(); + }; + + auto in_layout = layout{ov::PartialShape{ov::Dimension(), ov::Dimension(p.in_shape[1]), ov::Dimension(), ov::Dimension()}, data_types::f16, format::b_fs_yx_fsv16}; + auto input = engine.allocate_memory({ p.in_shape, data_types::f16, format::b_fs_yx_fsv16 }); + auto weights = engine.allocate_memory({p.wei_shape, data_types::f16, is_grouped ? 
format::bfzyx : format::bfyx}); + + tests::random_generator rg(GET_SUITE_NAME); + VF input_rnd = rg.generate_random_1d(ov::shape_size(p.in_shape), -10, 10); + VF weights_rnd = rg.generate_random_1d(ov::shape_size(p.wei_shape), -10, 10); + + set_values(input, input_rnd); + set_values(weights, weights_rnd); + + topology topology( + input_layout("input", in_layout), + data("weights", weights), + convolution("conv", input_info("input"), "weights", no_bias, groups_num, p.stride, p.dilation, p.pad_begin, p.pad_end, is_grouped)); + + ExecutionConfig config = get_test_default_config(engine); + ov::intel_gpu::ImplementationDesc conv_impl = { format::b_fs_yx_fsv16, "convolution_gpu_bfyx_f16_1x1", impl_types::ocl }; + config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "conv", conv_impl } })); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::enable_profiling(true)); + + network network(engine, topology, config); + network.set_input_data("input", input); + + auto inst = network.get_primitive("conv"); + auto impl = inst->get_impl(); + ASSERT_TRUE(impl != nullptr); + ASSERT_TRUE(impl->is_dynamic()); + + auto outputs = network.execute(); + + auto output_memory = outputs.at("conv").get_memory(); + + // Convolution_fsv16_1x1 static kernel has an accuracy issue when feature size is bigger 16. + // So we use reference data from convolution reference kernel. 
+ ExecutionConfig ref_config = get_test_default_config(engine); + ov::intel_gpu::ImplementationDesc ref_conv_impl = { format::b_fs_yx_fsv16, "convolution_gpu_ref", impl_types::ocl }; + ref_config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "conv", ref_conv_impl } })); + + auto output_memory_ref = calculate_ref(input, weights, ref_config); + + cldnn::mem_lock output_ptr(output_memory, get_test_stream()); + cldnn::mem_lock output_ptr_ref(output_memory_ref, get_test_stream()); + + + ASSERT_EQ(outputs.at("conv").get_layout(), output_memory_ref->get_layout()); + for (size_t i = 0; i < output_ptr.size(); i++) { + ASSERT_EQ(output_ptr[i], output_ptr_ref[i]); + } + + { + // Change original shape for the second run + auto new_shape = p.in_shape; + new_shape[2] += 4; + new_shape[3] += 8; + + auto input = engine.allocate_memory({ new_shape, data_types::f16, format::b_fs_yx_fsv16 }); + + VF input_rnd = rg.generate_random_1d(ov::shape_size(p.in_shape), -10, 10); + set_values(input, input_rnd); + + network.set_input_data("input", input); + auto outputs = network.execute(); + + auto output_memory = outputs.at("conv").get_memory(); + auto output_memory_ref = calculate_ref(input, weights, config); + + cldnn::mem_lock output_ptr(output_memory, get_test_stream()); + cldnn::mem_lock output_ptr_ref(output_memory_ref, get_test_stream()); + + ASSERT_EQ(outputs.at("conv").get_layout(), output_memory_ref->get_layout()); + for (size_t i = 0; i < output_ptr.size(); i++) { + ASSERT_EQ(output_ptr[i], output_ptr_ref[i]); + } + } +} + INSTANTIATE_TEST_SUITE_P(smoke, conv_dyn_test, testing::ValuesIn(std::vector{ { ov::Shape{1, 8, 14, 14}, ov::Shape{16, 8, 3, 3}, ov::Strides{1, 1}, ov::Strides{1, 1}, ov::CoordinateDiff{0, 0}, ov::CoordinateDiff{0, 0} }, @@ -10347,4 +10466,8 @@ INSTANTIATE_TEST_SUITE_P(smoke, conv_dyn_test, { ov::Shape{1, 3, 32, 32}, ov::Shape{96, 3, 4, 4}, ov::Strides{4, 4}, ov::Strides{1, 1}, ov::CoordinateDiff{0, 0}, ov::CoordinateDiff{0, 
0} }, { ov::Shape{1, 768, 7, 7}, ov::Shape{768, 1, 1, 3, 3}, ov::Strides{1, 1}, ov::Strides{1, 1}, ov::CoordinateDiff{1, 1}, ov::CoordinateDiff{1, 1} }, { ov::Shape{1, 48, 56, 56}, ov::Shape{48, 48, 8, 8}, ov::Strides{8, 8}, ov::Strides{1, 1}, ov::CoordinateDiff{0, 0}, ov::CoordinateDiff{0, 0} }, + { ov::Shape{1, 4, 16, 16}, ov::Shape{16, 4, 1, 1}, ov::Strides{1, 1}, ov::Strides{1, 1}, ov::CoordinateDiff{0, 0}, ov::CoordinateDiff{0, 0} }, + { ov::Shape{1, 16, 15, 17}, ov::Shape{16, 16, 1, 1}, ov::Strides{1, 1}, ov::Strides{1, 1}, ov::CoordinateDiff{0, 0}, ov::CoordinateDiff{0, 0} }, + { ov::Shape{1, 16, 17, 18}, ov::Shape{16, 16, 1, 1}, ov::Strides{1, 1}, ov::Strides{1, 1}, ov::CoordinateDiff{0, 0}, ov::CoordinateDiff{0, 0} }, + { ov::Shape{1, 32, 64, 64}, ov::Shape{32, 32, 1, 1}, ov::Strides{1, 1}, ov::Strides{1, 1}, ov::CoordinateDiff{0, 0}, ov::CoordinateDiff{0, 0} }, })); From f3fa3d3a9b047fa178729601866dcad5e2c3e217 Mon Sep 17 00:00:00 2001 From: hyunback Date: Fri, 21 Jun 2024 22:55:21 +0900 Subject: [PATCH 06/12] Fix unit-test failure Signed-off-by: hyunback --- .../convolution_kernel_b_fs_yx_fsv16_1x1.cpp | 1 + .../tests/unit/test_cases/convolution_gpu_test.cpp | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index e4d4f0e5f9cc57..40a2b66b4736fe 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -70,6 +70,7 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionTuningData ConvolutionKernel_b_f const auto& input = params.inputs[0]; bool block_size_one_is_better = params.outputs[0].X().v == 1 && params.outputs[0].Y().v == 1 && input.Feature().v >= 2048; 
+ // Accuracy issue is found with input.Feature() > 16 in static kernel, Need to fix later. if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.supports_imad && !block_size_one_is_better) { size_t ic_blocks = CeilDiv(input.Feature().v, tuning_data.feature_block_size); size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size; diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp index cd33393c834c2b..06573cdb565604 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp @@ -10351,6 +10351,12 @@ TEST_P(conv_dyn_test, convolution_gpu_fsv16_1x1_no_bias) { return; } + if (!engine.get_device_info().supports_immad && p.in_shape[1] > 16) { + std::cout << "[ SKIPPED ] The test is skipped (convolution_fsv16_1x1 static kernel has accuracy issue with input feature > 16 in igpu)." << std::endl; + ASSERT_EQ(1, 1); + return; + } + auto calculate_ref = [&](memory::ptr input, memory::ptr weights, ExecutionConfig config) { auto in_layout = input->get_layout(); @@ -10401,13 +10407,7 @@ TEST_P(conv_dyn_test, convolution_gpu_fsv16_1x1_no_bias) { auto output_memory = outputs.at("conv").get_memory(); - // Convolution_fsv16_1x1 static kernel has an accuracy issue when feature size is bigger 16. - // So we use reference data from convolution reference kernel. 
- ExecutionConfig ref_config = get_test_default_config(engine); - ov::intel_gpu::ImplementationDesc ref_conv_impl = { format::b_fs_yx_fsv16, "convolution_gpu_ref", impl_types::ocl }; - ref_config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ { "conv", ref_conv_impl } })); - - auto output_memory_ref = calculate_ref(input, weights, ref_config); + auto output_memory_ref = calculate_ref(input, weights, config); cldnn::mem_lock output_ptr(output_memory, get_test_stream()); cldnn::mem_lock output_ptr_ref(output_memory_ref, get_test_stream()); From 90a64fa085c8bcecc35e885bd3fad451d3f0ff6b Mon Sep 17 00:00:00 2001 From: hyunback Date: Mon, 24 Jun 2024 12:38:46 +0900 Subject: [PATCH 07/12] Apply code-review comment. Signed-off-by: hyunback --- .../convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index 40a2b66b4736fe..94cc7006ad5838 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -299,6 +299,14 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut if (t.is_dynamic()) { non_unit_fused_op_spatial = true; break; + } else { + if ((t.X().v > 1) || + (t.Y().v > 1) || + (t.Z().v > 1) || + (t.W().v > 1)) { + non_unit_fused_op_spatial = true; + break; + } } } } From bebb8190c5a862d2b9d50ae297e2d4f1897ece0c Mon Sep 17 00:00:00 2001 From: hyunback Date: Tue, 25 Jun 2024 19:15:11 +0900 Subject: [PATCH 08/12] Update for code-review comment. 
Signed-off-by: hyunback --- .../graph/graph_optimizer/compile_graph.cpp | 6 +- .../intel_gpu/src/graph/primitive_inst.cpp | 11 +- .../convolution_kernel_b_fs_yx_fsv16_1x1.cpp | 186 +++++++++++++++--- .../convolution_kernel_b_fs_yx_fsv16_1x1.h | 3 +- 4 files changed, 171 insertions(+), 35 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp index a825162e1dd0b6..513ff675a62916 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp @@ -73,9 +73,6 @@ void compile_graph::run(program& p) { auto w_layout = node->as().weights().get_output_layout(); if (w_layout.spatial(0) != 1 || w_layout.spatial(1) != 1) { change_initial_impl = false; - } else { - // will be removed.. - GPU_DEBUG_INFO << node->id() << ": " << w_layout.to_short_string() << std::endl; } } } @@ -108,7 +105,8 @@ void compile_graph::run(program& p) { bool is_planar = format::is_default_format(node->get_output_layout().format); - if (!node->is_type() && node->is_dynamic() && !is_planar) + // TODO check more. 
+ if ((node->is_dynamic() && !is_planar && !node->is_type())) can_select_impl = false; if (node->is_type() || node->is_type() || node->is_type()) diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index e3eb97a7e5c9d9..72ca3604165f9b 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -772,7 +772,16 @@ bool primitive_inst::use_async_compilation() { compile_gemm_impls = _node->get_selected_impl() && _node->get_selected_impl()->get_kernel_name().find("gemm_ref") != std::string::npos; compile_gemm_impls |= (_node->get_preferred_impl_type() == impl_types::onednn); } - + // bool compile_conv_impls = _node->is_type(); + // if (compile_conv_impls) { + // // Do not async-compile if opt_gemm is chosen for iGPU + // // Do async-compile if it is to be executed from onednn + // auto is_ref_kernel = _node->get_selected_impl() && _node->get_selected_impl()->get_kernel_name().find("convolution_gpu_ref") != std::string::npos; + // // compile_conv_impls |= (_node->get_preferred_impl_type() == impl_types::onednn); + // return !is_ref_kernel; + // } + + // return (_node->is_type() || compile_fc_impls || compile_gemm_impls || compile_conv_impls || return (_node->is_type() || compile_fc_impls || compile_gemm_impls || (_node->is_type() && _node->get_selected_impl() && _node->get_selected_impl()->get_kernel_name().find("softmax_gpu_ref") != std::string::npos)); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index 94cc7006ad5838..073b5d239e06b9 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -20,14 +20,12 
@@ ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionKernel_b_fs_yx_fsv16_1x1() : Con } } -ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16_1x1::GetAutoTuneOptions(const Params& params, +ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16_1x1::GetAutoTuneOptions(const convolution_params& params, int /*autoTuneIndex*/) const { - if (!params.is_shape_agnostic) { - const convolution_params& cp = static_cast(params); - - auto x = cp.outputs[0].X().v; - auto y = cp.outputs[0].Y().v; - auto f = cp.outputs[0].Feature().v; + if (!params.has_dynamic_outputs()) { + auto x = params.outputs[0].X().v; + auto y = params.outputs[0].Y().v; + auto f = params.outputs[0].Feature().v; if (x == 1 && y == 1) { return { 1, EXE_MODE_DEFAULT }; @@ -43,7 +41,7 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fs } } else { // In shape agnostic kernel, the output shape cannot be specified at build time, - // So we set blockWidth to 8, which is the most commonly used. + // So we initialy set blockWidth to 8 which is the most commonly used. Update blockWidth after static shape comes. 
return { 8, EXE_MODE_DEFAULT }; } } @@ -66,7 +64,7 @@ float ConvolutionKernel_b_fs_yx_fsv16_1x1::EstimateOccupancy(const convolution_p ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionTuningData ConvolutionKernel_b_fs_yx_fsv16_1x1::GetTuningParams(const convolution_params& params) const { ConvolutionTuningData tuning_data; - if (!params.is_shape_agnostic) { + if (!params.has_dynamic_tensors()) { const auto& input = params.inputs[0]; bool block_size_one_is_better = params.outputs[0].X().v == 1 && params.outputs[0].Y().v == 1 && input.Feature().v >= 2048; @@ -136,16 +134,16 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa dispatchData.lws[1] = tuning_data.work_group_size; dispatchData.lws[2] = 1; - GPU_DEBUG_INFO << "gws: " << dispatchData.gws[0] << ", " << dispatchData.gws[1] << ", " << dispatchData.gws[2] << std::endl; - GPU_DEBUG_INFO << "lws: " << dispatchData.lws[0] << ", " << dispatchData.lws[1] << ", " << dispatchData.lws[2] << std::endl; + // GPU_DEBUG_INFO << "gws: " << dispatchData.gws[0] << ", " << dispatchData.gws[1] << ", " << dispatchData.gws[2] << std::endl; + // GPU_DEBUG_INFO << "lws: " << dispatchData.lws[0] << ", " << dispatchData.lws[1] << ", " << dispatchData.lws[2] << std::endl; return dispatchData; } KernelsPriority ConvolutionKernel_b_fs_yx_fsv16_1x1::GetKernelsPriority(const Params& params) const { - if (!params.is_shape_agnostic) { - const auto& p = static_cast(params); - auto autoTune = GetAutoTuneOptions(params, -1); + const auto& p = static_cast(params); + if (!p.has_dynamic_tensors()) { + auto autoTune = GetAutoTuneOptions(p, -1); const auto& input = p.inputs[0]; const auto& out = p.outputs[0]; @@ -180,21 +178,22 @@ bool ConvolutionKernel_b_fs_yx_fsv16_1x1::Validate(const Params& p) const { const auto& input = params.inputs[0]; const auto& output = params.outputs[0]; - GPU_DEBUG_INFO << "input: " << input.Batch().v << ", " << input.Feature().v << ", " << input.Y().v << ", " << input.X().v << std::endl; - 
GPU_DEBUG_INFO << "output: " << output.Batch().v << ", " << output.Feature().v << ", " << output.Y().v << ", " << output.X().v << std::endl; + // GPU_DEBUG_INFO << "input: " << input.Batch().v << ", " << input.Feature().v << ", " << input.Y().v << ", " << input.X().v << std::endl; + // GPU_DEBUG_INFO << "output: " << output.Batch().v << ", " << output.Feature().v << ", " << output.Y().v << ", " << output.X().v << std::endl; - const bool bOutputSizes = (!params.is_shape_agnostic && (output.X().v != input.X().v || output.Y().v != input.Y().v)) || - output.Feature().v % 16 != 0; + const bool bOutputSizes = (!input.X().is_dynamic && !output.X().is_dynamic && output.X().v != input.X().v) || + (!input.Y().is_dynamic && !output.Y().is_dynamic && output.Y().v != input.Y().v) || + (!output.Feature().is_dynamic && output.Feature().v % 16 != 0); const bool bFilterSize = params.filterSize.x != 1 || params.filterSize.y != 1; const bool bStride = params.stride.x != 1 || params.stride.y != 1; - const bool bPadding = input.Feature().pad.before % tuning_data.feature_block_size != 0 || - output.Feature().pad.before % tuning_data.feature_block_size != 0; - - GPU_DEBUG_INFO << bOutputSizes << ", " << bFilterSize << ", " << bStride << ", " << bPadding << std::endl; - if (bOutputSizes) { - GPU_DEBUG_INFO << params.is_shape_agnostic << " && " << output.X().v << " != " << input.X().v << ", " - << output.Y().v << " != " << input.Y().v << " || " << output.Feature().v << "% 16 != 0" << std::endl; - } + const bool bPadding = (!input.Feature().pad.is_dynamic && input.Feature().pad.before % tuning_data.feature_block_size != 0) || + (!output.Feature().pad.is_dynamic && output.Feature().pad.before % tuning_data.feature_block_size != 0); + + // GPU_DEBUG_INFO << bOutputSizes << ", " << bFilterSize << ", " << bStride << ", " << bPadding << std::endl; + // if (bOutputSizes) { + // GPU_DEBUG_INFO << params.is_shape_agnostic << " && " << output.X().v << " != " << input.X().v << ", " + // << 
output.Y().v << " != " << input.Y().v << " || " << output.Feature().v << "% 16 != 0" << std::endl; + // } if (bOutputSizes || bFilterSize || bStride || bPadding) { return false; } @@ -241,7 +240,7 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut jit.Merge(MakeFusedOpsJitConstants(params, { conf_vec, conf_scalar1, conf_scalar2 })); } - GPU_DEBUG_INFO << params.layerID << " : params.fused_ops.empty(): " << params.fused_ops.empty() << std::endl; + // GPU_DEBUG_INFO << params.layerID << " : params.fused_ops.empty(): " << params.fused_ops.empty() << std::endl; jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth)); jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor)); @@ -328,8 +327,137 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut return jit; } -KernelsData ConvolutionKernel_b_fs_yx_fsv16_1x1::GetKernelsData(const Params& params) const { - return GetCommonKernelsData(params, EXE_MODE_DEFAULT, -1); + KernelsData ConvolutionKernel_b_fs_yx_fsv16_1x1::GetKernelsData(const Params& params) const { + size_t num_kernels = params.is_shape_agnostic ? 
4 : 1; + KernelData kd = KernelData::Default(params, num_kernels); + convolution_params& newParams = *static_cast(kd.params.get()); + + if (!Validate(params)) { + return {}; + } + + auto preferredWeightsLayout = GetPreferredWeightsLayout(newParams); + bool succeed = UpdateWeightsParams(newParams, + preferredWeightsLayout, + kd.weightsReorderParams, + GetSupportedKey(), + newParams.groups, + newParams.transposed); + + bool bSupportedWeightsLayout = newParams.weights.GetLayout() == preferredWeightsLayout; + const bool bWeightsOK = bSupportedWeightsLayout || newParams.allowStaticInputReordering; + + if (!succeed || !bWeightsOK) { + return {}; + } + + if (NeedPaddedInput()) { + if (newParams.has_dynamic_inputs()) { + if (!CheckConvolutionExplicitPaddings(newParams)) + return {}; + } else { + kd.reorderInput = ConvolutionUpdateInputParams(newParams); + + if (kd.reorderInput && !newParams.allowInputReordering) + return {}; + } + } + + DispatchData dispatchData = SetDefault(newParams, -1); + + if (!params.is_shape_agnostic && !CheckWorkGroups(dispatchData)) { + // Internal Error - wrong calculation of global/local work group sizes + return {}; + } + + auto finalKernelName = GetKernelName(newParams); + auto cldnnJit = GetJitConstants(newParams, dispatchData); + for (size_t i = 0; i < num_kernels; i++) { + if (params.is_shape_agnostic) { + cldnnJit.RemoveConstant("X_BLOCK_SIZE"); + if (i == 0) { + cldnnJit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", "1")); + } else if (i == 1) { + cldnnJit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", "2")); + } else if (i == 2) { + cldnnJit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", "4")); + } else if (i == 3) { + cldnnJit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", "8")); + } + } + auto entryPoint = GetEntryPoint(finalKernelName, newParams.layerID, params, i); + auto jit = CreateJit(finalKernelName, cldnnJit, entryPoint); + + GetUpdateDispatchDataFunc(kd); + + auto& kernel = kd.kernels[i]; + FillCLKernelData(kernel, + dispatchData, + 
params.engineInfo, + finalKernelName, + jit, + entryPoint, + EXE_MODE_DEFAULT, + true, + !newParams.bias.empty(), + 1, 0, 1, + newParams.is_shape_agnostic); + + if (newParams.deformable_mode) { + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1}); + if (newParams.deformable_mask_enabled) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 2}); + } + + if (!newParams.weights_zero_points.empty()) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::WEIGHTS_ZERO_POINTS, 1}); + if (!newParams.activations_zero_points.empty()) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::ACTIVATIONS_ZERO_POINTS, 1}); + if (!newParams.compensation.empty()) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::COMPENSATION, 1}); + + uint32_t fused_deps_total = 0; + for (auto& fused_dep : newParams.fused_ops) { + for (int i = 0; i < static_cast(fused_dep.dep_size); i++) { + kernel.params.arguments.push_back({ ArgumentDescriptor::Types::INPUT_OF_FUSED_PRIMITIVE, fused_deps_total }); + fused_deps_total++; + } + } + } + kd.autoTuneIndex = -1; + + return {kd}; + } + +void ConvolutionKernel_b_fs_yx_fsv16_1x1::GetUpdateDispatchDataFunc(KernelData& kd) const { + if (kd.kernels.size() == 1) { + Parent::GetUpdateDispatchDataFunc(kd); + } else { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kd) { + const auto& prim_params = static_cast(params); + auto dispatchData = SetDefault(prim_params); + size_t execute_kernel_idx = 3; + if (dispatchData.cldnnStyle.blockWidth == 1) { + execute_kernel_idx = 0; + } else if (dispatchData.cldnnStyle.blockWidth == 2) { + execute_kernel_idx = 1; + } else if (dispatchData.cldnnStyle.blockWidth == 4) { + execute_kernel_idx = 2; + } + for (size_t i = 0; i < kd.kernels.size(); i++) { + kd.kernels[i].params.workGroups.global = dispatchData.gws; + kd.kernels[i].params.workGroups.local = dispatchData.lws; + if (execute_kernel_idx == i) { + kd.kernels[i].skip_execution 
= KernelData::SkipKernelExecution(prim_params); + } else { + kd.kernels[i].skip_execution = true; + } + } + kd.internalBufferSizes.clear(); + kd.internalBufferSizes.push_back(prim_params.inputs[0].PhysicalSizeInBytes()); + kd.internalBufferDataType = prim_params.inputs[0].GetDType(); + }; + } } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.h index 9d5440816c3209..c01eeb4ec9f404 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.h @@ -34,6 +34,7 @@ class ConvolutionKernel_b_fs_yx_fsv16_1x1 : public ConvolutionKernelBase { bool Validate(const Params& p) const override; DispatchData SetDefault(const convolution_params& arg, int autoTuneIndex = -1) const override; JitConstants GetJitConstants(const convolution_params& params, const DispatchData& dispatchData) const override; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; struct AutoTuneOption { size_t blockWidth; @@ -48,7 +49,7 @@ class ConvolutionKernel_b_fs_yx_fsv16_1x1 : public ConvolutionKernelBase { }; std::vector autoTuneOptions; - AutoTuneOption GetAutoTuneOptions(const Params& arg, int autoTuneIndex) const; + AutoTuneOption GetAutoTuneOptions(const convolution_params& arg, int autoTuneIndex) const; ConvolutionTuningData GetTuningParams(const convolution_params& params) const; float EstimateOccupancy(const convolution_params& params, const ConvolutionTuningData& tuning_data) const; }; From a7ea15c0ec79ea167e5ebf57e473c59317d2424f Mon Sep 17 00:00:00 2001 From: hyunback Date: Wed, 26 Jun 2024 12:11:08 +0900 Subject: [PATCH 09/12] Update. 
Signed-off-by: hyunback --- .../src/graph/graph_optimizer/compile_graph.cpp | 6 ++++-- .../convolution_kernel_b_fs_yx_fsv16_1x1.cpp | 13 ------------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp index 513ff675a62916..f0f1cb2d2182ca 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp @@ -71,6 +71,7 @@ void compile_graph::run(program& p) { } if (node->is_type()) { auto w_layout = node->as().weights().get_output_layout(); + // Only convolution_fsv16_1x1 is available shape agnostic kernel for onednn convolution using the block format(fsv16) if (w_layout.spatial(0) != 1 || w_layout.spatial(1) != 1) { change_initial_impl = false; } @@ -105,9 +106,10 @@ void compile_graph::run(program& p) { bool is_planar = format::is_default_format(node->get_output_layout().format); - // TODO check more. 
- if ((node->is_dynamic() && !is_planar && !node->is_type())) + if ((node->is_dynamic() && !is_planar && + (!node->is_type() || (node->is_type() && node->get_output_layout().format != cldnn::format::b_fs_yx_fsv16)))) { can_select_impl = false; + } if (node->is_type() || node->is_type() || node->is_type()) can_select_impl = true; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index 073b5d239e06b9..996deb87661b04 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -134,9 +134,6 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa dispatchData.lws[1] = tuning_data.work_group_size; dispatchData.lws[2] = 1; - // GPU_DEBUG_INFO << "gws: " << dispatchData.gws[0] << ", " << dispatchData.gws[1] << ", " << dispatchData.gws[2] << std::endl; - // GPU_DEBUG_INFO << "lws: " << dispatchData.lws[0] << ", " << dispatchData.lws[1] << ", " << dispatchData.lws[2] << std::endl; - return dispatchData; } @@ -178,9 +175,6 @@ bool ConvolutionKernel_b_fs_yx_fsv16_1x1::Validate(const Params& p) const { const auto& input = params.inputs[0]; const auto& output = params.outputs[0]; - // GPU_DEBUG_INFO << "input: " << input.Batch().v << ", " << input.Feature().v << ", " << input.Y().v << ", " << input.X().v << std::endl; - // GPU_DEBUG_INFO << "output: " << output.Batch().v << ", " << output.Feature().v << ", " << output.Y().v << ", " << output.X().v << std::endl; - const bool bOutputSizes = (!input.X().is_dynamic && !output.X().is_dynamic && output.X().v != input.X().v) || (!input.Y().is_dynamic && !output.Y().is_dynamic && output.Y().v != input.Y().v) || (!output.Feature().is_dynamic && output.Feature().v % 16 != 0); @@ 
-189,11 +183,6 @@ bool ConvolutionKernel_b_fs_yx_fsv16_1x1::Validate(const Params& p) const { const bool bPadding = (!input.Feature().pad.is_dynamic && input.Feature().pad.before % tuning_data.feature_block_size != 0) || (!output.Feature().pad.is_dynamic && output.Feature().pad.before % tuning_data.feature_block_size != 0); - // GPU_DEBUG_INFO << bOutputSizes << ", " << bFilterSize << ", " << bStride << ", " << bPadding << std::endl; - // if (bOutputSizes) { - // GPU_DEBUG_INFO << params.is_shape_agnostic << " && " << output.X().v << " != " << input.X().v << ", " - // << output.Y().v << " != " << input.Y().v << " || " << output.Feature().v << "% 16 != 0" << std::endl; - // } if (bOutputSizes || bFilterSize || bStride || bPadding) { return false; } @@ -240,8 +229,6 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut jit.Merge(MakeFusedOpsJitConstants(params, { conf_vec, conf_scalar1, conf_scalar2 })); } - // GPU_DEBUG_INFO << params.layerID << " : params.fused_ops.empty(): " << params.fused_ops.empty() << std::endl; - jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth)); jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor)); jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size)); From 4c3b5328e387f25179f40a93032f91b6847221e5 Mon Sep 17 00:00:00 2001 From: hyunback Date: Wed, 26 Jun 2024 12:15:57 +0900 Subject: [PATCH 10/12] Remove dummy code. 
Signed-off-by: hyunback --- src/plugins/intel_gpu/src/graph/primitive_inst.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 72ca3604165f9b..e3eb97a7e5c9d9 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -772,16 +772,7 @@ bool primitive_inst::use_async_compilation() { compile_gemm_impls = _node->get_selected_impl() && _node->get_selected_impl()->get_kernel_name().find("gemm_ref") != std::string::npos; compile_gemm_impls |= (_node->get_preferred_impl_type() == impl_types::onednn); } - // bool compile_conv_impls = _node->is_type(); - // if (compile_conv_impls) { - // // Do not async-compile if opt_gemm is chosen for iGPU - // // Do async-compile if it is to be executed from onednn - // auto is_ref_kernel = _node->get_selected_impl() && _node->get_selected_impl()->get_kernel_name().find("convolution_gpu_ref") != std::string::npos; - // // compile_conv_impls |= (_node->get_preferred_impl_type() == impl_types::onednn); - // return !is_ref_kernel; - // } - - // return (_node->is_type() || compile_fc_impls || compile_gemm_impls || compile_conv_impls || + return (_node->is_type() || compile_fc_impls || compile_gemm_impls || (_node->is_type() && _node->get_selected_impl() && _node->get_selected_impl()->get_kernel_name().find("softmax_gpu_ref") != std::string::npos)); From 02d92f05e9f4522564c78e7afbaffa9d4d5489b1 Mon Sep 17 00:00:00 2001 From: hyunback Date: Wed, 26 Jun 2024 14:39:15 +0900 Subject: [PATCH 11/12] Update Signed-off-by: hyunback --- .../intel_gpu/src/graph/graph_optimizer/compile_graph.cpp | 3 ++- .../convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp 
b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp index f0f1cb2d2182ca..946a2d5a06c614 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp @@ -71,7 +71,8 @@ void compile_graph::run(program& p) { } if (node->is_type()) { auto w_layout = node->as().weights().get_output_layout(); - // Only convolution_fsv16_1x1 is available shape agnostic kernel for onednn convolution using the block format(fsv16) + // Convolution_fsv16_1x1 is only available shape agnostic kernel for onednn convolution which uses the block format.(fsv16) + // Onednn convolution doesn't support input padding but most of cldnn optimized convolution require input padding except fsv16_1x1. if (w_layout.spatial(0) != 1 || w_layout.spatial(1) != 1) { change_initial_impl = false; } diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index 996deb87661b04..c570a4aa1004f1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -40,8 +40,8 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fs return { 8, EXE_MODE_DEFAULT }; } } else { - // In shape agnostic kernel, the output shape cannot be specified at build time, - // So we initialy set blockWidth to 8 which is the most commonly used. Update blockWidth after static shape comes. + // In shape agnostic kernel, the output shape can not be specified at build time, + // So we prepare 4 kernels(blockWith 1, 2, 4, 8) in advance and then use proper kernel at runtime when static shape comes. 
return { 8, EXE_MODE_DEFAULT }; } } @@ -139,7 +139,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa KernelsPriority ConvolutionKernel_b_fs_yx_fsv16_1x1::GetKernelsPriority(const Params& params) const { const auto& p = static_cast(params); - if (!p.has_dynamic_tensors()) { + if (!p.is_shape_agnostic) { auto autoTune = GetAutoTuneOptions(p, -1); const auto& input = p.inputs[0]; From 57e413348fbad8ef23e3e390f4e725c9c8d4b453 Mon Sep 17 00:00:00 2001 From: hyunback Date: Thu, 27 Jun 2024 09:41:05 +0900 Subject: [PATCH 12/12] Remove dummy internal buffer. Signed-off-by: hyunback --- .../convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp index c570a4aa1004f1..688e09c8010030 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp @@ -440,9 +440,6 @@ void ConvolutionKernel_b_fs_yx_fsv16_1x1::GetUpdateDispatchDataFunc(KernelData& kd.kernels[i].skip_execution = true; } } - kd.internalBufferSizes.clear(); - kd.internalBufferSizes.push_back(prim_params.inputs[0].PhysicalSizeInBytes()); - kd.internalBufferDataType = prim_params.inputs[0].GetDType(); }; } }