@@ -45,7 +45,6 @@ void compile_graph::run(program& p) {
GPU_DEBUG_IF(debug_config->disable_onednn_permute_fusion == 1)
disable_permute_fuse_onednn_gemm = true;


for (size_t idx = 0; idx < proc_order.size(); idx++) {
auto& node = *(std::next(proc_order.begin(), idx));
const bool use_shape_agnostic_impl = !p.get_config().get_property(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape);
@@ -70,6 +69,15 @@ void compile_graph::run(program& p) {
change_initial_impl = false;
}
}
if (node->is_type<convolution>()) {
auto w_layout = node->as<convolution>().weights().get_output_layout();
if (w_layout.spatial(0) != 1 || w_layout.spatial(1) != 1) {
change_initial_impl = false;
} else {
// will be removed..
GPU_DEBUG_INFO << node->id() << ": " << w_layout.to_short_string() << std::endl;
}
}
}

if (change_initial_impl)
@@ -100,7 +108,7 @@ void compile_graph::run(program& p) {

bool is_planar = format::is_default_format(node->get_output_layout().format);

if (!node->is_type<convolution>() && node->is_dynamic() && !is_planar)
can_select_impl = false;

if (node->is_type<condition>() || node->is_type<loop>() || node->is_type<proposal>())
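For context (a side note, not part of the change): in the GPU plugin's layouts, spatial(0) and spatial(1) of the weights layout are the kernel's X and Y extents, so the added convolution check clears change_initial_impl for any convolution whose weights are not 1x1. A minimal stand-alone sketch of the same predicate, with a hypothetical Layout struct standing in for the real layout class:

#include <array>
#include <cstdio>

// Hypothetical stand-in for the weights layout: spatial(0) = X, spatial(1) = Y.
struct Layout {
    std::array<int, 2> spatial_dims;  // {X, Y}
    int spatial(int i) const { return spatial_dims[i]; }
};

// Mirrors the gating added above: only 1x1 weights may keep the
// shape-agnostic (planar) implementation as the initial impl.
bool may_change_initial_impl(const Layout& weights) {
    return weights.spatial(0) == 1 && weights.spatial(1) == 1;
}

int main() {
    Layout w1x1{{1, 1}};  // e.g. OIYX weights [256, 128, 1, 1]
    Layout w3x3{{3, 3}};  // e.g. OIYX weights [256, 128, 3, 3]
    std::printf("%d %d\n", may_change_initial_impl(w1x1), may_change_initial_impl(w3x3));  // prints "1 0"
}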
@@ -196,26 +196,30 @@ void prepare_primitive_fusing::fuse_bias(program &p) {


if (node->get_output_layout().is_dynamic()) {
    if (eltw_node.get_dependency(non_const_dep_idx).is_type<fully_connected>()) {
        auto broadcast_type = eltw_node.get_primitive()->broadcast_spec.m_type;
        if (broadcast_type != ov::op::AutoBroadcastType::NUMPY && broadcast_type != ov::op::AutoBroadcastType::NONE)
            continue;

        // The NumPy broadcast rule requires any dimension whose size is not one to match the corresponding dimension of the other operand.
        // So under this rule the feature size is guaranteed to match, and the constant operand can be treated as a bias.
        auto const_shape = eltw_node.get_dependency(const_dep_idx).get_output_layout().get_shape();
        int32_t count_elements_not_one = 0;
        int32_t idx_element_not_one = -1;
        for (size_t i = 0; i < const_shape.size(); ++i) {
            if (const_shape[i] != 1) {
                count_elements_not_one++;
                idx_element_not_one = static_cast<int32_t>(i);
            }
            if (count_elements_not_one > 1)
                break;
        }

        if (count_elements_not_one != 1 ||
            (idx_element_not_one != (static_cast<int32_t>(const_shape.size()) - 1))) {
            continue;
        }
    } else if (!eltw_node.get_dependency(non_const_dep_idx).is_type<convolution>()) {
        continue;
    }
} else {
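As a side note on the restructured check above: under NUMPY/NONE broadcasting the constant operand qualifies as a bias only when exactly one of its dimensions is larger than 1 and that dimension is the last one. A small self-contained sketch of the same predicate (looks_like_bias is a hypothetical helper, not the OpenVINO code itself):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical helper mirroring the fuse_bias condition: a constant operand is
// bias-like when exactly one dim is != 1 and it is the last (feature) dim.
bool looks_like_bias(const std::vector<std::size_t>& const_shape) {
    int32_t count_elements_not_one = 0;
    int32_t idx_element_not_one = -1;
    for (std::size_t i = 0; i < const_shape.size(); ++i) {
        if (const_shape[i] != 1) {
            count_elements_not_one++;
            idx_element_not_one = static_cast<int32_t>(i);
        }
        if (count_elements_not_one > 1)
            break;
    }
    return count_elements_not_one == 1 &&
           idx_element_not_one == static_cast<int32_t>(const_shape.size()) - 1;
}

int main() {
    std::cout << looks_like_bias({1, 1, 256}) << "\n";  // 1: broadcast over the last dim -> bias
    std::cout << looks_like_bias({1, 256, 1}) << "\n";  // 0: non-unit dim is not the last one
    std::cout << looks_like_bias({2, 1, 256}) << "\n";  // 0: more than one non-unit dim
}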
3 changes: 2 additions & 1 deletion src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp
@@ -356,7 +356,8 @@ attach_convolution_impl::attach_convolution_impl() {
};
auto dyn_formats = {
format::bfyx,
format::bfzyx,
format::b_fs_yx_fsv16
};

implementation_map<convolution>::add(impl_types::ocl,
@@ -26,6 +26,7 @@
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
KERNEL(convolution_b_fs_yx_fsv16_1x1)(
OPTIONAL_SHAPE_INFO_ARG
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
__global FILTER_TYPE* weights
3 changes: 2 additions & 1 deletion src/plugins/intel_gpu/src/kernel_selector/jitter.cpp
@@ -363,7 +363,8 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const {
if (_tensor.is_dynamic()) {
if (_tensor.GetLayout() == DataLayout::bf || _tensor.GetLayout() == DataLayout::bfyx ||
_tensor.GetLayout() == DataLayout::bfzyx || _tensor.GetLayout() == DataLayout::bfwzyx ||
_tensor.GetLayout() == DataLayout::bfuwzyx || _tensor.GetLayout() == DataLayout::bfvuwzyx ||
_tensor.GetLayout() == DataLayout::b_fs_yx_fsv16) {
definitions.push_back({_name + "_X_PITCH", "1"});
definitions.push_back({_name + "_Y_PITCH", dims_padded.x()});
definitions.push_back({_name + "_Z_PITCH", toVectorMulString({dims_padded.x(), dims_padded.y()})});
@@ -22,22 +22,28 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionKernel_b_fs_yx_fsv16_1x1() : Con

ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16_1x1::GetAutoTuneOptions(const Params& params,
int /*autoTuneIndex*/) const {
if (!params.is_shape_agnostic) {
    const convolution_params& cp = static_cast<const convolution_params&>(params);

    auto x = cp.outputs[0].X().v;
    auto y = cp.outputs[0].Y().v;
    auto f = cp.outputs[0].Feature().v;

    if (x == 1 && y == 1) {
        return { 1, EXE_MODE_DEFAULT };
    } else if (x * f <= 256) {
        if (x < 8 || x * f <= 128)
            return { 2, EXE_MODE_DEFAULT };
        else
            return { 4, EXE_MODE_DEFAULT };
    } else if (x * f <= 1536) {
        return { 4, EXE_MODE_DEFAULT };
    } else {
        return { 8, EXE_MODE_DEFAULT };
    }
} else {
    // In shape agnostic kernel, the output shape cannot be specified at build time,
    // so we set blockWidth to 8, which is the most commonly used.
    return { 8, EXE_MODE_DEFAULT };
}
}
@@ -60,17 +66,20 @@ float ConvolutionKernel_b_fs_yx_fsv16_1x1::EstimateOccupancy(const convolution_p
ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionTuningData ConvolutionKernel_b_fs_yx_fsv16_1x1::GetTuningParams(const convolution_params& params) const {
ConvolutionTuningData tuning_data;

if (!params.is_shape_agnostic) {
    const auto& input = params.inputs[0];
    bool block_size_one_is_better = params.outputs[0].X().v == 1 && params.outputs[0].Y().v == 1 && input.Feature().v >= 2048;

    // Accuracy issue is found with input.Feature() > 16 in static kernel, Need to fix later.
    if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.supports_imad && !block_size_one_is_better) {
        size_t ic_blocks = CeilDiv(input.Feature().v, tuning_data.feature_block_size);
        size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size;

        while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) &&
               EstimateOccupancy(params, tuning_data) < 4.0)
            tuning_data.slm_div_factor *= 2;
    }
}

tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
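Roughly, the slm_div_factor search above keeps doubling the factor while the IC block count stays evenly divisible, the work-group limit allows it, and the estimated occupancy is still below 4. A toy sketch with made-up numbers and a stubbed occupancy estimate (estimate_occupancy is a fake stand-in for EstimateOccupancy):

#include <cstddef>
#include <iostream>

// Toy model of the slm_div_factor search in GetTuningParams.
int main() {
    std::size_t ic_blocks = 8;            // e.g. CeilDiv(128 input features, 16)
    std::size_t max_slm_div_factor = 16;  // e.g. maxWorkGroupSize 256 / sub_group_size 16
    std::size_t slm_div_factor = 1;

    auto estimate_occupancy = [](std::size_t factor) { return 1.0 * factor; };  // fake: grows with the factor

    while (ic_blocks % (slm_div_factor * 2) == 0 && (slm_div_factor * 2 <= max_slm_div_factor) &&
           estimate_occupancy(slm_div_factor) < 4.0)
        slm_div_factor *= 2;

    std::cout << slm_div_factor << "\n";  // 4: stops once the fake occupancy reaches 4.0
}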

@@ -92,6 +101,7 @@ ParamsKey ConvolutionKernel_b_fs_yx_fsv16_1x1::GetSupportedKey() const {
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableDynamicShapesSupport();
return k;
}

@@ -126,28 +136,35 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa
dispatchData.lws[1] = tuning_data.work_group_size;
dispatchData.lws[2] = 1;

GPU_DEBUG_INFO << "gws: " << dispatchData.gws[0] << ", " << dispatchData.gws[1] << ", " << dispatchData.gws[2] << std::endl;
GPU_DEBUG_INFO << "lws: " << dispatchData.lws[0] << ", " << dispatchData.lws[1] << ", " << dispatchData.lws[2] << std::endl;

return dispatchData;
}

KernelsPriority ConvolutionKernel_b_fs_yx_fsv16_1x1::GetKernelsPriority(const Params& params) const {
if (!params.is_shape_agnostic) {
    const auto& p = static_cast<const convolution_params&>(params);
    auto autoTune = GetAutoTuneOptions(params, -1);

    const auto& input = p.inputs[0];
    const auto& out = p.outputs[0];

    auto bBlockSizeX = out.X().v % autoTune.blockWidth == 0;
    auto bBlockSizeXY = out.X().pad.Total() + out.Y().pad.Total() == 0;
    auto bInputPad = input.X().pad.Total() + input.Y().pad.Total() != 0;

    if (out.Batch().v == 1) {
        if ((bBlockSizeX || bBlockSizeXY) && !bInputPad) {
            return FORCE_PRIORITY_1;
        } else {
            return FORCE_PRIORITY_3;
        }
    } else {
        return FORCE_PRIORITY_7;
    }
} else {
    return FORCE_PRIORITY_1;
}
}

@@ -163,12 +180,21 @@ bool ConvolutionKernel_b_fs_yx_fsv16_1x1::Validate(const Params& p) const {
const auto& input = params.inputs[0];
const auto& output = params.outputs[0];

GPU_DEBUG_INFO << "input: " << input.Batch().v << ", " << input.Feature().v << ", " << input.Y().v << ", " << input.X().v << std::endl;
GPU_DEBUG_INFO << "output: " << output.Batch().v << ", " << output.Feature().v << ", " << output.Y().v << ", " << output.X().v << std::endl;

const bool bOutputSizes = (!params.is_shape_agnostic && (output.X().v != input.X().v || output.Y().v != input.Y().v)) ||
output.Feature().v % 16 != 0;
const bool bFilterSize = params.filterSize.x != 1 || params.filterSize.y != 1;
const bool bStride = params.stride.x != 1 || params.stride.y != 1;
const bool bPadding = input.Feature().pad.before % tuning_data.feature_block_size != 0 ||
output.Feature().pad.before % tuning_data.feature_block_size != 0;

GPU_DEBUG_INFO << bOutputSizes << ", " << bFilterSize << ", " << bStride << ", " << bPadding << std::endl;
if (bOutputSizes) {
GPU_DEBUG_INFO << params.is_shape_agnostic << " && " << output.X().v << " != " << input.X().v << ", "
<< output.Y().v << " != " << input.Y().v << " || " << output.Feature().v << "% 16 != 0" << std::endl;
}
if (bOutputSizes || bFilterSize || bStride || bPadding) {
return false;
}
@@ -215,40 +241,80 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
jit.Merge(MakeFusedOpsJitConstants(params, { conf_vec, conf_scalar1, conf_scalar2 }));
}

jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0));
GPU_DEBUG_INFO << params.layerID << " : params.fused_ops.empty(): " << params.fused_ops.empty() << std::endl;

bool padded_output = params.outputs[0].X().pad.Total() != 0;
bool non_unit_fused_op_spatial = false;
jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth));
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));

// Set padded_output to true when fused inputs have paddings to have correct blocked loads
for (auto& fused_op : params.fused_ops) {
for (auto& t : fused_op.tensors) {
if (t.PitchesDifferFromLogicalDims()) {
padded_output = true;
if (!params.has_dynamic_inputs()) {
jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0));

bool padded_output = params.outputs[0].X().pad.Total() != 0;
bool non_unit_fused_op_spatial = false;

// Set padded_output to true when fused inputs have paddings to have correct blocked loads
for (auto& fused_op : params.fused_ops) {
for (auto& t : fused_op.tensors) {
if (t.PitchesDifferFromLogicalDims()) {
padded_output = true;
}
if ((t.X().v > 1) ||
(t.Y().v > 1) ||
(t.Z().v > 1) ||
(t.W().v > 1)) {
non_unit_fused_op_spatial = true;
}
}
if ((t.X().v > 1) ||
(t.Y().v > 1) ||
(t.Z().v > 1) ||
(t.W().v > 1)) {
non_unit_fused_op_spatial = true;
}

jit.AddConstant(MakeJitConstant("PADDED_OUTPUT", padded_output));
jit.AddConstant(MakeJitConstant("NON_UNIT_FUSED_OP_SPATIAL", non_unit_fused_op_spatial));

jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, tuning_data.feature_block_size)));
if (params.outputs[0].Feature().v % tuning_data.feature_block_size != 0) {
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
}
if (params.inputs[0].Feature().v % tuning_data.feature_block_size != 0) {
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
}
} else {
DimensionAccessHelperJit input0_dims(params.inputs[0]);
DimensionAccessHelperJit input0_padded_dims(params.inputs[0], true);
DimensionAccessHelperJit output_dims(params.outputs[0]);
DimensionAccessHelperJit output_padded_dims(params.outputs[0], true);

const auto padded_input = "(" + input0_padded_dims.x_pad().first + "+" + input0_padded_dims.x_pad().first + ") != 0";
jit.AddConstant(MakeJitConstant("PADDED_INPUT", padded_input));

const auto padded_output = "(" + output_padded_dims.x_pad().first + "+" + output_padded_dims.x_pad().first + ") != 0";
jit.AddConstant(MakeJitConstant("PADDED_OUTPUT", padded_output));

// In shape agnostic kernel, the fused shape cannot be specified at build time or run time.
// Currently simply check whether fused_op is dynmaic. Need to further follow up like static behavior.
bool non_unit_fused_op_spatial = false;
for (auto& fused_op : params.fused_ops) {
for (auto& t : fused_op.tensors) {
if (t.is_dynamic()) {
non_unit_fused_op_spatial = true;
break;
}
}
}
}
jit.AddConstant(MakeJitConstant("NON_UNIT_FUSED_OP_SPATIAL", non_unit_fused_op_spatial));

jit.AddConstant(MakeJitConstant("PADDED_OUTPUT", padded_output));
jit.AddConstant(MakeJitConstant("NON_UNIT_FUSED_OP_SPATIAL", non_unit_fused_op_spatial));
const auto feature_block_size = std::to_string(tuning_data.feature_block_size);
const auto ic_blocks = "(" + input0_dims.f() + "+" + feature_block_size + " - 1) / " + feature_block_size;
jit.AddConstant(MakeJitConstant("IC_BLOCKS", ic_blocks));

jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth));
jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(params.outputs[0].X().v, blockWidth)));
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, tuning_data.feature_block_size)));
if (params.outputs[0].Feature().v % tuning_data.feature_block_size != 0) {
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
}
if (params.inputs[0].Feature().v % tuning_data.feature_block_size != 0) {
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
const auto output_leftover_num = "(" + output_dims.f() + "%" + feature_block_size + ")";
const auto output_leftover = "(" + output_leftover_num + "!= 0)";
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", output_leftover));

const auto input_leftover_num = "(" + input0_dims.f() + "%" + feature_block_size + ")";
const auto input_leftover = "(" + input_leftover_num + "!= 0)";
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", input_leftover));
}

return jit;
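
To make the static/shape-agnostic split above concrete: the static branch bakes IC_BLOCKS and the *_LEFTOVERS flags into the kernel as literals, while the shape-agnostic branch emits the same constants as runtime expressions over the dynamic dimensions. A rough sketch of the difference; INPUT_FEATURE below is only a placeholder for the string DimensionAccessHelperJit actually produces:

#include <cstddef>
#include <iostream>
#include <string>

// Illustrative sketch only: mirrors how the two branches build IC_BLOCKS and the
// *_LEFTOVERS jit constants. "INPUT_FEATURE" is a placeholder name, not a real macro.
int main() {
    const std::size_t feature_block_size = 16;

    // Static branch: shapes are known at build time, so the values are baked in as literals.
    std::size_t input_features = 32, output_features = 48;
    std::cout << "IC_BLOCKS = " << (input_features + feature_block_size - 1) / feature_block_size << "\n";  // 2
    std::cout << "OUTPUT_LEFTOVERS defined: " << (output_features % feature_block_size != 0) << "\n";       // 1
    std::cout << "INPUT_LEFTOVERS defined: " << (input_features % feature_block_size != 0) << "\n";         // 0

    // Shape-agnostic branch: the same constants become expressions that the compiled
    // kernel evaluates against the runtime shape.
    const std::string fbs = std::to_string(feature_block_size);
    const std::string ic_blocks = "(INPUT_FEATURE+" + fbs + " - 1) / " + fbs;
    const std::string input_leftovers = "((INPUT_FEATURE%" + fbs + ")!= 0)";
    std::cout << "IC_BLOCKS = " << ic_blocks << "\n";
    std::cout << "INPUT_LEFTOVERS = " << input_leftovers << "\n";
}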