@@ -45,7 +45,6 @@ void compile_graph::run(program& p) {
GPU_DEBUG_IF(debug_config->disable_onednn_permute_fusion == 1)
disable_permute_fuse_onednn_gemm = true;


for (size_t idx = 0; idx < proc_order.size(); idx++) {
auto& node = *(std::next(proc_order.begin(), idx));
const bool use_shape_agnostic_impl = !p.get_config().get_property(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape);
@@ -70,6 +69,15 @@ void compile_graph::run(program& p) {
change_initial_impl = false;
}
}
if (node->is_type<convolution>()) {
auto w_layout = node->as<convolution>().weights().get_output_layout();
if (w_layout.spatial(0) != 1 || w_layout.spatial(1) != 1) {
change_initial_impl = false;
} else {
// will be removed..
GPU_DEBUG_INFO << node->id() << ": " << w_layout.to_short_string() << std::endl;
}
}
}

if (change_initial_impl)
@@ -100,7 +108,7 @@ void compile_graph::run(program& p) {

bool is_planar = format::is_default_format(node->get_output_layout().format);

if (!node->is_type<convolution>() && node->is_dynamic() && !is_planar)
can_select_impl = false;

if (node->is_type<condition>() || node->is_type<loop>() || node->is_type<proposal>())
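For context (a side note, not part of the change): in the GPU plugin's layouts, spatial(0) and spatial(1) of the weights layout are the kernel's X and Y extents, so the added convolution check clears change_initial_impl for any convolution whose weights are not 1x1. A minimal stand-alone sketch of the same predicate, with a hypothetical Layout struct standing in for the real layout class:

#include <array>
#include <cstdio>

// Hypothetical stand-in for the weights layout: spatial(0) = X, spatial(1) = Y.
struct Layout {
    std::array<int, 2> spatial_dims;  // {X, Y}
    int spatial(int i) const { return spatial_dims[i]; }
};

// Mirrors the gating added above: only 1x1 weights may keep the
// shape-agnostic (planar) implementation as the initial impl.
bool may_change_initial_impl(const Layout& weights) {
    return weights.spatial(0) == 1 && weights.spatial(1) == 1;
}

int main() {
    Layout w1x1{{1, 1}};  // e.g. OIYX weights [256, 128, 1, 1]
    Layout w3x3{{3, 3}};  // e.g. OIYX weights [256, 128, 3, 3]
    std::printf("%d %d\n", may_change_initial_impl(w1x1), may_change_initial_impl(w3x3));  // prints "1 0"
}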
@@ -196,26 +196,30 @@ void prepare_primitive_fusing::fuse_bias(program &p) {


if (node->get_output_layout().is_dynamic()) {
    if (eltw_node.get_dependency(non_const_dep_idx).is_type<fully_connected>()) {
        auto broadcast_type = eltw_node.get_primitive()->broadcast_spec.m_type;
        if (broadcast_type != ov::op::AutoBroadcastType::NUMPY && broadcast_type != ov::op::AutoBroadcastType::NONE)
            continue;

        // The NumPy broadcast rule requires any dimension whose size is not one to match the corresponding dimension of the other operand.
        // So under this rule the feature size is guaranteed to match, and the constant operand can be treated as a bias.
        auto const_shape = eltw_node.get_dependency(const_dep_idx).get_output_layout().get_shape();
        int32_t count_elements_not_one = 0;
        int32_t idx_element_not_one = -1;
        for (size_t i = 0; i < const_shape.size(); ++i) {
            if (const_shape[i] != 1) {
                count_elements_not_one++;
                idx_element_not_one = static_cast<int32_t>(i);
            }
            if (count_elements_not_one > 1)
                break;
        }

        if (count_elements_not_one != 1 ||
            (idx_element_not_one != (static_cast<int32_t>(const_shape.size()) - 1))) {
            continue;
        }
    } else if (!eltw_node.get_dependency(non_const_dep_idx).is_type<convolution>()) {
        continue;
    }
} else {
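As a side note on the restructured check above: under NUMPY/NONE broadcasting the constant operand qualifies as a bias only when exactly one of its dimensions is larger than 1 and that dimension is the last one. A small self-contained sketch of the same predicate (looks_like_bias is a hypothetical helper, not the OpenVINO code itself):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical helper mirroring the fuse_bias condition: a constant operand is
// bias-like when exactly one dim is != 1 and it is the last (feature) dim.
bool looks_like_bias(const std::vector<std::size_t>& const_shape) {
    int32_t count_elements_not_one = 0;
    int32_t idx_element_not_one = -1;
    for (std::size_t i = 0; i < const_shape.size(); ++i) {
        if (const_shape[i] != 1) {
            count_elements_not_one++;
            idx_element_not_one = static_cast<int32_t>(i);
        }
        if (count_elements_not_one > 1)
            break;
    }
    return count_elements_not_one == 1 &&
           idx_element_not_one == static_cast<int32_t>(const_shape.size()) - 1;
}

int main() {
    std::cout << looks_like_bias({1, 1, 256}) << "\n";  // 1: broadcast over the last dim -> bias
    std::cout << looks_like_bias({1, 256, 1}) << "\n";  // 0: non-unit dim is not the last one
    std::cout << looks_like_bias({2, 1, 256}) << "\n";  // 0: more than one non-unit dim
}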
3 changes: 2 additions & 1 deletion src/plugins/intel_gpu/src/graph/impls/ocl/convolution.cpp
@@ -356,7 +356,8 @@ attach_convolution_impl::attach_convolution_impl() {
};
auto dyn_formats = {
format::bfyx,
format::bfzyx,
format::b_fs_yx_fsv16
};

implementation_map<convolution>::add(impl_types::ocl,
@@ -26,6 +26,7 @@
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
__attribute__((reqd_work_group_size(1, SUB_GROUP_SIZE * SLM_DIV_FACTOR, 1)))
KERNEL(convolution_b_fs_yx_fsv16_1x1)(
OPTIONAL_SHAPE_INFO_ARG
__global INPUT0_TYPE* input,
__global OUTPUT_TYPE* output,
__global FILTER_TYPE* weights
3 changes: 2 additions & 1 deletion src/plugins/intel_gpu/src/kernel_selector/jitter.cpp
@@ -363,7 +363,8 @@ JitDefinitions DataTensorJitConstant::GetDefinitions() const {
if (_tensor.is_dynamic()) {
if (_tensor.GetLayout() == DataLayout::bf || _tensor.GetLayout() == DataLayout::bfyx ||
_tensor.GetLayout() == DataLayout::bfzyx || _tensor.GetLayout() == DataLayout::bfwzyx ||
_tensor.GetLayout() == DataLayout::bfuwzyx || _tensor.GetLayout() == DataLayout::bfvuwzyx ||
_tensor.GetLayout() == DataLayout::b_fs_yx_fsv16) {
definitions.push_back({_name + "_X_PITCH", "1"});
definitions.push_back({_name + "_Y_PITCH", dims_padded.x()});
definitions.push_back({_name + "_Z_PITCH", toVectorMulString({dims_padded.x(), dims_padded.y()})});
@@ -22,22 +22,28 @@ ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionKernel_b_fs_yx_fsv16_1x1() : Con

ConvolutionKernel_b_fs_yx_fsv16_1x1::AutoTuneOption ConvolutionKernel_b_fs_yx_fsv16_1x1::GetAutoTuneOptions(const Params& params,
int /*autoTuneIndex*/) const {
if (!params.is_shape_agnostic) {
    const convolution_params& cp = static_cast<const convolution_params&>(params);

    auto x = cp.outputs[0].X().v;
    auto y = cp.outputs[0].Y().v;
    auto f = cp.outputs[0].Feature().v;

    if (x == 1 && y == 1) {
        return { 1, EXE_MODE_DEFAULT };
    } else if (x * f <= 256) {
        if (x < 8 || x * f <= 128)
            return { 2, EXE_MODE_DEFAULT };
        else
            return { 4, EXE_MODE_DEFAULT };
    } else if (x * f <= 1536) {
        return { 4, EXE_MODE_DEFAULT };
    } else {
        return { 8, EXE_MODE_DEFAULT };
    }
} else {
    // In shape agnostic kernel, the output shape cannot be specified at build time,
    // so we set blockWidth to 8, which is the most commonly used.
    return { 8, EXE_MODE_DEFAULT };
}
}
@@ -60,17 +66,20 @@ float ConvolutionKernel_b_fs_yx_fsv16_1x1::EstimateOccupancy(const convolution_p
ConvolutionKernel_b_fs_yx_fsv16_1x1::ConvolutionTuningData ConvolutionKernel_b_fs_yx_fsv16_1x1::GetTuningParams(const convolution_params& params) const {
ConvolutionTuningData tuning_data;

if (!params.is_shape_agnostic) {
    const auto& input = params.inputs[0];
    bool block_size_one_is_better = params.outputs[0].X().v == 1 && params.outputs[0].Y().v == 1 && input.Feature().v >= 2048;

    // Accuracy issue is found with input.Feature() > 16 in static kernel, Need to fix later.
    if (params.engineInfo.deviceType == dev_type::integrated_gpu && params.engineInfo.supports_imad && !block_size_one_is_better) {
        size_t ic_blocks = CeilDiv(input.Feature().v, tuning_data.feature_block_size);
        size_t max_slm_div_factor = params.engineInfo.maxWorkGroupSize / tuning_data.sub_group_size;

        while (ic_blocks % (tuning_data.slm_div_factor * 2) == 0 && (tuning_data.slm_div_factor * 2 <= max_slm_div_factor) &&
               EstimateOccupancy(params, tuning_data) < 4.0)
            tuning_data.slm_div_factor *= 2;
    }
}

tuning_data.work_group_size = tuning_data.slm_div_factor * tuning_data.sub_group_size;
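Roughly, the slm_div_factor search above keeps doubling the factor while the IC block count stays evenly divisible, the work-group limit allows it, and the estimated occupancy is still below 4. A toy sketch with made-up numbers and a stubbed occupancy estimate (estimate_occupancy is a fake stand-in for EstimateOccupancy):

#include <cstddef>
#include <iostream>

// Toy model of the slm_div_factor search in GetTuningParams.
int main() {
    std::size_t ic_blocks = 8;            // e.g. CeilDiv(128 input features, 16)
    std::size_t max_slm_div_factor = 16;  // e.g. maxWorkGroupSize 256 / sub_group_size 16
    std::size_t slm_div_factor = 1;

    auto estimate_occupancy = [](std::size_t factor) { return 1.0 * factor; };  // fake: grows with the factor

    while (ic_blocks % (slm_div_factor * 2) == 0 && (slm_div_factor * 2 <= max_slm_div_factor) &&
           estimate_occupancy(slm_div_factor) < 4.0)
        slm_div_factor *= 2;

    std::cout << slm_div_factor << "\n";  // 4: stops once the fake occupancy reaches 4.0
}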

@@ -92,6 +101,7 @@ ParamsKey ConvolutionKernel_b_fs_yx_fsv16_1x1::GetSupportedKey() const {
k.EnableBiasPerFeature();
k.EnableNonBiasTerm();
k.EnableBatching();
k.EnableDynamicShapesSupport();
return k;
}

@@ -126,28 +136,35 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa
dispatchData.lws[1] = tuning_data.work_group_size;
dispatchData.lws[2] = 1;

GPU_DEBUG_INFO << "gws: " << dispatchData.gws[0] << ", " << dispatchData.gws[1] << ", " << dispatchData.gws[2] << std::endl;
GPU_DEBUG_INFO << "lws: " << dispatchData.lws[0] << ", " << dispatchData.lws[1] << ", " << dispatchData.lws[2] << std::endl;

return dispatchData;
}

KernelsPriority ConvolutionKernel_b_fs_yx_fsv16_1x1::GetKernelsPriority(const Params& params) const {
if (!params.is_shape_agnostic) {
    const auto& p = static_cast<const convolution_params&>(params);
    auto autoTune = GetAutoTuneOptions(params, -1);

    const auto& input = p.inputs[0];
    const auto& out = p.outputs[0];

    auto bBlockSizeX = out.X().v % autoTune.blockWidth == 0;
    auto bBlockSizeXY = out.X().pad.Total() + out.Y().pad.Total() == 0;
    auto bInputPad = input.X().pad.Total() + input.Y().pad.Total() != 0;

    if (out.Batch().v == 1) {
        if ((bBlockSizeX || bBlockSizeXY) && !bInputPad) {
            return FORCE_PRIORITY_1;
        } else {
            return FORCE_PRIORITY_3;
        }
    } else {
        return FORCE_PRIORITY_7;
    }
} else {
    return FORCE_PRIORITY_1;
}
}

@@ -163,12 +180,21 @@ bool ConvolutionKernel_b_fs_yx_fsv16_1x1::Validate(const Params& p) const {
const auto& input = params.inputs[0];
const auto& output = params.outputs[0];

GPU_DEBUG_INFO << "input: " << input.Batch().v << ", " << input.Feature().v << ", " << input.Y().v << ", " << input.X().v << std::endl;
GPU_DEBUG_INFO << "output: " << output.Batch().v << ", " << output.Feature().v << ", " << output.Y().v << ", " << output.X().v << std::endl;

const bool bOutputSizes = (!params.is_shape_agnostic && (output.X().v != input.X().v || output.Y().v != input.Y().v)) ||
output.Feature().v % 16 != 0;
const bool bFilterSize = params.filterSize.x != 1 || params.filterSize.y != 1;
const bool bStride = params.stride.x != 1 || params.stride.y != 1;
const bool bPadding = input.Feature().pad.before % tuning_data.feature_block_size != 0 ||
output.Feature().pad.before % tuning_data.feature_block_size != 0;

GPU_DEBUG_INFO << bOutputSizes << ", " << bFilterSize << ", " << bStride << ", " << bPadding << std::endl;
if (bOutputSizes) {
GPU_DEBUG_INFO << params.is_shape_agnostic << " && " << output.X().v << " != " << input.X().v << ", "
<< output.Y().v << " != " << input.Y().v << " || " << output.Feature().v << "% 16 != 0" << std::endl;
}
if (bOutputSizes || bFilterSize || bStride || bPadding) {
return false;
}
@@ -215,40 +241,80 @@ JitConstants ConvolutionKernel_b_fs_yx_fsv16_1x1::GetJitConstants(const convolut
jit.Merge(MakeFusedOpsJitConstants(params, { conf_vec, conf_scalar1, conf_scalar2 }));
}

jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0));
GPU_DEBUG_INFO << params.layerID << " : params.fused_ops.empty(): " << params.fused_ops.empty() << std::endl;

bool padded_output = params.outputs[0].X().pad.Total() != 0;
bool non_unit_fused_op_spatial = false;
jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth));
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", tuning_data.sub_group_size));
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));

// Set padded_output to true when fused inputs have paddings to have correct blocked loads
for (auto& fused_op : params.fused_ops) {
for (auto& t : fused_op.tensors) {
if (t.PitchesDifferFromLogicalDims()) {
padded_output = true;
if (!params.has_dynamic_inputs()) {
jit.AddConstant(MakeJitConstant("PADDED_INPUT", params.inputs[0].X().pad.Total() != 0));

bool padded_output = params.outputs[0].X().pad.Total() != 0;
bool non_unit_fused_op_spatial = false;

// Set padded_output to true when fused inputs have paddings to have correct blocked loads
for (auto& fused_op : params.fused_ops) {
for (auto& t : fused_op.tensors) {
if (t.PitchesDifferFromLogicalDims()) {
padded_output = true;
}
if ((t.X().v > 1) ||
(t.Y().v > 1) ||
(t.Z().v > 1) ||
(t.W().v > 1)) {
non_unit_fused_op_spatial = true;
}
}
if ((t.X().v > 1) ||
(t.Y().v > 1) ||
(t.Z().v > 1) ||
(t.W().v > 1)) {
non_unit_fused_op_spatial = true;
}

jit.AddConstant(MakeJitConstant("PADDED_OUTPUT", padded_output));
jit.AddConstant(MakeJitConstant("NON_UNIT_FUSED_OP_SPATIAL", non_unit_fused_op_spatial));

jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, tuning_data.feature_block_size)));
if (params.outputs[0].Feature().v % tuning_data.feature_block_size != 0) {
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
}
if (params.inputs[0].Feature().v % tuning_data.feature_block_size != 0) {
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
}
} else {
DimensionAccessHelperJit input0_dims(params.inputs[0]);
DimensionAccessHelperJit input0_padded_dims(params.inputs[0], true);
DimensionAccessHelperJit output_dims(params.outputs[0]);
DimensionAccessHelperJit output_padded_dims(params.outputs[0], true);

const auto padded_input = "(" + input0_padded_dims.x_pad().first + "+" + input0_padded_dims.x_pad().first + ") != 0";
jit.AddConstant(MakeJitConstant("PADDED_INPUT", padded_input));

const auto padded_output = "(" + output_padded_dims.x_pad().first + "+" + output_padded_dims.x_pad().first + ") != 0";
jit.AddConstant(MakeJitConstant("PADDED_OUTPUT", padded_output));

// In shape agnostic kernel, the fused shape cannot be specified at build time or run time.
// Currently simply check whether fused_op is dynmaic. Need to further follow up like static behavior.
bool non_unit_fused_op_spatial = false;
for (auto& fused_op : params.fused_ops) {
for (auto& t : fused_op.tensors) {
if (t.is_dynamic()) {
non_unit_fused_op_spatial = true;
break;
}
}
}
}
jit.AddConstant(MakeJitConstant("NON_UNIT_FUSED_OP_SPATIAL", non_unit_fused_op_spatial));

jit.AddConstant(MakeJitConstant("PADDED_OUTPUT", padded_output));
jit.AddConstant(MakeJitConstant("NON_UNIT_FUSED_OP_SPATIAL", non_unit_fused_op_spatial));
const auto feature_block_size = std::to_string(tuning_data.feature_block_size);
const auto ic_blocks = "(" + input0_dims.f() + "+" + feature_block_size + " - 1) / " + feature_block_size;
jit.AddConstant(MakeJitConstant("IC_BLOCKS", ic_blocks));

jit.AddConstant(MakeJitConstant("X_BLOCK_SIZE", blockWidth));
jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(params.outputs[0].X().v, blockWidth)));
jit.AddConstant(MakeJitConstant("SLM_DIV_FACTOR", tuning_data.slm_div_factor));
jit.AddConstant(MakeJitConstant("WORK_GROUP_SIZE", tuning_data.work_group_size));
jit.AddConstant(MakeJitConstant("IC_BLOCKS", CeilDiv(params.inputs[0].Feature().v, tuning_data.feature_block_size)));
if (params.outputs[0].Feature().v % tuning_data.feature_block_size != 0) {
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
}
if (params.inputs[0].Feature().v % tuning_data.feature_block_size != 0) {
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", 1));
const auto output_leftover_num = "(" + output_dims.f() + "%" + feature_block_size + ")";
const auto output_leftover = "(" + output_leftover_num + "!= 0)";
jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", output_leftover));

const auto input_leftover_num = "(" + input0_dims.f() + "%" + feature_block_size + ")";
const auto input_leftover = "(" + input_leftover_num + "!= 0)";
jit.AddConstant(MakeJitConstant("INPUT_LEFTOVERS", input_leftover));
}

return jit;
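
To make the static/shape-agnostic split above concrete: the static branch bakes IC_BLOCKS and the *_LEFTOVERS flags into the kernel as literals, while the shape-agnostic branch emits the same constants as runtime expressions over the dynamic dimensions. A rough sketch of the difference; INPUT_FEATURE below is only a placeholder for the string DimensionAccessHelperJit actually produces:

#include <cstddef>
#include <iostream>
#include <string>

// Illustrative sketch only: mirrors how the two branches build IC_BLOCKS and the
// *_LEFTOVERS jit constants. "INPUT_FEATURE" is a placeholder name, not a real macro.
int main() {
    const std::size_t feature_block_size = 16;

    // Static branch: shapes are known at build time, so the values are baked in as literals.
    std::size_t input_features = 32, output_features = 48;
    std::cout << "IC_BLOCKS = " << (input_features + feature_block_size - 1) / feature_block_size << "\n";  // 2
    std::cout << "OUTPUT_LEFTOVERS defined: " << (output_features % feature_block_size != 0) << "\n";       // 1
    std::cout << "INPUT_LEFTOVERS defined: " << (input_features % feature_block_size != 0) << "\n";         // 0

    // Shape-agnostic branch: the same constants become expressions that the compiled
    // kernel evaluates against the runtime shape.
    const std::string fbs = std::to_string(feature_block_size);
    const std::string ic_blocks = "(INPUT_FEATURE+" + fbs + " - 1) / " + fbs;
    const std::string input_leftovers = "((INPUT_FEATURE%" + fbs + ")!= 0)";
    std::cout << "IC_BLOCKS = " << ic_blocks << "\n";
    std::cout << "INPUT_LEFTOVERS = " << input_leftovers << "\n";
}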