Skip to content

Commit 2918322

Browse files
author
Vladimir Paramuzov
authored
[GPU] Micro sdpa (#24656)
### Details:
- Added SDPA impl based on microkernels using internal oneDNN API and related infra
- Current limitations:
  - fused transpose shouldn't change the order of the innermost dim (head size)
  - is_causal = true is not supported
  - fp16 only
  - num heads dimension must be static
  - no indirect KV support
- Initial version of KV Cache + SDPA func test
- Enabled Transpose+SDPA fusion for static shape too

### Tickets:
- CVS-141761
1 parent a3d2b6a commit 2918322

35 files changed

+2216
-79
lines changed

src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,10 @@ class Plugin : public ov::IPlugin {
2626

2727
std::map<std::string, std::shared_ptr<RemoteContextImpl>> get_default_contexts() const;
2828

29-
std::shared_ptr<ov::Model> clone_and_transform_model(const std::shared_ptr<const ov::Model>& network, const ExecutionConfig& config) const;
30-
void transform_model(std::shared_ptr<ov::Model>& model, const ExecutionConfig& config) const;
29+
std::shared_ptr<ov::Model> clone_and_transform_model(const std::shared_ptr<const ov::Model>& network,
30+
const ExecutionConfig& config,
31+
const std::shared_ptr<RemoteContextImpl>& context) const;
32+
void transform_model(std::shared_ptr<ov::Model>& model, const ExecutionConfig& config, const std::shared_ptr<RemoteContextImpl>& context) const;
3133
void register_primitives() const;
3234
std::string get_device_id_from_config(const ov::AnyMap& config) const;
3335
std::string get_device_id(const ov::AnyMap& config) const;

src/plugins/intel_gpu/include/intel_gpu/plugin/transformations_pipeline.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include <memory>
88

9+
#include "intel_gpu/plugin/remote_context.hpp"
910
#include "openvino/core/model.hpp"
1011

1112
#include "intel_gpu/runtime/execution_config.hpp"
@@ -16,12 +17,13 @@ namespace intel_gpu {
1617

1718
class TransformationsPipeline {
1819
public:
19-
explicit TransformationsPipeline(const ExecutionConfig &conf, const cldnn::device_info &device_info)
20-
: config(conf), device_info(device_info) {}
20+
explicit TransformationsPipeline(const ExecutionConfig &conf, const std::shared_ptr<RemoteContextImpl>& context)
21+
: config(conf), m_context(context), device_info(context->get_engine().get_device_info()) {}
2122
void apply(std::shared_ptr<ov::Model> func);
2223

2324
private:
2425
const ExecutionConfig& config;
26+
std::shared_ptr<RemoteContextImpl> m_context;
2527
cldnn::device_info device_info;
2628
};
2729

src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ enum class LogLevel : int8_t {
4848
#else
4949
#define SEPARATE '/'
5050
#endif
51-
#define __FILENAME__ (strrchr(__FILE__, SEPARATE) ? strrchr(__FILE__, SEPARATE) + 1 : __FILE__)
51+
#define GPU_FILENAME (strrchr(__FILE__, SEPARATE) ? strrchr(__FILE__, SEPARATE) + 1 : __FILE__)
5252
#define GPU_DEBUG_IF(cond) if (cond)
5353
#define GPU_DEBUG_CODE(...) __VA_ARGS__
5454
#define GPU_DEBUG_DEFINE_MEM_LOGGER(stage) \
@@ -62,9 +62,9 @@ enum class LogLevel : int8_t {
6262
#define GPU_DEBUG_LOG_RAW_INT(min_verbose_level) if (cldnn::debug_configuration::get_instance()->verbose >= min_verbose_level) \
6363
((cldnn::debug_configuration::get_instance()->verbose_color == 0) ? GPU_DEBUG_LOG_PREFIX : GPU_DEBUG_LOG_COLOR_PREFIX)
6464
#define GPU_DEBUG_LOG_RAW(min_verbose_level) GPU_DEBUG_LOG_RAW_INT(static_cast<std::underlying_type<ov::intel_gpu::LogLevel>::type>(min_verbose_level))
65-
#define GPU_DEBUG_LOG_PREFIX std::cout << cldnn::debug_configuration::prefix << __FILENAME__ << ":" <<__LINE__ << ":" << __func__ << ": "
65+
#define GPU_DEBUG_LOG_PREFIX std::cout << cldnn::debug_configuration::prefix << GPU_FILENAME << ":" <<__LINE__ << ":" << __func__ << ": "
6666
#define GPU_DEBUG_LOG_COLOR_PREFIX std::cout << DARK_GRAY << cldnn::debug_configuration::prefix << \
67-
BLUE << __FILENAME__ << ":" << PURPLE << __LINE__ << ":" << CYAN << __func__ << ": " << RESET
67+
BLUE << GPU_FILENAME << ":" << PURPLE << __LINE__ << ":" << CYAN << __func__ << ": " << RESET
6868
#define DARK_GRAY "\033[1;30m"
6969
#define BLUE "\033[1;34m"
7070
#define PURPLE "\033[1;35m"

src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
#include <string>
1111
#include <vector>
1212
#include <tuple>
13-
#include <array>
1413

1514
namespace cldnn {
1615
/// @addtogroup cpp_api C++ API
@@ -25,6 +24,17 @@ enum class device_type {
2524
discrete_gpu = 1
2625
};
2726

27+
enum class gpu_arch {
28+
unknown = 0,
29+
gen9 = 1,
30+
gen11 = 2,
31+
xe_lp = 3,
32+
xe_hp = 4,
33+
xe_hpg = 5,
34+
xe_hpc = 6,
35+
xe2 = 7,
36+
};
37+
2838
/// @brief Defines version of GFX IP
2939
struct gfx_version {
3040
uint16_t major;
@@ -77,6 +87,8 @@ struct device_info {
7787
device_type dev_type; ///< Defines type of current GPU device (integrated or discrete)
7888

7989
gfx_version gfx_ver; ///< Defines GFX IP version
90+
gpu_arch arch; ///< Defines arch human readable name
91+
uint32_t ip_version; ///< Defines raw GFX IP version
8092
uint32_t device_id; ///< ID of current GPU
8193
uint32_t num_slices; ///< Number of slices
8294
uint32_t num_sub_slices_per_slice; ///< Number of subslices in a slice

src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,9 @@ struct kernel_string {
121121
std::string options;
122122
std::string entry_point;
123123
bool batch_compilation;
124+
bool has_microkernels;
124125

125-
kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false) {}
126+
kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false), has_microkernels(false) {}
126127

127128
std::string get_str() const { return str + jit + undefs + options + entry_point; }
128129
size_t get_hash() const { return std::hash<std::string>()(get_str()); }

src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "intel_gpu/graph/program.hpp"
66

77
#include "kernel_selector_helper.h"
8+
#include "intel_gpu/runtime/device_info.hpp"
89
#include "kernel_selector_params.h"
910
#include "to_string_utils.h"
1011
#include "program_node.h"
@@ -32,7 +33,6 @@
3233
#include "intel_gpu/primitives/extract_image_patches.hpp"
3334

3435
#include "activation_inst.h"
35-
#include "depth_to_space_inst.h"
3636
#include "eltwise_inst.h"
3737
#include "quantize_inst.h"
3838
#include "reorder_inst.h"
@@ -44,9 +44,9 @@
4444
#include "kernel_selector/kernels/reorder/reorder_kernel_base.h"
4545

4646
#include "runtime/kernels_cache.hpp"
47-
#include "kernel_base.h"
4847

4948
#include <string>
49+
#include <type_traits>
5050
#include <vector>
5151

5252
namespace {
@@ -119,6 +119,48 @@ bool query_local_block_io_supported(engine& e, const ExecutionConfig& config) {
119119

120120
namespace cldnn {
121121

122+
bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config) {
123+
auto device = e.get_device().get();
124+
125+
static std::mutex m;
126+
std::lock_guard<std::mutex> lock(m);
127+
static std::map<cldnn::device*, bool> cache;
128+
if (cache.find(device) != cache.end()) {
129+
return cache.at(device);
130+
}
131+
132+
std::shared_ptr<kernel_selector::KernelString> kernel_string = std::make_shared<kernel_selector::KernelString>();
133+
// This program checks that all required vISA features are supported by the current IGC version
134+
const char* kernel_code = R""""(
135+
kernel void igc_check() {
136+
__asm__ volatile(
137+
".decl AA0 v_type=G type=ud num_elts=1\n"
138+
".decl AA1 v_type=G type=ud num_elts=1\n"
139+
".implicit_PSEUDO_INPUT AA0 offset=256 size=4\n"
140+
".implicit_PSEUDO_INPUT AA1 offset=256 size=4\n"
141+
"mov (M1_NM,1) AA0(0,0)<1> AA1(0,0)<0;1,0>\n"
142+
);
143+
}
144+
)"""";
145+
146+
kernel_string->str = kernel_code;
147+
kernel_string->options = "";
148+
kernel_string->entry_point = "igc_check";
149+
kernel_string->batch_compilation = true;
150+
151+
try {
152+
cldnn::kernel_impl_params dummy_params;
153+
auto _kernels_cache_device_query = std::unique_ptr<cldnn::kernels_cache>(new cldnn::kernels_cache(e, config, 0));
154+
_kernels_cache_device_query->add_kernels_source(dummy_params, {kernel_string}, false);
155+
_kernels_cache_device_query->build_all();
156+
cache[device] = true;
157+
} catch (std::exception&) {
158+
cache[device] = false;
159+
}
160+
161+
return cache.at(device);
162+
}
163+
122164
kernel_selector::data_type to_data_type(data_types dt) {
123165
switch (dt) {
124166
case cldnn::data_types::i4:
@@ -1081,6 +1123,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
10811123
params.engineInfo.bOptHintsSupport = false;
10821124

10831125
params.engineInfo.bLocalBlockIOSupport = query_local_block_io_supported(engine, config);
1126+
params.engineInfo.supports_microkernels = query_microkernels_supported(engine, config);
10841127
params.engineInfo.deviceType = get_device_type(device_info.dev_type);
10851128
params.engineInfo.maxWorkGroupSize = device_info.max_work_group_size;
10861129
params.engineInfo.maxLocalMemSize = device_info.max_local_mem_size;
@@ -1092,6 +1135,8 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
10921135
params.engineInfo.driverVersion = device_info.driver_version;
10931136
params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;
10941137
params.engineInfo.vendor_id = device_info.vendor_id;
1138+
params.engineInfo.ip_version = device_info.ip_version;
1139+
params.engineInfo.arch = kernel_selector::gpu_arch(static_cast<std::underlying_type<gpu_arch>::type>(device_info.arch));
10951140

10961141
auto impl_forcing = config.get_property(ov::intel_gpu::force_implementations);
10971142

src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,4 +294,6 @@ inline void update_shapes(kernel_selector::Params& p, const kernel_impl_params&
294294
}
295295
}
296296

297+
bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config);
298+
297299
} // namespace cldnn

src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive<scaled_dot_prod
5858
// number of buffers and their sizes (since update_dispatch_data is called for both kernels too), and
5959
// do not double memory allocations during reallocate_if_needed() function call
6060
std::vector<layout> layouts;
61-
if (_kernels_data.size() > 0) {
61+
if (_kernels_data.size() > 0 && !_kernels_data[0].internalBufferSizes.empty()) {
6262
auto dtype = from_data_type(_kernels_data[0].internalBufferDataType);
6363
const auto bpp = data_type_traits::size_of(dtype);
6464
for (auto size : _kernels_data[0].internalBufferSizes) {

src/plugins/intel_gpu/src/graph/program.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ void program::init_program() {
223223
if (_task_executor == nullptr)
224224
_task_executor = program::make_task_executor(_config);
225225
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, _config, prog_id, _task_executor,
226-
kernel_selector::KernelBase::get_db().get_batch_header_str()));
226+
kernel_selector::KernelBase::get_db().get_batch_headers()));
227227

228228
if (!_compilation_context)
229229
_compilation_context = program::make_compilation_context(_config);

src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ target_include_directories(${TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${INCLUDE_DIR
6767
target_compile_options(${TARGET_NAME} PRIVATE
6868
$<$<CONFIG:Release>:$<IF:$<CXX_COMPILER_ID:MSVC>,/Os,-Os>>)
6969

70+
if (ENABLE_ONEDNN_FOR_GPU)
71+
target_link_libraries(${TARGET_NAME} PRIVATE onednn_gpu_tgt)
72+
endif()
73+
7074
if(COMMAND add_cpplint_target)
7175
add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
7276
endif()

0 commit comments

Comments
 (0)