Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ class Plugin : public ov::IPlugin {

std::map<std::string, std::shared_ptr<RemoteContextImpl>> get_default_contexts() const;

std::shared_ptr<ov::Model> clone_and_transform_model(const std::shared_ptr<const ov::Model>& network, const ExecutionConfig& config) const;
void transform_model(std::shared_ptr<ov::Model>& model, const ExecutionConfig& config) const;
std::shared_ptr<ov::Model> clone_and_transform_model(const std::shared_ptr<const ov::Model>& network,
const ExecutionConfig& config,
const std::shared_ptr<RemoteContextImpl>& context) const;
void transform_model(std::shared_ptr<ov::Model>& model, const ExecutionConfig& config, const std::shared_ptr<RemoteContextImpl>& context) const;
void register_primitives() const;
std::string get_device_id_from_config(const ov::AnyMap& config) const;
std::string get_device_id(const ov::AnyMap& config) const;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <memory>

#include "intel_gpu/plugin/remote_context.hpp"
#include "openvino/core/model.hpp"

#include "intel_gpu/runtime/execution_config.hpp"
Expand All @@ -16,12 +17,13 @@ namespace intel_gpu {

// Applies the GPU plugin's ov::Model transformation passes configured by
// ExecutionConfig and the target device's capabilities.
// NOTE(review): the two constructors below look like the before/after
// versions of a diff merged into one file — the device_info-taking overload
// leaves m_context null and appears superseded by the context-taking one;
// confirm and remove the stale overload.
class TransformationsPipeline {
public:
    explicit TransformationsPipeline(const ExecutionConfig &conf, const cldnn::device_info &device_info)
        : config(conf), device_info(device_info) {}
    // Derives device_info from the remote context's engine, so callers only
    // need to supply the context.
    explicit TransformationsPipeline(const ExecutionConfig &conf, const std::shared_ptr<RemoteContextImpl>& context)
        : config(conf), m_context(context), device_info(context->get_engine().get_device_info()) {}
    void apply(std::shared_ptr<ov::Model> func);

private:
    // NOTE(review): held by reference — the ExecutionConfig must outlive this
    // pipeline; verify no caller passes a temporary.
    const ExecutionConfig& config;
    std::shared_ptr<RemoteContextImpl> m_context;
    cldnn::device_info device_info;
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ enum class LogLevel : int8_t {
#else
#define SEPARATE '/'
#endif
#define __FILENAME__ (strrchr(__FILE__, SEPARATE) ? strrchr(__FILE__, SEPARATE) + 1 : __FILE__)
#define GPU_FILENAME (strrchr(__FILE__, SEPARATE) ? strrchr(__FILE__, SEPARATE) + 1 : __FILE__)
#define GPU_DEBUG_IF(cond) if (cond)
#define GPU_DEBUG_CODE(...) __VA_ARGS__
#define GPU_DEBUG_DEFINE_MEM_LOGGER(stage) \
Expand All @@ -62,9 +62,9 @@ enum class LogLevel : int8_t {
#define GPU_DEBUG_LOG_RAW_INT(min_verbose_level) if (cldnn::debug_configuration::get_instance()->verbose >= min_verbose_level) \
((cldnn::debug_configuration::get_instance()->verbose_color == 0) ? GPU_DEBUG_LOG_PREFIX : GPU_DEBUG_LOG_COLOR_PREFIX)
#define GPU_DEBUG_LOG_RAW(min_verbose_level) GPU_DEBUG_LOG_RAW_INT(static_cast<std::underlying_type<ov::intel_gpu::LogLevel>::type>(min_verbose_level))
#define GPU_DEBUG_LOG_PREFIX std::cout << cldnn::debug_configuration::prefix << __FILENAME__ << ":" <<__LINE__ << ":" << __func__ << ": "
#define GPU_DEBUG_LOG_PREFIX std::cout << cldnn::debug_configuration::prefix << GPU_FILENAME << ":" <<__LINE__ << ":" << __func__ << ": "
#define GPU_DEBUG_LOG_COLOR_PREFIX std::cout << DARK_GRAY << cldnn::debug_configuration::prefix << \
BLUE << __FILENAME__ << ":" << PURPLE << __LINE__ << ":" << CYAN << __func__ << ": " << RESET
BLUE << GPU_FILENAME << ":" << PURPLE << __LINE__ << ":" << CYAN << __func__ << ": " << RESET
#define DARK_GRAY "\033[1;30m"
#define BLUE "\033[1;34m"
#define PURPLE "\033[1;35m"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
#include <string>
#include <vector>
#include <tuple>
#include <array>

namespace cldnn {
/// @addtogroup cpp_api C++ API
Expand All @@ -25,6 +24,17 @@ enum class device_type {
discrete_gpu = 1
};

/// @brief Human-readable Intel GPU micro-architecture generation.
/// Values mirror the kernel selector's gpu_arch enum (converted via
/// static_cast of the underlying type), so keep the numbering in sync.
enum class gpu_arch {
    unknown = 0,  ///< Architecture could not be determined
    gen9 = 1,     ///< Gen9 graphics (e.g. Skylake-era iGPU)
    gen11 = 2,    ///< Gen11 graphics (e.g. Ice Lake-era iGPU)
    xe_lp = 3,    ///< Xe-LP (low power)
    xe_hp = 4,    ///< Xe-HP
    xe_hpg = 5,   ///< Xe-HPG (discrete graphics, e.g. Arc Alchemist)
    xe_hpc = 6,   ///< Xe-HPC (data center)
    xe2 = 7,      ///< Xe2
};

/// @brief Defines version of GFX IP
struct gfx_version {
uint16_t major;
Expand Down Expand Up @@ -77,6 +87,8 @@ struct device_info {
device_type dev_type; ///< Defines type of current GPU device (integrated or discrete)

gfx_version gfx_ver; ///< Defines GFX IP version
gpu_arch arch; ///< Defines arch human readable name
uint32_t ip_version; ///< Defines raw GFX IP version
uint32_t device_id; ///< ID of current GPU
uint32_t num_slices; ///< Number of slices
uint32_t num_sub_slices_per_slice; ///< Number of subslices in a slice
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,9 @@ struct kernel_string {
std::string options;
std::string entry_point;
bool batch_compilation;
bool has_microkernels;

kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false) {}
kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false), has_microkernels(false) {}

std::string get_str() const { return str + jit + undefs + options + entry_point; }
size_t get_hash() const { return std::hash<std::string>()(get_str()); }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "intel_gpu/graph/program.hpp"

#include "kernel_selector_helper.h"
#include "intel_gpu/runtime/device_info.hpp"
#include "kernel_selector_params.h"
#include "to_string_utils.h"
#include "program_node.h"
Expand Down Expand Up @@ -32,7 +33,6 @@
#include "intel_gpu/primitives/extract_image_patches.hpp"

#include "activation_inst.h"
#include "depth_to_space_inst.h"
#include "eltwise_inst.h"
#include "quantize_inst.h"
#include "reorder_inst.h"
Expand All @@ -44,9 +44,9 @@
#include "kernel_selector/kernels/reorder/reorder_kernel_base.h"

#include "runtime/kernels_cache.hpp"
#include "kernel_base.h"

#include <string>
#include <type_traits>
#include <vector>

namespace {
Expand Down Expand Up @@ -119,6 +119,48 @@ bool query_local_block_io_supported(engine& e, const ExecutionConfig& config) {

namespace cldnn {

// Returns true when the compiler stack available for the given engine's
// device can build the vISA constructs required by microkernel-based
// implementations. The probe below is compiled at most once per device;
// the result is cached for subsequent calls.
bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config) {
    auto device = e.get_device().get();

    // Function-local mutex serializes both the cache lookup and the probe
    // compilation, so concurrent callers neither race on the map nor build
    // the probe kernel twice for the same device.
    // NOTE(review): cache is keyed by raw device pointer — a destroyed device
    // followed by a new allocation at the same address would reuse a stale
    // entry; presumed acceptable for the plugin's device lifetime, confirm.
    static std::mutex m;
    std::lock_guard<std::mutex> lock(m);
    static std::map<cldnn::device*, bool> cache;
    if (cache.find(device) != cache.end()) {
        return cache.at(device);
    }

    std::shared_ptr<kernel_selector::KernelString> kernel_string = std::make_shared<kernel_selector::KernelString>();
    // This probe program checks that all required vISA features are supported
    // by the current IGC version (the inline asm fails to compile otherwise).
    const char* kernel_code = R""""(
        kernel void igc_check() {
            __asm__ volatile(
                    ".decl AA0 v_type=G type=ud num_elts=1\n"
                    ".decl AA1 v_type=G type=ud num_elts=1\n"
                    ".implicit_PSEUDO_INPUT AA0 offset=256 size=4\n"
                    ".implicit_PSEUDO_INPUT AA1 offset=256 size=4\n"
                    "mov (M1_NM,1) AA0(0,0)<1> AA1(0,0)<0;1,0>\n"
            );
        }
    )"""";

    kernel_string->str = kernel_code;
    kernel_string->options = "";
    kernel_string->entry_point = "igc_check";
    kernel_string->batch_compilation = true;

    // Support is decided by whether the probe builds: any build exception is
    // interpreted as "microkernels unsupported" rather than propagated.
    try {
        cldnn::kernel_impl_params dummy_params;
        auto _kernels_cache_device_query = std::unique_ptr<cldnn::kernels_cache>(new cldnn::kernels_cache(e, config, 0));
        _kernels_cache_device_query->add_kernels_source(dummy_params, {kernel_string}, false);
        _kernels_cache_device_query->build_all();
        cache[device] = true;
    } catch (std::exception&) {
        cache[device] = false;
    }

    return cache.at(device);
}

kernel_selector::data_type to_data_type(data_types dt) {
switch (dt) {
case cldnn::data_types::i4:
Expand Down Expand Up @@ -1081,6 +1123,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
params.engineInfo.bOptHintsSupport = false;

params.engineInfo.bLocalBlockIOSupport = query_local_block_io_supported(engine, config);
params.engineInfo.supports_microkernels = query_microkernels_supported(engine, config);
params.engineInfo.deviceType = get_device_type(device_info.dev_type);
params.engineInfo.maxWorkGroupSize = device_info.max_work_group_size;
params.engineInfo.maxLocalMemSize = device_info.max_local_mem_size;
Expand All @@ -1092,6 +1135,8 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
params.engineInfo.driverVersion = device_info.driver_version;
params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;
params.engineInfo.vendor_id = device_info.vendor_id;
params.engineInfo.ip_version = device_info.ip_version;
params.engineInfo.arch = kernel_selector::gpu_arch(static_cast<std::underlying_type<gpu_arch>::type>(device_info.arch));

auto impl_forcing = config.get_property(ov::intel_gpu::force_implementations);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -294,4 +294,6 @@ inline void update_shapes(kernel_selector::Params& p, const kernel_impl_params&
}
}

bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config);

} // namespace cldnn
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive<scaled_dot_prod
// buffers number and its' sizes (since update_dispatch_data is called for both kernels too), and
// do not double memory allocations during reallocate_if_needed() function call
std::vector<layout> layouts;
if (_kernels_data.size() > 0) {
if (_kernels_data.size() > 0 && !_kernels_data[0].internalBufferSizes.empty()) {
auto dtype = from_data_type(_kernels_data[0].internalBufferDataType);
const auto bpp = data_type_traits::size_of(dtype);
for (auto size : _kernels_data[0].internalBufferSizes) {
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ void program::init_program() {
if (_task_executor == nullptr)
_task_executor = program::make_task_executor(_config);
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, _config, prog_id, _task_executor,
kernel_selector::KernelBase::get_db().get_batch_header_str()));
kernel_selector::KernelBase::get_db().get_batch_headers()));

if (!_compilation_context)
_compilation_context = program::make_compilation_context(_config);
Expand Down
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ target_include_directories(${TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${INCLUDE_DIR
target_compile_options(${TARGET_NAME} PRIVATE
$<$<CONFIG:Release>:$<IF:$<CXX_COMPILER_ID:MSVC>,/Os,-Os>>)

if (ENABLE_ONEDNN_FOR_GPU)
target_link_libraries(${TARGET_NAME} PRIVATE onednn_gpu_tgt)
endif()

if(COMMAND add_cpplint_target)
add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
endif()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_INTEL_OCL_GENERIC_VECTOR_OPS_H
#define GPU_INTEL_OCL_GENERIC_VECTOR_OPS_H

// 1-lane vector types: OpenCL C has no built-in "vec1", so emulate one with
// Clang's ext_vector_type(1) to let width-generic kernel code handle the
// scalar case uniformly with float2/4/8/16. half1/uint1 are presumably
// consumed by including kernels — not referenced in this header.
typedef half __attribute__((ext_vector_type(1))) half1;
typedef uint __attribute__((ext_vector_type(1))) uint1;
typedef float __attribute__((ext_vector_type(1))) float1;

// vmad(a, b, c) == a * b + c, overloaded for every supported vector width.
// The built-in mad() has no 1-lane vector overload, so the float1 variant
// applies mad() to the single lane explicitly.
float1 __attribute__((overloadable)) vmad(float1 a, float1 b, float1 c) {
    c[0] = mad(a[0], b[0], c[0]);
    return c;
}
float2 __attribute__((overloadable)) vmad(float2 a, float2 b, float2 c) {
    return mad(a, b, c);
}
float4 __attribute__((overloadable)) vmad(float4 a, float4 b, float4 c) {
    return mad(a, b, c);
}
float8 __attribute__((overloadable)) vmad(float8 a, float8 b, float8 c) {
    return mad(a, b, c);
}
float16 __attribute__((overloadable)) vmad(float16 a, float16 b, float16 c) {
    return mad(a, b, c);
}

// Elementwise native (fast, reduced-precision) reciprocal, overloaded for
// every supported vector width; the 1-lane variant unpacks the single
// element because native_recip() has no vec1 overload.
float1 __attribute__((overloadable)) native_vrecip(float1 x) {
    x[0] = native_recip(x[0]);
    return x;
}
float2 __attribute__((overloadable)) native_vrecip(float2 x) {
    return native_recip(x);
}
float4 __attribute__((overloadable)) native_vrecip(float4 x) {
    return native_recip(x);
}
float8 __attribute__((overloadable)) native_vrecip(float8 x) {
    return native_recip(x);
}
float16 __attribute__((overloadable)) native_vrecip(float16 x) {
    return native_recip(x);
}

// Elementwise native (fast, reduced-precision) base-2 exponential, overloaded
// for every supported vector width; the 1-lane variant unpacks the single
// element because native_exp2() has no vec1 overload.
float1 __attribute__((overloadable)) native_vexp2(float1 x) {
    x[0] = native_exp2(x[0]);
    return x;
}
float2 __attribute__((overloadable)) native_vexp2(float2 x) {
    return native_exp2(x);
}
float4 __attribute__((overloadable)) native_vexp2(float4 x) {
    return native_exp2(x);
}
float8 __attribute__((overloadable)) native_vexp2(float8 x) {
    return native_exp2(x);
}
float16 __attribute__((overloadable)) native_vexp2(float16 x) {
    return native_exp2(x);
}

#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_OCL_SDPA_UTILS_H
#define GPU_OCL_SDPA_UTILS_H

// Flat element offset of a logical 4D index (x0, x1, x2, x3) for tensor
// `tag`, supporting blocked layouts: tag##_B<i> is the block size of dim i,
// tag##_SB<i> the stride within a block, tag##_S<i> the stride between
// blocks. All of these are expected as compile-time -D macro definitions
// supplied per tensor (QRY/KEY/VAL/MSK/DST) by the host code.
#define _4D_OFF(tag, x0, x1, x2, x3) \
    (((x0) % tag##_B0) * tag##_SB0 + ((x0) / tag##_B0) * tag##_S0 \
            + ((x1) % tag##_B1) * tag##_SB1 + ((x1) / tag##_B1) * tag##_S1 \
            + ((x2) % tag##_B2) * tag##_SB2 + ((x2) / tag##_B2) * tag##_S2 \
            + ((x3) % tag##_B3) * tag##_SB3 + ((x3) / tag##_B3) * tag##_S3)

// Per-tensor convenience wrappers around _4D_OFF.
#define QRY_OFF(x0, x1, x2, x3) _4D_OFF(QRY, x0, x1, x2, x3)
#define KEY_OFF(x0, x1, x2, x3) _4D_OFF(KEY, x0, x1, x2, x3)
#define VAL_OFF(x0, x1, x2, x3) _4D_OFF(VAL, x0, x1, x2, x3)
#define MSK_OFF(x0, x1, x2, x3) _4D_OFF(MSK, x0, x1, x2, x3)

// NOTE(review): only x0/x1 contribute here — d, h, w are accepted for
// call-site symmetry with the 5D destination indexing but are unused;
// confirm this matches the upstream oneDNN SDPA kernel's expectations.
#define DST_OFF(x0, x1, d, h, w) \
    (((x0) % DST_B0) * DST_SB0 + ((x0) / DST_B0) * DST_S0 \
            + ((x1) % DST_B1) * DST_SB1 + ((x1) / DST_B1) * DST_S1)

#endif
Loading