diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp index 573cee77702..5cdef8852d6 100644 --- a/src/cpu/cpu_convolution_list.cpp +++ b/src/cpu/cpu_convolution_list.cpp @@ -172,6 +172,40 @@ const std::map> &impl_list_map() CPU_INSTANCE(ref_fused_convolution_fwd_t) nullptr, }}, + {{forward, f16, f16, f32}, { + CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) + CPU_INSTANCE_X64(ip_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx_fp16) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx_fp16) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx_fp16, true) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_fp16) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_fp16) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_fp16, true) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2, true) + CPU_INSTANCE(ref_convolution_fwd_t) + nullptr, + }}, + {{forward, f16, f16, f16}, { + CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) + CPU_INSTANCE_X64(ip_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx_fp16) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx_fp16) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx_fp16, true) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_fp16) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_fp16) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_fp16, true) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2, true) + CPU_INSTANCE_ACL(acl_wino_convolution_fwd_t) + CPU_INSTANCE_ACL(acl_indirect_gemm_convolution_fwd_t) + CPU_INSTANCE_ACL(acl_gemm_convolution_fwd_t, f16) + 
CPU_INSTANCE(ref_convolution_fwd_t) + CPU_INSTANCE(ref_fused_convolution_fwd_t) + nullptr, + }}, // BWD_D fp {{backward_data, f32, f32, f32}, REG_CONV_P({ CPU_INSTANCE_X64(ip_convolution_bwd_data_t) @@ -220,6 +254,22 @@ const std::map> &impl_list_map() CPU_INSTANCE(ref_convolution_bwd_data_t) nullptr, })}, + {{backward_data, f32, f16, f16}, REG_BWD_D_PK({ + CPU_INSTANCE_X64(ip_convolution_bwd_data_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t) + CPU_INSTANCE(ref_convolution_bwd_data_t) + nullptr, + })}, + {{backward_data, f16, f16, f16}, REG_BWD_D_PK({ + CPU_INSTANCE_X64(ip_convolution_bwd_data_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t) + CPU_INSTANCE(ref_convolution_bwd_data_t) + nullptr, + })}, // BWD_W fp {{backward_weights, f32, f32, f32}, REG_BWD_PK({ CPU_INSTANCE_X64(ip_convolution_bwd_weights_t) diff --git a/src/cpu/cpu_deconvolution_list.cpp b/src/cpu/cpu_deconvolution_list.cpp index ae8867095eb..1eb0c5ec33d 100644 --- a/src/cpu/cpu_deconvolution_list.cpp +++ b/src/cpu/cpu_deconvolution_list.cpp @@ -49,8 +49,8 @@ using namespace dnnl::impl::prop_kind; const std::map> &impl_list_map() { static const std::map> the_map = REG_DECONV_P({ {{forward}, { - CPU_INSTANCE_AMX(brgemm_deconvolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_deconvolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_deconvolution_fwd_t, avx512_core_amx_fp16) + CPU_INSTANCE_AMX(brgemm_deconvolution_fwd_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_deconvolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_deconvolution_fwd_t) diff --git a/src/cpu/cpu_inner_product_list.cpp b/src/cpu/cpu_inner_product_list.cpp index b37cc262c04..2259519eaa2 100644 --- a/src/cpu/cpu_inner_product_list.cpp +++ 
b/src/cpu/cpu_inner_product_list.cpp @@ -68,6 +68,21 @@ const std::map> &impl_list_map() CPU_INSTANCE(ref_inner_product_fwd_t) nullptr, }}, + {{forward, f16, f16, f32}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx_fp16) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_fp16) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2_vnni_2) + CPU_INSTANCE(ref_inner_product_fwd_t) + nullptr, + }}, + {{forward, f16, f16, f16}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx_fp16) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_fp16) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2_vnni_2) + CPU_INSTANCE_ACL(acl_inner_product_fwd_t) + CPU_INSTANCE(ref_inner_product_fwd_t) + nullptr, + }}, {{backward_data, f32, f32, f32}, REG_BWD_PK({ CPU_INSTANCE_AMX(brgemm_inner_product_bwd_data_t, avx512_core_amx) // bf32 CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_data_t, avx512_core) diff --git a/src/cpu/cpu_pooling_list.cpp b/src/cpu/cpu_pooling_list.cpp index 60c00966e4d..6a8f5230e19 100644 --- a/src/cpu/cpu_pooling_list.cpp +++ b/src/cpu/cpu_pooling_list.cpp @@ -54,9 +54,11 @@ const std::map> &impl_list_map() { static const std::map> the_map = REG_POOLING_P({ {{forward}, { /* fp */ + CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx512_core_fp16, f16) CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx512_core, bf16) CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx512_core, f32) CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx2_vnni_2, bf16) + CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx2_vnni_2, f16) CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx2, f32) CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx, f32) CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, sse41, f32) diff --git a/src/cpu/matmul/cpu_matmul_list.cpp b/src/cpu/matmul/cpu_matmul_list.cpp index 8279505e305..de544a03e0c 100644 --- a/src/cpu/matmul/cpu_matmul_list.cpp +++ b/src/cpu/matmul/cpu_matmul_list.cpp @@ -68,6 +68,7 @@ using namespace dnnl::impl::cpu::matmul; // constexpr impl_list_item_t 
impl_list[] = REG_MATMUL_P({ const impl_list_item_t impl_list[] = REG_MATMUL_P({ CPU_INSTANCE_ACL(acl_matmul_t) + CPU_INSTANCE_AMX(brgemm_matmul_t, avx512_core_amx_fp16) CPU_INSTANCE_AMX(brgemm_matmul_t, avx512_core_amx) CPU_INSTANCE_AVX512(brgemm_matmul_t, avx512_core) CPU_INSTANCE(gemm_f32_matmul_t) @@ -78,7 +79,7 @@ const impl_list_item_t impl_list[] = REG_MATMUL_P({ CPU_INSTANCE_AVX2(brgemm_matmul_t, avx2_vnni_2) CPU_INSTANCE_AVX2(brgemm_matmul_t, avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_matmul_t) - CPU_INSTANCE_AVX512(brgemm_matmul_t) + CPU_INSTANCE_AVX512(brgemm_matmul_t, avx512_core_fp16) CPU_INSTANCE(ref_matmul_t) CPU_INSTANCE(ref_matmul_int8_t) // These implementations are enabled only when DNNL_EXPERIMENTAL_SPARSE diff --git a/src/cpu/reorder/cpu_reorder.cpp b/src/cpu/reorder/cpu_reorder.cpp index 11a4fea2fd2..44007099608 100644 --- a/src/cpu/reorder/cpu_reorder.cpp +++ b/src/cpu/reorder/cpu_reorder.cpp @@ -26,12 +26,14 @@ static const std::map & regular_impl_list_map() { static const std::map the_map = { {{f32, bf16, 0}, &regular_f32_bf16_impl_list_map()}, + {{f32, f16, 0}, &regular_f32_f16_impl_list_map()}, {{f32, f32, 0}, &regular_f32_f32_impl_list_map()}, {{f32, s32, 0}, &regular_f32_s32_impl_list_map()}, {{f32, s8, 0}, &regular_f32_s8_impl_list_map()}, {{f32, u8, 0}, &regular_f32_u8_impl_list_map()}, {{f32, bin, 0}, &regular_f32_bin_impl_list_map()}, {{bf16, data_type::undef, 0}, &regular_bf16_impl_list_map()}, + {{f16, data_type::undef, 0}, &regular_f16_impl_list_map()}, {{s32, data_type::undef, 0}, &regular_s32_impl_list_map()}, {{s8, data_type::undef, 0}, &regular_s8_impl_list_map()}, {{u8, data_type::undef, 0}, &regular_u8_impl_list_map()}, diff --git a/src/cpu/reorder/cpu_reorder_regular_f16.cpp b/src/cpu/reorder/cpu_reorder_regular_f16.cpp index b83887ef0df..2392098d07a 100644 --- a/src/cpu/reorder/cpu_reorder_regular_f16.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_f16.cpp @@ -27,8 +27,8 @@ const impl_list_map_t &regular_f16_impl_list_map() { // f16 -> {{f16, data_type::undef, 0}, { 
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) REG_SR(f16, any, f16, any, fmt_order_any, spec_reference) REG_SR(f16, any, f32, any, fmt_order_any, spec_reference) REG_SR(f16, any, s8, any, fmt_order_any, spec_reference) diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f16.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f16.cpp index a5d79f8a79a..02cd9cd7c66 100644 --- a/src/cpu/reorder/cpu_reorder_regular_f32_f16.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_f32_f16.cpp @@ -26,8 +26,8 @@ const impl_list_map_t &regular_f32_f16_impl_list_map() { static const impl_list_map_t the_map = REG_REORDER_P({ // f32 -> f16 {{f32, f16, 0}, { - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) nullptr, }}, diff --git a/src/cpu/x64/jit_uni_pool_kernel.cpp b/src/cpu/x64/jit_uni_pool_kernel.cpp index 6be92a61370..c378e3e8d92 100644 --- a/src/cpu/x64/jit_uni_pool_kernel.cpp +++ b/src/cpu/x64/jit_uni_pool_kernel.cpp @@ -1638,6 +1638,7 @@ template struct jit_uni_pool_kernel; template struct jit_uni_pool_kernel; template struct jit_uni_pool_kernel; template struct jit_uni_pool_kernel; +template struct jit_uni_pool_kernel; } // namespace x64 } // namespace cpu diff --git a/src/cpu/x64/jit_uni_pooling.cpp b/src/cpu/x64/jit_uni_pooling.cpp index 977158156d8..ca8967dfec2 100644 --- a/src/cpu/x64/jit_uni_pooling.cpp +++ b/src/cpu/x64/jit_uni_pooling.cpp @@ -1279,6 +1279,7 @@ template struct jit_uni_pooling_fwd_t; template struct jit_uni_pooling_bwd_t; template struct jit_uni_pooling_fwd_t; template struct jit_uni_pooling_bwd_t; 
+template struct jit_uni_pooling_fwd_t; } // namespace x64 } // namespace cpu