I am trying to run inference with matmul on x64, where src is either F32 or BF16, the weights are INT8, and dst is F32, with oneDNN v3.10.2, using the matmul example. I would like the computation type to be INT8 so that we can observe the speedup.
I see that for the weight-decompression use case, set_fpmath_mode() can help with de-quantization of the weights based on the src type, which is then used for the computation; in other cases the computation type is chosen based on the activation type. However, when I try the use case above (src is either F32 or BF16 and the weights are INT8), I observe failures in BRGEMM reporting an unsupported datatype combination.
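For reference, this is roughly how I understand the weight-decompression attribute is meant to be set (a minimal sketch based on my reading of the documentation; the apply_to_int argument and the helper function name are my assumptions):
#include "oneapi/dnnl/dnnl.hpp"
using namespace dnnl;
// Sketch: ask the library to up-convert the INT8 weights and run the math in
// bf16, assuming set_fpmath_mode(mode, /*apply_to_int=*/true) is the intended
// mechanism for weight decompression.
void enable_weight_decompression(primitive_attr &attr) {
    attr.set_fpmath_mode(fpmath_mode::bf16, /*apply_to_int=*/true);
}
The attribute would then be passed to matmul::primitive_desc the same way matmul_attr is passed in the snippet below.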
Here is the error that I am observing:
onednn_verbose,v1,info,cpu,isa:Intel AVX-512 with Intel DL Boost and bfloat16 support
onednn_verbose,v1,info,gpu,runtime:none
onednn_verbose,v1,info,graph,backend,0:dnnl_backend
onednn_verbose,v1,primitive,info,template:operation,engine,primitive,implementation,prop_kind,memory_descriptors,attributes,auxiliary,problem_desc,exec_time
onednn_verbose,v1,graph,info,template:operation,engine,partition_id,partition_kind,op_names,data_formats,logical_tensors,fpmath_mode,implementation,backend,exec_time
onednn_verbose,v1,primitive,create:dispatch,brgemm_matmul,unsupported datatype combination,src/cpu/x64/matmul/brgemm_matmul_utils.cpp:1327
onednn_verbose,v1,primitive,create:dispatch,brgemm_matmul,unsupported datatype combination,src/cpu/x64/matmul/brgemm_matmul_utils.cpp:1327
onednn_verbose,v1,primitive,create:dispatch,brgemm_matmul,unsupported datatype combination,src/cpu/x64/matmul/brgemm_matmul_utils.cpp:1327
onednn_verbose,v1,primitive,create:dispatch,brgemm_matmul,unsupported datatype combination,src/cpu/x64/matmul/brgemm_matmul_utils.cpp:1327
onednn_verbose,v1,primitive,create:dispatch,matmul,cpu,matmul,gemm:jit:f32,undef,src:bf16::blocked:ab::f0 wei:s8::blocked:ab::f0 bia:f32::blocked:ab::f0_mask2 dst:f32::blocked:ab::f0,attr-scales:src0:0:f32,,128x256:256x512,unsupported datatype combination,src/cpu/matmul/gemm_f32_matmul.cpp:93
onednn_verbose,v1,primitive,create:dispatch,matmul,cpu,matmul,gemm:jit:bf16,undef,src:bf16::blocked:ab::f0 wei:s8::blocked:ab::f0 bia:f32::blocked:ab::f0_mask2 dst:f32::blocked:ab::f0,attr-scales:src0:0:f32,,128x256:256x512,unsupported datatype combination,src/cpu/matmul/gemm_bf16_matmul.cpp:63
onednn_verbose,v1,primitive,create:dispatch,matmul,cpu,matmul,gemm:jit:bf16,undef,src:bf16::blocked:ab::f0 wei:s8::blocked:ab::f0 bia:f32::blocked:ab::f0_mask2 dst:f32::blocked:ab::f0,attr-scales:src0:0:f32,,128x256:256x512,unsupported datatype combination,src/cpu/matmul/gemm_bf16_matmul.cpp:63
onednn_verbose,v1,primitive,create:dispatch,matmul,cpu,matmul,gemm:jit,undef,src:bf16::blocked:ab::f0 wei:s8::blocked:ab::f0 bia:f32::blocked:ab::f0_mask2 dst:f32::blocked:ab::f0,attr-scales:src0:0:f32,,128x256:256x512,unsupported datatype combination,src/cpu/matmul/gemm_x8s8s32x_matmul.cpp:120
onednn_verbose,v1,primitive,create:dispatch,matmul,cpu,matmul,ref:any,undef,src:bf16::blocked:ab::f0 wei:s8::blocked:ab::f0 bia:f32::blocked:ab::f0_mask2 dst:f32::blocked:ab::f0,attr-scales:src0:0:f32,,128x256:256x512,unsupported datatype,src/cpu/matmul/ref_matmul.hpp:68
onednn_verbose,v1,primitive,create:dispatch,matmul,cpu,matmul,ref_int8:any,undef,src:bf16::blocked:ab::f0 wei:s8::blocked:ab::f0 bia:f32::blocked:ab::f0_mask2 dst:f32::blocked:ab::f0,attr-scales:src0:0:f32,,128x256:256x512,unsupported datatype,src/cpu/matmul/ref_matmul_int8.hpp:53
onednn_verbose,v1,primitive,create:dispatch,matmul,cpu,matmul,jit:uni,undef,src:bf16::blocked:ab::f0 wei:s8::blocked:ab::f0 bia:f32::blocked:ab::f0_mask2 dst:f32::blocked:ab::f0,attr-scales:src0:0:f32,,128x256:256x512,unsupported datatype combination,src/cpu/x64/matmul/jit_uni_sparse_matmul.hpp:59
onednn_verbose,v1,primitive,create:dispatch,matmul,cpu,matmul,ref:any,undef,src:bf16::blocked:ab::f0 wei:s8::blocked:ab::f0 bia:f32::blocked:ab::f0_mask2 dst:f32::blocked:ab::f0,attr-scales:src0:0:f32,,128x256:256x512,unsupported sparse md configuration,src/cpu/matmul/ref_sparse_matmul.hpp:47
oneDNN error caught:
Status: unimplemented
Message: could not create a primitive descriptor for the matmul primitive.
Here is the example snippet that I am trying out:
auto user_src_md = memory::desc(src_dims, dt::f32, tag::ab);
auto user_src_mem = memory(user_src_md, engine);
write_to_dnnl_memory(src_data.data(), user_src_mem);
// Primitive expects bf16 src, s8 weights, f32 bias and dst
auto src_md = memory::desc(src_dims, dt::bf16, tag::ab);
auto weights_md = memory::desc(weights_dims, dt::s8, tag::ab);
auto bias_md = memory::desc(bias_dims, dt::f32, tag::ab);
auto dst_md = memory::desc(dst_dims, dt::f32, tag::ab);
auto src_mem = memory(src_md, engine);
auto src_reorder = reorder(user_src_mem, src_mem);
src_reorder.execute(engine_stream, user_src_mem, src_mem);
auto weights_mem = memory(weights_md, engine);
auto bias_mem = memory(bias_md, engine);
auto dst_mem = memory(dst_md, engine);
write_to_dnnl_memory(weights_data.data(), weights_mem);
write_to_dnnl_memory(bias_data.data(), bias_mem);
float src_scale = 1.0f / 127.0f;
primitive_attr matmul_attr;
matmul_attr.set_scales_mask(DNNL_ARG_SRC, 0);
auto src_scale_mem = memory({{1}, dt::f32, tag::x}, engine);
write_to_dnnl_memory(&src_scale, src_scale_mem);
// Create primitive descriptor.
auto matmul_pd = matmul::primitive_desc(
engine, src_md, weights_md, bias_md, dst_md, matmul_attr);
// Create the primitive.
auto matmul_prim = matmul(matmul_pd);
// Primitive arguments.
std::unordered_map<int, memory> matmul_args;
matmul_args.insert({DNNL_ARG_SRC, src_mem});
matmul_args.insert({DNNL_ARG_WEIGHTS, weights_mem});
matmul_args.insert({DNNL_ARG_BIAS, bias_mem});
matmul_args.insert({DNNL_ARG_DST, dst_mem});
matmul_args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scale_mem});
matmul_prim.execute(engine_stream, matmul_args);
Questions:
- Can you let me know whether the combinations where src is F32/BF16, the weights are INT8, and dst is F32/BF16/u8/s8 are supported?
- Is there an API/flag to set the computation type that can be passed through the matmul attributes?