From 0664abd0e915e690101910e95d50c1655494a46d Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Sun, 12 Feb 2023 19:29:15 -0800 Subject: [PATCH 01/12] Adding simd64 and zmm types. --- src/coreclr/jit/instr.h | 13 ++++----- src/coreclr/jit/simd.h | 41 ++++++++++++++++++++++++++++- src/coreclr/jit/simdashwintrinsic.h | 1 + src/coreclr/jit/targetamd64.h | 1 + src/coreclr/jit/targetx86.h | 1 + src/coreclr/jit/typelist.h | 1 + src/coreclr/jit/vartype.h | 1 + src/coreclr/vm/jitinterface.cpp | 12 +++++++-- 8 files changed, 62 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/instr.h b/src/coreclr/jit/instr.h index 180ad19ad3a96e..43778b402d0138 100644 --- a/src/coreclr/jit/instr.h +++ b/src/coreclr/jit/instr.h @@ -374,7 +374,8 @@ enum emitAttr : unsigned EA_8BYTE = 0x008, EA_16BYTE = 0x010, EA_32BYTE = 0x020, - EA_SIZE_MASK = 0x03F, + EA_64BYTE = 0x040, + EA_SIZE_MASK = 0x07F, #ifdef TARGET_64BIT EA_PTRSIZE = EA_8BYTE, @@ -382,14 +383,14 @@ enum emitAttr : unsigned EA_PTRSIZE = EA_4BYTE, #endif - EA_OFFSET_FLG = 0x040, + EA_OFFSET_FLG = 0x080, EA_OFFSET = EA_OFFSET_FLG | EA_PTRSIZE, /* size == 0 */ - EA_GCREF_FLG = 0x080, + EA_GCREF_FLG = 0x100, EA_GCREF = EA_GCREF_FLG | EA_PTRSIZE, /* size == -1 */ - EA_BYREF_FLG = 0x100, + EA_BYREF_FLG = 0x200, EA_BYREF = EA_BYREF_FLG | EA_PTRSIZE, /* size == -2 */ - EA_DSP_RELOC_FLG = 0x200, // Is the displacement of the instruction relocatable? - EA_CNS_RELOC_FLG = 0x400, // Is the immediate of the instruction relocatable? + EA_DSP_RELOC_FLG = 0x400, // Is the displacement of the instruction relocatable? + EA_CNS_RELOC_FLG = 0x800, // Is the immediate of the instruction relocatable? }; #define EA_ATTR(x) ((emitAttr)(x)) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 526c0324807087..cd5db6239fe785 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -32,7 +32,12 @@ enum SIMDLevel // AVX2 - Hardware has AVX and AVX2 instruction set. // Vector length is 256-bit and SIMD instructions are VEX-256 encoded. // Floating-point instructions are VEX-128 encoded. - SIMD_AVX2_Supported = 3 + SIMD_AVX2_Supported = 3, + + // AVX512F - Hardware has AVX, AVX2 and AVX512F instruction set. + // Vector length is 512-bit and SIMD instructions are EVEX encoded. + // Floating-point instructions are EVEX encoded. 
+ SIMD_AVX512F_Supported = 4 #endif }; @@ -149,6 +154,40 @@ struct simd32_t } }; +struct simd64_t +{ + union + { + float f32[16]; + double f64[8]; + int8_t i8[64]; + int16_t i16[32]; + int32_t i32[16]; + int64_t i64[8]; + uint8_t u8[64]; + uint16_t u16[32]; + uint32_t u32[16]; + uint64_t u64[8]; + simd8_t v64[8]; + simd16_t v128[4]; + simd32_t v256[2]; + }; + + bool operator==(const simd64_t& other) const + { + return (u64[0] == other.u64[0]) && (u64[1] == other.u64[1]) && (u64[2] == other.u64[2]) && + (u64[3] == other.u64[3]) && (u64[4] == other.u64[4]) && (u64[5] == other.u64[5]) && + (u64[6] == other.u64[6]) && (u64[7] == other.u64[7]); + } + + bool operator!=(const simd32_t& other) const + { + return (u64[0] != other.u64[0]) || (u64[1] != other.u64[1]) || (u64[2] != other.u64[2]) || + (u64[3] != other.u64[3]) || (u64[4] != other.u64[4]) || (u64[5] != other.u64[5]) || + (u64[6] != other.u64[6]) || (u64[7] != other.u64[7]); + } +}; + template TBase EvaluateUnaryScalar(genTreeOps oper, TBase arg0) { diff --git a/src/coreclr/jit/simdashwintrinsic.h b/src/coreclr/jit/simdashwintrinsic.h index 556e937afe9ba1..7bce4330ae6ade 100644 --- a/src/coreclr/jit/simdashwintrinsic.h +++ b/src/coreclr/jit/simdashwintrinsic.h @@ -14,6 +14,7 @@ enum class SimdAsHWIntrinsicClassId Vector4, VectorT128, VectorT256, + VectorT512, }; enum class SimdAsHWIntrinsicFlag : unsigned int diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index 64af2659bd592d..ac3f0ca7e8c027 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -101,6 +101,7 @@ #define REGSIZE_BYTES 8 // number of bytes in one register #define XMM_REGSIZE_BYTES 16 // XMM register size in bytes #define YMM_REGSIZE_BYTES 32 // YMM register size in bytes + #define ZMM_REGSIZE_BYTES 64 // ZMM register size in bytes #define CODE_ALIGN 1 // code alignment requirement #define STACK_ALIGN 16 // stack alignment requirement diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h index 09c6b6b0b04ef9..dffd6adf2efb08 100644 --- a/src/coreclr/jit/targetx86.h +++ b/src/coreclr/jit/targetx86.h @@ -102,6 +102,7 @@ #define XMM_REGSIZE_BYTES 16 // XMM register size in bytes #define YMM_REGSIZE_BYTES 32 // YMM register size in bytes + #define ZMM_REGSIZE_BYTES 64 // ZMM register size in bytes #define REGNUM_BITS 6 // number of bits in a REG_* diff --git a/src/coreclr/jit/typelist.h b/src/coreclr/jit/typelist.h index f7af2573225065..2eeee02047530f 100644 --- a/src/coreclr/jit/typelist.h +++ b/src/coreclr/jit/typelist.h @@ -62,6 +62,7 @@ DEF_TP(SIMD8 ,"simd8" , TYP_SIMD8, TI_STRUCT, 8, 8, 8, 2, 8, VTF_S) DEF_TP(SIMD12 ,"simd12" , TYP_SIMD12, TI_STRUCT,12,16, 16, 4,16, VTF_S) DEF_TP(SIMD16 ,"simd16" , TYP_SIMD16, TI_STRUCT,16,16, 16, 4,16, VTF_S) DEF_TP(SIMD32 ,"simd32" , TYP_SIMD32, TI_STRUCT,32,32, 32, 8,16, VTF_S) +DEF_TP(SIMD64 ,"simd64" , TYP_SIMD64, TI_STRUCT,64,64, 64, 16,16, VTF_S) #endif // FEATURE_SIMD DEF_TP(UNKNOWN ,"unknown" ,TYP_UNKNOWN, TI_ERROR, 0, 0, 0, 0, 0, VTF_ANY) diff --git a/src/coreclr/jit/vartype.h b/src/coreclr/jit/vartype.h index 33c411032d3308..5e80787ebfe244 100644 --- a/src/coreclr/jit/vartype.h +++ b/src/coreclr/jit/vartype.h @@ -71,6 +71,7 @@ inline bool varTypeIsSIMD(T vt) case TYP_SIMD12: case TYP_SIMD16: case TYP_SIMD32: + case TYP_SIMD64: return true; default: return false; diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp index cd0161a4aec634..4380be6cf2c310 100644 --- a/src/coreclr/vm/jitinterface.cpp +++ b/src/coreclr/vm/jitinterface.cpp @@ 
-11949,7 +11949,11 @@ void CEEJitInfo::allocMem (AllocMemArgs *pArgs) S_SIZE_T totalSize = S_SIZE_T(codeSize); size_t roDataAlignment = sizeof(void*); - if ((pArgs->flag & CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN)!= 0) + if ((pArgs->flag & CORJIT_ALLOCMEM_FLG_RODATA_64BYTE_ALIGN)!= 0) + { + roDataAlignment = 64; + } + else if ((pArgs->flag & CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN)!= 0) { roDataAlignment = 32; } @@ -11965,7 +11969,11 @@ void CEEJitInfo::allocMem (AllocMemArgs *pArgs) { size_t codeAlignment = sizeof(void*); - if ((pArgs->flag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0) + if ((pArgs->flag & CORJIT_ALLOCMEM_FLG_64BYTE_ALIGN) != 0) + { + codeAlignment = 64; + } + else if ((pArgs->flag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0) { codeAlignment = 32; } From e9102492d4ac3f732758431173d69bb296a2c1bd Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Mon, 13 Feb 2023 10:05:06 -0800 Subject: [PATCH 02/12] Adding other infrastructure to lower/emit code for simd64. --- src/coreclr/jit/codegen.h | 2 +- src/coreclr/jit/codegenxarch.cpp | 33 +++- src/coreclr/jit/compiler.cpp | 16 +- src/coreclr/jit/compiler.h | 45 ++++- src/coreclr/jit/emitxarch.cpp | 37 ++++- src/coreclr/jit/emitxarch.h | 13 +- src/coreclr/jit/fgbasic.cpp | 1 + src/coreclr/jit/hwintrinsic.cpp | 45 +++++ src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 9 +- src/coreclr/jit/hwintrinsiclistxarch.h | 14 ++ src/coreclr/jit/lclvars.cpp | 1 + src/coreclr/jit/lsra.cpp | 3 +- src/coreclr/jit/lsraxarch.cpp | 13 +- src/coreclr/jit/morphblock.cpp | 1 + src/coreclr/jit/optcse.cpp | 3 +- src/coreclr/jit/regset.h | 2 +- src/coreclr/jit/scopeinfo.cpp | 2 + src/coreclr/jit/simd.cpp | 164 ++++++++++++++++++- src/coreclr/jit/simd.h | 2 +- src/coreclr/jit/simdashwintrinsiclistxarch.h | 7 + src/coreclr/jit/utils.cpp | 10 +- src/coreclr/jit/valuenum.cpp | 41 +++++ src/coreclr/jit/valuenum.h | 73 +++++++++ src/coreclr/vm/jitinterface.cpp | 6 +- 24 files changed, 501 insertions(+), 42 deletions(-) diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index c75ae055e52d76..be8e06822b6fa6 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -947,7 +947,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX void genSSE2Intrinsic(GenTreeHWIntrinsic* node); void genSSE41Intrinsic(GenTreeHWIntrinsic* node); void genSSE42Intrinsic(GenTreeHWIntrinsic* node); - void genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node); + void genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node); void genAESIntrinsic(GenTreeHWIntrinsic* node); void genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node); void genFMAIntrinsic(GenTreeHWIntrinsic* node); diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index eed18f248c73ed..3175f9548f1bba 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -504,7 +504,22 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre if (vecCon->IsZero()) { - if ((attr != EA_32BYTE) || compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)) + bool isSupported; + + if (attr == EA_32BYTE) + { + isSupported = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX); + } + else if (attr == EA_64BYTE) + { + isSupported = compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F); + } + else + { + assert((attr == EA_8BYTE) || (attr == EA_16BYTE)); + isSupported = true; + } + if (isSupported) { #if defined(FEATURE_SIMD) emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg); 
@@ -551,6 +566,15 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); break; } + + case TYP_SIMD64: + { + simd64_t constValue = vecCon->gtSimd64Val; + CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(constValue); + + emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); + break; + } #endif // FEATURE_SIMD default: @@ -5778,9 +5802,10 @@ void CodeGen::genCall(GenTreeCall* call) // To limit code size increase impact: we only issue VZEROUPPER before PInvoke call, not issue // VZEROUPPER after PInvoke call because transition penalty from legacy SSE to AVX only happens // when there's preceding 256-bit AVX to legacy SSE transition penalty. - if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && GetEmitter()->Contains256bitAVX()) + // This applies to 512bit AVX512 instructions as well. + if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && (GetEmitter()->Contains256bitOrMoreAVX())) { - assert(compiler->canUseVexEncoding()); + assert(GetEmitter()->Contains256bitOrMoreAVX() && compiler->canUseVexEncoding()); instGen(INS_vzeroupper); } @@ -11064,7 +11089,7 @@ void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/) bool emitVzeroUpper = false; if (check256bitOnly) { - emitVzeroUpper = GetEmitter()->Contains256bitAVX(); + emitVzeroUpper = GetEmitter()->Contains256bitOrMoreAVX(); } else { diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 2856cc56d6b1ef..833ef448e1eb14 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2280,6 +2280,18 @@ void Compiler::compSetProcessor() { instructionSetFlags.AddInstructionSet(InstructionSet_Vector256); } + if (instructionSetFlags.HasInstructionSet(InstructionSet_AVX512F)) + { + if (!DoJitStressEvexEncoding()) + { + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512F); + instructionSetFlags = EnsureInstructionSetFlagsAreValid(instructionSetFlags); + } + else + { + instructionSetFlags.AddInstructionSet(InstructionSet_Vector512); + } + } #elif defined(TARGET_ARM64) if (instructionSetFlags.HasInstructionSet(InstructionSet_AdvSimd)) { @@ -2297,14 +2309,14 @@ void Compiler::compSetProcessor() if (canUseEvexEncoding()) { codeGen->GetEmitter()->SetUseEvexEncoding(true); - // TODO-XArch-AVX512: Revisit other flags to be set once avx512 instructions are added. + // TODO-XArch-AVX512 : Revisit other flags to be set once avx512 instructions are added. 
} if (canUseVexEncoding()) { codeGen->GetEmitter()->SetUseVEXEncoding(true); // Assume each JITted method does not contain AVX instruction at first codeGen->GetEmitter()->SetContainsAVX(false); - codeGen->GetEmitter()->SetContains256bitAVX(false); + codeGen->GetEmitter()->SetContains256bitOrMoreAVX(false); } } #endif // TARGET_XARCH diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index dd4b883679f8e7..720a3b2d436d18 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7621,7 +7621,7 @@ class Compiler static bool varTypeNeedsPartialCalleeSave(var_types type) { assert(type != TYP_STRUCT); - return (type == TYP_SIMD32); + return (type == TYP_SIMD32) || (type == TYP_SIMD64); } #elif defined(TARGET_ARM64) static bool varTypeNeedsPartialCalleeSave(var_types type) @@ -8318,6 +8318,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX SIMDLevel getSIMDSupportLevel() { #if defined(TARGET_XARCH) + if (compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + return SIMD_AVX512F_Supported; + } + if (compOpportunisticallyDependsOn(InstructionSet_AVX2)) { return SIMD_AVX2_Supported; @@ -8435,12 +8440,26 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX CORINFO_CLASS_HANDLE Vector256ULongHandle; CORINFO_CLASS_HANDLE Vector256NIntHandle; CORINFO_CLASS_HANDLE Vector256NUIntHandle; + + CORINFO_CLASS_HANDLE Vector512FloatHandle; + CORINFO_CLASS_HANDLE Vector512DoubleHandle; + CORINFO_CLASS_HANDLE Vector512IntHandle; + CORINFO_CLASS_HANDLE Vector512UShortHandle; + CORINFO_CLASS_HANDLE Vector512UByteHandle; + CORINFO_CLASS_HANDLE Vector512ShortHandle; + CORINFO_CLASS_HANDLE Vector512ByteHandle; + CORINFO_CLASS_HANDLE Vector512LongHandle; + CORINFO_CLASS_HANDLE Vector512UIntHandle; + CORINFO_CLASS_HANDLE Vector512ULongHandle; + CORINFO_CLASS_HANDLE Vector512NIntHandle; + CORINFO_CLASS_HANDLE Vector512NUIntHandle; #endif // defined(TARGET_XARCH) #endif // FEATURE_HW_INTRINSICS CORINFO_CLASS_HANDLE CanonicalSimd8Handle; CORINFO_CLASS_HANDLE CanonicalSimd16Handle; CORINFO_CLASS_HANDLE CanonicalSimd32Handle; + CORINFO_CLASS_HANDLE CanonicalSimd64Handle; SIMDHandlesCache() { @@ -8506,6 +8525,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX } case TYP_SIMD32: + case TYP_SIMD64: break; default: @@ -8611,6 +8631,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX return m_simdHandleCache->CanonicalSimd16Handle; case TYP_SIMD32: return m_simdHandleCache->CanonicalSimd32Handle; + case TYP_SIMD64: + return m_simdHandleCache->CanonicalSimd64Handle; default: unreached(); } @@ -8745,7 +8767,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX var_types getSIMDVectorType() { #if defined(TARGET_XARCH) - if (getSIMDSupportLevel() == SIMD_AVX2_Supported) + if (getSIMDSupportLevel() == SIMD_AVX512F_Supported) + { + return TYP_SIMD64; + } + else if (getSIMDSupportLevel() == SIMD_AVX2_Supported) { return TYP_SIMD32; } @@ -8786,7 +8812,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX unsigned getSIMDVectorRegisterByteLength() { #if defined(TARGET_XARCH) - if (getSIMDSupportLevel() == SIMD_AVX2_Supported) + // TODO-XArch-AVX512 : Return ZMM_REGSIZE_BYTES once Vector supports AVX512. 
+ if (getSIMDSupportLevel() >= SIMD_AVX2_Supported) { return YMM_REGSIZE_BYTES; } @@ -8815,7 +8842,11 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX unsigned int maxSIMDStructBytes() { #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) - if (compOpportunisticallyDependsOn(InstructionSet_AVX)) + if (compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + return ZMM_REGSIZE_BYTES; + } + else if (compOpportunisticallyDependsOn(InstructionSet_AVX)) { return YMM_REGSIZE_BYTES; } @@ -8857,6 +8888,10 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX { simdType = TYP_SIMD32; } + else if (size == 64) + { + simdType = TYP_SIMD64; + } else { noway_assert(!"Unexpected size for SIMD type"); } @@ -8892,7 +8927,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // otherwise cause the highest level of instruction set support to be reported to crossgen2. // and this api is only ever used as an optimization or assert, so no reporting should // ever happen. - return YMM_REGSIZE_BYTES; + return ZMM_REGSIZE_BYTES; } #endif // defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) unsigned vectorRegSize = maxSIMDStructBytes(); diff --git a/src/coreclr/jit/emitxarch.cpp index 58695769fd12e2..de1f07a923a876 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -960,6 +960,7 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const #define DEFAULT_BYTE_EVEX_PREFIX_MASK 0xFFFFFFFF00000000ULL #define LBIT_IN_BYTE_EVEX_PREFIX 0x0000002000000000ULL +#define LPRIMEBIT_IN_BYTE_EVEX_PREFIX 0x0000004000000000ULL //------------------------------------------------------------------------ // AddEvexPrefix: Add default EVEX prefix with only LL' bits set. @@ -991,6 +992,11 @@ emitter::code_t emitter::AddEvexPrefix(instruction ins, code_t code, emitAttr at // Set L bit to 1 in case of instructions that operate on 256-bits. code |= LBIT_IN_BYTE_EVEX_PREFIX; } + else if (attr == EA_64BYTE) + { + // Set the L' bit to 1 (L'L == 10) in case of instructions that operate on 512-bits. + code |= LPRIMEBIT_IN_BYTE_EVEX_PREFIX; + } return code; } @@ -3718,8 +3724,8 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) // BT supports 16 bit operands and this code doesn't handle the necessary 66 prefix.
assert(ins != INS_bt); - assert((attrSize == EA_4BYTE) || (attrSize == EA_PTRSIZE) // Only for x64 - || (attrSize == EA_16BYTE) || (attrSize == EA_32BYTE) // only for x64 + assert((attrSize == EA_4BYTE) || (attrSize == EA_PTRSIZE) // Only for x64 + || (attrSize == EA_16BYTE) || (attrSize == EA_32BYTE) || (attrSize == EA_64BYTE) // only for x64 || (ins == INS_movzx) || (ins == INS_movsx) // The prefetch instructions are always 3 bytes and have part of their modr/m byte hardcoded || isPrefetch(ins)); @@ -6064,7 +6070,7 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN emitAttr size = EA_SIZE(attr); - assert(size <= EA_32BYTE); + assert(size <= EA_64BYTE); noway_assert(emitVerifyEncodable(ins, size, dstReg, srcReg)); insFormat fmt = emitInsModeFormat(ins, IF_RRD_RRD); @@ -6107,7 +6113,7 @@ void emitter::emitIns_R_R(instruction ins, emitAttr attr, regNumber reg1, regNum emitAttr size = EA_SIZE(attr); - assert(size <= EA_32BYTE); + assert(size <= EA_64BYTE); noway_assert(emitVerifyEncodable(ins, size, reg1, reg2)); /* Special case: "XCHG" uses a different format */ @@ -6915,7 +6921,7 @@ void emitter::emitIns_R_C(instruction ins, emitAttr attr, regNumber reg, CORINFO emitAttr size = EA_SIZE(attr); - assert(size <= EA_32BYTE); + assert(size <= EA_64BYTE); noway_assert(emitVerifyEncodable(ins, size, reg)); UNATIVE_OFFSET sz; @@ -9453,6 +9459,9 @@ const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName) switch (EA_SIZE(attr)) { + case EA_64BYTE: + return emitZMMregName(reg); + case EA_32BYTE: return emitYMMregName(reg); @@ -9650,6 +9659,24 @@ const char* emitter::emitYMMregName(unsigned reg) return regNames[reg]; } +/***************************************************************************** + * + * Return a string that represents the given ZMM register. + */ + +const char* emitter::emitZMMregName(unsigned reg) +{ + static const char* const regNames[] = { +#define REGDEF(name, rnum, mask, sname) "z" sname, +#include "register.h" + }; + + assert(reg < REG_COUNT); + assert(reg < ArrLen(regNames)); + + return regNames[reg]; +} + /***************************************************************************** * * Display a static data member reference. diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 48591c2fddb1b3..3bcc4f4c7f6ed1 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -612,14 +612,14 @@ void SetContainsAVX(bool value) containsAVXInstruction = value; } -bool contains256bitAVXInstruction = false; -bool Contains256bitAVX() +bool contains256bitOrMoreAVXInstruction = false; +bool Contains256bitOrMoreAVX() { - return contains256bitAVXInstruction; + return contains256bitOrMoreAVXInstruction; } -void SetContains256bitAVX(bool value) +void SetContains256bitOrMoreAVX(bool value) { - contains256bitAVXInstruction = value; + contains256bitOrMoreAVXInstruction = value; } bool IsDstDstSrcAVXInstruction(instruction ins); @@ -659,6 +659,7 @@ void emitDispShift(instruction ins, int cnt = 0); const char* emitXMMregName(unsigned reg); const char* emitYMMregName(unsigned reg); +const char* emitZMMregName(unsigned reg); /************************************************************************/ /* Private members that deal with target-dependent instr. 
descriptors */ @@ -720,7 +721,7 @@ inline emitter::opSize emitEncodeScale(size_t scale) { assert(scale == 1 || scale == 2 || scale == 4 || scale == 8); - return emitSizeEncode[scale - 1]; + return emitSizeEncode[genLog2((unsigned int)scale)]; } inline emitAttr emitDecodeScale(unsigned ensz) diff --git a/src/coreclr/jit/fgbasic.cpp b/src/coreclr/jit/fgbasic.cpp index df12d1eb12e517..967e65a4fbe2f4 100644 --- a/src/coreclr/jit/fgbasic.cpp +++ b/src/coreclr/jit/fgbasic.cpp @@ -1196,6 +1196,7 @@ void Compiler::fgFindJumpTargets(const BYTE* codeAddr, IL_OFFSET codeSize, Fixed case NI_X86Base_X64_BitScanForward: case NI_X86Base_BitScanReverse: case NI_X86Base_X64_BitScanReverse: + case NI_Vector512_Create: #endif // TARGET_XARCH #endif // FEATURE_HW_INTRINSICS { diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 341f14a460b423..98a518974e0e1d 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -159,6 +159,38 @@ CORINFO_CLASS_HANDLE Compiler::gtGetStructHandleForHWSIMD(var_types simdType, Co assert(!"Didn't find a class handle for simdType"); } } + else if (simdType == TYP_SIMD64) + { + switch (simdBaseJitType) + { + case CORINFO_TYPE_FLOAT: + return m_simdHandleCache->Vector512FloatHandle; + case CORINFO_TYPE_DOUBLE: + return m_simdHandleCache->Vector512DoubleHandle; + case CORINFO_TYPE_INT: + return m_simdHandleCache->Vector512IntHandle; + case CORINFO_TYPE_USHORT: + return m_simdHandleCache->Vector512UShortHandle; + case CORINFO_TYPE_UBYTE: + return m_simdHandleCache->Vector512UByteHandle; + case CORINFO_TYPE_SHORT: + return m_simdHandleCache->Vector512ShortHandle; + case CORINFO_TYPE_BYTE: + return m_simdHandleCache->Vector512ByteHandle; + case CORINFO_TYPE_LONG: + return m_simdHandleCache->Vector512LongHandle; + case CORINFO_TYPE_UINT: + return m_simdHandleCache->Vector512UIntHandle; + case CORINFO_TYPE_ULONG: + return m_simdHandleCache->Vector512ULongHandle; + case CORINFO_TYPE_NATIVEINT: + return m_simdHandleCache->Vector512NIntHandle; + case CORINFO_TYPE_NATIVEUINT: + return m_simdHandleCache->Vector512NUIntHandle; + default: + assert(!"Didn't find a class handle for simdType"); + } + } #endif // TARGET_XARCH #ifdef TARGET_ARM64 else if (simdType == TYP_SIMD8) @@ -311,6 +343,10 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, { isa = InstructionSet_AVX2; } + else if (strcmp(className, "Vector512") == 0) + { + isa = InstructionSet_AVX512F; + } } #endif @@ -392,6 +428,14 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, } } } + else if (isa == InstructionSet_Vector512) + { + if (!comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + { + // TODO-XArch-AVX512: Add checks for CD, DQ, BW + return NI_Illegal; + } + } #elif defined(TARGET_ARM64) else if (isa == InstructionSet_Vector64) { @@ -1116,6 +1160,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, case NI_SSE41_ConvertToVector128Int64: case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: + case NI_AVX512F_BroadcastScalarToVector512: case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index cd5f6aa29ed8d6..364dca38f6e356 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -360,6 +360,7 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) { case InstructionSet_Vector128: case 
InstructionSet_Vector256: + case InstructionSet_Vector512: genBaseIntrinsic(node); break; case InstructionSet_X86Base: @@ -384,7 +385,8 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) break; case InstructionSet_AVX: case InstructionSet_AVX2: - genAvxOrAvx2Intrinsic(node); + case InstructionSet_AVX512F: + genAvxFamilyIntrinsic(node); break; case InstructionSet_AES: genAESIntrinsic(node); @@ -474,6 +476,7 @@ void CodeGen::genHWIntrinsic_R_RM( else { if (varTypeIsIntegral(rmOp) && ((node->GetHWIntrinsicId() == NI_AVX2_BroadcastScalarToVector128) || + (node->GetHWIntrinsicId() == NI_AVX512F_BroadcastScalarToVector512) || (node->GetHWIntrinsicId() == NI_AVX2_BroadcastScalarToVector256))) { // In lowering we had the special case of BroadcastScalarToVector(CreateScalarUnsafe(op1)) @@ -1519,12 +1522,12 @@ void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node) } //------------------------------------------------------------------------ -// genAvxOrAvx2Intrinsic: Generates the code for an AVX/AVX2 hardware intrinsic node +// genAvxFamilyIntrinsic: Generates the code for an AVX/AVX2/AVX512 hardware intrinsic node // // Arguments: // node - The hardware intrinsic node // -void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node) +void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) { NamedIntrinsic intrinsicId = node->GetHWIntrinsicId(); var_types baseType = node->GetSimdBaseType(); diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 503c7fc80209f2..4fc4ae3c5e3ef4 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -230,6 +230,14 @@ HARDWARE_INTRINSIC(Vector256, WidenUpper, HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, Xor, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// ISA Function name SIMD size NumArg Instructions Category Flags +// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// Vector512 Intrinsics +HARDWARE_INTRINSIC(Vector512, Create, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, 
HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector512, get_Zero, 64, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} @@ -717,6 +725,12 @@ HARDWARE_INTRINSIC(AVX2, Xor, // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// AVX512 Intrinsics +HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** +// ISA Function name SIMD size NumArg Instructions Category Flags +// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} +// *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVXVNNI Intrinsics HARDWARE_INTRINSIC(AVXVNNI, MultiplyWideningAndAdd, -1, 3, {INS_invalid, INS_vpdpbusd, INS_vpdpwssd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg) HARDWARE_INTRINSIC(AVXVNNI, MultiplyWideningAndAddSaturate, -1, 3, {INS_invalid, INS_vpdpbusds, INS_vpdpwssds, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFloatingPointUsed|HW_Flag_BaseTypeFromSecondArg) diff --git a/src/coreclr/jit/lclvars.cpp b/src/coreclr/jit/lclvars.cpp index 
7cd22c1d9ed6da..457214902c6fe4 100644 --- a/src/coreclr/jit/lclvars.cpp +++ b/src/coreclr/jit/lclvars.cpp @@ -3793,6 +3793,7 @@ void Compiler::lvaSortByRefCount() case TYP_SIMD12: case TYP_SIMD16: case TYP_SIMD32: + case TYP_SIMD64: #endif // FEATURE_SIMD case TYP_STRUCT: break; diff --git a/src/coreclr/jit/lsra.cpp index cc7c52054625b5..db5f8518eabe23 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -733,7 +733,7 @@ LinearScan::LinearScan(Compiler* theCompiler) availableRegs[i] = &availableDoubleRegs; } #ifdef FEATURE_SIMD - else if ((thisType >= TYP_SIMD8) && (thisType <= TYP_SIMD32)) + else if (varTypeIsSIMD(thisType)) { availableRegs[i] = &availableDoubleRegs; } @@ -1596,6 +1596,7 @@ bool LinearScan::isRegCandidate(LclVarDsc* varDsc) case TYP_SIMD12: case TYP_SIMD16: case TYP_SIMD32: + case TYP_SIMD64: return !varDsc->lvPromoted; #endif // FEATURE_SIMD diff --git a/src/coreclr/jit/lsraxarch.cpp index 815404dd6bcd84..8195160b205db3 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2867,8 +2867,8 @@ int LinearScan::BuildMul(GenTree* tree) } //------------------------------------------------------------------------------ -// SetContainsAVXFlags: Set ContainsAVX flag when it is floating type, set -// Contains256bitAVX flag when SIMD vector size is 32 bytes +// SetContainsAVXFlags: Set the ContainsAVX flag when it is a floating type, +// set the Contains256bitOrMoreAVX flag when the SIMD vector size is 32 or 64 bytes. // // Arguments: // isFloatingPointType - true if it is floating point type @@ -2882,7 +2882,14 @@ void LinearScan::SetContainsAVXFlags(unsigned sizeOfSIMDVector /* = 0*/) compiler->GetEmitter()->SetContainsAVX(true); if (sizeOfSIMDVector == 32) { - compiler->GetEmitter()->SetContains256bitAVX(true); + compiler->GetEmitter()->SetContains256bitOrMoreAVX(true); + } + } + if (compiler->canUseEvexEncoding()) + { + if (compiler->compExactlyDependsOn(InstructionSet_AVX512F) && (sizeOfSIMDVector == 64)) + { + compiler->GetEmitter()->SetContains256bitOrMoreAVX(true); } } } diff --git a/src/coreclr/jit/morphblock.cpp index 87670bf634909b..7f388dfa0a57d1 100644 --- a/src/coreclr/jit/morphblock.cpp +++ b/src/coreclr/jit/morphblock.cpp @@ -597,6 +597,7 @@ void MorphInitBlockHelper::TryInitFieldByField() case TYP_SIMD12: case TYP_SIMD16: case TYP_SIMD32: + case TYP_SIMD64: #endif // FEATURE_SIMD assert(initPattern == 0); src = m_comp->gtNewZeroConNode(fieldType); diff --git a/src/coreclr/jit/optcse.cpp index fa21ea5b8facdd..56bee9820bfb1e 100644 --- a/src/coreclr/jit/optcse.cpp +++ b/src/coreclr/jit/optcse.cpp @@ -2657,7 +2657,8 @@ class CSE_Heuristic // If we have a SIMD32 or SIMD64 that is live across a call we have even higher spill costs // - if (candidate->Expr()->TypeGet() == TYP_SIMD32) + if ((candidate->Expr()->TypeGet() == TYP_SIMD32) || + (candidate->Expr()->TypeGet() == TYP_SIMD64)) { // Additionally for a simd32 CSE candidate we assume that a second spill/restore will be needed.
// (to hold the upper half of the simd32 register that isn't preserved across the call) diff --git a/src/coreclr/jit/regset.h b/src/coreclr/jit/regset.h index 9c1a1041eecf87..ef93565c43e95a 100644 --- a/src/coreclr/jit/regset.h +++ b/src/coreclr/jit/regset.h @@ -222,7 +222,7 @@ class RegSet { #if defined(FEATURE_SIMD) #if defined(TARGET_XARCH) - TEMP_MAX_SIZE = YMM_REGSIZE_BYTES, + TEMP_MAX_SIZE = ZMM_REGSIZE_BYTES, #elif defined(TARGET_ARM64) TEMP_MAX_SIZE = FP_REGSIZE_BYTES, #endif // defined(TARGET_XARCH) || defined(TARGET_ARM64) diff --git a/src/coreclr/jit/scopeinfo.cpp b/src/coreclr/jit/scopeinfo.cpp index 97df205af41bb3..23ef8efc844209 100644 --- a/src/coreclr/jit/scopeinfo.cpp +++ b/src/coreclr/jit/scopeinfo.cpp @@ -290,6 +290,7 @@ void CodeGenInterface::siVarLoc::siFillStackVarLoc( case TYP_SIMD12: case TYP_SIMD16: case TYP_SIMD32: + case TYP_SIMD64: #endif #ifdef TARGET_64BIT case TYP_LONG: @@ -424,6 +425,7 @@ void CodeGenInterface::siVarLoc::siFillRegisterVarLoc( case TYP_SIMD12: case TYP_SIMD16: case TYP_SIMD32: + case TYP_SIMD64: this->vlType = VLT_REG_FP; // TODO-AMD64-Bug: ndp\clr\src\inc\corinfo.h has a definition of RegNum that only goes up to R15, diff --git a/src/coreclr/jit/simd.cpp b/src/coreclr/jit/simd.cpp index 323300a90426ba..7c95d1798d6c42 100644 --- a/src/coreclr/jit/simd.cpp +++ b/src/coreclr/jit/simd.cpp @@ -83,11 +83,16 @@ int Compiler::getSIMDTypeAlignment(var_types simdType) assert((size == 12) || (size == 16)); return 16; } - else + else if (size <= 32) { assert(size == 32); return 32; } + else + { + assert(size == 64); + return 64; + } #elif defined(TARGET_ARM64) // preferred alignment for 64-bit vectors is 8-bytes. // For everything else, 16-bytes. @@ -422,12 +427,87 @@ CorInfoType Compiler::getBaseJitTypeAndSizeOfSIMDType(CORINFO_CLASS_HANDLE typeH const size_t Vector64SizeBytes = 64 / 8; const size_t Vector128SizeBytes = 128 / 8; const size_t Vector256SizeBytes = 256 / 8; + const size_t Vector512SizeBytes = 512 / 8; #if defined(TARGET_XARCH) + static_assert_no_msg(ZMM_REGSIZE_BYTES == Vector512SizeBytes); static_assert_no_msg(YMM_REGSIZE_BYTES == Vector256SizeBytes); static_assert_no_msg(XMM_REGSIZE_BYTES == Vector128SizeBytes); - if (typeHnd == m_simdHandleCache->Vector256FloatHandle) + if (typeHnd == m_simdHandleCache->Vector512FloatHandle) + { + simdBaseJitType = CORINFO_TYPE_FLOAT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512DoubleHandle) + { + simdBaseJitType = CORINFO_TYPE_DOUBLE; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512IntHandle) + { + simdBaseJitType = CORINFO_TYPE_INT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512UIntHandle) + { + simdBaseJitType = CORINFO_TYPE_UINT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512ShortHandle) + { + simdBaseJitType = CORINFO_TYPE_SHORT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512UShortHandle) + { + simdBaseJitType = CORINFO_TYPE_USHORT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512ByteHandle) + { + simdBaseJitType = CORINFO_TYPE_BYTE; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == 
m_simdHandleCache->Vector512UByteHandle) + { + simdBaseJitType = CORINFO_TYPE_UBYTE; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512LongHandle) + { + simdBaseJitType = CORINFO_TYPE_LONG; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512ULongHandle) + { + simdBaseJitType = CORINFO_TYPE_ULONG; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512NIntHandle) + { + simdBaseJitType = CORINFO_TYPE_NATIVEINT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + else if (typeHnd == m_simdHandleCache->Vector512NUIntHandle) + { + simdBaseJitType = CORINFO_TYPE_NATIVEUINT; + size = Vector512SizeBytes; + JITDUMP(" Known type Vector512\n"); + } + + else if (typeHnd == m_simdHandleCache->Vector256FloatHandle) { simdBaseJitType = CORINFO_TYPE_FLOAT; size = Vector256SizeBytes; @@ -663,7 +743,77 @@ CorInfoType Compiler::getBaseJitTypeAndSizeOfSIMDType(CORINFO_CLASS_HANDLE typeH getClassNameFromMetadata(baseTypeHnd, nullptr)); #if defined(TARGET_XARCH) - if (strcmp(className, "Vector256`1") == 0) + if (strcmp(className, "Vector512`1") == 0) + { + size = Vector512SizeBytes; + switch (type) + { + case CORINFO_TYPE_FLOAT: + m_simdHandleCache->Vector512FloatHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_FLOAT; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_DOUBLE: + m_simdHandleCache->Vector512DoubleHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_DOUBLE; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_INT: + m_simdHandleCache->Vector512IntHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_INT; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_UINT: + m_simdHandleCache->Vector512UIntHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_UINT; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_SHORT: + m_simdHandleCache->Vector512ShortHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_SHORT; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_USHORT: + m_simdHandleCache->Vector512UShortHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_USHORT; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_LONG: + m_simdHandleCache->Vector512LongHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_LONG; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_ULONG: + m_simdHandleCache->Vector512ULongHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_ULONG; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_UBYTE: + m_simdHandleCache->Vector512UByteHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_UBYTE; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_BYTE: + m_simdHandleCache->Vector512ByteHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_BYTE; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_NATIVEINT: + m_simdHandleCache->Vector512NIntHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_NATIVEINT; + JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + case CORINFO_TYPE_NATIVEUINT: + m_simdHandleCache->Vector512NUIntHandle = typeHnd; + simdBaseJitType = CORINFO_TYPE_NATIVEUINT; 
+ JITDUMP(" Found type Hardware Intrinsic SIMD Vector512\n"); + break; + + default: + JITDUMP(" Unknown Hardware Intrinsic SIMD Type Vector512\n"); + } + } + else if (strcmp(className, "Vector256`1") == 0) { size = Vector256SizeBytes; switch (type) @@ -888,6 +1038,11 @@ CorInfoType Compiler::getBaseJitTypeAndSizeOfSIMDType(CORINFO_CLASS_HANDLE typeH { simdBaseJitType = CORINFO_TYPE_UNDEF; } + if (size == ZMM_REGSIZE_BYTES && (simdBaseJitType != CORINFO_TYPE_UNDEF) && + !compExactlyDependsOn(InstructionSet_AVX512F)) + { + simdBaseJitType = CORINFO_TYPE_UNDEF; + } #endif // TARGET_XARCH } #endif // FEATURE_HW_INTRINSICS @@ -916,6 +1071,9 @@ CorInfoType Compiler::getBaseJitTypeAndSizeOfSIMDType(CORINFO_CLASS_HANDLE typeH case 32: pCanonicalHnd = &m_simdHandleCache->CanonicalSimd32Handle; break; + case 64: + pCanonicalHnd = &m_simdHandleCache->CanonicalSimd64Handle; + break; default: unreached(); } diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index cd5db6239fe785..3b48ab57faad5d 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -180,7 +180,7 @@ struct simd64_t (u64[6] == other.u64[6]) && (u64[7] == other.u64[7]); } - bool operator!=(const simd32_t& other) const + bool operator!=(const simd64_t& other) const { return (u64[0] != other.u64[0]) || (u64[1] != other.u64[1]) || (u64[2] != other.u64[2]) || (u64[3] != other.u64[3]) || (u64[4] != other.u64[4]) || (u64[5] != other.u64[5]) || diff --git a/src/coreclr/jit/simdashwintrinsiclistxarch.h b/src/coreclr/jit/simdashwintrinsiclistxarch.h index 2afa558e123fe2..397483a98c12bd 100644 --- a/src/coreclr/jit/simdashwintrinsiclistxarch.h +++ b/src/coreclr/jit/simdashwintrinsiclistxarch.h @@ -387,6 +387,13 @@ SIMD_AS_HWINTRINSIC_ID(VectorT256, WidenUpper, SIMD_AS_HWINTRINSIC_ID(VectorT256, WithElement, 3, {NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement, NI_VectorT256_WithElement}, SimdAsHWIntrinsicFlag::None) SIMD_AS_HWINTRINSIC_ID(VectorT256, Xor, 2, {NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor, NI_VectorT256_Xor}, SimdAsHWIntrinsicFlag::None) +// ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* +// ISA ID Name NumArg Instructions Flags +// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} +// 
************************************************************************************************************************************************************************************************************************************************************** // Vector<T> Intrinsics SIMD_AS_HWINTRINSIC_ID(VectorT512, get_Zero, 0, {NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero, NI_VectorT512_get_Zero}, SimdAsHWIntrinsicFlag::None) + #undef SIMD_AS_HWINTRINSIC_NM #undef SIMD_AS_HWINTRINSIC_ID diff --git a/src/coreclr/jit/utils.cpp index bf5181be76a0a1..ec0a0202b0b0fb 100644 --- a/src/coreclr/jit/utils.cpp +++ b/src/coreclr/jit/utils.cpp @@ -244,13 +244,21 @@ const char* getRegNameFloat(regNumber reg, var_types type) #ifdef FEATURE_SIMD static const char* regNamesYMM[] = { #define REGDEF(name, rnum, mask, sname) "y" sname, +#include "register.h" + }; + static const char* regNamesZMM[] = { +#define REGDEF(name, rnum, mask, sname) "z" sname, #include "register.h" }; #endif // FEATURE_SIMD assert((unsigned)reg < ArrLen(regNamesFloat)); #ifdef FEATURE_SIMD - if (type == TYP_SIMD32) + if (type == TYP_SIMD64) + { + return regNamesZMM[reg]; + } + else if (type == TYP_SIMD32) { return regNamesYMM[reg]; } diff --git a/src/coreclr/jit/valuenum.cpp index c43bb47a36f8d4..1d1fb870de1849 100644 --- a/src/coreclr/jit/valuenum.cpp +++ b/src/coreclr/jit/valuenum.cpp @@ -445,6 +445,7 @@ ValueNumStore::ValueNumStore(Compiler* comp, CompAllocator alloc) , m_simd12CnsMap(nullptr) , m_simd16CnsMap(nullptr) , m_simd32CnsMap(nullptr) + , m_simd64CnsMap(nullptr) #endif // FEATURE_SIMD , m_VNFunc0Map(nullptr) , m_VNFunc1Map(nullptr) @@ -1699,6 +1700,12 @@ ValueNumStore::Chunk::Chunk(CompAllocator alloc, ValueNum* pNextBaseVN, var_type m_defs = new (alloc) Alloc<TYP_SIMD32>::Type[ChunkSize]; break; } + + case TYP_SIMD64: + { + m_defs = new (alloc) Alloc<TYP_SIMD64>::Type[ChunkSize]; + break; + } #endif // FEATURE_SIMD default: @@ -1856,6 +1863,11 @@ ValueNum ValueNumStore::VNForSimd32Con(simd32_t cnsVal) { return VnForConst(cnsVal, GetSimd32CnsMap(), TYP_SIMD32); } + +ValueNum ValueNumStore::VNForSimd64Con(simd64_t cnsVal) +{ + return VnForConst(cnsVal, GetSimd64CnsMap(), TYP_SIMD64); +} #endif // FEATURE_SIMD ValueNum ValueNumStore::VNForCastOper(var_types castToType, bool srcIsUnsigned) @@ -1958,6 +1970,11 @@ ValueNum ValueNumStore::VNZeroForType(var_types typ) { return VNForSimd32Con({}); } + + case TYP_SIMD64: + { + return VNForSimd64Con({}); + } #endif // FEATURE_SIMD // These should be unreached. @@ -3179,6 +3196,16 @@ simd32_t ValueNumStore::GetConstantSimd32(ValueNum argVN) return ConstantValue<simd32_t>(argVN); } + +// Given a simd64 constant value number return its value as a simd64.
+// +simd64_t ValueNumStore::GetConstantSimd64(ValueNum argVN) +{ + assert(IsVNConstant(argVN)); + assert(TypeOfVN(argVN) == TYP_SIMD64); + + return ConstantValue<simd64_t>(argVN); +} #endif // FEATURE_SIMD // Compute the proper value number when the VNFunc has all constant arguments @@ -7297,6 +7324,16 @@ void ValueNumStore::vnDump(Compiler* comp, ValueNum vn, bool isPtr) cnsVal.u64[2], cnsVal.u64[3]); break; } + + case TYP_SIMD64: + { + simd64_t cnsVal = GetConstantSimd64(vn); + printf( + "Simd64Cns[0x%016llx, 0x%016llx, 0x%016llx, 0x%016llx, 0x%016llx, 0x%016llx, 0x%016llx, 0x%016llx]", + cnsVal.u64[0], cnsVal.u64[1], cnsVal.u64[2], cnsVal.u64[3], cnsVal.u64[4], cnsVal.u64[5], + cnsVal.u64[6], cnsVal.u64[7]); + break; + } #endif // FEATURE_SIMD // These should be unreached. @@ -8795,6 +8832,10 @@ void Compiler::fgValueNumberTreeConst(GenTree* tree) case TYP_SIMD32: tree->gtVNPair.SetBoth(vnStore->VNForSimd32Con(tree->AsVecCon()->gtSimd32Val)); break; + + case TYP_SIMD64: + tree->gtVNPair.SetBoth(vnStore->VNForSimd64Con(tree->AsVecCon()->gtSimd64Val)); + break; #endif // FEATURE_SIMD case TYP_FLOAT: diff --git a/src/coreclr/jit/valuenum.h index 5a122070ffbbf4..e7ce13d7004fd4 100644 --- a/src/coreclr/jit/valuenum.h +++ b/src/coreclr/jit/valuenum.h @@ -353,6 +353,7 @@ class ValueNumStore simd12_t GetConstantSimd12(ValueNum argVN); simd16_t GetConstantSimd16(ValueNum argVN); simd32_t GetConstantSimd32(ValueNum argVN); + simd64_t GetConstantSimd64(ValueNum argVN); private: #endif // FEATURE_SIMD @@ -436,6 +437,7 @@ class ValueNumStore ValueNum VNForSimd12Con(simd12_t cnsVal); ValueNum VNForSimd16Con(simd16_t cnsVal); ValueNum VNForSimd32Con(simd32_t cnsVal); + ValueNum VNForSimd64Con(simd64_t cnsVal); #endif // FEATURE_SIMD #ifdef TARGET_64BIT @@ -1633,6 +1635,50 @@ class ValueNumStore } return m_simd32CnsMap; } + + struct Simd64PrimitiveKeyFuncs : public JitKeyFuncsDefEquals<simd64_t> + { + static bool Equals(simd64_t x, simd64_t y) + { + return x == y; + } + + static unsigned GetHashCode(const simd64_t val) + { + unsigned hash = 0; + + hash = static_cast<unsigned>(hash ^ val.u32[0]); + hash = static_cast<unsigned>(hash ^ val.u32[1]); + hash = static_cast<unsigned>(hash ^ val.u32[2]); + hash = static_cast<unsigned>(hash ^ val.u32[3]); + hash = static_cast<unsigned>(hash ^ val.u32[4]); + hash = static_cast<unsigned>(hash ^ val.u32[5]); + hash = static_cast<unsigned>(hash ^ val.u32[6]); + hash = static_cast<unsigned>(hash ^ val.u32[7]); + hash = static_cast<unsigned>(hash ^ val.u32[8]); + hash = static_cast<unsigned>(hash ^ val.u32[9]); + hash = static_cast<unsigned>(hash ^ val.u32[10]); + hash = static_cast<unsigned>(hash ^ val.u32[11]); + hash = static_cast<unsigned>(hash ^ val.u32[12]); + hash = static_cast<unsigned>(hash ^ val.u32[13]); + hash = static_cast<unsigned>(hash ^ val.u32[14]); + hash = static_cast<unsigned>(hash ^ val.u32[15]); + + return hash; + } + }; + + typedef VNMap<simd64_t, Simd64PrimitiveKeyFuncs> Simd64ToValueNumMap; + Simd64ToValueNumMap* m_simd64CnsMap; + Simd64ToValueNumMap* GetSimd64CnsMap() + { + if (m_simd64CnsMap == nullptr) + { + m_simd64CnsMap = new (m_alloc) Simd64ToValueNumMap(m_alloc); + } + return m_simd64CnsMap; + } + #endif // FEATURE_SIMD template <typename T> @@ -1779,6 +1825,12 @@ struct ValueNumStore::VarTypConv typedef simd32_t Type; typedef simd32_t Lang; }; +template <> +struct ValueNumStore::VarTypConv<TYP_SIMD64> +{ + typedef simd64_t Type; + typedef simd64_t Lang; +}; #endif // FEATURE_SIMD template <> @@ -1847,6 +1899,13 @@ FORCEINLINE simd32_t ValueNumStore::SafeGetConstantValue(Chunk* c, uns return reinterpret_cast<VarTypConv<TYP_SIMD32>::Lang*>(c->m_defs)[offset]; } +template <> +FORCEINLINE simd64_t ValueNumStore::SafeGetConstantValue<simd64_t>(Chunk* c, unsigned offset) +{ +
assert(c->m_typ == TYP_SIMD64); + return reinterpret_cast<VarTypConv<TYP_SIMD64>::Lang*>(c->m_defs)[offset]; +} + template <> FORCEINLINE simd8_t ValueNumStore::ConstantValueInternal<simd8_t>(ValueNum vn DEBUGARG(bool coerce)) { @@ -1902,6 +1961,20 @@ FORCEINLINE simd32_t ValueNumStore::ConstantValueInternal(ValueNum vn return SafeGetConstantValue<simd32_t>(c, offset); } + +template <> +FORCEINLINE simd64_t ValueNumStore::ConstantValueInternal<simd64_t>(ValueNum vn DEBUGARG(bool coerce)) +{ + Chunk* c = m_chunks.GetNoExpand(GetChunkNum(vn)); + assert(c->m_attribs == CEA_Const); + + unsigned offset = ChunkOffset(vn); + + assert(c->m_typ == TYP_SIMD64); + assert(!coerce); + + return SafeGetConstantValue<simd64_t>(c, offset); +} #endif // FEATURE_SIMD // Inline functions. diff --git a/src/coreclr/vm/jitinterface.cpp index 4380be6cf2c310..fd3022a5c236d5 100644 --- a/src/coreclr/vm/jitinterface.cpp +++ b/src/coreclr/vm/jitinterface.cpp @@ -11969,11 +11969,7 @@ void CEEJitInfo::allocMem (AllocMemArgs *pArgs) { size_t codeAlignment = sizeof(void*); - if ((pArgs->flag & CORJIT_ALLOCMEM_FLG_64BYTE_ALIGN) != 0) - { - codeAlignment = 64; - } - else if ((pArgs->flag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0) + if ((pArgs->flag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0) { codeAlignment = 32; } From edcaa623ff138f3e7db85617d0e5d729f9ab89d9 Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Mon, 13 Feb 2023 10:06:06 -0800 Subject: [PATCH 03/12] Lowering + workarounds for simd64 constants. --- src/coreclr/jit/assertionprop.cpp | 2 + src/coreclr/jit/codegenxarch.cpp | 7 +- src/coreclr/jit/emit.cpp | 40 +++- src/coreclr/jit/emit.h | 12 +- src/coreclr/jit/emitxarch.cpp | 10 +- src/coreclr/jit/emitxarch.h | 5 + src/coreclr/jit/gentree.cpp | 233 +++++++++++++++++++- src/coreclr/jit/gentree.h | 13 ++ src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 1 + src/coreclr/jit/hwintrinsiclistxarch.h | 4 +- src/coreclr/jit/hwintrinsicxarch.cpp | 115 +++++++++- src/coreclr/jit/importervectorization.cpp | 5 +- src/coreclr/jit/instr.cpp | 26 +-- src/coreclr/jit/instrsxarch.h | 6 + src/coreclr/jit/lowerxarch.cpp | 200 +++++++++++++++++-- src/coreclr/jit/optcse.cpp | 3 +- src/coreclr/jit/valuenum.cpp | 4 +- 17 files changed, 617 insertions(+), 69 deletions(-) diff --git a/src/coreclr/jit/assertionprop.cpp index 3a91b5a2e11bd5..3da34ff667802e 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -3221,6 +3221,7 @@ GenTree* Compiler::optVNConstantPropOnTree(BasicBlock* block, GenTree* tree) } case TYP_SIMD32: + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. { simd32_t value = vnStore->ConstantValue<simd32_t>(vnCns); @@ -3231,6 +3232,7 @@ GenTree* Compiler::optVNConstantPropOnTree(BasicBlock* block, GenTree* tree) break; } break; + #endif // FEATURE_SIMD case TYP_BYREF: diff --git a/src/coreclr/jit/codegenxarch.cpp index 3175f9548f1bba..e5fabda24f5f49 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -569,8 +569,11 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre case TYP_SIMD64: { - simd64_t constValue = vecCon->gtSimd64Val; - CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(constValue); + simd64_t constValue; + // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val.
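+                // Stopgap: the recorded 32-byte payload is mirrored into both 256-bit halves
+                // below, so a 64-byte constant only round-trips when its two halves are identical.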
+ constValue.v256[0] = vecCon->gtSimd32Val; + constValue.v256[1] = vecCon->gtSimd32Val; + CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(constValue); emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0); break; diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 1749c6cc20afc0..55049b1fda11dd 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -2612,15 +2612,11 @@ void emitter::emitSetFrameRangeArgs(int offsLo, int offsHi) */ const emitter::opSize emitter::emitSizeEncode[] = { - emitter::OPSZ1, emitter::OPSZ2, OPSIZE_INVALID, emitter::OPSZ4, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, - emitter::OPSZ8, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, - OPSIZE_INVALID, emitter::OPSZ16, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, - OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, - OPSIZE_INVALID, OPSIZE_INVALID, OPSIZE_INVALID, emitter::OPSZ32, + emitter::OPSZ1, emitter::OPSZ2, emitter::OPSZ4, emitter::OPSZ8, emitter::OPSZ16, emitter::OPSZ32, emitter::OPSZ64, }; -const emitAttr emitter::emitSizeDecode[emitter::OPSZ_COUNT] = {EA_1BYTE, EA_2BYTE, EA_4BYTE, - EA_8BYTE, EA_16BYTE, EA_32BYTE}; +const emitAttr emitter::emitSizeDecode[emitter::OPSZ_COUNT] = {EA_1BYTE, EA_2BYTE, EA_4BYTE, EA_8BYTE, + EA_16BYTE, EA_32BYTE, EA_64BYTE}; /***************************************************************************** * @@ -6548,7 +6544,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, coldCodeBlock = nullptr; - // This restricts the data alignment to: 4, 8, 16, or 32 bytes + // This restricts the data alignment to: 4, 8, 16, 32 or 64 bytes // Alignments greater than 32 would require VM support in ICorJitInfo::allocMem uint32_t dataAlignment = emitConsDsc.alignment; assert((dataSection::MIN_DATA_ALIGN <= dataAlignment) && (dataAlignment <= dataSection::MAX_DATA_ALIGN) && @@ -6629,6 +6625,10 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, { allocMemFlagDataAlign = CORJIT_ALLOCMEM_FLG_RODATA_32BYTE_ALIGN; } + else if (dataAlignment == 64) + { + allocMemFlagDataAlign = CORJIT_ALLOCMEM_FLG_RODATA_64BYTE_ALIGN; + } CorJitAllocMemFlag allocMemFlag = static_cast(allocMemFlagCodeAlign | allocMemFlagDataAlign); @@ -7976,6 +7976,30 @@ CORINFO_FIELD_HANDLE emitter::emitSimd32Const(simd32_t constValue) #endif // !FEATURE_SIMD } +CORINFO_FIELD_HANDLE emitter::emitSimd64Const(simd64_t constValue) +{ + // Access to inline data is 'abstracted' by a special type of static member + // (produced by eeFindJitDataOffs) which the emitter recognizes as being a reference + // to constant data, not a real static field. + CLANG_FORMAT_COMMENT_ANCHOR; + +#if defined(FEATURE_SIMD) + unsigned cnsSize = 64; + unsigned cnsAlign = cnsSize; + +#ifdef TARGET_XARCH + if (emitComp->compCodeOpt() == Compiler::SMALL_CODE) + { + cnsAlign = dataSection::MIN_DATA_ALIGN; + } +#endif // TARGET_XARCH + UNATIVE_OFFSET cnum = emitDataConst(&constValue, cnsSize, cnsAlign, TYP_SIMD64); + return emitComp->eeFindJitDataOffs(cnum); +#else + unreached(); +#endif // !FEATURE_SIMD +} + /***************************************************************************** * * Output the given data section at the specified address. 
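Aside: the emitSizeEncode/emitEncodeSize rework above (and the emitEncodeScale change in patch 02) leans on every emitAttr size being a power of two, so the dense opSize index is simply log2 of the byte size, with EA_64BYTE mapping to the new OPSZ64. A minimal standalone C++ sketch of that mapping, assuming only the enum values shown in these patches (log2u is a hypothetical stand-in for the JIT's genLog2):

#include <cassert>

// Mirrors emitAttr: sizes are powers of two, 1..64 bytes.
enum emitAttrSketch : unsigned { EA_1BYTE = 0x001, EA_2BYTE = 0x002, EA_4BYTE = 0x004, EA_8BYTE = 0x008,
                                 EA_16BYTE = 0x010, EA_32BYTE = 0x020, EA_64BYTE = 0x040 };
// Mirrors opSize: a dense 0..6 encoding with no OPSIZE_INVALID holes.
enum opSizeSketch : unsigned { OPSZ1, OPSZ2, OPSZ4, OPSZ8, OPSZ16, OPSZ32, OPSZ64, OPSZ_COUNT };

// Hypothetical stand-in for genLog2: bit index of the single set bit.
static unsigned log2u(unsigned v)
{
    unsigned r = 0;
    while (v >>= 1)
    {
        r++;
    }
    return r;
}

static opSizeSketch encodeSize(emitAttrSketch size)
{
    // Same precondition the emitter asserts: a power-of-two size up to 64 bytes.
    assert(((size & (size - 1)) == 0) && (size <= EA_64BYTE));
    return static_cast<opSizeSketch>(log2u(size)); // EA_1BYTE -> OPSZ1 (0) ... EA_64BYTE -> OPSZ64 (6)
}

int main()
{
    assert(encodeSize(EA_1BYTE) == OPSZ1);
    assert(encodeSize(EA_16BYTE) == OPSZ16);
    assert(encodeSize(EA_64BYTE) == OPSZ64);
    return 0;
}

This is why the old 32-entry emitSizeEncode table with OPSIZE_INVALID holes could collapse to the seven entries indexed by genLog2 shown above.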
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 6fdbd8bf5abd9f..d037a32ac3fe25 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -513,7 +513,8 @@ class emitter OPSZ8 = 3, OPSZ16 = 4, OPSZ32 = 5, - OPSZ_COUNT = 6, + OPSZ64 = 6, + OPSZ_COUNT = 7, #ifdef TARGET_AMD64 OPSZP = OPSZ8, #else @@ -2061,6 +2062,7 @@ class emitter CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue); CORINFO_FIELD_HANDLE emitSimd16Const(simd16_t constValue); CORINFO_FIELD_HANDLE emitSimd32Const(simd32_t constValue); + CORINFO_FIELD_HANDLE emitSimd64Const(simd64_t constValue); regNumber emitInsBinary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src); regNumber emitInsTernary(instruction ins, emitAttr attr, GenTree* dst, GenTree* src1, GenTree* src2); void emitInsLoadInd(instruction ins, emitAttr attr, regNumber dstReg, GenTreeIndir* mem); @@ -2674,11 +2676,11 @@ class emitter struct dataSection { - // Note to use alignments greater than 32 requires modification in the VM + // Note to use alignments greater than 64 requires modification in the VM // to support larger alignments (see ICorJitInfo::allocMem) // const static unsigned MIN_DATA_ALIGN = 4; - const static unsigned MAX_DATA_ALIGN = 32; + const static unsigned MAX_DATA_ALIGN = 64; enum sectionType { @@ -2989,9 +2991,9 @@ inline emitAttr emitActualTypeSize(T type) /* static */ inline emitter::opSize emitter::emitEncodeSize(emitAttr size) { assert(size == EA_1BYTE || size == EA_2BYTE || size == EA_4BYTE || size == EA_8BYTE || size == EA_16BYTE || - size == EA_32BYTE); + size == EA_32BYTE || size == EA_64BYTE); - return emitSizeEncode[((int)size) - 1]; + return emitSizeEncode[genLog2(size)]; } /* static */ inline emitAttr emitter::emitDecodeSize(emitter::opSize ensz) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index de1f07a923a876..44eb1f5305ca70 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -7660,7 +7660,7 @@ void emitter::emitIns_ARX_R( fmt = emitInsModeFormat(ins, IF_ARD_RRD); noway_assert(emitVerifyEncodable(ins, EA_SIZE(attr), reg)); - assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_32BYTE)); + assert(!CodeGen::instIsFP(ins) && (EA_SIZE(attr) <= EA_64BYTE)); id->idReg1(reg); } @@ -10903,13 +10903,18 @@ void emitter::emitDispIns( } case IF_RWR_RRD_RRD_CNS: - assert(IsVexEncodedInstruction(ins)); + assert(IsVexOrEvexEncodedInstruction(ins)); assert(IsThreeOperandAVXInstruction(ins)); printf("%s, ", emitRegName(id->idReg1(), attr)); printf("%s, ", emitRegName(id->idReg2(), attr)); switch (ins) { + case INS_vinsert64x4: + { + attr = EA_32BYTE; + break; + } case INS_vinsertf128: case INS_vinserti128: { @@ -17625,6 +17630,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vextracti128: case INS_vinsertf128: case INS_vinserti128: + case INS_vinsert64x4: result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_3C; break; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 3bcc4f4c7f6ed1..7f52930d0590a8 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -283,6 +283,9 @@ bool IsWEvexOpcodeExtension(const instrDesc* id) case INS_vfnmsub231sd: case INS_unpcklpd: case INS_vpermilpdvar: + case INS_movdqu16: + case INS_movdqu64: + case INS_vinsert64x4: { return true; // W1 } @@ -396,6 +399,8 @@ bool IsWEvexOpcodeExtension(const instrDesc* id) case INS_vpdpbusds: case INS_vpdpwssds: case INS_vpermilpsvar: + case INS_movdqu8: + 
case INS_movdqu32: { return false; // W0 } diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 16fe9b2f19cc05..5367097ef8ec10 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -3043,6 +3043,7 @@ unsigned Compiler::gtHashValue(GenTree* tree) switch (vecCon->TypeGet()) { #if defined(FEATURE_SIMD) + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. case TYP_SIMD32: { add = genTreeHashAdd(ulo32(add), vecCon->gtSimd32Val.u32[7]); @@ -7264,6 +7265,7 @@ GenTree* Compiler::gtNewAllBitsSetConNode(var_types type) case TYP_SIMD12: case TYP_SIMD16: case TYP_SIMD32: + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. allBitsSet = gtNewVconNode(type); allBitsSet->AsVecCon()->gtSimd32Val.i64[0] = -1; allBitsSet->AsVecCon()->gtSimd32Val.i64[1] = -1; @@ -7305,6 +7307,7 @@ GenTree* Compiler::gtNewZeroConNode(var_types type) case TYP_SIMD12: case TYP_SIMD16: case TYP_SIMD32: + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. { zero = gtNewVconNode(type); zero->AsVecCon()->gtSimd32Val = {}; @@ -7345,6 +7348,7 @@ GenTree* Compiler::gtNewOneConNode(var_types type, var_types simdBaseType /* = T case TYP_SIMD12: case TYP_SIMD16: case TYP_SIMD32: + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. { GenTreeVecCon* vecCon = gtNewVconNode(type); @@ -11549,12 +11553,14 @@ void Compiler::gtDispConst(GenTree* tree) } case TYP_SIMD32: + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. { simd32_t simdVal = vecCon->gtSimd32Val; printf("<0x%016llx, 0x%016llx, 0x%016llx, 0x%016llx>", simdVal.u64[0], simdVal.u64[1], simdVal.u64[2], simdVal.u64[3]); break; } + #endif // FEATURE_SIMD default: @@ -17511,6 +17517,34 @@ bool GenTreeVecCon::IsHWIntrinsicCreateConstant(GenTreeHWIntrinsic* node, simd32 return argCnt == cnsArgCnt; } +#if defined(TARGET_XARCH) + case NI_Vector512_Create: + { + // Zero out the simd32Val + simd32Val = {}; + + // These intrinsics are meant to set the same value to every element. + if ((argCnt == 1) && HandleArgForHWIntrinsicCreate(node->Op(1), 0, simd32Val, simdBaseType)) + { + cnsArgCnt = 1; + } + else + { + for (unsigned i = 1; i <= argCnt / 2; i++) + { + if (HandleArgForHWIntrinsicCreate(node->Op(i), i - 1, simd32Val, simdBaseType)) + { + cnsArgCnt++; + } + } + } + + assert((argCnt == 1) || (argCnt == (simdSize / genTypeSize(simdBaseType)))); + return argCnt == cnsArgCnt; + } + +#endif + default: { return false; @@ -17652,6 +17686,198 @@ bool GenTreeVecCon::HandleArgForHWIntrinsicCreate(GenTree* arg, int argIdx, simd return false; } + +// TODO-XArch-AVX512: Keep only one implementation once GenTreeVecCon supports gtSimd64Val. 
+#if defined(TARGET_XARCH)
+//----------------------------------------------------------------------------------------------
+// IsHWIntrinsicCreateConstant: Determines if a HWIntrinsic node represents a vector constant
+//
+//  Arguments:
+//     node      - The node to check
+//     simd64Val - The vector constant being constructed
+//
+//  Returns:
+//     true if node represents a constant; otherwise, false
+bool GenTreeVecCon::IsHWIntrinsicCreateConstant(GenTreeHWIntrinsic* node, simd64_t& simd64Val)
+{
+    NamedIntrinsic intrinsic    = node->GetHWIntrinsicId();
+    var_types      simdType     = node->TypeGet();
+    var_types      simdBaseType = node->GetSimdBaseType();
+    unsigned       simdSize     = node->GetSimdSize();
+
+    size_t argCnt    = node->GetOperandCount();
+    size_t cnsArgCnt = 0;
+
+    switch (intrinsic)
+    {
+        case NI_Vector512_Create:
+        {
+            // Zero out the simd64Val
+            simd64Val = {};
+
+            // These intrinsics are meant to set the same value to every element.
+            if ((argCnt == 1) && HandleArgForHWIntrinsicCreate(node->Op(1), 0, simd64Val, simdBaseType))
+            {
+                cnsArgCnt = 1;
+            }
+            else
+            {
+                for (unsigned i = 1; i <= argCnt; i++)
+                {
+                    if (HandleArgForHWIntrinsicCreate(node->Op(i), i - 1, simd64Val, simdBaseType))
+                    {
+                        cnsArgCnt++;
+                    }
+                }
+            }
+
+            assert((argCnt == 1) || (argCnt == (simdSize / genTypeSize(simdBaseType))));
+            return argCnt == cnsArgCnt;
+        }
+
+        default:
+        {
+            return false;
+        }
+    }
+}
+
+//----------------------------------------------------------------------------------------------
+// HandleArgForHWIntrinsicCreate: Processes an argument for the GenTreeVecCon::IsHWIntrinsicCreateConstant method
+//
+//  Arguments:
+//     arg       - The argument to process
+//     argIdx    - The index of the argument being processed
+//     simd64Val - The vector constant being constructed
+//     baseType  - The base type of the vector constant
+//
+//  Returns:
+//     true if arg was a constant; otherwise, false
+bool GenTreeVecCon::HandleArgForHWIntrinsicCreate(GenTree* arg, int argIdx, simd64_t& simd64Val, var_types baseType)
+{
+    switch (baseType)
+    {
+        case TYP_BYTE:
+        case TYP_UBYTE:
+        {
+            if (arg->IsCnsIntOrI())
+            {
+                simd64Val.i8[argIdx] = static_cast<int8_t>(arg->AsIntCon()->gtIconVal);
+                return true;
+            }
+            else
+            {
+                // We expect the constant to have been already zeroed
+                assert(simd64Val.i8[argIdx] == 0);
+            }
+            break;
+        }
+
+        case TYP_SHORT:
+        case TYP_USHORT:
+        {
+            if (arg->IsCnsIntOrI())
+            {
+                simd64Val.i16[argIdx] = static_cast<int16_t>(arg->AsIntCon()->gtIconVal);
+                return true;
+            }
+            else
+            {
+                // We expect the constant to have been already zeroed
+                assert(simd64Val.i16[argIdx] == 0);
+            }
+            break;
+        }
+
+        case TYP_INT:
+        case TYP_UINT:
+        {
+            if (arg->IsCnsIntOrI())
+            {
+                simd64Val.i32[argIdx] = static_cast<int32_t>(arg->AsIntCon()->gtIconVal);
+                return true;
+            }
+            else
+            {
+                // We expect the constant to have been already zeroed
+                assert(simd64Val.i32[argIdx] == 0);
+            }
+            break;
+        }
+
+        case TYP_LONG:
+        case TYP_ULONG:
+        {
+#if defined(TARGET_64BIT)
+            if (arg->IsCnsIntOrI())
+            {
+                simd64Val.i64[argIdx] = static_cast<int64_t>(arg->AsIntCon()->gtIconVal);
+                return true;
+            }
+#else
+            if (arg->OperIsLong() && arg->AsOp()->gtOp1->IsCnsIntOrI() && arg->AsOp()->gtOp2->IsCnsIntOrI())
+            {
+                // 32-bit targets will decompose GT_CNS_LNG into two GT_CNS_INT
+                // We need to reconstruct the 64-bit value in order to handle this
+
+                INT64 gtLconVal = arg->AsOp()->gtOp2->AsIntCon()->gtIconVal;
+                gtLconVal <<= 32;
+                gtLconVal |= arg->AsOp()->gtOp1->AsIntCon()->gtIconVal;
+
+                simd64Val.i64[argIdx] = gtLconVal;
+                return true;
+            }
+#endif // TARGET_64BIT
+            else
+            {
+                // We expect the constant to have been already zeroed
+                assert(simd64Val.i64[argIdx] == 0);
+            }
+            break;
+        }
+
+        case TYP_FLOAT:
+        {
+            if (arg->IsCnsFltOrDbl())
+            {
+                simd64Val.f32[argIdx] = static_cast<float>(arg->AsDblCon()->DconValue());
+                return true;
+            }
+            else
+            {
+                // We expect the constant to have been already zeroed
+                // We check against the i32, rather than f32, to account for -0.0
+                assert(simd64Val.i32[argIdx] == 0);
+            }
+            break;
+        }
+
+        case TYP_DOUBLE:
+        {
+            if (arg->IsCnsFltOrDbl())
+            {
+                simd64Val.f64[argIdx] = static_cast<double>(arg->AsDblCon()->DconValue());
+                return true;
+            }
+            else
+            {
+                // We expect the constant to have been already zeroed
+                // We check against the i64, rather than f64, to account for -0.0
+                assert(simd64Val.i64[argIdx] == 0);
+            }
+            break;
+        }
+
+        default:
+        {
+            unreached();
+        }
+    }
+
+    return false;
+}
+#endif
 #endif // FEATURE_HW_INTRINSICS
 
 //------------------------------------------------------------------------
@@ -21211,7 +21437,11 @@ GenTree* Compiler::gtNewSimdCreateBroadcastNode(
     }
 #endif // TARGET_X86
 
-    if (simdSize == 32)
+    if (simdSize == 64)
+    {
+        hwIntrinsicID = NI_Vector512_Create;
+    }
+    else if (simdSize == 32)
     {
         hwIntrinsicID = NI_Vector256_Create;
     }
@@ -23860,6 +24090,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
             case NI_SSE41_ConvertToVector128Int64:
             case NI_AVX2_BroadcastScalarToVector128:
             case NI_AVX2_BroadcastScalarToVector256:
+            case NI_AVX512F_BroadcastScalarToVector512:
             case NI_AVX2_ConvertToVector256Int16:
             case NI_AVX2_ConvertToVector256Int32:
             case NI_AVX2_ConvertToVector256Int64:
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 243474c42eb92f..e32d7891e26bf6 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -3351,6 +3351,13 @@ struct GenTreeVecCon : public GenTree
     static bool IsHWIntrinsicCreateConstant(GenTreeHWIntrinsic* node, simd32_t& simd32Val);
 
     static bool HandleArgForHWIntrinsicCreate(GenTree* arg, int argIdx, simd32_t& simd32Val, var_types baseType);
+
+// TODO-XArch-AVX512: Keep only one implementation once GenTreeVecCon supports gtSimd64Val.
+#if defined(TARGET_XARCH)
+    static bool IsHWIntrinsicCreateConstant(GenTreeHWIntrinsic* node, simd64_t& simd64Val);
+
+    static bool HandleArgForHWIntrinsicCreate(GenTree* arg, int argIdx, simd64_t& simd64Val, var_types baseType);
+#endif
 #endif // FEATURE_HW_INTRINSICS
 
     bool IsAllBitsSet() const
@@ -3375,10 +3382,12 @@ struct GenTreeVecCon : public GenTree
             }
 
             case TYP_SIMD32:
+            case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val.
             {
                 return (gtSimd32Val.u64[0] == 0xFFFFFFFFFFFFFFFF) && (gtSimd32Val.u64[1] == 0xFFFFFFFFFFFFFFFF) &&
                        (gtSimd32Val.u64[2] == 0xFFFFFFFFFFFFFFFF) && (gtSimd32Val.u64[3] == 0xFFFFFFFFFFFFFFFF);
             }
+
 #endif // FEATURE_SIMD
 
             default:
@@ -3419,12 +3428,14 @@ struct GenTreeVecCon : public GenTree
             }
 
             case TYP_SIMD32:
+            case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val.
             {
                 return (left->gtSimd32Val.u64[0] == right->gtSimd32Val.u64[0]) &&
                        (left->gtSimd32Val.u64[1] == right->gtSimd32Val.u64[1]) &&
                        (left->gtSimd32Val.u64[2] == right->gtSimd32Val.u64[2]) &&
                        (left->gtSimd32Val.u64[3] == right->gtSimd32Val.u64[3]);
             }
+
 #endif // FEATURE_SIMD
 
             default:
@@ -3456,10 +3467,12 @@ struct GenTreeVecCon : public GenTree
             }
 
             case TYP_SIMD32:
+            case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val.
{ return (gtSimd32Val.u64[0] == 0x0000000000000000) && (gtSimd32Val.u64[1] == 0x0000000000000000) && (gtSimd32Val.u64[2] == 0x0000000000000000) && (gtSimd32Val.u64[3] == 0x0000000000000000); } + #endif // FEATURE_SIMD default: diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 364dca38f6e356..2b3454d5916fd6 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1120,6 +1120,7 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node) } case NI_Vector128_ToVector256Unsafe: + case NI_Vector256_ToVector512Unsafe: case NI_Vector256_GetLower: { if (op1->isContained() || op1->isUsedFromSpillTemp()) diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 4fc4ae3c5e3ef4..50c57113bc92f7 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -225,6 +225,7 @@ HARDWARE_INTRINSIC(Vector256, StoreUnsafe, HARDWARE_INTRINSIC(Vector256, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Sum, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, ToVector512Unsafe, 32, 1, {INS_movdqu8, INS_movdqu8, INS_movdqu16, INS_movdqu16, INS_movdqu32, INS_movdqu32, INS_movdqu64, INS_movdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, WidenLower, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, WidenUpper, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, WithElement, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) @@ -726,7 +727,8 @@ HARDWARE_INTRINSIC(AVX2, Xor, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // 
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512 Intrinsics -HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) +HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) +HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 22303b4ad12e42..d7431cc995b8d2 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -148,6 +148,10 @@ static CORINFO_InstructionSet lookupInstructionSet(const char* className) { return InstructionSet_Vector256; } + else if (strncmp(className, "Vector512", 9) == 0) + { + return InstructionSet_Vector512; + } } else if (strcmp(className, "Fma") == 0) { @@ -388,6 +392,7 @@ bool HWIntrinsicInfo::isFullyImplementedIsa(CORINFO_InstructionSet isa) case InstructionSet_SSE42_X64: case InstructionSet_Vector128: case InstructionSet_Vector256: + case InstructionSet_Vector512: // TODO-XArch-AVX512 : Not fully implemented currently. case InstructionSet_X86Base: case InstructionSet_X86Base_X64: case InstructionSet_X86Serialize: @@ -506,9 +511,10 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, CORINFO_InstructionSet isa = HWIntrinsicInfo::lookupIsa(intrinsic); - if ((isa == InstructionSet_Vector256) && !compExactlyDependsOn(InstructionSet_AVX)) + if (((isa == InstructionSet_Vector256) && !compExactlyDependsOn(InstructionSet_AVX)) || + (((isa == InstructionSet_Vector512) && !compExactlyDependsOn(InstructionSet_AVX512F)))) { - // We don't want to deal with TYP_SIMD32 if the compiler doesn't otherwise support the type. + // We don't want to deal with TYP_SIMD32 or TYP_SIMD64 if the compiler doesn't otherwise support the type. 
return nullptr; } @@ -1038,6 +1044,108 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } +#if defined(TARGET_X86) + if (varTypeIsLong(simdBaseType)) + { + // TODO-XARCH-CQ: It may be beneficial to emit the movq + // instruction, which takes a 64-bit memory address and + // works on 32-bit x86 systems. + break; + } +#endif // TARGET_X86 + + IntrinsicNodeBuilder nodeBuilder(getAllocator(CMK_ASTNode), sig->numArgs); + + // TODO-CQ: We don't handle contiguous args for anything except TYP_FLOAT today + + GenTree* prevArg = nullptr; + bool areArgsContiguous = (simdBaseType == TYP_FLOAT); + + for (int i = sig->numArgs - 1; i >= 0; i--) + { + GenTree* arg = impPopStack().val; + + if (areArgsContiguous) + { + if (prevArg != nullptr) + { + // Recall that we are popping the args off the stack in reverse order. + areArgsContiguous = areArgumentsContiguous(arg, prevArg); + } + + prevArg = arg; + } + + nodeBuilder.AddOperand(i, arg); + } + + if (areArgsContiguous) + { + op1 = nodeBuilder.GetOperand(0); + GenTree* op1Address = CreateAddressNodeForSimdHWIntrinsicCreate(op1, simdBaseType, simdSize); + retNode = gtNewOperNode(GT_IND, retType, op1Address); + } + else + { + retNode = + gtNewSimdHWIntrinsicNode(retType, std::move(nodeBuilder), intrinsic, simdBaseJitType, simdSize); + } + break; + } + + case NI_Vector512_Create: + { + if (sig->numArgs == 1) + { +#if defined(TARGET_X86) + if (varTypeIsLong(simdBaseType) && !impStackTop(0).val->IsIntegralConst()) + { + // TODO-XARCH-CQ: It may be beneficial to emit the movq + // instruction, which takes a 64-bit memory address and + // works on 32-bit x86 systems. + break; + } +#endif // TARGET_X86 + + op1 = impPopStack().val; + retNode = gtNewSimdCreateBroadcastNode(retType, op1, simdBaseJitType, simdSize); + break; + } + + uint32_t simdLength = getSIMDVectorLength(simdSize, simdBaseType); + assert(sig->numArgs == simdLength); + + bool isConstant = true; + + if (varTypeIsFloating(simdBaseType)) + { + for (uint32_t index = 0; index < sig->numArgs; index++) + { + GenTree* arg = impStackTop(index).val; + + if (!arg->IsCnsFltOrDbl()) + { + isConstant = false; + break; + } + } + } + else + { + assert(varTypeIsIntegral(simdBaseType)); + + for (uint32_t index = 0; index < sig->numArgs; index++) + { + GenTree* arg = impStackTop(index).val; + + if (!arg->IsIntegralConst()) + { + isConstant = false; + break; + } + } + } + #if defined(TARGET_X86) if (varTypeIsLong(simdBaseType)) { @@ -1276,7 +1384,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { simd32_t simd32Val = {}; - assert((simdSize == 16) || (simdSize == 32)); + assert((simdSize == 16) || (simdSize == 32) || (simdSize == 64)); simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? 
CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE; // We want to tightly pack the most significant byte of each short/ushort @@ -1406,6 +1514,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_get_Zero: case NI_Vector256_get_Zero: + case NI_Vector512_get_Zero: { assert(sig->numArgs == 0); retNode = gtNewZeroConNode(retType); diff --git a/src/coreclr/jit/importervectorization.cpp b/src/coreclr/jit/importervectorization.cpp index dc8c685e6915ef..c4f80cc6d20bd6 100644 --- a/src/coreclr/jit/importervectorization.cpp +++ b/src/coreclr/jit/importervectorization.cpp @@ -74,7 +74,7 @@ static bool ConvertToLowerCase(WCHAR* input, WCHAR* mask, int length) // // Arguments: // comp - Compiler object -// simdType - Vector type, either TYP_SIMD32 (xarch only) or TYP_SIMD16 +// simdType - Vector type, TYP_SIMD64 (xarch only), TYP_SIMD32 (xarch only) or TYP_SIMD16 // cns - Constant data // // Return Value: @@ -83,7 +83,8 @@ static bool ConvertToLowerCase(WCHAR* input, WCHAR* mask, int length) static GenTreeVecCon* CreateConstVector(Compiler* comp, var_types simdType, WCHAR* cns) { #ifdef TARGET_XARCH - if (simdType == TYP_SIMD32) + if ((simdType == TYP_SIMD32) || + (simdType == TYP_SIMD64)) // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. { simd32_t simd32Val = {}; GenTreeVecCon* vecCon = comp->gtNewVconNode(simdType); diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 91e018cbd8ac8b..425072f8cfd1c5 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -160,33 +160,21 @@ const char* CodeGen::genSizeStr(emitAttr attr) "", "byte ptr ", "word ptr ", - nullptr, "dword ptr ", - nullptr, - nullptr, - nullptr, "qword ptr ", - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, "xmmword ptr ", - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, - "ymmword ptr" + "ymmword ptr", + "zmmword ptr" }; // clang-format on unsigned size = EA_SIZE(attr); - assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8 || size == 16 || size == 32); + assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8 || size == 16 || size == 32 || size == 64); if (EA_ATTR(size) == attr) { - return sizes[size]; + return sizes[size > 0 ? genLog2(size) + 1 : size]; } else if (attr == EA_GCREF) { @@ -807,6 +795,12 @@ CodeGen::OperandDesc CodeGen::genOperandDesc(GenTree* op) simd32_t constValue = op->AsVecCon()->gtSimd32Val; return OperandDesc(emit->emitSimd32Const(constValue)); } + + case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. 
+ { + simd32_t constValue = op->AsVecCon()->gtSimd32Val; + return OperandDesc(emit->emitSimd32Const(constValue)); + } #endif // FEATURE_SIMD default: diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 13ed02d75c6ead..b50646bc17430d 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -630,6 +630,12 @@ INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) +INST3(movdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) +INST3(movdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) +INST3(movdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) +INST3(movdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) +INST3(vinsert32x8, "insert32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed floating point values +INST3(vinsert64x4, "insert64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed floating point values INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 1b7054b138130a..86e4d8f27def0a 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -1035,6 +1035,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node) case NI_Vector128_Create: case NI_Vector256_Create: + case NI_Vector512_Create: case NI_Vector128_CreateScalar: case NI_Vector256_CreateScalar: { @@ -1899,7 +1900,7 @@ void Lowering::LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node) } //---------------------------------------------------------------------------------------------- -// Lowering::LowerHWIntrinsicCreate: Lowers a Vector128 or Vector256 Create call +// Lowering::LowerHWIntrinsicCreate: Lowers a Vector128 or Vector256 or Vector512 Create call // // Arguments: // node - The hardware intrinsic node. 
@@ -1912,6 +1913,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) var_types simdBaseType = node->GetSimdBaseType(); unsigned simdSize = node->GetSimdSize(); simd32_t simd32Val = {}; + simd64_t simd64Val = {}; if ((simdSize == 8) && (simdType == TYP_DOUBLE)) { @@ -1933,44 +1935,109 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) GenTree* tmp2 = nullptr; GenTree* tmp3 = nullptr; - bool isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simd32Val); + bool isConstant = false; + if (simdSize != 64) + { + isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simd32Val); + } + else + { + isConstant = GenTreeVecCon::IsHWIntrinsicCreateConstant(node, simd64Val); + } bool isCreateScalar = (intrinsicId == NI_Vector128_CreateScalar) || (intrinsicId == NI_Vector256_CreateScalar); size_t argCnt = node->GetOperandCount(); if (isConstant) { - assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16) || (simdSize == 32)); - - for (GenTree* arg : node->Operands()) + if (simdSize != 64) { -#if !defined(TARGET_64BIT) - if (arg->OperIsLong()) + assert((simdSize == 8) || (simdSize == 12) || (simdSize == 16) || (simdSize == 32)); + + for (GenTree* arg : node->Operands()) { - BlockRange().Remove(arg->AsOp()->gtGetOp1()); - BlockRange().Remove(arg->AsOp()->gtGetOp2()); - } +#if !defined(TARGET_64BIT) + if (arg->OperIsLong()) + { + BlockRange().Remove(arg->AsOp()->gtGetOp1()); + BlockRange().Remove(arg->AsOp()->gtGetOp2()); + } #endif // !TARGET_64BIT - BlockRange().Remove(arg); - } + BlockRange().Remove(arg); + } - GenTreeVecCon* vecCon = comp->gtNewVconNode(simdType); + GenTreeVecCon* vecCon = comp->gtNewVconNode(simdType); - vecCon->gtSimd32Val = simd32Val; - BlockRange().InsertBefore(node, vecCon); + vecCon->gtSimd32Val = simd32Val; + BlockRange().InsertBefore(node, vecCon); - LIR::Use use; - if (BlockRange().TryGetUse(node, &use)) - { - use.ReplaceWith(vecCon); + LIR::Use use; + if (BlockRange().TryGetUse(node, &use)) + { + use.ReplaceWith(vecCon); + } + else + { + vecCon->SetUnusedValue(); + } + + BlockRange().Remove(node); + + return LowerNode(vecCon); } else { - vecCon->SetUnusedValue(); - } + for (GenTree* arg : node->Operands()) + { +#if !defined(TARGET_64BIT) + if (arg->OperIsLong()) + { + BlockRange().Remove(arg->AsOp()->gtGetOp1()); + BlockRange().Remove(arg->AsOp()->gtGetOp2()); + } +#endif // !TARGET_64BIT + BlockRange().Remove(arg); + } - BlockRange().Remove(node); + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); + + // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val. + // We will be constructing the following parts: + // /--* op1 T + // +--* ... T + // lo = * HWINTRINSIC simd32 T Create + // /--* ... 
+            //          +--*  opN  T
+            //   hi   = *  HWINTRINSIC   simd32 T Create
+            //          /--*  lo   simd32
+            //   tmp1 = *  HWINTRINSIC   simd64 T ToVector512Unsafe
+            //   idx  =    CNS_INT       int    1
+            //          /--*  tmp1 simd64
+            //          +--*  hi   simd32
+            //          +--*  idx  int
+            //   node = *  HWINTRINSIC   simd64 T InsertVector256

-        return LowerNode(vecCon);
+            GenTreeVecCon* vecCon0 = comp->gtNewVconNode(TYP_SIMD32);
+            vecCon0->gtSimd32Val   = simd64Val.v256[0];
+            BlockRange().InsertBefore(node, vecCon0);
+            LowerNode(vecCon0);
+            GenTreeVecCon* vecCon1 = comp->gtNewVconNode(TYP_SIMD32);
+            vecCon1->gtSimd32Val   = simd64Val.v256[1];
+            BlockRange().InsertAfter(vecCon0, vecCon1);
+
+            tmp1 = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD64, vecCon0, NI_Vector256_ToVector512Unsafe, simdBaseJitType,
+                                                  32);
+            BlockRange().InsertAfter(vecCon1, tmp1);
+
+            idx = comp->gtNewIconNode(0x01, TYP_INT);
+            BlockRange().InsertAfter(tmp1, idx);
+
+            node->ResetHWIntrinsicId(NI_AVX512F_InsertVector256, comp, tmp1, vecCon1, idx);
+
+            LowerNode(vecCon1);
+            LowerNode(idx);
+            LowerNode(tmp1);
+            return LowerNode(node);
+        }
     }
     else if (argCnt == 1)
     {
@@ -2124,6 +2191,31 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
         return LowerNode(node);
     }
 
+    // We have the following (where simd is simd16, simd32 or simd64):
+    //          /--*  op1  T
+    //   node = *  HWINTRINSIC   simd   T Create
+
+    if (intrinsicId == NI_Vector512_Create)
+    {
+        assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F));
+        // We will be constructing the following parts:
+        //          /--*  op1  T
+        //   tmp1 = *  HWINTRINSIC   simd16 T CreateScalarUnsafe
+        //          /--*  tmp1 simd16
+        //   node = *  HWINTRINSIC   simd64 T BroadcastScalarToVector512
+
+        // This is roughly the following managed code:
+        //   var tmp1 = Vector128.CreateScalarUnsafe(op1);
+        //   return Avx512F.BroadcastScalarToVector512(tmp1);
+
+        tmp1 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op1, simdBaseJitType, 16);
+        LowerNode(tmp1);
+
+        node->ResetHWIntrinsicId(NI_AVX512F_BroadcastScalarToVector512, tmp1);
+
+        return LowerNode(node);
+    }
+
     // We have the following (where simd is simd16 or simd32):
     //          /--*  op1  T
     //   node = *  HWINTRINSIC   simd   T Create
@@ -2506,8 +2598,65 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
     //   +--*  ...  T
     //   +--*  opN  T
     //   node = *  HWINTRINSIC   simd   T Create
+    if (intrinsicId == NI_Vector512_Create)
+    {
+        assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F));
+
+        // We will be constructing the following parts:
+        //          /--*  op1  T
+        //          +--*  ...  T
+        //   lo   = *  HWINTRINSIC   simd32 T Create
+        //          /--*  ...  T
+        //          +--*  opN  T
+        //   hi   = *  HWINTRINSIC   simd32 T Create
+        //   idx  =    CNS_INT       int    1
+        //          /--*  lo   simd32
+        //          +--*  hi   simd32
+        //          +--*  idx  int
+        //   node = *  HWINTRINSIC   simd64 T InsertVector256
 
+        // This is roughly the following managed code:
+        //   ...
+        //   var lo = Vector256.Create(op1, ...);
+        //   var hi = Vector256.Create(..., opN);
+        //   return Avx512F.InsertVector256(lo, hi, 0x01);
+
+        // Each Vector256.Create call gets half the operands. That is:
+        //   lo = Vector256.Create(op1, op2);
+        //   hi = Vector256.Create(op3, op4);
+        //   -or-
+        //   lo = Vector256.Create(op1, ..., op4);
+        //   hi = Vector256.Create(op5, ..., op8);
+        //   -or-
+        //   lo = Vector256.Create(op1, ..., op8);
+        //   hi = Vector256.Create(op9, ..., op16);
+        //   -or-
+        //   lo = Vector256.Create(op1, ..., op16);
+        //   hi = Vector256.Create(op17, ..., op32);
+
+        size_t halfArgCnt = argCnt / 2;
+        assert((halfArgCnt * 2) == argCnt);
+
+        GenTree* lo = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, node->GetOperandArray(), halfArgCnt,
+                                                     NI_Vector256_Create, simdBaseJitType, 32);
+        BlockRange().InsertAfter(node->Op(halfArgCnt), lo);
+
+        GenTree* hi = comp->gtNewSimdHWIntrinsicNode(TYP_SIMD32, node->GetOperandArray(halfArgCnt), halfArgCnt,
+                                                     NI_Vector256_Create, simdBaseJitType, 32);
+        BlockRange().InsertAfter(node->Op(argCnt), hi);
+
+        idx = comp->gtNewIconNode(0x01, TYP_INT);
+        BlockRange().InsertAfter(hi, idx);
+
+        assert(argCnt >= 7);
+        node->ResetHWIntrinsicId(NI_AVX512F_InsertVector256, comp, lo, hi, idx);
+
+        LowerNode(lo);
+        LowerNode(hi);
+
+        return LowerNode(node);
+    }
-    if (intrinsicId == NI_Vector256_Create)
+    else if (intrinsicId == NI_Vector256_Create)
     {
         assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX));
@@ -6606,6 +6755,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre
 
             case NI_AVX2_BroadcastScalarToVector128:
             case NI_AVX2_BroadcastScalarToVector256:
+            case NI_AVX512F_BroadcastScalarToVector512:
             {
                 if (!parentNode->OperIsMemoryLoad())
                 {
@@ -6942,6 +7092,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
 
             case NI_AVX2_BroadcastScalarToVector128:
             case NI_AVX2_BroadcastScalarToVector256:
+            case NI_AVX512F_BroadcastScalarToVector512:
             {
                 if (node->OperIsMemoryLoad())
                 {
@@ -7422,6 +7573,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
             case NI_AVX2_InsertVector128:
             case NI_AVX2_MultipleSumAbsoluteDifferences:
             case NI_AVX2_Permute2x128:
+            case NI_AVX512F_InsertVector256:
             case NI_PCLMULQDQ_CarrylessMultiply:
             {
                 if (IsContainableHWIntrinsicOp(node, op2, &supportsRegOptional))
diff --git a/src/coreclr/jit/optcse.cpp b/src/coreclr/jit/optcse.cpp
index 56bee9820bfb1e..3e4d30f56d365f 100644
--- a/src/coreclr/jit/optcse.cpp
+++ b/src/coreclr/jit/optcse.cpp
@@ -2657,8 +2657,7 @@ class CSE_Heuristic
 
             // If we have a SIMD32 that is live across a call we have even higher spill costs
             //
-            if ((candidate->Expr()->TypeGet() == TYP_SIMD32) ||
-                (candidate->Expr()->TypeGet() == TYP_SIMD64))
+            if ((candidate->Expr()->TypeGet() == TYP_SIMD32) || (candidate->Expr()->TypeGet() == TYP_SIMD64))
             {
                 // Additionally for a simd32 CSE candidate we assume that a second spill/restore will be needed.
                 // (to hold the upper half of the simd32 register that isn't preserved across the call)
diff --git a/src/coreclr/jit/valuenum.cpp b/src/coreclr/jit/valuenum.cpp
index 1d1fb870de1849..5a5884ff8551e4 100644
--- a/src/coreclr/jit/valuenum.cpp
+++ b/src/coreclr/jit/valuenum.cpp
@@ -8830,12 +8830,10 @@ void Compiler::fgValueNumberTreeConst(GenTree* tree)
             break;
 
         case TYP_SIMD32:
+        case TYP_SIMD64: // TODO-XArch-AVX512: Fix once GenTreeVecCon supports gtSimd64Val.
tree->gtVNPair.SetBoth(vnStore->VNForSimd32Con(tree->AsVecCon()->gtSimd32Val)); break; - case TYP_SIMD64: - tree->gtVNPair.SetBoth(vnStore->VNForSimd64Con(tree->AsVecCon()->gtSimd64Val)); - break; #endif // FEATURE_SIMD case TYP_FLOAT: From 87ecf433c34d75203908109c49e3da1a51e23616 Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Mon, 13 Feb 2023 10:06:36 -0800 Subject: [PATCH 04/12] Fix lowering logic for Create(). --- src/coreclr/jit/emitxarch.cpp | 6 ++++-- src/coreclr/jit/emitxarch.h | 3 ++- src/coreclr/jit/gentree.cpp | 28 -------------------------- src/coreclr/jit/hwintrinsiclistxarch.h | 2 +- src/coreclr/jit/instr.cpp | 2 +- src/coreclr/jit/instrsxarch.h | 4 ++-- src/coreclr/jit/simd.h | 3 +-- 7 files changed, 11 insertions(+), 37 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 44eb1f5305ca70..b835ce500eb1e8 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -10910,7 +10910,8 @@ void emitter::emitDispIns( switch (ins) { - case INS_vinsert64x4: + case INS_vinsertf64x4: + case INS_vinserti64x4: { attr = EA_32BYTE; break; @@ -17630,7 +17631,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vextracti128: case INS_vinsertf128: case INS_vinserti128: - case INS_vinsert64x4: + case INS_vinsertf64x4: + case INS_vinserti64x4: result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += PERFSCORE_LATENCY_3C; break; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 7f52930d0590a8..16e42e04cd8be9 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -285,7 +285,8 @@ bool IsWEvexOpcodeExtension(const instrDesc* id) case INS_vpermilpdvar: case INS_movdqu16: case INS_movdqu64: - case INS_vinsert64x4: + case INS_vinsertf64x4: + case INS_vinserti64x4: { return true; // W1 } diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 5367097ef8ec10..4ba951f8d44d64 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -17517,34 +17517,6 @@ bool GenTreeVecCon::IsHWIntrinsicCreateConstant(GenTreeHWIntrinsic* node, simd32 return argCnt == cnsArgCnt; } -#if defined(TARGET_XARCH) - case NI_Vector512_Create: - { - // Zero out the simd32Val - simd32Val = {}; - - // These intrinsics are meant to set the same value to every element. 
- if ((argCnt == 1) && HandleArgForHWIntrinsicCreate(node->Op(1), 0, simd32Val, simdBaseType)) - { - cnsArgCnt = 1; - } - else - { - for (unsigned i = 1; i <= argCnt / 2; i++) - { - if (HandleArgForHWIntrinsicCreate(node->Op(i), i - 1, simd32Val, simdBaseType)) - { - cnsArgCnt++; - } - } - } - - assert((argCnt == 1) || (argCnt == (simdSize / genTypeSize(simdBaseType)))); - return argCnt == cnsArgCnt; - } - -#endif - default: { return false; diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 50c57113bc92f7..bda95ada8c209e 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -728,7 +728,7 @@ HARDWARE_INTRINSIC(AVX2, Xor, // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AVX512 Intrinsics HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4, INS_vinsert64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index 425072f8cfd1c5..44a9ac7fa656b0 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -174,7 +174,7 @@ const char* CodeGen::genSizeStr(emitAttr attr) if (EA_ATTR(size) == attr) { - return sizes[size > 0 ? genLog2(size) + 1 : size]; + return (size > 0) ? 
sizes[genLog2(size)] : ""; } else if (attr == EA_GCREF) { diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index b50646bc17430d..df7b660dc82709 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -634,8 +634,8 @@ INST3(movdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, INST3(movdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) INST3(movdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) INST3(movdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) -INST3(vinsert32x8, "insert32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed floating point values -INST3(vinsert64x4, "insert64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE8, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed floating point values +INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values +INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) diff --git a/src/coreclr/jit/simd.h b/src/coreclr/jit/simd.h index 3b48ab57faad5d..61dd2ecea59a4f 100644 --- a/src/coreclr/jit/simd.h +++ b/src/coreclr/jit/simd.h @@ -156,8 +156,7 @@ struct simd32_t struct simd64_t { - union - { + union { float f32[16]; double f64[8]; int8_t i8[64]; From 3973c24392a676fede24506bab043de4c097fc89 Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Wed, 15 Feb 2023 08:59:22 -0800 Subject: [PATCH 05/12] Save/restore for zmm --- src/coreclr/jit/lsra.cpp | 5 ++++ src/coreclr/jit/simdcodegenxarch.cpp | 38 ++++++++++++++++++++++------ 2 files changed, 35 insertions(+), 8 deletions(-) diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index db5f8518eabe23..d708c37e88e0d2 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -5077,6 +5077,11 @@ void LinearScan::allocateRegisters() { allocate = false; } + else if (lclVarInterval->registerType == TYP_SIMD64) + { + allocate = false; + lclVarInterval->isPartiallySpilled = true; + } else { lclVarInterval->isPartiallySpilled = true; diff --git a/src/coreclr/jit/simdcodegenxarch.cpp b/src/coreclr/jit/simdcodegenxarch.cpp index d02d760ffdc66c..1d6c716abca735 100644 --- a/src/coreclr/jit/simdcodegenxarch.cpp +++ b/src/coreclr/jit/simdcodegenxarch.cpp @@ -410,7 +410,7 @@ void CodeGen::genSimdUpperSave(GenTreeIntrinsic* node) assert(node->gtIntrinsicName == NI_SIMD_UpperSave); GenTree* op1 = node->gtGetOp1(); - assert(op1->IsLocal() && (op1->TypeGet() == TYP_SIMD32)); + assert(op1->IsLocal() && ((op1->TypeGet() == TYP_SIMD32) || (op1->TypeGet() == TYP_SIMD64))); regNumber tgtReg = node->GetRegNum(); regNumber op1Reg = genConsumeReg(op1); @@ -418,6 +418,8 @@ void CodeGen::genSimdUpperSave(GenTreeIntrinsic* node) if (tgtReg != REG_NA) { + // We should never save to register for zmm. 
+ assert(op1->TypeGet() == TYP_SIMD32); GetEmitter()->emitIns_R_R_I(INS_vextractf128, EA_32BYTE, tgtReg, op1Reg, 0x01); genProduceReg(node); } @@ -429,10 +431,19 @@ void CodeGen::genSimdUpperSave(GenTreeIntrinsic* node) LclVarDsc* varDsc = compiler->lvaGetDesc(varNum); assert(varDsc->lvOnFrame); - // We want to store this to the upper 16 bytes of this localVar's home. - int offs = 16; + if (op1->TypeGet() == TYP_SIMD32) + { + // We want to store this to the upper 16 bytes of this localVar's home. + int offs = 16; - GetEmitter()->emitIns_S_R_I(INS_vextractf128, EA_32BYTE, varNum, offs, op1Reg, 0x01); + GetEmitter()->emitIns_S_R_I(INS_vextractf128, EA_32BYTE, varNum, offs, op1Reg, 0x01); + } + else + { + assert(op1->TypeGet() == TYP_SIMD64); + // We will save the whole 64 bytes for zmm. + GetEmitter()->emitIns_S_R(INS_movups, EA_64BYTE, op1Reg, varNum, 0); + } } } @@ -456,7 +467,7 @@ void CodeGen::genSimdUpperRestore(GenTreeIntrinsic* node) assert(node->gtIntrinsicName == NI_SIMD_UpperRestore); GenTree* op1 = node->gtGetOp1(); - assert(op1->IsLocal() && op1->TypeGet() == TYP_SIMD32); + assert(op1->IsLocal() && ((op1->TypeGet() == TYP_SIMD32) || (op1->TypeGet() == TYP_SIMD64))); regNumber srcReg = node->GetRegNum(); regNumber lclVarReg = genConsumeReg(op1); @@ -464,6 +475,8 @@ void CodeGen::genSimdUpperRestore(GenTreeIntrinsic* node) if (srcReg != REG_NA) { + // We should never save to register for zmm. + assert(op1->TypeGet() == TYP_SIMD32); GetEmitter()->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, lclVarReg, lclVarReg, srcReg, 0x01); } else @@ -472,9 +485,18 @@ void CodeGen::genSimdUpperRestore(GenTreeIntrinsic* node) unsigned varNum = op1->AsLclVarCommon()->GetLclNum(); LclVarDsc* varDsc = compiler->lvaGetDesc(varNum); assert(varDsc->lvOnFrame); - // We will load this from the upper 16 bytes of this localVar's home. - int offs = 16; - GetEmitter()->emitIns_R_R_S_I(INS_vinsertf128, EA_32BYTE, lclVarReg, lclVarReg, varNum, offs, 0x01); + if (op1->TypeGet() == TYP_SIMD32) + { + // We will load this from the upper 16 bytes of this localVar's home. + int offs = 16; + GetEmitter()->emitIns_R_R_S_I(INS_vinsertf128, EA_32BYTE, lclVarReg, lclVarReg, varNum, offs, 0x01); + } + else + { + assert(op1->TypeGet() == TYP_SIMD64); + // We will restore the whole 64 bytes for zmm. 
+ GetEmitter()->emitIns_R_S(INS_movups, EA_64BYTE, lclVarReg, varNum, 0); + } } } From 16923d302c155cf2fd6c6cb1b107383bed46b45a Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Wed, 15 Feb 2023 11:16:24 -0800 Subject: [PATCH 06/12] Add ToDo comments for AVX512BW --- src/coreclr/jit/instrsxarch.h | 4 ++-- src/coreclr/jit/lsra.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index df7b660dc82709..49cedde5114bda 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -630,8 +630,8 @@ INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) -INST3(movdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) -INST3(movdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) +INST3(movdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) // TODO-XARCH-AVX512 : Make this available only when AVX512BW is supported. +INST3(movdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) // TODO-XARCH-AVX512 : Make this available only when AVX512BW is supported. INST3(movdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) INST3(movdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index d708c37e88e0d2..611e9b0746ce91 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -5079,7 +5079,7 @@ void LinearScan::allocateRegisters() } else if (lclVarInterval->registerType == TYP_SIMD64) { - allocate = false; + allocate = false; lclVarInterval->isPartiallySpilled = true; } else From 9637dba86df15f77e20a1f31072fe5caa978a140 Mon Sep 17 00:00:00 2001 From: Deepak Rajendrakumaran Date: Thu, 16 Feb 2023 13:11:58 -0800 Subject: [PATCH 07/12] Separate AVX512F and AVX512BW + Fix disassembly. 
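
In short: byte and word broadcasts are AVX512BW forms, while dword, qword,
float and double broadcasts are AVX512F forms, so the single
BroadcastScalarToVector512 table entry is split in two and lowering now picks
the intrinsic from the SIMD base type. A minimal sketch of that selection
(the helper name is illustrative only; the real logic lives inline in
LowerHWIntrinsicCreate in the diff below):

    // Illustrative helper, not part of this patch; it mirrors the switch
    // added to the single-operand Vector512.Create lowering path.
    static NamedIntrinsic BroadcastForBaseType(var_types simdBaseType)
    {
        switch (simdBaseType)
        {
            case TYP_BYTE:
            case TYP_UBYTE:
            case TYP_SHORT:
            case TYP_USHORT:
                // 8-bit and 16-bit element broadcasts require AVX512BW.
                return NI_AVX512BW_BroadcastScalarToVector512;

            case TYP_INT:
            case TYP_UINT:
            case TYP_LONG:
            case TYP_ULONG:
            case TYP_FLOAT:
            case TYP_DOUBLE:
                // 32-bit and 64-bit element broadcasts only need AVX512F.
                return NI_AVX512F_BroadcastScalarToVector512;

            default:
                unreached();
        }
    }

This is also why compSetProcessor and HWIntrinsicInfo::lookupId below gate
Vector512 on the whole AVX512F/BW/CD/DQ baseline rather than on AVX512F alone.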
--- src/coreclr/jit/compiler.cpp | 8 +++++- src/coreclr/jit/gentree.cpp | 1 + src/coreclr/jit/hwintrinsic.cpp | 8 ++++-- src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 1 + src/coreclr/jit/hwintrinsiclistxarch.h | 14 +++++++-- src/coreclr/jit/hwintrinsicxarch.cpp | 1 + src/coreclr/jit/instr.cpp | 1 - src/coreclr/jit/instrsxarch.h | 4 +-- src/coreclr/jit/lowerxarch.cpp | 32 +++++++++++++++++++-- src/coreclr/jit/lsraxarch.cpp | 1 + 10 files changed, 59 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 833ef448e1eb14..be5145552ff50f 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2280,11 +2280,17 @@ void Compiler::compSetProcessor() { instructionSetFlags.AddInstructionSet(InstructionSet_Vector256); } - if (instructionSetFlags.HasInstructionSet(InstructionSet_AVX512F)) + if (instructionSetFlags.HasInstructionSet(InstructionSet_AVX512F) && + instructionSetFlags.HasInstructionSet(InstructionSet_AVX512BW) && + instructionSetFlags.HasInstructionSet(InstructionSet_AVX512CD) && + instructionSetFlags.HasInstructionSet(InstructionSet_AVX512DQ)) { if (!DoJitStressEvexEncoding()) { instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512F); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512BW); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512CD); + instructionSetFlags.RemoveInstructionSet(InstructionSet_AVX512DQ); instructionSetFlags = EnsureInstructionSetFlagsAreValid(instructionSetFlags); } else diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 4ba951f8d44d64..e9acbea864db65 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -24063,6 +24063,7 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: + case NI_AVX512BW_BroadcastScalarToVector512: case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: diff --git a/src/coreclr/jit/hwintrinsic.cpp b/src/coreclr/jit/hwintrinsic.cpp index 98a518974e0e1d..3656710dd2ca63 100644 --- a/src/coreclr/jit/hwintrinsic.cpp +++ b/src/coreclr/jit/hwintrinsic.cpp @@ -430,9 +430,12 @@ NamedIntrinsic HWIntrinsicInfo::lookupId(Compiler* comp, } else if (isa == InstructionSet_Vector512) { - if (!comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F)) + // We support some Vector512 intrinsics when AVX512F, AVX512BW, AVX512CD, AVX512DQ are available. 
+ if (!comp->compOpportunisticallyDependsOn(InstructionSet_AVX512F) && + !comp->compOpportunisticallyDependsOn(InstructionSet_AVX512BW) && + !comp->compOpportunisticallyDependsOn(InstructionSet_AVX512CD) && + !comp->compOpportunisticallyDependsOn(InstructionSet_AVX512DQ)) { - // TODO-XArch-AVX512: Add checks for CD, DQ, BW return NI_Illegal; } } @@ -1161,6 +1164,7 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic intrinsic, case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: + case NI_AVX512BW_BroadcastScalarToVector512: case NI_AVX2_ConvertToVector256Int16: case NI_AVX2_ConvertToVector256Int32: case NI_AVX2_ConvertToVector256Int64: diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 2b3454d5916fd6..274821ef81b55e 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -477,6 +477,7 @@ void CodeGen::genHWIntrinsic_R_RM( { if (varTypeIsIntegral(rmOp) && ((node->GetHWIntrinsicId() == NI_AVX2_BroadcastScalarToVector128) || (node->GetHWIntrinsicId() == NI_AVX512F_BroadcastScalarToVector512) || + (node->GetHWIntrinsicId() == NI_AVX512BW_BroadcastScalarToVector512) || (node->GetHWIntrinsicId() == NI_AVX2_BroadcastScalarToVector256))) { // In lowering we had the special case of BroadcastScalarToVector(CreateScalarUnsafe(op1)) diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index bda95ada8c209e..894663a1575bcd 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -726,9 +726,17 @@ HARDWARE_INTRINSIC(AVX2, Xor, // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** -// AVX512 Intrinsics -HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) -HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM) +// AVX512F Intrinsics +HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) +HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM) + +// 
+// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
+// ISA Function name SIMD size NumArg Instructions Category Flags
+// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
+// ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
+// AVX512BW Intrinsics
+HARDWARE_INTRINSIC(AVX512BW, BroadcastScalarToVector512, 64, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
+
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 // ISA Function name SIMD size NumArg Instructions Category Flags
 // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp
index d7431cc995b8d2..9d341f6cb439ca 100644
--- a/src/coreclr/jit/hwintrinsicxarch.cpp
+++ b/src/coreclr/jit/hwintrinsicxarch.cpp
@@ -2343,6 +2343,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic,
 
         case NI_Vector128_ToVector256:
         case NI_Vector128_ToVector256Unsafe:
+        case NI_Vector256_ToVector512Unsafe:
         case NI_Vector256_GetLower:
         {
             assert(sig->numArgs == 1);
diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp
index 44a9ac7fa656b0..a37b5b268e89d3 100644
--- a/src/coreclr/jit/instr.cpp
+++ b/src/coreclr/jit/instr.cpp
@@ -157,7 +157,6 @@ const char* CodeGen::genSizeStr(emitAttr attr)
 
     static const char * const sizes[] =
     {
-        "",
         "byte ptr ",
         "word ptr ",
         "dword ptr ",
diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h
index 49cedde5114bda..df7b660dc82709 100644
--- a/src/coreclr/jit/instrsxarch.h
+++ b/src/coreclr/jit/instrsxarch.h
@@ -630,8 +630,8 @@ INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE,
 INST3(LAST_AVX_INSTRUCTION, "LAST_AVX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
 
 INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None)
-INST3(movdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) // TODO-XARCH-AVX512 : Make this available only when AVX512BW is supported.
-INST3(movdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) // TODO-XARCH-AVX512 : Make this available only when AVX512BW is supported. +INST3(movdqu8, "movdqu8", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) +INST3(movdqu16, "movdqu16", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) INST3(movdqu32, "movdqu32", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) INST3(movdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, INS_FLAGS_None) INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 86e4d8f27def0a..7a53c0a997e665 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -2210,9 +2210,33 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node) tmp1 = InsertNewSimdCreateScalarUnsafeNode(TYP_SIMD16, op1, simdBaseJitType, 16); LowerNode(tmp1); - - node->ResetHWIntrinsicId(NI_AVX512F_BroadcastScalarToVector512, tmp1); - + switch (simdBaseType) + { + case TYP_BYTE: + case TYP_UBYTE: + case TYP_SHORT: + case TYP_USHORT: + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512BW)); + node->ResetHWIntrinsicId(NI_AVX512BW_BroadcastScalarToVector512, tmp1); + break; + } + case TYP_INT: + case TYP_UINT: + case TYP_LONG: + case TYP_ULONG: + case TYP_FLOAT: + case TYP_DOUBLE: + { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX512F)); + node->ResetHWIntrinsicId(NI_AVX512F_BroadcastScalarToVector512, tmp1); + break; + } + default: + { + unreached(); + } + } return LowerNode(node); } @@ -6756,6 +6780,7 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* parentNode, GenTre case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: + case NI_AVX512BW_BroadcastScalarToVector512: { if (!parentNode->OperIsMemoryLoad()) { @@ -7093,6 +7118,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) case NI_AVX2_BroadcastScalarToVector128: case NI_AVX2_BroadcastScalarToVector256: case NI_AVX512F_BroadcastScalarToVector512: + case NI_AVX512BW_BroadcastScalarToVector512: { if (node->OperIsMemoryLoad()) { diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 8195160b205db3..9ee52f1f3087db 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2174,6 +2174,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou case NI_Vector128_AsVector3: case NI_Vector128_ToVector256: case NI_Vector128_ToVector256Unsafe: + case NI_Vector256_ToVector512Unsafe: case NI_Vector256_GetLower: { assert(numArgs == 1); From 1d9795979b6069b5d5e1eba4b5d750548073c1da Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Wed, 18 Jan 2023 07:48:54 -0800 Subject: [PATCH 08/12] Add `TYP_OPMASK` and `Vector512.ExtractMostSignificantBits`. 
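
There is no 512-bit movmsk encoding, so Vector512.ExtractMostSignificantBits
lowers to vpmov{b,w,d,q}2m, which gathers the per-element sign bits into an
opmask (k) register, followed by a kmov into the integer result register.
That in turn requires the JIT to model mask registers as their own register
class (TYP_OPMASK, REG_K0-REG_K7) in the type system, LSRA, and the emitter.

A rough scalar model of the byte-element case, for illustration only (the
helper name is made up and nothing below is part of the patch):

    // vpmovb2m collects the sign bit of each of the 64 byte lanes into a
    // 64-bit opmask; kmovq then copies that opmask into a GPR.
    #include <cstdint>
    #include <cstdio>

    static uint64_t ExtractMsbScalar(const int8_t (&v)[64])
    {
        uint64_t mask = 0;
        for (int lane = 0; lane < 64; lane++)
        {
            if (v[lane] < 0) // the lane's most significant bit is set
            {
                mask |= uint64_t(1) << lane;
            }
        }
        return mask;
    }

    int main()
    {
        int8_t v[64] = {};
        v[0]  = -1;   // lane 0 contributes bit 0
        v[63] = -128; // lane 63 contributes bit 63
        printf("0x%016llx\n", (unsigned long long)ExtractMsbScalar(v)); // 0x8000000000000001
        return 0;
    }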
--- src/coreclr/jit/emitxarch.cpp | 101 +++++++++++++++++++- src/coreclr/jit/emitxarch.h | 7 ++ src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 42 ++++++++ src/coreclr/jit/hwintrinsiclistxarch.h | 3 + src/coreclr/jit/hwintrinsicxarch.cpp | 16 ++++ src/coreclr/jit/instr.cpp | 2 +- src/coreclr/jit/instrsxarch.h | 17 ++++ src/coreclr/jit/lsra.cpp | 6 ++ src/coreclr/jit/lsra.h | 11 +++ src/coreclr/jit/lsrabuild.cpp | 11 +++ src/coreclr/jit/lsraxarch.cpp | 10 ++ src/coreclr/jit/register.h | 26 ++++- src/coreclr/jit/target.h | 5 + src/coreclr/jit/targetamd64.h | 5 + src/coreclr/jit/targetx86.h | 6 ++ src/coreclr/jit/typelist.h | 1 + src/coreclr/jit/vartype.h | 19 ++++ src/coreclr/vm/threadsuspend.cpp | 2 +- 18 files changed, 278 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index b835ce500eb1e8..ad963c17b99b61 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -34,6 +34,11 @@ bool emitter::IsSSEOrAVXInstruction(instruction ins) return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX_INSTRUCTION); } +bool emitter::IsKInstruction(instruction ins) +{ + return (ins >= INS_FIRST_K_INSTRUCTION) && (ins <= INS_LAST_K_INSTRUCTION); +} + //------------------------------------------------------------------------ // IsAvx512OrPriorInstruction: Is this an Avx512 or Avx or Sse instruction. // @@ -46,7 +51,7 @@ bool emitter::IsSSEOrAVXInstruction(instruction ins) bool emitter::IsAvx512OrPriorInstruction(instruction ins) { // TODO-XArch-AVX512: Fix check once AVX512 instructions are added. - return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION); + return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION) || IsKInstruction(ins); } bool emitter::IsAVXOnlyInstruction(instruction ins) @@ -154,7 +159,7 @@ regNumber emitter::getSseShiftRegNumber(instruction ins) bool emitter::IsVexEncodedInstruction(instruction ins) const { - return UseVEXEncoding() && IsSSEOrAVXInstruction(ins); + return UseVEXEncoding() && (IsSSEOrAVXInstruction(ins) || IsKInstruction(ins)); } //------------------------------------------------------------------------ @@ -263,6 +268,11 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const case INS_vbroadcastf128: // INS_vbroadcastf32x4, INS_vbroadcastf64x2. case INS_vbroadcasti128: // INS_vbroadcasti32x4, INS_vbroadcasti64x2. 
+ case INS_kmovb: + case INS_kmovw: + case INS_kmovd: + case INS_kmovq: + // TODO-XARCH-AVX512 these need to be encoded with the proper individual EVEX instructions (movdqu8, // movdqu16 etc) // For implementation speed, I have set it up so the standing instruction will default to the 32-bit operand @@ -1127,6 +1137,8 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) case INS_vpgatherqq: case INS_vgatherdpd: case INS_vgatherqpd: + case INS_vpmovw2m: + case INS_vpmovq2m: return true; default: break; @@ -1186,7 +1198,8 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) // so we never need it if ((ins != INS_push) && (ins != INS_pop) && (ins != INS_movq) && (ins != INS_movzx) && (ins != INS_push_hide) && (ins != INS_pop_hide) && (ins != INS_ret) && (ins != INS_call) && (ins != INS_tail_i_jmp) && - !((ins >= INS_i_jmp) && (ins <= INS_l_jg))) + !((ins >= INS_i_jmp) && (ins <= INS_l_jg)) && + (ins != INS_kmovb) && (ins != INS_kmovw) && (ins != INS_kmovd)) { return true; } @@ -3356,7 +3369,17 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) // If Byte 4 (which is 0xFF00) is zero, that's where the RM encoding goes. // Otherwise, it will be placed after the 4 byte encoding, making the total 5 bytes. // This would probably be better expressed as a different format or something? - code_t code = insCodeRM(ins); + code_t code; + if (IsKInstruction(ins)) + { + code = insCodeRR(ins); + code = AddVexPrefix(ins, code, EA_SIZE(id->idOpSize())); + } + else + { + code = insCodeRM(ins); + } + UNATIVE_OFFSET sz = emitGetAdjustedSize(id, code); @@ -5729,6 +5752,10 @@ bool emitter::IsMovInstruction(instruction ins) case INS_movupd: case INS_movups: case INS_movzx: + case INS_kmovb: + case INS_kmovw: + case INS_kmovd: + case INS_kmovq: { return true; } @@ -5850,6 +5877,15 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size) } #endif // TARGET_AMD64 + case INS_kmovb: + case INS_kmovw: + case INS_kmovd: + case INS_kmovq: + { + hasSideEffect = true; + break; + } + default: { unreached(); @@ -6061,6 +6097,12 @@ void emitter::emitIns_Mov(instruction ins, emitAttr attr, regNumber dstReg, regN } #endif // TARGET_AMD64 + case INS_kmovb: + case INS_kmovw: + case INS_kmovd: + case INS_kmovq: + break; + default: { unreached(); @@ -9457,6 +9499,12 @@ const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName) #ifdef TARGET_AMD64 char suffix = '\0'; + // TODO-XARCH-AVX512 hacky, fix + if (reg > REG_OPMASK_FIRST && reg < REG_OPMASK_LAST) + { + return emitKregName(reg); + } + switch (EA_SIZE(attr)) { case EA_64BYTE: @@ -9677,6 +9725,24 @@ const char* emitter::emitZMMregName(unsigned reg) return regNames[reg]; } +/***************************************************************************** + * + * Return a string that represents the given K register. + */ + +const char* emitter::emitKregName(unsigned reg) +{ + static const char* const regNames[] = { +#define REGDEF(name, rnum, mask, sname) sname, +#include "register.h" + }; + + assert(reg < REG_COUNT); + assert(reg < ArrLen(regNames)); + + return regNames[reg]; +} + /***************************************************************************** * * Display a static data member reference. 
@@ -13679,7 +13745,16 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert((ins != INS_movd) || (isFloatReg(reg1) != isFloatReg(reg2))); - if ((ins != INS_movd) || isFloatReg(reg1)) + if (IsKInstruction(ins)) + { + code = insCodeRR(ins); + if (isGeneralRegister(reg1)) + { + // kmov r, k form, flip last byte of opcode from 0x92 to 0x93 + code |= 0x01; + } + } + else if ((ins != INS_movd) || isFloatReg(reg1)) { code = insCodeRM(ins); } @@ -17978,6 +18053,22 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; } #endif + + // TODO-XARCH-AVX512 add proper values + case INS_vpmovb2m: + case INS_vpmovw2m: + case INS_vpmovd2m: + case INS_vpmovq2m: + case INS_kmovb: + case INS_kmovw: + case INS_kmovd: + case INS_kmovq: + { + result.insLatency += PERFSCORE_LATENCY_1C; + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + break; + } + default: // unhandled instruction insFmt combination perfScoreUnhandledInstruction(id, &result); diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 16e42e04cd8be9..208ed16c87067a 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -23,6 +23,11 @@ inline static bool isDoubleReg(regNumber reg) return isFloatReg(reg); } +inline static bool isOpmaskReg(regNumber reg) +{ + return (reg >= REG_OPMASK_FIRST && reg <= REG_OPMASK_LAST); +} + /************************************************************************/ /* Routines that compute the size of / encode instructions */ /************************************************************************/ @@ -96,6 +101,7 @@ static bool IsAvx512OnlyInstruction(instruction ins); static bool IsFMAInstruction(instruction ins); static bool IsAVXVNNIInstruction(instruction ins); static bool IsBMIInstruction(instruction ins); +static bool IsKInstruction(instruction ins); static regNumber getBmiRegNumber(instruction ins); static regNumber getSseShiftRegNumber(instruction ins); @@ -666,6 +672,7 @@ void emitDispShift(instruction ins, int cnt = 0); const char* emitXMMregName(unsigned reg); const char* emitYMMregName(unsigned reg); const char* emitZMMregName(unsigned reg); +const char* emitKregName(unsigned reg); /************************************************************************/ /* Private members that deal with target-dependent instr. 
descriptors */ diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 274821ef81b55e..2a6fd39a666d28 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1671,6 +1671,48 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) break; } + case NI_AVX512F_MoveMaskSpec: + { + op1Reg = op1->GetRegNum(); + regNumber maskReg = node->ExtractTempReg(RBM_ALLOPMASK); + + instruction maskIns; + instruction kmovIns; + + switch (baseType) + { + case TYP_BYTE: + case TYP_UBYTE: + maskIns = INS_vpmovb2m; + kmovIns = INS_kmovq; + break; + case TYP_SHORT: + case TYP_USHORT: + maskIns = INS_vpmovw2m; + kmovIns = INS_kmovd; + break; + case TYP_INT: + case TYP_UINT: + case TYP_FLOAT: + maskIns = INS_vpmovd2m; + kmovIns = INS_kmovw; + break; + case TYP_DOUBLE: + case TYP_LONG: + case TYP_ULONG: + maskIns = INS_vpmovq2m; + kmovIns = INS_kmovb; + break; + default: + unreached(); + } + + // opReg should be a kmask reg + emit->emitIns_R_R(maskIns, attr, maskReg, op1Reg); + emit->emitIns_Mov(kmovIns, EA_8BYTE, targetReg, maskReg, INS_FLAGS_DONT_CARE); + break; + } + default: unreached(); break; diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 894663a1575bcd..99ef133d3de097 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -239,6 +239,8 @@ HARDWARE_INTRINSIC(Vector256, Xor, HARDWARE_INTRINSIC(Vector512, Create, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector512, get_Zero, 64, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Vector512, ExtractMostSignificantBits, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) + // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} @@ -729,6 +731,7 @@ HARDWARE_INTRINSIC(AVX2, Xor, // AVX512F Intrinsics HARDWARE_INTRINSIC(AVX512F, BroadcastScalarToVector512, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad) HARDWARE_INTRINSIC(AVX512F, InsertVector256, 64, 3, {INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinserti64x4, INS_vinsertf64x4, INS_vinsertf64x4}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX512F, MoveMaskSpec, 64, 1, {INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoContainment|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 9d341f6cb439ca..da09c52e1efaf4 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1357,6 +1357,22 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case NI_Vector512_ExtractMostSignificantBits: + { + if (compOpportunisticallyDependsOn(InstructionSet_AVX512F) && + compOpportunisticallyDependsOn(InstructionSet_AVX512BW) && + compOpportunisticallyDependsOn(InstructionSet_AVX512DQ)) + { + var_types simdType = getSIMDTypeForSize(simdSize); + + op1 = impSIMDPopStack(simdType); + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_AVX512F_MoveMaskSpec, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); + break; + } + } + case NI_Vector128_ExtractMostSignificantBits: case NI_Vector256_ExtractMostSignificantBits: { diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index a37b5b268e89d3..e73a5dae496284 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -101,7 +101,7 @@ const char* CodeGen::genInsDisplayName(emitter::instrDesc* id) static char buf[4][TEMP_BUFFER_LEN]; const char* retbuf; - if (GetEmitter()->IsVexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins)) + if (GetEmitter()->IsVexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins) && !GetEmitter()->IsKInstruction(ins)) { sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "v%s", insName); retbuf = buf[curBuf]; diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index df7b660dc82709..d5e3ff461047ec 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -67,6 +67,18 @@ INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, // See comment around quarter way through this file for more information. 
INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_TT_NONE, INS_FLAGS_None ) +INST5(FIRST_K_INSTRUCTION, "FIRST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) + +// id nm um mr mi rm a4 rr tt flags +// TODO-XARCH-AVX512 add the proper W bit switch +INST5(kmovb, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, PACK3(0x66, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_Has_Wbit ) +INST5(kmovw, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, PACK2(0x0F, 0x92), INS_TT_NONE, INS_FLAGS_Has_Wbit ) +INST5(kmovd, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, PACK3(0xF2, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_Has_Wbit ) +INST5(kmovq, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, PACK3(0xF2, 0x0F, 0x92), INS_TT_NONE, INS_FLAGS_Has_Wbit ) + +INST5(LAST_K_INSTRUCTION, "LAST_K_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None ) + + // id nm um mr mi rm a4 tt flags INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) @@ -637,6 +649,11 @@ INST3(movdqu64, "movdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, INST3(vinsertf64x4, "insertf64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed double-precision floating point values INST3(vinserti64x4, "inserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), INS_TT_TUPLE4, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values +INST3(vpmovb2m, "vpmovb2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x29), INS_TT_NONE, Input_8Bit) +INST3(vpmovw2m, "vpmovw2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x29), INS_TT_NONE, Input_16Bit) +INST3(vpmovd2m, "vpmovd2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x39), INS_TT_NONE, Input_32Bit) +INST3(vpmovq2m, "vpmovq2m", IUM_WR, BAD_CODE, BAD_CODE, PACK4(0xF3, 0x0F, 0x38, 0x39), INS_TT_NONE, Input_64Bit) + INST3(LAST_AVX512_INSTRUCTION, "LAST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Scalar instructions in SSE4.2 diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 611e9b0746ce91..f44844ffa0b7fa 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -698,6 +698,7 @@ LinearScan::LinearScan(Compiler* theCompiler) availableFloatRegs = RBM_ALLFLOAT; availableDoubleRegs = RBM_ALLDOUBLE; + availableMaskRegs = RBM_NONE; #if defined(TARGET_AMD64) || defined(TARGET_ARM64) if (compiler->opts.compDbgEnC) @@ -718,6 +719,7 @@ LinearScan::LinearScan(Compiler* theCompiler) { availableFloatRegs |= RBM_HIGHFLOAT; availableDoubleRegs |= RBM_HIGHFLOAT; + availableMaskRegs |= RBM_K1; } #endif @@ -737,6 +739,10 @@ LinearScan::LinearScan(Compiler* theCompiler) { availableRegs[i] = &availableDoubleRegs; } + else if ((thisType == TYP_OPMASK)) + { + availableRegs[i] = &availableMaskRegs; + } #endif else { diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 0a067f4909af4a..9974c7c65d2581 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -35,6 +35,7 @@ const unsigned int RegisterTypeCount = 2; typedef var_types RegisterType; 
 #define IntRegisterType TYP_INT
 #define FloatRegisterType TYP_FLOAT
+#define OpmaskRegisterType TYP_OPMASK
 
 //------------------------------------------------------------------------
 // regType: Return the RegisterType to use for a given type
@@ -486,6 +487,12 @@ class RegRecord : public Referenceable
         {
             registerType = FloatRegisterType;
         }
+#if defined(TARGET_XARCH) && defined(FEATURE_SIMD)
+        else if (emitter::isOpmaskReg(reg))
+        {
+            registerType = OpmaskRegisterType;
+        }
+#endif
         else
         {
             // The constructor defaults to IntRegisterType
@@ -1090,6 +1097,9 @@ class LinearScan : public LinearScanInterface
     RefPosition* defineNewInternalTemp(GenTree* tree, RegisterType regType, regMaskTP candidates);
     RefPosition* buildInternalIntRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE);
     RefPosition* buildInternalFloatRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE);
+    #if defined(FEATURE_SIMD)
+    RefPosition* buildInternalOpmaskRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE);
+    #endif
     void buildInternalRegisterUses();
 
     void writeLocalReg(GenTreeLclVar* lclNode, unsigned varNum, regNumber reg);
@@ -1598,6 +1608,7 @@ class LinearScan : public LinearScanInterface
     PhasedVar<regMaskTP> availableIntRegs;
     PhasedVar<regMaskTP> availableFloatRegs;
     PhasedVar<regMaskTP> availableDoubleRegs;
+    PhasedVar<regMaskTP> availableMaskRegs;
     PhasedVar<regMaskTP>* availableRegs[TYP_COUNT];
 
     // Register mask of argument registers currently occupied because we saw a
diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp
index 70dc5dc1648b20..143324c73739a4 100644
--- a/src/coreclr/jit/lsrabuild.cpp
+++ b/src/coreclr/jit/lsrabuild.cpp
@@ -1393,6 +1393,17 @@ RefPosition* LinearScan::buildInternalFloatRegisterDefForNode(GenTree* tree, reg
     return defRefPosition;
 }
 
+#if defined(FEATURE_SIMD)
+RefPosition* LinearScan::buildInternalOpmaskRegisterDefForNode(GenTree* tree, regMaskTP internalCands)
+{
+    // The candidate set should contain only mask registers.
+    assert((internalCands & ~availableMaskRegs) == RBM_NONE);
+
+    RefPosition* defRefPosition = defineNewInternalTemp(tree, OpmaskRegisterType, internalCands);
+    return defRefPosition;
+}
+#endif
+
 //------------------------------------------------------------------------
 // buildInternalRegisterUses - adds use positions for internal
 // registers required for tree node.
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 9ee52f1f3087db..70c715d1c1f153 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2498,6 +2498,16 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou break; } + case NI_AVX512F_MoveMaskSpec: + { + srcCount += BuildOperandUses(op1); + buildInternalOpmaskRegisterDefForNode(intrinsicTree); + setInternalRegsDelayFree = true; + + buildUses = false; + break; + } + default: { assert((intrinsicId > NI_HW_INTRINSIC_START) && (intrinsicId < NI_HW_INTRINSIC_END)); diff --git a/src/coreclr/jit/register.h b/src/coreclr/jit/register.h index ca90673e85adfe..9ec15818da56c0 100644 --- a/src/coreclr/jit/register.h +++ b/src/coreclr/jit/register.h @@ -69,9 +69,18 @@ REGALIAS(EDI, RDI) #ifdef TARGET_AMD64 #define XMMBASE 16 #define XMMMASK(x) ((__int64)(1) << ((x)+XMMBASE)) + +#define KBASE 48 +#define KMASK(x) ((__int64)(1) << ((x)+KBASE)) + #else // !TARGET_AMD64 #define XMMBASE 8 #define XMMMASK(x) ((__int32)(1) << ((x)+XMMBASE)) + +#define KBASE 16 +#define KMASK(x) ((__int32)(1) << ((x)+KBASE)) + + #endif // !TARGET_AMD64 REGDEF(XMM0, 0+XMMBASE, XMMMASK(0), "mm0" ) @@ -83,9 +92,7 @@ REGDEF(XMM5, 5+XMMBASE, XMMMASK(5), "mm5" ) REGDEF(XMM6, 6+XMMBASE, XMMMASK(6), "mm6" ) REGDEF(XMM7, 7+XMMBASE, XMMMASK(7), "mm7" ) -#ifdef TARGET_X86 -REGDEF(STK, 8+XMMBASE, 0x0000, "STK" ) -#else // !TARGET_X86 +#ifdef TARGET_AMD64 REGDEF(XMM8, 8+XMMBASE, XMMMASK(8), "mm8" ) REGDEF(XMM9, 9+XMMBASE, XMMMASK(9), "mm9" ) REGDEF(XMM10, 10+XMMBASE, XMMMASK(10), "mm10" ) @@ -113,9 +120,18 @@ REGDEF(XMM29, 29+XMMBASE, XMMMASK(29), "mm29" ) REGDEF(XMM30, 30+XMMBASE, XMMMASK(30), "mm30" ) REGDEF(XMM31, 31+XMMBASE, XMMMASK(31), "mm31" ) -REGDEF(STK, 32+XMMBASE, 0x0000, "STK" ) +#endif // !TARGET_AMD64 + +REGDEF(K0, 0+KBASE, KMASK(0), "k0" ) +REGDEF(K1, 1+KBASE, KMASK(1), "k1" ) +REGDEF(K2, 2+KBASE, KMASK(2), "k2" ) +REGDEF(K3, 3+KBASE, KMASK(3), "k3" ) +REGDEF(K4, 4+KBASE, KMASK(4), "k4" ) +REGDEF(K5, 5+KBASE, KMASK(5), "k5" ) +REGDEF(K6, 6+KBASE, KMASK(6), "k6" ) +REGDEF(K7, 7+KBASE, KMASK(7), "k7" ) -#endif // !TARGET_X86 +REGDEF(STK, 8+KBASE, 0x0000, "STK" ) #elif defined(TARGET_ARM) #include "registerarm.h" diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index 8baf645453adf5..72bdc857d9be96 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -682,6 +682,11 @@ inline bool isFloatRegType(var_types type) return varTypeUsesFloatReg(type); } +inline bool isOpmaskReg(var_types type) +{ + return varTypeIsOpmask(type); +} + // If the WINDOWS_AMD64_ABI is defined make sure that TARGET_AMD64 is also defined. 
 #if defined(WINDOWS_AMD64_ABI)
 #if !defined(TARGET_AMD64)
diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h
index ac3f0ca7e8c027..5ca4e607ff669c 100644
--- a/src/coreclr/jit/targetamd64.h
+++ b/src/coreclr/jit/targetamd64.h
@@ -91,6 +91,11 @@
   #define REG_FP_LAST REG_XMM31
   #define FIRST_FP_ARGREG REG_XMM0
 
+  #define REG_OPMASK_FIRST REG_K0
+  #define REG_OPMASK_LAST REG_K7
+
+  #define RBM_ALLOPMASK (RBM_K1 | RBM_K2 | RBM_K3 | RBM_K4 | RBM_K5 | RBM_K6 | RBM_K7)
+
 #ifdef UNIX_AMD64_ABI
   #define LAST_FP_ARGREG REG_XMM7
 #else // !UNIX_AMD64_ABI
diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h
index dffd6adf2efb08..1c5031eaa4ac4b 100644
--- a/src/coreclr/jit/targetx86.h
+++ b/src/coreclr/jit/targetx86.h
@@ -74,6 +74,10 @@
   #define REG_FP_FIRST REG_XMM0
   #define REG_FP_LAST REG_XMM7
+
+  #define REG_OPMASK_FIRST REG_K0
+  #define REG_OPMASK_LAST REG_K7
+
   #define FIRST_FP_ARGREG REG_XMM0
   #define LAST_FP_ARGREG REG_XMM3
   #define REG_FLTARG_0 REG_XMM0
@@ -91,6 +95,8 @@
   #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7)
   #define RBM_ALLDOUBLE RBM_ALLFLOAT
 
+  #define RBM_ALLOPMASK (RBM_K1 | RBM_K2 | RBM_K3 | RBM_K4 | RBM_K5 | RBM_K6 | RBM_K7)
+
   // TODO-CQ: Currently we are following the x86 ABI for SSE2 registers.
   // This should be reconsidered.
   #define RBM_FLT_CALLEE_SAVED RBM_NONE
diff --git a/src/coreclr/jit/typelist.h b/src/coreclr/jit/typelist.h
index 2eeee02047530f..c33b88f3d26c10 100644
--- a/src/coreclr/jit/typelist.h
+++ b/src/coreclr/jit/typelist.h
@@ -63,6 +63,7 @@ DEF_TP(SIMD12 ,"simd12" , TYP_SIMD12, TI_STRUCT,12,16, 16, 4,16, VTF_S)
 DEF_TP(SIMD16 ,"simd16" , TYP_SIMD16, TI_STRUCT,16,16, 16, 4,16, VTF_S)
 DEF_TP(SIMD32 ,"simd32" , TYP_SIMD32, TI_STRUCT,32,32, 32, 8,16, VTF_S)
 DEF_TP(SIMD64 ,"simd64" , TYP_SIMD64, TI_STRUCT,64,64, 64, 16,16, VTF_S)
+DEF_TP(OPMASK ,"opmask" , TYP_OPMASK, TI_STRUCT,8, 8, 8, 2,8, VTF_S)
 #endif // FEATURE_SIMD
 
 DEF_TP(UNKNOWN ,"unknown" ,TYP_UNKNOWN, TI_ERROR, 0, 0, 0, 0, 0, VTF_ANY)
diff --git a/src/coreclr/jit/vartype.h b/src/coreclr/jit/vartype.h
index 5e80787ebfe244..35aa55e1826e8e 100644
--- a/src/coreclr/jit/vartype.h
+++ b/src/coreclr/jit/vartype.h
@@ -77,6 +77,19 @@ inline bool varTypeIsSIMD(T vt)
         return false;
     }
 }
+
+template <class T>
+inline bool varTypeIsOpmask(T vt)
+{
+    switch (TypeGet(vt))
+    {
+        case TYP_OPMASK:
+            return true;
+        default:
+            return false;
+    }
+}
+
 #else // FEATURE_SIMD
 
 // Always return false if FEATURE_SIMD is not enabled
@@ -85,6 +98,12 @@ inline bool varTypeIsSIMD(T vt)
 {
     return false;
 }
+
+template <class T>
+inline bool varTypeIsOpmask(T vt)
+{
+    return false;
+}
 #endif // !FEATURE_SIMD
 
 template <class T>
diff --git a/src/coreclr/vm/threadsuspend.cpp b/src/coreclr/vm/threadsuspend.cpp
index 89f2f9d33f7e2c..31e381e18472e6 100644
--- a/src/coreclr/vm/threadsuspend.cpp
+++ b/src/coreclr/vm/threadsuspend.cpp
@@ -3050,7 +3050,7 @@ BOOL Thread::RedirectCurrentThreadAtHandledJITCase(PFN_REDIRECTTARGET pTgt, CONT
     ////////////////////////////////////////////////////
     // Now redirect the thread to the helper function
-
+
     SetIP(pCurrentThreadCtx, (PCODE)pTgt);
 
 #ifdef TARGET_ARM
From 37834bfff45670f5cee4d76ba9653055a9e0040e Mon Sep 17 00:00:00 2001
From: "Canino, Anthony"
Date: Thu, 16 Feb 2023 14:14:30 -0800
Subject: [PATCH 09/12] Format/Build errors
---
 src/coreclr/jit/emitxarch.cpp | 4 +---
 src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 10 +++++-----
 src/coreclr/jit/hwintrinsicxarch.cpp | 6 +++---
 src/coreclr/jit/instr.cpp | 3 ++-
 src/coreclr/jit/lsra.cpp | 2 +-
src/coreclr/jit/lsra.h | 4 ++-- 6 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index ad963c17b99b61..b35b6206eb6aab 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -1198,8 +1198,7 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) // so we never need it if ((ins != INS_push) && (ins != INS_pop) && (ins != INS_movq) && (ins != INS_movzx) && (ins != INS_push_hide) && (ins != INS_pop_hide) && (ins != INS_ret) && (ins != INS_call) && (ins != INS_tail_i_jmp) && - !((ins >= INS_i_jmp) && (ins <= INS_l_jg)) && - (ins != INS_kmovb) && (ins != INS_kmovw) && (ins != INS_kmovd)) + !((ins >= INS_i_jmp) && (ins <= INS_l_jg)) && (ins != INS_kmovb) && (ins != INS_kmovw) && (ins != INS_kmovd)) { return true; } @@ -3379,7 +3378,6 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) { code = insCodeRM(ins); } - UNATIVE_OFFSET sz = emitGetAdjustedSize(id, code); diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 2a6fd39a666d28..15ca0bec6fb2e7 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -1673,7 +1673,7 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) case NI_AVX512F_MoveMaskSpec: { - op1Reg = op1->GetRegNum(); + op1Reg = op1->GetRegNum(); regNumber maskReg = node->ExtractTempReg(RBM_ALLOPMASK); instruction maskIns; @@ -1683,24 +1683,24 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node) { case TYP_BYTE: case TYP_UBYTE: - maskIns = INS_vpmovb2m; + maskIns = INS_vpmovb2m; kmovIns = INS_kmovq; break; case TYP_SHORT: case TYP_USHORT: - maskIns = INS_vpmovw2m; + maskIns = INS_vpmovw2m; kmovIns = INS_kmovd; break; case TYP_INT: case TYP_UINT: case TYP_FLOAT: - maskIns = INS_vpmovd2m; + maskIns = INS_vpmovd2m; kmovIns = INS_kmovw; break; case TYP_DOUBLE: case TYP_LONG: case TYP_ULONG: - maskIns = INS_vpmovq2m; + maskIns = INS_vpmovq2m; kmovIns = INS_kmovb; break; default: diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index da09c52e1efaf4..34a3099cfa976d 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1359,9 +1359,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector512_ExtractMostSignificantBits: { - if (compOpportunisticallyDependsOn(InstructionSet_AVX512F) && - compOpportunisticallyDependsOn(InstructionSet_AVX512BW) && - compOpportunisticallyDependsOn(InstructionSet_AVX512DQ)) + if (compOpportunisticallyDependsOn(InstructionSet_AVX512F) && + compOpportunisticallyDependsOn(InstructionSet_AVX512BW) && + compOpportunisticallyDependsOn(InstructionSet_AVX512DQ)) { var_types simdType = getSIMDTypeForSize(simdSize); diff --git a/src/coreclr/jit/instr.cpp b/src/coreclr/jit/instr.cpp index e73a5dae496284..50667c9ee924a8 100644 --- a/src/coreclr/jit/instr.cpp +++ b/src/coreclr/jit/instr.cpp @@ -101,7 +101,8 @@ const char* CodeGen::genInsDisplayName(emitter::instrDesc* id) static char buf[4][TEMP_BUFFER_LEN]; const char* retbuf; - if (GetEmitter()->IsVexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins) && !GetEmitter()->IsKInstruction(ins)) + if (GetEmitter()->IsVexEncodedInstruction(ins) && !GetEmitter()->IsBMIInstruction(ins) && + !GetEmitter()->IsKInstruction(ins)) { sprintf_s(buf[curBuf], TEMP_BUFFER_LEN, "v%s", insName); retbuf = buf[curBuf]; diff --git a/src/coreclr/jit/lsra.cpp 
b/src/coreclr/jit/lsra.cpp index f44844ffa0b7fa..798fc6638850ca 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -739,7 +739,7 @@ LinearScan::LinearScan(Compiler* theCompiler) { availableRegs[i] = &availableDoubleRegs; } - else if ((thisType == TYP_OPMASK)) + else if (thisType == TYP_OPMASK) { availableRegs[i] = &availableMaskRegs; } diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index 9974c7c65d2581..dddce1e896be58 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -1097,9 +1097,9 @@ class LinearScan : public LinearScanInterface RefPosition* defineNewInternalTemp(GenTree* tree, RegisterType regType, regMaskTP candidates); RefPosition* buildInternalIntRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE); RefPosition* buildInternalFloatRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE); - #if defined(FEATURE_SIMD) +#if defined(FEATURE_SIMD) RefPosition* buildInternalOpmaskRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE); - #endif +#endif void buildInternalRegisterUses(); void writeLocalReg(GenTreeLclVar* lclNode, unsigned varNum, regNumber reg); From 3f42d9f87bb1a89c1651e171a935cbf336f2242e Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Thu, 16 Feb 2023 15:40:09 -0800 Subject: [PATCH 10/12] Build error. --- src/coreclr/jit/emitxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index b35b6206eb6aab..0882d57730b00e 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -51,7 +51,7 @@ bool emitter::IsKInstruction(instruction ins) bool emitter::IsAvx512OrPriorInstruction(instruction ins) { // TODO-XArch-AVX512: Fix check once AVX512 instructions are added. - return (ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION) || IsKInstruction(ins); + return ((ins >= INS_FIRST_SSE_INSTRUCTION) && (ins <= INS_LAST_AVX512_INSTRUCTION)) || IsKInstruction(ins); } bool emitter::IsAVXOnlyInstruction(instruction ins) From 95c4ddb08308fe269e4df99108cf24eee53ff9d6 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Thu, 16 Feb 2023 16:07:26 -0800 Subject: [PATCH 11/12] Build error. --- src/coreclr/jit/hwintrinsicxarch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index 34a3099cfa976d..06984097e23751 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -1369,8 +1369,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, retNode = gtNewSimdHWIntrinsicNode(retType, op1, NI_AVX512F_MoveMaskSpec, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); - break; } + break; } case NI_Vector128_ExtractMostSignificantBits: From d801c7d7246604e81b5015368c174ee0307863e8 Mon Sep 17 00:00:00 2001 From: "Canino, Anthony" Date: Fri, 17 Feb 2023 12:57:53 -0800 Subject: [PATCH 12/12] Review edits. 
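
Rename the Opmask spellings to Mask (TYP_MASK, REG_MASK_FIRST/REG_MASK_LAST,
RBM_ALLMASK, buildInternalMaskRegisterDefForNode), replace the emitRegName
range hack with emitter::isMaskReg, and give vpmov*2m and kmov* distinct
perf scores.

One detail worth spelling out: the old emitRegName check used exclusive
comparisons and so missed both ends of the k-register range. A standalone
sketch of that off-by-one (the numbering assumes the AMD64 KBASE of 48 from
register.h; this is illustration, not JIT code):

    #include <cassert>

    enum { REG_K0 = 48, REG_K7 = 55 }; // AMD64: KBASE == 48, so K0..K7 are 48..55

    static bool oldCheck(int reg)  { return reg > REG_K0 && reg < REG_K7; }   // misses K0 and K7
    static bool isMaskReg(int reg) { return reg >= REG_K0 && reg <= REG_K7; } // all eight k-regs

    int main()
    {
        assert(!oldCheck(REG_K0) && !oldCheck(REG_K7)); // endpoints not recognized as mask regs
        assert(isMaskReg(REG_K0) && isMaskReg(REG_K7)); // now classified correctly
        return 0;
    }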
---
 src/coreclr/jit/emitxarch.cpp | 18 ++++++++++++------
 src/coreclr/jit/emitxarch.h | 4 ++--
 src/coreclr/jit/hwintrinsiccodegenxarch.cpp | 5 +++--
 src/coreclr/jit/lsra.cpp | 2 +-
 src/coreclr/jit/lsra.h | 8 ++++----
 src/coreclr/jit/lsrabuild.cpp | 4 ++--
 src/coreclr/jit/lsraxarch.cpp | 2 +-
 src/coreclr/jit/target.h | 4 ++--
 src/coreclr/jit/targetamd64.h | 6 +++---
 src/coreclr/jit/targetx86.h | 6 +++---
 src/coreclr/jit/typelist.h | 2 +-
 src/coreclr/jit/vartype.h | 12 +++---------
 12 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 0882d57730b00e..4a92b5191d5074 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -40,7 +40,9 @@ bool emitter::IsKInstruction(instruction ins)
 }
 
 //------------------------------------------------------------------------
-// IsAvx512OrPriorInstruction: Is this an Avx512 or Avx or Sse instruction.
+// IsAvx512OrPriorInstruction: Is this an Avx512 or Avx or Sse or K (opmask) instruction.
+// Technically, the K instructions fall under the VEX encoding umbrella, but because of how
+// the instruction table is encoded they had to be pulled out with the rest of the `INST5` definitions.
 //
 // Arguments:
 //    ins - The instruction to check.
@@ -9497,8 +9499,7 @@ const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName)
 #ifdef TARGET_AMD64
     char suffix = '\0';
 
-    // TODO-XARCH-AVX512 hacky, fix
-    if (reg > REG_OPMASK_FIRST && reg < REG_OPMASK_LAST)
+    if (isMaskReg(reg))
     {
         return emitKregName(reg);
     }
@@ -18052,18 +18053,23 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
     }
 #endif
 
-    // TODO-XARCH-AVX512 add proper values
     case INS_vpmovb2m:
     case INS_vpmovw2m:
     case INS_vpmovd2m:
     case INS_vpmovq2m:
+    {
+        result.insLatency += PERFSCORE_LATENCY_1C;
+        result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+        break;
+    }
+
     case INS_kmovb:
     case INS_kmovw:
     case INS_kmovd:
     case INS_kmovq:
     {
-        result.insLatency += PERFSCORE_LATENCY_1C;
-        result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+        result.insLatency += PERFSCORE_LATENCY_3C;
+        result.insThroughput = PERFSCORE_THROUGHPUT_1C;
         break;
     }
 
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 208ed16c87067a..563552f4a7f55c 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -23,9 +23,9 @@ inline static bool isDoubleReg(regNumber reg)
     return isFloatReg(reg);
 }
 
-inline static bool isOpmaskReg(regNumber reg)
+inline static bool isMaskReg(regNumber reg)
 {
-    return (reg >= REG_OPMASK_FIRST && reg <= REG_OPMASK_LAST);
+    return (reg >= REG_MASK_FIRST && reg <= REG_MASK_LAST);
 }
 
 /************************************************************************/
diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
index 15ca0bec6fb2e7..49bd512de79cff 100644
--- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -1674,7 +1674,7 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node)
         case NI_AVX512F_MoveMaskSpec:
         {
             op1Reg = op1->GetRegNum();
-            regNumber maskReg = node->ExtractTempReg(RBM_ALLOPMASK);
+            regNumber maskReg = node->ExtractTempReg(RBM_ALLMASK);
 
             instruction maskIns;
             instruction kmovIns;
@@ -1707,7 +1707,8 @@ void CodeGen::genAvxFamilyIntrinsic(GenTreeHWIntrinsic* node)
                     unreached();
             }
 
-            // opReg should be a kmask reg
+            assert(emitter::isMaskReg(maskReg));
+
             emit->emitIns_R_R(maskIns, attr, maskReg, op1Reg);
             emit->emitIns_Mov(kmovIns, EA_8BYTE,
targetReg, maskReg, INS_FLAGS_DONT_CARE);
             break;
diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp
index 798fc6638850ca..7479e494987cdc 100644
--- a/src/coreclr/jit/lsra.cpp
+++ b/src/coreclr/jit/lsra.cpp
@@ -739,7 +739,7 @@ LinearScan::LinearScan(Compiler* theCompiler)
         {
             availableRegs[i] = &availableDoubleRegs;
         }
-        else if (thisType == TYP_OPMASK)
+        else if (thisType == TYP_MASK)
         {
             availableRegs[i] = &availableMaskRegs;
         }
diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h
index dddce1e896be58..ef49e9ce254bb9 100644
--- a/src/coreclr/jit/lsra.h
+++ b/src/coreclr/jit/lsra.h
@@ -35,7 +35,7 @@ const unsigned int RegisterTypeCount = 2;
 typedef var_types RegisterType;
 #define IntRegisterType TYP_INT
 #define FloatRegisterType TYP_FLOAT
-#define OpmaskRegisterType TYP_OPMASK
+#define MaskRegisterType TYP_MASK
 
 //------------------------------------------------------------------------
 // regType: Return the RegisterType to use for a given type
@@ -488,9 +488,9 @@ class RegRecord : public Referenceable
         registerType = FloatRegisterType;
     }
 #if defined(TARGET_XARCH) && defined(FEATURE_SIMD)
-    else if (emitter::isOpmaskReg(reg))
+    else if (emitter::isMaskReg(reg))
     {
-        registerType = OpmaskRegisterType;
+        registerType = MaskRegisterType;
     }
 #endif
     else
@@ -1098,7 +1098,7 @@ class LinearScan : public LinearScanInterface
     RefPosition* buildInternalIntRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE);
     RefPosition* buildInternalFloatRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE);
 #if defined(FEATURE_SIMD)
-    RefPosition* buildInternalOpmaskRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE);
+    RefPosition* buildInternalMaskRegisterDefForNode(GenTree* tree, regMaskTP internalCands = RBM_NONE);
 #endif
     void buildInternalRegisterUses();
 
diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp
index 143324c73739a4..5e28258392a72e 100644
--- a/src/coreclr/jit/lsrabuild.cpp
+++ b/src/coreclr/jit/lsrabuild.cpp
@@ -1394,12 +1394,12 @@ RefPosition* LinearScan::buildInternalFloatRegisterDefForNode(GenTree* tree, reg
 }
 
 #if defined(FEATURE_SIMD)
-RefPosition* LinearScan::buildInternalOpmaskRegisterDefForNode(GenTree* tree, regMaskTP internalCands)
+RefPosition* LinearScan::buildInternalMaskRegisterDefForNode(GenTree* tree, regMaskTP internalCands)
 {
     // The candidate set should contain only mask registers.
 assert((internalCands & ~availableMaskRegs) == RBM_NONE);
 
-    RefPosition* defRefPosition = defineNewInternalTemp(tree, OpmaskRegisterType, internalCands);
+    RefPosition* defRefPosition = defineNewInternalTemp(tree, MaskRegisterType, internalCands);
     return defRefPosition;
 }
 #endif
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 70c715d1c1f153..bb77335a3e6261 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -2501,7 +2501,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
         case NI_AVX512F_MoveMaskSpec:
         {
             srcCount += BuildOperandUses(op1);
-            buildInternalOpmaskRegisterDefForNode(intrinsicTree);
+            buildInternalMaskRegisterDefForNode(intrinsicTree);
             setInternalRegsDelayFree = true;
 
             buildUses = false;
diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h
index 72bdc857d9be96..a838814164f881 100644
--- a/src/coreclr/jit/target.h
+++ b/src/coreclr/jit/target.h
@@ -682,9 +682,9 @@ inline bool isFloatRegType(var_types type)
     return varTypeUsesFloatReg(type);
 }
 
-inline bool isOpmaskReg(var_types type)
+inline bool isMaskReg(var_types type)
 {
-    return varTypeIsOpmask(type);
+    return varTypeIsMask(type);
 }
 
 // If the WINDOWS_AMD64_ABI is defined make sure that TARGET_AMD64 is also defined.
diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h
index 5ca4e607ff669c..b50fab0ba2f3b3 100644
--- a/src/coreclr/jit/targetamd64.h
+++ b/src/coreclr/jit/targetamd64.h
@@ -91,10 +91,10 @@
   #define REG_FP_LAST REG_XMM31
   #define FIRST_FP_ARGREG REG_XMM0
 
-  #define REG_OPMASK_FIRST REG_K0
-  #define REG_OPMASK_LAST REG_K7
+  #define REG_MASK_FIRST REG_K0
+  #define REG_MASK_LAST REG_K7
 
-  #define RBM_ALLOPMASK (RBM_K1 | RBM_K2 | RBM_K3 | RBM_K4 | RBM_K5 | RBM_K6 | RBM_K7)
+  #define RBM_ALLMASK (RBM_K1 | RBM_K2 | RBM_K3 | RBM_K4 | RBM_K5 | RBM_K6 | RBM_K7)
 
 #ifdef UNIX_AMD64_ABI
   #define LAST_FP_ARGREG REG_XMM7
diff --git a/src/coreclr/jit/targetx86.h b/src/coreclr/jit/targetx86.h
index 1c5031eaa4ac4b..f7163272981238 100644
--- a/src/coreclr/jit/targetx86.h
+++ b/src/coreclr/jit/targetx86.h
@@ -75,8 +75,8 @@
   #define REG_FP_FIRST REG_XMM0
   #define REG_FP_LAST REG_XMM7
 
-  #define REG_OPMASK_FIRST REG_K0
-  #define REG_OPMASK_LAST REG_K7
+  #define REG_MASK_FIRST REG_K0
+  #define REG_MASK_LAST REG_K7
 
   #define FIRST_FP_ARGREG REG_XMM0
   #define LAST_FP_ARGREG REG_XMM3
@@ -95,7 +95,7 @@
   #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7)
   #define RBM_ALLDOUBLE RBM_ALLFLOAT
 
-  #define RBM_ALLOPMASK (RBM_K1 | RBM_K2 | RBM_K3 | RBM_K4 | RBM_K5 | RBM_K6 | RBM_K7)
+  #define RBM_ALLMASK (RBM_K1 | RBM_K2 | RBM_K3 | RBM_K4 | RBM_K5 | RBM_K6 | RBM_K7)
 
   // TODO-CQ: Currently we are following the x86 ABI for SSE2 registers.
   // This should be reconsidered.
diff --git a/src/coreclr/jit/typelist.h b/src/coreclr/jit/typelist.h index c33b88f3d26c10..e145fea89ac196 100644 --- a/src/coreclr/jit/typelist.h +++ b/src/coreclr/jit/typelist.h @@ -63,7 +63,7 @@ DEF_TP(SIMD12 ,"simd12" , TYP_SIMD12, TI_STRUCT,12,16, 16, 4,16, VTF_S) DEF_TP(SIMD16 ,"simd16" , TYP_SIMD16, TI_STRUCT,16,16, 16, 4,16, VTF_S) DEF_TP(SIMD32 ,"simd32" , TYP_SIMD32, TI_STRUCT,32,32, 32, 8,16, VTF_S) DEF_TP(SIMD64 ,"simd64" , TYP_SIMD64, TI_STRUCT,64,64, 64, 16,16, VTF_S) -DEF_TP(OPMASK ,"opmask" , TYP_OPMASK, TI_STRUCT,8, 8, 8, 2,8, VTF_S) +DEF_TP(MASK ,"mask" , TYP_MASK, TI_STRUCT,8, 8, 8, 2,8, VTF_S) #endif // FEATURE_SIMD DEF_TP(UNKNOWN ,"unknown" ,TYP_UNKNOWN, TI_ERROR, 0, 0, 0, 0, 0, VTF_ANY) diff --git a/src/coreclr/jit/vartype.h b/src/coreclr/jit/vartype.h index 35aa55e1826e8e..9c89016380f548 100644 --- a/src/coreclr/jit/vartype.h +++ b/src/coreclr/jit/vartype.h @@ -79,15 +79,9 @@ inline bool varTypeIsSIMD(T vt) } template -inline bool varTypeIsOpmask(T vt) +inline bool varTypeIsMask(T vt) { - switch (TypeGet(vt)) - { - case TYP_OPMASK: - return true; - default: - return false; - } + return (TypeGet(vt) == TYP_MASK); } #else // FEATURE_SIMD @@ -100,7 +94,7 @@ inline bool varTypeIsSIMD(T vt) } template -inline bool varTypeIsOpmask(T vt) +inline bool varTypeIsMask(T vt) { return false; }
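
For reference, a standalone check of the kmov direction flip that patch 08's
emitOutputRR change relies on ("flip last byte of opcode from 0x92 to 0x93"):
the k <- r forms use opcode byte 0x92 and the r <- k forms use 0x93, so
OR-ing in 0x01 switches the direction. Illustration only, not emitter code:

    #include <cassert>

    int main()
    {
        unsigned char kmovIntoMask = 0x92;             // kmov k, r
        unsigned char kmovIntoGpr  = kmovIntoMask | 1; // kmov r, k
        assert(kmovIntoGpr == 0x93);
        return 0;
    }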