Skip to content

Commit 3aef525

Browse files
authored
[AMDGPU] Fix negative immediate offset for unbuffered smem loads (#89165)
For unbuffered smem loads, it is illegal for the immediate offset to be negative if the resulting IOFFSET + (SGPR[Offset] or M0 or zero) is negative. New PR of #79553.
1 parent 5413a2b commit 3aef525

12 files changed

+357
-104
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2004,28 +2004,50 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
20042004
return true;
20052005
}
20062006

2007+
// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2008+
// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2009+
// Handle the case where the Immediate Offset + SOffset is negative.
2010+
bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
2011+
bool Imm32Only,
2012+
bool IsBuffer,
2013+
int64_t ImmOffset) const {
2014+
if (!IsBuffer && !Imm32Only && ImmOffset < 0 &&
2015+
AMDGPU::hasSMRDSignedImmOffset(*Subtarget)) {
2016+
KnownBits SKnown = CurDAG->computeKnownBits(*SOffset);
2017+
if (ImmOffset + SKnown.getMinValue().getSExtValue() < 0)
2018+
return false;
2019+
}
2020+
2021+
return true;
2022+
}
2023+
20072024
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
20082025
// not null) offset. If Imm32Only is true, match only 32-bit immediate
20092026
// offsets available on CI.
20102027
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
20112028
SDValue *SOffset, SDValue *Offset,
2012-
bool Imm32Only, bool IsBuffer) const {
2029+
bool Imm32Only, bool IsBuffer,
2030+
bool HasSOffset,
2031+
int64_t ImmOffset) const {
20132032
assert((!SOffset || !Offset) &&
20142033
"Cannot match both soffset and offset at the same time!");
20152034

20162035
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
20172036
if (!C) {
20182037
if (!SOffset)
20192038
return false;
2039+
20202040
if (ByteOffsetNode.getValueType().isScalarInteger() &&
20212041
ByteOffsetNode.getValueType().getSizeInBits() == 32) {
20222042
*SOffset = ByteOffsetNode;
2023-
return true;
2043+
return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2044+
ImmOffset);
20242045
}
20252046
if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
20262047
if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
20272048
*SOffset = ByteOffsetNode.getOperand(0);
2028-
return true;
2049+
return isSOffsetLegalWithImmOffset(SOffset, Imm32Only, IsBuffer,
2050+
ImmOffset);
20292051
}
20302052
}
20312053
return false;
@@ -2036,8 +2058,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
20362058
// GFX9 and GFX10 have signed byte immediate offsets. The immediate
20372059
// offset for S_BUFFER instructions is unsigned.
20382060
int64_t ByteOffset = IsBuffer ? C->getZExtValue() : C->getSExtValue();
2039-
std::optional<int64_t> EncodedOffset =
2040-
AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, IsBuffer);
2061+
std::optional<int64_t> EncodedOffset = AMDGPU::getSMRDEncodedOffset(
2062+
*Subtarget, ByteOffset, IsBuffer, HasSOffset);
20412063
if (EncodedOffset && Offset && !Imm32Only) {
20422064
*Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
20432065
return true;
@@ -2096,13 +2118,22 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
20962118
// true, match only 32-bit immediate offsets available on CI.
20972119
bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
20982120
SDValue *SOffset, SDValue *Offset,
2099-
bool Imm32Only,
2100-
bool IsBuffer) const {
2121+
bool Imm32Only, bool IsBuffer,
2122+
bool HasSOffset,
2123+
int64_t ImmOffset) const {
21012124
if (SOffset && Offset) {
21022125
assert(!Imm32Only && !IsBuffer);
21032126
SDValue B;
2104-
return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
2105-
SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
2127+
2128+
if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
2129+
return false;
2130+
2131+
int64_t ImmOff = 0;
2132+
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
2133+
ImmOff = C->getSExtValue();
2134+
2135+
return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
2136+
ImmOff);
21062137
}
21072138

21082139
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -2121,11 +2152,14 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
21212152
}
21222153
if (!N0 || !N1)
21232154
return false;
2124-
if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer)) {
2155+
2156+
if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2157+
ImmOffset)) {
21252158
SBase = N0;
21262159
return true;
21272160
}
2128-
if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer)) {
2161+
if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
2162+
ImmOffset)) {
21292163
SBase = N1;
21302164
return true;
21312165
}

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
132132
bool isFlatScratchBaseLegal(SDValue Addr) const;
133133
bool isFlatScratchBaseLegalSV(SDValue Addr) const;
134134
bool isFlatScratchBaseLegalSVImm(SDValue Addr) const;
135+
bool isSOffsetLegalWithImmOffset(SDValue *SOffset, bool Imm32Only,
136+
bool IsBuffer, int64_t ImmOffset = 0) const;
135137

136138
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
137139
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
@@ -174,11 +176,13 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
174176

175177
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset,
176178
SDValue *Offset, bool Imm32Only = false,
177-
bool IsBuffer = false) const;
179+
bool IsBuffer = false, bool HasSOffset = false,
180+
int64_t ImmOffset = 0) const;
178181
SDValue Expand32BitAddress(SDValue Addr) const;
179182
bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
180183
SDValue *Offset, bool Imm32Only = false,
181-
bool IsBuffer = false) const;
184+
bool IsBuffer = false, bool HasSOffset = false,
185+
int64_t ImmOffset = 0) const;
182186
bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
183187
SDValue *Offset, bool Imm32Only = false) const;
184188
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
@@ -190,6 +194,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
190194
bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
191195
bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
192196
SDValue &Offset) const;
197+
bool SelectSMRDPrefetchImm(SDValue Addr, SDValue &SBase,
198+
SDValue &Offset) const;
193199
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
194200

195201
bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4216,10 +4216,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
42164216
return false;
42174217

42184218
const GEPInfo &GEPI = AddrInfo[0];
4219-
std::optional<int64_t> EncodedImm =
4220-
AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
4219+
std::optional<int64_t> EncodedImm;
42214220

42224221
if (SOffset && Offset) {
4222+
EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4223+
/*HasSOffset=*/true);
42234224
if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
42244225
AddrInfo.size() > 1) {
42254226
const GEPInfo &GEPI2 = AddrInfo[1];
@@ -4229,13 +4230,26 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
42294230
Base = GEPI2.SgprParts[0];
42304231
*SOffset = OffsetReg;
42314232
*Offset = *EncodedImm;
4233+
if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4234+
return true;
4235+
4236+
// For unbuffered smem loads, it is illegal for the Immediate Offset
4237+
// to be negative if the resulting (Offset + (M0 or SOffset or zero)
4238+
// is negative. Handle the case where the Immediate Offset + SOffset
4239+
// is negative.
4240+
auto SKnown = KB->getKnownBits(*SOffset);
4241+
if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4242+
return false;
4243+
42324244
return true;
42334245
}
42344246
}
42354247
}
42364248
return false;
42374249
}
42384250

4251+
EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
4252+
/*HasSOffset=*/false);
42394253
if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
42404254
Base = GEPI.SgprParts[0];
42414255
*Offset = *EncodedImm;

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,12 @@ namespace llvm {
159159

160160
namespace AMDGPU {
161161

162+
/// \returns true if the target supports signed immediate offset for SMRD
163+
/// instructions.
164+
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
165+
return isGFX9Plus(ST);
166+
}
167+
162168
/// \returns True if \p STI is AMDHSA.
163169
bool isHsaAbi(const MCSubtargetInfo &STI) {
164170
return STI.getTargetTriple().getOS() == Triple::AMDHSA;
@@ -2819,10 +2825,6 @@ static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) {
28192825
return isGCN3Encoding(ST) || isGFX10Plus(ST);
28202826
}
28212827

2822-
static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
2823-
return isGFX9Plus(ST);
2824-
}
2825-
28262828
bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
28272829
int64_t EncodedOffset) {
28282830
if (isGFX12Plus(ST))
@@ -2857,7 +2859,14 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST,
28572859
}
28582860

28592861
std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
2860-
int64_t ByteOffset, bool IsBuffer) {
2862+
int64_t ByteOffset, bool IsBuffer,
2863+
bool HasSOffset) {
2864+
// For unbuffered smem loads, it is illegal for the Immediate Offset to be
2865+
// negative if the resulting (Offset + (M0 or SOffset or zero) is negative.
2866+
// Handle case where SOffset is not present.
2867+
if (!IsBuffer && !HasSOffset && ByteOffset < 0 && hasSMRDSignedImmOffset(ST))
2868+
return std::nullopt;
2869+
28612870
if (isGFX12Plus(ST)) // 24 bit signed offsets
28622871
return isInt<24>(ByteOffset) ? std::optional<int64_t>(ByteOffset)
28632872
: std::nullopt;

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1307,6 +1307,7 @@ bool hasVOPD(const MCSubtargetInfo &STI);
13071307
bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI);
13081308
int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
13091309
unsigned hasKernargPreload(const MCSubtargetInfo &STI);
1310+
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST);
13101311

13111312
/// Is Reg - scalar register
13121313
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -1479,7 +1480,8 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset);
14791480
/// S_LOAD instructions have a signed offset, on other subtargets it is
14801481
/// unsigned. S_BUFFER has an unsigned offset for all subtargets.
14811482
std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
1482-
int64_t ByteOffset, bool IsBuffer);
1483+
int64_t ByteOffset, bool IsBuffer,
1484+
bool HasSOffset = false);
14831485

14841486
/// \return The encoding that can be used for a 32-bit literal offset in an SMRD
14851487
/// instruction. This is only useful on CI.s

llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1234,7 +1234,15 @@ body: |
12341234
; GFX10: liveins: $sgpr0_sgpr1
12351235
; GFX10-NEXT: {{ $}}
12361236
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
1237-
; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -1, 0 :: (load (s32), addrspace 4)
1237+
; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -1
1238+
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
1239+
; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
1240+
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
1241+
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
1242+
; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
1243+
; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
1244+
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
1245+
; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
12381246
; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
12391247
%0:sgpr(p4) = COPY $sgpr0_sgpr1
12401248
%1:sgpr(s64) = G_CONSTANT i64 -1
@@ -1304,7 +1312,15 @@ body: |
13041312
; GFX10: liveins: $sgpr0_sgpr1
13051313
; GFX10-NEXT: {{ $}}
13061314
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
1307-
; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], -524288, 0 :: (load (s32), addrspace 4)
1315+
; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -524288
1316+
; GFX10-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub0
1317+
; GFX10-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub0
1318+
; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY]].sub1
1319+
; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_MOV_B]].sub1
1320+
; GFX10-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY1]], [[COPY2]], implicit-def $scc
1321+
; GFX10-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY3]], [[COPY4]], implicit-def dead $scc, implicit $scc
1322+
; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
1323+
; GFX10-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[REG_SEQUENCE]], 0, 0 :: (load (s32), addrspace 4)
13081324
; GFX10-NEXT: $sgpr0 = COPY [[S_LOAD_DWORD_IMM]]
13091325
%0:sgpr(p4) = COPY $sgpr0_sgpr1
13101326
%1:sgpr(s64) = G_CONSTANT i64 -524288

llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,11 +88,13 @@ entry:
8888
ret void
8989
}
9090

91-
; GFX9_10 can use a signed immediate byte offset
91+
; GFX9+ can use a signed immediate byte offset but not without sgpr[offset]
9292
; GCN-LABEL: {{^}}smrd6:
9393
; SICIVI: s_add_u32 s{{[0-9]}}, s{{[0-9]}}, -4
9494
; SICIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
95-
; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], -0x4
95+
; GFX9_10: s_add_u32 s2, s2, -4
96+
; GFX9_10: s_addc_u32 s3, s3, -1
97+
; GFX9_10: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x0
9698
define amdgpu_kernel void @smrd6(ptr addrspace(1) %out, ptr addrspace(4) %ptr) #0 {
9799
entry:
98100
%tmp = getelementptr i32, ptr addrspace(4) %ptr, i64 -1

llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-smem.ll

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -297,20 +297,26 @@ define amdgpu_cs void @test_sink_smem_offset_neg400(ptr addrspace(4) inreg %ptr,
297297
; GFX9: ; %bb.0: ; %entry
298298
; GFX9-NEXT: .LBB5_1: ; %loop
299299
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
300-
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
301-
; GFX9-NEXT: s_load_dword s3, s[0:1], -0x190
302300
; GFX9-NEXT: s_add_i32 s2, s2, -1
301+
; GFX9-NEXT: s_add_u32 s4, s0, 0xfffffe70
302+
; GFX9-NEXT: s_addc_u32 s5, s1, -1
303+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
304+
; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0
303305
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
304306
; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
305307
; GFX9-NEXT: ; %bb.2: ; %end
306308
; GFX9-NEXT: s_endpgm
307309
;
308310
; GFX12-LABEL: test_sink_smem_offset_neg400:
309311
; GFX12: ; %bb.0: ; %entry
312+
; GFX12-NEXT: s_movk_i32 s4, 0xfe70
313+
; GFX12-NEXT: s_mov_b32 s5, -1
314+
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
315+
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
310316
; GFX12-NEXT: .LBB5_1: ; %loop
311317
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
312318
; GFX12-NEXT: s_wait_kmcnt 0x0
313-
; GFX12-NEXT: s_load_b32 s3, s[0:1], -0x190
319+
; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x0
314320
; GFX12-NEXT: s_add_co_i32 s2, s2, -1
315321
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
316322
; GFX12-NEXT: s_cmp_lg_u32 s2, 0

0 commit comments

Comments
 (0)