Skip to content

Commit 6117794

Browse files
pvellient-tye
authored and committed
[AMDGPU] Use MUBUF instructions for global address space access
Currently, the compiler crashes in instruction selection of global load/stores in gfx600 due to the lack of FLAT instructions. This patch fixes the crash by selecting MUBUF instructions for global load/stores in gfx600. Authored-by: Praveen Velliengiri <[email protected]> Reviewed by: t-tye Differential revision: https://reviews.llvm.org/D92483
1 parent 9017791 commit 6117794

18 files changed

+3212
-3191
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ specify the AMDGPU processor together with optional target features. See
104104
:ref:`amdgpu-target-id` and :ref:`amdgpu-target-features` for AMD GPU target
105105
specific information.
106106

107+
Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following exceptions:
108+
109+
* ``amdhsa`` is not supported in the ``r600`` architecture (see :ref:`amdgpu-architecture-table`).
110+
111+
107112
.. table:: AMDGPU Processors
108113
:name: amdgpu-processor-table
109114

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,8 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
8989
// Assuming ECC is enabled is the conservative default.
9090
SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
9191

92-
if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
92+
// Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
93+
if (isAmdHsaOS())
9394
FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
9495

9596
FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
@@ -108,15 +109,36 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
108109

109110
ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
110111

112+
// Implement the "generic" processors, which act as the default when no
113+
// generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
114+
// the first amdgcn target that supports flat addressing. Other OSes default
115+
// to the first amdgcn target.
116+
if (Gen == AMDGPUSubtarget::INVALID) {
117+
Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
118+
: AMDGPUSubtarget::SOUTHERN_ISLANDS;
119+
}
120+
111121
// We don't support FP64 for EG/NI atm.
112122
assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));
113123

114-
// Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
115-
// on VI and newer hardware to avoid assertion failures due to missing ADDR64
116-
// variants of MUBUF instructions.
117-
if (!hasAddr64() && !FS.contains("flat-for-global")) {
124+
// Targets must either support 64-bit offsets for MUBUF instructions, and/or
125+
// support flat operations, otherwise they cannot access a 64-bit global
126+
// address space
127+
assert(hasAddr64() || hasFlat());
128+
// Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
129+
// that do not support ADDR64 variants of MUBUF instructions. Such targets
130+
// cannot use a 64 bit offset with a MUBUF instruction to access the global
131+
// address space
132+
if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
133+
ToggleFeature(AMDGPU::FeatureFlatForGlobal);
118134
FlatForGlobal = true;
119135
}
136+
// Unless +-flat-for-global is specified, use MUBUF instructions for global
137+
// address space access if flat operations are not available.
138+
if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
139+
ToggleFeature(AMDGPU::FeatureFlatForGlobal);
140+
FlatForGlobal = false;
141+
}
120142

121143
// Set defaults if needed.
122144
if (MaxPrivateElementSize == 0)
@@ -182,7 +204,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
182204
AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
183205
AMDGPUSubtarget(TT),
184206
TargetTriple(TT),
185-
Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
207+
Gen(INVALID),
186208
InstrItins(getInstrItineraryForCPU(GPU)),
187209
LDSBankCount(0),
188210
MaxPrivateElementSize(0),

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -50,15 +50,16 @@ class StringRef;
5050
class AMDGPUSubtarget {
5151
public:
5252
enum Generation {
53-
R600 = 0,
54-
R700 = 1,
55-
EVERGREEN = 2,
56-
NORTHERN_ISLANDS = 3,
57-
SOUTHERN_ISLANDS = 4,
58-
SEA_ISLANDS = 5,
59-
VOLCANIC_ISLANDS = 6,
60-
GFX9 = 7,
61-
GFX10 = 8
53+
INVALID = 0,
54+
R600 = 1,
55+
R700 = 2,
56+
EVERGREEN = 3,
57+
NORTHERN_ISLANDS = 4,
58+
SOUTHERN_ISLANDS = 5,
59+
SEA_ISLANDS = 6,
60+
VOLCANIC_ISLANDS = 7,
61+
GFX9 = 8,
62+
GFX10 = 9
6263
};
6364

6465
private:
@@ -527,6 +528,10 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
527528
return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
528529
}
529530

531+
bool hasFlat() const {
532+
return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
533+
}
534+
530535
// Return true if the target only has the reverse operand versions of VALU
531536
// shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
532537
bool hasOnlyRevVALUShifts() const {

llvm/test/CodeGen/AMDGPU/lower-kernargs.ll

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -533,10 +533,7 @@ define amdgpu_kernel void @kern_lds_ptr(i32 addrspace(3)* %lds) #0 {
533533
define amdgpu_kernel void @kern_lds_ptr_si(i32 addrspace(3)* %lds) #2 {
534534
; HSA-LABEL: @kern_lds_ptr_si(
535535
; HSA-NEXT: [[KERN_LDS_PTR_SI_KERNARG_SEGMENT:%.*]] = call nonnull align 16 dereferenceable(8) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
536-
; HSA-NEXT: [[LDS_KERNARG_OFFSET:%.*]] = getelementptr inbounds i8, i8 addrspace(4)* [[KERN_LDS_PTR_SI_KERNARG_SEGMENT]], i64 0
537-
; HSA-NEXT: [[LDS_KERNARG_OFFSET_CAST:%.*]] = bitcast i8 addrspace(4)* [[LDS_KERNARG_OFFSET]] to i32 addrspace(3)* addrspace(4)*
538-
; HSA-NEXT: [[LDS_LOAD:%.*]] = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(4)* [[LDS_KERNARG_OFFSET_CAST]], align 16, !invariant.load !0
539-
; HSA-NEXT: store i32 0, i32 addrspace(3)* [[LDS_LOAD]], align 4
536+
; HSA-NEXT: store i32 0, i32 addrspace(3)* [[LDS:%.*]], align 4
540537
; HSA-NEXT: ret void
541538
;
542539
; MESA-LABEL: @kern_lds_ptr_si(

llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
33
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
44
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
55
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s

0 commit comments

Comments
 (0)