Closed
Description
Compiling the following file
#include <hip/hip_runtime.h>
__global__ void block_transpose(const int* __restrict__ blocks,
const int* __restrict__ block_ptrs,
int* __restrict__ out_blocks)
{
const auto block_size = block_ptrs[1] - block_ptrs[0];
const auto block_stride = 1;
const auto rank =
unsigned(threadIdx.x +
blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z)) %
2;
for (int i = 0; i < block_size; ++i) {
auto val = blocks[i * block_stride + rank];
out_blocks[i + rank * block_stride] = val;
}
}
__global__ void block_transpose(const short* __restrict__ blocks,
const int* __restrict__ block_ptrs,
short* __restrict__ out_blocks)
{
const auto block_size = block_ptrs[1] - block_ptrs[0];
const auto block_stride = 1;
const auto rank =
unsigned(threadIdx.x +
blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z)) %
2;
for (int i = 0; i < block_size; ++i) {
auto val = blocks[i * block_stride + rank];
out_blocks[i + rank * block_stride] = val;
}
}
using clang++ -x hip -O3 -c file.cpp
causes a fatal error when lowering to device code:
fatal error: error in backend: Cannot select: 0xc0314f0: i1 = mul # D:1 0xba424c0, 0xc0315d0
0xba424c0: i1 = truncate 0xc031a30
0xc031a30: i32,ch = load<(dereferenceable invariant load (s32) from %ir.13, addrspace 4)> 0xca6fa58, 0xba41ea0, undef:i64
0xba41ea0: i64 = add 0xba41d50, Constant:i64<4>
0xba41d50: i64 = AssertAlign<4> 0xba426f0
0xba426f0: i64,ch = CopyFromReg 0xca6fa58, Register:i64 %13
0xc031640: i64 = Register %13
0xc031020: i64 = Constant<4>
0xc030f40: i64 = undef
0xc0315d0: i1 = truncate # D:1 0xc031b10
0xc031b10: i32 = add # D:1 0xc031720, 0xba42680
0xc031720: i32 = MUL_U24 # D:1 0xba425a0, 0xc031db0
0xba425a0: i32 = AssertZext # D:1 0xba42530, ValueType:ch:i10
0xba42530: i32,ch = CopyFromReg # D:1 0xca6fa58, Register:i32 %11
0xc031b80: i32 = Register %11
0xc031db0: i32 = srl 0xc031a30, Constant:i32<16>
0xc031a30: i32,ch = load<(dereferenceable invariant load (s32) from %ir.13, addrspace 4)> 0xca6fa58, 0xba41ea0, undef:i64
0xba41ea0: i64 = add 0xba41d50, Constant:i64<4>
0xba41d50: i64 = AssertAlign<4> 0xba426f0
0xba426f0: i64,ch = CopyFromReg 0xca6fa58, Register:i64 %13
0xc031640: i64 = Register %13
0xc031020: i64 = Constant<4>
0xc030f40: i64 = undef
0xc031aa0: i32 = Constant<16>
0xba42680: i32 = AssertZext # D:1 0xba42610, ValueType:ch:i10
0xba42610: i32,ch = CopyFromReg # D:1 0xca6fa58, Register:i32 %10
0xc0313a0: i32 = Register %10
In function: _Z15block_transposePKiS0_Pi
clang-16: error: clang frontend command failed with exit code 70 (use -v to see invocation)
clang version 16.0.0 (https://github.com/llvm/llvm-project dee4bc4a4ecc56623d511ea571355d1e1ad02159)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /home/tribizel/llvm-project/build/bin
clang-16: note: diagnostic msg: Error generating preprocessed source(s).
When compiling only the short
overload, the IR is a bit more manageable
fatal error: error in backend: Cannot select: 0x9cd80f0: i1 = mul # D:1 0x9cd8320, 0x9cd8550
0x9cd8320: i1 = truncate 0xa608960
0xa608960: i16,ch = CopyFromReg 0xacb1ac8, Register:i16 %2
0xa608810: i16 = Register %2
0x9cd8550: i1 = truncate # D:1 0xa608c00
0xa608c00: i32 = AssertZext # D:1 0x9cd88d0, ValueType:ch:i27
0x9cd88d0: i32,ch = CopyFromReg # D:1 0xacb1ac8, Register:i32 %3
0x9cd8470: i32 = Register %3
In function: _Z15block_transposePKsPKiPs
I bisected this down to dee4bc4 (https://reviews.llvm.org/D134596)
cc @changpeng