Skip to content

Commit a343ee9

Browse files
[libc][nfc] Use common implementation of read_first_lane_u64, no codegen regression
1 parent 15e6bb6 commit a343ee9

File tree

4 files changed

+98
-34
lines changed

4 files changed

+98
-34
lines changed

clang/lib/Headers/amdgpuintrin.h

+5-10
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ _Pragma("omp begin declare variant match(device = {arch(amdgcn)})");
3333
// Attribute to declare a function as a kernel.
3434
#define __gpu_kernel __attribute__((amdgpu_kernel, visibility("protected")))
3535

36+
// Defined in gpuintrin.h, used later in this file.
37+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
38+
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
39+
3640
// Returns the number of workgroups in the 'x' dimension of the grid.
3741
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
3842
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
@@ -115,15 +119,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
115119
return __builtin_amdgcn_readfirstlane(__x);
116120
}
117121

118-
// Copies the value from the first active thread in the wavefront to the rest.
119-
_DEFAULT_FN_ATTRS __inline__ uint64_t
120-
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
121-
uint32_t __hi = (uint32_t)(__x >> 32ull);
122-
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
123-
return ((uint64_t)__builtin_amdgcn_readfirstlane(__hi) << 32ull) |
124-
((uint64_t)__builtin_amdgcn_readfirstlane(__lo) & 0xFFFFFFFF);
125-
}
126-
127122
// Returns a bitmask of threads in the current lane for which \p x is true.
128123
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
129124
bool __x) {
@@ -203,7 +198,7 @@ __gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
203198
// Returns the current lane mask if every lane contains __x.
204199
_DEFAULT_FN_ATTRS static __inline__ uint64_t
205200
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
206-
uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
201+
uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
207202
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
208203
__gpu_sync_lane(__lane_mask);
209204
return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;

clang/lib/Headers/gpuintrin.h

+9
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,15 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
115115
return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
116116
}
117117

118+
// Copies the value from the first active thread in the wavefront to the rest.
119+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
120+
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
121+
uint32_t __hi = (uint32_t)(__x >> 32ull);
122+
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFFull);
123+
return ((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __hi) << 32ull) |
124+
((uint64_t)__gpu_read_first_lane_u32(__lane_mask, __lo) & 0xFFFFFFFFull);
125+
}
126+
118127
// Gets the first floating point value from the active lanes.
119128
_DEFAULT_FN_ATTRS static __inline__ float
120129
__gpu_read_first_lane_f32(uint64_t __lane_mask, float __x) {

clang/lib/Headers/nvptxintrin.h

+5-16
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ _Pragma("omp begin declare variant match(device = {arch(nvptx64)})");
3737
// Attribute to declare a function as a kernel.
3838
#define __gpu_kernel __attribute__((nvptx_kernel, visibility("protected")))
3939

40+
// Defined in gpuintrin.h, used later in this file.
41+
_DEFAULT_FN_ATTRS static __inline__ uint64_t
42+
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x);
43+
4044
// Returns the number of CUDA blocks in the 'x' dimension.
4145
_DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks_x(void) {
4246
return __nvvm_read_ptx_sreg_nctaid_x();
@@ -120,21 +124,6 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
120124
return __nvvm_shfl_sync_idx_i32(__mask, __x, __id, __gpu_num_lanes() - 1);
121125
}
122126

123-
// Copies the value from the first active thread in the warp to the rest.
124-
_DEFAULT_FN_ATTRS static __inline__ uint64_t
125-
__gpu_read_first_lane_u64(uint64_t __lane_mask, uint64_t __x) {
126-
uint32_t __hi = (uint32_t)(__x >> 32ull);
127-
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
128-
uint32_t __mask = (uint32_t)__lane_mask;
129-
uint32_t __id = __builtin_ffs(__mask) - 1;
130-
return ((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __hi, __id,
131-
__gpu_num_lanes() - 1)
132-
<< 32ull) |
133-
((uint64_t)__nvvm_shfl_sync_idx_i32(__mask, __lo, __id,
134-
__gpu_num_lanes() - 1) &
135-
0xFFFFFFFF);
136-
}
137-
138127
// Returns a bitmask of threads in the current lane for which \p x is true.
139128
_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot(uint64_t __lane_mask,
140129
bool __x) {
@@ -231,7 +220,7 @@ __gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
231220
return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
232221
#endif
233222

234-
uint32_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
223+
uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
235224
uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
236225
return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
237226
}

clang/test/Headers/gpuintrin.c

+79-8
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ __gpu_kernel void foo() {
3333
__gpu_lane_id();
3434
__gpu_lane_mask();
3535
__gpu_read_first_lane_u32(-1, -1);
36+
__gpu_read_first_lane_u64(-1, -1);
3637
__gpu_ballot(-1, 1);
3738
__gpu_sync_threads();
3839
__gpu_sync_lane(-1);
@@ -64,12 +65,13 @@ __gpu_kernel void foo() {
6465
// AMDGPU-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
6566
// AMDGPU-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR7]]
6667
// AMDGPU-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR7]]
67-
// AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
68+
// AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR7]]
69+
// AMDGPU-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
6870
// AMDGPU-NEXT: call void @__gpu_sync_threads() #[[ATTR7]]
6971
// AMDGPU-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
70-
// AMDGPU-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
71-
// AMDGPU-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
72-
// AMDGPU-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
72+
// AMDGPU-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
73+
// AMDGPU-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
74+
// AMDGPU-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
7375
// AMDGPU-NEXT: call void @__gpu_exit() #[[ATTR8:[0-9]+]]
7476
// AMDGPU-NEXT: unreachable
7577
//
@@ -388,6 +390,43 @@ __gpu_kernel void foo() {
388390
// AMDGPU-NEXT: ret i32 [[TMP1]]
389391
//
390392
//
393+
// AMDGPU-LABEL: define internal i64 @__gpu_read_first_lane_u64(
394+
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
395+
// AMDGPU-NEXT: [[ENTRY:.*:]]
396+
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5)
397+
// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
398+
// AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
399+
// AMDGPU-NEXT: [[__HI:%.*]] = alloca i32, align 4, addrspace(5)
400+
// AMDGPU-NEXT: [[__LO:%.*]] = alloca i32, align 4, addrspace(5)
401+
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
402+
// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
403+
// AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr
404+
// AMDGPU-NEXT: [[__HI_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__HI]] to ptr
405+
// AMDGPU-NEXT: [[__LO_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LO]] to ptr
406+
// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
407+
// AMDGPU-NEXT: store i64 [[__X]], ptr [[__X_ADDR_ASCAST]], align 8
408+
// AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
409+
// AMDGPU-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
410+
// AMDGPU-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
411+
// AMDGPU-NEXT: store i32 [[CONV]], ptr [[__HI_ASCAST]], align 4
412+
// AMDGPU-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
413+
// AMDGPU-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
414+
// AMDGPU-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
415+
// AMDGPU-NEXT: store i32 [[CONV1]], ptr [[__LO_ASCAST]], align 4
416+
// AMDGPU-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
417+
// AMDGPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI_ASCAST]], align 4
418+
// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR7]]
419+
// AMDGPU-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
420+
// AMDGPU-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
421+
// AMDGPU-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
422+
// AMDGPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO_ASCAST]], align 4
423+
// AMDGPU-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR7]]
424+
// AMDGPU-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
425+
// AMDGPU-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
426+
// AMDGPU-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
427+
// AMDGPU-NEXT: ret i64 [[OR]]
428+
//
429+
//
391430
// AMDGPU-LABEL: define internal i64 @__gpu_ballot(
392431
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
393432
// AMDGPU-NEXT: [[ENTRY:.*:]]
@@ -525,12 +564,13 @@ __gpu_kernel void foo() {
525564
// NVPTX-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
526565
// NVPTX-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]]
527566
// NVPTX-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]]
528-
// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
567+
// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR6]]
568+
// NVPTX-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
529569
// NVPTX-NEXT: call void @__gpu_sync_threads() #[[ATTR6]]
530570
// NVPTX-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
531-
// NVPTX-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
532-
// NVPTX-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
533-
// NVPTX-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
571+
// NVPTX-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
572+
// NVPTX-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
573+
// NVPTX-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
534574
// NVPTX-NEXT: call void @__gpu_exit() #[[ATTR7:[0-9]+]]
535575
// NVPTX-NEXT: unreachable
536576
//
@@ -793,6 +833,37 @@ __gpu_kernel void foo() {
793833
// NVPTX-NEXT: ret i32 [[TMP7]]
794834
//
795835
//
836+
// NVPTX-LABEL: define internal i64 @__gpu_read_first_lane_u64(
837+
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
838+
// NVPTX-NEXT: [[ENTRY:.*:]]
839+
// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
840+
// NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8
841+
// NVPTX-NEXT: [[__HI:%.*]] = alloca i32, align 4
842+
// NVPTX-NEXT: [[__LO:%.*]] = alloca i32, align 4
843+
// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
844+
// NVPTX-NEXT: store i64 [[__X]], ptr [[__X_ADDR]], align 8
845+
// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8
846+
// NVPTX-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
847+
// NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
848+
// NVPTX-NEXT: store i32 [[CONV]], ptr [[__HI]], align 4
849+
// NVPTX-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8
850+
// NVPTX-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
851+
// NVPTX-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
852+
// NVPTX-NEXT: store i32 [[CONV1]], ptr [[__LO]], align 4
853+
// NVPTX-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
854+
// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4
855+
// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR6]]
856+
// NVPTX-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
857+
// NVPTX-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
858+
// NVPTX-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
859+
// NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4
860+
// NVPTX-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR6]]
861+
// NVPTX-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
862+
// NVPTX-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
863+
// NVPTX-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
864+
// NVPTX-NEXT: ret i64 [[OR]]
865+
//
866+
//
796867
// NVPTX-LABEL: define internal i64 @__gpu_ballot(
797868
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
798869
// NVPTX-NEXT: [[ENTRY:.*:]]

0 commit comments

Comments
 (0)