@@ -33,6 +33,7 @@ __gpu_kernel void foo() {
33
33
__gpu_lane_id ();
34
34
__gpu_lane_mask ();
35
35
__gpu_read_first_lane_u32 (-1 , -1 );
36
+ __gpu_read_first_lane_u64 (-1 , -1 );
36
37
__gpu_ballot (-1 , 1 );
37
38
__gpu_sync_threads ();
38
39
__gpu_sync_lane (-1 );
@@ -64,12 +65,13 @@ __gpu_kernel void foo() {
64
65
// AMDGPU-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
65
66
// AMDGPU-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR7]]
66
67
// AMDGPU-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR7]]
67
- // AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
68
+ // AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR7]]
69
+ // AMDGPU-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
68
70
// AMDGPU-NEXT: call void @__gpu_sync_threads() #[[ATTR7]]
69
71
// AMDGPU-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
70
- // AMDGPU-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
71
- // AMDGPU-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
72
- // AMDGPU-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
72
+ // AMDGPU-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
73
+ // AMDGPU-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
74
+ // AMDGPU-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
73
75
// AMDGPU-NEXT: call void @__gpu_exit() #[[ATTR8:[0-9]+]]
74
76
// AMDGPU-NEXT: unreachable
75
77
//
@@ -388,6 +390,43 @@ __gpu_kernel void foo() {
388
390
// AMDGPU-NEXT: ret i32 [[TMP1]]
389
391
//
390
392
//
393
+ // AMDGPU-LABEL: define internal i64 @__gpu_read_first_lane_u64(
394
+ // AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
395
+ // AMDGPU-NEXT: [[ENTRY:.*:]]
396
+ // AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5)
397
+ // AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
398
+ // AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
399
+ // AMDGPU-NEXT: [[__HI:%.*]] = alloca i32, align 4, addrspace(5)
400
+ // AMDGPU-NEXT: [[__LO:%.*]] = alloca i32, align 4, addrspace(5)
401
+ // AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
402
+ // AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
403
+ // AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr
404
+ // AMDGPU-NEXT: [[__HI_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__HI]] to ptr
405
+ // AMDGPU-NEXT: [[__LO_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LO]] to ptr
406
+ // AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
407
+ // AMDGPU-NEXT: store i64 [[__X]], ptr [[__X_ADDR_ASCAST]], align 8
408
+ // AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
409
+ // AMDGPU-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
410
+ // AMDGPU-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
411
+ // AMDGPU-NEXT: store i32 [[CONV]], ptr [[__HI_ASCAST]], align 4
412
+ // AMDGPU-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
413
+ // AMDGPU-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
414
+ // AMDGPU-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
415
+ // AMDGPU-NEXT: store i32 [[CONV1]], ptr [[__LO_ASCAST]], align 4
416
+ // AMDGPU-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
417
+ // AMDGPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI_ASCAST]], align 4
418
+ // AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR7]]
419
+ // AMDGPU-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
420
+ // AMDGPU-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
421
+ // AMDGPU-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
422
+ // AMDGPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO_ASCAST]], align 4
423
+ // AMDGPU-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR7]]
424
+ // AMDGPU-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
425
+ // AMDGPU-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
426
+ // AMDGPU-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
427
+ // AMDGPU-NEXT: ret i64 [[OR]]
428
+ //
429
+ //
391
430
// AMDGPU-LABEL: define internal i64 @__gpu_ballot(
392
431
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
393
432
// AMDGPU-NEXT: [[ENTRY:.*:]]
@@ -525,12 +564,13 @@ __gpu_kernel void foo() {
525
564
// NVPTX-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
526
565
// NVPTX-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]]
527
566
// NVPTX-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]]
528
- // NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
567
+ // NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR6]]
568
+ // NVPTX-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
529
569
// NVPTX-NEXT: call void @__gpu_sync_threads() #[[ATTR6]]
530
570
// NVPTX-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
531
- // NVPTX-NEXT: [[CALL21:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
532
- // NVPTX-NEXT: [[CALL22:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
533
- // NVPTX-NEXT: [[CALL23:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
571
+ // NVPTX-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
572
+ // NVPTX-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
573
+ // NVPTX-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
534
574
// NVPTX-NEXT: call void @__gpu_exit() #[[ATTR7:[0-9]+]]
535
575
// NVPTX-NEXT: unreachable
536
576
//
@@ -793,6 +833,37 @@ __gpu_kernel void foo() {
793
833
// NVPTX-NEXT: ret i32 [[TMP7]]
794
834
//
795
835
//
836
+ // NVPTX-LABEL: define internal i64 @__gpu_read_first_lane_u64(
837
+ // NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
838
+ // NVPTX-NEXT: [[ENTRY:.*:]]
839
+ // NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
840
+ // NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8
841
+ // NVPTX-NEXT: [[__HI:%.*]] = alloca i32, align 4
842
+ // NVPTX-NEXT: [[__LO:%.*]] = alloca i32, align 4
843
+ // NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
844
+ // NVPTX-NEXT: store i64 [[__X]], ptr [[__X_ADDR]], align 8
845
+ // NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8
846
+ // NVPTX-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
847
+ // NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
848
+ // NVPTX-NEXT: store i32 [[CONV]], ptr [[__HI]], align 4
849
+ // NVPTX-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8
850
+ // NVPTX-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
851
+ // NVPTX-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
852
+ // NVPTX-NEXT: store i32 [[CONV1]], ptr [[__LO]], align 4
853
+ // NVPTX-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
854
+ // NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4
855
+ // NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR6]]
856
+ // NVPTX-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
857
+ // NVPTX-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
858
+ // NVPTX-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
859
+ // NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4
860
+ // NVPTX-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR6]]
861
+ // NVPTX-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
862
+ // NVPTX-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
863
+ // NVPTX-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
864
+ // NVPTX-NEXT: ret i64 [[OR]]
865
+ //
866
+ //
796
867
// NVPTX-LABEL: define internal i64 @__gpu_ballot(
797
868
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
798
869
// NVPTX-NEXT: [[ENTRY:.*:]]
0 commit comments