Closed
Description
This code:
#include<stdint.h>
#include<stddef.h>
/* Reproducer: store the 16-bit value v into each of the first l elements of a. */
void fill_i16(int16_t* a, int16_t v, size_t l) {
for (size_t i = 0; i < l; i++) a[i] = v;
}
compiled with `-Os` on either x86-64 or aarch64, leads to bizarre code, e.g. on x86-64:
bizarre x86-64
# Constant pool referenced by fill_i16 below.
# .LCPI0_0 .. .LCPI0_3 hold the element-index pairs [6,7], [4,5], [2,3] and [0,1]
# (two 64-bit lanes each); the function loads them into xmm1 .. xmm4.
.LCPI0_0:
.quad 6 # 0x6
.quad 7 # 0x7
.LCPI0_1:
.quad 4 # 0x4
.quad 5 # 0x5
.LCPI0_2:
.quad 2 # 0x2
.quad 3 # 0x3
# Sixteen bytes that form the little-endian 64-bit pair [0,1].
.LCPI0_3:
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 1 # 0x1
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
# 0x80000000 in every 32-bit lane; the function XORs it into both compare operands.
.LCPI0_4:
.quad -9223372034707292160 # 0x8000000080000000
.quad -9223372034707292160 # 0x8000000080000000
# Per-iteration step [8,8] that the function adds to each index pair.
.LCPI0_5:
.quad 8 # 0x8
.quad 8 # 0x8
# fill_i16(a, v, l) as emitted by clang -Os (Intel syntax).
# Register roles, read off the instructions in this listing:
#   rdi  = a (base of every store), si = v (the stored word), rdx = l on entry
#   rax  = (l + 7) & -8, the loop bound; rcx = element index, advanced by 8 per iteration
#   xmm0 = (l - 1) in both 64-bit lanes, XORed with xmm5
#   xmm1 .. xmm4 = running index pairs, starting at [6,7], [4,5], [2,3], [0,1]
#   xmm5 = 0x80000000 per dword, xmm6 = all ones, xmm7 = [8,8]
# Every iteration tests eight lanes one by one and performs a scalar 16-bit store
# for each lane whose index is <= l - 1; no vector store is ever issued.
fill_i16: # @fill_i16
# l == 0: nothing to store.
test rdx, rdx
je .LBB0_19
lea rax, [rdx + 7]
and rax, -8
dec rdx
movq xmm0, rdx
pshufd xmm0, xmm0, 68 # xmm0 = xmm0[0,1,0,1]
movdqa xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [6,7]
movdqa xmm2, xmmword ptr [rip + .LCPI0_1] # xmm2 = [4,5]
movdqa xmm3, xmmword ptr [rip + .LCPI0_2] # xmm3 = [2,3]
movdqa xmm4, xmmword ptr [rip + .LCPI0_3] # xmm4 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
xor ecx, ecx
movdqa xmm5, xmmword ptr [rip + .LCPI0_4] # xmm5 = [9223372039002259456,9223372039002259456]
# Both compare operands get the top bit of each dword flipped, so the signed
# pcmpgtd below orders them as unsigned values (the IR compare is `icmp ule`).
pxor xmm0, xmm5
# xmm6 = all ones; used with pxor to invert a mask.
pcmpeqd xmm6, xmm6
movdqa xmm7, xmmword ptr [rip + .LCPI0_5] # xmm7 = [8,8]
.LBB0_2: # =>This Inner Loop Header: Depth=1
# Lanes 0/1 (indices in xmm4). The 64-bit "index > l - 1" is assembled from
# 32-bit compares as hi_gt | (hi_eq & lo_gt):
#   xmm10 = per-dword greater-than, xmm9 = its low-dword results duplicated,
#   xmm8  = per-dword equality reduced to the high-dword results.
# The pxor with xmm6 inverts that into "index <= l - 1".
movdqa xmm8, xmm4
pxor xmm8, xmm5
movdqa xmm10, xmm8
pcmpgtd xmm10, xmm0
pshufd xmm9, xmm10, 160 # xmm9 = xmm10[0,0,2,2]
pshuflw xmm11, xmm9, 232 # xmm11 = xmm9[0,2,2,3,4,5,6,7]
pcmpeqd xmm8, xmm0
pshufd xmm8, xmm8, 245 # xmm8 = xmm8[1,1,3,3]
pshuflw xmm12, xmm8, 232 # xmm12 = xmm8[0,2,2,3,4,5,6,7]
pand xmm12, xmm11
pshufd xmm10, xmm10, 245 # xmm10 = xmm10[1,1,3,3]
pshuflw xmm11, xmm10, 232 # xmm11 = xmm10[0,2,2,3,4,5,6,7]
por xmm11, xmm12
pxor xmm11, xmm6
packssdw xmm11, xmm11
# Lane 0: store a[rcx] only if its mask bit is set.
movd edx, xmm11
test dl, 1
je .LBB0_4
mov word ptr [rdi + 2*rcx], si
.LBB0_4: # in Loop: Header=BB0_2 Depth=1
# Lane 1: recombines the compare results still held in xmm8/xmm9/xmm10.
pand xmm8, xmm9
por xmm8, xmm10
packssdw xmm8, xmm8
pxor xmm8, xmm6
packssdw xmm8, xmm8
movd edx, xmm8
shr edx, 16
test dl, 1
je .LBB0_6
mov word ptr [rdi + 2*rcx + 2], si
.LBB0_6: # in Loop: Header=BB0_2 Depth=1
# Lanes 2/3 (indices in xmm3): same compare sequence, recomputed from scratch.
movdqa xmm9, xmm3
pxor xmm9, xmm5
movdqa xmm10, xmm9
pcmpgtd xmm10, xmm0
pshufd xmm8, xmm10, 160 # xmm8 = xmm10[0,0,2,2]
pcmpeqd xmm9, xmm0
pshufd xmm9, xmm9, 245 # xmm9 = xmm9[1,1,3,3]
movdqa xmm11, xmm9
pand xmm11, xmm8
pshufd xmm10, xmm10, 245 # xmm10 = xmm10[1,1,3,3]
por xmm11, xmm10
packssdw xmm11, xmm11
pxor xmm11, xmm6
packssdw xmm11, xmm11
# Lane 2: conditional store of a[rcx + 2].
pextrw edx, xmm11, 2
test dl, 1
je .LBB0_8
mov word ptr [rdi + 2*rcx + 4], si
.LBB0_8: # in Loop: Header=BB0_2 Depth=1
# Lane 3: conditional store of a[rcx + 3].
pshufhw xmm8, xmm8, 132 # xmm8 = xmm8[0,1,2,3,4,5,4,6]
pshufhw xmm9, xmm9, 132 # xmm9 = xmm9[0,1,2,3,4,5,4,6]
pand xmm9, xmm8
pshufhw xmm8, xmm10, 132 # xmm8 = xmm10[0,1,2,3,4,5,4,6]
por xmm8, xmm9
pxor xmm8, xmm6
packssdw xmm8, xmm8
pextrw edx, xmm8, 3
test dl, 1
je .LBB0_10
mov word ptr [rdi + 2*rcx + 6], si
.LBB0_10: # in Loop: Header=BB0_2 Depth=1
# Lanes 4/5 (indices in xmm2): same instruction sequence as the lanes 0/1 block.
movdqa xmm8, xmm2
pxor xmm8, xmm5
movdqa xmm10, xmm8
pcmpgtd xmm10, xmm0
pshufd xmm9, xmm10, 160 # xmm9 = xmm10[0,0,2,2]
pshuflw xmm11, xmm9, 232 # xmm11 = xmm9[0,2,2,3,4,5,6,7]
pcmpeqd xmm8, xmm0
pshufd xmm8, xmm8, 245 # xmm8 = xmm8[1,1,3,3]
pshuflw xmm12, xmm8, 232 # xmm12 = xmm8[0,2,2,3,4,5,6,7]
pand xmm12, xmm11
pshufd xmm10, xmm10, 245 # xmm10 = xmm10[1,1,3,3]
pshuflw xmm11, xmm10, 232 # xmm11 = xmm10[0,2,2,3,4,5,6,7]
por xmm11, xmm12
pxor xmm11, xmm6
packssdw xmm11, xmm11
# Lane 4: conditional store of a[rcx + 4].
pextrw edx, xmm11, 4
test dl, 1
je .LBB0_12
mov word ptr [rdi + 2*rcx + 8], si
.LBB0_12: # in Loop: Header=BB0_2 Depth=1
# Lane 5: conditional store of a[rcx + 5].
pand xmm8, xmm9
por xmm8, xmm10
packssdw xmm8, xmm8
pxor xmm8, xmm6
packssdw xmm8, xmm8
pextrw edx, xmm8, 5
test dl, 1
je .LBB0_14
mov word ptr [rdi + 2*rcx + 10], si
.LBB0_14: # in Loop: Header=BB0_2 Depth=1
# Lanes 6/7 (indices in xmm1): same instruction sequence as the lanes 2/3 block.
movdqa xmm9, xmm1
pxor xmm9, xmm5
movdqa xmm10, xmm9
pcmpgtd xmm10, xmm0
pshufd xmm8, xmm10, 160 # xmm8 = xmm10[0,0,2,2]
pcmpeqd xmm9, xmm0
pshufd xmm9, xmm9, 245 # xmm9 = xmm9[1,1,3,3]
movdqa xmm11, xmm9
pand xmm11, xmm8
pshufd xmm10, xmm10, 245 # xmm10 = xmm10[1,1,3,3]
por xmm11, xmm10
packssdw xmm11, xmm11
pxor xmm11, xmm6
packssdw xmm11, xmm11
# Lane 6: conditional store of a[rcx + 6].
pextrw edx, xmm11, 6
test dl, 1
je .LBB0_16
mov word ptr [rdi + 2*rcx + 12], si
.LBB0_16: # in Loop: Header=BB0_2 Depth=1
# Lane 7: conditional store of a[rcx + 7].
pshufhw xmm8, xmm8, 132 # xmm8 = xmm8[0,1,2,3,4,5,4,6]
pshufhw xmm9, xmm9, 132 # xmm9 = xmm9[0,1,2,3,4,5,4,6]
pand xmm9, xmm8
pshufhw xmm8, xmm10, 132 # xmm8 = xmm10[0,1,2,3,4,5,4,6]
por xmm8, xmm9
pxor xmm8, xmm6
packssdw xmm8, xmm8
pextrw edx, xmm8, 7
test dl, 1
je .LBB0_18
mov word ptr [rdi + 2*rcx + 14], si
.LBB0_18: # in Loop: Header=BB0_2 Depth=1
# Advance the scalar index and all four index-pair vectors by 8, then loop
# until rcx reaches the rounded-up bound in rax.
add rcx, 8
paddq xmm4, xmm7
paddq xmm3, xmm7
paddq xmm2, xmm7
paddq xmm1, xmm7
cmp rax, rcx
jne .LBB0_2
.LBB0_19:
ret
Compiler Explorer: https://godbolt.org/z/9jfYh8Wz8; it appears the first version with this behavior was clang 12.
Use of SIMD here (the loop is unrolled, not vectorized) is completely unnecessary; the output of `-Oz`, which is a simple scalar non-unrolled loop, is ~5x faster in a simple test.
The "optimized" IR in question:
IR
; Optimized IR for fill_i16. Each trip through %vector.body handles 8 consecutive
; indices: an <8 x i1> mask is computed once, then every lane gets its own
; extractelement + branch + scalar i16 store. There is no vector store.
define dso_local void @fill_i16(ptr nocapture noundef writeonly %a, i16 noundef signext %v, i64 noundef %l) local_unnamed_addr {
entry:
; %l == 0: return without entering the loop.
%cmp3.not = icmp eq i64 %l, 0
br i1 %cmp3.not, label %for.cond.cleanup, label %vector.ph
vector.ph:
; %n.vec = %l rounded up to a multiple of 8 (the loop bound);
; %broadcast.splat = (%l - 1) replicated into all 8 lanes.
%n.rnd.up = add i64 %l, 7
%n.vec = and i64 %n.rnd.up, -8
%trip.count.minus.1 = add i64 %l, -1
%broadcast.splatinsert = insertelement <8 x i64> poison, i64 %trip.count.minus.1, i64 0
%broadcast.splat = shufflevector <8 x i64> %broadcast.splatinsert, <8 x i64> poison, <8 x i32> zeroinitializer
br label %vector.body
vector.body:
; %index is the scalar base index; %vec.ind holds %index + 0..7.
; %0 is the per-lane mask "lane index <= %l - 1" (unsigned compare).
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %pred.store.continue18 ]
%vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %vec.ind.next, %pred.store.continue18 ]
%0 = icmp ule <8 x i64> %vec.ind, %broadcast.splat
%1 = extractelement <8 x i1> %0, i64 0
br i1 %1, label %pred.store.if, label %pred.store.continue
pred.store.if:
; Lane 0: store %v to a[%index].
%2 = getelementptr inbounds i16, ptr %a, i64 %index
store i16 %v, ptr %2, align 2
br label %pred.store.continue
pred.store.continue:
%3 = extractelement <8 x i1> %0, i64 1
br i1 %3, label %pred.store.if5, label %pred.store.continue6
pred.store.if5:
; Lane 1: store %v to a[%index | 1].
%4 = or i64 %index, 1
%5 = getelementptr inbounds i16, ptr %a, i64 %4
store i16 %v, ptr %5, align 2
br label %pred.store.continue6
pred.store.continue6:
%6 = extractelement <8 x i1> %0, i64 2
br i1 %6, label %pred.store.if7, label %pred.store.continue8
pred.store.if7:
; Lane 2: store %v to a[%index | 2].
%7 = or i64 %index, 2
%8 = getelementptr inbounds i16, ptr %a, i64 %7
store i16 %v, ptr %8, align 2
br label %pred.store.continue8
pred.store.continue8:
%9 = extractelement <8 x i1> %0, i64 3
br i1 %9, label %pred.store.if9, label %pred.store.continue10
pred.store.if9:
; Lane 3: store %v to a[%index | 3].
%10 = or i64 %index, 3
%11 = getelementptr inbounds i16, ptr %a, i64 %10
store i16 %v, ptr %11, align 2
br label %pred.store.continue10
pred.store.continue10:
%12 = extractelement <8 x i1> %0, i64 4
br i1 %12, label %pred.store.if11, label %pred.store.continue12
pred.store.if11:
; Lane 4: store %v to a[%index | 4].
%13 = or i64 %index, 4
%14 = getelementptr inbounds i16, ptr %a, i64 %13
store i16 %v, ptr %14, align 2
br label %pred.store.continue12
pred.store.continue12:
%15 = extractelement <8 x i1> %0, i64 5
br i1 %15, label %pred.store.if13, label %pred.store.continue14
pred.store.if13:
; Lane 5: store %v to a[%index | 5].
%16 = or i64 %index, 5
%17 = getelementptr inbounds i16, ptr %a, i64 %16
store i16 %v, ptr %17, align 2
br label %pred.store.continue14
pred.store.continue14:
%18 = extractelement <8 x i1> %0, i64 6
br i1 %18, label %pred.store.if15, label %pred.store.continue16
pred.store.if15:
; Lane 6: store %v to a[%index | 6].
%19 = or i64 %index, 6
%20 = getelementptr inbounds i16, ptr %a, i64 %19
store i16 %v, ptr %20, align 2
br label %pred.store.continue16
pred.store.continue16:
%21 = extractelement <8 x i1> %0, i64 7
br i1 %21, label %pred.store.if17, label %pred.store.continue18
pred.store.if17:
; Lane 7: store %v to a[%index | 7].
%22 = or i64 %index, 7
%23 = getelementptr inbounds i16, ptr %a, i64 %22
store i16 %v, ptr %23, align 2
br label %pred.store.continue18
pred.store.continue18:
; Advance the scalar and vector indices by 8; exit once %index.next equals %n.vec.
%index.next = add i64 %index, 8
%vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
%24 = icmp eq i64 %index.next, %n.vec
br i1 %24, label %for.cond.cleanup, label %vector.body
for.cond.cleanup:
ret void
}