Skip to content

Bizarre optimization of simple array-filling loop in -Os #66652

Closed
@dzaima

Description

@dzaima

This code:

#include<stdint.h>
#include<stddef.h>
void fill_i16(int16_t* a, int16_t v, size_t l) {
  for (size_t i = 0; i < l; i++) a[i] = v;
}

compiled with -Os on either x86-64 or aarch64, leads to bizarre code, e.g. x86-64:

bizarre x86-64
.LCPI0_0:
        .quad   6                               # 0x6
        .quad   7                               # 0x7
.LCPI0_1:
        .quad   4                               # 0x4
        .quad   5                               # 0x5
.LCPI0_2:
        .quad   2                               # 0x2
        .quad   3                               # 0x3
.LCPI0_3:
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   1                               # 0x1
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
        .byte   0                               # 0x0
.LCPI0_4:
        .quad   -9223372034707292160            # 0x8000000080000000
        .quad   -9223372034707292160            # 0x8000000080000000
.LCPI0_5:
        .quad   8                               # 0x8
        .quad   8                               # 0x8
fill_i16:                               # @fill_i16
        test    rdx, rdx
        je      .LBB0_19
        lea     rax, [rdx + 7]
        and     rax, -8
        dec     rdx
        movq    xmm0, rdx
        pshufd  xmm0, xmm0, 68                  # xmm0 = xmm0[0,1,0,1]
        movdqa  xmm1, xmmword ptr [rip + .LCPI0_0] # xmm1 = [6,7]
        movdqa  xmm2, xmmword ptr [rip + .LCPI0_1] # xmm2 = [4,5]
        movdqa  xmm3, xmmword ptr [rip + .LCPI0_2] # xmm3 = [2,3]
        movdqa  xmm4, xmmword ptr [rip + .LCPI0_3] # xmm4 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]
        xor     ecx, ecx
        movdqa  xmm5, xmmword ptr [rip + .LCPI0_4] # xmm5 = [9223372039002259456,9223372039002259456]
        pxor    xmm0, xmm5
        pcmpeqd xmm6, xmm6
        movdqa  xmm7, xmmword ptr [rip + .LCPI0_5] # xmm7 = [8,8]
.LBB0_2:                                # =>This Inner Loop Header: Depth=1
        movdqa  xmm8, xmm4
        pxor    xmm8, xmm5
        movdqa  xmm10, xmm8
        pcmpgtd xmm10, xmm0
        pshufd  xmm9, xmm10, 160                # xmm9 = xmm10[0,0,2,2]
        pshuflw xmm11, xmm9, 232                # xmm11 = xmm9[0,2,2,3,4,5,6,7]
        pcmpeqd xmm8, xmm0
        pshufd  xmm8, xmm8, 245                 # xmm8 = xmm8[1,1,3,3]
        pshuflw xmm12, xmm8, 232                # xmm12 = xmm8[0,2,2,3,4,5,6,7]
        pand    xmm12, xmm11
        pshufd  xmm10, xmm10, 245               # xmm10 = xmm10[1,1,3,3]
        pshuflw xmm11, xmm10, 232               # xmm11 = xmm10[0,2,2,3,4,5,6,7]
        por     xmm11, xmm12
        pxor    xmm11, xmm6
        packssdw        xmm11, xmm11
        movd    edx, xmm11
        test    dl, 1
        je      .LBB0_4
        mov     word ptr [rdi + 2*rcx], si
.LBB0_4:                                #   in Loop: Header=BB0_2 Depth=1
        pand    xmm8, xmm9
        por     xmm8, xmm10
        packssdw        xmm8, xmm8
        pxor    xmm8, xmm6
        packssdw        xmm8, xmm8
        movd    edx, xmm8
        shr     edx, 16
        test    dl, 1
        je      .LBB0_6
        mov     word ptr [rdi + 2*rcx + 2], si
.LBB0_6:                                #   in Loop: Header=BB0_2 Depth=1
        movdqa  xmm9, xmm3
        pxor    xmm9, xmm5
        movdqa  xmm10, xmm9
        pcmpgtd xmm10, xmm0
        pshufd  xmm8, xmm10, 160                # xmm8 = xmm10[0,0,2,2]
        pcmpeqd xmm9, xmm0
        pshufd  xmm9, xmm9, 245                 # xmm9 = xmm9[1,1,3,3]
        movdqa  xmm11, xmm9
        pand    xmm11, xmm8
        pshufd  xmm10, xmm10, 245               # xmm10 = xmm10[1,1,3,3]
        por     xmm11, xmm10
        packssdw        xmm11, xmm11
        pxor    xmm11, xmm6
        packssdw        xmm11, xmm11
        pextrw  edx, xmm11, 2
        test    dl, 1
        je      .LBB0_8
        mov     word ptr [rdi + 2*rcx + 4], si
.LBB0_8:                                #   in Loop: Header=BB0_2 Depth=1
        pshufhw xmm8, xmm8, 132                 # xmm8 = xmm8[0,1,2,3,4,5,4,6]
        pshufhw xmm9, xmm9, 132                 # xmm9 = xmm9[0,1,2,3,4,5,4,6]
        pand    xmm9, xmm8
        pshufhw xmm8, xmm10, 132                # xmm8 = xmm10[0,1,2,3,4,5,4,6]
        por     xmm8, xmm9
        pxor    xmm8, xmm6
        packssdw        xmm8, xmm8
        pextrw  edx, xmm8, 3
        test    dl, 1
        je      .LBB0_10
        mov     word ptr [rdi + 2*rcx + 6], si
.LBB0_10:                               #   in Loop: Header=BB0_2 Depth=1
        movdqa  xmm8, xmm2
        pxor    xmm8, xmm5
        movdqa  xmm10, xmm8
        pcmpgtd xmm10, xmm0
        pshufd  xmm9, xmm10, 160                # xmm9 = xmm10[0,0,2,2]
        pshuflw xmm11, xmm9, 232                # xmm11 = xmm9[0,2,2,3,4,5,6,7]
        pcmpeqd xmm8, xmm0
        pshufd  xmm8, xmm8, 245                 # xmm8 = xmm8[1,1,3,3]
        pshuflw xmm12, xmm8, 232                # xmm12 = xmm8[0,2,2,3,4,5,6,7]
        pand    xmm12, xmm11
        pshufd  xmm10, xmm10, 245               # xmm10 = xmm10[1,1,3,3]
        pshuflw xmm11, xmm10, 232               # xmm11 = xmm10[0,2,2,3,4,5,6,7]
        por     xmm11, xmm12
        pxor    xmm11, xmm6
        packssdw        xmm11, xmm11
        pextrw  edx, xmm11, 4
        test    dl, 1
        je      .LBB0_12
        mov     word ptr [rdi + 2*rcx + 8], si
.LBB0_12:                               #   in Loop: Header=BB0_2 Depth=1
        pand    xmm8, xmm9
        por     xmm8, xmm10
        packssdw        xmm8, xmm8
        pxor    xmm8, xmm6
        packssdw        xmm8, xmm8
        pextrw  edx, xmm8, 5
        test    dl, 1
        je      .LBB0_14
        mov     word ptr [rdi + 2*rcx + 10], si
.LBB0_14:                               #   in Loop: Header=BB0_2 Depth=1
        movdqa  xmm9, xmm1
        pxor    xmm9, xmm5
        movdqa  xmm10, xmm9
        pcmpgtd xmm10, xmm0
        pshufd  xmm8, xmm10, 160                # xmm8 = xmm10[0,0,2,2]
        pcmpeqd xmm9, xmm0
        pshufd  xmm9, xmm9, 245                 # xmm9 = xmm9[1,1,3,3]
        movdqa  xmm11, xmm9
        pand    xmm11, xmm8
        pshufd  xmm10, xmm10, 245               # xmm10 = xmm10[1,1,3,3]
        por     xmm11, xmm10
        packssdw        xmm11, xmm11
        pxor    xmm11, xmm6
        packssdw        xmm11, xmm11
        pextrw  edx, xmm11, 6
        test    dl, 1
        je      .LBB0_16
        mov     word ptr [rdi + 2*rcx + 12], si
.LBB0_16:                               #   in Loop: Header=BB0_2 Depth=1
        pshufhw xmm8, xmm8, 132                 # xmm8 = xmm8[0,1,2,3,4,5,4,6]
        pshufhw xmm9, xmm9, 132                 # xmm9 = xmm9[0,1,2,3,4,5,4,6]
        pand    xmm9, xmm8
        pshufhw xmm8, xmm10, 132                # xmm8 = xmm10[0,1,2,3,4,5,4,6]
        por     xmm8, xmm9
        pxor    xmm8, xmm6
        packssdw        xmm8, xmm8
        pextrw  edx, xmm8, 7
        test    dl, 1
        je      .LBB0_18
        mov     word ptr [rdi + 2*rcx + 14], si
.LBB0_18:                               #   in Loop: Header=BB0_2 Depth=1
        add     rcx, 8
        paddq   xmm4, xmm7
        paddq   xmm3, xmm7
        paddq   xmm2, xmm7
        paddq   xmm1, xmm7
        cmp     rax, rcx
        jne     .LBB0_2
.LBB0_19:
        ret

Compiler Explorer: https://godbolt.org/z/9jfYh8Wz8; it appears the first version with this behavior was clang 12.

Use of SIMD here (the loop is unrolled, not vectorized) is completely unnecessary; the output of -Oz, which is a simple scalar non-unrolled loop, is ~5x faster in a simple test.

The "optimized" IR in question:

IR
define dso_local void @fill_i16(ptr nocapture noundef writeonly %a, i16 noundef signext %v, i64 noundef %l) local_unnamed_addr {
entry:
  %cmp3.not = icmp eq i64 %l, 0
  br i1 %cmp3.not, label %for.cond.cleanup, label %vector.ph

vector.ph:
  %n.rnd.up = add i64 %l, 7
  %n.vec = and i64 %n.rnd.up, -8
  %trip.count.minus.1 = add i64 %l, -1
  %broadcast.splatinsert = insertelement <8 x i64> poison, i64 %trip.count.minus.1, i64 0
  %broadcast.splat = shufflevector <8 x i64> %broadcast.splatinsert, <8 x i64> poison, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %pred.store.continue18 ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %vector.ph ], [ %vec.ind.next, %pred.store.continue18 ]
  %0 = icmp ule <8 x i64> %vec.ind, %broadcast.splat
  %1 = extractelement <8 x i1> %0, i64 0
  br i1 %1, label %pred.store.if, label %pred.store.continue

pred.store.if:
  %2 = getelementptr inbounds i16, ptr %a, i64 %index
  store i16 %v, ptr %2, align 2
  br label %pred.store.continue

pred.store.continue:
  %3 = extractelement <8 x i1> %0, i64 1
  br i1 %3, label %pred.store.if5, label %pred.store.continue6

pred.store.if5:
  %4 = or i64 %index, 1
  %5 = getelementptr inbounds i16, ptr %a, i64 %4
  store i16 %v, ptr %5, align 2
  br label %pred.store.continue6

pred.store.continue6:
  %6 = extractelement <8 x i1> %0, i64 2
  br i1 %6, label %pred.store.if7, label %pred.store.continue8

pred.store.if7:
  %7 = or i64 %index, 2
  %8 = getelementptr inbounds i16, ptr %a, i64 %7
  store i16 %v, ptr %8, align 2
  br label %pred.store.continue8

pred.store.continue8:
  %9 = extractelement <8 x i1> %0, i64 3
  br i1 %9, label %pred.store.if9, label %pred.store.continue10

pred.store.if9:
  %10 = or i64 %index, 3
  %11 = getelementptr inbounds i16, ptr %a, i64 %10
  store i16 %v, ptr %11, align 2
  br label %pred.store.continue10

pred.store.continue10:
  %12 = extractelement <8 x i1> %0, i64 4
  br i1 %12, label %pred.store.if11, label %pred.store.continue12

pred.store.if11:
  %13 = or i64 %index, 4
  %14 = getelementptr inbounds i16, ptr %a, i64 %13
  store i16 %v, ptr %14, align 2
  br label %pred.store.continue12

pred.store.continue12:
  %15 = extractelement <8 x i1> %0, i64 5
  br i1 %15, label %pred.store.if13, label %pred.store.continue14

pred.store.if13:
  %16 = or i64 %index, 5
  %17 = getelementptr inbounds i16, ptr %a, i64 %16
  store i16 %v, ptr %17, align 2
  br label %pred.store.continue14

pred.store.continue14:
  %18 = extractelement <8 x i1> %0, i64 6
  br i1 %18, label %pred.store.if15, label %pred.store.continue16

pred.store.if15:
  %19 = or i64 %index, 6
  %20 = getelementptr inbounds i16, ptr %a, i64 %19
  store i16 %v, ptr %20, align 2
  br label %pred.store.continue16

pred.store.continue16:
  %21 = extractelement <8 x i1> %0, i64 7
  br i1 %21, label %pred.store.if17, label %pred.store.continue18

pred.store.if17:
  %22 = or i64 %index, 7
  %23 = getelementptr inbounds i16, ptr %a, i64 %22
  store i16 %v, ptr %23, align 2
  br label %pred.store.continue18

pred.store.continue18:
  %index.next = add i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
  %24 = icmp eq i64 %index.next, %n.vec
  br i1 %24, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:
  ret void
}

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions