Skip to content

LoopVectorizer: incorrect FP operation reordering #169289

@xortator

Description

@xortator

Repro: https://godbolt.org/z/dbhsnTWG9

LLVM must preserve the overflow and rounding behavior of floating-point operations. For example, the (correctly handled) test from the repro above,

; Computes (%a + %b) - %a. Neither the fadd nor the fsub carries fast-math
; flags, so the expression is not algebraically simplified to %b.
define float @test_single(float %a, float %b) {
    ; The intermediate sum is rounded (and may overflow) before the subtraction.
    %sum = fadd float %a, %b
    %res = fsub float %sum, %a
    ret float %res
}

does not get instcombined into `ret float %b`, because `%a + %b` could overflow to infinity, in which case the result is infinity or NaN (depending on the value of `%a`) rather than `%b`. This behavior must be preserved.

However, the Loop Vectorizer seems to ignore these semantics. Running
opt -passes=loop-vectorize -force-vector-width=2
on the following test

; Scalar reproducer: starting from 0.0, subtracts pa[i] * pb[i] from a single
; running sum for i = 0 .. %length-1, strictly in index order.
; Returns 0.0 when %length is 0. The fmul/fsub carry no fast-math flags.
define float @test_vector(ptr %pa, ptr %pb, i32 %length) {
entry:
  ; Skip the loop entirely for an empty input.
  %should_execute = icmp ne i32 %length, 0
  br i1 %should_execute, label %loop, label %empty

loop:
  %iv = phi i32 [0, %entry], [%iv.next, %loop]
  ; Single accumulator carried from one iteration to the next.
  %sum = phi float [0.0, %entry], [%sum.next, %loop]
  %a.gep = getelementptr float, ptr %pa, i32 %iv
  %b.gep = getelementptr float, ptr %pb, i32 %iv
  %a = load float, ptr %a.gep, align 4
  %b = load float, ptr %b.gep, align 4
  %mul = fmul float %a, %b
  ; sum.next = sum - a[i] * b[i]
  %sum.next = fsub float %sum, %mul
  %iv.next = add nuw nsw i32 %iv, 1
  %loop.cond = icmp ult i32 %iv.next, %length
  br i1 %loop.cond, label %loop, label %done

done:
  ret float %sum.next

empty:
  ret float 0.0
}

produces the classic two-accumulator vectorization followed by an add-reduction:

; Output of the loop vectorizer for the scalar @test_vector reproducer with a
; forced vector width of 2: a vector loop with one partial accumulator per
; lane, a reduction of the lanes in middle.block, and the original scalar loop
; kept for the iterations the vector loop does not cover.
define float @test_vector(ptr %pa, ptr %pb, i32 %length) {
entry:
  %should_execute = icmp ne i32 %length, 0
  br i1 %should_execute, label %loop.preheader, label %empty

loop.preheader:
  ; Fewer than 2 iterations: go straight to the scalar loop.
  %min.iters.check = icmp ult i32 %length, 2
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:
  ; %n.vec = %length rounded down to a multiple of 2 (vector loop trip count).
  %n.mod.vf = urem i32 %length, 2
  %n.vec = sub i32 %length, %n.mod.vf
  br label %vector.body

vector.body:
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Two independent per-lane accumulators: lane 0 starts at 0.0, lane 1 at -0.0.
  %vec.phi = phi <2 x float> [ <float 0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr float, ptr %pa, i32 %index
  %1 = getelementptr float, ptr %pb, i32 %index
  %wide.load = load <2 x float>, ptr %0, align 4
  %wide.load1 = load <2 x float>, ptr %1, align 4
  %2 = fmul <2 x float> %wide.load, %wide.load1
  ; Each lane subtracts only its own elements (even indices in lane 0, odd in
  ; lane 1), which differs from the strictly sequential order of the scalar loop.
  %3 = fsub <2 x float> %vec.phi, %2
  %index.next = add nuw i32 %index, 2
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block:
  ; The two per-lane partial results are combined only here, after the loop.
  %5 = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %3)
  ; If %length was a multiple of 2, no scalar iterations remain.
  %cmp.n = icmp eq i32 %length, %n.vec
  br i1 %cmp.n, label %done, label %scalar.ph

scalar.ph:
  ; Resume values for the scalar loop: continue after the vector loop, or start
  ; from scratch when the vector loop was skipped.
  %bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %loop.preheader ]
  %bc.merge.rdx = phi float [ %5, %middle.block ], [ 0.000000e+00, %loop.preheader ]
  br label %loop

loop:
  ; Original scalar loop, now starting from the resume values above.
  %iv = phi i32 [ %iv.next, %loop ], [ %bc.resume.val, %scalar.ph ]
  %sum = phi float [ %sum.next, %loop ], [ %bc.merge.rdx, %scalar.ph ]
  %a.gep = getelementptr float, ptr %pa, i32 %iv
  %b.gep = getelementptr float, ptr %pb, i32 %iv
  %a = load float, ptr %a.gep, align 4
  %b = load float, ptr %b.gep, align 4
  %mul = fmul float %a, %b
  %sum.next = fsub float %sum, %mul
  %iv.next = add nuw nsw i32 %iv, 1
  %loop.cond = icmp ult i32 %iv.next, %length
  br i1 %loop.cond, label %loop, label %done

done:
  ; Result comes from the scalar loop or directly from the vector reduction.
  %sum.next.lcssa = phi float [ %sum.next, %loop ], [ %5, %middle.block ]
  ret float %sum.next.lcssa

empty:
  ret float 0.000000e+00
}

; Reduction intrinsic used in middle.block to combine the two lanes.
declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>) #0

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

This is downright broken in multiple ways. For example, take some huge finite x, let array a be {x, x, x, ..., x} and array b be {1, -1, 1, -1, ..., 1, -1}.

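In the vectorized code, the accumulator for the even-indexed elements can overflow to negative infinity and the accumulator for the odd-indexed elements to positive infinity (the loop subtracts the products), so the final reduction yields NaN, while the original scalar loop returns 0.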

With other input data, the reordering can also change the result through different rounding, i.e. a loss of precision.

It seems that the Loop Vectorizer does not respect these semantics.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions