-
Notifications
You must be signed in to change notification settings - Fork 15.5k
Description
Repro: https://godbolt.org/z/dbhsnTWG9
LLVM should respect floating-point overflow and precision effects. For example, the (good) test from the repro above,
; Returns (%a + %b) - %a, computed as two separate float operations.
; Neither instruction carries fast-math flags.
define float @test_single(float %a, float %b) {
%sum = fadd float %a, %b
; Subtract %a back out of the intermediate sum.
%res = fsub float %sum, %a
ret float %res
}
is not instcombined into ret float %b, because (a + b) could overflow to infinity, in which case the result would be infinity or NaN (depending on what %a is). This behavior should be preserved.
However, the Loop Vectorizer seems to ignore these semantics.
opt -passes=loop-vectorize -force-vector-width=2
on the following test:
; Scalar reduction loop: starting from 0.0, subtracts pa[i] * pb[i] from a
; running float accumulator for i = 0 .. %length - 1 and returns the result.
; Returns 0.0 without entering the loop when %length is 0.
; The fmul/fsub carry no fast-math flags.
define float @test_vector(ptr %pa, ptr %pb, i32 %length) {
entry:
; Guard: skip the loop entirely when %length is 0.
%should_execute = icmp ne i32 %length, 0
br i1 %should_execute, label %loop, label %empty
loop:
; %iv is the element index; %sum is the accumulator carried across iterations.
%iv = phi i32 [0, %entry], [%iv.next, %loop]
%sum = phi float [0.0, %entry], [%sum.next, %loop]
%a.gep = getelementptr float, ptr %pa, i32 %iv
%b.gep = getelementptr float, ptr %pb, i32 %iv
%a = load float, ptr %a.gep, align 4
%b = load float, ptr %b.gep, align 4
%mul = fmul float %a, %b
; sum -= a[i] * b[i], applied strictly in index order.
%sum.next = fsub float %sum, %mul
%iv.next = add nuw nsw i32 %iv, 1
; Keep looping while the next index is still below %length (unsigned compare).
%loop.cond = icmp ult i32 %iv.next, %length
br i1 %loop.cond, label %loop, label %done
done:
ret float %sum.next
empty:
ret float 0.0
}
leads to the classical two-accumulator + add-reduce vectorization:
; Vectorized form of @test_vector as shown in this report: a vector loop that
; processes two elements per iteration with a <2 x float> accumulator, a middle
; block that collapses the accumulator to a scalar, and the original scalar
; loop retained to handle any leftover iterations.
define float @test_vector(ptr %pa, ptr %pb, i32 %length) {
entry:
%should_execute = icmp ne i32 %length, 0
br i1 %should_execute, label %loop.preheader, label %empty
loop.preheader:
; With fewer than 2 iterations, bypass the vector loop and run only the scalar loop.
%min.iters.check = icmp ult i32 %length, 2
br i1 %min.iters.check, label %scalar.ph, label %vector.ph
vector.ph:
; %n.vec = %length rounded down to a multiple of 2 (iterations covered by the vector loop).
%n.mod.vf = urem i32 %length, 2
%n.vec = sub i32 %length, %n.mod.vf
br label %vector.body
vector.body:
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; Two independent accumulators: lane 0 starts at 0.0, lane 1 at -0.0.
%vec.phi = phi <2 x float> [ <float 0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %3, %vector.body ]
%0 = getelementptr float, ptr %pa, i32 %index
%1 = getelementptr float, ptr %pb, i32 %index
%wide.load = load <2 x float>, ptr %0, align 4
%wide.load1 = load <2 x float>, ptr %1, align 4
%2 = fmul <2 x float> %wide.load, %wide.load1
; Each lane subtracts only its own products: lane 0 sees elements %index
; (even indices), lane 1 sees elements %index + 1 (odd indices).
%3 = fsub <2 x float> %vec.phi, %2
%index.next = add nuw i32 %index, 2
%4 = icmp eq i32 %index.next, %n.vec
br i1 %4, label %middle.block, label %vector.body
middle.block:
; Combine the two per-lane partial results into one scalar by adding them.
%5 = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %3)
; If the vector loop covered every iteration, return the reduced value directly.
%cmp.n = icmp eq i32 %length, %n.vec
br i1 %cmp.n, label %done, label %scalar.ph
scalar.ph:
; Resume values for the scalar loop: continue from %n.vec with the reduced
; accumulator after the vector loop, or start from index 0 and 0.0 when the
; vector loop was bypassed.
%bc.resume.val = phi i32 [ %n.vec, %middle.block ], [ 0, %loop.preheader ]
%bc.merge.rdx = phi float [ %5, %middle.block ], [ 0.000000e+00, %loop.preheader ]
br label %loop
loop:
; Original scalar loop body, now used for the remaining iterations.
%iv = phi i32 [ %iv.next, %loop ], [ %bc.resume.val, %scalar.ph ]
%sum = phi float [ %sum.next, %loop ], [ %bc.merge.rdx, %scalar.ph ]
%a.gep = getelementptr float, ptr %pa, i32 %iv
%b.gep = getelementptr float, ptr %pb, i32 %iv
%a = load float, ptr %a.gep, align 4
%b = load float, ptr %b.gep, align 4
%mul = fmul float %a, %b
%sum.next = fsub float %sum, %mul
%iv.next = add nuw nsw i32 %iv, 1
%loop.cond = icmp ult i32 %iv.next, %length
br i1 %loop.cond, label %loop, label %done
done:
; Final value comes from the scalar loop or straight from the middle block.
%sum.next.lcssa = phi float [ %sum.next, %loop ], [ %5, %middle.block ]
ret float %sum.next.lcssa
empty:
ret float 0.000000e+00
}
; Declaration of the float add-reduction intrinsic over <2 x float>
; (scalar start value, vector operand), with its attribute group #0.
declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>) #0
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
This is downright broken in multiple ways. For example, for some huge x, let array a be {x, x, x, ..., x} and array b be {1, -1, 1, -1, ..., 1, -1}.
Because the loop subtracts each product, the accumulator for the even elements can overflow to negative infinity and the accumulator for the odd elements to positive infinity, so adding them yields NaN, while the original scalar loop returns 0.
With other input data, it could also lead to precision problems.
It seems that the Loop Vectorizer doesn't respect these semantics.