From 9af7ae898ca22f3c24f92f7f1ac3ac108d324596 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Sun, 9 Dec 2018 12:02:56 +0000
Subject: [PATCH 1/3] [X86] Add test for PR39926; NFC

The test file shows a case where the avoid store forwarding block pass
fails to copy a range (-1..1) when the load displacement changes sign.

Baseline test for D55485.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@348712 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/pr39926.ll | 48 +++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 test/CodeGen/X86/pr39926.ll

diff --git a/test/CodeGen/X86/pr39926.ll b/test/CodeGen/X86/pr39926.ll
new file mode 100644
index 000000000000..5b2defbbfeae
--- /dev/null
+++ b/test/CodeGen/X86/pr39926.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx | FileCheck %s
+define i8 @test_offset(i8* %base) {
+; CHECK-LABEL: test_offset:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movb $0, 7(%rdi)
+; CHECK-NEXT:    movw $0, 5(%rdi)
+; CHECK-NEXT:    movl $0, 1(%rdi)
+; CHECK-NEXT:    movzwl -4(%rdi), %eax
+; CHECK-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb -2(%rdi), %al
+; CHECK-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movl 1(%rdi), %eax
+; CHECK-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movzwl 5(%rdi), %eax
+; CHECK-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb 7(%rdi), %al
+; CHECK-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movl 8(%rdi), %eax
+; CHECK-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+entry:
+  %z = alloca [128 x i8], align 16
+  %gep0 = getelementptr inbounds i8, i8* %base, i64 7
+  store volatile i8 0, i8* %gep0
+  %gep1 = getelementptr inbounds i8, i8* %base, i64 5
+  %bc1 = bitcast i8* %gep1 to i16*
+  store volatile i16 0, i16* %bc1
+  %gep2 = getelementptr inbounds i8, i8* %base, i64 1
+  %bc2 = bitcast i8* %gep2 to i32*
+  store volatile i32 0, i32* %bc2
+
+  %y1 = getelementptr inbounds i8, i8* %base, i64 -4
+  %y2 = bitcast [128 x i8]* %z to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %y2, i8* %y1, i64 16, i1 false)
+
+  %gep4 = getelementptr inbounds [128 x i8], [128 x i8]* %z, i64 0, i64 4
+  %ret = load i8, i8* %gep4
+  ret i8 %ret
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1)

From 47d569d15ae382d6700273e87f74555ee0f3e787 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Mon, 10 Dec 2018 10:16:50 +0000
Subject: [PATCH 2/3] [X86] Fix AvoidStoreForwardingBlocks pass for negative displacements

Fixes https://bugs.llvm.org/show_bug.cgi?id=39926.

The size of the first copy was computed as
std::abs(std::abs(LdDisp2) - std::abs(LdDisp1)), which results in
skipped bytes if the signs of LdDisp2 and LdDisp1 differ. As far as I
can see, this should just be LdDisp2 - LdDisp1. The case where
LdDisp1 > LdDisp2 is already handled in the code above, in which case
LdDisp2 is set to LdDisp1 and this subtraction will evaluate to
Size1 = 0, which is the correct value to skip an overlapping copy.
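As an illustration (a standalone sketch, not code from the pass), the
displacements in the test above are LdDisp1 = -4 for the copied load and
LdDisp2 = 1 at the blocking store:

  // Standalone sketch; LdDisp1/LdDisp2 values are assumed from test_offset
  // above (memcpy load at base-4, blocking store at base+1).
  #include <cassert>
  #include <cstdint>
  #include <cstdlib>

  int main() {
    int64_t LdDisp1 = -4; // displacement of the copied load
    int64_t LdDisp2 = 1;  // load displacement at the blocking store
    int64_t OldSize1 = std::abs(std::abs(LdDisp2) - std::abs(LdDisp1)); // 3
    int64_t NewSize1 = LdDisp2 - LdDisp1;                               // 5
    // The old size stops the first copy at byte -2, leaving a gap before
    // the blocking store; the new size covers bytes -4..0 as intended.
    assert(OldSize1 == 3 && NewSize1 == 5);
    return 0;
  }

When both displacements have the same sign (and LdDisp1 <= LdDisp2) the two
expressions agree, which is why the problem only shows up when the load
displacement changes sign.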
Differential Revision: https://reviews.llvm.org/D55485

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@348750 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp | 2 +-
 test/CodeGen/X86/pr39926.ll                      | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index eb9c4b3e5977..2850baf7a65e 100644
--- a/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -586,7 +586,7 @@ void X86AvoidSFBPass::breakBlockedCopies(
       StDisp2 += OverlapDelta;
       Size2 -= OverlapDelta;
     }
-    Size1 = std::abs(std::abs(LdDisp2) - std::abs(LdDisp1));
+    Size1 = LdDisp2 - LdDisp1;
 
     // Build a copy for the point until the current blocking store's
     // displacement.
diff --git a/test/CodeGen/X86/pr39926.ll b/test/CodeGen/X86/pr39926.ll
index 5b2defbbfeae..c22e4f2f9a8b 100644
--- a/test/CodeGen/X86/pr39926.ll
+++ b/test/CodeGen/X86/pr39926.ll
@@ -8,9 +8,9 @@ define i8 @test_offset(i8* %base) {
 ; CHECK-NEXT:    movb $0, 7(%rdi)
 ; CHECK-NEXT:    movw $0, 5(%rdi)
 ; CHECK-NEXT:    movl $0, 1(%rdi)
-; CHECK-NEXT:    movzwl -4(%rdi), %eax
-; CHECK-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    movb -2(%rdi), %al
+; CHECK-NEXT:    movl -4(%rdi), %eax
+; CHECK-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb (%rdi), %al
 ; CHECK-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl 1(%rdi), %eax
 ; CHECK-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)

From 85ec369ea8b3729792da9ec3ab200e734fdd9c2d Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Wed, 12 Dec 2018 23:19:03 +0000
Subject: [PATCH 3/3] [InstCombine] Fix negative GEP offset evaluation for 32-bit pointers

This fixes https://bugs.llvm.org/show_bug.cgi?id=39908.

The evaluateGEPOffsetExpression() function simplifies GEP offsets for
use in comparisons against zero, basically by converting
X*Scale+Offset==0 to X+Offset/Scale==0 if Scale divides Offset.
However, before this is done, Offset is masked down to the pointer
size. This produces incorrect results for negative Offsets, because we
basically end up dividing the 32-bit offset *zero* extended to 64 bits
(rather than sign extended).

Fix this by explicitly sign extending the truncated value.
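A minimal sketch of the difference, assuming a 32-bit pointer width and the
constant offset produced by the pr39908.ll test below (Scale = 8 for %S,
Offset = -8 for the trailing getelementptr of -1); this is only illustrative
arithmetic, not the InstCombine code:

  // Illustrative sketch; IntPtrWidth, Scale and Offset are assumed from the
  // pr39908.ll test, and SignExtend64 is paraphrased with plain casts.
  #include <cassert>
  #include <cstdint>

  int main() {
    const unsigned IntPtrWidth = 32;  // pointer size from "p:32:32"
    int64_t Offset = -8;              // constant byte offset of the GEP
    int64_t Scale = 8;                // size of %S in bytes

    // Old behaviour: mask down to the pointer width (a zero extension).
    uint64_t PtrSizeMask = ~0ULL >> (64 - IntPtrWidth);
    int64_t ZeroExt = Offset & PtrSizeMask;     // 0xFFFFFFF8 = 4294967288
    // New behaviour: truncate and sign extend, as SignExtend64(Offset, 32) does.
    int64_t SignExt = int64_t(int32_t(Offset)); // -8

    assert(ZeroExt / Scale == 536870911); // wrong quotient for X + Offset/Scale == 0
    assert(SignExt / Scale == -1);        // lets "8*X - 8 == 0" fold to "X == 1"
    return 0;
  }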
Differential Revision: https://reviews.llvm.org/D55449

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@348987 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/InstCombineCompares.cpp |  8 ++-
 test/Transforms/InstCombine/pr39908.ll  | 49 +++++++++++++++++++
 2 files changed, 52 insertions(+), 5 deletions(-)
 create mode 100644 test/Transforms/InstCombine/pr39908.ll

diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 2ba1174517ff..17d1f0359c52 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -522,11 +522,9 @@ static Value *evaluateGEPOffsetExpression(User *GEP, InstCombiner &IC,
   }
 
   // Otherwise, there is an index. The computation we will do will be modulo
-  // the pointer size, so get it.
-  uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);
-
-  Offset &= PtrSizeMask;
-  VariableScale &= PtrSizeMask;
+  // the pointer size.
+  Offset = SignExtend64(Offset, IntPtrWidth);
+  VariableScale = SignExtend64(VariableScale, IntPtrWidth);
 
   // To do this transformation, any constant index must be a multiple of the
   // variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i",
diff --git a/test/Transforms/InstCombine/pr39908.ll b/test/Transforms/InstCombine/pr39908.ll
new file mode 100644
index 000000000000..bd7a82990ad8
--- /dev/null
+++ b/test/Transforms/InstCombine/pr39908.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "p:32:32"
+
+%S = type { [2 x i32] }
+
+define i1 @test([0 x %S]* %p, i32 %n) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %start.cast = bitcast [0 x %S]* %p to %S*
+  %end = getelementptr inbounds [0 x %S], [0 x %S]* %p, i32 0, i32 %n, i32 0, i32 0
+  %end.cast = bitcast i32* %end to %S*
+  %last = getelementptr inbounds %S, %S* %end.cast, i32 -1
+  %cmp = icmp eq %S* %last, %start.cast
+  ret i1 %cmp
+}
+
+; Same test using 64-bit indices.
+define i1 @test64([0 x %S]* %p, i64 %n) {
+; CHECK-LABEL: @test64(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %start.cast = bitcast [0 x %S]* %p to %S*
+  %end = getelementptr inbounds [0 x %S], [0 x %S]* %p, i64 0, i64 %n, i32 0, i64 0
+  %end.cast = bitcast i32* %end to %S*
+  %last = getelementptr inbounds %S, %S* %end.cast, i64 -1
+  %cmp = icmp eq %S* %last, %start.cast
+  ret i1 %cmp
+}
+
+; Here the offset overflows and is treated modulo 2^32. This is UB.
+define i1 @test64_overflow([0 x %S]* %p, i64 %n) {
+; CHECK-LABEL: @test64_overflow(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[N:%.*]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %start.cast = bitcast [0 x %S]* %p to %S*
+  %end = getelementptr inbounds [0 x %S], [0 x %S]* %p, i64 0, i64 %n, i32 0, i64 8589934592
+  %end.cast = bitcast i32* %end to %S*
+  %last = getelementptr inbounds %S, %S* %end.cast, i64 -1
+  %cmp = icmp eq %S* %last, %start.cast
+  ret i1 %cmp
+}
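For reference, the fold in @test above works out as follows (my own arithmetic,
not part of the commit): %S is 8 bytes under the p:32:32 datalayout, so %end
lies at byte offset 8*%n from %p, %last at 8*%n - 8, and %start.cast at offset
0. The pointer equality is therefore the offset expression 8*%n + (-8) == 0;
once Offset = -8 is sign extended instead of being masked to 0xFFFFFFF8, the
scale 8 divides it and the comparison simplifies to %n == 1, matching the
autogenerated CHECK lines.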