From 5226ae4617023e3b8957e9db0b9c2c83ea7e77a2 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 24 Jan 2024 10:57:18 -0800 Subject: [PATCH 1/2] [SLP]Fix PR79229: Check that extractelement is used only in a single node before erasing. Before trying to erase the extractelement instruction, it is not enough to check that it has a single use; we also need to check that it is not used in several nodes because of the preliminary node reordering. (cherry picked from commit 48bbd7658710ef1699bf2a6532ff5830230aacc5) --- .../Transforms/Vectorize/SLPVectorizer.cpp | 11 +- .../extractelement-single-use-many-nodes.ll | 144 ++++++++++++++++++ 2 files changed, 154 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 601d2454c1e16..83f787d7fb624 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10216,7 +10216,16 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { // If the only one use is vectorized - can delete the extractelement // itself. 
if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) { - return !R.ScalarToTreeEntry.count(U); + const TreeEntry *UTE = R.getTreeEntry(U); + return !UTE || R.MultiNodeScalars.contains(U) || + count_if(R.VectorizableTree, + [&](const std::unique_ptr &TE) { + return any_of(TE->UserTreeIndices, + [&](const EdgeInfo &Edge) { + return Edge.UserTE == UTE; + }) && + is_contained(TE->Scalars, EI); + }) != 1; })) continue; R.eraseInstruction(EI); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll new file mode 100644 index 0000000000000..f665dac3282b7 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v3 -S < %s | FileCheck %s + +define void @foo(double %i) { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: double [[I:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> , double [[I]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> , <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> [[TMP7]], double [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: 
[[TMP10:%.*]] = shufflevector <8 x double> [[TMP9]], <8 x double> , <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP5]], i32 6 +; CHECK-NEXT: [[TMP12:%.*]] = fmul <8 x double> [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x double> zeroinitializer, [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x double> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = fcmp ult <8 x double> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = freeze <8 x i1> [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP16]]) +; CHECK-NEXT: br i1 [[TMP17]], label [[BB58:%.*]], label [[BB115:%.*]] +; CHECK: bb115: +; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x double> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 1 +; CHECK-NEXT: [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP22]], <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = fmul <4 x double> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = fmul <4 x double> [[TMP27]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = fptosi <4 x double> [[TMP28]] to <4 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = or <4 x i32> zeroinitializer, [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP30]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = icmp slt i32 [[TMP31]], 32000 +; CHECK-NEXT: 
[[OP_RDX1:%.*]] = select i1 [[OP_RDX]], i32 [[TMP31]], i32 32000 +; CHECK-NEXT: [[I163:%.*]] = fcmp ogt double [[I118]], 0.000000e+00 +; CHECK-NEXT: [[I164:%.*]] = icmp slt i32 0, [[OP_RDX1]] +; CHECK-NEXT: unreachable +; CHECK: bb58: +; CHECK-NEXT: ret void +; +bb: + %i75 = fsub double 0.000000e+00, 0.000000e+00 + %i76 = fsub double 0.000000e+00, 0.000000e+00 + %i77 = fmul double 0.000000e+00, %i75 + %i78 = fmul double 0.000000e+00, %i76 + %i79 = fadd double %i78, 0.000000e+00 + %i80 = fadd double %i79, 0.000000e+00 + %i81 = fcmp ult double %i80, 0.000000e+00 + %i82 = fsub double 0.000000e+00, poison + %i83 = fmul double 0.000000e+00, %i82 + %i84 = fadd double 0.000000e+00, %i83 + %i85 = fadd double %i84, 0.000000e+00 + %i86 = fcmp ult double %i85, 0.000000e+00 + %i87 = fsub double 0.000000e+00, %i + %i88 = fadd double 0.000000e+00, %i77 + %i89 = fadd double %i88, 0.000000e+00 + %i90 = fcmp ult double %i89, 0.000000e+00 + %i91 = fsub double 0.000000e+00, 0.000000e+00 + %i92 = fmul double poison, 0.000000e+00 + %i93 = fadd double %i92, 0.000000e+00 + %i94 = fadd double %i93, 0.000000e+00 + %i95 = fcmp ult double %i94, 0.000000e+00 + %i96 = fadd double %i79, 0.000000e+00 + %i97 = fcmp ult double %i96, 0.000000e+00 + %i98 = fadd double %i84, 0.000000e+00 + %i99 = fcmp ult double %i98, 0.000000e+00 + %i100 = fadd double 0.000000e+00, %i77 + %i101 = fadd double %i100, 0.000000e+00 + %i102 = fcmp ult double %i101, 0.000000e+00 + %i103 = fsub double 0.000000e+00, %i + %i104 = fmul double poison, 0.000000e+00 + %i105 = fadd double %i104, 0.000000e+00 + %i106 = fadd double %i105, 0.000000e+00 + %i107 = fcmp ult double %i106, 0.000000e+00 + %i108 = select i1 %i107, i1 %i102, i1 false + %i109 = select i1 %i108, i1 %i99, i1 false + %i110 = select i1 %i109, i1 %i97, i1 false + %i111 = select i1 %i110, i1 %i95, i1 false + %i112 = select i1 %i111, i1 %i90, i1 false + %i113 = select i1 %i112, i1 %i86, i1 false + %i114 = select i1 %i113, i1 %i81, i1 false + br i1 %i114, label 
%bb58, label %bb115 + +bb115: + %i116 = fmul double 0.000000e+00, %i103 + %i117 = fmul double 0.000000e+00, %i82 + %i118 = fadd double %i116, %i117 + %i120 = fmul double 0.000000e+00, %i75 + %i121 = fmul double 0.000000e+00, %i76 + %i122 = fadd double %i121, 0.000000e+00 + %i123 = fadd double 0.000000e+00, %i120 + %i124 = fmul double 0.000000e+00, %i91 + %i125 = fadd double %i124, %i82 + %i126 = fadd double %i125, 0.000000e+00 + %i127 = fmul double 0.000000e+00, %i87 + %i128 = fadd double %i127, 0.000000e+00 + %i129 = fadd double %i128, 0.000000e+00 + %i130 = fadd double %i122, 0.000000e+00 + %i131 = fadd double %i123, 0.000000e+00 + %i132 = select i1 false, double 0.000000e+00, double %i131 + %i133 = fmul double %i132, 0.000000e+00 + %i134 = fmul double %i133, 0.000000e+00 + %i135 = fptosi double %i134 to i32 + %i136 = or i32 0, %i135 + %i137 = icmp slt i32 %i136, 32000 + %i138 = select i1 %i137, i32 %i136, i32 32000 + %i139 = select i1 false, double 0.000000e+00, double %i130 + %i140 = fmul double %i139, 0.000000e+00 + %i141 = fmul double %i140, 0.000000e+00 + %i142 = fptosi double %i141 to i32 + %i143 = or i32 0, %i142 + %i144 = icmp slt i32 %i143, %i138 + %i145 = select i1 %i144, i32 %i143, i32 %i138 + %i146 = select i1 false, double 0.000000e+00, double %i129 + %i147 = fmul double %i146, 0.000000e+00 + %i148 = fmul double %i147, 0.000000e+00 + %i149 = fptosi double %i148 to i32 + %i150 = or i32 0, %i149 + %i151 = icmp slt i32 %i150, %i145 + %i152 = select i1 %i151, i32 %i150, i32 %i145 + %i153 = select i1 false, double 0.000000e+00, double %i126 + %i154 = fmul double %i153, 0.000000e+00 + %i155 = fmul double %i154, 0.000000e+00 + %i156 = fptosi double %i155 to i32 + %i157 = or i32 0, %i156 + %i158 = icmp slt i32 %i157, %i152 + %i159 = select i1 %i158, i32 %i157, i32 %i152 + %i163 = fcmp ogt double %i118, 0.000000e+00 + %i164 = icmp slt i32 0, %i159 + unreachable + +bb58: + ret void +} From b7a4ff80a4ccaecf1d497db51bfdc9499c3cbb48 Mon Sep 17 00:00:00 2001 From: 
Alexey Bataev Date: Thu, 25 Jan 2024 06:06:15 -0800 Subject: [PATCH 2/2] [SLP]Fix PR79229: Do not erase extractelement, if it is used in a multiregister node. If the node can span several registers and the same extractelement instruction is used in several parts, it may be required to keep such an extractelement instruction to avoid a compiler crash. (cherry picked from commit 6fe21bc1dac883efa0dfa807f327048ae9969b81) --- .../Transforms/Vectorize/SLPVectorizer.cpp | 3 +- .../X86/extractelement-multi-register-use.ll | 107 ++++++++++++++++++ 2 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 83f787d7fb624..0a9e2c7f49f55 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10215,7 +10215,8 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { UniqueBases.insert(VecBase); // If the only one use is vectorized - can delete the extractelement // itself. 
- if (!EI->hasOneUse() || any_of(EI->users(), [&](User *U) { + if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) || + any_of(EI->users(), [&](User *U) { const TreeEntry *UTE = R.getTreeEntry(U); return !UTE || R.MultiNodeScalars.contains(U) || count_if(R.VectorizableTree, diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll new file mode 100644 index 0000000000000..ba406c8f20bb0 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll @@ -0,0 +1,107 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64-v3 < %s | FileCheck %s + +define void @test(double %i) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: double [[I:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> , double [[I]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> zeroinitializer, [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> , <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP4]], i32 7 +; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> zeroinitializer, [[TMP10]] +; 
CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x double> zeroinitializer, [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x double> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = fcmp ult <8 x double> [[TMP13]], zeroinitializer +; CHECK-NEXT: br label [[BB116:%.*]] +; CHECK: bb116: +; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x double> [[TMP15]], i32 1 +; CHECK-NEXT: [[I120:%.*]] = fadd double [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x double> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP19:%.*]] = fmul <2 x double> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[TMP18]], i32 1 +; CHECK-NEXT: [[I128:%.*]] = fadd double [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[I139:%.*]] = call double @llvm.maxnum.f64(double [[I128]], double 0.000000e+00) +; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP22]], <2 x double> zeroinitializer) +; CHECK-NEXT: [[TMP24:%.*]] = fmul <2 x double> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = fptosi <2 x double> [[TMP24]] to <2 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub <2 x i32> zeroinitializer, [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[I147:%.*]] = fcmp ogt double [[I120]], 0.000000e+00 +; CHECK-NEXT: ret void +; +bb: + %i74 = fsub double 0.000000e+00, poison + %i75 = fsub double 0.000000e+00, %i + %i76 = fmul double 0.000000e+00, %i75 + %i77 = fadd double %i76, 0.000000e+00 + %i78 = fadd double %i77, 0.000000e+00 + %i79 = fcmp ult double %i78, 0.000000e+00 + %i81 = fsub double %i, 0.000000e+00 + %i82 = fmul double 0.000000e+00, %i81 + %i83 = fadd double 0.000000e+00, %i82 + 
%i84 = fadd double %i83, 0.000000e+00 + %i85 = fcmp ult double %i84, 0.000000e+00 + %i86 = fsub double 0.000000e+00, %i + %i87 = fmul double 0.000000e+00, %i86 + %i88 = fadd double %i87, 0.000000e+00 + %i89 = fadd double %i88, 0.000000e+00 + %i90 = fcmp ult double %i89, 0.000000e+00 + %i91 = fsub double 0.000000e+00, 0.000000e+00 + %i92 = fmul double 0.000000e+00, 0.000000e+00 + %i93 = fadd double %i92, 0.000000e+00 + %i94 = fadd double %i93, 0.000000e+00 + %i95 = fcmp ult double %i94, 0.000000e+00 + %i96 = fsub double poison, 0.000000e+00 + %i97 = fadd double %i77, 0.000000e+00 + %i98 = fcmp ult double %i97, 0.000000e+00 + %i99 = fadd double %i83, 0.000000e+00 + %i100 = fcmp ult double %i99, 0.000000e+00 + %i101 = fmul double 0.000000e+00, 0.000000e+00 + %i102 = fadd double %i101, 0.000000e+00 + %i103 = fadd double %i102, 0.000000e+00 + %i104 = fcmp ult double %i103, 0.000000e+00 + %i105 = fmul double 0.000000e+00, 0.000000e+00 + %i106 = fadd double %i105, 0.000000e+00 + %i107 = fadd double %i106, 0.000000e+00 + %i108 = fcmp ult double %i107, 0.000000e+00 + br label %bb116 + +bb116: + %i117 = fmul double 0.000000e+00, %i81 + %i119 = fmul double 0.000000e+00, %i96 + %i120 = fadd double %i117, %i119 + %i121 = fmul double 0.000000e+00, %i74 + %i122 = fmul double 0.000000e+00, %i75 + %i123 = fadd double %i122, 0.000000e+00 + %i124 = fmul double 0.000000e+00, %i91 + %i125 = fadd double %i124, 0.000000e+00 + %i127 = fmul double 0.000000e+00, %i86 + %i128 = fadd double %i127, %i121 + %i133 = call double @llvm.maxnum.f64(double %i123, double 0.000000e+00) + %i134 = fmul double %i133, 0.000000e+00 + %i135 = fptosi double %i134 to i32 + %i136 = sub i32 0, %i135 + %i137 = icmp sgt i32 %i136, 0 + %i139 = call double @llvm.maxnum.f64(double %i128, double 0.000000e+00) + %i142 = call double @llvm.maxnum.f64(double %i125, double 0.000000e+00) + %i143 = fmul double %i142, 0.000000e+00 + %i144 = fptosi double %i143 to i32 + %i145 = sub i32 0, %i144 + %i146 = icmp sgt i32 %i145, 0 
+ %i147 = fcmp ogt double %i120, 0.000000e+00 + ret void +} + +declare double @llvm.maxnum.f64(double, double)