Skip to content

Commit

Permalink
[SLP]Add cost estimation for gather node reshuffling
Browse files Browse the repository at this point in the history
Adds cost estimation for the different permutations of the scalar values
used in gather nodes. Currently, SLP unconditionally emits shuffles for
reused buildvectors, but in some cases it is better to leave them as
buildvectors rather than shuffles, namely when the cost of such
buildvectors is lower.

X86, AVX512, -O3+LTO
Metric: size..text

Program                                                                        size..text
                                                                               results     results0    diff
                 test-suite :: External/SPEC/CINT2006/445.gobmk/445.gobmk.test   912998.00   913238.00  0.0%
 test-suite :: MultiSource/Benchmarks/MiBench/consumer-lame/consumer-lame.test   203070.00   203102.00  0.0%
     test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test  1396320.00  1396448.00  0.0%
      test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test  1396320.00  1396448.00  0.0%
                       test-suite :: MultiSource/Benchmarks/Bullet/bullet.test   309790.00   309678.00 -0.0%
      test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12477607.00 12470807.00 -0.1%

CINT2006/445.gobmk - extra code vectorized
MiBench/consumer-lame - small variations
CFP2017speed/638.imagick_s
CFP2017rate/538.imagick_r - extra vectorized code
Benchmarks/Bullet - extra code vectorized
CFP2017rate/526.blender_r - extra vector code

RISC-V, sifive-p670, -O3+LTO
CFP2006/433.milc - regressions, should be fixed by llvm/llvm-project#115173
CFP2006/453.povray - extra vectorized code
CFP2017rate/508.namd_r - better vector code
CFP2017rate/510.parest_r - extra vectorized code
SPEC/CFP2017rate - extra/better vector code
CFP2017rate/526.blender_r - extra vectorized code
CFP2017rate/538.imagick_r - extra vectorized code
CINT2006/403.gcc - extra vectorized code
CINT2006/445.gobmk - extra vectorized code
CINT2006/464.h264ref - extra vectorized code
CINT2006/483.xalancbmk - small variations
CINT2017rate/525.x264_r - better vectorization

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: llvm/llvm-project#115201
  • Loading branch information
alexey-bataev authored Dec 24, 2024
1 parent 2d038ca commit 07d284d
Show file tree
Hide file tree
Showing 19 changed files with 955 additions and 967 deletions.
32 changes: 31 additions & 1 deletion llvm/include/llvm/CodeGen/BasicTTIImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,33 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
}

/// Checks whether \p Mask is a splat mask, i.e. every element is either
/// PoisonMaskElem (-1) or one and the same non-poison index, and that index
/// occurs at least twice.
/// So, mask <0, -1, -1, -1> is not considered a splat (it is just an
/// identity), same for <-1, 0, -1, -1> (just a slide), while <2, -1, 2, -1>
/// is a splat with \p Index=2. An empty mask is never a splat.
///
/// \param Mask the shuffle mask to inspect.
/// \param NumSrcElts the number of elements in a single source vector; any
/// non-poison mask element must be less than 2 * \p NumSrcElts.
/// \param Index output: receives the splatted index. It is written only when
/// the function returns true.
/// \returns true if \p Mask is a splat mask as described above.
static bool isSplatMask(ArrayRef<int> Mask, unsigned NumSrcElts, int &Index) {
  int SplatIdx = PoisonMaskElem;
  // Becomes true once the splat index has been seen a second time.
  bool SeenTwice = false;
  for (int Elt : Mask) {
    if (Elt == PoisonMaskElem)
      continue;
    // Reject indices outside both sources. Negative non-poison values wrap to
    // large unsigned values and are rejected by the same comparison.
    if (static_cast<unsigned>(Elt) >= NumSrcElts * 2)
      return false;
    if (SplatIdx == PoisonMaskElem) {
      // First non-poison element: remember it as the splat candidate.
      SplatIdx = Elt;
      continue;
    }
    if (Elt != SplatIdx)
      return false;
    SeenTwice = true;
  }
  // A single occurrence (or none at all, including an empty mask) is not a
  // splat; leave Index untouched in that case.
  if (!SeenTwice)
    return false;
  Index = SplatIdx;
  return true;
}

protected:
/// Constructs the base TTI implementation by forwarding \p DL to the BaseT
/// constructor. \p TM is accepted as a parameter but is not used by this
/// constructor itself.
explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
: BaseT(DL) {}
Expand Down Expand Up @@ -1014,17 +1041,20 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return Kind;
int NumSrcElts = Ty->getElementCount().getKnownMinValue();
switch (Kind) {
case TTI::SK_PermuteSingleSrc:
case TTI::SK_PermuteSingleSrc: {
if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
return TTI::SK_Reverse;
if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
return TTI::SK_Broadcast;
if (isSplatMask(Mask, NumSrcElts, Index))
return TTI::SK_Broadcast;
if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
(Index + Mask.size()) <= (size_t)NumSrcElts) {
SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
return TTI::SK_ExtractSubvector;
}
break;
}
case TTI::SK_PermuteTwoSrc: {
int NumSubElts;
if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
Expand Down
149 changes: 138 additions & 11 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13199,6 +13199,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
// No perfect match, just shuffle, so choose the first tree node from the
// tree.
Entries.push_back(FirstEntries.front());
VF = FirstEntries.front()->getVectorFactor();
} else {
// Try to find nodes with the same vector factor.
assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
Expand Down Expand Up @@ -13239,6 +13240,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
Entries.push_back(SecondEntries.front());
VF = std::max(Entries.front()->getVectorFactor(),
Entries.back()->getVectorFactor());
} else {
VF = Entries.front()->getVectorFactor();
}
}

Expand Down Expand Up @@ -13350,17 +13353,141 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry(
: Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
IsIdentity &= Mask[Idx] == Pair.second;
}
switch (Entries.size()) {
case 1:
if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
return TargetTransformInfo::SK_PermuteSingleSrc;
break;
case 2:
if (EntryLanes.size() > 2 || VL.size() <= 2)
return TargetTransformInfo::SK_PermuteTwoSrc;
break;
default:
break;
if (ForOrder || IsIdentity || Entries.empty()) {
switch (Entries.size()) {
case 1:
if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
return TargetTransformInfo::SK_PermuteSingleSrc;
break;
case 2:
if (EntryLanes.size() > 2 || VL.size() <= 2)
return TargetTransformInfo::SK_PermuteTwoSrc;
break;
default:
break;
}
} else if (!isa<VectorType>(VL.front()->getType()) &&
(EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
// Estimate the costs to check whether a shuffle is more beneficial than a
// buildvector.
SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
std::next(Mask.begin(), (Part + 1) * VL.size()));
int MinElement = SubMask.front(), MaxElement = SubMask.front();
for (int Idx : SubMask) {
if (Idx == PoisonMaskElem)
continue;
if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
MinElement = Idx;
if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
MaxElement = Idx;
}
assert(MaxElement >= 0 && MinElement >= 0 &&
MaxElement % VF >= MinElement % VF &&
"Expected at least single element.");
unsigned NewVF = std::max<unsigned>(
VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
(MaxElement % VF) -
(MinElement % VF) + 1));
if (NewVF < VF) {
for_each(SubMask, [&](int &Idx) {
if (Idx == PoisonMaskElem)
return;
Idx = (Idx % VF) - (MinElement % VF) +
(Idx >= static_cast<int>(VF) ? NewVF : 0);
});
VF = NewVF;
}

constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
auto *VecTy = getWidenedType(VL.front()->getType(), VF);
auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
auto GetShuffleCost = [&,
&TTI = *TTI](ArrayRef<int> Mask,
ArrayRef<const TreeEntry *> Entries,
VectorType *VecTy) -> InstructionCost {
if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
ShuffleVectorInst::isDeInterleaveMaskOfFactor(
Mask, Entries.front()->getInterleaveFactor()))
return TTI::TCC_Free;
return ::getShuffleCost(TTI,
Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
: TTI::SK_PermuteSingleSrc,
VecTy, Mask, CostKind);
};
InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
InstructionCost FirstShuffleCost = 0;
SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
if (Entries.size() == 1 || !Entries[0]->isGather()) {
FirstShuffleCost = ShuffleCost;
} else {
// Transform mask to include only first entry.
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
bool IsIdentity = true;
for (auto [I, Idx] : enumerate(FirstMask)) {
if (Idx >= static_cast<int>(VF)) {
Idx = PoisonMaskElem;
} else {
DemandedElts.clearBit(I);
if (Idx != PoisonMaskElem)
IsIdentity &= static_cast<int>(I) == Idx;
}
}
if (!IsIdentity)
FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
FirstShuffleCost += TTI->getScalarizationOverhead(
MaskVecTy, DemandedElts, /*Insert=*/true,
/*Extract=*/false, CostKind);
}
InstructionCost SecondShuffleCost = 0;
SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
if (Entries.size() == 1 || !Entries[1]->isGather()) {
SecondShuffleCost = ShuffleCost;
} else {
// Transform mask to include only second entry.
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
bool IsIdentity = true;
for (auto [I, Idx] : enumerate(SecondMask)) {
if (Idx < static_cast<int>(VF) && Idx >= 0) {
Idx = PoisonMaskElem;
} else {
DemandedElts.clearBit(I);
if (Idx != PoisonMaskElem) {
Idx -= VF;
IsIdentity &= static_cast<int>(I) == Idx;
}
}
}
if (!IsIdentity)
SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
SecondShuffleCost += TTI->getScalarizationOverhead(
MaskVecTy, DemandedElts, /*Insert=*/true,
/*Extract=*/false, CostKind);
}
APInt DemandedElts = APInt::getAllOnes(SubMask.size());
for (auto [I, Idx] : enumerate(SubMask))
if (Idx == PoisonMaskElem)
DemandedElts.clearBit(I);
InstructionCost BuildVectorCost =
TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
/*Extract=*/false, CostKind);
const TreeEntry *BestEntry = nullptr;
if (FirstShuffleCost < ShuffleCost) {
copy(FirstMask, std::next(Mask.begin(), Part * VL.size()));
BestEntry = Entries.front();
ShuffleCost = FirstShuffleCost;
}
if (SecondShuffleCost < ShuffleCost) {
copy(SecondMask, std::next(Mask.begin(), Part * VL.size()));
BestEntry = Entries[1];
ShuffleCost = SecondShuffleCost;
}
if (BuildVectorCost >= ShuffleCost) {
if (BestEntry) {
Entries.clear();
Entries.push_back(BestEntry);
}
return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
: TargetTransformInfo::SK_PermuteSingleSrc;
}
}
Entries.clear();
// Clear the corresponding mask elements.
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/Analysis/CostModel/AMDGPU/shufflevector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -399,13 +399,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
Expand Down Expand Up @@ -436,13 +436,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
; ALL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
; ALL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
; ALL-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
; ALL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
Expand Down Expand Up @@ -476,13 +476,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> zeroinitializer
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 1>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 0>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 1>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 2>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 0>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 2>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 0, i32 3>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 0>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 3, i32 3>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 2>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 2, i32 1>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13 = shufflevector <2 x i8> %vec1, <2 x i8> %vec1, <2 x i32> <i32 1, i32 3>
Expand Down Expand Up @@ -513,13 +513,13 @@ define amdgpu_kernel void @shufflevector_i8(<2 x i8> %vec1, <2 x i8> %vec2) {
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf00_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> zeroinitializer
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %shuf01_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 1>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf10_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 0>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf11_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 1>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf02_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 2>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shuf20_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 0>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf22_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 2>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf03_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 0, i32 3>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf30_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 0>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shuf33_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 3, i32 3>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf12_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 2>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf21_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 2, i32 1>
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shuf13_2 = shufflevector <2 x i8> %vec1, <2 x i8> %vec2, <2 x i32> <i32 1, i32 3>
Expand Down
Loading

0 comments on commit 07d284d

Please sign in to comment.