Skip to content

Commit b85a402

Browse files
committed
[VPlan] Introduce new entry block to VPlan for early SCEV expansion.
This patch adds a new preheader block to the VPlan to place SCEV expansions, like the trip count. This preheader block is disconnected at the moment, as the bypass blocks of the skeleton are not yet modeled in VPlan. The preheader block is executed before skeleton creation, so the SCEV expansion results can be used during skeleton creation.

At the moment, the trip count expression and induction steps are expanded in the new preheader. The remainder of SCEV expansions will be moved gradually in the future. D147965 will update skeleton creation to use the steps expanded in the preheader to fix #58811.

Reviewed By: Ayal
Differential Revision: https://reviews.llvm.org/D147964
1 parent f19f749 commit b85a402

28 files changed

+418
-232
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

+61-65
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,14 @@ class InnerLoopVectorizer {
564564
ArrayRef<BasicBlock *> BypassBlocks,
565565
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
566566

567+
/// Returns the original loop trip count.
568+
Value *getTripCount() const { return TripCount; }
569+
570+
/// Used to set the trip count after ILV's construction and after the
571+
/// preheader block has been executed. Note that this always holds the trip
572+
/// count of the original loop for both main loop and epilogue vectorization.
573+
void setTripCount(Value *TC) { TripCount = TC; }
574+
567575
protected:
568576
friend class LoopVectorizationPlanner;
569577

@@ -605,9 +613,6 @@ class InnerLoopVectorizer {
605613
/// represented as.
606614
void truncateToMinimalBitwidths(VPTransformState &State);
607615

608-
/// Returns (and creates if needed) the original loop trip count.
609-
Value *getOrCreateTripCount(BasicBlock *InsertBlock);
610-
611616
/// Returns (and creates if needed) the trip count of the widened loop.
612617
Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
613618

@@ -2869,41 +2874,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
28692874
PredicatedInstructions.push_back(Cloned);
28702875
}
28712876

2872-
Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2873-
if (TripCount)
2874-
return TripCount;
2875-
2876-
assert(InsertBlock);
2877-
IRBuilder<> Builder(InsertBlock->getTerminator());
2878-
// Find the loop boundaries.
2879-
Type *IdxTy = Legal->getWidestInductionType();
2880-
assert(IdxTy && "No type for induction");
2881-
const SCEV *ExitCount = createTripCountSCEV(IdxTy, PSE, OrigLoop);
2882-
2883-
const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2884-
2885-
// Expand the trip count and place the new instructions in the preheader.
2886-
// Notice that the pre-header does not change, only the loop body.
2887-
SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2888-
2889-
// Count holds the overall loop count (N).
2890-
TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2891-
InsertBlock->getTerminator());
2892-
2893-
if (TripCount->getType()->isPointerTy())
2894-
TripCount =
2895-
CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2896-
InsertBlock->getTerminator());
2897-
2898-
return TripCount;
2899-
}
2900-
29012877
Value *
29022878
InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
29032879
if (VectorTripCount)
29042880
return VectorTripCount;
29052881

2906-
Value *TC = getOrCreateTripCount(InsertBlock);
2882+
Value *TC = getTripCount();
29072883
IRBuilder<> Builder(InsertBlock->getTerminator());
29082884

29092885
Type *Ty = TC->getType();
@@ -2981,7 +2957,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
29812957
}
29822958

29832959
void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2984-
Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2960+
Value *Count = getTripCount();
29852961
// Reuse existing vector loop preheader for TC checks.
29862962
// Note that new preheader block is generated for vector loop.
29872963
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
@@ -3241,7 +3217,7 @@ void InnerLoopVectorizer::createInductionResumeValues(
32413217

32423218
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
32433219
// The trip counts should be cached by now.
3244-
Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3220+
Value *Count = getTripCount();
32453221
Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
32463222

32473223
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
@@ -3281,8 +3257,9 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
32813257
the vectorized instructions while the old loop will continue to run the
32823258
scalar remainder.
32833259
3284-
[ ] <-- loop iteration number check.
3285-
/ |
3260+
[ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3261+
/ | preheader are expanded here. Eventually all required SCEV
3262+
/ | expansion should happen here.
32863263
/ v
32873264
| [ ] <-- vector loop bypass (may consist of multiple blocks).
32883265
| / |
@@ -3384,7 +3361,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
33843361
VPValue *StepVPV = Plan.getSCEVExpansion(II.getStep());
33853362
assert(StepVPV && "step must have been expanded during VPlan execution");
33863363
Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
3387-
: State.get(StepVPV, 0);
3364+
: State.get(StepVPV, {0, 0});
33883365
Value *Escape =
33893366
emitTransformedIndex(B, CountMinusOne, II.getStartValue(), Step, II);
33903367
Escape->setName("ind.escape");
@@ -7704,23 +7681,27 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
77047681
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
77057682
<< '\n');
77067683

7707-
// Workaround! Compute the trip count of the original loop and cache it
7708-
// before we start modifying the CFG. This code has a systemic problem
7709-
// wherein it tries to run analysis over partially constructed IR; this is
7710-
// wrong, and not simply for SCEV. The trip count of the original loop
7711-
// simply happens to be prone to hitting this in practice. In theory, we
7712-
// can hit the same issue for any SCEV, or ValueTracking query done during
7713-
// mutation. See PR49900.
7714-
ILV.getOrCreateTripCount(OrigLoop->getLoopPreheader());
7715-
77167684
if (!IsEpilogueVectorization)
77177685
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
77187686

77197687
// Perform the actual loop transformation.
7688+
VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7689+
7690+
// 0. Generate SCEV-dependent code into the preheader, including TripCount,
7691+
// before making any changes to the CFG.
7692+
if (!BestVPlan.getPreheader()->empty()) {
7693+
State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7694+
State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
7695+
BestVPlan.getPreheader()->execute(&State);
7696+
}
7697+
if (!ILV.getTripCount())
7698+
ILV.setTripCount(State.get(BestVPlan.getTripCount(), {0, 0}));
7699+
else
7700+
assert(IsEpilogueVectorization && "should only re-use the existing trip "
7701+
"count during epilogue vectorization");
77207702

77217703
// 1. Set up the skeleton for vectorization, including vector pre-header and
77227704
// middle block. The vector loop is created during VPlan execution.
7723-
VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
77247705
Value *CanonicalIVStartValue;
77257706
std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
77267707
ILV.createVectorizedLoopSkeleton();
@@ -7756,10 +7737,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
77567737
//===------------------------------------------------===//
77577738

77587739
// 2. Copy and widen instructions from the old loop into the new loop.
7759-
BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7760-
ILV.getOrCreateVectorTripCount(nullptr),
7761-
CanonicalIVStartValue, State,
7762-
IsEpilogueVectorization);
7740+
BestVPlan.prepareToExecute(
7741+
ILV.getTripCount(), ILV.getOrCreateVectorTripCount(nullptr),
7742+
CanonicalIVStartValue, State, IsEpilogueVectorization);
77637743

77647744
BestVPlan.execute(&State);
77657745

@@ -7874,7 +7854,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
78747854
assert(Bypass && "Expected valid bypass basic block.");
78757855
ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
78767856
unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7877-
Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7857+
Value *Count = getTripCount();
78787858
// Reuse existing vector loop preheader for TC checks.
78797859
// Note that new preheader block is generated for vector loop.
78807860
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
@@ -8193,7 +8173,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
81938173
VPBuilder::InsertPointGuard Guard(Builder);
81948174
Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
81958175
if (useActiveLaneMask(TFStyle)) {
8196-
VPValue *TC = Plan.getOrCreateTripCount();
8176+
VPValue *TC = Plan.getTripCount();
81978177
BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
81988178
nullptr, "active.lane.mask");
81998179
} else {
@@ -8770,7 +8750,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
87708750
VecPreheader->appendRecipe(CanonicalIVIncrementParts);
87718751

87728752
// Create the ActiveLaneMask instruction using the correct start values.
8773-
VPValue *TC = Plan.getOrCreateTripCount();
8753+
VPValue *TC = Plan.getTripCount();
87748754

87758755
VPValue *TripCount, *IncrementValue;
87768756
if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
@@ -8912,17 +8892,19 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
89128892
// visit each basic block after having visited its predecessor basic blocks.
89138893
// ---------------------------------------------------------------------------
89148894

8915-
// Create initial VPlan skeleton, starting with a block for the pre-header,
8916-
// followed by a region for the vector loop, followed by the middle block. The
8917-
// skeleton vector loop region contains a header and latch block.
8918-
VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8919-
auto Plan = std::make_unique<VPlan>(Preheader);
8920-
8895+
// Create initial VPlan skeleton, having a basic block for the pre-header
8896+
// which contains SCEV expansions that need to happen before the CFG is
8897+
// modified; a basic block for the vector pre-header, followed by a region for
8898+
// the vector loop, followed by the middle basic block. The skeleton vector
8899+
// loop region contains a header and latch basic blocks.
8900+
VPlanPtr Plan = VPlan::createInitialVPlan(
8901+
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
8902+
*PSE.getSE());
89218903
VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
89228904
VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
89238905
VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
89248906
auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8925-
VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8907+
VPBlockUtils::insertBlockAfter(TopRegion, Plan->getEntry());
89268908
VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
89278909
VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
89288910

@@ -9110,7 +9092,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
91109092
assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
91119093

91129094
// Create new empty VPlan
9113-
auto Plan = std::make_unique<VPlan>();
9095+
auto Plan = VPlan::createInitialVPlan(
9096+
createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
9097+
*PSE.getSE());
91149098

91159099
// Build hierarchical CFG
91169100
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
@@ -9831,9 +9815,11 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
98319815
unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
98329816
// Check if there is a scalar value for the selected lane.
98339817
if (!hasScalarValue(Def, {Part, LastLane})) {
9834-
// At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform.
9818+
// At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
9819+
// VPExpandSCEVRecipes can also be uniform.
98359820
assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
9836-
isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe())) &&
9821+
isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
9822+
isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
98379823
"unexpected recipe found to be invariant");
98389824
IsUniform = true;
98399825
LastLane = 0;
@@ -10420,6 +10406,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1042010406
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
1042110407
Header->setName("vec.epilog.vector.body");
1042210408

10409+
// Re-use the trip count expanded for the main loop, as skeleton
10410+
// creation needs it as a value that dominates both the scalar and
10411+
// vector epilogue loops
10412+
EpilogILV.setTripCount(MainILV.getTripCount());
10413+
if (auto *R = BestEpiPlan.getTripCount()->getDefiningRecipe()) {
10414+
assert(BestEpiPlan.getTripCount()->getNumUsers() == 0 &&
10415+
"trip count VPValue cannot be used in epilogue plan");
10416+
R->eraseFromParent();
10417+
}
10418+
1042310419
// Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
1042410420
// VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
1042510421
// before vectorizing the epilogue loop.

llvm/lib/Transforms/Vectorize/VPlan.cpp

+26-27
Original file line numberDiff line numberDiff line change
@@ -163,8 +163,9 @@ VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
163163
}
164164

165165
void VPBlockBase::setPlan(VPlan *ParentPlan) {
166-
assert(ParentPlan->getEntry() == this &&
167-
"Can only set plan on its entry block.");
166+
assert(
167+
(ParentPlan->getEntry() == this || ParentPlan->getPreheader() == this) &&
168+
"Can only set plan on its entry or preheader block.");
168169
Plan = ParentPlan;
169170
}
170171

@@ -593,12 +594,19 @@ VPlan::~VPlan() {
593594
}
594595
for (VPValue *VPV : VPLiveInsToFree)
595596
delete VPV;
596-
if (TripCount)
597-
delete TripCount;
598597
if (BackedgeTakenCount)
599598
delete BackedgeTakenCount;
600599
}
601600

601+
VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) {
602+
VPBasicBlock *Preheader = new VPBasicBlock("ph");
603+
VPBasicBlock *VecPreheader = new VPBasicBlock("vector.ph");
604+
auto Plan = std::make_unique<VPlan>(Preheader, VecPreheader);
605+
Plan->TripCount =
606+
vputils::getOrCreateVPValueForSCEVExpr(*Plan, TripCount, SE);
607+
return Plan;
608+
}
609+
602610
VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() {
603611
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
604612
for (VPRecipeBase &R : Header->phis()) {
@@ -612,13 +620,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
612620
Value *CanonicalIVStartValue,
613621
VPTransformState &State,
614622
bool IsEpilogueVectorization) {
615-
616-
// Check if the trip count is needed, and if so build it.
617-
if (TripCount && TripCount->getNumUsers()) {
618-
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
619-
State.set(TripCount, TripCountV, Part);
620-
}
621-
622623
// Check if the backedge taken count is needed, and if so build it.
623624
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
624625
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
@@ -747,30 +748,29 @@ void VPlan::print(raw_ostream &O) const {
747748

748749
O << "VPlan '" << getName() << "' {";
749750

750-
bool AnyLiveIn = false;
751751
if (VectorTripCount.getNumUsers() > 0) {
752752
O << "\nLive-in ";
753753
VectorTripCount.printAsOperand(O, SlotTracker);
754754
O << " = vector-trip-count";
755-
AnyLiveIn = true;
756-
}
757-
758-
if (TripCount && TripCount->getNumUsers() > 0) {
759-
O << "\nLive-in ";
760-
TripCount->printAsOperand(O, SlotTracker);
761-
O << " = original trip-count";
762-
AnyLiveIn = true;
763755
}
764756

765757
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
766758
O << "\nLive-in ";
767759
BackedgeTakenCount->printAsOperand(O, SlotTracker);
768760
O << " = backedge-taken count";
769-
AnyLiveIn = true;
770761
}
771762

772-
if (AnyLiveIn)
763+
O << "\n";
764+
if (TripCount->isLiveIn())
765+
O << "Live-in ";
766+
TripCount->printAsOperand(O, SlotTracker);
767+
O << " = original trip-count";
768+
O << "\n";
769+
770+
if (!getPreheader()->empty()) {
773771
O << "\n";
772+
getPreheader()->print(O, "", SlotTracker);
773+
}
774774

775775
for (const VPBlockBase *Block : vp_depth_first_shallow(getEntry())) {
776776
O << '\n';
@@ -897,6 +897,8 @@ void VPlanPrinter::dump() {
897897
OS << "edge [fontname=Courier, fontsize=30]\n";
898898
OS << "compound=true\n";
899899

900+
dumpBlock(Plan.getPreheader());
901+
900902
for (const VPBlockBase *Block : vp_depth_first_shallow(Plan.getEntry()))
901903
dumpBlock(Block);
902904

@@ -1109,8 +1111,7 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) {
11091111
assignSlot(&Plan.VectorTripCount);
11101112
if (Plan.BackedgeTakenCount)
11111113
assignSlot(Plan.BackedgeTakenCount);
1112-
if (Plan.TripCount)
1113-
assignSlot(Plan.TripCount);
1114+
assignSlots(Plan.getPreheader());
11141115

11151116
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<const VPBlockBase *>>
11161117
RPOT(VPBlockDeepTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
@@ -1140,10 +1141,8 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
11401141
else if (auto *E = dyn_cast<SCEVUnknown>(Expr))
11411142
Expanded = Plan.getVPValueOrAddLiveIn(E->getValue());
11421143
else {
1143-
1144-
VPBasicBlock *Preheader = Plan.getEntry();
11451144
Expanded = new VPExpandSCEVRecipe(Expr, SE);
1146-
Preheader->appendRecipe(Expanded->getDefiningRecipe());
1145+
Plan.getPreheader()->appendRecipe(Expanded->getDefiningRecipe());
11471146
}
11481147
Plan.addSCEVExpansion(Expr, Expanded);
11491148
return Expanded;

0 commit comments

Comments
 (0)