@@ -564,6 +564,14 @@ class InnerLoopVectorizer {
564
564
ArrayRef<BasicBlock *> BypassBlocks,
565
565
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr , nullptr });
566
566
567
+ /// Returns the cached trip count of the original loop; never creates it, so the result may be null if none has been set yet.
568
+ Value *getTripCount () const { return TripCount; }
569
+
570
+ /// Overwrites the cached trip count with \p TC. Used after ILV's construction and after the
571
+ /// preheader block has been executed. Note that this always holds the trip
572
+ /// count of the original loop for both main loop and epilogue vectorization.
573
+ void setTripCount (Value *TC) { TripCount = TC; }
574
+
567
575
protected:
568
576
friend class LoopVectorizationPlanner ;
569
577
@@ -605,9 +613,6 @@ class InnerLoopVectorizer {
605
613
// / represented as.
606
614
void truncateToMinimalBitwidths (VPTransformState &State);
607
615
608
- // / Returns (and creates if needed) the original loop trip count.
609
- Value *getOrCreateTripCount (BasicBlock *InsertBlock);
610
-
611
616
// / Returns (and creates if needed) the trip count of the widened loop.
612
617
Value *getOrCreateVectorTripCount (BasicBlock *InsertBlock);
613
618
@@ -2869,41 +2874,12 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2869
2874
PredicatedInstructions.push_back (Cloned);
2870
2875
}
2871
2876
2872
- Value *InnerLoopVectorizer::getOrCreateTripCount (BasicBlock *InsertBlock) {
2873
- if (TripCount)
2874
- return TripCount;
2875
-
2876
- assert (InsertBlock);
2877
- IRBuilder<> Builder (InsertBlock->getTerminator ());
2878
- // Find the loop boundaries.
2879
- Type *IdxTy = Legal->getWidestInductionType ();
2880
- assert (IdxTy && " No type for induction" );
2881
- const SCEV *ExitCount = createTripCountSCEV (IdxTy, PSE, OrigLoop);
2882
-
2883
- const DataLayout &DL = InsertBlock->getModule ()->getDataLayout ();
2884
-
2885
- // Expand the trip count and place the new instructions in the preheader.
2886
- // Notice that the pre-header does not change, only the loop body.
2887
- SCEVExpander Exp (*PSE.getSE (), DL, " induction" );
2888
-
2889
- // Count holds the overall loop count (N).
2890
- TripCount = Exp.expandCodeFor (ExitCount, ExitCount->getType (),
2891
- InsertBlock->getTerminator ());
2892
-
2893
- if (TripCount->getType ()->isPointerTy ())
2894
- TripCount =
2895
- CastInst::CreatePointerCast (TripCount, IdxTy, " exitcount.ptrcnt.to.int" ,
2896
- InsertBlock->getTerminator ());
2897
-
2898
- return TripCount;
2899
- }
2900
-
2901
2877
Value *
2902
2878
InnerLoopVectorizer::getOrCreateVectorTripCount (BasicBlock *InsertBlock) {
2903
2879
if (VectorTripCount)
2904
2880
return VectorTripCount;
2905
2881
2906
- Value *TC = getOrCreateTripCount (InsertBlock );
2882
+ Value *TC = getTripCount ( );
2907
2883
IRBuilder<> Builder (InsertBlock->getTerminator ());
2908
2884
2909
2885
Type *Ty = TC->getType ();
@@ -2981,7 +2957,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2981
2957
}
2982
2958
2983
2959
void InnerLoopVectorizer::emitIterationCountCheck (BasicBlock *Bypass) {
2984
- Value *Count = getOrCreateTripCount (LoopVectorPreHeader );
2960
+ Value *Count = getTripCount ( );
2985
2961
// Reuse existing vector loop preheader for TC checks.
2986
2962
// Note that new preheader block is generated for vector loop.
2987
2963
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
@@ -3241,7 +3217,7 @@ void InnerLoopVectorizer::createInductionResumeValues(
3241
3217
3242
3218
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton () {
3243
3219
// The trip counts should be cached by now.
3244
- Value *Count = getOrCreateTripCount (LoopVectorPreHeader );
3220
+ Value *Count = getTripCount ( );
3245
3221
Value *VectorTripCount = getOrCreateVectorTripCount (LoopVectorPreHeader);
3246
3222
3247
3223
auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
@@ -3281,8 +3257,9 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3281
3257
the vectorized instructions while the old loop will continue to run the
3282
3258
scalar remainder.
3283
3259
3284
- [ ] <-- loop iteration number check.
3285
- / |
3260
+ [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
3261
+ / | preheader are expanded here. Eventually all required SCEV
3262
+ / | expansion should happen here.
3286
3263
/ v
3287
3264
| [ ] <-- vector loop bypass (may consist of multiple blocks).
3288
3265
| / |
@@ -3384,7 +3361,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3384
3361
VPValue *StepVPV = Plan.getSCEVExpansion (II.getStep ());
3385
3362
assert (StepVPV && " step must have been expanded during VPlan execution" );
3386
3363
Value *Step = StepVPV->isLiveIn () ? StepVPV->getLiveInIRValue ()
3387
- : State.get (StepVPV, 0 );
3364
+ : State.get (StepVPV, { 0 , 0 } );
3388
3365
Value *Escape =
3389
3366
emitTransformedIndex (B, CountMinusOne, II.getStartValue (), Step, II);
3390
3367
Escape->setName (" ind.escape" );
@@ -7704,23 +7681,27 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7704
7681
LLVM_DEBUG (dbgs () << " Executing best plan with VF=" << BestVF << " , UF=" << BestUF
7705
7682
<< ' \n ' );
7706
7683
7707
- // Workaround! Compute the trip count of the original loop and cache it
7708
- // before we start modifying the CFG. This code has a systemic problem
7709
- // wherein it tries to run analysis over partially constructed IR; this is
7710
- // wrong, and not simply for SCEV. The trip count of the original loop
7711
- // simply happens to be prone to hitting this in practice. In theory, we
7712
- // can hit the same issue for any SCEV, or ValueTracking query done during
7713
- // mutation. See PR49900.
7714
- ILV.getOrCreateTripCount (OrigLoop->getLoopPreheader ());
7715
-
7716
7684
if (!IsEpilogueVectorization)
7717
7685
VPlanTransforms::optimizeForVFAndUF (BestVPlan, BestVF, BestUF, PSE);
7718
7686
7719
7687
// Perform the actual loop transformation.
7688
+ VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder , &ILV, &BestVPlan};
7689
+
7690
+ // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7691
+ // before making any changes to the CFG.
7692
+ if (!BestVPlan.getPreheader ()->empty ()) {
7693
+ State.CFG .PrevBB = OrigLoop->getLoopPreheader ();
7694
+ State.Builder .SetInsertPoint (OrigLoop->getLoopPreheader ()->getTerminator ());
7695
+ BestVPlan.getPreheader ()->execute (&State);
7696
+ }
7697
+ if (!ILV.getTripCount ())
7698
+ ILV.setTripCount (State.get (BestVPlan.getTripCount (), {0 , 0 }));
7699
+ else
7700
+ assert (IsEpilogueVectorization && " should only re-use the existing trip "
7701
+ " count during epilogue vectorization" );
7720
7702
7721
7703
// 1. Set up the skeleton for vectorization, including vector pre-header and
7722
7704
// middle block. The vector loop is created during VPlan execution.
7723
- VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder , &ILV, &BestVPlan};
7724
7705
Value *CanonicalIVStartValue;
7725
7706
std::tie (State.CFG .PrevBB , CanonicalIVStartValue) =
7726
7707
ILV.createVectorizedLoopSkeleton ();
@@ -7756,10 +7737,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7756
7737
// ===------------------------------------------------===//
7757
7738
7758
7739
// 2. Copy and widen instructions from the old loop into the new loop.
7759
- BestVPlan.prepareToExecute (ILV.getOrCreateTripCount (nullptr ),
7760
- ILV.getOrCreateVectorTripCount (nullptr ),
7761
- CanonicalIVStartValue, State,
7762
- IsEpilogueVectorization);
7740
+ BestVPlan.prepareToExecute (
7741
+ ILV.getTripCount (), ILV.getOrCreateVectorTripCount (nullptr ),
7742
+ CanonicalIVStartValue, State, IsEpilogueVectorization);
7763
7743
7764
7744
BestVPlan.execute (&State);
7765
7745
@@ -7874,7 +7854,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7874
7854
assert (Bypass && " Expected valid bypass basic block." );
7875
7855
ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7876
7856
unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7877
- Value *Count = getOrCreateTripCount (LoopVectorPreHeader );
7857
+ Value *Count = getTripCount ( );
7878
7858
// Reuse existing vector loop preheader for TC checks.
7879
7859
// Note that new preheader block is generated for vector loop.
7880
7860
BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
@@ -8193,7 +8173,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlan &Plan) {
8193
8173
VPBuilder::InsertPointGuard Guard (Builder);
8194
8174
Builder.setInsertPoint (HeaderVPBB, NewInsertionPoint);
8195
8175
if (useActiveLaneMask (TFStyle)) {
8196
- VPValue *TC = Plan.getOrCreateTripCount ();
8176
+ VPValue *TC = Plan.getTripCount ();
8197
8177
BlockMask = Builder.createNaryOp (VPInstruction::ActiveLaneMask, {IV, TC},
8198
8178
nullptr , " active.lane.mask" );
8199
8179
} else {
@@ -8770,7 +8750,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8770
8750
VecPreheader->appendRecipe (CanonicalIVIncrementParts);
8771
8751
8772
8752
// Create the ActiveLaneMask instruction using the correct start values.
8773
- VPValue *TC = Plan.getOrCreateTripCount ();
8753
+ VPValue *TC = Plan.getTripCount ();
8774
8754
8775
8755
VPValue *TripCount, *IncrementValue;
8776
8756
if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
@@ -8912,17 +8892,19 @@ std::optional<VPlanPtr> LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
8912
8892
// visit each basic block after having visited its predecessor basic blocks.
8913
8893
// ---------------------------------------------------------------------------
8914
8894
8915
- // Create initial VPlan skeleton, starting with a block for the pre-header,
8916
- // followed by a region for the vector loop, followed by the middle block. The
8917
- // skeleton vector loop region contains a header and latch block.
8918
- VPBasicBlock *Preheader = new VPBasicBlock (" vector.ph" );
8919
- auto Plan = std::make_unique<VPlan>(Preheader);
8920
-
8895
+ // Create initial VPlan skeleton, having a basic block for the pre-header
8896
+ // which contains SCEV expansions that need to happen before the CFG is
8897
+ // modified; a basic block for the vector pre-header, followed by a region for
8898
+ // the vector loop, followed by the middle basic block. The skeleton vector
8899
+ // loop region contains a header and latch basic blocks.
8900
+ VPlanPtr Plan = VPlan::createInitialVPlan (
8901
+ createTripCountSCEV (Legal->getWidestInductionType (), PSE, OrigLoop),
8902
+ *PSE.getSE ());
8921
8903
VPBasicBlock *HeaderVPBB = new VPBasicBlock (" vector.body" );
8922
8904
VPBasicBlock *LatchVPBB = new VPBasicBlock (" vector.latch" );
8923
8905
VPBlockUtils::insertBlockAfter (LatchVPBB, HeaderVPBB);
8924
8906
auto *TopRegion = new VPRegionBlock (HeaderVPBB, LatchVPBB, " vector loop" );
8925
- VPBlockUtils::insertBlockAfter (TopRegion, Preheader );
8907
+ VPBlockUtils::insertBlockAfter (TopRegion, Plan-> getEntry () );
8926
8908
VPBasicBlock *MiddleVPBB = new VPBasicBlock (" middle.block" );
8927
8909
VPBlockUtils::insertBlockAfter (MiddleVPBB, TopRegion);
8928
8910
@@ -9110,7 +9092,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9110
9092
assert (EnableVPlanNativePath && " VPlan-native path is not enabled." );
9111
9093
9112
9094
// Create new empty VPlan
9113
- auto Plan = std::make_unique<VPlan>();
9095
+ auto Plan = VPlan::createInitialVPlan (
9096
+ createTripCountSCEV (Legal->getWidestInductionType (), PSE, OrigLoop),
9097
+ *PSE.getSE ());
9114
9098
9115
9099
// Build hierarchical CFG
9116
9100
VPlanHCFGBuilder HCFGBuilder (OrigLoop, LI, *Plan);
@@ -9831,9 +9815,11 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9831
9815
unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue () - 1 ;
9832
9816
// Check if there is a scalar value for the selected lane.
9833
9817
if (!hasScalarValue (Def, {Part, LastLane})) {
9834
- // At the moment, VPWidenIntOrFpInductionRecipes and VPScalarIVStepsRecipes can also be uniform.
9818
+ // At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
9819
+ // VPExpandSCEVRecipes can also be uniform.
9835
9820
assert ((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe ()) ||
9836
- isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe ())) &&
9821
+ isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe ()) ||
9822
+ isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe ())) &&
9837
9823
" unexpected recipe found to be invariant" );
9838
9824
IsUniform = true ;
9839
9825
LastLane = 0 ;
@@ -10420,6 +10406,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10420
10406
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock ();
10421
10407
Header->setName (" vec.epilog.vector.body" );
10422
10408
10409
+ // Re-use the trip count expanded for the main loop, as skeleton
10410
+ // creation needs it as a value that dominates both the scalar and
10411
+ // vector epilogue loops.
10412
+ EpilogILV.setTripCount (MainILV.getTripCount ());
10413
+ if (auto *R = BestEpiPlan.getTripCount ()->getDefiningRecipe ()) {
10414
+ assert (BestEpiPlan.getTripCount ()->getNumUsers () == 0 &&
10415
+ " trip count VPValue cannot be used in epilogue plan" );
10416
+ R->eraseFromParent ();
10417
+ }
10418
+
10423
10419
// Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10424
10420
// VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10425
10421
// before vectorizing the epilogue loop.
0 commit comments