Skip to content

[llvm][ARM]Add widen global arrays pass #107120

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Oct 17, 2024
11 changes: 11 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1819,6 +1819,10 @@ class TargetTransformInfo {
/// \return The maximum number of function arguments the target supports.
unsigned getMaxNumArgs() const;

/// \return The number of padding bytes to append to a global array of the
/// given Size so that it ends on the target's preferred alignment boundary.
/// The default is 0 (no padding).
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;

/// @}

private:
Expand Down Expand Up @@ -2225,6 +2229,8 @@ class TargetTransformInfo::Concept {
getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
virtual bool hasArmWideBranch(bool Thumb) const = 0;
virtual unsigned getMaxNumArgs() const = 0;
virtual unsigned getNumBytesToPadGlobalArray(unsigned Size,
Type *ArrayType) const = 0;
};

template <typename T>
Expand Down Expand Up @@ -3026,6 +3032,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
/// Forwards the query to the wrapped implementation object (Impl) and returns
/// its answer unchanged.
unsigned getMaxNumArgs() const override {
return Impl.getMaxNumArgs();
}

/// Forwards the padding query to the wrapped implementation object (Impl),
/// passing \p Size and \p ArrayType through and returning its answer
/// unchanged.
unsigned getNumBytesToPadGlobalArray(unsigned Size,
Type *ArrayType) const override {
return Impl.getNumBytesToPadGlobalArray(Size, ArrayType);
}
};

template <typename T>
Expand Down
4 changes: 4 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1006,6 +1006,10 @@ class TargetTransformInfoImplBase {

/// Default implementation: reports UINT_MAX as the maximum number of
/// arguments.
unsigned getMaxNumArgs() const { return UINT_MAX; }

/// Default implementation: always returns 0, i.e. no padding is requested
/// for any \p Size or \p ArrayType.
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const {
return 0;
}

protected:
// Obtain the minimum required size to hold the value (without the sign)
// In case of a vector it returns the min required size for one element.
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Analysis/TargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1383,6 +1383,12 @@ bool TargetTransformInfo::isVectorShiftByScalarCheap(Type *Ty) const {
return TTIImpl->isVectorShiftByScalarCheap(Ty);
}

/// Public entry point for the global-array padding query: delegates to the
/// implementation held in TTIImpl and returns its result unchanged.
unsigned
TargetTransformInfo::getNumBytesToPadGlobalArray(unsigned Size,
Type *ArrayType) const {
return TTIImpl->getNumBytesToPadGlobalArray(Size, ArrayType);
}

TargetTransformInfo::Concept::~Concept() = default;

TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
Expand Down
33 changes: 33 additions & 0 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ static cl::opt<bool>
AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
cl::desc("Enable the generation of WLS loops"));

// Hidden command-line switch, on by default, that gates
// ARMTTIImpl::getNumBytesToPadGlobalArray: when it is false that hook
// reports that no padding should be added.
// NOTE(review): the flag is spelled "widen-global-strings" and its
// description mentions strings, while the variable and the hook talk about
// arrays -- confirm whether the naming mismatch is intentional.
static cl::opt<bool> UseWidenGlobalArrays(
"widen-global-strings", cl::Hidden, cl::init(true),
cl::desc("Enable the widening of global strings to alignment boundaries"));

extern cl::opt<TailPredication::Mode> EnableTailPredication;

extern cl::opt<bool> EnableMaskedGatherScatters;
Expand Down Expand Up @@ -2805,3 +2809,32 @@ bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I,
}
return true;
}

unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size,
Type *ArrayType) const {
if (!UseWidenGlobalArrays) {
LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
return false;
}

// Don't modify none integer array types
if (!ArrayType || !ArrayType->isArrayTy() ||
!ArrayType->getArrayElementType()->isIntegerTy())
return 0;

// We pad to 4 byte boundaries
if (Size % 4 == 0)
return 0;

unsigned NumBytesToPad = 4 - (Size % 4);
unsigned NewSize = Size + NumBytesToPad;

// Max number of bytes that memcpy allows for lowering to load/stores before
// it uses library function (__aeabi_memcpy).
unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();

if (NewSize > MaxMemIntrinsicSize)
return 0;

return NumBytesToPad;
}
3 changes: 3 additions & 0 deletions llvm/lib/Target/ARM/ARMTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,9 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {

bool isProfitableToSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const;

unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const;

/// @}
};

Expand Down
165 changes: 165 additions & 0 deletions llvm/lib/Transforms/IPO/GlobalOpt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ STATISTIC(NumInternalFunc, "Number of internal functions");
STATISTIC(NumColdCC, "Number of functions marked coldcc");
STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs");
STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed");
// Incremented once for every source global that tryWidenGlobalArrayAndDests
// successfully replaces with a padded copy.
STATISTIC(NumGlobalArraysPadded,
"Number of global arrays padded to alignment boundary");

static cl::opt<bool>
EnableColdCCStressTest("enable-coldcc-stress-test",
Expand Down Expand Up @@ -2029,6 +2031,165 @@ OptimizeFunctions(Module &M,
return Changed;
}

/// Return true when \p CI is non-null and directly calls the memcpy
/// intrinsic; a null \p CI or a call without a known callee yields false.
static bool callInstIsMemcpy(CallInst *CI) {
  if (CI == nullptr)
    return false;

  const Function *Callee = CI->getCalledFunction();
  return Callee != nullptr && Callee->isIntrinsic() &&
         Callee->getIntrinsicID() == Intrinsic::memcpy;
}

/// Decide whether the destination of the memcpy call \p CI may be replaced by
/// a larger array.
///
/// The destination (argument 0) must be a static alloca of array type, and
/// the volatile flag (argument 3) must be a constant that is not one.
static bool destArrayCanBeWidened(CallInst *CI) {
  auto *VolatileFlag = dyn_cast<ConstantInt>(CI->getArgOperand(3));
  auto *DestAlloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));

  // Reject non-alloca destinations, non-constant volatile flags and volatile
  // copies.
  if (!DestAlloca || !VolatileFlag || VolatileFlag->isOne())
    return false;

  return DestAlloca->isStaticAlloca() &&
         DestAlloca->getAllocatedType()->isArrayTy();
}

/// Create a zero-padded replacement for the global \p OldVar.
///
/// The replacement's initializer is built from the raw bytes of the old
/// initializer, extended with zero bytes until it holds at least
/// \p NumBytesToCopy + \p NumBytesToPad bytes. Bytes of the old initializer
/// are never dropped, because the caller redirects every use of \p OldVar to
/// the new global.
///
/// \param OldVar          Global to replace; must have a ConstantDataArray
///                        initializer.
/// \param F               Function used only to reach the LLVMContext and the
///                        Module the new global is created in.
/// \param NumBytesToPad   Number of zero bytes requested as padding.
/// \param NumBytesToCopy  Number of bytes copied from the global.
/// \return The new global, which has taken over \p OldVar's name and
///         attributes, or nullptr if \p OldVar has no initializer or its
///         initializer is not a ConstantDataArray.
static GlobalVariable *widenGlobalVariable(GlobalVariable *OldVar, Function *F,
                                           unsigned NumBytesToPad,
                                           unsigned NumBytesToCopy) {
  if (!OldVar->hasInitializer())
    return nullptr;

  ConstantDataArray *DataArray =
      dyn_cast<ConstantDataArray>(OldVar->getInitializer());
  if (!DataArray)
    return nullptr;

  // Update to be word aligned (memcpy(...,X,...))
  // create replacement with padded null bytes.
  StringRef Data = DataArray->getRawDataValues();
  std::vector<uint8_t> StrData(Data.begin(), Data.end());

  // Grow (never shrink) the byte buffer to the padded size. Using
  // NumBytesToCopy + NumBytesToPad directly as the length would truncate the
  // initializer when fewer bytes are copied than the global holds, and would
  // read past the buffer if more bytes were requested than it holds.
  const size_t PaddedSize = static_cast<size_t>(NumBytesToCopy) + NumBytesToPad;
  if (StrData.size() < PaddedSize)
    StrData.resize(PaddedSize, 0);
  auto Arr = ArrayRef(StrData.data(), StrData.size());

  // Create new padded version of global variable.
  Constant *SourceReplace = ConstantDataArray::get(F->getContext(), Arr);
  GlobalVariable *NewGV = new GlobalVariable(
      *(F->getParent()), SourceReplace->getType(), true, OldVar->getLinkage(),
      SourceReplace, SourceReplace->getName());
  // Copy any other attributes from original global variable
  // e.g. unnamed_addr
  NewGV->copyAttributesFrom(OldVar);
  NewGV->takeName(OldVar);
  return NewGV;
}

/// Replace the alloca that is the destination of the memcpy \p CI with a
/// larger array alloca able to hold the padded copy.
///
/// The new element count is ceil((NumBytesToCopy + NumBytesToPad) /
/// element byte size of \p SourceDataArray); the element type, name and
/// alignment are taken from the old alloca, which is then erased. Nothing
/// happens when the destination is not an alloca.
static void widenDestArray(CallInst *CI, const unsigned NumBytesToPad,
                           const unsigned NumBytesToCopy,
                           ConstantDataArray *SourceDataArray) {
  auto *OldAlloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
  if (!OldAlloca)
    return;

  const unsigned SourceElemBytes = SourceDataArray->getElementByteSize();
  const unsigned PaddedBytes = NumBytesToCopy + NumBytesToPad;
  const unsigned PaddedNumElements = divideCeil(PaddedBytes, SourceElemBytes);

  // Update destination array to be word aligned (memcpy(X,...,...)).
  // The replacement is inserted right before the alloca it supersedes.
  Type *DestElemTy = OldAlloca->getAllocatedType()->getArrayElementType();
  IRBuilder<> Builder(OldAlloca);
  AllocaInst *WidenedAlloca =
      Builder.CreateAlloca(ArrayType::get(DestElemTy, PaddedNumElements));
  WidenedAlloca->takeName(OldAlloca);
  WidenedAlloca->setAlignment(OldAlloca->getAlign());

  OldAlloca->replaceAllUsesWith(WidenedAlloca);
  OldAlloca->eraseFromParent();
}

/// Replace \p SourceVar with a padded copy and widen the memcpy calls that
/// read from it.
///
/// A memcpy user is only rewritten when it copies exactly \p NumBytesToCopy
/// bytes from \p SourceVar into a widenable alloca with as many elements as
/// the source array. Other users are left alone: they stay valid because the
/// padded global only grows, whereas forcing the padded length onto them
/// could shrink a larger destination or lengthen a deliberately shorter copy.
///
/// \param F                Function used to reach the context/module for the
///                         new global.
/// \param SourceVar        Constant global being widened.
/// \param NumBytesToPad    Number of padding bytes to add.
/// \param NumBytesToCopy   Length in bytes of the copy that triggered this.
/// \param BytesToCopyOp    Length operand of the triggering memcpy (unused;
///                         each call's own length operand is consulted).
/// \param SourceDataArray  Initializer of \p SourceVar.
/// \return true if the global was replaced, false if no padded global could
///         be created.
static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar,
                                        const unsigned NumBytesToPad,
                                        const unsigned NumBytesToCopy,
                                        ConstantInt *BytesToCopyOp,
                                        ConstantDataArray *SourceDataArray) {
  // Kept for interface compatibility; see the per-call length check below.
  (void)BytesToCopyOp;

  auto *NewSourceGV =
      widenGlobalVariable(SourceVar, F, NumBytesToPad, NumBytesToCopy);
  if (!NewSourceGV)
    return false;

  const uint64_t SourceNumElements =
      SourceDataArray->getType()->getNumElements();

  // Update arguments of remaining uses that
  // are memcpys.
  for (auto *User : SourceVar->users()) {
    auto *CI = dyn_cast<CallInst>(User);
    if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI))
      continue;

    if (CI->getArgOperand(1) != SourceVar)
      continue;

    // Only touch copies of exactly the length the padding was computed for.
    auto *LengthOp = dyn_cast<ConstantInt>(CI->getArgOperand(2));
    if (!LengthOp || !LengthOp->equalsInt(NumBytesToCopy))
      continue;

    // destArrayCanBeWidened() guarantees an array-typed alloca destination.
    auto *DestAlloca = cast<AllocaInst>(CI->getArgOperand(0));
    if (DestAlloca->getAllocatedType()->getArrayNumElements() !=
        SourceNumElements)
      continue;

    widenDestArray(CI, NumBytesToPad, NumBytesToCopy, SourceDataArray);

    // Build the new length with this call's own integer type so the operand
    // type never changes.
    CI->setArgOperand(2, ConstantInt::get(LengthOp->getType(),
                                          NumBytesToCopy + NumBytesToPad));
  }
  SourceVar->replaceAllUsesWith(NewSourceGV);

  NumGlobalArraysPadded++;
  return true;
}

/// Try to pad the constant global array \p GV, and the stack arrays it is
/// copied into via memcpy, up to the boundary requested by the target.
///
/// Only globals that have an initializer, are constant, have local linkage
/// and are unnamed_addr are considered. The first memcpy user that copies the
/// whole array into an equally sized, widenable alloca, and for which the
/// target requests padding, triggers the transformation.
///
/// \param GV      Candidate global variable.
/// \param GetTTI  Callback returning the TargetTransformInfo for a function.
/// \return true if \p GV was replaced by a padded global.
static bool tryWidenGlobalArraysUsedByMemcpy(
    GlobalVariable *GV,
    function_ref<TargetTransformInfo &(Function &)> GetTTI) {

  if (!GV->hasInitializer() || !GV->isConstant() || !GV->hasLocalLinkage() ||
      !GV->hasGlobalUnnamedAddr())
    return false;

  // The initializer does not depend on the user being inspected, so look it
  // up once instead of on every loop iteration.
  auto *SourceDataArray = dyn_cast<ConstantDataArray>(GV->getInitializer());
  if (!SourceDataArray)
    return false;

  for (auto *User : GV->users()) {
    CallInst *CI = dyn_cast<CallInst>(User);
    if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI))
      continue;

    // GV must be the source of the copy.
    if (CI->getArgOperand(1) != GV)
      continue;

    auto *BytesToCopyOp = dyn_cast<ConstantInt>(CI->getArgOperand(2));
    if (!BytesToCopyOp)
      continue;

    // The rest of the transformation works with 32-bit byte counts; skip
    // lengths that would be silently truncated.
    if (!BytesToCopyOp->getValue().isIntN(32))
      continue;
    unsigned NumBytesToCopy = BytesToCopyOp->getZExtValue();

    // destArrayCanBeWidened() guarantees an array-typed alloca destination.
    auto *Alloca = cast<AllocaInst>(CI->getArgOperand(0));
    uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
    uint64_t SZSize = SourceDataArray->getType()->getNumElements();
    unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
    // Calculate the number of elements to copy while avoiding floored
    // division of integers returning wrong values i.e. copying one byte
    // from an array of i16 would yield 0 elements to copy as opposed to 1.
    unsigned NumElementsToCopy = divideCeil(NumBytesToCopy, ElementByteWidth);

    // For safety purposes lets add a constraint and only pad when
    // NumElementsToCopy == destination array size ==
    // source which is a constant
    if (NumElementsToCopy != DZSize || DZSize != SZSize)
      continue;

    // Query the target through the function that contains the memcpy call.
    // CI->getCalledFunction() would be the memcpy intrinsic declaration,
    // which is not the function whose target properties matter here.
    Function *Caller = CI->getFunction();

    unsigned NumBytesToPad = GetTTI(*Caller).getNumBytesToPadGlobalArray(
        NumBytesToCopy, SourceDataArray->getType());
    if (NumBytesToPad) {
      return tryWidenGlobalArrayAndDests(Caller, GV, NumBytesToPad,
                                         NumBytesToCopy, BytesToCopyOp,
                                         SourceDataArray);
    }
  }
  return false;
}

static bool
OptimizeGlobalVars(Module &M,
function_ref<TargetTransformInfo &(Function &)> GetTTI,
Expand Down Expand Up @@ -2058,6 +2219,10 @@ OptimizeGlobalVars(Module &M,
continue;
}

// For constant global arrays used as the source of a memcpy, try to pad
// them to the next alignment boundary requested by the target.
Changed |= tryWidenGlobalArraysUsedByMemcpy(&GV, GetTTI);

Changed |= processGlobal(GV, GetTTI, GetTLI, LookupDomTree);
}
return Changed;
Expand Down
39 changes: 39 additions & 0 deletions llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s

@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1

; Negative test: the memcpy destination is an alloca of a struct type rather
; than an array. The assertions below expect the alloca type and the copy
; length (3) to be left unchanged.
define void @memcpy_struct() {
; CHECK-LABEL: define void @memcpy_struct() local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[SOMETHING:%.*]] = alloca { i8, i8, i8 }, align 1
; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
; CHECK-NEXT: ret void
;
entry:
%something = alloca {i8, i8, i8}, align 1
%call1 = call i32 @bar(ptr nonnull %something)
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
ret void
}


@.i8_multi = private unnamed_addr constant [2 x [3 x i8]] [[3 x i8] [i8 1, i8 2, i8 3], [3 x i8] [i8 4, i8 5, i8 6]] , align 1

; Negative test: source global and destination alloca are both
; [2 x [3 x i8]] and only 3 bytes are copied. The assertions below expect the
; alloca type and the copy length (3) to be left unchanged.
define void @memcpy_array_multidimensional() {
; CHECK-LABEL: define void @memcpy_array_multidimensional() local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [2 x [3 x i8]], align 1
; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false)
; CHECK-NEXT: ret void
;
entry:
%something = alloca [2 x [3 x i8]], align 1
%call1 = call i32 @bar(ptr nonnull %something)
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false)
ret void
}

declare i32 @bar(...)
28 changes: 28 additions & 0 deletions llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s

; CHECK: [3 x i8]
@other = private unnamed_addr global [3 x i8] [i8 1, i8 2, i8 3] , align 1
; CHECK: [4 x i8]
@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1

; Two memcpys read from @.i8: one into the global @other and one into a
; [3 x i8] alloca. The assertions below expect only the alloca destination to
; be widened to [4 x i8] with a copy length of 4, while the copy into @other
; keeps its length of 3.
define void @memcpy_multiple() {
; CHECK-LABEL: define void @memcpy_multiple() local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [4 x i8], align 1
; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]])
; CHECK-NEXT: [[CALL3:%.*]] = call i32 @bar(ptr nonnull @other)
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false)
; CHECK-NEXT: ret void
;
entry:
%something = alloca [3 x i8], align 1
%call1 = call i32 @bar(ptr nonnull %something)
%call2 = call i32 @bar(ptr nonnull @other)
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false)
ret void
}

declare i32 @bar(...)
22 changes: 22 additions & 0 deletions llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s

@.i16 = private unnamed_addr constant [5 x i16] [i16 1, i16 2, i16 3, i16 4, i16 5] , align 1

; Non-byte element type: 10 bytes are copied from a [5 x i16] constant into a
; [5 x i16] alloca. The assertions below expect the alloca to become
; [6 x i16] and the copy length to become 12.
define void @memcpy_i16_array() {
; CHECK-LABEL: define void @memcpy_i16_array() local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[SOMETHING1:%.*]] = alloca [6 x i16], align 1
; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 12, i1 false)
; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]])
; CHECK-NEXT: ret void
;
entry:
%something = alloca [5 x i16], align 1
call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) %something, ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 10, i1 false)
%call2 = call i32 @bar(ptr nonnull %something)
ret void
}


declare i32 @bar(...)
Loading