Skip to content

Commit 24cf72b

Browse files
jayfoad authored and tstellar committed
[AMDGPU] Move architected SGPR implementation into isel (llvm#79120)
(cherry picked from commit 70fc970)
1 parent ff1c0fc commit 24cf72b

8 files changed

+627
-252
lines changed

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

+37-2
Original file line number | Diff line number | Diff line change
@@ -4178,10 +4178,45 @@ bool AMDGPULegalizerInfo::loadInputValue(
41784178
Register DstReg, MachineIRBuilder &B,
41794179
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
41804180
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4181-
const ArgDescriptor *Arg;
4181+
const ArgDescriptor *Arg = nullptr;
41824182
const TargetRegisterClass *ArgRC;
41834183
LLT ArgTy;
4184-
std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4184+
4185+
CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4186+
const ArgDescriptor WorkGroupIDX =
4187+
ArgDescriptor::createRegister(AMDGPU::TTMP9);
4188+
// If GridZ is not programmed in an entry function then the hardware will set
4189+
// it to all zeros, so there is no need to mask the GridY value in the low
4190+
// order bits.
4191+
const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4192+
AMDGPU::TTMP7,
4193+
AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4194+
const ArgDescriptor WorkGroupIDZ =
4195+
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4196+
if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
4197+
switch (ArgType) {
4198+
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
4199+
Arg = &WorkGroupIDX;
4200+
ArgRC = &AMDGPU::SReg_32RegClass;
4201+
ArgTy = LLT::scalar(32);
4202+
break;
4203+
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
4204+
Arg = &WorkGroupIDY;
4205+
ArgRC = &AMDGPU::SReg_32RegClass;
4206+
ArgTy = LLT::scalar(32);
4207+
break;
4208+
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
4209+
Arg = &WorkGroupIDZ;
4210+
ArgRC = &AMDGPU::SReg_32RegClass;
4211+
ArgTy = LLT::scalar(32);
4212+
break;
4213+
default:
4214+
break;
4215+
}
4216+
}
4217+
4218+
if (!Arg)
4219+
std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
41854220

41864221
if (!Arg) {
41874222
if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

+49-19
Original file line number | Diff line number | Diff line change
@@ -2072,11 +2072,45 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
20722072
const SIMachineFunctionInfo &MFI,
20732073
EVT VT,
20742074
AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
2075-
const ArgDescriptor *Reg;
2075+
const ArgDescriptor *Reg = nullptr;
20762076
const TargetRegisterClass *RC;
20772077
LLT Ty;
20782078

2079-
std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2079+
CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv();
2080+
const ArgDescriptor WorkGroupIDX =
2081+
ArgDescriptor::createRegister(AMDGPU::TTMP9);
2082+
// If GridZ is not programmed in an entry function then the hardware will set
2083+
// it to all zeros, so there is no need to mask the GridY value in the low
2084+
// order bits.
2085+
const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2086+
AMDGPU::TTMP7,
2087+
AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2088+
const ArgDescriptor WorkGroupIDZ =
2089+
ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2090+
if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) {
2091+
switch (PVID) {
2092+
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2093+
Reg = &WorkGroupIDX;
2094+
RC = &AMDGPU::SReg_32RegClass;
2095+
Ty = LLT::scalar(32);
2096+
break;
2097+
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2098+
Reg = &WorkGroupIDY;
2099+
RC = &AMDGPU::SReg_32RegClass;
2100+
Ty = LLT::scalar(32);
2101+
break;
2102+
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2103+
Reg = &WorkGroupIDZ;
2104+
RC = &AMDGPU::SReg_32RegClass;
2105+
Ty = LLT::scalar(32);
2106+
break;
2107+
default:
2108+
break;
2109+
}
2110+
}
2111+
2112+
if (!Reg)
2113+
std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
20802114
if (!Reg) {
20812115
if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
20822116
// It's possible for a kernarg intrinsic call to appear in a kernel with
@@ -2505,28 +2539,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
25052539
}
25062540
}
25072541

2508-
if (Info.hasWorkGroupIDX()) {
2509-
Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
2510-
if (!HasArchitectedSGPRs)
2542+
if (!HasArchitectedSGPRs) {
2543+
if (Info.hasWorkGroupIDX()) {
2544+
Register Reg = Info.addWorkGroupIDX();
25112545
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2546+
CCInfo.AllocateReg(Reg);
2547+
}
25122548

2513-
CCInfo.AllocateReg(Reg);
2514-
}
2515-
2516-
if (Info.hasWorkGroupIDY()) {
2517-
Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
2518-
if (!HasArchitectedSGPRs)
2549+
if (Info.hasWorkGroupIDY()) {
2550+
Register Reg = Info.addWorkGroupIDY();
25192551
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2552+
CCInfo.AllocateReg(Reg);
2553+
}
25202554

2521-
CCInfo.AllocateReg(Reg);
2522-
}
2523-
2524-
if (Info.hasWorkGroupIDZ()) {
2525-
Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
2526-
if (!HasArchitectedSGPRs)
2555+
if (Info.hasWorkGroupIDZ()) {
2556+
Register Reg = Info.addWorkGroupIDZ();
25272557
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2528-
2529-
CCInfo.AllocateReg(Reg);
2558+
CCInfo.AllocateReg(Reg);
2559+
}
25302560
}
25312561

25322562
if (Info.hasWorkGroupInfo()) {

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

+9-23
Original file line number | Diff line number | Diff line change
@@ -751,35 +751,21 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
751751
}
752752

753753
// Add system SGPRs.
754-
Register addWorkGroupIDX(bool HasArchitectedSGPRs) {
755-
Register Reg =
756-
HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR();
757-
ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg);
758-
if (!HasArchitectedSGPRs)
759-
NumSystemSGPRs += 1;
760-
754+
Register addWorkGroupIDX() {
755+
ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
756+
NumSystemSGPRs += 1;
761757
return ArgInfo.WorkGroupIDX.getRegister();
762758
}
763759

764-
Register addWorkGroupIDY(bool HasArchitectedSGPRs) {
765-
Register Reg =
766-
HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
767-
unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u;
768-
ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask);
769-
if (!HasArchitectedSGPRs)
770-
NumSystemSGPRs += 1;
771-
760+
Register addWorkGroupIDY() {
761+
ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
762+
NumSystemSGPRs += 1;
772763
return ArgInfo.WorkGroupIDY.getRegister();
773764
}
774765

775-
Register addWorkGroupIDZ(bool HasArchitectedSGPRs) {
776-
Register Reg =
777-
HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
778-
unsigned Mask = HasArchitectedSGPRs ? 0xffff << 16 : ~0u;
779-
ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask);
780-
if (!HasArchitectedSGPRs)
781-
NumSystemSGPRs += 1;
782-
766+
Register addWorkGroupIDZ() {
767+
ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
768+
NumSystemSGPRs += 1;
783769
return ArgInfo.WorkGroupIDZ.getRegister();
784770
}
785771

llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll

-1
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,6 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() {
5555
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
5656
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0
5757
; GFX12-NEXT: v_mov_b32_e32 v31, v0
58-
; GFX12-NEXT: s_mov_b32 s12, ttmp9
5958
; GFX12-NEXT: s_mov_b64 s[8:9], 0
6059
; GFX12-NEXT: s_mov_b32 s32, 0
6160
; GFX12-NEXT: s_wait_kmcnt 0x0

0 commit comments

Comments
 (0)